diff --git a/.bc-linter.yml b/.bc-linter.yml new file mode 100644 index 0000000000000..cafa3a51c3ac1 --- /dev/null +++ b/.bc-linter.yml @@ -0,0 +1,15 @@ +version: 1 +paths: +include: + - "**/*.py" +exclude: + - ".*" + - ".*/**" + - "**/.*/**" + - "**/.*" + - "**/_*/**" + - "**/_*.py" + - "**/test/**" + - "**/benchmarks/**" + - "**/test_*.py" + - "**/*_test.py" diff --git a/.ci/docker/ci_commit_pins/torchbench.txt b/.ci/docker/ci_commit_pins/torchbench.txt index 394e46873a17a..efbc3ceeb2afe 100644 --- a/.ci/docker/ci_commit_pins/torchbench.txt +++ b/.ci/docker/ci_commit_pins/torchbench.txt @@ -1 +1 @@ -22bc29b4d503fc895ff73bc720ff396e9723465f +e03a63be43e33596f7f0a43b0f530353785e4a59 diff --git a/.ci/docker/common/install_onnx.sh b/.ci/docker/common/install_onnx.sh index 9774863e25c41..9f23feb5adfaf 100755 --- a/.ci/docker/common/install_onnx.sh +++ b/.ci/docker/common/install_onnx.sh @@ -20,7 +20,7 @@ pip_install \ pip_install coloredlogs packaging pip_install onnxruntime==1.22.1 -pip_install onnxscript==0.3.1 +pip_install onnxscript==0.4.0 # Cache the transformers model to be used later by ONNX tests. We need to run the transformers # package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/ diff --git a/.ci/docker/libtorch/Dockerfile b/.ci/docker/libtorch/Dockerfile index 776053a5d8750..d2788b2713f7a 100644 --- a/.ci/docker/libtorch/Dockerfile +++ b/.ci/docker/libtorch/Dockerfile @@ -69,6 +69,11 @@ RUN bash ./install_cuda.sh 12.9 RUN bash ./install_magma.sh 12.9 RUN ln -sf /usr/local/cuda-12.9 /usr/local/cuda +FROM cuda as cuda13.0 +RUN bash ./install_cuda.sh 13.0 +RUN bash ./install_magma.sh 13.0 +RUN ln -sf /usr/local/cuda-13.0 /usr/local/cuda + FROM cpu as rocm ARG ROCM_VERSION ARG PYTORCH_ROCM_ARCH diff --git a/.ci/docker/manywheel/build.sh b/.ci/docker/manywheel/build.sh index abe47bbe9188c..5dee4325857fb 100755 --- a/.ci/docker/manywheel/build.sh +++ b/.ci/docker/manywheel/build.sh @@ -67,6 +67,12 @@ case ${image} in DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13" MANY_LINUX_VERSION="2_28" ;; + manylinux2_28-builder:cuda13*) + TARGET=cuda_final + GPU_IMAGE=amd64/almalinux:8 + DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13" + MANY_LINUX_VERSION="2_28" + ;; manylinuxaarch64-builder:cuda*) TARGET=cuda_final GPU_IMAGE=amd64/almalinux:8 diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index a2e178934ef46..c9d2fddb13244 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -339,7 +339,7 @@ onnx==1.18.0 #Pinned versions: #test that import: -onnxscript==0.3.1 +onnxscript==0.4.0 #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal #Pinned versions: #test that import: diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile index 9c2771eb00688..57f997f300896 100644 --- a/.ci/docker/ubuntu/Dockerfile +++ b/.ci/docker/ubuntu/Dockerfile @@ -181,7 +181,6 @@ COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm RUN if [ -n "${SKIP_LLVM_SRC_BUILD_INSTALL}" ]; then set -eu; rm -rf /opt/llvm; fi # AWS specific CUDA build guidance -ENV TORCH_CUDA_ARCH_LIST Maxwell ENV TORCH_NVCC_FLAGS "-Xfatbin -compress-all" ENV CUDA_PATH /usr/local/cuda diff --git a/.ci/lumen_cli/cli/build_cli/register_build.py b/.ci/lumen_cli/cli/build_cli/register_build.py index a86c15a00e069..9f35a9c8165dc 100644 --- a/.ci/lumen_cli/cli/build_cli/register_build.py +++ 
b/.ci/lumen_cli/cli/build_cli/register_build.py @@ -2,7 +2,7 @@ import logging from cli.lib.common.cli_helper import register_targets, RichHelp, TargetSpec -from cli.lib.core.vllm import VllmBuildRunner +from cli.lib.core.vllm.vllm_build import VllmBuildRunner logger = logging.getLogger(__name__) diff --git a/.ci/lumen_cli/cli/lib/common/pip_helper.py b/.ci/lumen_cli/cli/lib/common/pip_helper.py new file mode 100644 index 0000000000000..1eed8406c9f7d --- /dev/null +++ b/.ci/lumen_cli/cli/lib/common/pip_helper.py @@ -0,0 +1,71 @@ +import glob +import logging +import shlex +import shutil +import sys +from collections.abc import Iterable +from importlib.metadata import PackageNotFoundError, version +from typing import Optional, Union + +from cli.lib.common.utils import run_command + + +logger = logging.getLogger(__name__) + + +def pip_install_packages( + packages: Iterable[str] = (), + env=None, + *, + requirements: Optional[str] = None, + constraints: Optional[str] = None, + prefer_uv: bool = False, +) -> None: + use_uv = prefer_uv and shutil.which("uv") is not None + base = ( + [sys.executable, "-m", "uv", "pip", "install"] + if use_uv + else [sys.executable, "-m", "pip", "install"] + ) + cmd = base[:] + if requirements: + cmd += ["-r", requirements] + if constraints: + cmd += ["-c", constraints] + cmd += list(packages) + logger.info("pip installing packages: %s", " ".join(map(shlex.quote, cmd))) + run_command(" ".join(map(shlex.quote, cmd)), env=env) + + +def pip_install_first_match(pattern: str, extras: Optional[str] = None, pref_uv=False): + wheel = first_matching_pkg(pattern) + target = f"{wheel}[{extras}]" if extras else wheel + logger.info("Installing %s...", target) + pip_install_packages([target], prefer_uv=pref_uv) + + +def run_python(args: Union[str, list[str]], env=None): + """ + Run the python in the current environment. + """ + if isinstance(args, str): + args = shlex.split(args) + cmd = [sys.executable] + args + run_command(" ".join(map(shlex.quote, cmd)), env=env) + + +def pkg_exists(name: str) -> bool: + try: + pkg_version = version(name) + logger.info("%s already exist with version: %s", name, pkg_version) + return True + except PackageNotFoundError: + logger.info("%s is not installed", name) + return False + + +def first_matching_pkg(pattern: str) -> str: + matches = sorted(glob.glob(pattern)) + if not matches: + raise FileNotFoundError(f"No wheel matching: {pattern}") + return matches[0] diff --git a/.ci/lumen_cli/cli/lib/common/utils.py b/.ci/lumen_cli/cli/lib/common/utils.py index d7809146dd4d0..05790bd66acf6 100644 --- a/.ci/lumen_cli/cli/lib/common/utils.py +++ b/.ci/lumen_cli/cli/lib/common/utils.py @@ -7,6 +7,7 @@ import shlex import subprocess import sys +from contextlib import contextmanager from typing import Optional @@ -77,3 +78,40 @@ def str2bool(value: Optional[str]) -> bool: if value in false_value_set: return False raise ValueError(f"Invalid string value for boolean conversion: {value}") + + +@contextmanager +def temp_environ(updates: dict[str, str]): + """ + Temporarily set environment variables and restore them after the block. + Args: + updates: Dict of environment variables to set. 
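+        Variables that existed before the block are restored to their previous values on exit; variables that did not exist are removed.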
+ """ + missing = object() + old: dict[str, str | object] = {k: os.environ.get(k, missing) for k in updates} + try: + os.environ.update(updates) + yield + finally: + for k, v in old.items(): + if v is missing: + os.environ.pop(k, None) + else: + os.environ[k] = v # type: ignore[arg-type] + + +@contextmanager +def working_directory(path: str): + """ + Temporarily change the working directory inside a context. + """ + if not path: + # No-op context + yield + return + prev_cwd = os.getcwd() + try: + os.chdir(path) + yield + finally: + os.chdir(prev_cwd) diff --git a/.ci/lumen_cli/cli/lib/core/vllm/lib.py b/.ci/lumen_cli/cli/lib/core/vllm/lib.py new file mode 100644 index 0000000000000..7f3a930b2cc64 --- /dev/null +++ b/.ci/lumen_cli/cli/lib/core/vllm/lib.py @@ -0,0 +1,232 @@ +import logging +from typing import Any + +from cli.lib.common.git_helper import clone_external_repo +from cli.lib.common.pip_helper import pip_install_packages +from cli.lib.common.utils import run_command, temp_environ, working_directory + + +logger = logging.getLogger(__name__) + + +def sample_vllm_test_library(): + """ + Simple sample to unblock the vllm ci development, which is mimic to + https://github.com/vllm-project/vllm/blob/main/.buildkite/test-pipeline.yaml + see run_test_plan for more details + """ + # TODO(elainewy): Read from yaml file to handle the env and tests for vllm + return { + "vllm_basic_correctness_test": { + "title": "Basic Correctness Test", + "id": "vllm_basic_correctness_test", + "env_vars": { + "VLLM_WORKER_MULTIPROC_METHOD": "spawn", + }, + "steps": [ + "pytest -v -s basic_correctness/test_cumem.py", + "pytest -v -s basic_correctness/test_basic_correctness.py", + "pytest -v -s basic_correctness/test_cpu_offload.py", + "VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py", + ], + }, + "vllm_basic_models_test": { + "title": "Basic models test", + "id": "vllm_basic_models_test", + "steps": [ + "pytest -v -s models/test_transformers.py", + "pytest -v -s models/test_registry.py", + "pytest -v -s models/test_utils.py", + "pytest -v -s models/test_vision.py", + "pytest -v -s models/test_initialization.py", + ], + }, + "vllm_entrypoints_test": { + "title": "Entrypoints Test ", + "id": "vllm_entrypoints_test", + "env_vars": { + "VLLM_WORKER_MULTIPROC_METHOD": "spawn", + }, + "steps": [ + " ".join( + [ + "pytest", + "-v", + "-s", + "entrypoints/llm", + "--ignore=entrypoints/llm/test_lazy_outlines.py", + "--ignore=entrypoints/llm/test_generate.py", + "--ignore=entrypoints/llm/test_generate_multiple_loras.py", + "--ignore=entrypoints/llm/test_collective_rpc.py", + ] + ), + "pytest -v -s entrypoints/llm/test_lazy_outlines.py", + "pytest -v -s entrypoints/llm/test_generate.py ", + "pytest -v -s entrypoints/llm/test_generate_multiple_loras.py", + "VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode", + ], + }, + "vllm_regression_test": { + "title": "Regression Test", + "id": "vllm_regression_test", + "package_install": ["modelscope"], + "steps": [ + "pytest -v -s test_regression.py", + ], + }, + "vllm_lora_tp_test_distributed": { + "title": "LoRA TP Test (Distributed)", + "id": "vllm_lora_tp_test_distributed", + "env_vars": { + "VLLM_WORKER_MULTIPROC_METHOD": "spawn", + }, + "num_gpus": 4, + "steps": [ + "pytest -v -s -x lora/test_chatglm3_tp.py", + "echo $VLLM_WORKER_MULTIPROC_METHOD", + "pytest -v -s -x lora/test_llama_tp.py", + "pytest -v -s -x lora/test_multi_loras_with_tp.py", + ], + }, + "vllm_lora_280_failure_test": { + "title": "LoRA 280 failure test", + 
"id": "vllm_lora_280_failure_test", + "steps": ["pytest -v lora/test_quant_model.py"], + }, + "vllm_multi_model_processor_test": { + "title": "Multi-Modal Processor Test", + "id": "vllm_multi_model_processor_test", + "package_install": ["git+https://github.com/TIGER-AI-Lab/Mantis.git"], + "steps": [ + "pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py", + ], + }, + "vllm_pytorch_compilation_unit_tests": { + "title": "PyTorch Compilation Unit Tests", + "id": "vllm_pytorch_compilation_unit_tests", + "steps": [ + "pytest -v -s compile/test_pass_manager.py", + "pytest -v -s compile/test_fusion.py", + "pytest -v -s compile/test_fusion_attn.py", + "pytest -v -s compile/test_silu_mul_quant_fusion.py", + "pytest -v -s compile/test_sequence_parallelism.py", + "pytest -v -s compile/test_async_tp.py", + "pytest -v -s compile/test_fusion_all_reduce.py", + "pytest -v -s compile/test_decorator.py", + ], + }, + # TODO(elainewy):need to add g6 with 4 gpus to run this test + "vllm_lora_test": { + "title": "LoRA Test %N", + "id": "lora_test", + "parallelism": 4, + "steps": [ + "echo '[checking] list sharded lora tests:'", + " ".join( + [ + "pytest -q --collect-only lora", + "--shard-id=$$BUILDKITE_PARALLEL_JOB", + "--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT", + "--ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py", + ] + ), + "echo '[checking] Done. list lora tests'", + " ".join( + [ + "pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB", + "--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT", + "--ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py", + ] + ), + ], + }, + } + + +def check_parallelism(tests: Any, title: str, shard_id: int = 0, num_shards: int = 0): + """ + a method to check if the test plan is parallelism or not. + """ + parallelism = int(tests.get("parallelism", "0")) + is_parallel = parallelism and parallelism > 1 + + if not is_parallel: + return False + + if shard_id > num_shards: + raise RuntimeError( + f"Test {title} expects {num_shards} shards, but invalid {shard_id} is provided" + ) + + if num_shards != parallelism: + raise RuntimeError( + f"Test {title} expects {parallelism} shards, but invalid {num_shards} is provided" + ) + + return True + + +def run_test_plan( + test_plan: str, + test_target: str, + tests_map: dict[str, Any], + shard_id: int = 0, + num_shards: int = 0, +): + """ + a method to run list of tests based on the test plan. 
+ """ + logger.info("run %s tests.....", test_target) + if test_plan not in tests_map: + raise RuntimeError( + f"test {test_plan} not found, please add it to test plan pool" + ) + tests = tests_map[test_plan] + pkgs = tests.get("package_install", []) + title = tests.get("title", "unknown test") + + is_parallel = check_parallelism(tests, title, shard_id, num_shards) + if is_parallel: + title = title.replace("%N", f"{shard_id}/{num_shards}") + + logger.info("Running tests: %s", title) + if pkgs: + logger.info("Installing packages: %s", pkgs) + pip_install_packages(packages=pkgs, prefer_uv=True) + with ( + working_directory(tests.get("working_directory", "tests")), + temp_environ(tests.get("env_vars", {})), + ): + failures = [] + for step in tests["steps"]: + logger.info("Running step: %s", step) + if is_parallel: + step = replace_buildkite_placeholders(step, shard_id, num_shards) + logger.info("Running parallel step: %s", step) + code = run_command(cmd=step, check=False, use_shell=True) + if code != 0: + failures.append(step) + logger.info("Finish running step: %s", step) + if failures: + logger.error("Failed tests: %s", failures) + raise RuntimeError(f"{len(failures)} pytest runs failed: {failures}") + logger.info("Done. All tests passed") + + +def clone_vllm(dst: str = "vllm"): + clone_external_repo( + target="vllm", + repo="https://github.com/vllm-project/vllm.git", + dst=dst, + update_submodules=True, + ) + + +def replace_buildkite_placeholders(step: str, shard_id: int, num_shards: int) -> str: + mapping = { + "$$BUILDKITE_PARALLEL_JOB_COUNT": str(num_shards), + "$$BUILDKITE_PARALLEL_JOB": str(shard_id), + } + for k in sorted(mapping, key=len, reverse=True): + step = step.replace(k, mapping[k]) + return step diff --git a/.ci/lumen_cli/cli/lib/core/vllm.py b/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py similarity index 96% rename from .ci/lumen_cli/cli/lib/core/vllm.py rename to .ci/lumen_cli/cli/lib/core/vllm/vllm_build.py index 735394402413c..d067a14f75902 100644 --- a/.ci/lumen_cli/cli/lib/core/vllm.py +++ b/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py @@ -13,7 +13,6 @@ env_str_field, with_params_help, ) -from cli.lib.common.git_helper import clone_external_repo from cli.lib.common.path_helper import ( copy, ensure_dir_exists, @@ -22,6 +21,7 @@ is_path_exist, ) from cli.lib.common.utils import run_command +from cli.lib.core.vllm.lib import clone_vllm logger = logging.getLogger(__name__) @@ -42,7 +42,7 @@ class VllmBuildParameters: """ # USE_TORCH_WHEEL: when true, use local Torch wheels; requires TORCH_WHEELS_PATH. - # Otherwise docker build pull torch nightly during build + # Otherwise docker build pull torch nightly during build # TORCH_WHEELS_PATH: directory containing local torch wheels when use_torch_whl is True use_torch_whl: bool = env_bool_field("USE_TORCH_WHEEL", True) torch_whls_path: Path = env_path_field("TORCH_WHEELS_PATH", "./dist") @@ -152,6 +152,7 @@ def run(self): 3. run docker build """ inputs = VllmBuildParameters() + logger.info("Running vllm build with inputs: %s", inputs) clone_vllm() self.cp_dockerfile_if_exist(inputs) @@ -252,12 +253,3 @@ def _generate_docker_build_cmd( --progress=plain . 
""" ).strip() - - -def clone_vllm(): - clone_external_repo( - target="vllm", - repo="https://github.com/vllm-project/vllm.git", - dst="vllm", - update_submodules=True, - ) diff --git a/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py b/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py new file mode 100644 index 0000000000000..2be8e246486eb --- /dev/null +++ b/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py @@ -0,0 +1,257 @@ +import logging +import os +import re +import subprocess +import sys +from collections.abc import Iterable +from dataclasses import dataclass +from enum import Enum +from pathlib import Path +from typing import Any + +from cli.lib.common.cli_helper import BaseRunner +from cli.lib.common.envs_helper import env_path_field, env_str_field, get_env +from cli.lib.common.path_helper import copy, remove_dir +from cli.lib.common.pip_helper import ( + pip_install_first_match, + pip_install_packages, + pkg_exists, + run_python, +) +from cli.lib.common.utils import run_command, working_directory +from cli.lib.core.vllm.lib import clone_vllm, run_test_plan, sample_vllm_test_library + + +logger = logging.getLogger(__name__) + + +@dataclass +class VllmTestParameters: + """ + Parameters defining the vllm external test input + + !!!DO NOT ADD SECRETS IN THIS CLASS!!! + you can put environment variable name in VllmTestParameters if it's not the same as the secret one + fetch secrests directly from env variables during runtime + """ + + torch_whls_path: Path = env_path_field("WHEELS_PATH", "./dist") + + vllm_whls_path: Path = env_path_field( + "VLLM_WHEELS_PATH", "./dist/external/vllm/wheels" + ) + + torch_cuda_arch_list: str = env_str_field("TORCH_CUDA_ARCH_LIST", "8.9") + + def __post_init__(self): + if not self.torch_whls_path.exists(): + raise ValueError("missing torch_whls_path") + if not self.vllm_whls_path.exists(): + raise ValueError("missing vllm_whls_path") + + +class TestInpuType(Enum): + TEST_PLAN = "test_plan" + UNKNOWN = "unknown" + + +class VllmTestRunner(BaseRunner): + def __init__(self, args: Any): + self.work_directory = "vllm" + self.test_plan = "" + self.test_type = TestInpuType.UNKNOWN + + self.shard_id = args.shard_id + self.num_shards = args.num_shards + + if args.test_plan: + self.test_plan = args.test_plan + self.test_type = TestInpuType.TEST_PLAN + + # Matches the structeur in the artifacts.zip from torcb build + self.TORCH_WHL_PATH_REGEX = "torch*.whl" + self.TORCH_WHL_EXTRA = "opt-einsum" + self.TORCH_ADDITIONAL_WHLS_REGEX = [ + "vision/torchvision*.whl", + "audio/torchaudio*.whl", + ] + + # Match the structure of the artifacts.zip from vllm external build + self.VLLM_TEST_WHLS_REGEX = [ + "xformers/*.whl", + "vllm/vllm*.whl", + "flashinfer-python/flashinfer*.whl", + ] + + def prepare(self): + """ + prepare test environment for vllm. 
This includes clone vllm repo, install all wheels, test dependencies and set env + """ + params = VllmTestParameters() + logger.info("Display VllmTestParameters %s", params) + self._set_envs(params) + + clone_vllm(dst=self.work_directory) + with working_directory(self.work_directory): + remove_dir(Path("vllm")) + self._install_wheels(params) + self._install_dependencies() + # verify the torches are not overridden by test dependencies + check_versions() + + def run(self): + """ + main function to run vllm test + """ + self.prepare() + with working_directory(self.work_directory): + if self.test_type == TestInpuType.TEST_PLAN: + if self.num_shards > 1: + run_test_plan( + self.test_plan, + "vllm", + sample_vllm_test_library(), + self.shard_id, + self.num_shards, + ) + else: + run_test_plan(self.test_plan, "vllm", sample_vllm_test_library()) + else: + raise ValueError(f"Unknown test type {self.test_type}") + + def _install_wheels(self, params: VllmTestParameters): + logger.info("Running vllm test with inputs: %s", params) + if not pkg_exists("torch"): + # install torch from local whls if it's not installed yet. + torch_p = f"{str(params.torch_whls_path)}/{self.TORCH_WHL_PATH_REGEX}" + pip_install_first_match(torch_p, self.TORCH_WHL_EXTRA) + + torch_whls_path = [ + f"{str(params.torch_whls_path)}/{whl_path}" + for whl_path in self.TORCH_ADDITIONAL_WHLS_REGEX + ] + for torch_whl in torch_whls_path: + pip_install_first_match(torch_whl) + logger.info("Done. Installed torch and other torch-related wheels ") + + logger.info("Installing vllm wheels") + vllm_whls_path = [ + f"{str(params.vllm_whls_path)}/{whl_path}" + for whl_path in self.VLLM_TEST_WHLS_REGEX + ] + for vllm_whl in vllm_whls_path: + pip_install_first_match(vllm_whl) + logger.info("Done. Installed vllm wheels") + + def _install_test_dependencies(self): + """ + This method replaces torch dependencies with local torch wheel info in + requirements/test.in file from vllm repo. then generates the test.txt + in runtime + """ + logger.info("generate test.txt from requirements/test.in with local torch whls") + preprocess_test_in() + copy("requirements/test.txt", "snapshot_constraint.txt") + + run_command( + f"{sys.executable} -m uv pip compile requirements/test.in " + "-o test.txt " + "--index-strategy unsafe-best-match " + "--constraint snapshot_constraint.txt " + "--torch-backend cu128" + ) + pip_install_packages(requirements="test.txt", prefer_uv=True) + logger.info("Done. installed requirements for test dependencies") + + def _install_dependencies(self): + pip_install_packages(packages=["-e", "tests/vllm_test_utils"], prefer_uv=True) + pip_install_packages(packages=["hf_transfer"], prefer_uv=True) + os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" + + # using script from vllm repo to remove all torch packages from requirements txt + run_python("use_existing_torch.py") + + # install common packages + for requirements in ["requirements/common.txt", "requirements/build.txt"]: + pip_install_packages( + requirements=requirements, + prefer_uv=True, + ) + # install test packages + self._install_test_dependencies() + + def _set_envs(self, inputs: VllmTestParameters): + os.environ["TORCH_CUDA_ARCH_LIST"] = inputs.torch_cuda_arch_list + if not validate_cuda(get_env("TORCH_CUDA_ARCH_LIST")): + logger.warning( + "Missing supported TORCH_CUDA_ARCH_LIST. 
" + "Currently support TORCH_CUDA_ARCH_LIST env var " + "with supported arch [8.0, 8.9, 9.0]" + ) + + os.environ["HF_TOKEN"] = os.getenv("VLLM_TEST_HUGGING_FACE_TOKEN", "") + if not get_env("HF_TOKEN"): + raise ValueError( + "missing required HF_TOKEN, please set VLLM_TEST_HUGGING_FACE_TOKEN env var" + ) + if not get_env("TORCH_CUDA_ARCH_LIST"): + raise ValueError( + "missing required TORCH_CUDA_ARCH_LIST, please set TORCH_CUDA_ARCH_LIST env var" + ) + + +def preprocess_test_in( + target_file: str = "requirements/test.in", additional_packages: Iterable[str] = () +): + """ + This modifies the target_file file in place in vllm work directory. + It removes torch and unwanted packages in target_file and replace with local torch whls + package with format "$WHEEL_PACKAGE_NAME @ file://" + """ + additional_package_to_move = list(additional_packages or ()) + pkgs_to_remove = [ + "torch", + "torchvision", + "torchaudio", + "xformers", + "mamba_ssm", + ] + additional_package_to_move + # Read current requirements + target_path = Path(target_file) + lines = target_path.read_text().splitlines() + + # Remove lines starting with the package names (==, @, >=) — case-insensitive + pattern = re.compile(rf"^({'|'.join(pkgs_to_remove)})\s*(==|@|>=)", re.IGNORECASE) + kept_lines = [line for line in lines if not pattern.match(line)] + + # Get local installed torch/vision/audio from pip freeze + # This is hacky, but it works + pip_freeze = subprocess.check_output(["pip", "freeze"], text=True) + header_lines = [ + line + for line in pip_freeze.splitlines() + if re.match( + r"^(torch|torchvision|torchaudio)\s*@\s*file://", line, re.IGNORECASE + ) + ] + + # Write back: header_lines + blank + kept_lines + out = "\n".join(header_lines + [""] + kept_lines) + "\n" + target_path.write_text(out) + logger.info("[INFO] Updated %s", target_file) + + +def validate_cuda(value: str) -> bool: + VALID_VALUES = {"8.0", "8.9", "9.0"} + return all(v in VALID_VALUES for v in value.split()) + + +def check_versions(): + """ + check installed packages version + """ + logger.info("Double check installed packages") + patterns = ["torch", "xformers", "torchvision", "torchaudio", "vllm"] + for pkg in patterns: + pkg_exists(pkg) + logger.info("Done. 
checked installed packages") diff --git a/.ci/lumen_cli/cli/run.py b/.ci/lumen_cli/cli/run.py index 5b436de6d0de3..1711109170756 100644 --- a/.ci/lumen_cli/cli/run.py +++ b/.ci/lumen_cli/cli/run.py @@ -5,6 +5,7 @@ from cli.build_cli.register_build import register_build_commands from cli.lib.common.logger import setup_logging +from cli.test_cli.register_test import register_test_commands logger = logging.getLogger(__name__) @@ -20,6 +21,7 @@ def main(): # registers second-level subcommands register_build_commands(subparsers) + register_test_commands(subparsers) # parse args after all options are registered args = parser.parse_args() diff --git a/test/dynamo_expected_failures/CPython313-test_complex-ComplexTest.test_boolcontext b/.ci/lumen_cli/cli/test_cli/__init__.py similarity index 100% rename from test/dynamo_expected_failures/CPython313-test_complex-ComplexTest.test_boolcontext rename to .ci/lumen_cli/cli/test_cli/__init__.py diff --git a/.ci/lumen_cli/cli/test_cli/register_test.py b/.ci/lumen_cli/cli/test_cli/register_test.py new file mode 100644 index 0000000000000..2973341b83ed2 --- /dev/null +++ b/.ci/lumen_cli/cli/test_cli/register_test.py @@ -0,0 +1,62 @@ +import argparse +import logging + +from cli.lib.common.cli_helper import register_targets, RichHelp, TargetSpec +from cli.lib.core.vllm.vllm_test import VllmTestRunner + + +logger = logging.getLogger(__name__) + +# Maps targets to their argparse configuration and runner +# it adds new target to path python -m cli.run build external {target} with buildrunner +_TARGETS: dict[str, TargetSpec] = { + "vllm": { + "runner": VllmTestRunner, + "help": "test vLLM with pytorch main", + } + # add yours ... +} + + +def common_args(parser: argparse.ArgumentParser) -> None: + """ + Add common CLI arguments to the given parser. + """ + parser.add_argument( + "--shard-id", + type=int, + default=1, + help="a shard id to run, e.g. '0,1,2,3'", + ) + parser.add_argument( + "--num-shards", + type=int, + default=1, + help="a number of shards to run, e.g. '4'", + ) + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument( + "-tp", + "--test-plan", + type=str, + help="a pre-defined test plan to run, e.g. 
'basic_correctness_test'", + ) + + +def register_test_commands(subparsers: argparse._SubParsersAction) -> None: + build_parser = subparsers.add_parser( + "test", + help="test related commands", + formatter_class=RichHelp, + ) + build_subparsers = build_parser.add_subparsers(dest="test_command", required=True) + overview = "\n".join( + f" {name:12} {spec.get('help', '')}" for name, spec in _TARGETS.items() + ) + external_parser = build_subparsers.add_parser( + "external", + help="Test external targets", + description="Test third-party targets.\n\nAvailable targets:\n" + overview, + formatter_class=RichHelp, + ) + register_targets(external_parser, _TARGETS, common_args=common_args) diff --git a/.ci/lumen_cli/pyproject.toml b/.ci/lumen_cli/pyproject.toml index 6937277cf1033..bf5edc77d9250 100644 --- a/.ci/lumen_cli/pyproject.toml +++ b/.ci/lumen_cli/pyproject.toml @@ -6,6 +6,7 @@ dependencies = [ "GitPython==3.1.45", "docker==7.1.0", "pytest==7.3.2", + "uv==0.8.6" ] [tool.setuptools] diff --git a/.ci/lumen_cli/tests/test_run_plan.py b/.ci/lumen_cli/tests/test_run_plan.py new file mode 100644 index 0000000000000..a85ed2e3986f6 --- /dev/null +++ b/.ci/lumen_cli/tests/test_run_plan.py @@ -0,0 +1,185 @@ +# tests/test_run_test_plan.py +import importlib +from contextlib import nullcontext +from types import SimpleNamespace +from unittest.mock import MagicMock + +import pytest + + +MOD = "cli.lib.core.vllm.lib" + +# We import inside tests so the MOD override above applies everywhere +run_test_plan_import_path = f"{MOD}.run_test_plan" + + +def _get_cmd(c): + # Support both kwargs and positional args + return c.kwargs.get("cmd", c.args[0] if c.args else None) + + +def _get_check(c): + if "check" in c.kwargs: + return c.kwargs["check"] + # If positional, assume second arg is 'check' when present; default False + return c.args[1] if len(c.args) > 1 else False + + +@pytest.fixture +def patch_module(monkeypatch): + """ + Patch helpers ('pip_install_packages', 'temp_environ', 'working_directory', + 'run_command', 'logger') inside the target module and expose them. 
+ """ + module = importlib.import_module(MOD) + + # Create fakes/mocks + pip_install_packages = MagicMock(name="pip_install_packages") + run_command = MagicMock(name="run_command", return_value=0) + + # temp_environ / working_directory: record calls but act as context managers + temp_calls: list[dict] = [] + workdir_calls: list[str] = [] + + def fake_working_directory(path: str): + workdir_calls.append(path) + return nullcontext() + + def fake_temp_env(map: dict[str, str]): + temp_calls.append(map) + return nullcontext() + + logger = SimpleNamespace( + info=MagicMock(name="logger.info"), + error=MagicMock(name="logger.error"), + ) + + # Apply patches (raise if attribute doesn't exist) + monkeypatch.setattr( + module, "pip_install_packages", pip_install_packages, raising=True + ) + monkeypatch.setattr(module, "run_command", run_command, raising=True) + monkeypatch.setattr( + module, "working_directory", fake_working_directory, raising=True + ) + monkeypatch.setattr(module, "temp_environ", fake_temp_env, raising=True) + monkeypatch.setattr(module, "logger", logger, raising=True) + + return SimpleNamespace( + module=module, + run_test_plan=module.run_test_plan, # expose to avoid getattr("constant") (Ruff B009) + pip_install_packages=pip_install_packages, + run_command=run_command, + temp_calls=temp_calls, + workdir_calls=workdir_calls, + logger=logger, + ) + + +def test_success_runs_all_steps_and_uses_env_and_workdir(monkeypatch, patch_module): + run_test_plan = patch_module.run_test_plan + + tests_map = { + "basic": { + "title": "Basic suite", + "package_install": [], + "working_directory": "tests", + "env_vars": {"GLOBAL_FLAG": "1"}, + "steps": [ + "export A=x && pytest -q", + "export B=y && pytest -q tests/unit", + ], + } + } + + # One exit code per step (export + two pytest) + patch_module.run_command.side_effect = [0, 0, 0] + + run_test_plan("basic", "cpu", tests_map) + + calls = patch_module.run_command.call_args_list + cmds = [_get_cmd(c) for c in calls] + checks = [_get_check(c) for c in calls] + + assert cmds == [ + "export A=x && pytest -q", + "export B=y && pytest -q tests/unit", + ] + assert all(chk is False for chk in checks) + + assert patch_module.workdir_calls == ["tests"] + assert patch_module.temp_calls == [{"GLOBAL_FLAG": "1"}] + + +def test_installs_packages_when_present(monkeypatch, patch_module): + run_test_plan = patch_module.module.run_test_plan + + tests_map = { + "with_pkgs": { + "title": "Needs deps", + "package_install": ["timm==1.0.0", "flash-attn"], + "steps": ["pytest -q"], + } + } + + patch_module.run_command.return_value = 0 + + run_test_plan("with_pkgs", "gpu", tests_map) + + patch_module.pip_install_packages.assert_called_once_with( + packages=["timm==1.0.0", "flash-attn"], + prefer_uv=True, + ) + + +def test_raises_on_missing_plan(patch_module): + run_test_plan = patch_module.module.run_test_plan + with pytest.raises(RuntimeError) as ei: + run_test_plan("nope", "cpu", tests_map={}) + + assert "test nope not found" in str(ei.value) + + +def test_aggregates_failures_and_raises(monkeypatch, patch_module): + run_test_plan = patch_module.module.run_test_plan + + tests_map = { + "mix": { + "title": "Some pass some fail", + "steps": [ + "pytest test_a.py", # 0 → pass + "pytest test_b.py", # 1 → fail + "pytest test_c.py", # 2 → fail + ], + } + } + + # Simulate pass, fail, fail + patch_module.run_command.side_effect = [0, 1, 2] + + with pytest.raises(RuntimeError) as ei: + run_test_plan("mix", "cpu", tests_map) + + msg = str(ei.value) + assert "2 pytest runs failed" 
in msg + # Ensure logger captured failed tests list + patch_module.logger.error.assert_called_once() + # And we attempted all three commands + assert patch_module.run_command.call_count == 3 + + +def test_custom_working_directory_used(patch_module): + run_test_plan = patch_module.module.run_test_plan + + tests_map = { + "customwd": { + "title": "Custom wd", + "working_directory": "examples/ci", + "steps": ["pytest -q"], + } + } + + patch_module.run_command.return_value = 0 + run_test_plan("customwd", "cpu", tests_map) + + assert patch_module.workdir_calls == ["examples/ci"] diff --git a/.ci/lumen_cli/tests/test_utils.py b/.ci/lumen_cli/tests/test_utils.py new file mode 100644 index 0000000000000..45ae5ad6d407b --- /dev/null +++ b/.ci/lumen_cli/tests/test_utils.py @@ -0,0 +1,143 @@ +import os +import tempfile +import unittest +from pathlib import Path + +from cli.lib.common.utils import temp_environ, working_directory # <-- replace import + + +class EnvIsolatedTestCase(unittest.TestCase): + """Base class that snapshots os.environ and CWD for isolation.""" + + def setUp(self): + import os + import tempfile + + self._env_backup = dict(os.environ) + + # Snapshot/repair CWD if it's gone + try: + self._cwd_backup = os.getcwd() + except FileNotFoundError: + # If CWD no longer exists, switch to a safe place and record that + self._cwd_backup = tempfile.gettempdir() + os.chdir(self._cwd_backup) + + # Create a temporary directory for the test to run in + self._temp_dir = tempfile.mkdtemp() + os.chdir(self._temp_dir) + + def tearDown(self): + import os + import shutil + import tempfile + + # Restore cwd first (before cleaning up temp dir) + try: + os.chdir(self._cwd_backup) + except OSError: + os.chdir(tempfile.gettempdir()) + + # Clean up temporary directory + try: + shutil.rmtree(self._temp_dir, ignore_errors=True) + except Exception: + pass # Ignore cleanup errors + + # Restore env + to_del = set(os.environ.keys()) - set(self._env_backup.keys()) + for k in to_del: + os.environ.pop(k, None) + for k, v in self._env_backup.items(): + os.environ[k] = v + + +class TestTempEnviron(EnvIsolatedTestCase): + def test_sets_and_restores_new_var(self): + var = "TEST_TMP_ENV_NEW" + self.assertNotIn(var, os.environ) + + with temp_environ({var: "123"}): + self.assertEqual(os.environ[var], "123") + + self.assertNotIn(var, os.environ) # removed after exit + + def test_overwrites_and_restores_existing_var(self): + var = "TEST_TMP_ENV_OVERWRITE" + os.environ[var] = "orig" + + with temp_environ({var: "override"}): + self.assertEqual(os.environ[var], "override") + + self.assertEqual(os.environ[var], "orig") # restored + + def test_multiple_vars_and_missing_cleanup(self): + v1, v2 = "TEST_ENV_V1", "TEST_ENV_V2" + os.environ.pop(v1, None) + os.environ[v2] = "keep" + + with temp_environ({v1: "a", v2: "b"}): + self.assertEqual(os.environ[v1], "a") + self.assertEqual(os.environ[v2], "b") + + self.assertNotIn(v1, os.environ) # newly-added -> removed + self.assertEqual(os.environ[v2], "keep") # pre-existing -> restored + + def test_restores_even_on_exception(self): + var = "TEST_TMP_ENV_EXCEPTION" + self.assertNotIn(var, os.environ) + + with self.assertRaises(RuntimeError): + with temp_environ({var: "x"}): + self.assertEqual(os.environ[var], "x") + raise RuntimeError("boom") + + self.assertNotIn(var, os.environ) # removed after exception + + +class TestWorkingDirectory(EnvIsolatedTestCase): + def test_changes_and_restores(self): + start = Path.cwd() + with tempfile.TemporaryDirectory() as td: + target = Path(td) / "wd" + 
target.mkdir() + + with working_directory(str(target)): + self.assertEqual(Path.cwd().resolve(), target.resolve()) + + self.assertEqual(Path.cwd(), start) + + def test_noop_when_empty_path(self): + start = Path.cwd() + with working_directory(""): + self.assertEqual(Path.cwd(), start) + self.assertEqual(Path.cwd(), start) + + def test_restores_on_exception(self): + start = Path.cwd() + + with tempfile.TemporaryDirectory() as td: + target = Path(td) / "wd_exc" + target.mkdir() + + with self.assertRaises(ValueError): + with working_directory(str(target)): + # Normalize both sides to handle /var -> /private/var + self.assertEqual(Path.cwd().resolve(), target.resolve()) + raise ValueError("boom") + + self.assertEqual(Path.cwd().resolve(), start.resolve()) + + def test_raises_for_missing_dir(self): + start = Path.cwd() + with tempfile.TemporaryDirectory() as td: + missing = Path(td) / "does_not_exist" + with self.assertRaises(FileNotFoundError): + # os.chdir should raise before yielding + with working_directory(str(missing)): + pass + self.assertEqual(Path.cwd(), start) + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/.ci/lumen_cli/tests/test_vllm.py b/.ci/lumen_cli/tests/test_vllm.py index 8a6e729a32d5d..849eb0c40ee37 100644 --- a/.ci/lumen_cli/tests/test_vllm.py +++ b/.ci/lumen_cli/tests/test_vllm.py @@ -4,12 +4,15 @@ from pathlib import Path from unittest.mock import MagicMock, patch -import cli.lib.core.vllm as vllm +import cli.lib.core.vllm.vllm_build as vllm_build + + +_VLLM_BUILD_MODULE = "cli.lib.core.vllm.vllm_build" class TestVllmBuildParameters(unittest.TestCase): - @patch("cli.lib.core.vllm.local_image_exists", return_value=True) - @patch("cli.lib.core.vllm.is_path_exist", return_value=True) + @patch(f"{_VLLM_BUILD_MODULE}.local_image_exists", return_value=True) + @patch(f"{_VLLM_BUILD_MODULE}.is_path_exist", return_value=True) @patch( "cli.lib.common.envs_helper.env_path_optional", side_effect=lambda name, default=None, resolve=True: { @@ -34,13 +37,13 @@ class TestVllmBuildParameters(unittest.TestCase): def test_params_success_normalizes_and_validates( self, mock_env_path, mock_is_path, mock_local_img ): - params = vllm.VllmBuildParameters() + params = vllm_build.VllmBuildParameters() self.assertEqual(params.torch_whls_path, Path("/abs/dist")) self.assertEqual(params.dockerfile_path, Path("/abs/vllm/Dockerfile")) self.assertEqual(params.output_dir, Path("/abs/shared")) self.assertEqual(params.base_image, "my/image:tag") - @patch("cli.lib.core.vllm.is_path_exist", return_value=False) + @patch(f"{_VLLM_BUILD_MODULE}.is_path_exist", return_value=False) @patch.dict( os.environ, {"USE_TORCH_WHEEL": "1", "TORCH_WHEELS_PATH": "dist"}, clear=True ) @@ -48,14 +51,14 @@ def test_params_missing_torch_whls_raises(self, _is_path): with tempfile.TemporaryDirectory() as td: os.chdir(td) with self.assertRaises(ValueError) as cm: - vllm.VllmBuildParameters( + vllm_build.VllmBuildParameters( use_local_base_image=False, use_local_dockerfile=False, ) err = cm.exception self.assertIn("TORCH_WHEELS_PATH", str(err)) - @patch("cli.lib.core.vllm.local_image_exists", return_value=False) + @patch(f"{_VLLM_BUILD_MODULE}.local_image_exists", return_value=False) @patch.dict( os.environ, {"USE_LOCAL_BASE_IMAGE": "1", "BASE_IMAGE": "img:tag"}, clear=True ) @@ -63,14 +66,14 @@ def test_params_missing_local_base_image_raises(self, _local_img): with tempfile.TemporaryDirectory() as td: os.chdir(td) with self.assertRaises(ValueError) as cm: - vllm.VllmBuildParameters( + 
vllm_build.VllmBuildParameters( use_torch_whl=False, use_local_dockerfile=False, ) err = cm.exception self.assertIn("BASE_IMAGE", str(err)) - @patch("cli.lib.core.vllm.is_path_exist", return_value=False) + @patch(f"{_VLLM_BUILD_MODULE}.is_path_exist", return_value=False) @patch.dict( os.environ, {"USE_LOCAL_DOCKERFILE": "1", "DOCKERFILE_PATH": "Dockerfile"}, @@ -80,14 +83,14 @@ def test_params_missing_dockerfile_raises(self, _is_path): with tempfile.TemporaryDirectory() as td: os.chdir(td) with self.assertRaises(ValueError) as cm: - vllm.VllmBuildParameters( + vllm_build.VllmBuildParameters( use_torch_whl=False, use_local_base_image=False, ) err = cm.exception self.assertIn("DOCKERFILE_PATH", str(err)) - @patch("cli.lib.core.vllm.is_path_exist", return_value=False) + @patch(f"{_VLLM_BUILD_MODULE}.is_path_exist", return_value=False) @patch.dict( os.environ, {"OUTPUT_DIR": ""}, @@ -95,14 +98,13 @@ def test_params_missing_dockerfile_raises(self, _is_path): ) def test_params_missing_output_dir(self, _is_path): with self.assertRaises(FileNotFoundError): - vllm.VllmBuildParameters() + vllm_build.VllmBuildParameters() class TestBuildCmdAndRun(unittest.TestCase): - @patch("cli.lib.core.vllm.local_image_exists", return_value=True) + @patch(f"{_VLLM_BUILD_MODULE}.local_image_exists", return_value=True) def test_generate_docker_build_cmd_includes_bits(self, _exists): - runner = vllm.VllmBuildRunner() - # Craft inputs that simulate a prepared build + runner = vllm_build.VllmBuildRunner() inputs = MagicMock() inputs.output_dir = Path("/abs/out") inputs.use_local_base_image = True @@ -118,7 +120,7 @@ def test_generate_docker_build_cmd_includes_bits(self, _exists): inputs.tag_name = "vllm-wheels" cmd = runner._generate_docker_build_cmd(inputs) - squashed = " ".join(cmd.split()) # normalize whitespace for matching + squashed = " ".join(cmd.split()) self.assertIn("--output type=local,dest=/abs/out", squashed) self.assertIn("-f docker/Dockerfile.nightly_torch", squashed) @@ -136,18 +138,17 @@ def test_generate_docker_build_cmd_includes_bits(self, _exists): self.assertIn("--target export-wheels", squashed) self.assertIn("-t vllm-wheels", squashed) - @patch("cli.lib.core.vllm.run_command") - @patch("cli.lib.core.vllm.ensure_dir_exists") - @patch("cli.lib.core.vllm.clone_vllm") + @patch(f"{_VLLM_BUILD_MODULE}.run_command") + @patch(f"{_VLLM_BUILD_MODULE}.ensure_dir_exists") + @patch(f"{_VLLM_BUILD_MODULE}.clone_vllm") @patch.object( - vllm.VllmBuildRunner, + vllm_build.VllmBuildRunner, "_generate_docker_build_cmd", return_value="docker buildx ...", ) @patch.dict( os.environ, { - # Make __post_init__ validations pass cheaply "USE_TORCH_WHEEL": "0", "USE_LOCAL_BASE_IMAGE": "0", "USE_LOCAL_DOCKERFILE": "0", @@ -158,24 +159,18 @@ def test_generate_docker_build_cmd_includes_bits(self, _exists): def test_run_calls_clone_prepare_and_build( self, mock_gen, mock_clone, mock_ensure, mock_run ): - # Stub parameters instance so we avoid FS/Docker accesses in run() params = MagicMock() params.output_dir = Path("shared") params.use_local_dockerfile = False params.use_torch_whl = False - with patch("cli.lib.core.vllm.VllmBuildParameters", return_value=params): - runner = vllm.VllmBuildRunner() + with patch(f"{_VLLM_BUILD_MODULE}.VllmBuildParameters", return_value=params): + runner = vllm_build.VllmBuildRunner() runner.run() mock_clone.assert_called_once() mock_ensure.assert_called_once_with(Path("shared")) mock_gen.assert_called_once_with(params) mock_run.assert_called_once() - # ensure we run in vllm workdir _, kwargs = 
mock_run.call_args assert kwargs.get("cwd") == "vllm" - - -if __name__ == "__main__": - unittest.main() diff --git a/.ci/manywheel/build_cuda.sh b/.ci/manywheel/build_cuda.sh index d6b1efb8a7831..3fbd25be1da3d 100644 --- a/.ci/manywheel/build_cuda.sh +++ b/.ci/manywheel/build_cuda.sh @@ -66,6 +66,9 @@ case ${CUDA_VERSION} in TORCH_CUDA_ARCH_LIST="7.5;8.0;9.0;10.0;12.0+PTX" fi ;; + 13.0) + TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX" + ;; 12.6) TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0" ;; @@ -110,11 +113,15 @@ DEPS_SONAME=( ) -# CUDA_VERSION 12.6, 12.8, 12.9 -if [[ $CUDA_VERSION == 12* ]]; then +# CUDA_VERSION 12.*, 13.* +if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then export USE_STATIC_CUDNN=0 # Try parallelizing nvcc as well - export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2" + TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2" + # Compress the fatbin with -compress-mode=size for CUDA 13 + if [[ $CUDA_VERSION == 13* ]]; then + export TORCH_NVCC_FLAGS="$TORCH_NVCC_FLAGS -compress-mode=size" + fi if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then echo "Bundling with cudnn and cublas." DEPS_LIST+=( @@ -167,22 +174,29 @@ if [[ $CUDA_VERSION == 12* ]]; then else echo "Using nvidia libs from pypi." CUDA_RPATHS=( - '$ORIGIN/../../nvidia/cublas/lib' - '$ORIGIN/../../nvidia/cuda_cupti/lib' - '$ORIGIN/../../nvidia/cuda_nvrtc/lib' - '$ORIGIN/../../nvidia/cuda_runtime/lib' '$ORIGIN/../../nvidia/cudnn/lib' - '$ORIGIN/../../nvidia/cufft/lib' - '$ORIGIN/../../nvidia/curand/lib' - '$ORIGIN/../../nvidia/cusolver/lib' - '$ORIGIN/../../nvidia/cusparse/lib' - '$ORIGIN/../../nvidia/cusparselt/lib' - '$ORIGIN/../../cusparselt/lib' - '$ORIGIN/../../nvidia/nccl/lib' '$ORIGIN/../../nvidia/nvshmem/lib' - '$ORIGIN/../../nvidia/nvtx/lib' - '$ORIGIN/../../nvidia/cufile/lib' + '$ORIGIN/../../nvidia/nccl/lib' + '$ORIGIN/../../nvidia/cusparselt/lib' ) + if [[ $CUDA_VERSION == 13* ]]; then + CUDA_RPATHS+=('$ORIGIN/../../nvidia/cu13/lib') + else + CUDA_RPATHS+=( + '$ORIGIN/../../nvidia/cublas/lib' + '$ORIGIN/../../nvidia/cuda_cupti/lib' + '$ORIGIN/../../nvidia/cuda_nvrtc/lib' + '$ORIGIN/../../nvidia/cuda_runtime/lib' + '$ORIGIN/../../nvidia/cufft/lib' + '$ORIGIN/../../nvidia/curand/lib' + '$ORIGIN/../../nvidia/cusolver/lib' + '$ORIGIN/../../nvidia/cusparse/lib' + '$ORIGIN/../../cusparselt/lib' + '$ORIGIN/../../nvidia/nvtx/lib' + '$ORIGIN/../../nvidia/cufile/lib' + ) + fi + CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}") export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib' export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN' diff --git a/.ci/pytorch/build.sh b/.ci/pytorch/build.sh index 444e129ea1849..1c88554c2af96 100755 --- a/.ci/pytorch/build.sh +++ b/.ci/pytorch/build.sh @@ -195,8 +195,16 @@ fi # We only build FlashAttention files for CUDA 8.0+, and they require large amounts of # memory to build and will OOM + if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && echo "${TORCH_CUDA_ARCH_LIST}" | tr ' ' '\n' | sed 's/$/>= 8.0/' | bc | grep -q 1; then - export BUILD_CUSTOM_STEP="ninja -C build flash_attention -j 2" + J=2 # default to 2 jobs + case "$RUNNER" in + linux.12xlarge.memory|linux.24xlarge.memory) + J=24 + ;; + esac + echo "Building FlashAttention with job limit $J" + export BUILD_CUSTOM_STEP="ninja -C build flash_attention -j ${J}" fi if [[ "${BUILD_ENVIRONMENT}" == *clang* ]]; then diff --git a/.ci/pytorch/common_utils.sh b/.ci/pytorch/common_utils.sh index f1d30700b998d..d8cbd12cb5daf 100644 --- a/.ci/pytorch/common_utils.sh +++ b/.ci/pytorch/common_utils.sh @@ -152,9 +152,7 @@ 
function get_pinned_commit() { function install_torchaudio() { local commit commit=$(get_pinned_commit audio) - # TODO (huydhn): PyTorch CI docker image set the default TORCH_CUDA_ARCH_LIST - # to Maxwell. This default doesn't make sense anymore and should be cleaned up - if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then + if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]] && command -v nvidia-smi; then TORCH_CUDA_ARCH_LIST=$(nvidia-smi --query-gpu=compute_cap --format=csv | tail -n 1) export TORCH_CUDA_ARCH_LIST fi diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh index daa258d283fa3..5a82ec2fa85ee 100755 --- a/.ci/pytorch/test.sh +++ b/.ci/pytorch/test.sh @@ -1629,6 +1629,14 @@ elif [[ "${TEST_CONFIG}" == *xla* ]]; then install_torchvision build_xla test_xla +elif [[ "$TEST_CONFIG" == *vllm* ]]; then + if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then + TORCH_CUDA_ARCH_LIST=$(nvidia-smi --query-gpu=compute_cap --format=csv | tail -n 1) + export TORCH_CUDA_ARCH_LIST + fi + echo "VLLM CI TORCH_CUDA_ARCH_LIST: $TORCH_CUDA_ARCH_LIST" + (cd .ci/lumen_cli && python -m pip install -e .) + python -m cli.run test external vllm --test-plan "$TEST_CONFIG" --shard-id "$SHARD_NUMBER" --num-shards "$NUM_TEST_SHARDS" elif [[ "${TEST_CONFIG}" == *executorch* ]]; then test_executorch elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then diff --git a/.ci/pytorch/windows/cuda130.bat b/.ci/pytorch/windows/cuda130.bat new file mode 100644 index 0000000000000..f38cd789f2da6 --- /dev/null +++ b/.ci/pytorch/windows/cuda130.bat @@ -0,0 +1,59 @@ +@echo off + +set MODULE_NAME=pytorch + +IF NOT EXIST "setup.py" IF NOT EXIST "%MODULE_NAME%" ( + call internal\clone.bat + cd %~dp0 +) ELSE ( + call internal\clean.bat +) +IF ERRORLEVEL 1 goto :eof + +call internal\check_deps.bat +IF ERRORLEVEL 1 goto :eof + +REM Check for optional components + +set USE_CUDA= +set CMAKE_GENERATOR=Visual Studio 15 2017 Win64 + +IF "%NVTOOLSEXT_PATH%"=="" ( + IF EXIST "C:\Program Files\NVIDIA Corporation\NvToolsExt\lib\x64\nvToolsExt64_1.lib" ( + set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt + ) ELSE ( + echo NVTX ^(Visual Studio Extension ^for CUDA^) ^not installed, failing + exit /b 1 + ) +) + +IF "%CUDA_PATH_V130%"=="" ( + IF EXIST "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\bin\nvcc.exe" ( + set "CUDA_PATH_V130=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0" + ) ELSE ( + echo CUDA 13.0 not found, failing + exit /b 1 + ) +) + +IF "%BUILD_VISION%" == "" ( + set TORCH_CUDA_ARCH_LIST=7.5;8.0;8.6;9.0;10.0;12.0 + set TORCH_NVCC_FLAGS=-Xfatbin -compress-all +) ELSE ( + set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120 +) + +set "CUDA_PATH=%CUDA_PATH_V130%" +set "PATH=%CUDA_PATH_V130%\bin;%PATH%" + +:optcheck + +call internal\check_opts.bat +IF ERRORLEVEL 1 goto :eof + +if exist "%NIGHTLIES_PYTORCH_ROOT%" cd %NIGHTLIES_PYTORCH_ROOT%\.. 
+call %~dp0\internal\copy.bat +IF ERRORLEVEL 1 goto :eof + +call %~dp0\internal\setup.bat +IF ERRORLEVEL 1 goto :eof diff --git a/.ci/pytorch/windows/internal/cuda_install.bat b/.ci/pytorch/windows/internal/cuda_install.bat index a0eb650f8506a..1349d3e661f55 100644 --- a/.ci/pytorch/windows/internal/cuda_install.bat +++ b/.ci/pytorch/windows/internal/cuda_install.bat @@ -26,6 +26,7 @@ if exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR% if %CUDA_VER% EQU 126 goto cuda126 if %CUDA_VER% EQU 128 goto cuda128 if %CUDA_VER% EQU 129 goto cuda129 +if %CUDA_VER% EQU 130 goto cuda130 echo CUDA %CUDA_VERSION_STR% is not supported exit /b 1 @@ -113,6 +114,33 @@ xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32" goto cuda_common +:cuda130 + +set CUDA_INSTALL_EXE=cuda_13.0.0_windows.exe +if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" ( + curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" & REM @lint-ignore + if errorlevel 1 exit /b 1 + set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" + set "ARGS=" +) + +set CUDNN_FOLDER=cudnn-windows-x86_64-9.12.0.46_cuda13-archive +set CUDNN_LIB_FOLDER="lib" +set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip" +if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ( + curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" & REM @lint-ignore + if errorlevel 1 exit /b 1 + set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" +) + +@REM cuDNN 8.3+ required zlib to be installed on the path +echo Installing ZLIB dlls +curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip" +7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib" +xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32" + +goto cuda_common + :cuda_common :: NOTE: We only install CUDA if we don't have it installed already. 
:: With GHA runners these should be pre-installed as part of our AMI process diff --git a/.circleci/scripts/binary_upload.sh b/.circleci/scripts/binary_upload.sh index 6c4aa8bee1dfd..d48077e112455 100755 --- a/.circleci/scripts/binary_upload.sh +++ b/.circleci/scripts/binary_upload.sh @@ -51,16 +51,12 @@ s3_upload() { s3_upload_dir="${s3_root_dir}/${UPLOAD_SUBFOLDER}/" fi ( - cache_control_flag="" - if [[ "${UPLOAD_CHANNEL}" = "test" ]]; then - cache_control_flag="--cache-control='no-cache,no-store,must-revalidate'" - fi for pkg in ${PKG_DIR}/*.${extension}; do ( set -x shm_id=$(sha256sum "${pkg}" | awk '{print $1}') ${AWS_S3_CP} --no-progress --acl public-read "${pkg}" "${s3_upload_dir}" \ - --metadata "checksum-sha256=${shm_id}" ${cache_control_flag} + --metadata "checksum-sha256=${shm_id}" ) done ) diff --git a/.github/ci_commit_pins/audio.txt b/.github/ci_commit_pins/audio.txt index 97b485c73751e..0b9c14848239c 100644 --- a/.github/ci_commit_pins/audio.txt +++ b/.github/ci_commit_pins/audio.txt @@ -1 +1 @@ -a645da617ed8836727cf9c28944d87154700d360 +10a5002c6195bd95e34df8fe28ff8a2d55a2a922 diff --git a/.github/ci_commit_pins/vllm.txt b/.github/ci_commit_pins/vllm.txt index adf39fa0fb40c..80c5a90c7be99 100644 --- a/.github/ci_commit_pins/vllm.txt +++ b/.github/ci_commit_pins/vllm.txt @@ -1 +1 @@ -bbea1cefdd1a29b53355b1655f5d2ae343921f85 +add1adfec742dfb13e614dab3372b5aafd1ff046 diff --git a/.github/ci_configs/vllm/Dockerfile.tmp_vllm b/.github/ci_configs/vllm/Dockerfile.tmp_vllm index a54daa74c3a9b..330a78424feed 100644 --- a/.github/ci_configs/vllm/Dockerfile.tmp_vllm +++ b/.github/ci_configs/vllm/Dockerfile.tmp_vllm @@ -67,6 +67,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ENV UV_HTTP_TIMEOUT=500 ENV UV_INDEX_STRATEGY="unsafe-best-match" +# Use copy mode to avoid hardlink failures with Docker cache mounts +ENV UV_LINK_MODE=copy #################### TORCH NIGHTLY BASE IMAGE #################### @@ -90,6 +92,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \ fi ENV UV_HTTP_TIMEOUT=500 ENV UV_INDEX_STRATEGY="unsafe-best-match" +# Use copy mode to avoid hardlink failures with Docker cache mounts +ENV UV_LINK_MODE=copy WORKDIR /workspace @@ -112,6 +116,7 @@ ARG PINNED_TORCH_VERSION RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \ --mount=type=cache,target=/root/.cache/uv \ if [ -n "$TORCH_WHEELS_PATH" ] && [ "$TORCH_WHEELS_PATH" != "./requirements" ] && [ -d "/dist" ] && ls /dist/torch*.whl >/dev/null 2>&1; then \ + echo "[INFO] Installing torch wheels to build vllm"; \ torch_whl=$(find /dist -maxdepth 1 -name 'torch-*.whl' -print -quit); \ vision_whl=$(find /dist/vision -name 'torchvision*.whl' | head -n1 | xargs); \ audio_whl=$(find /dist/audio -name 'torchaudio*.whl' | head -n1 | xargs); \ @@ -119,10 +124,10 @@ RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \ uv pip install --system "${vision_whl}"; \ uv pip install --system "${audio_whl}"; \ elif [ -n "$PINNED_TORCH_VERSION" ]; then \ - echo "[INFO] Installing pinned torch nightly version: $PINNED_TORCH_VERSION"; \ + echo "[INFO] Installing pinned torch nightly version to build vllm: $PINNED_TORCH_VERSION"; \ uv pip install --system "$PINNED_TORCH_VERSION" --index-url https://download.pytorch.org/whl/nightly/cu128; \ else \ - echo "[INFO] Installing torch nightly with latest one"; \ + echo "[INFO] Installing torch nightly with latest one to build vllm"; \ uv pip install --system torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128; \ fi @@ -136,15 +141,22 @@ uv 
pip install --system -r requirements/common.txt # Must put before installing xformers, so it can install the correct version of xfomrers. -ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0' -ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} +ARG exformer_cuda_arch_list='7.5;8.0+PTX;9.0a' +ENV TORCH_CUDA_ARCH_LIST=${exformer_cuda_arch_list} + ARG max_jobs=16 ENV MAX_JOBS=${max_jobs} +RUN echo ${TORCH_CUDA_ARCH_LIST} +RUN echo ${MAX_JOBS} +RUN pip freeze | grep -E 'ninja' + # Build xformers with cuda and torch nightly/wheel # following official xformers guidance: https://github.com/facebookresearch/xformers#build -ARG XFORMERS_COMMIT=f2de641ef670510cadab099ce6954031f52f191c +# sha for https://github.com/facebookresearch/xformers/tree/v0.0.31 +ARG XFORMERS_COMMIT=eb0946a363464da96ea40afd1a7f72a907c25497 ENV CCACHE_DIR=/root/.cache/ccache + RUN --mount=type=cache,target=/root/.cache/ccache \ --mount=type=cache,target=/root/.cache/uv \ echo 'git clone xformers...' \ @@ -157,14 +169,14 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ && python3 setup.py bdist_wheel --dist-dir=../xformers-dist --verbose \ && cd .. \ && rm -rf xformers + RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system xformers-dist/*.whl --verbose # Build can take a long time, and the torch nightly version fetched from url can be different in next docker stage. # track the nightly torch version used in the build, when we set up runtime environment we can make sure the version is the same RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt -RUN cat torch_build_versions.txt - +RUN cat torch_build_versions.txt RUN pip freeze | grep -E 'torch|xformers|torchvision|torchaudio' #################### BASE BUILD IMAGE #################### @@ -177,6 +189,8 @@ ARG TARGETPLATFORM ENV UV_HTTP_TIMEOUT=500 ENV UV_INDEX_STRATEGY="unsafe-best-match" +# Use copy mode to avoid hardlink failures with Docker cache mounts +ENV UV_LINK_MODE=copy COPY . . 
@@ -192,7 +206,7 @@ RUN --mount=type=bind,source=.git,target=.git \ # Max jobs used by Ninja to build extensions ARG max_jobs=16 ENV MAX_JOBS=${max_jobs} -ARG nvcc_threads=2 +ARG nvcc_threads=4 ENV NVCC_THREADS=$nvcc_threads ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0' ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} @@ -216,11 +230,14 @@ RUN --mount=type=cache,target=/root/.cache/uv \ && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \ && export SCCACHE_IDLE_TIMEOUT=0 \ && export CMAKE_BUILD_TYPE=Release \ + && export VLLM_DOCKER_BUILD_CONTEXT=1 \ && sccache --show-stats \ && python3 setup.py bdist_wheel --dist-dir=vllm-dist --py-limited-api=cp38 \ && sccache --show-stats; \ fi +ARG vllm_target_device="cuda" +ENV VLLM_TARGET_DEVICE=${vllm_target_device} ENV CCACHE_DIR=/root/.cache/ccache RUN --mount=type=cache,target=/root/.cache/ccache \ --mount=type=cache,target=/root/.cache/uv \ @@ -229,6 +246,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ # Clean any existing CMake artifacts rm -rf .deps && \ mkdir -p .deps && \ + export VLLM_DOCKER_BUILD_CONTEXT=1 && \ python3 setup.py bdist_wheel --dist-dir=vllm-dist --py-limited-api=cp38; \ fi @@ -296,6 +314,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \ fi ENV UV_HTTP_TIMEOUT=500 ENV UV_INDEX_STRATEGY="unsafe-best-match" +# Use copy mode to avoid hardlink failures with Docker cache mounts +ENV UV_LINK_MODE=copy # Default mount file as placeholder, this just avoid the mount error ARG TORCH_WHEELS_PATH="./requirements" @@ -308,7 +328,7 @@ RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \ torch_whl=$(find /dist -maxdepth 1 -name 'torch-*.whl' -print -quit); \ vision_whl=$(find /dist/vision -name 'torchvision*.whl' | head -n1 | xargs); \ audio_whl=$(find /dist/audio -name 'torchaudio*.whl' | head -n1 | xargs); \ - echo "Found: '${torch_whl}' '${audio_whl}' '${vision_whl}'"; \ + echo "[INFO] Use wheels to build : '${torch_whl}' '${audio_whl}' '${vision_whl}'"; \ uv pip install --system "${torch_whl}[opt-einsum]"; \ uv pip install --system "${vision_whl}"; \ uv pip install --system "${audio_whl}"; \ @@ -364,6 +384,8 @@ FROM vllm-base as test ENV UV_HTTP_TIMEOUT=500 ENV UV_INDEX_STRATEGY="unsafe-best-match" +# Use copy mode to avoid hardlink failures with Docker cache mounts +ENV UV_LINK_MODE=copy COPY tests/ tests/ COPY examples examples diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py index 21e1c0bab2121..82ec085b11be8 100644 --- a/.github/scripts/generate_binary_build_matrix.py +++ b/.github/scripts/generate_binary_build_matrix.py @@ -16,17 +16,19 @@ # NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this -CUDA_ARCHES = ["12.6", "12.8", "12.9"] +CUDA_ARCHES = ["12.6", "12.8", "12.9", "13.0"] CUDA_STABLE = "12.8" CUDA_ARCHES_FULL_VERSION = { "12.6": "12.6.3", "12.8": "12.8.1", "12.9": "12.9.1", + "13.0": "13.0.0", } CUDA_ARCHES_CUDNN_VERSION = { "12.6": "9", "12.8": "9", "12.9": "9", + "13.0": "9", } # NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this @@ -93,6 +95,23 @@ "nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'" ), + "13.0": ( + "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | " 
+ "nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvshmem-cu13==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'" + ), "xpu": ( "intel-cmplr-lib-rt==2025.1.1 | " "intel-cmplr-lib-ur==2025.1.1 | " @@ -124,9 +143,7 @@ def get_nccl_wheel_version(arch_version: str) -> str: requirements = map( str.strip, re.split("[;|]", PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version]) ) - return next(x for x in requirements if x.startswith("nvidia-nccl-cu")).split("==")[ - 1 - ] + return next(x for x in requirements if x.startswith("nvidia-nccl")).split("==")[1] def read_nccl_pin(arch_version: str) -> str: @@ -223,8 +240,12 @@ def generate_libtorch_matrix( if os == "linux": arches += CUDA_ARCHES arches += ROCM_ARCHES + if "13.0" in arches: + arches.remove("13.0") elif os == "windows": arches += CUDA_ARCHES + if "13.0" in arches: + arches.remove("13.0") if libtorch_variants is None: libtorch_variants = [ "shared-with-deps", @@ -289,6 +310,8 @@ def generate_wheels_matrix( arches += CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES elif os == "windows": arches += CUDA_ARCHES + XPU_ARCHES + if "13.0" in arches: + arches.remove("13.0") elif os == "linux-aarch64": # Separate new if as the CPU type is different and # uses different build/test scripts @@ -323,7 +346,7 @@ def generate_wheels_matrix( # cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install if ( - arch_version in ["12.9", "12.8", "12.6"] + arch_version in ["13.0", "12.9", "12.8", "12.6"] and os == "linux" or arch_version in CUDA_AARCH64_ARCHES ): @@ -386,6 +409,7 @@ def generate_wheels_matrix( return ret +validate_nccl_dep_consistency("13.0") validate_nccl_dep_consistency("12.9") validate_nccl_dep_consistency("12.8") validate_nccl_dep_consistency("12.6") diff --git a/.github/scripts/windows/build_magma.bat b/.github/scripts/windows/build_magma.bat index 0f11fe34068eb..75c916ecdbef7 100644 --- a/.github/scripts/windows/build_magma.bat +++ b/.github/scripts/windows/build_magma.bat @@ -35,6 +35,9 @@ cd magma mkdir build && cd build set GPU_TARGET=All +if "%CUVER_NODOT%" == "130" ( + set CUDA_ARCH_LIST=-gencode=arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120 +) if "%CUVER_NODOT%" == "129" ( set CUDA_ARCH_LIST=-gencode=arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode 
arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120 ) diff --git a/.github/scripts/windows/build_triton.bat b/.github/scripts/windows/build_triton.bat index 1c2d260cde6bf..d26dc8bf3b198 100644 --- a/.github/scripts/windows/build_triton.bat +++ b/.github/scripts/windows/build_triton.bat @@ -1,18 +1,12 @@ @echo on -set PYTHON_PREFIX=%PY_VERS:.=% -set PYTHON_PREFIX=py%PYTHON_PREFIX:;=;py% -call .ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat -:: Create a new conda environment -if "%PY_VERS%" == "3.13t" ( - call conda create -n %PYTHON_PREFIX% -y -c=conda-forge python-freethreading python=3.13 -) else ( - call conda create -n %PYTHON_PREFIX% -y -c=conda-forge python=%PY_VERS% -) +set DESIRED_PYTHON=%PY_VERS% +call .ci/pytorch/windows/internal/install_python.bat + :: Fix cmake version for issue https://github.com/pytorch/pytorch/issues/150480 -call conda run -n %PYTHON_PREFIX% pip install wheel pybind11 certifi cython cmake==3.31.6 setuptools==72.1.0 ninja==1.11.1.4 +%PYTHON_EXEC% -m pip install wheel pybind11 certifi cython cmake==3.31.6 setuptools==72.1.0 ninja==1.11.1.4 dir "%VC_INSTALL_PATH%" call "%VC_INSTALL_PATH%\VC\Auxiliary\Build\vcvarsall.bat" x64 -call conda run -n %PYTHON_PREFIX% python .github/scripts/build_triton_wheel.py --device=%BUILD_DEVICE% %RELEASE% +%PYTHON_EXEC% .github/scripts/build_triton_wheel.py --device=%BUILD_DEVICE% %RELEASE% diff --git a/.github/templates/linux_binary_build_workflow.yml.j2 b/.github/templates/linux_binary_build_workflow.yml.j2 index b14a13f3f90c2..e0998e46fb5f6 100644 --- a/.github/templates/linux_binary_build_workflow.yml.j2 +++ b/.github/templates/linux_binary_build_workflow.yml.j2 @@ -114,12 +114,12 @@ jobs: ALPINE_IMAGE: "docker.io/s390x/alpine" {%- elif config["gpu_arch_type"] == "rocm" %} runs_on: linux.rocm.gpu - {%- elif config["gpu_arch_type"] == "cuda" and config["gpu_arch_version"] in ["12.8", "12.9"] %} + {%- elif config["gpu_arch_type"] == "cuda" and config["gpu_arch_version"] in ["12.6"] %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner {%- elif config["gpu_arch_type"] == "cuda" %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner {%- else %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.4xlarge diff --git a/.github/templates/upload.yml.j2 b/.github/templates/upload.yml.j2 index 763784f5f3e1e..1039a6214a7af 100644 --- a/.github/templates/upload.yml.j2 +++ b/.github/templates/upload.yml.j2 @@ -15,7 +15,7 @@ # favor of GPU_ARCH_VERSION DESIRED_CUDA: !{{ config["desired_cuda"] }} {%- if config["gpu_arch_version"] %} - GPU_ARCH_VERSION: !{{ config["gpu_arch_version"] }} + GPU_ARCH_VERSION: "!{{ config["gpu_arch_version"] }}" {%- endif %} GPU_ARCH_TYPE: !{{ config["gpu_arch_type"] }} {%- if include_skip_tests %} diff --git a/.github/workflows/_binary-test-linux.yml b/.github/workflows/_binary-test-linux.yml index 476dd182db0f8..2d9e4d0e27b25 100644 --- a/.github/workflows/_binary-test-linux.yml +++ b/.github/workflows/_binary-test-linux.yml @@ -187,6 +187,8 @@ jobs: - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG uses: 
pytorch/test-infra/.github/actions/setup-nvidia@main + with: + driver-version: ${{ startsWith(inputs.GPU_ARCH_VERSION, '13') && '580.65.06' || '570.133.07' }} if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }} - name: configure aws credentials diff --git a/.github/workflows/_linux-build.yml b/.github/workflows/_linux-build.yml index ae4c5e802c61d..6b4bd429e3c9f 100644 --- a/.github/workflows/_linux-build.yml +++ b/.github/workflows/_linux-build.yml @@ -128,7 +128,7 @@ jobs: # Don't run on forked repos if: github.repository_owner == 'pytorch' runs-on: ${{ inputs.runner_prefix}}${{ inputs.runner }} - timeout-minutes: 240 + timeout-minutes: 480 outputs: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} test-matrix: ${{ steps.filter.outputs.test-matrix }} @@ -269,6 +269,7 @@ jobs: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} BUILD_ADDITIONAL_PACKAGES: ${{ inputs.build-additional-packages }} + RUNNER: ${{ inputs.runner }} run: | START_TIME=$(date +%s) if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then @@ -340,6 +341,7 @@ jobs: -e HUGGING_FACE_HUB_TOKEN \ -e SCRIBE_GRAPHQL_ACCESS_TOKEN \ -e BUILD_ADDITIONAL_PACKAGES \ + -e RUNNER \ --memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \ --memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml index 07be3720b2bf2..52e1f1bbe9563 100644 --- a/.github/workflows/_linux-test.yml +++ b/.github/workflows/_linux-test.yml @@ -72,6 +72,10 @@ on: required: false description: | HF Auth token to avoid rate limits when downloading models or datasets from hub + VLLM_TEST_HUGGING_FACE_TOKEN: + required: false + description: | + HF Auth token to test vllm SCRIBE_GRAPHQL_ACCESS_TOKEN: required: false description: | @@ -286,6 +290,7 @@ jobs: PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }} PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }} DASHBOARD_TAG: ${{ inputs.dashboard-tag }} + VLLM_TEST_HUGGING_FACE_TOKEN: ${{ secrets.VLLM_TEST_HUGGING_FACE_TOKEN }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} ARTIFACTS_FILE_SUFFIX: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} @@ -362,6 +367,7 @@ jobs: -e PYTORCH_TEST_RERUN_DISABLED_TESTS \ -e SKIP_SCCACHE_INITIALIZATION=1 \ -e HUGGING_FACE_HUB_TOKEN \ + -e VLLM_TEST_HUGGING_FACE_TOKEN \ -e SCRIBE_GRAPHQL_ACCESS_TOKEN \ -e DASHBOARD_TAG \ -e ARTIFACTS_FILE_SUFFIX \ diff --git a/.github/workflows/build-libtorch-images.yml b/.github/workflows/build-libtorch-images.yml index b2d50efd7d96c..cc2f54fc45f84 100644 --- a/.github/workflows/build-libtorch-images.yml +++ b/.github/workflows/build-libtorch-images.yml @@ -48,6 +48,7 @@ jobs: fail-fast: false matrix: include: [ + { tag: "cuda13.0" }, { tag: "cuda12.9" }, { tag: "cuda12.8" }, { tag: "cuda12.6" }, diff --git a/.github/workflows/build-magma-windows.yml b/.github/workflows/build-magma-windows.yml index 80d870f419e42..b7d293a5cec11 100644 --- a/.github/workflows/build-magma-windows.yml +++ b/.github/workflows/build-magma-windows.yml @@ -22,7 +22,7 @@ jobs: runs-on: windows-2022 strategy: matrix: - cuda_version: ["129", "128", "126"] + cuda_version: ["130", "129", "128", 
"126"] config: ["Release", "Debug"] env: CUDA_VERSION: ${{ matrix.cuda_version }} diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml index e84b84f6158ba..ce42d5644c936 100644 --- a/.github/workflows/build-manywheel-images.yml +++ b/.github/workflows/build-manywheel-images.yml @@ -46,9 +46,11 @@ jobs: fail-fast: false matrix: include: [ + { name: "manylinux2_28-builder", tag: "cuda13.0", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "cuda12.9", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "cuda12.8", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "cuda12.6", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinuxaarch64-builder", tag: "cuda13.0", runner: "linux.arm64.2xlarge.ephemeral" }, { name: "manylinuxaarch64-builder", tag: "cuda12.9", runner: "linux.arm64.2xlarge.ephemeral" }, { name: "manylinuxaarch64-builder", tag: "cuda12.8", runner: "linux.arm64.2xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "rocm6.3", runner: "linux.9xlarge.ephemeral" }, diff --git a/.github/workflows/build-triton-wheel.yml b/.github/workflows/build-triton-wheel.yml index d54f459d0b43e..e0f1027b8a194 100644 --- a/.github/workflows/build-triton-wheel.yml +++ b/.github/workflows/build-triton-wheel.yml @@ -194,7 +194,7 @@ jobs: strategy: fail-fast: false matrix: - py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t" ] + py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ] device: ["xpu"] timeout-minutes: 40 env: diff --git a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml index ebf1137d3df22..36400e75a9368 100644 --- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml @@ -122,7 +122,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -148,7 +148,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -233,7 +233,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -259,7 +259,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -344,7 +344,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -370,7 +370,7 
@@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -455,7 +455,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -481,7 +481,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -566,7 +566,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -592,7 +592,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -677,7 +677,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -703,7 +703,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -788,7 +788,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -814,7 +814,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -899,7 +899,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -925,7 +925,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 diff --git 
a/.github/workflows/generated-linux-binary-libtorch-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-nightly.yml index 9f4a8194d2874..776e77e808263 100644 --- a/.github/workflows/generated-linux-binary-libtorch-nightly.yml +++ b/.github/workflows/generated-linux-binary-libtorch-nightly.yml @@ -122,7 +122,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -145,7 +145,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -154,7 +154,7 @@ jobs: build_name: libtorch-cuda12_6-shared-with-deps-release build_environment: linux-binary-libtorch runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} libtorch-cuda12_6-shared-with-deps-release-upload: # Uploading @@ -169,7 +169,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -190,7 +190,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -213,7 +213,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -222,7 +222,7 @@ jobs: build_name: libtorch-cuda12_8-shared-with-deps-release build_environment: linux-binary-libtorch runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} libtorch-cuda12_8-shared-with-deps-release-upload: # Uploading @@ -237,7 +237,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -258,7 +258,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -281,7 +281,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: 
cuda12.9 @@ -290,7 +290,7 @@ jobs: build_name: libtorch-cuda12_9-shared-with-deps-release build_environment: linux-binary-libtorch runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} libtorch-cuda12_9-shared-with-deps-release-upload: # Uploading @@ -305,7 +305,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -326,7 +326,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 @@ -350,7 +350,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: libtorch-cxx11-builder @@ -419,7 +419,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 @@ -440,7 +440,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 @@ -464,7 +464,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: libtorch-cxx11-builder @@ -533,7 +533,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 diff --git a/.github/workflows/generated-linux-binary-manywheel-main.yml b/.github/workflows/generated-linux-binary-manywheel-main.yml index 97c507f80284f..ec08b2c78eb67 100644 --- a/.github/workflows/generated-linux-binary-manywheel-main.yml +++ b/.github/workflows/generated-linux-binary-manywheel-main.yml @@ -52,7 +52,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -75,7 +75,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -83,6 +83,6 @@ jobs: build_name: manywheel-py3_12-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: 
linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml index 1bd5066d5ac7f..518dc3c720f85 100644 --- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml @@ -119,7 +119,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -142,7 +142,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -150,7 +150,7 @@ jobs: build_name: manywheel-py3_9-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cuda12_6-upload: # Uploading @@ -165,7 +165,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -185,7 +185,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -208,7 +208,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -216,7 +216,7 @@ jobs: build_name: manywheel-py3_9-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cuda12_8-upload: # Uploading @@ -231,7 +231,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -251,7 +251,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -274,7 +274,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: 
cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -282,7 +282,7 @@ jobs: build_name: manywheel-py3_9-cuda12_9 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cuda12_9-upload: # Uploading @@ -297,7 +297,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -307,6 +307,72 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_9-cuda13_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-cuda13_0 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda13_0-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-cuda13_0-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + 
DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda13_0 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda13_0-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-cuda13_0-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda13_0 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_9-rocm6_3-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -317,7 +383,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 @@ -340,7 +406,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder @@ -408,7 +474,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 @@ -428,7 +494,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 @@ -451,7 +517,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder @@ -519,7 +585,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 @@ -711,7 +777,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -734,7 +800,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -742,7 +808,7 @@ jobs: build_name: manywheel-py3_10-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ 
needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_6-upload: # Uploading @@ -757,7 +823,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -777,7 +843,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -800,7 +866,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -808,7 +874,7 @@ jobs: build_name: manywheel-py3_10-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_8-upload: # Uploading @@ -823,7 +889,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -843,7 +909,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -866,7 +932,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -874,7 +940,7 @@ jobs: build_name: manywheel-py3_10-cuda12_9 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_9-upload: # Uploading @@ -889,7 +955,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -899,6 +965,72 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_10-cuda13_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy 
variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.10" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_10-cuda13_0 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_10-cuda13_0-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_10-cuda13_0-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cuda13_0 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_10-cuda13_0-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_10-cuda13_0-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cuda13_0 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_10-rocm6_3-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -909,7 +1041,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of 
in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 @@ -932,7 +1064,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder @@ -1000,7 +1132,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 @@ -1020,7 +1152,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 @@ -1043,7 +1175,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder @@ -1111,7 +1243,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 @@ -1303,7 +1435,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -1326,7 +1458,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -1334,7 +1466,7 @@ jobs: build_name: manywheel-py3_11-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_6-upload: # Uploading @@ -1349,7 +1481,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -1369,7 +1501,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -1392,7 +1524,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -1400,7 +1532,7 @@ jobs: 
build_name: manywheel-py3_11-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_8-upload: # Uploading @@ -1415,7 +1547,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -1435,7 +1567,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -1458,7 +1590,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -1466,7 +1598,7 @@ jobs: build_name: manywheel-py3_11-cuda12_9 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_9-upload: # Uploading @@ -1481,7 +1613,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -1491,6 +1623,72 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_11-cuda13_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.11" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_11-cuda13_0 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda13_0-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_11-cuda13_0-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda13_0 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda13_0-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_11-cuda13_0-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda13_0 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_11-rocm6_3-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1501,7 +1699,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 @@ -1524,7 +1722,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder @@ -1592,7 +1790,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 @@ -1612,7 +1810,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 @@ -1635,7 +1833,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION 
DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder @@ -1703,7 +1901,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 @@ -1895,7 +2093,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -1918,7 +2116,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -1926,7 +2124,7 @@ jobs: build_name: manywheel-py3_12-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_6-upload: # Uploading @@ -1941,7 +2139,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -1961,7 +2159,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -1984,7 +2182,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -1992,7 +2190,7 @@ jobs: build_name: manywheel-py3_12-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_8-upload: # Uploading @@ -2007,7 +2205,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -2027,7 +2225,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -2050,7 +2248,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" 
GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -2058,7 +2256,7 @@ jobs: build_name: manywheel-py3_12-cuda12_9 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_9-upload: # Uploading @@ -2073,7 +2271,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -2083,6 +2281,72 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_12-cuda13_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.12" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_12-cuda13_0 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-cuda13_0-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_12-cuda13_0-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.12" + 
build_name: manywheel-py3_12-cuda13_0 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-cuda13_0-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_12-cuda13_0-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cuda13_0 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_12-rocm6_3-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -2093,7 +2357,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 @@ -2116,7 +2380,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder @@ -2184,7 +2448,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 @@ -2204,7 +2468,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 @@ -2227,7 +2491,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder @@ -2295,7 +2559,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 @@ -2487,7 +2751,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -2510,7 +2774,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -2518,7 +2782,7 @@ jobs: build_name: manywheel-py3_13-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type 
}}" - runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_6-upload: # Uploading @@ -2533,7 +2797,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -2553,7 +2817,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -2576,7 +2840,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -2584,7 +2848,7 @@ jobs: build_name: manywheel-py3_13-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_8-upload: # Uploading @@ -2599,7 +2863,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -2619,7 +2883,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -2642,7 +2906,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -2650,7 +2914,7 @@ jobs: build_name: manywheel-py3_13-cuda12_9 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_9-upload: # Uploading @@ -2665,7 +2929,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -2675,6 +2939,72 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_13-cuda13_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we 
eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.13" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_13-cuda13_0 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13-cuda13_0-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_13-cuda13_0-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.13" + build_name: manywheel-py3_13-cuda13_0 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13-cuda13_0-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13-cuda13_0-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.13" + build_name: manywheel-py3_13-cuda13_0 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_13-rocm6_3-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -2685,7 +3015,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of 
GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 @@ -2708,7 +3038,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder @@ -2776,7 +3106,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 @@ -2796,7 +3126,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 @@ -2819,7 +3149,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder @@ -2887,7 +3217,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 @@ -3079,7 +3409,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -3102,7 +3432,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -3110,7 +3440,7 @@ jobs: build_name: manywheel-py3_13t-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_6-upload: # Uploading @@ -3125,7 +3455,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -3145,7 +3475,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -3168,7 +3498,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -3176,7 +3506,7 @@ jobs: build_name: 
manywheel-py3_13t-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_8-upload: # Uploading @@ -3191,7 +3521,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -3211,7 +3541,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -3234,7 +3564,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -3242,7 +3572,7 @@ jobs: build_name: manywheel-py3_13t-cuda12_9 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_9-upload: # Uploading @@ -3257,7 +3587,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -3267,6 +3597,72 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_13t-cuda13_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.13t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_13t-cuda13_0 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system 
== 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cuda13_0-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_13t-cuda13_0-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cuda13_0 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cuda13_0-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13t-cuda13_0-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cuda13_0 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_13t-rocm6_3-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -3277,7 +3673,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 @@ -3300,7 +3696,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder @@ -3368,7 +3764,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 @@ -3388,7 +3784,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 @@ -3411,7 +3807,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION 
DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder @@ -3479,7 +3875,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 @@ -3671,7 +4067,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -3694,7 +4090,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -3702,7 +4098,7 @@ jobs: build_name: manywheel-py3_14-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14-cuda12_6-upload: # Uploading @@ -3717,7 +4113,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -3737,7 +4133,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -3760,7 +4156,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -3768,7 +4164,7 @@ jobs: build_name: manywheel-py3_14-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14-cuda12_8-upload: # Uploading @@ -3783,7 +4179,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -3803,7 +4199,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -3826,7 +4222,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" 
GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -3834,7 +4230,7 @@ jobs: build_name: manywheel-py3_14-cuda12_9 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14-cuda12_9-upload: # Uploading @@ -3849,7 +4245,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -3859,6 +4255,72 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_14-cuda13_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.14" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_14-cuda13_0 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14-cuda13_0-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_14-cuda13_0-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.14" + 
build_name: manywheel-py3_14-cuda13_0 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14-cuda13_0-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14-cuda13_0-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.14" + build_name: manywheel-py3_14-cuda13_0 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_14-rocm6_3-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -3869,7 +4331,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 @@ -3892,7 +4354,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder @@ -3960,7 +4422,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 @@ -3980,7 +4442,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 @@ -4003,7 +4465,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder @@ -4071,7 +4533,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 @@ -4263,7 +4725,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -4286,7 +4748,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -4294,7 +4756,7 @@ jobs: build_name: manywheel-py3_14t-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type 
}}" - runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-cuda12_6-upload: # Uploading @@ -4309,7 +4771,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -4329,7 +4791,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -4352,7 +4814,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -4360,7 +4822,7 @@ jobs: build_name: manywheel-py3_14t-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-cuda12_8-upload: # Uploading @@ -4375,7 +4837,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -4395,7 +4857,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -4418,7 +4880,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -4426,7 +4888,7 @@ jobs: build_name: manywheel-py3_14t-cuda12_9 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-cuda12_9-upload: # Uploading @@ -4441,7 +4903,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -4451,6 +4913,72 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_14t-cuda13_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we 
eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.14t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_14t-cuda13_0 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14t-cuda13_0-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_14t-cuda13_0-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.14t" + build_name: manywheel-py3_14t-cuda13_0 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14t-cuda13_0-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14t-cuda13_0-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.14t" + build_name: manywheel-py3_14t-cuda13_0 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_14t-rocm6_3-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -4461,7 +4989,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # 
favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 @@ -4484,7 +5012,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder @@ -4552,7 +5080,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 @@ -4572,7 +5100,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 @@ -4595,7 +5123,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder @@ -4663,7 +5191,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 diff --git a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml b/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml index a3e5937fdcc4e..8177bac3fe216 100644 --- a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml +++ b/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml @@ -54,7 +54,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 @@ -77,7 +77,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder diff --git a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml index 75c393b46e59b..9c3a96d4caeed 100644 --- a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml +++ b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml @@ -299,7 +299,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug @@ -415,7 +415,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug @@ -527,7 +527,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 
+ GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps @@ -549,7 +549,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug @@ -665,7 +665,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug @@ -777,7 +777,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps @@ -799,7 +799,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug @@ -915,7 +915,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug @@ -1027,7 +1027,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps diff --git a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml index eccd332c74a1f..d212894b74433 100644 --- a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml @@ -299,7 +299,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release @@ -415,7 +415,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release @@ -527,7 +527,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps @@ -549,7 +549,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release @@ -665,7 +665,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release @@ -777,7 +777,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: 
cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps @@ -799,7 +799,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release @@ -915,7 +915,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release @@ -1027,7 +1027,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps diff --git a/.github/workflows/generated-windows-binary-wheel-nightly.yml b/.github/workflows/generated-windows-binary-wheel-nightly.yml index dd592f9d2600b..73d2ededd8715 100644 --- a/.github/workflows/generated-windows-binary-wheel-nightly.yml +++ b/.github/workflows/generated-windows-binary-wheel-nightly.yml @@ -287,7 +287,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" @@ -399,7 +399,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" @@ -507,7 +507,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.9" build_name: wheel-py3_9-cuda12_6 @@ -525,7 +525,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" @@ -637,7 +637,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" @@ -745,7 +745,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.9" build_name: wheel-py3_9-cuda12_8 @@ -763,7 +763,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" @@ -875,7 +875,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" @@ -983,7 +983,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 
+ GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.9" build_name: wheel-py3_9-cuda12_9 @@ -1472,7 +1472,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" @@ -1584,7 +1584,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" @@ -1692,7 +1692,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.10" build_name: wheel-py3_10-cuda12_6 @@ -1710,7 +1710,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" @@ -1822,7 +1822,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" @@ -1930,7 +1930,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.10" build_name: wheel-py3_10-cuda12_8 @@ -1948,7 +1948,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" @@ -2060,7 +2060,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" @@ -2168,7 +2168,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.10" build_name: wheel-py3_10-cuda12_9 @@ -2657,7 +2657,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" @@ -2769,7 +2769,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" @@ -2877,7 +2877,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.11" build_name: wheel-py3_11-cuda12_6 @@ -2895,7 +2895,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: 
cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" @@ -3007,7 +3007,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" @@ -3115,7 +3115,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.11" build_name: wheel-py3_11-cuda12_8 @@ -3133,7 +3133,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" @@ -3245,7 +3245,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" @@ -3353,7 +3353,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.11" build_name: wheel-py3_11-cuda12_9 @@ -3842,7 +3842,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -3954,7 +3954,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -4062,7 +4062,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.12" build_name: wheel-py3_12-cuda12_6 @@ -4080,7 +4080,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -4192,7 +4192,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -4300,7 +4300,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.12" build_name: wheel-py3_12-cuda12_8 @@ -4318,7 +4318,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -4430,7 +4430,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -4538,7 
+4538,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.12" build_name: wheel-py3_12-cuda12_9 @@ -5027,7 +5027,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" @@ -5139,7 +5139,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" @@ -5247,7 +5247,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13" build_name: wheel-py3_13-cuda12_6 @@ -5265,7 +5265,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" @@ -5377,7 +5377,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" @@ -5485,7 +5485,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13" build_name: wheel-py3_13-cuda12_8 @@ -5503,7 +5503,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" @@ -5615,7 +5615,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" @@ -5723,7 +5723,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13" build_name: wheel-py3_13-cuda12_9 @@ -6212,7 +6212,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" @@ -6324,7 +6324,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" @@ -6432,7 +6432,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13t" build_name: wheel-py3_13t-cuda12_6 @@ -6450,7 +6450,7 @@ jobs: # TODO: This is a 
legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" @@ -6562,7 +6562,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" @@ -6670,7 +6670,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13t" build_name: wheel-py3_13t-cuda12_8 @@ -6688,7 +6688,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" @@ -6800,7 +6800,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" @@ -6908,7 +6908,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13t" build_name: wheel-py3_13t-cuda12_9 @@ -7397,7 +7397,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.14" @@ -7509,7 +7509,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.14" @@ -7617,7 +7617,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.14" build_name: wheel-py3_14-cuda12_6 @@ -7635,7 +7635,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.14" @@ -7747,7 +7747,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.14" @@ -7855,7 +7855,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.14" build_name: wheel-py3_14-cuda12_8 @@ -7873,7 +7873,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.14" @@ -7985,7 +7985,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid 
of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.14" @@ -8093,7 +8093,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.14" build_name: wheel-py3_14-cuda12_9 @@ -8582,7 +8582,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.14t" @@ -8694,7 +8694,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.14t" @@ -8802,7 +8802,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.14t" build_name: wheel-py3_14t-cuda12_6 @@ -8820,7 +8820,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.14t" @@ -8932,7 +8932,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.14t" @@ -9040,7 +9040,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.14t" build_name: wheel-py3_14t-cuda12_8 @@ -9058,7 +9058,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.14t" @@ -9170,7 +9170,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.14t" @@ -9278,7 +9278,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.14t" build_name: wheel-py3_14t-cuda12_9 diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 19b0e88b5921a..0081e4e1f895d 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -201,9 +201,9 @@ jobs: sync-tag: rocm-build test-matrix: | { include: [ - { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2" }, - { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2" }, - { config: "distributed", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.4" }, + { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { 
config: "distributed", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.4" }, ]} secrets: inherit diff --git a/.github/workflows/vllm.yml b/.github/workflows/vllm.yml index 5a586f67e7bfe..14524069ab5a9 100644 --- a/.github/workflows/vllm.yml +++ b/.github/workflows/vllm.yml @@ -5,6 +5,9 @@ on: tags: - ciflow/vllm/* workflow_dispatch: + schedule: + # Every 12 hours starting at 00:00 UTC (00:00 and 12:00) + - cron: '0 0,12 * * *' concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} @@ -26,20 +29,42 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - torch-build-sm89: - name: sm89-vllm-test + torch-build: + name: ci-vllm-test uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - build-additional-packages: "vision audio torchao" + build-additional-packages: "vision audio" build-external-packages: "vllm" - build-environment: linux-jammy-cuda12.8-py3.12-gcc11-sm89 + build-environment: linux-jammy-cuda12.8-py3.12-gcc11 docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm - cuda-arch-list: '8.9' + cuda-arch-list: '8.0;8.9;9.0' runner: linux.24xlarge.memory test-matrix: | { include: [ - { config: "vllm_basic_correctness_test", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, - { config: "vllm_basic_models_test", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_basic_correctness_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_basic_models_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_entrypoints_test", shard: 1, num_shards: 1,runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_regression_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_lora_280_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_multi_model_processor_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_pytorch_compilation_unit_tests", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_lora_test", shard: 0, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_lora_test", shard: 1, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_lora_test", shard: 2, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_lora_test", shard: 3, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_lora_tp_test_distributed", shard: 1, num_shards: 1, runner: "linux.aws.h100.4"}, ]} secrets: inherit + + vllm-test-sm89: + name: ci-vllm-test + uses: ./.github/workflows/_linux-test.yml + needs: [ + torch-build, + ] + with: + build-environment: linux-jammy-cuda12.8-py3.12-gcc11 + docker-image: ${{ needs.torch-build.outputs.docker-image }} + test-matrix: ${{ needs.torch-build.outputs.test-matrix }} + secrets: inherit diff --git a/.lintrunner.toml b/.lintrunner.toml index 64d05318afa3d..328b2f5e89ccb 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -132,7 +132,7 @@ include_patterns = [ 
'test/test_complex.py', 'test/test_datapipe.py', 'test/test_futures.py', - # 'test/test_numpy_interop.py', + 'test/test_numpy_interop.py', 'test/test_torch.py', 'test/test_type_hints.py', 'test/test_type_info.py', diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 0f083a582404c..d8787154a2137 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -475,10 +475,6 @@ if(USE_ROCM) exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" ${native_hip_bgemm} ${native_hip_ck}) endif() - if(WIN32) # Windows doesn't support Composable Kernels and Triton - exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" - ${native_transformers_hip_hip} ${native_transformers_hip_cpp}) - endif() # TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources) list(APPEND all_hip_cpp diff --git a/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h index 39ab441478e8f..f4316def4fb42 100644 --- a/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h +++ b/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h @@ -1,7 +1,6 @@ #pragma once -#include -#include +#include // Use of c10::hip namespace here makes hipification easier, because // I don't have to also fix namespaces. Sorry! @@ -10,22 +9,227 @@ namespace c10::hip { // Takes a valid HIPAllocator (of any sort) and turns it into // an allocator pretending to be a CUDA allocator. See // Note [Masquerading as CUDA] -class HIPAllocatorMasqueradingAsCUDA final : public Allocator { - Allocator* allocator_; +class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllocator { + HIPCachingAllocator::HIPAllocator* allocator_; public: - explicit HIPAllocatorMasqueradingAsCUDA(Allocator* allocator) + explicit HIPAllocatorMasqueradingAsCUDA(HIPCachingAllocator::HIPAllocator* allocator) : allocator_(allocator) {} + + virtual ~HIPAllocatorMasqueradingAsCUDA() = default; + + // From c10::Allocator + DataPtr allocate(size_t size) override { DataPtr r = allocator_->allocate(size); r.unsafe_set_device(Device(c10::DeviceType::CUDA, r.device().index())); return r; } + + bool is_simple_data_ptr(const DataPtr& data_ptr) const override { + return allocator_->is_simple_data_ptr(data_ptr); + } + DeleterFnPtr raw_deleter() const override { return allocator_->raw_deleter(); } + void copy_data(void* dest, const void* src, std::size_t count) const final { allocator_->copy_data(dest, src, count); } + + // From DeviceAllocator + + bool initialized() override { + return allocator_->initialized(); + } + + void emptyCache(MempoolId_t mempool_id = {0, 0}) override { + allocator_->emptyCache(mempool_id); + } + + void recordStream(const DataPtr& ptr, c10::Stream stream) override { + HIPStream hip_stream = HIPStream(stream); + recordStream(ptr, hip_stream); + } + + CachingDeviceAllocator::DeviceStats getDeviceStats(c10::DeviceIndex device) override { + return allocator_->getDeviceStats(device); + } + + void resetAccumulatedStats(c10::DeviceIndex device) override { + allocator_->resetAccumulatedStats(device); + } + + void resetPeakStats(c10::DeviceIndex device) override { + allocator_->resetPeakStats(device); + } + + // From CUDAAllocator + + void* raw_alloc(size_t nbytes) override { + return allocator_->raw_alloc(nbytes); + } + + void* raw_alloc_with_stream(size_t nbytes, hipStream_t stream) override { + return allocator_->raw_alloc_with_stream(nbytes, stream); + } + + void raw_delete(void* ptr) override { + allocator_->raw_delete(ptr); + } + + void 
init(int device_count) override { + allocator_->init(device_count); + } + + double getMemoryFraction(c10::DeviceIndex device) override { + return allocator_->getMemoryFraction(device); + } + + void setMemoryFraction(double fraction, c10::DeviceIndex device) override { + allocator_->setMemoryFraction(fraction, device); + } + + void enable(bool value) override { + allocator_->enable(value); + } + + bool isEnabled() const override { + return allocator_->isEnabled(); + } + + void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) override { + allocator_->cacheInfo(device, largestBlock); + } + + void* getBaseAllocation(void* ptr, size_t* size) override { + return allocator_->getBaseAllocation(ptr, size); + } + + void recordStream(const DataPtr& ptr, HIPStream stream) override { + allocator_->recordStream(ptr, stream); + } + + HIPCachingAllocator::SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) override { + return allocator_->snapshot(mempool_id); + } + + void beginAllocateToPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + std::function filter) override { + allocator_->beginAllocateToPool(device, mempool_id, filter); + } + + void endAllocateToPool( + c10::DeviceIndex device, + MempoolId_t mempool_id) override { + allocator_->endAllocateToPool(device, mempool_id); + } + + void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) override { + allocator_->releasePool(device, mempool_id); + } + + int getPoolUseCount(c10::DeviceIndex device, MempoolId_t mempool_id) override { + return allocator_->getPoolUseCount(device, mempool_id); + } + + void createOrIncrefPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + HIPAllocator* allocator = nullptr) override { + allocator_->createOrIncrefPool(device, mempool_id, allocator); + } + + void setUseOnOOM(c10::DeviceIndex device, MempoolId_t mempool_id) override { + allocator_->setUseOnOOM(device, mempool_id); + } + + bool checkPoolLiveAllocations( + c10::DeviceIndex device, + MempoolId_t mempool_id, + const std::unordered_set& expected_live_allocations) override { + return allocator_->checkPoolLiveAllocations(device, mempool_id, expected_live_allocations); + } + + HIPCachingAllocator::ShareableHandle shareIpcHandle(void* ptr) override { + return allocator_->shareIpcHandle(ptr); + } + + std::shared_ptr getIpcDevPtr(std::string handle) override { + return allocator_->getIpcDevPtr(handle); + } + + bool isHistoryEnabled() override { + return allocator_->isHistoryEnabled(); + } + + void recordHistory( + bool enabled, + HIPCachingAllocator::CreateContextFn context_recorder, + size_t alloc_trace_max_entries, + HIPCachingAllocator::RecordContext when, + bool clearHistory) override { + allocator_->recordHistory(enabled, context_recorder, alloc_trace_max_entries, when, clearHistory); + } + + void recordAnnotation( + const std::vector>& md) override { + allocator_->recordAnnotation(md); + } + + void pushCompileContext(std::string& md) override { + allocator_->pushCompileContext(md); + } + + void popCompileContext() override { + allocator_->popCompileContext(); + } + + void attachOutOfMemoryObserver(HIPCachingAllocator::OutOfMemoryObserver observer) override { + allocator_->attachOutOfMemoryObserver(observer); + } + + void attachAllocatorTraceTracker(HIPCachingAllocator::AllocatorTraceTracker tracker) override { + allocator_->attachAllocatorTraceTracker(tracker); + } + + void enablePeerAccess(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access) override { + allocator_->enablePeerAccess(dev, dev_to_access); + } + + hipError_t 
memcpyAsync( + void* dst, + int dstDevice, + const void* src, + int srcDevice, + size_t count, + hipStream_t stream, + bool p2p_enabled) override { + return allocator_->memcpyAsync(dst, dstDevice, src, srcDevice, count, stream, p2p_enabled); + } + + std::shared_ptr getCheckpointState( + c10::DeviceIndex device, + MempoolId_t id) override { + return allocator_->getCheckpointState(device, id); + } + + HIPCachingAllocator::CheckpointDelta setCheckpointPoolState( + c10::DeviceIndex device, + std::shared_ptr pps) override { + auto cpd = allocator_->setCheckpointPoolState(device, pps); + for (auto& ptr : cpd.dataptrs_allocd) { + ptr.unsafe_set_device(Device(c10::DeviceType::CUDA, ptr.device().index())); + } + return cpd; + } + + std::string name() override { + return allocator_->name(); + } + }; } // namespace c10::hip diff --git a/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.cpp b/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.cpp index 46f7d247293a1..53e7980b3d3f9 100644 --- a/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.cpp +++ b/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.cpp @@ -1,10 +1,11 @@ -#include +#include +#include #include namespace c10 { namespace hip { namespace HIPCachingAllocatorMasqueradingAsCUDA { -Allocator* get() { +HIPCachingAllocator::HIPAllocator* get() { static HIPAllocatorMasqueradingAsCUDA allocator(HIPCachingAllocator::get()); return &allocator; } diff --git a/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h index 3aaa9d06c5e91..1d3606b456fca 100644 --- a/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h +++ b/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h @@ -10,9 +10,185 @@ class DataPtr; namespace hip { namespace HIPCachingAllocatorMasqueradingAsCUDA { -C10_HIP_API Allocator* get(); +C10_HIP_API HIPCachingAllocator::HIPAllocator* get(); C10_HIP_API void recordStreamMasqueradingAsCUDA(const DataPtr& ptr, HIPStreamMasqueradingAsCUDA stream); +inline void* raw_alloc(size_t nbytes) { + return get()->raw_alloc(nbytes); +} + +inline void* raw_alloc_with_stream(size_t nbytes, hipStream_t stream) { + return get()->raw_alloc_with_stream(nbytes, stream); +} + +inline void raw_delete(void* ptr) { + return get()->raw_delete(ptr); +} + +inline void init(int device_count) { + return get()->init(device_count); +} + +inline double getMemoryFraction(c10::DeviceIndex device) { + return get()->getMemoryFraction(device); +} + +inline void setMemoryFraction(double fraction, c10::DeviceIndex device) { + return get()->setMemoryFraction(fraction, device); +} + +inline void emptyCache(MempoolId_t mempool_id = {0, 0}) { + return get()->emptyCache(mempool_id); +} + +inline void enable(bool value) { + return get()->enable(value); +} + +inline bool isEnabled() { + return get()->isEnabled(); +} + +inline void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) { + return get()->cacheInfo(device, largestBlock); +} + +inline void* getBaseAllocation(void* ptr, size_t* size) { + return get()->getBaseAllocation(ptr, size); +} + +inline c10::CachingDeviceAllocator::DeviceStats getDeviceStats( + c10::DeviceIndex device) { + return get()->getDeviceStats(device); +} + +inline void resetAccumulatedStats(c10::DeviceIndex device) { + return get()->resetAccumulatedStats(device); +} + +inline void resetPeakStats(c10::DeviceIndex device) { + return get()->resetPeakStats(device); +} + +inline HIPCachingAllocator::SnapshotInfo 
snapshot(MempoolId_t mempool_id = {0, 0}) { + return get()->snapshot(mempool_id); +} + +inline std::shared_ptr getCheckpointState( + c10::DeviceIndex device, + MempoolId_t id) { + return get()->getCheckpointState(device, id); +} + +inline HIPCachingAllocator::CheckpointDelta setCheckpointPoolState( + c10::DeviceIndex device, + std::shared_ptr pps) { + return get()->setCheckpointPoolState(device, std::move(pps)); +} + +inline void beginAllocateToPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + std::function filter) { + get()->beginAllocateToPool(device, mempool_id, std::move(filter)); +} + +inline void endAllocateToPool(c10::DeviceIndex device, MempoolId_t mempool_id) { + get()->endAllocateToPool(device, mempool_id); +} + +inline void recordHistory( + bool enabled, + HIPCachingAllocator::CreateContextFn context_recorder, + size_t alloc_trace_max_entries, + HIPCachingAllocator::RecordContext when, + bool clearHistory) { + return get()->recordHistory( + enabled, context_recorder, alloc_trace_max_entries, when, clearHistory); +} + +inline void recordAnnotation( + const std::vector>& md) { + return get()->recordAnnotation(md); +} + +inline void pushCompileContext(std::string& md) { + return get()->pushCompileContext(md); +} + +inline void popCompileContext() { + return get()->popCompileContext(); +} + +inline bool isHistoryEnabled() { + return get()->isHistoryEnabled(); +} + +inline bool checkPoolLiveAllocations( + c10::DeviceIndex device, + MempoolId_t mempool_id, + const std::unordered_set& expected_live_allocations) { + return get()->checkPoolLiveAllocations( + device, mempool_id, expected_live_allocations); +} + +inline void attachOutOfMemoryObserver(HIPCachingAllocator::OutOfMemoryObserver observer) { + return get()->attachOutOfMemoryObserver(std::move(observer)); +} + +inline void attachAllocatorTraceTracker(HIPCachingAllocator::AllocatorTraceTracker tracker) { + return get()->attachAllocatorTraceTracker(std::move(tracker)); +} + +inline void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) { + return get()->releasePool(device, mempool_id); +} + +inline void createOrIncrefPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + HIPCachingAllocator::HIPAllocator* allocator_ptr = nullptr) { + get()->createOrIncrefPool(device, mempool_id, allocator_ptr); +} + +inline void setUseOnOOM(c10::DeviceIndex device, MempoolId_t mempool_id) { + get()->setUseOnOOM(device, mempool_id); +} + +inline int getPoolUseCount(c10::DeviceIndex device, MempoolId_t mempool_id) { + return get()->getPoolUseCount(device, mempool_id); +} + +inline std::shared_ptr getIpcDevPtr(std::string handle) { + return get()->getIpcDevPtr(std::move(handle)); +} + +inline HIPCachingAllocator::ShareableHandle shareIpcHandle(void* ptr) { + return get()->shareIpcHandle(ptr); +} + +inline std::string name() { + return get()->name(); +} + +inline hipError_t memcpyAsync( + void* dst, + int dstDevice, + const void* src, + int srcDevice, + size_t count, + hipStream_t stream, + bool p2p_enabled) { + return get()->memcpyAsync( + dst, dstDevice, src, srcDevice, count, stream, p2p_enabled); +} + +inline void enablePeerAccess( + c10::DeviceIndex dev, + c10::DeviceIndex dev_to_access) { + return get()->enablePeerAccess(dev, dev_to_access); +} + } // namespace HIPCachingAllocatorMasqueradingAsCUDA } // namespace hip } // namespace c10 diff --git a/aten/src/ATen/native/ForeachOpsKernels.cpp b/aten/src/ATen/native/ForeachOpsKernels.cpp index 64c39fcaef239..cb437fb45ce21 100644 --- 
a/aten/src/ATen/native/ForeachOpsKernels.cpp +++ b/aten/src/ATen/native/ForeachOpsKernels.cpp @@ -260,6 +260,7 @@ namespace at::native { check_foreach_api_restrictions(input, tensors1, tensors2); \ \ std::vector result; \ + result.reserve(input.size()); \ for (const auto i : c10::irange(input.size())) { \ result.emplace_back(input[i].OP(tensors1[i], tensors2[i], scalar)); \ } \ @@ -288,6 +289,7 @@ namespace at::native { check_foreach_api_restrictions(input, tensors1, tensors2, scalars); \ \ std::vector result; \ + result.reserve(input.size()); \ for (const auto i : c10::irange(input.size())) { \ result.emplace_back(input[i].OP(tensors1[i], tensors2[i], scalars[i])); \ } \ @@ -417,6 +419,7 @@ std::vector foreach_tensor_ternary_lerp_slow( TensorList tensors3) { check_foreach_api_restrictions(tensors1, tensors2, tensors3); std::vector result; + result.reserve(tensors1.size()); for (const auto i : c10::irange(tensors1.size())) { result.emplace_back(tensors1[i].lerp(tensors2[i], tensors3[i])); } @@ -439,6 +442,7 @@ std::vector foreach_tensor_lerp_scalarlist_kernel_slow( at::ArrayRef scalars) { check_foreach_api_restrictions(tensors1, tensors2, scalars); std::vector result; + result.reserve(tensors1.size()); for (const auto i : c10::irange(tensors1.size())) { result.emplace_back(tensors1[i].lerp(tensors2[i], scalars[i])); } @@ -469,6 +473,7 @@ std::vector foreach_tensor_norm_slow( std::optional dtype) { check_foreach_api_restrictions(tensors); std::vector result; + result.reserve(tensors.size()); for (const auto& t : tensors) { result.emplace_back(at::linalg_vector_norm(t, ord, {}, false, dtype)); } @@ -478,6 +483,7 @@ std::vector foreach_tensor_norm_slow( std::vector foreach_tensor_max_slow(TensorList tensors) { check_foreach_api_restrictions(tensors); std::vector result; + result.reserve(tensors.size()); for (const auto& t : tensors) { result.emplace_back(at::max(t)); } diff --git a/aten/src/ATen/native/ForeachUtils.h b/aten/src/ATen/native/ForeachUtils.h index a1e55265e0581..02356e4105f1a 100644 --- a/aten/src/ATen/native/ForeachUtils.h +++ b/aten/src/ATen/native/ForeachUtils.h @@ -53,8 +53,8 @@ inline void check_foreach_api_restrictions( inline void check_foreach_api_restrictions( TensorList tensors1, TensorList tensors2) { - TORCH_CHECK(!tensors1.empty(), "Tensor list must have at least one tensor."); - TORCH_CHECK(!tensors2.empty(), "Tensor list must have at least one tensor."); + check_foreach_api_restrictions(tensors1); + check_foreach_api_restrictions(tensors2); TORCH_CHECK( tensors1.size() == tensors2.size(), "Tensor lists must have the same number of tensors, got ", @@ -67,21 +67,8 @@ inline void check_foreach_api_restrictions( TensorList tensors1, TensorList tensors2, TensorList tensors3) { - TORCH_CHECK(!tensors1.empty(), "Tensor list must have at least one tensor."); - TORCH_CHECK(!tensors2.empty(), "Tensor list must have at least one tensor."); - TORCH_CHECK(!tensors3.empty(), "Tensor list must have at least one tensor."); - TORCH_CHECK( - tensors1.size() == tensors2.size(), - "Tensor lists must have the same number of tensors, got ", - tensors1.size(), - " and ", - tensors2.size()); - TORCH_CHECK( - tensors1.size() == tensors3.size(), - "Tensor lists must have the same number of tensors, got ", - tensors1.size(), - " and ", - tensors3.size()); + check_foreach_api_restrictions(tensors1, tensors2); + check_foreach_api_restrictions(tensors1, tensors3); } inline void check_foreach_api_restrictions( @@ -90,12 +77,7 @@ inline void check_foreach_api_restrictions( TensorList tensors3, 
ArrayRef scalars) { check_foreach_api_restrictions(tensors1, tensors2, tensors3); - TORCH_CHECK( - tensors1.size() == scalars.size(), - "Tensor list must have same number of elements as scalar list, got ", - tensors1.size(), - " and ", - scalars.size()); + check_foreach_api_restrictions(tensors1, scalars); } inline void check_foreach_api_restrictions( @@ -103,12 +85,7 @@ TensorList tensors2, ArrayRef scalars) { check_foreach_api_restrictions(tensors1, tensors2); - TORCH_CHECK( - tensors1.size() == scalars.size(), - "Tensor list must have same number of elements as scalar list, got ", - tensors1.size(), - " and ", - scalars.size()); + check_foreach_api_restrictions(tensors1, scalars); } // Helper function called in check_fast_path_restrictions to check whether all diff --git a/aten/src/ATen/native/mps/kernels/Indexing.metal b/aten/src/ATen/native/mps/kernels/Indexing.metal index 048b2e5ae7c9a..b41e64d70ced5 100644 --- a/aten/src/ATen/native/mps/kernels/Indexing.metal +++ b/aten/src/ATen/native/mps/kernels/Indexing.metal @@ -358,6 +358,7 @@ kernel void index_copy_strided( constant long* input_strides, constant long* output_strides, constant long* source_strides, + constant long& indices_stride, uint thread_index [[thread_position_in_grid]]) { int pos[max_ndim]; pos_from_thread_index(int(thread_index), pos, sizes, ndim); @@ -374,7 +375,7 @@ kernel void index_copy_strided( // find the last index in the indices array that equals this coordinate int last_matching_index = -1; for (uint i = 0; i < indices_numel; i++) { - if (indices[i] == orig_dim) { + if (indices[i * indices_stride] == orig_dim) { last_matching_index = int(i); } } @@ -413,6 +414,7 @@ kernel void index_copy_strided( constant long*, \ constant long*, \ constant long*, \ + constant long&, \ uint); #define REGISTER_MASKED_FILL_SCALAR(SIZE, DTYPE) \ diff --git a/aten/src/ATen/native/mps/kernels/Pooling.metal b/aten/src/ATen/native/mps/kernels/Pooling.metal index 45a8d680afcd0..3eee8bb079a7a 100644 --- a/aten/src/ATen/native/mps/kernels/Pooling.metal +++ b/aten/src/ATen/native/mps/kernels/Pooling.metal @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -502,8 +503,8 @@ void avg_pool_3d_input_iter( padding, count_include_pad); - T value_sum = 0; - auto divisor = has_divisor_override + opmath_t value_sum = 0; + opmath_t divisor = has_divisor_override ? divisor_override : (bounds0.count) * (bounds1.count) * (bounds2.count); @@ -516,11 +517,58 @@ void avg_pool_3d_input_iter( for (auto i2 = bounds2.start; i2 < bounds2.end; i2++) { auto offset2 = input_strides[2] * i2; auto input_value = input[offset0 + offset1 + offset2]; - value_sum += input_value; + value_sum += static_cast>(input_value); } } } - *output = value_sum / static_cast(divisor); + *output = static_cast(value_sum / divisor); +} + +// Iterates through all the input elements that this kernel needs to +// average over. Specialized for 2 pooling dimensions.
+template +void avg_pool_2d_input_iter( + constant T* input, + device T* output, + constant int32_t* input_sizes, + constant int32_t* input_strides, + thread int32_t (&pooling_dim_indices)[3], + constant int32_t* kernel_size, + constant int32_t* stride, + constant int32_t* padding, + bool count_include_pad, + bool has_divisor_override, + int32_t divisor_override) { + auto bounds0 = get_avg_pool_input_iter_bounds<0>( + input_sizes, + pooling_dim_indices, + kernel_size, + stride, + padding, + count_include_pad); + auto bounds1 = get_avg_pool_input_iter_bounds<1>( + input_sizes, + pooling_dim_indices, + kernel_size, + stride, + padding, + count_include_pad); + + opmath_t value_sum = 0; + opmath_t divisor = has_divisor_override + ? divisor_override + : (bounds0.count) * (bounds1.count); + + for (auto i0 = bounds0.start; i0 < bounds0.end; i0++) { + auto offset0 = input_strides[0] * i0; + + for (auto i1 = bounds1.start; i1 < bounds1.end; i1++) { + auto offset1 = input_strides[1] * i1; + auto input_value = input[offset0 + offset1]; + value_sum += static_cast>(input_value); + } + } + *output = static_cast(value_sum / divisor); } template @@ -619,18 +667,33 @@ kernel void avg_pool( input_sizes += leading_dims; input_strides += leading_dims; - avg_pool_3d_input_iter( - input, - output, - input_sizes, - input_strides, - pooling_dim_indices, - kernel_size, - stride, - padding, - params.count_include_pad, - params.has_divisor_override, - params.divisor_override); + if (pooling_dims == 3) { + avg_pool_3d_input_iter( + input, + output, + input_sizes, + input_strides, + pooling_dim_indices, + kernel_size, + stride, + padding, + params.count_include_pad, + params.has_divisor_override, + params.divisor_override); + } else if (pooling_dims == 2) { + avg_pool_2d_input_iter( + input, + output, + input_sizes, + input_strides, + pooling_dim_indices, + kernel_size, + stride, + padding, + params.count_include_pad, + params.has_divisor_override, + params.divisor_override); + } } template diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm index 82f815db95155..fa19d2f4d127f 100644 --- a/aten/src/ATen/native/mps/operations/Indexing.mm +++ b/aten/src/ATen/native/mps/operations/Indexing.mm @@ -230,7 +230,7 @@ static void index_put_kernel_mps(TensorIterator& iter, index.numel()); int64_t idx = index.item(); TORCH_CHECK(idx == 0, "index_copy_(): the only valid index for a 0-dim tensor is 0, but got ", idx); - result.copy_(source); + result.copy_(source.squeeze()); return; } @@ -254,11 +254,12 @@ static void index_put_kernel_mps(TensorIterator& iter, } } - TORCH_CHECK(source.size(dim) == index.numel(), + const auto source_size_dim = source.dim() > 0 ? 
source.size(dim) : 1; + TORCH_CHECK(index.numel() == source_size_dim, "index_copy_(): Number of indices (", index.numel(), ") should be equal to source.size(dim) (", - source.size(dim), + source_size_dim, ")"); auto stream = getCurrentMPSStream(); @@ -281,7 +282,7 @@ static void index_put_kernel_mps(TensorIterator& iter, [computeEncoder setComputePipelineState:indexCopyPSO]; mtl_setArgs(computeEncoder, result, self, source, index, dim_arg, self.sizes(), ndim, indices_numel); if (!is_dense) { - mtl_setArgs<8>(computeEncoder, self.strides(), result.strides(), source.strides()); + mtl_setArgs<8>(computeEncoder, self.strides(), result.strides(), source.strides(), index.strides()); } mtl_dispatch1DJob(computeEncoder, indexCopyPSO, result.numel()); } @@ -595,28 +596,7 @@ Tensor flip_mps(const Tensor& self, IntArrayRef dims) { } Tensor index_select_mps(const Tensor& self, int64_t dim, const Tensor& index) { - IntArrayRef input_shape = self.sizes(); - auto num_input_dims = input_shape.size(); - - auto num_indices = index.numel(); - TORCH_CHECK_INDEX(index.dim() <= 1, "index_select(): Index is supposed to be a vector"); - - dim = maybe_wrap_dim(dim, self.dim()); - std::vector shape_data(num_input_dims); - - // Calculate new shape - for (const auto i : c10::irange(num_input_dims)) { - if (i == static_cast(dim)) { - shape_data[i] = num_indices; - } else { - shape_data[i] = input_shape[i]; - } - } - - IntArrayRef output_shape = IntArrayRef(shape_data.data(), num_input_dims); - - Tensor result = at::empty(output_shape, self.scalar_type(), std::nullopt, kMPS, std::nullopt, std::nullopt); - + Tensor result = at::empty({0}, self.options()); index_select_out_mps(self, dim, index, result); return result; } @@ -638,25 +618,11 @@ Tensor index_select_mps(const Tensor& self, int64_t dim, const Tensor& index) { TORCH_CHECK(self.scalar_type() == output.scalar_type(), "index_select(): self and output must have the same scalar type"); TORCH_CHECK(dim == 0 || dim < self.dim(), "index_select(): Indexing dim ", dim, " is out of bounds of tensor"); - TORCH_CHECK(output.dim() == 0 || index.size(-1) == output.size(dim), - "index_select(): index and output must have the same size at `dim`th dimension, but got ", - index.size(-1), - " and ", - output.size(dim), - "."); - - for (const auto i : irange(self.dim())) { - if (i == dim) - continue; - TORCH_CHECK(self.size(i) == output.size(i), - "index_select(): self and output must have the same dimensions except for `dim`th dimension, but got ", - self.size(i), - " and ", - output.size(i), - " at dimension ", - i, - "."); + auto output_size = self.sizes().vec(); + if (self.dim() > 0) { + output_size[dim] = num_indices; } + at::native::resize_output(output, output_size); // Empty index if (num_indices == 0 || self.numel() == 0) { diff --git a/aten/src/ATen/native/mps/operations/Pad.mm b/aten/src/ATen/native/mps/operations/Pad.mm index 0c2c25946bb4b..2945ebf715f27 100644 --- a/aten/src/ATen/native/mps/operations/Pad.mm +++ b/aten/src/ATen/native/mps/operations/Pad.mm @@ -460,6 +460,9 @@ Tensor replication_pad3d_backward_mps(const Tensor& grad_output, const Tensor& i // backward pass is explicitly handled in autograd by negating the "pad" argument Tensor constant_pad_nd_mps(const Tensor& self, IntArrayRef pad, const Scalar& value) { + if (pad.empty()) { + return self.clone(); + } if (pad.size() > 6) { TORCH_WARN_ONCE("MPS: The constant padding of more than 3 dimensions is not currently supported natively. ", "It uses View Ops default implementation to run. 
This may have performance implications."); diff --git a/aten/src/ATen/native/mps/operations/Pooling.mm b/aten/src/ATen/native/mps/operations/Pooling.mm index 6ae3122cf3d19..d916320b2e238 100644 --- a/aten/src/ATen/native/mps/operations/Pooling.mm +++ b/aten/src/ATen/native/mps/operations/Pooling.mm @@ -1137,17 +1137,30 @@ Tensor max_unpooling3d_forward_mps(const Tensor& self, bool count_include_pad, std::optional divisor_override, const Tensor& output) { - mps::avg_pool2d_template(input, - output, - std::nullopt, - {kH, kW}, - {dH, dW}, - {padH, padW}, - {1, 1}, - ceil_mode, - count_include_pad, - divisor_override, - "avg_pool2d"); + if (ceil_mode) { + mps::avg_pool_out_mps_template(output, + input, + {kH, kW}, + {dH, dW}, + {padH, padW}, + ceil_mode, + count_include_pad, + divisor_override, + /*pooling_dims=*/2, + "avg_pool3d"); + } else { + mps::avg_pool2d_template(input, + output, + std::nullopt, + {kH, kW}, + {dH, dW}, + {padH, padW}, + {1, 1}, + ceil_mode, + count_include_pad, + divisor_override, + "avg_pool2d"); + } } TORCH_IMPL_FUNC(avg_pool2d_backward_out_mps) diff --git a/aten/src/ATen/native/quantized/cpu/OnednnUtils.h b/aten/src/ATen/native/quantized/cpu/OnednnUtils.h index 7722272dfcc27..963a47a21fa9f 100644 --- a/aten/src/ATen/native/quantized/cpu/OnednnUtils.h +++ b/aten/src/ATen/native/quantized/cpu/OnednnUtils.h @@ -460,4 +460,6 @@ at::Tensor _qconv_prepack_onednn( int64_t groups, std::optional> input_shape=std::nullopt); +#define FP8E4M3_MAX 448.0 + #endif // #if AT_MKLDNN_ENABLED() diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp index 8624c9ef03367..3b50bad579023 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp @@ -1483,6 +1483,8 @@ static at::Tensor _fp8_convolution_onednn_ref( } y_f32.div_(output_scale); if (x1.scalar_type() == at::kFloat8_e4m3fn) { + // Avoid NaN + y_f32.clamp_(-FP8E4M3_MAX, FP8E4M3_MAX); // Align with oneDNN: convert fp32 to fp8 by fp32 -> fp16 -> fp8 y_f32 = y_f32.to(at::kHalf); } @@ -1497,6 +1499,8 @@ static at::Tensor _fp8_convolution_onednn_ref( y_f32.div_(output_scale); auto out_dtype = output_dtype.has_value() ? output_dtype.value() : at::kFloat8_e4m3fn; if (out_dtype == at::kFloat8_e4m3fn) { + // Avoid NaN + y_f32.clamp_(-FP8E4M3_MAX, FP8E4M3_MAX); // Align with oneDNN: convert fp32 to fp8 by fp32 -> fp16 -> fp8 return y_f32.to(at::kHalf).to(out_dtype); } @@ -1730,12 +1734,13 @@ static at::Tensor _quantized_convolution_onednn( output_sizes = at::native::conv_output_size(input_size, kernel_size, padding.vec(), stride.vec(), dilation.vec()); ideep::dims dst_dims = ideep::dims({output_sizes.cbegin(), output_sizes.cend()}); // Output is not a quantized tensor but data type is uint8 + auto out_dtype = output_dtype.has_value() ? output_dtype.value() : act_dtype; at::Tensor output = has_accum_postop_sum ? accum.value() : at::empty( dst_dims, at::device(c10::kCPU) - .dtype(fp32_output ? c10::kFloat : (bfloat16_output ? c10::kBFloat16 : act_dtype)) + .dtype(out_dtype) .memory_format(kSpatialDim == 2 ? c10::MemoryFormat::ChannelsLast : c10::MemoryFormat::ChannelsLast3d) @@ -1755,6 +1760,16 @@ static at::Tensor _quantized_convolution_onednn( unary_scalars, unary_algorithm.has_value() ? 
unary_algorithm.value() : "" ); + // Avoid NaN if output dtype is fp8 + if (out_dtype == c10::kFloat8_e4m3fn) { + // To avoid NaN, we need to clamp the intermediate results (in fp32) to [-448, 448] + // before converting to fp8 + auto post_ops = op_attr.get_post_ops(); + post_ops.append_eltwise(dnnl::algorithm::eltwise_linear, 1.0/output_scale, 0.0); + post_ops.append_eltwise(dnnl::algorithm::eltwise_clip, -FP8E4M3_MAX, FP8E4M3_MAX); + op_attr.set_post_ops(post_ops); + output_scale = 1.0f; + } #if IDEEP_PREREQ(3, 1, 0, 0) // Use oneDNN's APIs instead of prepare/compute from ideep to reduce integration overhead. diff --git a/aten/src/ATen/native/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/quantized/cpu/qlinear.cpp index bd6a1086c8cb9..a3a494d16fd69 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp @@ -1012,6 +1012,12 @@ static at::Tensor fp8_qlinear_onednn_ref( "onednn qlinear: unsupported unary post op ", unary_post_op, " with binary post op sum"); } y_f32.div_(output_scale); + if (x1.scalar_type() == c10::kFloat8_e4m3fn) { + // Avoid NaN + y_f32.clamp_(-FP8E4M3_MAX, FP8E4M3_MAX); + // Align with oneDNN: convert fp32 to fp8 by fp32 -> fp16 -> fp8 + y_f32 = y_f32.to(at::kHalf); + } x1.copy_(y_f32.to(x1.scalar_type()).view(x1.sizes())); return x1; } else if (binary_post_op == "add") { @@ -1038,6 +1044,12 @@ static at::Tensor fp8_qlinear_onednn_ref( y_f32.div_(output_scale); y_f32 = y_f32.view(output_size); auto out_dtype = output_dtype.has_value() ? output_dtype.value() : at::kFloat8_e4m3fn; + if (out_dtype == at::kFloat8_e4m3fn) { + // Avoid NaN + y_f32.clamp_(-FP8E4M3_MAX, FP8E4M3_MAX); + // Align with oneDNN: convert fp32 to fp8 by fp32 -> fp16 -> fp8 + return y_f32.to(at::kHalf).to(out_dtype); + } return y_f32.to(out_dtype); } @@ -1118,7 +1130,7 @@ static at::Tensor linear_int8_with_onednn_weight( #if defined(__powerpc__) if (is_fp8) { #else - if(is_fp8 && !cpuinfo_has_x86_amx_int8()) { + if(is_fp8 && !cpuinfo_has_x86_amx_fp16()) { #endif // Fall back to ref impl on old platforms because not supported // Transpose weight to align with behavior in oneDNN @@ -1155,12 +1167,13 @@ static at::Tensor linear_int8_with_onednn_weight( } std::vector src_dims = {M, K}; std::vector dst_dims = {M, N}; + auto out_dtype = output_dtype.has_value() ? output_dtype.value() : input.scalar_type(); at::Tensor output = binary_post_op == "sum" ? other.value() : at::empty( dst_dims, at::device(c10::kCPU) - .dtype(fp32_output ? c10::kFloat : (bf16_output ?
c10::kBFloat16 : input.scalar_type())) + .dtype(out_dtype) ); if (output.numel() == 0) { return output; } @@ -1195,6 +1208,16 @@ static at::Tensor linear_int8_with_onednn_weight( unary_post_op_args, unary_post_op_algorithm ); + // Avoid NaN if output dtype is fp8 + if (out_dtype == c10::kFloat8_e4m3fn) { + // To avoid NaN, we need to clamp the intermediate results (in fp32) to [-448, 448] + // before converting to fp8 + auto post_ops = op_attr.get_post_ops(); + post_ops.append_eltwise(dnnl::algorithm::eltwise_linear, 1.0/output_scale, 0.0); + post_ops.append_eltwise(dnnl::algorithm::eltwise_clip, -FP8E4M3_MAX, FP8E4M3_MAX); + op_attr.set_post_ops(post_ops); + output_scale = 1.0f; + } if (input_scale != 1.0f) { op_attr.set_scales_mask(DNNL_ARG_SRC, 0); } diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp index 3bd68feca1c2f..b4ae4e677bcd2 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp @@ -305,7 +305,7 @@ static inline at::Tensor pack_weight_to_onednn_tensor( #if defined(__powerpc__) if (is_fp8){ #else - if(is_fp8 && !cpuinfo_has_x86_amx_int8()) { + if(is_fp8 && !cpuinfo_has_x86_amx_fp16()) { #endif // oneDNN's fp8 requires AMX support // If AMX is not available, fall back to reference implementation diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp index 4b85b2d28753a..00a43920b0967 100644 --- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp +++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp @@ -431,7 +431,12 @@ bool check_cudnn_tensor_shapes(sdp_params const& params, bool debug) { return false; } auto head_dim_limit = 128; - // TODO(eqy): add head dim >= 256 cases once support is finalized + if (cudnn_version >= 91000) { + auto dprops = at::cuda::getCurrentDeviceProperties(); + if (dprops->major == 9 && !dprops->minor) { + head_dim_limit = 256; + } + } if (d_qk > head_dim_limit || d_v > head_dim_limit) { if (debug) { TORCH_WARN("head_dim should be no more than ", head_dim_limit); diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip index 854ac950a867d..01435da5c360e 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip @@ -388,11 +388,16 @@ mha_bwd_ck(const at::Tensor &dout, // batch_size x seqlen_q x dv_expanded = dv; } - uint64_t drop_seed = 1, drop_offset = 0; - drop_seed = *philox_seed.data_ptr(); - drop_offset = *philox_offset.data_ptr(); - auto drop_seed_offset = std::make_pair(&drop_seed, &drop_offset); - + auto gen = at::get_generator_or_default( + std::nullopt, at::cuda::detail::getDefaultCUDAGenerator()); + + uint64_t* drop_seed, drop_offset; + int64_t counter_offset = batch_size * num_heads * ck_tile::get_warp_size(); + std::pair drop_seed_offset = {nullptr,nullptr}; + if(is_dropout) { + drop_seed_offset.first = philox_seed[0].data_ptr(); + drop_seed_offset.second = philox_seed[1].data_ptr(); + } if (seqlen_q > 0) { ck_tile::stream_config stream_config{stream}; diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip index 05f97414acdd8..419263a24591c 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip +++ 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip @@ -177,7 +177,6 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension"); const auto sizes = q.sizes(); - const int batch_size = sizes[0]; int seqlen_q = sizes[1]; int num_heads = sizes[2]; @@ -226,7 +225,6 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x CHECK_SHAPE(k, batch_size, seqlen_k, num_heads_k, head_size); CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_k, head_size); - at::Tensor q_padded, k_padded, v_padded; if (head_size % 8 != 0) { q_padded = at::pad(temp_q, {0, 8 - head_size % 8}); @@ -239,7 +237,6 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x v_padded = v; } - at::Tensor out; if (out_.has_value()) { out = out_.value(); @@ -266,7 +263,6 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x auto opts = q.options(); bool has_lse = true; bool has_dropout = p_dropout > 0.0f; - at::Tensor softmax_lse; // TODO - check gradient, only training require lse softmax_lse = at::empty({batch_size, num_heads, seqlen_q}, opts.dtype(at::kFloat)); @@ -277,46 +273,41 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x p = at::empty({batch_size, num_heads, seqlen_q, seqlen_k}, opts.dtype(at::kByte)); } else { - p = at::empty({ 0 }, opts); + p = at::empty({ 0 }, opts.dtype(at::kByte)); } - int64_t counter_offset = batch_size * num_heads * ck_tile::get_warp_size(); - auto rng_state = at::empty({2}, opts.dtype(at::kLong)); - auto rng_state_ptr = reinterpret_cast(rng_state.data_ptr()); - + uint64_t drop_seed = 1, drop_offset = 0; + int64_t counter_offset = batch_size * num_heads * ck_tile::get_warp_size(); - at::Tensor seed_t, offset_t; + auto rng_state_options = at::TensorOptions().dtype(at::kUInt64).device(at::kCUDA); + auto rng_state = at::zeros({2}, rng_state_options.dtype(at::kUInt64)); + auto _unused = at::empty({}, at::dtype(c10::kUInt64).device(at::kCUDA)); if (p_dropout > 0.0) { + auto gen = at::get_generator_or_default( gen_, at::cuda::detail::getDefaultCUDAGenerator()); + // See Note [Acquire lock when using random generators] std::lock_guard lock(gen->mutex_); - auto philox_args = gen->philox_cuda_state(counter_offset); + std::tie(drop_seed, drop_offset) = at::cuda::philox::unpack(philox_args); - - hipLaunchKernelGGL( - flash::ParsePhiloxCudaState, dim3(1), dim3(64), 0, at::hip::getCurrentHIPStreamMasqueradingAsCUDA(), philox_args, rng_state_ptr); - seed_t = at::scalar_tensor(at::Scalar(static_cast(rng_state_ptr[0])), at::dtype(at::kLong)); - offset_t = at::scalar_tensor(at::Scalar(static_cast(rng_state_ptr[1])), at::dtype(at::kLong)); - } - else - { - seed_t = at::empty({}, at::dtype(at::kLong).device(at::kCUDA)); - offset_t = at::empty({}, at::dtype(at::kLong).device(at::kCUDA)); } + rng_state[0] = *(reinterpret_cast(&drop_seed)); + rng_state[1] = *(reinterpret_cast(&drop_offset)); + auto drop_options = at::TensorOptions().dtype(at::kLong).device(at::kCUDA); std::optional attn_bias; if( attn_bias_.has_value()) { attn_bias = attn_bias_; } - if (seqlen_k > 0) { - auto drop_seed_offset = std::make_pair(rng_state_ptr, rng_state_ptr + 1); + auto drop_seed_offset = std::make_pair(rng_state[0].data_ptr(), + rng_state[1].data_ptr()); auto stream = at::cuda::getCurrentHIPStream().stream(); ck_tile::stream_config stream_config{stream}; @@ -332,7 +323,7 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x auto args = get_ck_fmha_fwd_args( has_lse, - return_dropout_randval, + 
has_dropout, mask, batch_size, seqlen_q, @@ -358,12 +349,11 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x out.zero_(); softmax_lse.fill_(std::numeric_limits::infinity()); } - if (seqlenq_ngroups_swapped) { out = out.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size}); q_padded = q_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size}); softmax_lse = softmax_lse.reshape({batch_size, num_heads_k * seqlen_q, 1}); } - return {out, q_padded, k_padded, v_padded, softmax_lse, seed_t, offset_t, p}; + return {out, q_padded, k_padded, v_padded, softmax_lse, rng_state, _unused, p}; } } //namespace pytorch_flash diff --git a/aten/src/ATen/xpu/CachingHostAllocator.cpp b/aten/src/ATen/xpu/CachingHostAllocator.cpp index 1255285d25af0..d531b46c3c554 100644 --- a/aten/src/ATen/xpu/CachingHostAllocator.cpp +++ b/aten/src/ATen/xpu/CachingHostAllocator.cpp @@ -30,6 +30,12 @@ struct XPUCachingHostAllocatorImpl bool query_event(XPUEvent& event) override { return event.query(); } + + bool pinned_use_background_threads() override { + // Using background threads for XPU causes a hang on Windows during program + // exit. Will be enabled once the issue is resolved. + return false; + } }; DECLARE_HOST_ALLOCATOR( diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py index 46db044d27f49..2901009f7c4d1 100644 --- a/benchmarks/dynamo/common.py +++ b/benchmarks/dynamo/common.py @@ -1103,6 +1103,8 @@ def maybe_mark_profile(*args, **kwargs): ) elif args.export_nativert: frozen_model_iter_fn = export_nativert(model, example_inputs) + elif args.torchscript_jit_trace: + frozen_model_iter_fn = torchscript_jit_trace(model, example_inputs) else: frozen_model_iter_fn = torch._dynamo.run(model_iter_fn) @@ -1481,6 +1483,28 @@ def load(cls, model, example_inputs): return cls.cache[key] +class JitTracedCache: + cache: dict[weakref.ref, Any] = {} + + @classmethod + def load(cls, model, example_inputs): + key = weakref.ref(model) + if key not in cls.cache: + example_args, example_kwargs = _normalize_bench_inputs(example_inputs) + if example_args: + jit_traced_module = torch.jit.trace( + model, example_inputs=example_args, strict=False + ) + else: + jit_traced_module = torch.jit.trace( + model, example_kwarg_inputs=example_kwargs, strict=False + ) + + cls.cache[key] = jit_traced_module + + return cls.cache[key] + + def export(model, example_inputs): from torch.export.dynamic_shapes import _combine_args, _tree_map_with_path @@ -1527,6 +1551,16 @@ def opt_aot_inductor(_, example_inputs, collect_outputs=False): return opt_aot_inductor +def torchscript_jit_trace(model, example_inputs): + optimized = JitTracedCache.load(model, example_inputs) + + def opt_jit_trace(_, example_inputs, collect_outputs=False): + example_args, example_kwargs = _normalize_bench_inputs(example_inputs) + return optimized(*example_args, **example_kwargs) + + return opt_jit_trace + + def download_retry_decorator(download_fn): """ Decorator function for applying retry logic to a download function. 
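A minimal sketch of what the new --torchscript-jit-trace benchmark mode above does: trace the model once with strict=False, cache the traced module (JitTracedCache keys it by a weakref to the model), and replay the trace for timing. The module and input shapes below are made up for illustration; only torch is assumed.

```python
import torch

class TinyModel(torch.nn.Module):  # hypothetical stand-in for a benchmark model
    def forward(self, x):
        return torch.relu(x) + 1

model = TinyModel().eval()
example_args = (torch.randn(8, 16),)

# Trace once with strict=False, as JitTracedCache does; positional inputs go
# through example_inputs, keyword-only inputs would use example_kwarg_inputs.
traced = torch.jit.trace(model, example_inputs=example_args, strict=False)

# Replaying the cached trace is what opt_jit_trace does inside the harness.
with torch.no_grad():
    out = traced(*example_args)
```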
@@ -2277,6 +2311,7 @@ def record_status(accuracy_status, dynamo_start_stats): self.args.export or self.args.export_aot_inductor or self.args.export_nativert + or self.args.torchscript_jit_trace ): # apply export on module directly # no need for n iterations @@ -2673,7 +2708,11 @@ def warmup(fn, model, example_inputs, mode, niters=5): niters=1, ) - if self.args.export_aot_inductor or self.args.export_nativert: + if ( + self.args.export_aot_inductor + or self.args.export_nativert + or self.args.torchscript_jit_trace + ): optimized_model_iter_fn = optimize_ctx else: optimized_model_iter_fn = optimize_ctx(self.model_iter_fn) @@ -3431,6 +3470,11 @@ def get_example_inputs(self): action="store_true", help="Measure pass rate with Export+NativeRT", ) + group.add_argument( + "--torchscript-jit-trace", + action="store_true", + help="Measure pass rate with TorchScript jit.trace", + ) group.add_argument( "--xla", action="store_true", help="Compare TorchXLA to eager PyTorch" ) @@ -3876,6 +3920,10 @@ def run(runner, args, original_dir=None): optimize_ctx = export_nativert experiment = speedup_experiment output_filename = "export_nativert.csv" + elif args.torchscript_jit_trace: + optimize_ctx = torchscript_jit_trace + experiment = speedup_experiment + output_filename = "torchscript_jit_trace.csv" elif args.xla: (dev,) = args.devices os.environ["PJRT_DEVICE"] = {"cuda": "GPU", "cpu": "CPU"}[dev] diff --git a/buckbuild.bzl b/buckbuild.bzl index 09a515584d97c..c5608f53ffeae 100644 --- a/buckbuild.bzl +++ b/buckbuild.bzl @@ -824,9 +824,13 @@ def get_pt_operator_registry_dict( apple_sdks = kwargs.get("apple_sdks"), ) + # Extract existing linker_flags from kwargs and combine with default flags + existing_linker_flags = kwargs.pop("linker_flags", []) + combined_linker_flags = get_no_as_needed_linker_flag() + existing_linker_flags + return dict( srcs = code_gen_files["srcs"], - linker_flags = get_no_as_needed_linker_flag(), + linker_flags = combined_linker_flags, # @lint-ignore BUCKLINT link_whole link_whole = True, soname = "libtorch-code-gen.$(ext)", @@ -1144,6 +1148,9 @@ def define_buck_targets( "--replace", "@AT_KLEIDIAI_ENABLED@", "0", + "--replace", + "@AT_USE_EIGEN_SPARSE@", + "0", ]), outs = { "Config.h": ["Config.h"], diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index fcd7b4b4b31da..972181327b1f6 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -643,47 +643,43 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { } } - // From https://stackoverflow.com/a/3057522/23845 - // TODO: does C++14 have a stdlib template for this? 
- template - struct identity { - typedef T type; - }; - template ArrayRef generic_sizes() { - return _generic_sizes(identity()); - } + static_assert( + std::is_same_v || std::is_same_v, + "Only supports int64_t and c10::SymInt."); - ArrayRef _generic_sizes(identity) { - return sizes(); - } - ArrayRef _generic_sizes(identity) { - return sym_sizes(); + if constexpr (std::is_same_v) { + return sizes(); + } else { + return sym_sizes(); + } } template ArrayRef generic_strides() { - return _generic_strides(identity()); - } + static_assert( + std::is_same_v || std::is_same_v, + "Only supports int64_t and c10::SymInt."); - ArrayRef _generic_strides(identity) { - return strides(); - } - ArrayRef _generic_strides(identity) { - return sym_strides(); + if constexpr (std::is_same_v) { + return strides(); + } else { + return sym_strides(); + } } template T generic_storage_offset() { - return _generic_storage_offset(identity()); - } + static_assert( + std::is_same_v || std::is_same_v, + "Only supports int64_t and c10::SymInt."); - int64_t _generic_storage_offset(identity) { - return storage_offset(); - } - c10::SymInt _generic_storage_offset(identity) { - return sym_storage_offset(); + if constexpr (std::is_same_v) { + return storage_offset(); + } else { + return sym_storage_offset(); + } } /** diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index 4c23e636f4307..3a06e0b5c9632 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -4153,11 +4153,8 @@ std::atomic MemPool::uuid_{1}; MemPool::MemPool( CUDACachingAllocator::CUDAAllocator* allocator, bool is_user_created, - bool use_on_oom, - bool symmetric) - : allocator_(allocator), - is_user_created_(is_user_created), - symmetric_(symmetric) { + bool use_on_oom) + : allocator_(allocator), is_user_created_(is_user_created) { if (is_user_created_) { id_ = {0, uid_++}; } else { @@ -4180,10 +4177,6 @@ MempoolId_t MemPool::id() { return id_; } -bool MemPool::is_symmetric() { - return symmetric_; -} - CUDACachingAllocator::CUDAAllocator* MemPool::allocator() { return allocator_; } diff --git a/c10/cuda/CUDACachingAllocator.h b/c10/cuda/CUDACachingAllocator.h index 75a2d4c8e481b..bd8f47a312529 100644 --- a/c10/cuda/CUDACachingAllocator.h +++ b/c10/cuda/CUDACachingAllocator.h @@ -538,8 +538,7 @@ struct C10_CUDA_API MemPool { MemPool( CUDACachingAllocator::CUDAAllocator* allocator = nullptr, bool is_user_created = true, - bool use_on_oom = false, - bool symmetric = false); + bool use_on_oom = false); MemPool(const MemPool&) = delete; MemPool(MemPool&&) = default; MemPool& operator=(const MemPool&) = delete; @@ -547,7 +546,6 @@ struct C10_CUDA_API MemPool { ~MemPool(); MempoolId_t id(); - bool is_symmetric(); CUDACachingAllocator::CUDAAllocator* allocator(); int use_count(); c10::DeviceIndex device(); @@ -559,7 +557,6 @@ struct C10_CUDA_API MemPool { CUDACachingAllocator::CUDAAllocator* allocator_; bool is_user_created_; MempoolId_t id_; - bool symmetric_; c10::DeviceIndex device_; }; diff --git a/docs/source/backends.md b/docs/source/backends.md index 3e6cdc9697bf0..71f977de64195 100644 --- a/docs/source/backends.md +++ b/docs/source/backends.md @@ -169,6 +169,10 @@ These backends include: .. autofunction:: torch.backends.cuda.sdp_kernel ``` +```{eval-rst} +.. 
autofunction:: torch.backends.cuda.is_ck_sdpa_available +``` + ## torch.backends.cudnn ```{eval-rst} diff --git a/docs/source/conf.py b/docs/source/conf.py index 4f47652e88d2d..9b04d22c087df 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1221,6 +1221,9 @@ "reduce_typed_storage_child", "storage_from_cache", # torch.multiprocessing.spawn + # Added docstring for this but I think we need to go through + # and add the entire torch.multiprocessing.spawn module to a .rst... + "should_use_parallel_start", "start_processes", # torch.nn.functional "adaptive_max_pool1d_with_indices", # documented as adaptive_max_pool1d diff --git a/docs/source/distributed.tensor.md b/docs/source/distributed.tensor.md index 64f2f02c81077..cb12eb195c02c 100644 --- a/docs/source/distributed.tensor.md +++ b/docs/source/distributed.tensor.md @@ -179,6 +179,18 @@ specifying the {class}`DeviceMesh` and {class}`Placement` for the {class}`DTenso ``` + +### Random Operations + +DTensor provides distributed RNG functionality so that random operations on sharded tensors produce unique values per shard, while random operations on replicated tensors produce identical values on every rank. The system requires that all participating +ranks (e.g. SPMD ranks) start from the same generator state before each DTensor random operation; +if they do, it guarantees that they all end up in the same state after each DTensor random operation completes. No communication is performed during random operations to synchronize RNG states. + +Operators that accept a `generator` kwarg use the user-passed generator if one is provided, and the default generator for the device otherwise. Whichever generator is used, it is advanced after the DTensor operation. It is valid to use the same generator for both DTensor and non-DTensor operations, but care must be taken that the non-DTensor operations advance the generator state equally on all ranks. + +When using DTensor together with Pipeline Parallelism, ranks in different pipeline stages should use distinct seeds, and ranks within the same pipeline stage should use the same seed. + +DTensor's RNG infrastructure is based on the Philox RNG algorithm and supports any Philox-based backend (CUDA and other CUDA-like devices), but it does not yet support the CPU backend. + ## Debugging ```{eval-rst} diff --git a/docs/source/onnx.md b/docs/source/onnx.md index 06b049ec39bcc..b0ed78dbe69b8 100644 --- a/docs/source/onnx.md +++ b/docs/source/onnx.md @@ -84,8 +84,6 @@ also be interested in reading our [development wiki](https://github.com/pytorch/ :noindex: .. autofunction:: is_in_onnx_export :noindex: -.. autofunction:: enable_fake_mode - :noindex: ``` ### Classes diff --git a/docs/source/onnx_export.md b/docs/source/onnx_export.md index 029952aa4e995..0adfec359d0b8 100644 --- a/docs/source/onnx_export.md +++ b/docs/source/onnx_export.md @@ -245,5 +245,4 @@ Each initialized value, input, output has the following metadata: .. autofunction:: torch.onnx.is_in_onnx_export .. autoclass:: torch.onnx.OnnxExporterError :members: -..
autofunction:: torch.onnx.enable_fake_mode ``` diff --git a/docs/source/optim.md b/docs/source/optim.md index 38587705ed216..8c3174c76fb29 100644 --- a/docs/source/optim.md +++ b/docs/source/optim.md @@ -165,6 +165,7 @@ for input, target in dataset: Adamax ASGD LBFGS + Muon NAdam RAdam RMSprop @@ -210,6 +211,7 @@ Below is a table showing the available and default implementations of each algor :class:`Adamax`;foreach;yes;no :class:`ASGD`;foreach;yes;no :class:`LBFGS`;for-loop;no;no + :class:`Muon`;for-loop;no;no :class:`NAdam`;foreach;yes;no :class:`RAdam`;foreach;yes;no :class:`RMSprop`;foreach;yes;no @@ -233,6 +235,7 @@ Below table is showing the stability status for fused implementations: :class:`Adamax`;unsupported;unsupported;unsupported :class:`ASGD`;unsupported;unsupported;unsupported :class:`LBFGS`;unsupported;unsupported;unsupported + :class:`Muon`;unsupported;unsupported;unsupported :class:`NAdam`;unsupported;unsupported;unsupported :class:`RAdam`;unsupported;unsupported;unsupported :class:`RMSprop`;unsupported;unsupported;unsupported diff --git a/pyrefly.toml b/pyrefly.toml index e063747349771..6b94aeb5c1ca5 100644 --- a/pyrefly.toml +++ b/pyrefly.toml @@ -1,4 +1,4 @@ -project_includes = [ +project-includes = [ "torch", "caffe2", "test/test_bundled_images.py", @@ -7,12 +7,11 @@ project_includes = [ "test/test_datapipe.py", "test/test_futures.py", "test/test_numpy_interop.py", - "test/test_torch.py", "test/test_type_hints.py", "test/test_type_info.py", "test/test_utils.py", ] -project_excludes = [ +project-excludes = [ "torch/include/**", "torch/csrc/**", "torch/distributed/elastic/agent/server/api.py", @@ -27,7 +26,7 @@ project_excludes = [ "*/__pycache__/**", "*/.*", ] -replace_imports_with_any = [ +ignore-missing-imports = [ "torch._C._jit_tree_views.*", "torch.for_onnx.onnx.*", "torch.ao.quantization.experimental.apot_utils.*", @@ -85,4 +84,16 @@ replace_imports_with_any = [ "redis.*" ] -untyped_def_behavior = "check-and-infer-return-any" \ No newline at end of file +untyped_def_behavior = "check-and-infer-return-any" + +# Shut off noisy errors +errors.implicit-import = false + +# We exclude test_torch.py because it is full of errors, but most functions lack type signatures, +# and mypy.ini specifies `check_untyped_defs = False` for this file. +# If you check even the unannotated stuff mypy produces 322 errors. +# "test/test_torch.py", +# Uncomment this file to check +# [[tool.pyrefly.sub-config]] +# matches = "test/test_torch.py" +# untyped-def-behavior = "skip-and-infer-return-any" diff --git a/test/cpp/c10d/ProcessGroupNCCLErrorsTest.cpp b/test/cpp/c10d/ProcessGroupNCCLErrorsTest.cpp index b038db2eaabac..0831958da761d 100644 --- a/test/cpp/c10d/ProcessGroupNCCLErrorsTest.cpp +++ b/test/cpp/c10d/ProcessGroupNCCLErrorsTest.cpp @@ -386,7 +386,7 @@ TEST_F(ProcessGroupNCCLErrorsTest, testNCCLErrorsNoHeartbeat) { ASSERT_TRUE( setenv(c10d::TORCH_NCCL_ENABLE_MONITORING[0].c_str(), "1", 1) == 0); auto tempFilename = c10::str( - std::filesystem::temp_directory_path().string(), "/nccl_trace_rank_"); + std::filesystem::temp_directory_path().string(), "/comm_lib_trace_rank_"); ASSERT_TRUE( setenv("TORCH_NCCL_DEBUG_INFO_TEMP_FILE", tempFilename.c_str(), 1) == 0); // Enable nccl flight recorder. @@ -401,7 +401,7 @@ TEST_F(ProcessGroupNCCLErrorsTest, testNCCLErrorsNoHeartbeat) { // The only difference is that we are storing traces also in memory for // validation. 
std::string fileNamePrefix = c10d::getCvarString( - {"TORCH_NCCL_DEBUG_INFO_TEMP_FILE"}, "/tmp/nccl_trace_rank_"); + {"TORCH_NCCL_DEBUG_INFO_TEMP_FILE"}, "/tmp/comm_lib_trace_rank_"); std::unique_ptr wrterForTestPtr = std::make_unique(fileNamePrefix); std::vector& traces = wrterForTestPtr->getTraces(); diff --git a/test/cpp/nativert/test_layout_planner.cpp b/test/cpp/nativert/test_layout_planner.cpp new file mode 100644 index 0000000000000..060bc93918871 --- /dev/null +++ b/test/cpp/nativert/test_layout_planner.cpp @@ -0,0 +1,498 @@ +#include + +#include + +#define LayoutPlannerTests_TEST_FRIENDS \ + friend class LayoutPlannerCtorTests; \ + FRIEND_TEST(LayoutPlannerCtorTests, TestConstruct); \ + FRIEND_TEST(LayoutPlannerCtorTests, TestConstructSymbolicShape); \ + FRIEND_TEST(LayoutPlannerCtorTests, TestConstructNoMetadata); \ + FRIEND_TEST(LayoutPlannerCtorTests, TestConstructPlanWithOverlap); \ + FRIEND_TEST(LayoutPlannerCtorTests, TestConstructPlanNoOverlap); \ + FRIEND_TEST(LayoutPlannerCtorTests, TestConstructNoOutVariant); \ + FRIEND_TEST(LayoutPlannerCtorTests, TestConstructOutputAlias); \ + FRIEND_TEST( \ + LayoutPlannerCtorTests, TestConstructPlanWithMaybeAliasingToCopy); \ + FRIEND_TEST(LayoutPlannerCtorTests, TestConstructListPackNoUnpack); \ + FRIEND_TEST(LayoutPlannerCtorTests, TestConstructTensorList); + +#include // @manual + +#include // @manual +#include // @manual +#include // @manual +#include // @manual +#include // @manual +#include // @manual +#include // @manual + +using namespace ::testing; + +namespace torch::nativert /* must be same as namespace that includes TEST_FRIEND + declarations */ +{ + +class LayoutPlannerCtorTests : public testing::Test { + public: + void SetUp() override { + // register static dispatch kernel handler + register_kernel_handlers(); + } + void TearDown() override { + executor_config.reset(); + graph.reset(); + executor.reset(); + } + + void createPlannerForModel( + const std::string& model, + const ExecutorConfig& cfg = {}, + const std::unordered_map& + tensorMeta = {}) { + executor_config = std::make_unique(cfg); + + graph = stringToGraph(model); + + if (!tensorMeta.empty()) { + graph->setTensorValuesMeta(tensorMeta); + } + + auto kernels = KernelFactory().initializeNodeKernels( + *graph, nullptr, *executor_config, nullptr); + + auto kernelSchemas = Executor::getKernelSchemas(kernels.nodeKernels); + + planner = std::make_unique( + *graph, + kernelSchemas, + ExecutionFrame::getPersistentValueMask(*graph), + executor_config->layoutPlannerSettings); + + frame = std::make_unique( + *graph, Weights(graph.get()), *executor_config, planner.get()); + + executor = std::make_unique( + *graph, std::move(kernels.nodeKernels), *executor_config); + } + + torch::_export::TensorMeta createSymbolicTensorMeta( + const std::vector& dims, + std::string device = "cpu", + torch::_export::ScalarType dtype = torch::_export::ScalarType::FLOAT) { + torch::_export::TensorMeta out_meta; + + torch::_export::Device d; + d.set_type(std::move(device)); + out_meta.set_device(d); + + std::vector symvec; + for (size_t i = 0; i < dims.size(); ++i) { + torch::_export::SymInt symint; + torch::_export::SymExpr symexpr; + symexpr.set_expr_str(std::string("s") + std::to_string(i)); + symint.set_as_expr(symexpr); + symvec.push_back(symint); + } + + out_meta.set_sizes(symvec); + out_meta.set_dtype(dtype); + out_meta.set_layout(torch::_export::Layout::Strided); + + { + torch::_export::SymInt i; + i.set_as_int(0); + out_meta.set_storage_offset(i); + } + + return out_meta; + } + + 
torch::_export::TensorMeta createTensorMeta( + const std::vector& dims, + std::string device = "cpu", + torch::_export::ScalarType dtype = torch::_export::ScalarType::FLOAT) { + torch::_export::TensorMeta out_meta; + + torch::_export::Device d; + d.set_type(std::move(device)); + out_meta.set_device(d); + + std::vector symvec; + for (const auto dim : dims) { + torch::_export::SymInt symint; + symint.set_as_int(dim); + symvec.push_back(symint); + } + + out_meta.set_sizes(symvec); + out_meta.set_dtype(dtype); + out_meta.set_layout(torch::_export::Layout::Strided); + + { + torch::_export::SymInt i; + i.set_as_int(0); + out_meta.set_storage_offset(i); + } + + return out_meta; + } + + protected: + std::unique_ptr graph; + std::unique_ptr frame; + std::unique_ptr executor; + std::unique_ptr planner; + std::unique_ptr executor_config; +}; + +namespace { +ExecutorConfig create_enabled_executor_config() { + ExecutorConfig cfg; + cfg.enableStaticCPUKernels = true; + cfg.layoutPlannerSettings = + LayoutPlannerSettings() + .setAlgorithmType(LayoutPlannerAlgorithmType::GreedyBySize) + .setEnabled(true) + .setLayoutManagerSettings( + LayoutManagerSettings().setDeallocateBetweenRequests(false)); + return cfg; +}; +} // namespace + +TEST_F(LayoutPlannerCtorTests, TestConstructOutputAlias) { + auto model = R"( + graph(%y0, %y1): + %out_t = torch.ops.aten.matmul.default(self=%y0, other=%y1) + return (%out_t))"; + + createPlannerForModel(model, create_enabled_executor_config()); + // no outputs + EXPECT_EQ(planner->get_planned_values().size(), 0); +} + +TEST_F(LayoutPlannerCtorTests, TestConstructNoOutVariant) { + std::unordered_map meta = { + {"out_t", createTensorMeta({10, 10, 10})}}; + + auto model = R"( + graph(%y0, %y1): + %out_t = torch.ops.aten.matmul.default(self=%y0, other=%y1) + %res = torch.ops.aten.clone.default(self=%out_t, memory_format=None) + return (%res))"; + + auto executor_config = create_enabled_executor_config(); + executor_config.enableStaticCPUKernels = false; + + createPlannerForModel(model, executor_config, meta); + // no out variant (static dispatch disabled) + EXPECT_EQ(planner->get_planned_values().size(), 0); +} + +TEST_F(LayoutPlannerCtorTests, TestConstructTensorList) { + auto model = R"( + graph(%y0, %y1): + %out_t0 = torch.ops.aten.matmul.default(self=%y0, other=%y1) + %out_t1 = torch.ops.aten.matmul.default(self=%y0, other=%y1) + + %l[] = prim.ListPack(l0=%out_t0, l1=%out_t1) + %x0, %x1 = prim.ListUnpack(self=%l) + + %res0 = torch.ops.aten.clone.default(self=%x0, memory_format=None) + %res1 = torch.ops.aten.clone.default(self=%x1, memory_format=None) + return (%res0, %res1))"; + + createPlannerForModel(model, create_enabled_executor_config(), {}); + + EXPECT_EQ(planner->get_planned_values().size(), 2); + + auto& out_t0_lifetime = planner->planned_allocation_specs_[0].lifetime; + auto& out_t1_lifetime = planner->planned_allocation_specs_[1].lifetime; + + EXPECT_EQ( + std::abs( + static_cast(out_t0_lifetime.start) - + static_cast(out_t1_lifetime.start)), + 1); + EXPECT_EQ( + std::abs( + static_cast(out_t0_lifetime.end) - + static_cast(out_t1_lifetime.end)), + 1); +} + +TEST_F(LayoutPlannerCtorTests, TestConstructListPackNoUnpack) { + auto model = R"(graph(%weight1, %weight2): +%weight1_plannable = torch.ops.aten.clone.default(self=%weight1, memory_format=None) +%weights_list[] = prim.ListPack(l0=%weight1_plannable, l1=%weight2) +%weights_cat = torch.ops.aten.cat.default(tensors=%weights_list, dim=0) +return (%weights_cat) +)"; + + createPlannerForModel(model, 
create_enabled_executor_config(), {}); + + auto& weight1_plannable_lifetime = + planner->planned_allocation_specs_[0].lifetime; + EXPECT_EQ(weight1_plannable_lifetime.start, 1); + EXPECT_EQ(weight1_plannable_lifetime.end, 3); +} + +TEST_F(LayoutPlannerCtorTests, TestConstructReturnTensorListValues) { + auto model = R"( + graph(%y0, %y1): + %out_t0 = torch.ops.aten.matmul.default(self=%y0, other=%y1) + %out_t1 = torch.ops.aten.matmul.default(self=%y0, other=%y1) + + %l[] = prim.ListPack(l0=%out_t0, l1=%out_t1) + %x0, %x1 = prim.ListUnpack(self=%l) + return (%x0, %x1))"; + createPlannerForModel(model, create_enabled_executor_config(), {}); + + EXPECT_EQ(planner->get_planned_values().size(), 0); +} + +TEST_F(LayoutPlannerCtorTests, TestConstructInputTensorList) { + auto model = R"( + graph(%y0, %y1): + %l[] = prim.ListPack(l0=%y0, l1=%y1) + %x0, %x1 = prim.ListUnpack(self=%l) + + %res0 = torch.ops.aten.clone.default(self=%x0, memory_format=None) + %res1 = torch.ops.aten.clone.default(self=%x1, memory_format=None) + return (%res0, %res1))"; + createPlannerForModel(model, create_enabled_executor_config(), {}); + + EXPECT_EQ(planner->get_planned_values().size(), 0); +} + +TEST_F(LayoutPlannerCtorTests, TestConstructReturnTensorList) { + auto model = R"( + graph(%y0, %y1): + %y0_clone = torch.ops.aten.clone.default(self=%y0, memory_format=None) + %y1_clone = torch.ops.aten.clone.default(self=%y1, memory_format=None) + + %l[] = prim.ListPack(l0=%y0_clone, l1=%y1_clone) + return (%l))"; + createPlannerForModel(model, create_enabled_executor_config(), {}); + + EXPECT_EQ(planner->get_planned_values().size(), 0); +} + +TEST_F(LayoutPlannerCtorTests, TestConstructUnsupportedDevice) { + std::unordered_map meta = { + {"out_t", createTensorMeta({10, 10, 10})}}; + + { + torch::_export::Device d; + d.set_type("cuda"); + meta["out_t"].set_device(std::move(d)); + } + + auto model = R"( + graph(%y0, %y1): + %out_t = torch.ops.aten.matmul.default(self=%y0, other=%y1) + %res = torch.ops.aten.clone.default(self=%out_t, memory_format=None) + return (%res))"; + createPlannerForModel(model, create_enabled_executor_config(), meta); + + // not cpu + EXPECT_EQ(planner->get_planned_values().size(), 0); +} + +TEST_F(LayoutPlannerCtorTests, TestConstructNoMetadata) { + auto model = R"( + graph(%y0, %y1): + %out_t = torch.ops.aten.matmul.default(self=%y0, other=%y1) + %res = torch.ops.aten.clone.default(self=%out_t, memory_format=None) + return (%res))"; + + createPlannerForModel(model, create_enabled_executor_config()); + // no metadata + + planner->create_plan(); + EXPECT_EQ(planner->planned_allocation_specs_.size(), 1); + EXPECT_EQ(planner->get_planned_values().size(), 1); + auto& spec = planner->planned_allocation_specs_[0]; + EXPECT_EQ(spec.size, 0); + EXPECT_EQ(spec.lifetime.start, 1); + EXPECT_EQ(spec.lifetime.end, 2); +} + +TEST_F(LayoutPlannerCtorTests, TestConstructSymbolicShape) { + std::unordered_map meta = { + {"out_t", createSymbolicTensorMeta({10, 10, 10})}}; + + auto model = R"( + graph(%y0, %y1): + %out_t = torch.ops.aten.matmul.default(self=%y0, other=%y1) + %res = torch.ops.aten.clone.default(self=%out_t, memory_format=None) + return (%res))"; + + createPlannerForModel(model, create_enabled_executor_config(), meta); + EXPECT_EQ(planner->get_planned_values().size(), 1); + EXPECT_EQ(planner->planned_allocation_specs_.size(), 1); + EXPECT_EQ( + planner->planned_allocation_specs_[0].size, + 0 /* haven't populated IValues yet */); +} + +TEST_F(LayoutPlannerCtorTests, TestConstruct) { + std::unordered_map meta 
= { + {"out_t", createTensorMeta({10, 10, 10})}}; + + auto model = R"( + graph(%y0, %y1): + %out_t = torch.ops.aten.matmul.default(self=%y0, other=%y1) + %res = torch.ops.aten.clone.default(self=%out_t, memory_format=None) + return (%res))"; + createPlannerForModel(model, create_enabled_executor_config(), meta); + + auto& specs = planner->planned_allocation_specs_; + + EXPECT_EQ(specs.size(), 1); + EXPECT_EQ(specs[0].lifetime.start, 1); + EXPECT_EQ(specs[0].lifetime.end, 2); + + c10::IValue tensor = c10::IValue(torch::rand({10, 10, 10})); + + executor->execute(*frame, {tensor, tensor}); + + // 10 * 10 * 10 * 4 rounded up to the nearest multiple of 64 ==> 64 * 63 = + // 4032 + auto aligned_size = LayoutManager::get_aligned_nbytes( + 10 * 10 * 10 * at::elementSize(at::ScalarType::Float)); + EXPECT_EQ(specs[0].size, aligned_size); + EXPECT_EQ(specs[0].size, 4032); + + planner->with_plan([&](const LayoutPlan& plan) { + EXPECT_EQ(plan.total_size, 4032); + EXPECT_EQ(plan.allocations.size(), 1); + EXPECT_EQ(plan.allocations[0].size, 4032); + EXPECT_EQ(plan.allocations[0].offset, 0); + return; + }); +} + +TEST_F(LayoutPlannerCtorTests, TestConstructPlanNoOverlap) { + std::unordered_map meta = { + {"out_t", createTensorMeta({10, 10, 10})}, + {"out2_t", createTensorMeta({10, 10, 10})}}; + + auto model = R"( + graph(%y0, %y1): + %out1_t = torch.ops.aten.matmul.default(self=%y0, other=%y1) + %res1 = torch.ops.aten.clone.default(self=%out1_t, memory_format=None) + %out2_t = torch.ops.aten.matmul.default(self=%y0, other=%y1) + %res2 = torch.ops.aten.clone.default(self=%out2_t, memory_format=None) + return (%res1, %res2))"; + createPlannerForModel(model, create_enabled_executor_config(), meta); + + c10::IValue tensor = c10::IValue(torch::rand({10, 10, 10})); + + executor->execute(*frame, {tensor, tensor}); + + planner->with_plan([&](const LayoutPlan& plan) { + EXPECT_EQ(plan.total_size, 4032); + EXPECT_EQ(plan.allocations.size(), 2); + EXPECT_EQ(plan.allocations[0].size, 4032); + EXPECT_EQ(plan.allocations[0].offset, 0); + EXPECT_EQ(plan.allocations[1].size, 4032); + EXPECT_EQ(plan.allocations[1].offset, 0); + return; + }); +} + +TEST_F(LayoutPlannerCtorTests, TestConstructPlanWithOverlap) { + std::unordered_map meta = { + {"out_t", createTensorMeta({10, 10, 10})}, + {"out2_t", createTensorMeta({10, 10, 10})}}; + + auto model = R"( + graph(%y0, %y1): + %out_t = torch.ops.aten.matmul.default(self=%y0, other=%y1) + %out2_t = torch.ops.aten.matmul.default(self=%y0, other=%y1) + %res = torch.ops.aten.clone.default(self=%out2_t, memory_format=None) + %res1 = torch.ops.aten.clone.default(self=%out_t, memory_format=None) + return (%res, %res1))"; + createPlannerForModel(model, create_enabled_executor_config(), meta); + + c10::IValue tensor = c10::IValue(torch::rand({10, 10, 10})); + + executor->execute(*frame, {tensor, tensor}); + + planner->with_plan([&](const LayoutPlan& plan) { + EXPECT_EQ(plan.total_size, 8064); + EXPECT_EQ(plan.allocations.size(), 2); + EXPECT_EQ(plan.allocations[0].size, 4032); + EXPECT_EQ(plan.allocations[0].offset, 0); + EXPECT_EQ(plan.allocations[1].offset, 4032); + EXPECT_EQ(plan.allocations[1].size, 4032); + }); +} + +TEST_F(LayoutPlannerCtorTests, TestConstructPlanWithMaybeAliasingToCopy) { + auto model = R"(graph(%input): + %i1 = torch.ops.aten._to_copy.default(self=%input, dtype=ScalarType::FLOAT, memory_format=None) + %i2 = torch.ops.aten._to_copy.default(self=%input, dtype=ScalarType::FLOAT, memory_format=None) + %out_t = torch.ops.aten.matmul.default(self=%i1, other=%i2) + 
return (%out_t))"; + + createPlannerForModel(model, create_enabled_executor_config()); + + c10::IValue tensor = c10::IValue(torch::rand({10, 10, 10})); + + executor->execute(*frame, {tensor}); + + // i1 and i2 could alias input, so we should be safe and not plan them + planner->with_plan([&](const LayoutPlan& plan) { + EXPECT_EQ(plan.total_size, 0); + EXPECT_EQ(plan.allocations.size(), 0); + return; + }); +} + +TEST_F(LayoutPlannerCtorTests, TestCreateMultiplePlanners) { + auto executor_config = create_enabled_executor_config(); + + auto model = R"( + graph(%y0, %y1): + %out_t = torch.ops.aten.matmul.default(self=%y0, other=%y1) + %res = torch.ops.aten.clone.default(self=%out_t, memory_format=None) + return (%res))"; + + graph = stringToGraph(model); + + std::vector, + std::vector>>> + planners; + for ([[maybe_unused]] const auto _ : c10::irange(2)) { + auto kernels = KernelFactory().initializeNodeKernels( + *graph, nullptr, executor_config, nullptr); + auto kernelSchemas = Executor::getKernelSchemas(kernels.nodeKernels); + planners.emplace_back( + std::make_unique( + *graph, + kernelSchemas, + ExecutionFrame::getPersistentValueMask(*graph), + executor_config.layoutPlannerSettings), + std::move(kernels.nodeKernels)); + } + + c10::IValue tensor = c10::IValue(torch::rand({10, 10, 10})); + for (auto& [layout_planner, kernels] : planners) { + ExecutionFrame execution_frame( + *graph, Weights(graph.get()), executor_config, layout_planner.get()); + SerialGraphExecutor graph_executor( + *graph, std::move(kernels), executor_config); + graph_executor.execute(execution_frame, {tensor, tensor}); + layout_planner->with_plan([&](const LayoutPlan& plan) { + EXPECT_EQ(plan.total_size, 4032); + EXPECT_EQ(plan.allocations.size(), 1); + EXPECT_EQ(plan.allocations[0].size, 4032); + EXPECT_EQ(plan.allocations[0].offset, 0); + return; + }); + } +} + +} // namespace torch::nativert diff --git a/test/cpp/nativert/test_static_kernel_ops.cpp b/test/cpp/nativert/test_static_kernel_ops.cpp new file mode 100644 index 0000000000000..fcdac1cd5f174 --- /dev/null +++ b/test/cpp/nativert/test_static_kernel_ops.cpp @@ -0,0 +1,539 @@ +#include +#include +#include +#include +#include "test/cpp/nativert/static_kernel_test_utils.h" // @manual + +namespace torch::nativert { + +namespace { +std::vector generateArgsForQuantizedEmbeddingBag() { + // Set seed for reproducibility + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution int_dis(0, 15); // num_embeddings - 1 + int num_embeddings = 16; + int embedding_dim = 32; + int num_lengths = 10; + + auto weight = + at::randint(0, 255, {num_embeddings, embedding_dim}).to(at::kByte); + + // Generate random lengths + std::vector np_lengths(num_lengths); + for (auto& length : np_lengths) { + length = int_dis(gen); + } + int total_length = 0; + for (const auto& length : np_lengths) { + total_length += length; + } + // Generate random indices + at::Tensor indices = + torch::empty({total_length}, torch::dtype(torch::kInt32)); + auto indices_accessor = indices.accessor(); + for (int i = 0; i < total_length; ++i) { + indices_accessor[i] = int_dis(gen); + } + // Create lengths tensor + at::Tensor lengths = torch::from_blob( + np_lengths.data(), {num_lengths}, torch::dtype(torch::kInt32)); + // Calculate offsets + at::Tensor offsets = torch::cat( + {torch::zeros({1}, torch::dtype(torch::kInt32)), + torch::cumsum(lengths, 0)}); + offsets = offsets.to(torch::dtype(torch::kInt32)); + + at::Tensor per_sample_weights = at::randn(indices.sizes()); + + std::vector 
args{weight, indices, offsets, per_sample_weights}; + return args; +} + +std::vector generateArgsForEmbeddingBag(bool include_padding_idx) { + torch::Tensor weight = torch::randn({10, 3}, torch::dtype(torch::kFloat32)); + torch::Tensor indices = + torch::randint(0, 10, {20}, torch::dtype(torch::kInt64)); + torch::Tensor offsets = + torch::tensor({0, 5, 10, 15, 20}, torch::dtype(torch::kInt64)); + torch::Tensor per_sample_weights = + torch::rand({20}, torch::dtype(torch::kFloat32)); + // Define the padding_idx + int64_t padding_idx = 1; + // Create a vector of IValues to store the arguments + std::vector args; + args.emplace_back(weight); + args.emplace_back(indices); + args.emplace_back(offsets); + args.emplace_back(per_sample_weights); + if (include_padding_idx) { + args.emplace_back(padding_idx); + } + return args; +} +} // namespace + +TEST(StaticKernelTest, QuantizedEmbeddingBagByteRowwiseOffsets) { + const std::string graph = + R"(graph(%weight, %indices, %offsets, %per_sample_weights): +%out = torch.ops.quantized.embedding_bag_byte_rowwise_offsets.default(weight=%weight, indices=%indices, offsets=%offsets, scale_grad_by_freq=false, mode=0, pruned_weights=false, per_sample_weights=%per_sample_weights, compressed_indices_mapping=None, include_last_offset=true) +%res = torch.ops.aten.clone.default(self=%out, memory_format=None) +return (%res) +)"; + + std::vector args = generateArgsForQuantizedEmbeddingBag(); + + testStaticKernelEquality(graph, args); +} + +TEST(StaticKernelTest, QuantizedEmbeddingBag4BitRowwiseOffsets) { + const std::string graph = + R"(graph(%weight, %indices, %offsets, %per_sample_weights): +%out = torch.ops.quantized.embedding_bag_4bit_rowwise_offsets.default(weight=%weight, indices=%indices, offsets=%offsets, scale_grad_by_freq=false, mode=0, pruned_weights=false, per_sample_weights=%per_sample_weights, compressed_indices_mapping=None, include_last_offset=true) +%res = torch.ops.aten.clone.default(self=%out, memory_format=None) +return (%res) +)"; + std::vector args = generateArgsForQuantizedEmbeddingBag(); + + testStaticKernelEquality(graph, args); +} + +TEST(StaticKernelTest, EmbeddingBag) { + const std::string graph = + R"(graph(%weight, %indices, %offsets, %per_sample_weights): +%out0, %out1, %out2, %out3 = torch.ops.aten.embedding_bag.default(weight=%weight, indices=%indices, offsets=%offsets, scale_grad_by_freq=false, mode=0, sparse=false, per_sample_weights=%per_sample_weights, include_last_offset=true) +%res1 = torch.ops.aten.clone.default(self=%out0, memory_format=None) +%res2 = torch.ops.aten.clone.default(self=%out1, memory_format=None) +%res3 = torch.ops.aten.clone.default(self=%out2, memory_format=None) +%res4 = torch.ops.aten.clone.default(self=%out3, memory_format=None) +return (%res1, %res2, %res3, %res4) +)"; + std::vector args = generateArgsForEmbeddingBag(false); + testStaticKernelEquality(graph, args); + + // Test use_max_indices False + const std::string graph2 = + R"(graph(%weight, %indices, %offsets, %per_sample_weights): +%out0, %out1, %out2, %out3 = torch.ops.aten.embedding_bag.default(weight=%weight, indices=%indices, offsets=%offsets, scale_grad_by_freq=false, mode=0, sparse=false, per_sample_weights=%per_sample_weights, include_last_offset=true) +%res1 = torch.ops.aten.clone.default(self=%out0, memory_format=None) +%res2 = torch.ops.aten.clone.default(self=%out1, memory_format=None) +%res3 = torch.ops.aten.clone.default(self=%out2, memory_format=None) +return (%res1, %res2, %res3, %out2) +)"; + std::vector args2 = 
generateArgsForEmbeddingBag(false); + testStaticKernelEquality(graph2, args2); +} + +TEST(StaticKernelTest, EmbeddingBagPaddingIdx) { + const std::string graph = + R"(graph(%weight, %indices, %offsets, %per_sample_weights, %padding_idx): +%out0, %out1, %out2, %out3 = torch.ops.aten.embedding_bag.padding_idx(weight=%weight, indices=%indices, offsets=%offsets, scale_grad_by_freq=false, mode=0, sparse=false, per_sample_weights=%per_sample_weights, include_last_offset=true, padding_idx=%padding_idx) +%res1 = torch.ops.aten.clone.default(self=%out0, memory_format=None) +%res2 = torch.ops.aten.clone.default(self=%out1, memory_format=None) +%res3 = torch.ops.aten.clone.default(self=%out2, memory_format=None) +%res4 = torch.ops.aten.clone.default(self=%out3, memory_format=None) +return (%res1, %res2, %res3, %res4) +)"; + std::vector args = generateArgsForEmbeddingBag(true); + testStaticKernelEquality(graph, args); + + // Test use_max_indices False + const std::string graph2 = + R"(graph(%weight, %indices, %offsets, %per_sample_weights, %padding_idx): +%out0, %out1, %out2, %out3 = torch.ops.aten.embedding_bag.padding_idx(weight=%weight, indices=%indices, offsets=%offsets, scale_grad_by_freq=false, mode=0, sparse=false, per_sample_weights=%per_sample_weights, include_last_offset=true, padding_idx=%padding_idx) +%res1 = torch.ops.aten.clone.default(self=%out0, memory_format=None) +%res2 = torch.ops.aten.clone.default(self=%out1, memory_format=None) +%res3 = torch.ops.aten.clone.default(self=%out2, memory_format=None) +return (%res1, %res2, %res3, %out2) +)"; + std::vector args2 = generateArgsForEmbeddingBag(true); + testStaticKernelEquality(graph2, args2); +} + +TEST(StaticKernelTest, Aten_ToCopy) { + for (auto& target_dtype : + {"None", + "ScalarType::FLOAT", + "ScalarType::DOUBLE", + "ScalarType::HALF", + "ScalarType::INT", + "ScalarType::LONG"}) { + for (auto& target_memory_format : { + "None", + "MemoryFormat::PreserveFormat", + "MemoryFormat::ContiguousFormat", + }) { + for (auto& input_dtype : + {at::kLong, at::kInt, at::kFloat, at::kDouble, at::kHalf}) { + for (auto& permute_input : {true, false}) { + const std::string graph = fmt::format( + R"(graph(%input): +%out = torch.ops.aten._to_copy.default(self=%input, dtype={}, memory_format={}) +return (%out) +)", + target_dtype, + target_memory_format); + at::Tensor input = + at::randint(0, 128, {8, 8, 8, 8}, at::kLong).to(input_dtype); + if (permute_input) { + input = input.permute({1, 0, 3, 2}); + } + + testStaticKernelEquality(graph, {input}); + } + } + } + } +} + +TEST(StaticKernelTest, Aten_ToCopy_Aliasing) { + const std::string graph = + R"(graph(%input): + %out = torch.ops.aten._to_copy.default(self=%input, dtype=ScalarType::FLOAT, memory_format=None) + return (%out))"; + + at::Tensor input = + at::randint(0, 128, {8, 8, 8, 8}, at::kLong).to(at::kFloat); + + torch::nativert::ExecutorConfig config; + config.enableStaticCPUKernels = true; + SimpleTestModelRunner runner(graph, config); + + // try standard aliasing case + auto output = runner.run({input}); + EXPECT_TRUE(output[0].toTensor().storage().is_alias_of(input.storage())); + EXPECT_EQ(output[0].toTensor().dim(), 4); + EXPECT_EQ(output[0].toTensor().numel(), 8 * 8 * 8 * 8); + output = runner.run({input}); + EXPECT_TRUE(output[0].toTensor().storage().is_alias_of(input.storage())); + EXPECT_EQ(output[0].toTensor().dim(), 4); + EXPECT_EQ(output[0].toTensor().numel(), 8 * 8 * 8 * 8); + + // try swap out input storage between runs + at::Storage original_storage = input.storage(); + 
input.unsafeGetTensorImpl()->set_storage_keep_dtype( + at::randint(0, 128, {8, 8, 8, 8}, at::kLong).to(at::kFloat).storage()); + output = runner.run({input}); + EXPECT_TRUE(output[0].toTensor().storage().is_alias_of(input.storage())); + EXPECT_FALSE(output[0].toTensor().storage().is_alias_of(original_storage)); + EXPECT_EQ(output[0].toTensor().dim(), 4); + EXPECT_EQ(output[0].toTensor().numel(), 8 * 8 * 8 * 8); + + // try to upsize between runs + input.resize_({16, 16, 16, 16, 16}); + output = runner.run({input}); + EXPECT_TRUE(output[0].toTensor().storage().is_alias_of(input.storage())); + EXPECT_EQ(output[0].toTensor().dim(), 5); + EXPECT_EQ(output[0].toTensor().numel(), 16 * 16 * 16 * 16 * 16); + + // try to downsize between runs + input.resize_({4}); + output = runner.run({input}); + EXPECT_TRUE(output[0].toTensor().storage().is_alias_of(input.storage())); + EXPECT_EQ(output[0].toTensor().dim(), 1); + EXPECT_EQ(output[0].toTensor().numel(), 4); + + // try to restride between runs + input.as_strided_({3, 2}, {3, 6}).random_(); + output = runner.run({input}); + EXPECT_TRUE(output[0].toTensor().storage().is_alias_of(input.storage())); + EXPECT_EQ(output[0].toTensor().dim(), 2); + EXPECT_EQ(output[0].toTensor().numel(), 3 * 2); + for (int i = 0; i < 3; i += 1) { + for (int j = 0; j < 2; j += 1) { + EXPECT_EQ( + output[0].toTensor().index({i, j}).item().toFloat(), + input.index({i, j}).item().toFloat()); + } + } +} + +TEST(StaticKernelTest, MulScalar) { + const std::string graph = R"(graph(%in0_t, %in1_t): + %out = torch.ops.aten.mul.Scalar(self=%in0_t, other=%in1_t) + return (%out) + )"; + + std::vector>> test_cases = { + {at::rand({3, 4}), {2.0, -2.0, -2, 2, 0.0, 1e6, 1e-6, NAN, INFINITY}}, + {at::rand({2, 3, 4}), {2.0}}, + {at::rand({3, 4}, at::kFloat), {3.0}}, // fp32 tensor with int scalar + {at::randint(0, 10, {3, 4}, at::kInt), + {2.0}}, // int32 tensor with double scalar + {at::rand({3, 4}, at::kHalf), {2.0}}, // half tensor with float scalar + {at::rand({3, 4}, at::kBFloat16), {2.0}}, // bf16 tensor with float scalar + {at::randint(0, 10, {3, 4}, at::kInt), {2}}, // int tensor with int scalar + {at::randint(0, 10, {3, 4}, at::kLong), + {2}}, // int64 tensor with int64 scalar, + {at::rand({3, 4, 5}, at::kFloat).permute({2, 0, 1}), + {2}}, // int64 strided tensor with int64 scalar + {at::rand({3, 4}, at::kFloat).t(), + {2}}, // int64 strided tensor with int64 scalar + {at::rand({3, 4, 5}, at::kFloat).permute({2, 0, 1}), + {2}}, // int64 strided tensor with int64 scalar + {at::rand({3, 4}, at::kFloat).t(), + {2}}, // int64 strided tensor with int64 scalar + }; + + for (const auto& [tensor, scalars] : test_cases) { + for (double scalar : scalars) { + std::vector inputs = {tensor, scalar}; + testStaticKernelEquality(graph, inputs); + } + } +} + +TEST(StaticKernelTest, SymSizeInt) { + const std::string graph = R"(graph(%self, %dim): + %out = torch.ops.aten.sym_size.int(self=%self, dim=%dim) + return (%out) + )"; + + // Define test cases with different tensors + std::vector test_cases = { + at::rand({3, 4, 5}), // standard 3D tensor + at::rand({0, 4, 5}), // empty tensor + at::rand({1}), // single-element tensor + at::rand({2, 3, 4, 5, 6}), // high-dimensional tensor + at::rand({3, 1, 5}) // tensor with one dimension as 1 + }; + + // Iterate over each test case + for (const auto& tensor : test_cases) { + for (int64_t dim = 0; dim < tensor.dim(); ++dim) { + std::vector inputs = {tensor, dim}; + testStaticKernelEquality(graph, inputs); + } + } +} + +TEST(StaticKernelTest, BucketizeTensor) { 
+ const std::string graph = + R"(graph(%input, %boundaries, %out_int32, %right): +%out = torch.ops.aten.bucketize.Tensor(self=%input, boundaries=%boundaries, out_int32=%out_int32, right=%right) +return (%out) +)"; + + std::vector> test_cases = { + {false, false}, {true, false}, {false, true}, {true, true}}; + + for (const auto& [out_int32, right] : test_cases) { + at::Tensor input = at::tensor({0.1, 2.5, 3.0, 4.5, 5.0}, at::kFloat); + at::Tensor boundaries = at::tensor({1.0, 2.0, 3.0, 4.0}, at::kFloat); + + std::vector args = {input, boundaries, out_int32, right}; + + testStaticKernelEquality(graph, args); + } +} + +TEST(StaticKernelTest, SliceScatter) { + const std::string graph = + R"(graph(%self, %src, %dim, %start, %end, %step): +%out = torch.ops.aten.slice_scatter.default(self=%self, src=%src, dim=%dim, start=%start, end=%end, step=%step) +return (%out) +)"; + + // Create input tensors + at::Tensor self = at::rand({5, 5}, at::kFloat); + at::Tensor src = at::rand({2, 5}, at::kFloat); + int64_t dim = 0; + int64_t start = 1; + int64_t end = 3; + int64_t step = 1; + + // Create a vector of IValues to pass as inputs + std::vector inputs = {self, src, dim, start, end, step}; + + // Run the kernel and verify the output + testStaticKernelEquality(graph, inputs); +} + +TEST(StaticKernelTest, QuantizedEmbeddingBagBytePrepack) { + const std::string graph = R"( + graph(%input): + %weight = torch.ops.quantized.embedding_bag_byte_prepack.default(weight=%input) + %res = torch.ops.aten.clone.default(self=%weight, memory_format=None) + return (%res) + )"; + + at::Tensor args1 = torch::randn({8, 16}, at::ScalarType::Float); + + testStaticKernelEquality(graph, {args1}); +} + +TEST(StaticKernelTest, QuantizedEmbeddingBagByteUnpack) { + const std::string graph = R"( + graph(%input): + %weight = torch.ops.quantized.embedding_bag_byte_prepack.default(weight=%input) + %output = torch.ops.quantized.embedding_bag_byte_unpack.default(weight=%weight) + %res = torch.ops.aten.clone.default(self=%output, memory_format=None) + return (%res) + )"; + + at::Tensor args1 = torch::randn({8, 16}, at::ScalarType::Float); + + testStaticKernelEquality(graph, {args1}); +} + +TEST(StaticKernelTest, QuantizedLinear) { + const std::string graph = R"( + graph(%input, %weights): + %packed_params = torch.ops.quantized.linear_prepack.default(W=%weights, B=None) + %1254 = torch.ops.quantized.linear.default(X=%input, W_prepack=%packed_params, Y_scale_i=1.0, Y_zero_point_i=1) + %res = torch.ops.aten.dequantize.self(self=%1254) + return (%res) + )"; + + at::Tensor input = + at::quantize_per_tensor(torch::randn({3, 2}), 2, 3, torch::kQUInt8); + at::Tensor weight = + at::quantize_per_tensor(torch::randn({3, 2}), 2, 3, torch::kQInt8); + + testStaticKernelEquality(graph, {input, weight}); +} + +TEST(NativeKernelTest, View) { + const std::string source = + R"(graph(%self): +%ret = torch.ops.aten.view.default(self=%self, size=[36]) +%cloned = torch.ops.aten.clone.default(self=%ret, memory_format=None) +return (%cloned) +)"; + + auto self0 = at::rand({6, 6}); + std::vector args{self0}; + testStaticKernelEquality(source, args, true); +} + +TEST(NativeKernelTest, Permute) { + const std::string source = + R"(graph(%self): +%ret = torch.ops.aten.permute.default(self=%self, dims=[1, 0]) +%cloned = torch.ops.aten.clone.default(self=%ret, memory_format=None) +return (%cloned) +)"; + + auto self0 = at::rand({2, 3}); + std::vector args{self0}; + testStaticKernelEquality(source, args, true); +} + +TEST(NativeKernelTest, Reshape) { + const std::string 
source = + R"(graph(%self): +%ret = torch.ops.aten.reshape.default(self=%self, shape=[9, 4]) +%cloned = torch.ops.aten.clone.default(self=%ret, memory_format=None) +return (%cloned) +)"; + + auto self0 = at::rand({3, 3, 4}); + std::vector args{self0}; + testStaticKernelEquality(source, args, true); +} + +TEST(NativeKernelTest, Select) { + static constexpr std::string_view source = + R"(graph(%self): +%ret = torch.ops.aten.select.int(self=%self, dim=1, index=0) +%cloned = torch.ops.aten.clone.default(self=%ret, memory_format=None) +return (%cloned) +)"; + + auto self0 = at::rand({3, 3, 3}); + std::vector args{self0}; + testStaticKernelEquality(source, args, true); +} + +TEST(NativeKernelTest, Slice) { + const std::string graph = + R"(graph(%self): +%ret = torch.ops.aten.slice.Tensor(self=%self, dim=0, start=1, end=3, step=1) +%cloned = torch.ops.aten.clone.default(self=%ret, memory_format=None) +return (%cloned) +)"; + + auto self0 = at::rand({5, 5}); + std::vector args{self0}; + testStaticKernelEquality(graph, args, true); +} + +TEST(NativeKernelTest, Split) { + const std::string graph = + R"(graph(%self): +%ret = torch.ops.aten.split.Tensor(self=%self, split_size=2, dim=0) +return (%ret) +)"; + + auto self0 = at::rand({6, 6}); + std::vector args{self0}; + testStaticKernelEquality(graph, args, true); +} + +TEST(NativeKernelTest, SplitWithSizes) { + const std::string graph = + R"(graph(%self): +%ret = torch.ops.aten.split_with_sizes.default(self=%self, split_sizes=[2, 4], dim=0) +return (%ret) +)"; + + auto self0 = at::rand({6, 6}); + std::vector args{self0}; + testStaticKernelEquality(graph, args, true); +} + +TEST(NativeKernelTest, TensorSplitSections) { + const std::string graph = + R"(graph(%self): +%ret = torch.ops.aten.tensor_split.sections(self=%self, sections=3, dim=0) +return (%ret) +)"; + + auto self0 = at::rand({9, 3}); + std::vector args{self0}; + testStaticKernelEquality(graph, args, true); +} + +TEST(StaticKernelTest, Stack) { + const std::string graph = + R"(graph(%tensors): +%ret = torch.ops.aten.stack.default(tensors=%tensors, dim=0) +return (%ret) +)"; + + auto tensor1 = at::rand({2, 3}); + auto tensor2 = at::rand({2, 3}); + auto tensor3 = at::rand({2, 3}); + std::vector args{ + std::vector{tensor1, tensor2, tensor3}}; + testStaticKernelEquality(graph, args, true); +} + +TEST(NativeKernelTest, Item) { + const std::string graph = + R"(graph(%self): +%ret = torch.ops.aten.item.default(self=%self) +return (%ret) +)"; + + auto self0 = at::tensor({42.0}); + std::vector args{self0}; + testStaticKernelEquality(graph, args, true); +} + +TEST(NativeKernelTest, Narrow) { + const std::string graph = + R"(graph(%self, %dim, %start, %length): +%ret = torch.ops.aten.narrow.default(self=%self, dim=%dim, start=%start, length=%length) +%cloned = torch.ops.aten.clone.default(self=%ret, memory_format=None) +return (%cloned) +)"; + + auto self = at::rand({5, 5}); + int64_t dim = 1; + int64_t start = 1; + int64_t length = 3; + std::vector args{self, dim, start, length}; + testStaticKernelEquality(graph, args, true); +} +} // namespace torch::nativert diff --git a/test/distributed/checkpoint/test_consolidate_hf_safetensors.py b/test/distributed/checkpoint/test_consolidate_hf_safetensors.py index e9608c816a7c8..00d73311e1e75 100644 --- a/test/distributed/checkpoint/test_consolidate_hf_safetensors.py +++ b/test/distributed/checkpoint/test_consolidate_hf_safetensors.py @@ -265,6 +265,38 @@ def test_consolidate_with_two_ranks(self): dist.barrier() + @with_comms + @with_temp_dir + 
@skip_if_lt_x_gpu(2) + def test_consolidate_one_file_with_two_ranks(self): + if importlib.util.find_spec("safetensors") is None: + print("safetensors not installed") + return + import safetensors + + # this is testing the case where one rank has no data to write + # and the other rank has two tensors to write. + # the rank with no work should wait properly for the other rank to finish + checkpoint_dir = self.temp_dir + output_dir = os.path.join(checkpoint_dir, "consolidated") + os.makedirs(output_dir, exist_ok=True) + + self._create_d_tensors() + + global_tensor = torch.arange(16, dtype=torch.float).view(4, 4) + + fqn_to_index_mapping = {"dtensor": 1, "dtensor_col": 1} + consolidate_safetensors_files_on_every_rank( + checkpoint_dir, output_dir, fqn_to_index_mapping=fqn_to_index_mapping + ) + + file1_path = os.path.join(output_dir, "model-00001-of-00001.safetensors") + + loaded_dict = safetensors.torch.load_file(file1_path) + self.assertEqual(loaded_dict.keys(), {"dtensor", "dtensor_col"}) + self.assertTrue(torch.equal(loaded_dict["dtensor"], global_tensor)) + self.assertTrue(torch.equal(loaded_dict["dtensor_col"], global_tensor)) + def test_write_sub_tensor_to_file_optimized(self) -> None: """Test the _write_sub_tensor_to_file_optimized function with various scenarios.""" diff --git a/test/distributed/checkpoint/test_hf_safetensor_e2e.py b/test/distributed/checkpoint/test_hf_safetensor_e2e.py index 92f9b97237064..40558175569c9 100644 --- a/test/distributed/checkpoint/test_hf_safetensor_e2e.py +++ b/test/distributed/checkpoint/test_hf_safetensor_e2e.py @@ -117,6 +117,46 @@ def test_load_into_empty_dict(self) -> None: torch.equal(state_dict_to_save[key], state_dict_loaded[key]) ) + @with_temp_dir + def test_load_with_multiple_threads(self) -> None: + if importlib.util.find_spec("safetensors") is None: + print("safetensors not installed") + return + + CHECKPOINT_DIR = self.temp_dir + + state_dict_to_save = MyTestModule().state_dict() + state_dict_to_load = MyTestModule().state_dict() + + # Create a mapping to split tensors across multiple files + # This will force multiple files to be created, enabling multi-threading + fqn_to_index_mapping = {} + for i, fqn in enumerate(state_dict_to_save.keys()): + fqn_to_index_mapping[fqn] = (i % 2) + 1 # Split across 2 files + + # Save using HuggingFaceStorageWriter with multiple files + dist_cp.save( + state_dict=state_dict_to_save, + storage_writer=dist_cp.HuggingFaceStorageWriter( + path=CHECKPOINT_DIR, fqn_to_index_mapping=fqn_to_index_mapping + ), + ) + + dist_cp.load( + state_dict=state_dict_to_load, + storage_reader=dist_cp.HuggingFaceStorageReader( + path=CHECKPOINT_DIR, thread_count=2 + ), + ) + + self.assertEqual( + sorted(state_dict_to_save.keys()), sorted(state_dict_to_load.keys()) + ) + for key in state_dict_to_save.keys(): + self.assertTrue( + torch.equal(state_dict_to_save[key], state_dict_to_load[key]) + ) + class TestDistributedHFSafetensorsConsolidation(DTensorTestBase): @with_comms diff --git a/test/distributed/pipelining/test_schedule.py b/test/distributed/pipelining/test_schedule.py index 598f4260c1f3a..dabf3d78a6f13 100644 --- a/test/distributed/pipelining/test_schedule.py +++ b/test/distributed/pipelining/test_schedule.py @@ -40,7 +40,7 @@ W, ) from torch.distributed.pipelining.stage import _PipelineStageBase, PipelineStage -from torch.testing._internal.common_distributed import requires_nccl +from torch.testing._internal.common_distributed import requires_accelerator_dist_backend from torch.testing._internal.common_utils import ( 
check_leaked_tensors, instantiate_parametrized_tests, @@ -53,6 +53,7 @@ ARTIFACTS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "artifacts") +device = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu" logger = logging.getLogger(__name__) torch.manual_seed(0) @@ -740,7 +741,7 @@ def _dump_csv(pipeline_order_with_comms, filename: str): # print(_format_pipeline_order(simulated_schedule)) self.assertEqual(num_steps, 113) - @requires_nccl() + @requires_accelerator_dist_backend(["nccl", "xccl"]) def test_grad_with_v_schedule(self): """ We have a special case for V schedules where 2 adjacent stages are on the same rank. @@ -760,7 +761,6 @@ def test_grad_with_v_schedule(self): d_hid = 512 batch_size = 256 n_stages = 2 - device = "cuda" full_mod = MultiMLP(d_hid, n_layers=n_stages) full_mod.to(device) @@ -859,7 +859,7 @@ def test_grad_with_v_schedule(self): torch.distributed.destroy_process_group() - @requires_nccl() + @requires_accelerator_dist_backend(["nccl", "xccl"]) def test_grad_with_split_b_w(self): """ Ensure that separate dInput and dWeight computations are correctly executed. @@ -872,7 +872,6 @@ def test_grad_with_split_b_w(self): d_hid = 512 batch_size = 256 n_stages = 1 - device = "cuda" full_mod = MultiMLP(d_hid, n_layers=n_stages) full_mod.to(device) diff --git a/test/distributed/pipelining/test_schedule_multiproc.py b/test/distributed/pipelining/test_schedule_multiproc.py index a845598c9cc94..9ba12c3d69965 100644 --- a/test/distributed/pipelining/test_schedule_multiproc.py +++ b/test/distributed/pipelining/test_schedule_multiproc.py @@ -29,10 +29,9 @@ ) from torch.distributed.pipelining.schedules import _PipelineScheduleRuntime from torch.nn.modules.loss import MSELoss -from torch.testing._internal.common_cuda import TEST_MULTIGPU from torch.testing._internal.common_distributed import ( MultiProcContinuousTest, - requires_nccl, + requires_accelerator_dist_backend, ) from torch.testing._internal.common_utils import ( check_leaked_tensors, @@ -48,7 +47,9 @@ d_hid = 512 batch_size = 64 torch.manual_seed(0) -device_type = "cuda" +device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu" +backend = dist.get_default_backend_for_device(device_type) +TEST_MULTIACCELERATOR = torch.accelerator.device_count() >= 2 @dataclass @@ -205,7 +206,7 @@ class ScheduleTest(MultiProcContinuousTest): @classmethod def backend_str(cls) -> str: # Testing with NCCL backend - return "nccl" + return backend @property def device(self) -> torch.device: @@ -218,8 +219,10 @@ def config(self) -> PipelineTestConfig: world_size=self.world_size, device=self.device, rank=self.rank ) - @requires_nccl() - @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") + @requires_accelerator_dist_backend(["nccl", "xccl"]) + @skip_but_pass_in_sandcastle_if( + not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs" + ) @parametrize("ScheduleClass", [_ScheduleForwardOnly]) def test_forward_only(self, ScheduleClass): mod, mod_ref, x, _, _ = setup_models_and_data(self.config) @@ -250,8 +253,10 @@ def test_forward_only(self, ScheduleClass): x_clone = mod_ref(x_clone) torch.testing.assert_close(x_clone, out) - @requires_nccl() - @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") + @requires_accelerator_dist_backend(["nccl", "xccl"]) + @skip_but_pass_in_sandcastle_if( + not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs" + ) @parametrize( "ScheduleClass", [ @@ -325,8 +330,10 @@ def 
test_eval_inference_mode(self, ScheduleClass): if self.rank == self.world_size - 1: self.assertTrue(len(losses) > 0, "Losses should be computed during eval()") - @requires_nccl() - @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") + @requires_accelerator_dist_backend(["nccl", "xccl"]) + @skip_but_pass_in_sandcastle_if( + not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs" + ) @parametrize("ScheduleClass", [ScheduleGPipe, Schedule1F1B]) def test_multi_iter(self, ScheduleClass): mod, _, x, target, loss_fn = setup_models_and_data(self.config) @@ -346,8 +353,10 @@ def test_multi_iter(self, ScheduleClass): dist.barrier(device_ids=[self.rank]) - @requires_nccl() - @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") + @requires_accelerator_dist_backend(["nccl", "xccl"]) + @skip_but_pass_in_sandcastle_if( + not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs" + ) @parametrize("ScheduleClass", [ScheduleGPipe, Schedule1F1B]) def test_kwargs_with_tracer(self, ScheduleClass): mod = ModelWithKwargs(d_hid, splits=self.world_size) @@ -396,8 +405,10 @@ def test_kwargs_with_tracer(self, ScheduleClass): torch.testing.assert_close(out, ref_out, rtol=1e-2, atol=5e-3) torch.testing.assert_close(pipe_loss, ref_loss) - @requires_nccl() - @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") + @requires_accelerator_dist_backend(["nccl", "xccl"]) + @skip_but_pass_in_sandcastle_if( + not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs" + ) @parametrize("ScheduleClass", [ScheduleGPipe, Schedule1F1B]) def test_grad_with_tracer(self, ScheduleClass): mod, ref_mod, x, target, loss_fn = setup_models_and_data(self.config) @@ -435,8 +446,10 @@ def test_grad_with_tracer(self, ScheduleClass): # Check gradients using helper method check_gradients(self.config, stage_module, ref_mod) - @requires_nccl() - @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") + @requires_accelerator_dist_backend(["nccl", "xccl"]) + @skip_but_pass_in_sandcastle_if( + not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs" + ) @parametrize("ScheduleClass", [ScheduleGPipe, Schedule1F1B]) @parametrize("shape_inference", [True, False]) def test_grad_with_manual(self, ScheduleClass, shape_inference): @@ -490,8 +503,10 @@ def test_grad_with_manual(self, ScheduleClass, shape_inference): # Check gradients using helper method check_gradients(self.config, stage_module, ref_mod) - @requires_nccl() - @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") + @requires_accelerator_dist_backend(["nccl", "xccl"]) + @skip_but_pass_in_sandcastle_if( + not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs" + ) @parametrize( "ScheduleClass", [ @@ -600,8 +615,10 @@ def test_grad_with_manual_interleaved(self, ScheduleClass, use_new_runtime): self.config, stage_modules, ref_mod, submod_names, rtol=5e-3, atol=5e-3 ) - @requires_nccl() - @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") + @requires_accelerator_dist_backend(["nccl", "xccl"]) + @skip_but_pass_in_sandcastle_if( + not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs" + ) @parametrize("ScheduleClass", [ScheduleInterleavedZeroBubble]) def test_schedule_with_weight_update_mlp_e2e(self, ScheduleClass): stages_per_rank = 2 @@ -681,8 +698,10 @@ def dw_runner(): # Check gradients using helper method check_gradients(self.config, stage_modules, ref_mod, submod_names) - @requires_nccl() - 
@skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") + @requires_accelerator_dist_backend(["nccl", "xccl"]) + @skip_but_pass_in_sandcastle_if( + not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs" + ) @parametrize( "schedule_class", [ScheduleZBVZeroBubble, ScheduleDualPipeV], @@ -735,8 +754,10 @@ def test_v_shape_schedules(self, schedule_class, use_new_runtime): # Check gradients using helper method check_gradients(self.config, stage_modules, ref_mod, submod_names) - @requires_nccl() - @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") + @requires_accelerator_dist_backend(["nccl", "xccl"]) + @skip_but_pass_in_sandcastle_if( + not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs" + ) @parametrize( "ScheduleClass", [ScheduleInterleavedZeroBubble, ScheduleInterleaved1F1B], @@ -813,7 +834,7 @@ class CustomSchedulesTest(MultiProcContinuousTest): @classmethod def backend_str(cls) -> str: # Testing with NCCL backend - return "nccl" + return backend @property def device(self) -> torch.device: @@ -826,8 +847,10 @@ def config(self) -> PipelineTestConfig: world_size=self.world_size, device=self.device, rank=self.rank ) - @requires_nccl() - @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") + @requires_accelerator_dist_backend(["nccl", "xccl"]) + @skip_but_pass_in_sandcastle_if( + not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs" + ) @parametrize( "schedule_class", [ScheduleVShaped, ScheduleUnbalanced], @@ -884,8 +907,10 @@ def test_non_symmetric_stage_ids(self, schedule_class, use_new_runtime): # Check gradients using helper method check_gradients(self.config, stage_modules, ref_mod, submod_names) - @requires_nccl() - @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") + @requires_accelerator_dist_backend(["nccl", "xccl"]) + @skip_but_pass_in_sandcastle_if( + not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs" + ) @parametrize("ScheduleClass", [ScheduleWithReorderedB]) def test_pipeline_schedule_runtime_custom_sched(self, ScheduleClass): n_stages = 2 @@ -942,8 +967,10 @@ def test_pipeline_schedule_runtime_custom_sched(self, ScheduleClass): # Check gradients using helper method check_gradients(self.config, stage_modules, ref_mod, submod_names) - @requires_nccl() - @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") + @requires_accelerator_dist_backend(["nccl", "xccl"]) + @skip_but_pass_in_sandcastle_if( + not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs" + ) @parametrize("ScheduleClass", [ScheduleWithW]) def test_schedule_with_native_zero_bubble(self, ScheduleClass): n_stages = ScheduleClass.n_stages diff --git a/test/distributed/pipelining/test_stage.py b/test/distributed/pipelining/test_stage.py index 7c2ab9f2b4b8d..12c8d62037357 100644 --- a/test/distributed/pipelining/test_stage.py +++ b/test/distributed/pipelining/test_stage.py @@ -14,17 +14,15 @@ ScheduleGPipe, ) from torch.distributed.pipelining._utils import PipeliningShapeError -from torch.testing._internal.common_cuda import TEST_MULTIGPU from torch.testing._internal.common_distributed import ( MultiProcContinuousTest, MultiProcessTestCase, - requires_nccl, + requires_accelerator_dist_backend, ) from torch.testing._internal.common_utils import ( instantiate_parametrized_tests, parametrize, run_tests, - skip_but_pass_in_sandcastle, skip_but_pass_in_sandcastle_if, ) from torch.utils._pytree import tree_map_only @@ -34,7 +32,9 @@ batch_size = 256 
chunks = 4 -device_type = "cuda" +device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu" +backend = dist.get_default_backend_for_device(device_type) +TEST_MULTIACCELERATOR = torch.accelerator.device_count() >= 2 torch.manual_seed(0) @@ -67,7 +67,7 @@ class StageTest(MultiProcContinuousTest): @classmethod def backend_str(cls) -> str: # Testing with NCCL backend - return "nccl" + return backend @classmethod def device_type(cls) -> str: @@ -77,8 +77,10 @@ def device_type(cls) -> str: def device(self) -> torch.device: return torch.device(device_type, self.rank) - @requires_nccl() - @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") + @requires_accelerator_dist_backend(["nccl", "xccl"]) + @skip_but_pass_in_sandcastle_if( + not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs" + ) @parametrize("ModelClass", [ExampleCode, MultiMLP]) def test_tracer(self, ModelClass): mod = ModelClass(d_hid, self.world_size) @@ -121,8 +123,10 @@ def _run_step(x): old_keys = mod.state_dict().keys() assert all(k in old_keys for k in submod_keys) - @requires_nccl() - @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") + @requires_accelerator_dist_backend(["nccl", "xccl"]) + @skip_but_pass_in_sandcastle_if( + not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs" + ) @parametrize("ModelClass", [ModelWithKwargs]) def test_tracer_kwargs(self, ModelClass): mod = ModelClass(d_hid, self.world_size) @@ -170,8 +174,10 @@ def test_tracer_kwargs(self, ModelClass): old_keys = mod.state_dict().keys() assert all(k in old_keys for k in submod_keys) - @requires_nccl() - @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") + @requires_accelerator_dist_backend(["nccl", "xccl"]) + @skip_but_pass_in_sandcastle_if( + not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs" + ) def test_manual(self): full_mod = MultiMLP(d_hid, n_layers=self.world_size) full_mod.to(self.device) @@ -202,8 +208,10 @@ def _run_step(x): ref_out = full_mod(x) torch.testing.assert_close(out, ref_out) - @requires_nccl() - @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") + @requires_accelerator_dist_backend(["nccl", "xccl"]) + @skip_but_pass_in_sandcastle_if( + not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs" + ) def test_custom_dw_with_fb_schedule(self): """Tests that separate weight grad function 'dw_runner' gets run under a schedule that's only aware of F/B.""" full_mod = MultiMLP(d_hid, n_layers=self.world_size) @@ -262,8 +270,10 @@ def _run_step(x): ref_out = full_mod(x) torch.testing.assert_close(out, ref_out) - @requires_nccl() - @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") + @requires_accelerator_dist_backend(["nccl", "xccl"]) + @skip_but_pass_in_sandcastle_if( + not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs" + ) def test_output_chunks_memory_usage(self): """Test that output_chunks doesn't store memory for non-first stages.""" full_mod = MultiMLP(d_hid, n_layers=self.world_size) @@ -347,15 +357,17 @@ def tearDown(self): def init_pg(self): store = dist.FileStore(self.file_name, self.world_size) dist.init_process_group( - backend="nccl", + backend=backend, store=store, rank=self.rank, world_size=self.world_size, device_id=self.device, ) - @requires_nccl() - @skip_but_pass_in_sandcastle("Flaky in CI") + @requires_accelerator_dist_backend(["nccl", "xccl"]) + @skip_but_pass_in_sandcastle_if( + not TEST_MULTIACCELERATOR, 
f"{backend} test requires 2+ GPUs" + ) def test_shape_prop_mismatch(self): """Tests shape prop errors are raised""" self.init_pg() @@ -402,8 +414,10 @@ def _run_step(x): with self.assertRaisesRegex(PipeliningShapeError, "dtype mismatch"): _run_step(x) - @requires_nccl() - @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") + @requires_accelerator_dist_backend(["nccl", "xccl"]) + @skip_but_pass_in_sandcastle_if( + not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs" + ) def test_custom_dw_errors(self): """Tests expected errors are raised""" self.init_pg() diff --git a/test/distributed/pipelining/test_transformer.py b/test/distributed/pipelining/test_transformer.py index 7e58129186a69..20e830547de7b 100644 --- a/test/distributed/pipelining/test_transformer.py +++ b/test/distributed/pipelining/test_transformer.py @@ -73,7 +73,9 @@ def get_layers(module): devices = ["cpu", "cuda", "hpu", "xpu"] -instantiate_device_type_tests(TransformerTests, globals(), only_for=devices) +instantiate_device_type_tests( + TransformerTests, globals(), only_for=devices, allow_xpu=True +) if __name__ == "__main__": run_tests() diff --git a/test/distributed/pipelining/test_unflatten.py b/test/distributed/pipelining/test_unflatten.py index ae1e684d7c222..0493f39b16cb8 100644 --- a/test/distributed/pipelining/test_unflatten.py +++ b/test/distributed/pipelining/test_unflatten.py @@ -73,7 +73,9 @@ def test_unflatten(self, device): devices = ["cpu", "cuda", "hpu", "xpu"] -instantiate_device_type_tests(UnflattenTests, globals(), only_for=devices) +instantiate_device_type_tests( + UnflattenTests, globals(), only_for=devices, allow_xpu=True +) if __name__ == "__main__": run_tests() diff --git a/test/distributed/tensor/parallel/test_tp_random_state.py b/test/distributed/tensor/parallel/test_tp_random_state.py index a12bf017932f2..0544022a84fce 100644 --- a/test/distributed/tensor/parallel/test_tp_random_state.py +++ b/test/distributed/tensor/parallel/test_tp_random_state.py @@ -66,7 +66,7 @@ def test_model_init(self): # in the following way: # - within a tensor parallel group, the RNG is set with the same seed # - across data parallel groups, the RNG is set with different seeds - torch.cuda.manual_seed(dp_rank) + torch.cuda.manual_seed(0) # disable/enable parallel RNG feature if random._rng_tracker: @@ -118,14 +118,10 @@ def tp_weights_assert(tensor1, tensor2): # compare local shards across TP groups def dp_weights_assert(tensor1, tensor2): - if enable_distribute_flag: - # local weights shall be initialized the same across TP groups - self.assertEqual(tensor1, tensor2) - else: - # without the parallel RNG, weight initialization violates the TP setup: - # local weights are initialized differently across TP groups due to different - # random seeds set in data loading. - self.assertNotEqual(tensor1, tensor2) + # local weights shall be initialized the same across TP groups, + # and it doesn't matter whether DTensor's RNG infra is activated since all spmd ranks + # started with the same seed. 
+ self.assertEqual(tensor1, tensor2) self.check_gathered_tensors( dp_rank, dp_size, tensor_gather, dp_weights_assert diff --git a/test/distributed/tensor/test_random_ops.py b/test/distributed/tensor/test_random_ops.py index 180286bd2e1da..ef63b3ac77c90 100644 --- a/test/distributed/tensor/test_random_ops.py +++ b/test/distributed/tensor/test_random_ops.py @@ -33,6 +33,11 @@ ) +def get_generator_seed_for_device_type(device_type: str) -> int: + device_module = torch.get_device_module(device_type) + return device_module.get_rng_state()[:8].view(torch.int64).item() + + class DistTensorRandomInitTest(DTensorTestBase): def _run_init_op(self, init_op, *args, **kwargs): device_mesh = self.build_device_mesh() @@ -105,30 +110,23 @@ def test_init_with_user_generator(self): torch.nn.init.uniform_(t2, 0.0, 1.0, rng) self.assertEqual(t1.full_tensor(), t2.full_tensor(), f"Failed at {i=}") - # ensure that we do not cache the 'seed' of `rng` from the first time we see it in DTensor - # TODO: we have a semantics decision to make - # There is a discontinuity between how the default RNG and a user-supplied RNG behaves with DTensor: - # (a) if the user calls `torch.manual_seed` after already using the default RNG with DTensor, - # they may be surprised that it has no effect on DTensor. They must instead call this private API - # (`torch.distributed.tensor._random._rng_tracker._manual_seed`) - # (b) If we try to match the semantics of (a) with a user-supplied RNG, they may be very surprised to find that - # their RNG object never advances its state after using it with DTensor. - # torch.distributed.tensor._random._rng_tracker._manual_seed(55) - # rng.manual_seed(55) - # torch.nn.init.uniform_(t1, 0.0, 1.0) - # torch.nn.init.uniform_(t2, 0.0, 1.0, rng) - # self.assertEqual(t1.full_tensor(), t2.full_tensor()) + # ensure that we do not cache the 'seed' from the first time we see it in DTensor + # this is a behavior change, DTensor used to cache the generator state and not modify the original generator, + # now it modifies the original generator instead. + torch.manual_seed(55) + rng.manual_seed(55) + torch.nn.init.uniform_(t1, 0.0, 1.0) + torch.nn.init.uniform_(t2, 0.0, 1.0, rng) + self.assertEqual(t1.full_tensor(), t2.full_tensor()) @with_comms @skip_if_lt_x_gpu(4) def test_meta_tensor_init(self): - # test suite sets each rank's seed to the same value but in actual - # execution the default random seed will be different (a random value). - # The DTensor random ops will use the same random seed even though the - # torch random generator keeps different seeds on ranks. This ensures - # that Replicate DTensor will have the same initialized results - # across ranks. - torch.cuda.manual_seed(self.rank) + # test suite sets each rank's seed to the same value. + # The DTensor random ops will use the same generator as the default one on the device. + + # Note: this behavior changed, and now the guideline is to set the same RNG seed on all SPMD ranks. 
+ torch.cuda.manual_seed(0) device_mesh = DeviceMesh(self.device_type, torch.arange(self.world_size)) size = [1024, 2048] meta_dtensor = distribute_tensor( @@ -147,7 +145,7 @@ def test_meta_tensor_init(self): self.assertTrue(random._rng_tracker.distribute_region_enabled) # allgather the local tensors - local_tensor = funcol.all_gather_tensor( + gathered_local_tensors = funcol.all_gather_tensor( dtensor.to_local(), gather_dim=0, group=(device_mesh, 0) ) @@ -158,7 +156,8 @@ def test_meta_tensor_init(self): # other rank should have an identical local tensor other_slice = slice(1024 * other_rank, 1024 * other_rank + 1024) self.assertEqual( - local_tensor[self_slice, :], local_tensor[other_slice, :] + gathered_local_tensors[self_slice, :], + gathered_local_tensors[other_slice, :], ) # Test 2: disable the distribute region for RNG @@ -177,11 +176,11 @@ def test_meta_tensor_init(self): # compare with local tensors from other ranks for other_rank in range(self.world_size): - # the RNG result on each rank differs even they're supposed - # to be replicated + # the RNG result on each rank are the same even without the help of DTensor's RNG infra, + # since the default RNG is the same across ranks. if self.rank != other_rank: other_slice = slice(1024 * other_rank, 1024 * other_rank + 1024) - self.assertNotEqual( + self.assertEqual( local_tensor[self_slice, :], local_tensor[other_slice, :] ) @@ -307,7 +306,12 @@ def test_rng_tracker_init(self): # seed synchronization only happens after `manual_seed` or the first DTensor # random op call dt.uniform_(0, 1) - self.assertEqual(seed_from_rank_0, random._rng_tracker.get_seed("parallel-rng")) + + # We do not maintain the copy of the seed in dtensor, but we do mutate the global rng state + # since we now always pull it fresh from the local device generator + self.assertEqual( + seed_from_rank_0, get_generator_seed_for_device_type(self.device_type) + ) @with_comms @skip_unless_torch_gpu @@ -326,11 +330,13 @@ def test_manual_seed(self): manual_seed(self.rank, device_mesh) # RNG tracker should already be initialized self.assertTrue(random._rng_tracker is not None) - self.assertEqual(self.rank, random._rng_tracker.get_seed("parallel-rng")) + self.assertEqual( + self.rank, get_generator_seed_for_device_type(self.device_type) + ) # Test 2: set same seed on different ranks manual_seed(1234, device_mesh) - self.assertEqual(1234, random._rng_tracker.get_seed("parallel-rng")) + self.assertEqual(1234, get_generator_seed_for_device_type(self.device_type)) self.assertEqual(comm_mode.get_total_counts(), 0) @@ -363,7 +369,10 @@ def test_pipeline_parallel_manual_seed(self): # set the seed for each pipeline stage to 123 + pp_rank manual_seed(123 + pp_rank, spmd_mesh) - self.assertEqual(123 + pp_rank, random._rng_tracker.get_seed("parallel-rng")) + # dtensor no longer stores a copy of the seed, but it mutates the device's generator so we can check that + self.assertEqual( + 123 + pp_rank, get_generator_seed_for_device_type(self.device_type) + ) # mimic initializing a model weight sharded on the SPMD mesh spmd_dtensor = torch.distributed.tensor.ones( @@ -448,14 +457,15 @@ def test_deterministic_rand_1d(self): self_slice = slice(4 * self.rank, 4 * self.rank + 4) for other_rank in range(self.world_size): if self.rank != other_rank: - # other rank should have an identical local tensor + # other rank should have a different local tensor for shard placement other_slice = slice(4 * other_rank, 4 * other_rank + 4) self.assertNotEqual( local_tensor[self_slice, :], 
local_tensor[other_slice, :], ) - torch.manual_seed(self.rank) + # we should set manual seed to the same value on all SPMD ranks + torch.manual_seed(0) dtensor = fn(size, device_mesh=device_mesh, placements=[Replicate()]) local_tensor = funcol.all_gather_tensor( dtensor.to_local(), gather_dim=0, group=(device_mesh, 0) @@ -465,7 +475,7 @@ def test_deterministic_rand_1d(self): self_slice = slice(4 * self.rank, 4 * self.rank + 4) for other_rank in range(self.world_size): if self.rank != other_rank: - # other rank should have an identical local tensor + # other rank should have an identical local tensor for replicate placement other_slice = slice(4 * other_rank, 4 * other_rank + 4) self.assertEqual( local_tensor[self_slice, :], diff --git a/test/distributed/test_c10d_gloo.py b/test/distributed/test_c10d_gloo.py index 95bc8b5345239..ff0dac4fcc0e7 100644 --- a/test/distributed/test_c10d_gloo.py +++ b/test/distributed/test_c10d_gloo.py @@ -2449,7 +2449,7 @@ def tearDown(self) -> None: def _verify_trace(self, t, is_json): ver = t["version"] - self.assertEqual(ver, "2.9") + self.assertEqual(ver, "2.10") pg_config = t["pg_config"] self.assertEqual(len(pg_config), 1) default_pg_info = pg_config["0"] diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py index 0e0a98c120ded..2f6a71c927933 100644 --- a/test/distributed/test_c10d_nccl.py +++ b/test/distributed/test_c10d_nccl.py @@ -3099,7 +3099,7 @@ def test_invalid_nccl_blocking_wait_env(self): self._run_invalid_nccl_blocking_wait_env("4294967295") -class NcclRegistrationTest(MultiProcessTestCase): +class NcclUserBufferRegistrationTest(MultiProcessTestCase): def setUp(self): super().setUp() # TORCH_NCCL_BLOCKING_WAIT overrides TORCH_NCCL_ASYNC_ERROR_HANDLING hence tests @@ -3191,7 +3191,7 @@ def test_nccl_window_registration(self): # Use NCCL memory allocator # enable symmetric memory usage in NCCL - pool = torch.cuda.MemPool(backend.mem_allocator, symmetric=True) + pool = torch.cuda.MemPool(backend.mem_allocator) # allocate memory with ncclMemAlloc # note: symmetric kernels are not available for dtypes like torch.int64 @@ -3201,10 +3201,16 @@ def test_nccl_window_registration(self): ) # register buffers to NCCL - backend.register_mem_pool(pool) + backend.register_mem_pool(pool, symm=True) # allreduce now should use NVIDIA Switches pg.allreduce(tensor).wait() + # check that further allocations are also registered + with torch.cuda.use_mem_pool(pool): + tensor = torch.arange( + 1024 * 1024 * 2, device=device, dtype=torch.float32 + ) + pg.allreduce(tensor).wait() torch.cuda.synchronize(device=device) # de-register buffers from NCCL @@ -3217,7 +3223,7 @@ def test_nccl_window_registration(self): nccl_debug_file_content = f.read() # if buffers were registered and symmetric kernels ran, NCCL_DEBUG # should show successful registration in debug output - self.assertRegex(nccl_debug_file_content, "[Symmetric]") + self.assertRegex(nccl_debug_file_content, "Symmetric") class CommTest(test_c10d_common.AbstractCommTest, MultiProcessTestCase): @@ -4361,10 +4367,12 @@ def started_or_scheduled(self, timing_enabled): class NCCLTraceTest(NCCLTraceTestBase): def _verify_trace(self, t, include_collectives, timing_enabled, is_json): ver = t["version"] - self.assertEqual(ver, "2.9") - nccl_version = t["nccl_version"] - torch_nccl_version = torch.cuda.nccl.version() - self.assertEqual(nccl_version, ".".join(str(v) for v in torch_nccl_version)) + self.assertEqual(ver, "2.10") + comm_lib_version = t["comm_lib_version"] + torch_comm_lib_version = 
torch.cuda.nccl.version() + self.assertEqual( + comm_lib_version, ".".join(str(v) for v in torch_comm_lib_version) + ) pg_config = t["pg_config"] self.assertEqual(len(pg_config), 1) default_pg_info = pg_config["0"] diff --git a/test/distributed/test_inductor_collectives.py b/test/distributed/test_inductor_collectives.py index 8073b36f9ca3f..656c03aa6cfd6 100644 --- a/test/distributed/test_inductor_collectives.py +++ b/test/distributed/test_inductor_collectives.py @@ -1642,7 +1642,7 @@ def func(x, w, ag_0, ag_1, *, tag, ranks, group_size): @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") @unittest.skipIf(not SM80OrLater, "bfloat16") def test_reduce_scatter_bucket(self): - def func(x, w, rs_0, rs_1, *, tag, ranks, group_size): + def func(x, w, rs_0, rs_1, tag, ranks, group_size): # do some unrelated matmuls y = torch.mm(x, w) @@ -1667,35 +1667,44 @@ def func(x, w, rs_0, rs_1, *, tag, ranks, group_size): return y, rs_0_out, rs_1_out - x = torch.ones(4, 384, device="cuda", dtype=torch.float32) - w = torch.ones(384, 512, device="cuda", dtype=torch.float32) - rs_0 = torch.ones(384, 512, device="cuda", dtype=torch.float32) - rs_1 = torch.ones(384, 256, device="cuda", dtype=torch.float32) - inputs = [x, w, rs_0, rs_1] - func(*inputs, **self.get_world_trs()) - - with torch._inductor.config.patch( - { - "bucket_reduce_scatters_fx": "all", - "reorder_for_compute_comm_overlap": False, - } - ): - compiled = torch.compile(func) - code = run_and_get_triton_code(compiled, *inputs, **self.get_world_trs()) - # NOTE: The first return value should be the output of the first wait_tensor. - # We want to make sure no unnecessary copy is made. - ( - FileCheck() - .check_count( - "torch.ops._c10d_functional.reduce_scatter_tensor.default(", - count=1, - exactly=True, + # test "fsdp" mode to allow convert_element_type after wait + def func2(x, w, rs_0, rs_1, tag, ranks, group_size): + y, rs_0_out, rs_1_out = func(x, w, rs_0, rs_1, tag, ranks, group_size) + return y, rs_0_out.to(torch.float32), rs_1_out.to(torch.float32) + + for f in [func, func2]: + x = torch.ones(4, 384, device="cuda", dtype=torch.float32) + w = torch.ones(384, 512, device="cuda", dtype=torch.float32) + rs_0 = torch.ones(384, 512, device="cuda", dtype=torch.float32) + rs_1 = torch.ones(384, 256, device="cuda", dtype=torch.float32) + inputs = [x, w, rs_0, rs_1] + f(*inputs, **self.get_world_trs()) + + with torch._inductor.config.patch( + { + "bucket_reduce_scatters_fx": "fsdp", + "reorder_for_compute_comm_overlap": False, + } + ): + compiled = torch.compile(f) + compiled(*inputs, **self.get_world_trs()) + code = run_and_get_triton_code( + compiled, *inputs, **self.get_world_trs() + ) + # NOTE: The first return value should be the output of the first wait_tensor. + # We want to make sure no unnecessary copy is made. 
+ ( + FileCheck() + .check_count( + "torch.ops._c10d_functional.reduce_scatter_tensor.default(", + count=1, + exactly=True, + ) + .run(code) ) - .run(code) - ) - out = compiled(*inputs, **self.get_world_trs()) - correct = func(*inputs, **self.get_world_trs()) - assert same(out, correct), f"{out} va {correct}" + out = compiled(*inputs, **self.get_world_trs()) + correct = f(*inputs, **self.get_world_trs()) + assert same(out, correct), f"{out} va {correct}" @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") @unittest.skipIf(not SM80OrLater, "bfloat16") diff --git a/test/distributed/test_nvshmem.py b/test/distributed/test_nvshmem.py index bfc6ed1b65a02..64b8062b6098f 100644 --- a/test/distributed/test_nvshmem.py +++ b/test/distributed/test_nvshmem.py @@ -65,48 +65,6 @@ def foo(): out = symm_mem.empty(numel, dtype=dtype, device=self.device) symm_mem.rendezvous(out, group=group_name) - @skipIfRocm - def test_rendezvous_slice(self) -> None: - # Rendezvous a slice of a tensor - self._init_device() - group_name = dist.group.WORLD.group_name - symm_mem.enable_symm_mem_for_group(group_name) - - x = symm_mem.empty((2, 1024), device=self.device) - # Directly rendezvousing a slice should not fail - hdls = [symm_mem.rendezvous(y, group=group_name) for y in torch.chunk(x, 2)] - # Assert that handles are not the same - self.assertIsNot(hdls[0], hdls[1]) - - @skipIfRocm - def test_rendezvous_view(self) -> None: - # Rendezvous a view of a tensor - self._init_device() - group_name = dist.group.WORLD.group_name - symm_mem.enable_symm_mem_for_group(group_name) - - x = symm_mem.empty(1024, device=self.device) - y = x.view(32, 32) - # Directly rendezvousing a view should not fail - hdl_y = symm_mem.rendezvous(y, group=group_name) - - # Assert that view's handle is not the same as the original tensor's handle - hdl_x = symm_mem.rendezvous(x, group=group_name) - self.assertIsNot(hdl_x, hdl_y) - - @skipIfRocm - def test_rendezvous_same(self) -> None: - # Rendezvous same tensor multiple times - self._init_device() - group_name = dist.group.WORLD.group_name - symm_mem.enable_symm_mem_for_group(group_name) - - x = symm_mem.empty(1024, device=self.device) - hdl_0 = symm_mem.rendezvous(x, group=group_name) - hdl_1 = symm_mem.rendezvous(x, group=group_name) - # The handle should point to the same object - self.assertIs(hdl_0, hdl_1) - @skipIfRocm def test_nvshmem_put(self) -> None: self._init_device() diff --git a/test/distributed/test_symmetric_memory.py b/test/distributed/test_symmetric_memory.py index 8327a5611ef4c..c5e0c8315b66c 100644 --- a/test/distributed/test_symmetric_memory.py +++ b/test/distributed/test_symmetric_memory.py @@ -24,7 +24,7 @@ from torch.testing._internal.common_cuda import _get_torch_cuda_version, SM90OrLater from torch.testing._internal.common_device_type import e4m3_type from torch.testing._internal.common_distributed import ( - MultiProcContinuousTest, + MultiProcContinousTest, MultiProcessTestCase, requires_multicast_support, skip_if_lt_x_gpu, @@ -46,20 +46,20 @@ test_contexts = [nullcontext, _test_mode] # So that tests are written in device-agnostic way -device_type = "cuda" +device_type = "xpu" device_module = torch.get_device_module(device_type) @instantiate_parametrized_tests -@requires_cuda_p2p_access() -class SymmetricMemoryTest(MultiProcContinuousTest): +# @requires_cuda_p2p_access() +class SymmetricMemoryTest(MultiProcContinousTest): @property def device(self) -> torch.device: return torch.device(device_type, self.rank) def _init_process(self, set_device: 
bool = True): if set_device: - torch.cuda.set_device(self.device) + torch.xpu.set_device(self.device) torch.manual_seed(42 + self.rank) def test_has_multicast_support(self) -> None: @@ -67,12 +67,12 @@ def test_has_multicast_support(self) -> None: self.assertFalse(_SymmetricMemory.has_multicast_support(DeviceType.CPU, 0)) # NOTE: DeviceType.CUDA is implicitly tested through @requires_multicast_support - @skipIfRocm - @skip_if_lt_x_gpu(2) + # @skipIfRocm + # @skip_if_lt_x_gpu(2) def test_get_backend(self) -> None: - backend = symm_mem.get_backend(torch.device("cuda")) + backend = symm_mem.get_backend(torch.device("xpu")) self.assertIsNotNone(backend) - backend = symm_mem.get_backend("cuda") + backend = symm_mem.get_backend("xpu") self.assertIsNotNone(backend) @skipIfRocm @@ -83,13 +83,13 @@ def test_cuda_nvlink_connectivity_detection(self) -> None: connectivity = _detect_dma_connectivity(DeviceType.CUDA, "nvlink") self.assertEqual(connectivity.device_type, DeviceType.CUDA) self.assertEqual(connectivity.connection_type, "nvlink") - self.assertEqual(len(connectivity.matrix), torch.cuda.device_count()) + self.assertEqual(len(connectivity.matrix), torch.xpu.device_count()) for row in connectivity.matrix: - self.assertEqual(len(row), torch.cuda.device_count()) + self.assertEqual(len(row), torch.xpu.device_count()) - @runOnRocmArch(MI300_ARCH) + # @runOnRocmArch(MI300_ARCH) def test_large_alloc(self) -> None: - t = symm_mem.empty(2 * 1024**3, dtype=torch.uint8, device="cuda") + t = symm_mem.empty(2 * 1024**3, dtype=torch.uint8, device="xpu") self.assertEqual(t.numel() * t.element_size(), 2 * 1024**3) def _get_test_alloc_args(self): @@ -127,8 +127,8 @@ def _verify_symmetric_memory(self, symm_mem_hdl): symm_mem_hdl.barrier() - @runOnRocmArch(MI300_ARCH) - @skip_if_lt_x_gpu(2) + # @runOnRocmArch(MI300_ARCH) + # @skip_if_lt_x_gpu(2) @parametrize("set_device", [True, False]) def test_empty_strided_p2p(self, set_device: bool) -> None: self._init_process(set_device) @@ -145,8 +145,8 @@ def test_empty_strided_p2p(self, set_device: bool) -> None: del t self._verify_symmetric_memory(symm_mem_hdl) - @skipIfRocm # started failing during ROCm 6.4 CI upgrade - @skip_if_lt_x_gpu(2) + # @skipIfRocm # started failing during ROCm 6.4 CI upgrade + # @skip_if_lt_x_gpu(2) @parametrize("set_device", [True, False]) def test_empty_strided_p2p_persistent(self, set_device: bool) -> None: self._init_process(set_device) @@ -172,12 +172,12 @@ def test_empty_strided_p2p_persistent(self, set_device: bool) -> None: symm_mem_hdl = _SymmetricMemory.rendezvous(t) self._verify_symmetric_memory(symm_mem_hdl) - @runOnRocmArch(MI300_ARCH) - @skip_if_lt_x_gpu(2) + # @runOnRocmArch(MI300_ARCH) + # @skip_if_lt_x_gpu(2) def test_get_signal_pad(self) -> None: self._init_process() - t = symm_mem.empty(1, device="cuda") + t = symm_mem.empty(1, device="xpu") symm_mem_hdl = symm_mem.rendezvous(t, group=dist.group.WORLD) peer_rank = (self.rank + 1) % self.world_size @@ -206,18 +206,18 @@ def test_get_signal_pad(self) -> None: self.assertEqual(signal_pad.numel(), 64) # Sanity check that writes to buffer doesn't corrupt signal_pad - t = symm_mem.empty(0, device="cuda") + t = symm_mem.empty(0, device="xpu") symm_mem_hdl = symm_mem.rendezvous(t, group=dist.group.WORLD) signal_pad = symm_mem_hdl.get_signal_pad(self.rank) signal_pad.fill_(42) t.fill_(0) self.assertTrue(signal_pad.eq(42).all()) - @runOnRocmArch(MI300_ARCH) - @requires_cuda + # @runOnRocmArch(MI300_ARCH) + # @requires_cuda def test_allow_overlapping_devices(self) -> None: 
os.environ["TORCH_SYMM_MEM_ALLOW_OVERLAPPING_DEVICES"] = "1" - t = symm_mem.empty(64, device="cuda:0") + t = symm_mem.empty(64, device="xpu:0") symm_mem_hdl = symm_mem.rendezvous(t, group=dist.group.WORLD) self.assertEqual(symm_mem_hdl.rank, self.rank) @@ -232,8 +232,8 @@ def test_allow_overlapping_devices(self) -> None: os.environ["TORCH_SYMM_MEM_ALLOW_OVERLAPPING_DEVICES"] = "0" - @runOnRocmArch(MI300_ARCH) - @skip_if_lt_x_gpu(2) + # @runOnRocmArch(MI300_ARCH) + # @skip_if_lt_x_gpu(2) @parametrize("gather_dim", [0, 1]) def test_fused_all_gather_matmul(self, gather_dim: int) -> None: self._init_process() @@ -246,8 +246,8 @@ def test_fused_all_gather_matmul(self, gather_dim: int) -> None: rank = self.rank torch.manual_seed(42 + rank) - A_shard = torch.rand(BATCH, M // self.world_size, K, device="cuda") - Bs = [torch.rand(K, N, device="cuda") for _ in range(3)] + A_shard = torch.rand(BATCH, M // self.world_size, K, device="xpu") + Bs = [torch.rand(K, N, device="xpu") for _ in range(3)] ag_output_0, mm_outputs_0 = _fused_all_gather_matmul_fallback( A_shard, Bs, gather_dim=gather_dim, group_name=group.group_name @@ -262,12 +262,12 @@ def test_fused_all_gather_matmul(self, gather_dim: int) -> None: assert torch.allclose(mm_output_0, mm_output_1) assert mm_output_0.stride(), mm_output_1.stride() - @skipIfRocm # this requires async_input_mm support - @skipIf( - not SM90OrLater, - "_fused_all_gather_matmul_native currently only supports sm>=90", - ) - @skip_if_lt_x_gpu(2) + # @skipIfRocm # this requires async_input_mm support + # @skipIf( + # not SM90OrLater, + # "_fused_all_gather_matmul_native currently only supports sm>=90", + # ) + # @skip_if_lt_x_gpu(2) @parametrize("symm_mem_input", [True, False]) @parametrize("is_b_row_major", [True, False]) def test_fused_all_gather_matmul_native( @@ -293,20 +293,20 @@ def test_fused_all_gather_matmul_native( ).normal_() else: A_shard = torch.rand( - M // self.world_size, K, dtype=torch.bfloat16, device="cuda" + M // self.world_size, K, dtype=torch.bfloat16, device="xpu" ) if is_b_row_major: - B = torch.rand(K, N, dtype=torch.bfloat16, device="cuda") + B = torch.rand(K, N, dtype=torch.bfloat16, device="xpu") else: - B = torch.rand(N, K, dtype=torch.bfloat16, device="cuda").t() + B = torch.rand(N, K, dtype=torch.bfloat16, device="xpu").t() ag_baseline, mm_baseline = _fused_all_gather_matmul_fallback( A_shard, [B], gather_dim=0, group_name=group_name ) with torch.profiler.profile( activities=[ - torch.profiler.ProfilerActivity.CUDA, + torch.profiler.ProfilerActivity.XPU, ], ) as prof: ag_target, mm_target = torch.ops.symm_mem.fused_all_gather_matmul( @@ -335,10 +335,10 @@ def test_multimem_all_gather_matmul(self) -> None: torch.manual_seed(42 + self.rank) A_shard = torch.rand( - M // self.world_size, K, dtype=torch.bfloat16, device="cuda" + M // self.world_size, K, dtype=torch.bfloat16, device="xpu" ) - B = torch.rand(K, N, dtype=torch.bfloat16, device="cuda") + B = torch.rand(K, N, dtype=torch.bfloat16, device="xpu") ag_baseline, mm_baseline = _fused_all_gather_matmul_fallback( A_shard, [B], gather_dim=0, group_name=group_name, return_A=False @@ -359,8 +359,8 @@ def test_multimem_all_gather_matmul(self) -> None: torch.testing.assert_close(ag_target, ag_baseline) torch.testing.assert_close(mm_target[0], mm_baseline[0]) - @runOnRocmArch(MI300_ARCH) - @skip_if_lt_x_gpu(2) + # @runOnRocmArch(MI300_ARCH) + # @skip_if_lt_x_gpu(2) @parametrize("gather_dim", [0, 1]) @parametrize( "scale_mode", ["tensor-wise", "row-wise-replicated", "row-wise-sharded"] @@ -386,20 
+386,20 @@ def test_fused_all_gather_scaled_matmul( torch.manual_seed(42 + rank) - A_shard = torch.rand(*leading_dims, K, device="cuda").to(e4m3_type) - Bs = [torch.rand(N, K, device="cuda").to(e4m3_type).T for _ in range(3)] + A_shard = torch.rand(*leading_dims, K, device="xpu").to(e4m3_type) + Bs = [torch.rand(N, K, device="xpu").to(e4m3_type).T for _ in range(3)] if scale_mode == "tensor-wise": - A_scale = torch.tensor(0.1, device="cuda") - B_scales = [torch.tensor(0.1, device="cuda") for _ in range(3)] + A_scale = torch.tensor(0.1, device="xpu") + B_scales = [torch.tensor(0.1, device="xpu") for _ in range(3)] out_dtypes = [None, torch.bfloat16, torch.float32] elif scale_mode == "row-wise-sharded": - A_scale = torch.full((*leading_dims, 1), 0.1, device="cuda") - B_scales = [torch.full((1, N), 0.1, device="cuda") for _ in range(3)] + A_scale = torch.full((*leading_dims, 1), 0.1, device="xpu") + B_scales = [torch.full((1, N), 0.1, device="xpu") for _ in range(3)] out_dtypes = [torch.bfloat16] * 3 elif scale_mode == "row-wise-replicated": - A_scale = torch.full((BATCH, M, 1), 0.1, device="cuda") - B_scales = [torch.full((1, N), 0.1, device="cuda") for _ in range(3)] + A_scale = torch.full((BATCH, M, 1), 0.1, device="xpu") + B_scales = [torch.full((1, N), 0.1, device="xpu") for _ in range(3)] out_dtypes = [torch.bfloat16] * 3 else: raise AssertionError(f"Invalid scale_mode: {scale_mode}") @@ -445,7 +445,7 @@ def test_fused_all_gather_scaled_matmul( self.assertEqual(mm_output_0.stride(), mm_output_1.stride()) self.assertEqual(mm_output_0.dtype, mm_output_1.dtype) - @runOnRocmArch(MI300_ARCH) + # @runOnRocmArch(MI300_ARCH) @skip_if_lt_x_gpu(2) @parametrize("scatter_dim", [0, 1]) def test_fused_matmul_reduce_scatter(self, scatter_dim: int) -> None: @@ -459,8 +459,8 @@ def test_fused_matmul_reduce_scatter(self, scatter_dim: int) -> None: rank = self.rank torch.manual_seed(42 + rank) - A = torch.rand(BATCH, M, K, device="cuda") - B = torch.rand(K, N, device="cuda") + A = torch.rand(BATCH, M, K, device="xpu") + B = torch.rand(K, N, device="xpu") output_0 = _fused_matmul_reduce_scatter_fallback( A, B, "avg", scatter_dim=scatter_dim, group_name=group.group_name @@ -472,7 +472,7 @@ def test_fused_matmul_reduce_scatter(self, scatter_dim: int) -> None: assert torch.allclose(output_0, output_1) assert output_0.stride() == output_1.stride() - @skipIfRocm # AsyncTP support changed _fused_scaled_matmul_reduce_scatter_fallback API, need more changes + # @skipIfRocm # AsyncTP support changed _fused_scaled_matmul_reduce_scatter_fallback API, need more changes @skip_if_lt_x_gpu(2) @parametrize("scatter_dim", [0, 1]) @parametrize("rowwise", [True, False]) @@ -489,15 +489,15 @@ def test_fused_scaled_matmul_reduce_scatter( rank = self.rank torch.manual_seed(42 + rank) - A = torch.rand(BATCH, M, K, device="cuda").to(e4m3_type) - B = torch.rand(N, K, device="cuda").to(e4m3_type).T + A = torch.rand(BATCH, M, K, device="xpu").to(e4m3_type) + B = torch.rand(N, K, device="xpu").to(e4m3_type).T if rowwise: - A_scale = torch.full((BATCH, M, 1), 0.1, device="cuda") - B_scale = torch.full((1, N), 0.1, device="cuda") + A_scale = torch.full((BATCH, M, 1), 0.1, device="xpu") + B_scale = torch.full((1, N), 0.1, device="xpu") else: - A_scale = torch.tensor(0.1, device="cuda") - B_scale = torch.tensor(0.1, device="cuda") + A_scale = torch.tensor(0.1, device="xpu") + B_scale = torch.tensor(0.1, device="xpu") output_shape = [*A.shape[:-1], B.shape[1]] @@ -522,7 +522,7 @@ def test_fused_scaled_matmul_reduce_scatter( assert 
outputs[0].stride() == outputs[1].stride() assert torch.allclose(outputs[0], outputs[1]), (outputs[0], outputs[1]) - @runOnRocmArch(MI300_ARCH) + # @runOnRocmArch(MI300_ARCH) @parametrize("dim", [0, 1, 2]) def test_optimal_layout(self, dim: int) -> None: t = torch.rand(8, 64, 32, 16) @@ -535,8 +535,8 @@ def test_optimal_layout(self, dim: int) -> None: self.assertTrue(x.movedim(dim, 0).is_contiguous()) self.assertTrue(torch.allclose(x, t)) - @runOnRocmArch(MI300_ARCH) - @skip_if_lt_x_gpu(2) + # @runOnRocmArch(MI300_ARCH) + # @skip_if_lt_x_gpu(2) @parametrize("symm_mem_input", [True, False]) def test_low_contention_all_gather(self, symm_mem_input: bool) -> None: self._init_process() @@ -560,7 +560,7 @@ def test_low_contention_all_gather(self, symm_mem_input: bool) -> None: for r in range(self.world_size): self.assertTrue(chunks[r].eq(r).all()) - @runOnRocmArch(MI300_ARCH) + # @runOnRocmArch(MI300_ARCH) @skip_if_lt_x_gpu(2) @parametrize("reduce_op", ["sum", "avg"]) @parametrize("symm_mem_input", [True, False]) @@ -608,7 +608,7 @@ def test_subgroup(self) -> None: world = dist.group.WORLD subgroup = subgroup_0 if world.rank() < world.size() // 2 else subgroup_1 - t = symm_mem.empty(64, device="cuda") + t = symm_mem.empty(64, device="xpu") symm_mem_world = symm_mem.rendezvous(t, group=world) symm_mem_subgroup = symm_mem.rendezvous(t, group=subgroup) @@ -636,8 +636,8 @@ def test_subgroup(self) -> None: # This Test class is used to test the error handling of SymmetricMemory APIs. # Since a process restart is often needed after each test, we use the -# MultiProcessTestCase instead of MultiProcContinuousTest. -@requires_cuda_p2p_access() +# MultiProcessTestCase instead of MultiProcContinousTest. +# @requires_cuda_p2p_access() class SymmMemNegativeTest(MultiProcessTestCase): def setUp(self) -> None: super().setUp() @@ -652,10 +652,10 @@ def device(self) -> torch.device: return torch.device(device_type, self.rank) def _init_process(self): - torch.cuda.set_device(self.device) + torch.xpu.set_device(self.device) store = dist.FileStore(self.file_name, self.world_size) dist.init_process_group( - backend="nccl", + backend="xccl", world_size=self.world_size, rank=self.rank, store=store, @@ -672,15 +672,15 @@ def _init_process(self): def test_barrier_timeout(self) -> None: self._init_process() - t = symm_mem.empty(1, device="cuda") + t = symm_mem.empty(1, device="xpu") symm_mem_hdl = symm_mem.rendezvous(t, group=dist.group.WORLD) if self.rank == 0: with self.assertRaises(RuntimeError): symm_mem_hdl.barrier(timeout_ms=1000) - torch.cuda.synchronize() + torch.xpu.synchronize() else: - torch.cuda.synchronize() + torch.xpu.synchronize() # The device-side timeout triggers a __trap() that causes all # subsequent host/device interactions to result in an "unspecified @@ -698,7 +698,7 @@ def test_barrier_timeout(self) -> None: def test_put_signal_timeout(self) -> None: self._init_process() - t = symm_mem.empty(1, device="cuda") + t = symm_mem.empty(1, device="xpu") symm_mem_hdl = symm_mem.rendezvous(t, group=dist.group.WORLD) if self.rank == 0: @@ -707,9 +707,9 @@ def test_put_signal_timeout(self) -> None: # doesn't wait on this signal, the subsequent put will timeout. 
symm_mem_hdl.put_signal(dst_rank=1) symm_mem_hdl.put_signal(dst_rank=1, timeout_ms=1000) - torch.cuda.synchronize() + torch.xpu.synchronize() else: - torch.cuda.synchronize() + torch.xpu.synchronize() # The device-side timeout triggers a __trap() that causes all # subsequent host/device interactions to result in an "unspecified @@ -727,15 +727,15 @@ def test_put_signal_timeout(self) -> None: def test_wait_signal_timeout(self) -> None: self._init_process() - t = symm_mem.empty(1, device="cuda") + t = symm_mem.empty(1, device="xpu") symm_mem_hdl = symm_mem.rendezvous(t, group=dist.group.WORLD) if self.rank == 0: with self.assertRaises(RuntimeError): symm_mem_hdl.wait_signal(src_rank=1, timeout_ms=1000) - torch.cuda.synchronize() + torch.xpu.synchronize() else: - torch.cuda.synchronize() + torch.xpu.synchronize() # The device-side timeout triggers a __trap() that causes all # subsequent host/device interactions to result in an "unspecified @@ -745,14 +745,14 @@ def test_wait_signal_timeout(self) -> None: @instantiate_parametrized_tests -@requires_cuda_p2p_access() -class SymmMemCollectiveTest(MultiProcContinuousTest): +# @requires_cuda_p2p_access() +class SymmMemCollectiveTest(MultiProcContinousTest): @property def device(self) -> torch.device: return torch.device(device_type, self.rank) def _init_process(self): - torch.cuda.set_device(self.device) + torch.xpu.set_device(self.device) torch.manual_seed(42 + self.rank) @skip_if_lt_x_gpu(4) @@ -992,10 +992,10 @@ def test_multimem_all_gather(self, align_bytes: int) -> None: @instantiate_parametrized_tests -@requires_cuda_p2p_access() -class LoweringTest(MultiProcContinuousTest): +# @requires_cuda_p2p_access() +class LoweringTest(MultiProcContinousTest): def _init_process(self) -> None: - torch.cuda.set_device(self.device) + torch.xpu.set_device(self.device) enable_symm_mem_for_group(dist.group.WORLD.group_name) torch.manual_seed(42 + self.rank) torch._inductor.config._collective.auto_select = True @@ -1060,15 +1060,15 @@ def func_3(x): class SymmMemSingleProcTest(TestCase): - @requires_cuda - @skipIf( - not TEST_WITH_ROCM and _get_torch_cuda_version() < (12, 0), - "stream_write_value32 currently only supports cuda version>=12.0", - ) - @runOnRocmArch(MI300_ARCH) + # @requires_cuda + # @skipIf( + # not TEST_WITH_ROCM and _get_torch_cuda_version() < (12, 0), + # "stream_write_value32 currently only supports xpu version>=12.0", + # ) + # @runOnRocmArch(MI300_ARCH) def test_stream_write_value32(self): - tensor = torch.zeros(4, dtype=torch.uint32, device="cuda") - expect = torch.tril(torch.ones(4, 4, device="cuda")).to(torch.uint32) + tensor = torch.zeros(4, dtype=torch.uint32, device="xpu") + expect = torch.tril(torch.ones(4, 4, device="xpu")).to(torch.uint32) for i in range(4): _SymmetricMemory.stream_write_value32(tensor, i, 1) @@ -1080,14 +1080,14 @@ def test_stream_write_value32(self): with self.assertRaises(RuntimeError): _SymmetricMemory.stream_write_value32(tensor, offset=0, val=4294967296) - @requires_cuda + # @requires_cuda @runOnRocmArch(MI300_ARCH) def test_memset32(self): t = _SymmetricMemory.empty_strided_p2p( (64,), (1,), dtype=torch.uint32, - device=torch.device("cuda:0"), + device=torch.device("xpu:0"), group_name="0", ).fill_(0) diff --git a/test/dynamo/cpython/3_13/test_complex.diff b/test/dynamo/cpython/3_13/test_complex.diff index feca8fcc9b049..063b9131056e3 100644 --- a/test/dynamo/cpython/3_13/test_complex.diff +++ b/test/dynamo/cpython/3_13/test_complex.diff @@ -1,8 +1,8 @@ diff --git a/test/dynamo/cpython/3_13/test_complex.py 
b/test/dynamo/cpython/3_13/test_complex.py -index 6ff1a8ab29d..01295e03efc 100644 +index 6ff1a8ab29d..1572433c5ae 100644 --- a/test/dynamo/cpython/3_13/test_complex.py +++ b/test/dynamo/cpython/3_13/test_complex.py -@@ -1,16 +1,146 @@ +@@ -1,16 +1,147 @@ +# ======= BEGIN Dynamo patch ======= +# Owner(s): ["module: dynamo"] + @@ -19,6 +19,7 @@ index 6ff1a8ab29d..01295e03efc 100644 +from torch._dynamo.test_case import CPythonTestCase +from torch.testing._internal.common_utils import ( + run_tests, ++ slowTest, + xfailIfTorchDynamo, +) + @@ -154,7 +155,7 @@ index 6ff1a8ab29d..01295e03efc 100644 INF = float("inf") NAN = float("nan") DBL_MAX = sys.float_info.max -@@ -45,7 +175,40 @@ class WithComplex: +@@ -45,7 +176,40 @@ class WithComplex: def __complex__(self): return self.value @@ -196,7 +197,7 @@ index 6ff1a8ab29d..01295e03efc 100644 def assertAlmostEqual(self, a, b): if isinstance(a, complex): -@@ -74,6 +237,29 @@ class ComplexTest(ComplexesAreIdenticalMixin, unittest.TestCase): +@@ -74,6 +238,29 @@ class ComplexTest(ComplexesAreIdenticalMixin, unittest.TestCase): # check that relative difference < eps self.assertTrue(abs((x-y)/y) < eps) @@ -226,7 +227,27 @@ index 6ff1a8ab29d..01295e03efc 100644 def assertClose(self, x, y, eps=1e-9): """Return true iff complexes x and y "are close".""" self.assertCloseAbs(x.real, y.real, eps) -@@ -431,12 +617,13 @@ class ComplexTest(ComplexesAreIdenticalMixin, unittest.TestCase): +@@ -93,6 +280,7 @@ class ComplexTest(ComplexesAreIdenticalMixin, unittest.TestCase): + q = z.__truediv__(y) + self.assertClose(q, x) + ++ @slowTest + def test_truediv(self): + simple_real = [float(i) for i in range(-5, 6)] + simple_complex = [complex(x, y) for x in simple_real for y in simple_real] +@@ -338,7 +526,10 @@ class ComplexTest(ComplexesAreIdenticalMixin, unittest.TestCase): + + def test_boolcontext(self): + for i in range(100): +- self.assertTrue(complex(random() + 1e-6, random() + 1e-6)) ++ with torch._dynamo.set_fullgraph(False): ++ r1 = random() ++ r2 = random() ++ self.assertTrue(complex(r1 + 1e-6, r2 + 1e-6)) + self.assertTrue(not complex(0.0, 0.0)) + self.assertTrue(1j) + +@@ -431,12 +622,13 @@ class ComplexTest(ComplexesAreIdenticalMixin, unittest.TestCase): self.assertRaises(TypeError, complex, WithComplex(1), object()) self.assertRaises(TypeError, complex, WithComplex(None), object()) @@ -245,7 +266,7 @@ index 6ff1a8ab29d..01295e03efc 100644 self.assertRaises(EvilExc, complex, evilcomplex()) -@@ -460,31 +647,33 @@ class ComplexTest(ComplexesAreIdenticalMixin, unittest.TestCase): +@@ -460,31 +652,33 @@ class ComplexTest(ComplexesAreIdenticalMixin, unittest.TestCase): self.assertRaises(TypeError, complex, WithIndex(None), 1.5) self.assertRaises(TypeError, complex, 1.5, WithIndex(None)) @@ -299,7 +320,7 @@ index 6ff1a8ab29d..01295e03efc 100644 check(complex(complex0(1j)), 0.0, 42.0) with self.assertWarns(DeprecationWarning): -@@ -855,4 +1044,4 @@ class ComplexTest(ComplexesAreIdenticalMixin, unittest.TestCase): +@@ -855,4 +1049,4 @@ class ComplexTest(ComplexesAreIdenticalMixin, unittest.TestCase): if __name__ == "__main__": diff --git a/test/dynamo/cpython/3_13/test_complex.py b/test/dynamo/cpython/3_13/test_complex.py index 01295e03efc07..1572433c5aeff 100644 --- a/test/dynamo/cpython/3_13/test_complex.py +++ b/test/dynamo/cpython/3_13/test_complex.py @@ -14,6 +14,7 @@ from torch._dynamo.test_case import CPythonTestCase from torch.testing._internal.common_utils import ( run_tests, + slowTest, xfailIfTorchDynamo, ) @@ -279,6 +280,7 @@ def check_div(self, x, y): q 
= z.__truediv__(y) self.assertClose(q, x) + @slowTest def test_truediv(self): simple_real = [float(i) for i in range(-5, 6)] simple_complex = [complex(x, y) for x in simple_real for y in simple_real] @@ -524,7 +526,10 @@ def test_pow_with_small_integer_exponents(self): def test_boolcontext(self): for i in range(100): - self.assertTrue(complex(random() + 1e-6, random() + 1e-6)) + with torch._dynamo.set_fullgraph(False): + r1 = random() + r2 = random() + self.assertTrue(complex(r1 + 1e-6, r2 + 1e-6)) self.assertTrue(not complex(0.0, 0.0)) self.assertTrue(1j) diff --git a/test/dynamo/test_guard_serialization.py b/test/dynamo/test_guard_serialization.py index 8ff92321bb7bc..e826492089f63 100644 --- a/test/dynamo/test_guard_serialization.py +++ b/test/dynamo/test_guard_serialization.py @@ -235,6 +235,7 @@ def __hash__(self): pytree.register_constant(CustomConstantType) +@torch._dynamo.config.patch({"strict_precompile": True}) class TestGuardSerialization(torch._inductor.test_case.TestCase): def test_function_locals(self): def foo(x): diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py index 57983cea8e028..e86947aa2c101 100644 --- a/test/dynamo/test_misc.py +++ b/test/dynamo/test_misc.py @@ -12858,6 +12858,7 @@ def f(x): complex(real=1), complex(imag=1, real=2), complex("1+2j"), + complex(1, 2).conjugate(), ) return [x + z for z in c] diff --git a/test/dynamo/test_package.py b/test/dynamo/test_package.py index fdd01135ea2ff..ccf02769d56e9 100644 --- a/test/dynamo/test_package.py +++ b/test/dynamo/test_package.py @@ -35,6 +35,7 @@ def compute_loss_helper(x): @functorch_config.patch("bundled_autograd_cache", True) +@torch._dynamo.config.patch({"strict_precompile": True}) @instantiate_parametrized_tests class TestPackage(torch._inductor.test_case.TestCase): def path(self): diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py index 2da480c85f4ac..42ef410a548e7 100644 --- a/test/dynamo/test_repros.py +++ b/test/dynamo/test_repros.py @@ -7141,6 +7141,37 @@ def f(eye, out): torch.compile(f, backend="eager", fullgraph=True)(eye, out_res) self.assertEqual(out_ref, out_res) + def test_setitem_tensor_prop(self): + # Using the composite implicit of the forward would be incorrect + class MyFn(torch.autograd.Function): + @staticmethod + def forward(ctx, x): + return torch.matmul(x, x.t()) + + @staticmethod + def backward(ctx, grad_out): + return grad_out + + def fn(x, y): + x[0] = y[0] + return MyFn.apply(x) + + def inputs(): + torch.manual_seed(123) + x = torch.randn(10, 10) + y = torch.randn(10, 10, requires_grad=True) + return x, y + + x1, y1 = inputs() + fn(x1, y1).sum().backward() + self.assertTrue(x1.requires_grad) + + x2, y2 = inputs() + torch.compile(fn, backend="eager")(x2, y2).sum().backward() + self.assertTrue(x2.requires_grad) + + self.assertEqual(y1.grad, y2.grad) + def test_nn_parameter_ctor_graph_breaks(self): def fn(): param = torch.nn.Parameter(torch.ones(10)) diff --git a/test/dynamo/test_utils.py b/test/dynamo/test_utils.py index 13785d2409f34..b7166c5ce6d1b 100644 --- a/test/dynamo/test_utils.py +++ b/test/dynamo/test_utils.py @@ -322,6 +322,34 @@ def sample_func(): # Since the remaining logs are env specific, we just check if they are present instead of checking the exact string self.assertGreater(len(stack_strings), 1) + @dynamo_config.patch({"log_compilation_metrics": True}) + @inductor_config.patch({"force_disable_caches": True}) + def test_exception_stack_trace(self): + from torch._dynamo.exc import Unsupported + + def backward(grad_output): + print("graph 
break!") # This should trigger a Dynamo error + return grad_output + + compiled_backward = torch.compile(backward, backend="eager", fullgraph=True) + with mock.patch("torch._dynamo.utils.log_compilation_event") as log_event: + with self.assertRaisesRegex( + Unsupported, + "Dynamo does not know how to trace builtin operator `print`", + ): + compiled_backward(torch.ones(3)) + + compilation_events = [arg[0][0] for arg in log_event.call_args_list] + + self.assertGreater(len(compilation_events), 0) + self.assertGreater(len(compilation_events[0].exception_stack_trace), 0) + self.assertIn( + "Dynamo does not know how to trace builtin operator `print`", + compilation_events[0].exception_stack_trace[0], + "exception_stack_trace does not contain the expected string: " + "'Dynamo does not know how to trace builtin operator `print`'", + ) + @dynamo_config.patch( { "log_compilation_metrics": True, @@ -474,6 +502,7 @@ def test_dynamo_timed(self, mock_time, mock_time_ns): e.python_version = None e.stack_trace = None e.graph_node_shapes = None + e.exception_stack_trace = None # First event is for the forward. Formatting makes reading diffs # much easier. @@ -512,6 +541,7 @@ def test_dynamo_timed(self, mock_time, mock_time_ns): 'dynamo_time_before_restart_s': 0.0, 'end_time_us': 100, 'entire_frame_compile_time_s': 0.0, + 'exception_stack_trace': None, 'fail_reason': None, 'fail_type': None, 'fail_user_frame_filename': None, @@ -596,6 +626,7 @@ def test_dynamo_timed(self, mock_time, mock_time_ns): 'dynamo_time_before_restart_s': 0.0, 'end_time_us': 100, 'entire_frame_compile_time_s': 0.0, + 'exception_stack_trace': None, 'fail_reason': None, 'fail_type': None, 'fail_user_frame_filename': None, @@ -691,6 +722,7 @@ def test_dynamo_timed(self, mock_time, mock_time_ns): 'dynamo_time_before_restart_s': None, 'end_time_us': 100, 'entire_frame_compile_time_s': None, + 'exception_stack_trace': None, 'fail_reason': None, 'fail_type': None, 'fail_user_frame_filename': None, @@ -775,6 +807,7 @@ def test_dynamo_timed(self, mock_time, mock_time_ns): 'dynamo_time_before_restart_s': None, 'end_time_us': 100, 'entire_frame_compile_time_s': None, + 'exception_stack_trace': None, 'fail_reason': None, 'fail_type': None, 'fail_user_frame_filename': None, diff --git a/test/dynamo_expected_failures/CPython313-test_complex-ComplexTest.test_conjugate b/test/dynamo_expected_failures/CPython313-test_complex-ComplexTest.test_conjugate deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/test/dynamo_expected_failures/CPython313-test_complex-ComplexTest.test_getnewargs b/test/dynamo_expected_failures/CPython313-test_complex-ComplexTest.test_getnewargs deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/test/export/test_draft_export.py b/test/export/test_draft_export.py index fe95d9538fef2..6cf819958fccf 100644 --- a/test/export/test_draft_export.py +++ b/test/export/test_draft_export.py @@ -296,8 +296,7 @@ def forward(self, a, b, c): res = torch.ops.mylib.foo1(a, b) c_item = c.item() - if c_item > 0: - return res[:c_item] + return res[:c_item] inp = (torch.ones(3, 3), torch.ones(3, 3), torch.tensor(3)) @@ -368,8 +367,8 @@ def forward(self, x, y): a = a + 5 z = torch.cat([y, y]) - if a > 0: - return z[:a] + + return z[:a] ep = draft_export( M(), @@ -387,7 +386,7 @@ def forward(self, x, y): for node in _ep.graph.nodes: if bindings := node.meta.get("unbacked_bindings"): unbacked_binding_symbols.update(bindings.keys()) - self.assertEqual(len(unbacked_binding_symbols), 2) + 
self.assertEqual(len(unbacked_binding_symbols), 1) def test_offsets(self): class M(torch.nn.Module): diff --git a/test/export/test_export.py b/test/export/test_export.py index 48b1c57a7edde..78d968ae6c721 100755 --- a/test/export/test_export.py +++ b/test/export/test_export.py @@ -3089,32 +3089,6 @@ def forward(self, x): }, ) - def test_unbacked_slice_forward(self): - class Foo(torch.nn.Module): - def forward(self, x, xs): - u0, u1 = xs.tolist() - out = x[u0:u1] - return out - - x = torch.randn(10) - idxs = torch.tensor([3, 6]) - mod = Foo() - ep = export(mod, (x, idxs)) - for xs in [ - idxs, - torch.tensor([-9, -1]), - torch.tensor([-10000, 10000]), - torch.tensor([0, -10]), - ]: - self.assertTrue(torch.allclose(ep.module()(x, xs), mod(x, xs))) - - # check unbacked bindings - # should be 4 symbols: u0, u1, output size, output storage offset - bound_unbacked = set() - for node in ep.graph.nodes: - bound_unbacked |= node.meta.get("unbacked_bindings", {}).keys() - self.assertEqual(len(bound_unbacked), 4) - def test_dim_hint_ranges(self): class Foo(torch.nn.Module): def forward(self, x, y): @@ -4367,80 +4341,6 @@ def forward(self, xs): x = torch.tensor([1, 2]) self.assertTrue(torch.allclose(mod(x), ep.module()(x))) - def test_nested_module_fake_tensor_leak(self): - class Bar(torch.nn.Module): - def __init__(self): - super().__init__() - self._tensor_cache = None - - def forward(self, x): - if self._tensor_cache is None: - self._tensor_cache = x + 2 - return self._tensor_cache.sum() + x.sum() - - class Foo(torch.nn.Module): - def __init__(self, bar): - super().__init__() - self.bar = bar - - def forward(self, x): - return self.bar(x) - - foo = Foo(Bar()) - _ = export(foo, (torch.ones(4, 4),), strict=False) - self.assertTrue(foo.bar._tensor_cache is None) - - def test_export_leak_compile(self): - class BaseModule(torch.nn.Module): - def forward(self, *args, **kwargs): - raise NotImplementedError - - class CacheModule(BaseModule): - def __init__(self, cache: torch.Tensor): - super().__init__() - assert cache.ndim == 3 - self.cache = torch.nn.Parameter(cache, requires_grad=False) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - n_tokens = x.size(1) - rolled_cache = torch.roll(self.cache.data, -n_tokens, dims=1) - rolled_cache[:, -n_tokens:, :] = x - self.cache.data = rolled_cache - return self.cache - - class LinearBlock(torch.nn.Module): - def __init__(self, in_features, out_features, activation=None): - super().__init__() - self.linear = torch.nn.Linear(in_features, out_features) - self.activation = activation - - def forward(self, x): - x = self.linear(x) - return self.activation(x) if self.activation else x - - class MyModel(BaseModule): - def __init__(self): - super().__init__() - default_cache = torch.zeros(1, 10, 5) - self.cache_layer = CacheModule(default_cache) - self.fc1 = LinearBlock(5, 10, activation=torch.nn.ReLU()) - self.fc2 = LinearBlock(10, 5) - - def forward(self, x): - cached = self.cache_layer(x) - out = self.fc1(cached) - out = self.fc2(out) - return out - - with self.assertRaisesRegex( - RuntimeError, - "We found a fake tensor in the exported program constant's list. " - "This typically means our tracing system encountered an op that we can't trace through. " - "For the potential source, you can refer to following model attribute: cache_layer.lifted_tensor_0. 
" - "Please file an issue on github.", - ): - _ = export(MyModel(), (torch.randn(1, 3, 5),), strict=False) - def test_export_for_training_with_container_type(self): class Foo(torch.nn.Module): def __init__(self) -> None: @@ -5865,7 +5765,7 @@ def forward(self, arg1, arg2, *args, kw1, kw2, **kwargs): } self._test_export_same_as_eager(kw_func, args, kwargs) - def test_unbacked_slice_simple(self): + def test_unbacked_slice(self): class M(torch.nn.Module): def forward(self, scores, score_thr, topk: torch.Tensor, results=None): valid_mask = scores > score_thr diff --git a/test/export/test_nativert.py b/test/export/test_nativert.py index 044b6051400d4..bcbda2e42fc10 100644 --- a/test/export/test_nativert.py +++ b/test/export/test_nativert.py @@ -86,7 +86,7 @@ def run_with_nativert(ep): MODEL_NAME = "forward" # TODO Does named tempfile have collision? - with tempfile.NamedTemporaryFile(delete=False) as f: + with tempfile.NamedTemporaryFile(suffix=".pt2", delete=False) as f: torch.export.pt2_archive._package.package_pt2( f, exported_programs={MODEL_NAME: ep_infer} ) diff --git a/test/functorch/test_control_flow.py b/test/functorch/test_control_flow.py index 0fb4ba041356d..6b8d23a191cab 100644 --- a/test/functorch/test_control_flow.py +++ b/test/functorch/test_control_flow.py @@ -8837,22 +8837,6 @@ def test_function_schema_gen(self): self.assertEqual(schema2.parse(str(schema2)), schema2) self.assertEqual(schema3.parse(str(schema3)), schema3) - def test_while_loop_schema_gen(self): - fn, inp = WHILE_LOOP_TESTS["simple_with_linear"] - graph = make_fx(fn)(*inp).graph - while_loop_node = next( - node - for node in graph.nodes - if node.op == "call_function" - and node.target is torch.ops.higher_order.while_loop - ) - schema = torch._library.utils.hop_schema_from_fx_node(while_loop_node) - self.assertExpectedInline( - str(schema), - """while_loop(GraphModule cond_fn, GraphModule body_fn, Tensor[2] carried_inputs, Tensor[3] additional_inputs) -> Tensor[2]""", # noqa: B950 - ) - self.assertEqual(schema.parse(str(schema)), schema) - def test_schema_tree_spec(self): schema_gen = HopSchemaGenerator(torch.ops.higher_order.cond) args = (torch.randn(3, 4), torch.randn(2, 3)) diff --git a/test/higher_order_ops/test_invoke_subgraph.py b/test/higher_order_ops/test_invoke_subgraph.py index df1bd941d8857..fc6fd1c10fc6c 100644 --- a/test/higher_order_ops/test_invoke_subgraph.py +++ b/test/higher_order_ops/test_invoke_subgraph.py @@ -1809,6 +1809,35 @@ def fn(x): self.assertEqual(ref, res) res.sum().backward() + @requires_gpu + def test_ac_rng_cudagraphs(self): + def fn1(q, k, v): + return torch.nn.functional.scaled_dot_product_attention( + q, k, v, attn_mask=None, dropout_p=0.5, is_causal=True + ) + + @nested_compile_region + def fn1_checkpoint(q, k, v): + return torch.utils.checkpoint.checkpoint(fn1, q, k, v, use_reentrant=False) + + def fn(q, k, v): + return fn1_checkpoint(q, k, v) + fn1_checkpoint(q.cos(), k, v) + + q = torch.randn( + 1, 1, 32, 32, device=GPU_TYPE, dtype=torch.bfloat16, requires_grad=True + ) + k = torch.randn( + 1, 1, 32, 32, device=GPU_TYPE, dtype=torch.bfloat16, requires_grad=True + ) + v = torch.randn( + 1, 1, 32, 32, device=GPU_TYPE, dtype=torch.bfloat16, requires_grad=True + ) + + res = torch.compile( + fn, backend="inductor", fullgraph=True, mode="reduce-overhead" + )(q, k, v) + res.sum().backward() + def test_fake_tensor_checking(self): @nested_compile_region def gn(x): diff --git a/test/inductor/test_analysis.py b/test/inductor/test_analysis.py index ac0467a2d1b80..55f5bec86c539 100644 
--- a/test/inductor/test_analysis.py +++ b/test/inductor/test_analysis.py @@ -543,6 +543,99 @@ def test_pointwise_bandwidth(self, device, dtype, maxat): if event["name"] == "triton_poi_fused_add_randn_sin_0": event["args"]["kernel_num_gb"] = 0.002097168 + @skipIf(not SM80OrLater, "Requires SM80") + @dtypes(torch.float, torch.float16) + def test_combine_profiles(self, device, dtype): + """ + Test combining multiple profiles into a single profile. + """ + if device == "cpu" or torch.version.hip is not None: + return + + # Create three different models to generate different traces + om1 = _test_model(device, dtype, addmm=True, bmm=False) + om2 = _test_model(device, dtype, addmm=False, bmm=True) + om3 = _pointwise_test_model(device, dtype) + + # Generate three separate traces + trace1, trace2 = trace_files() + trace3 = f"{TMP_DIR}/trace3-{uuid.uuid4()}.json" + combined_trace = f"{TMP_DIR}/combined-{uuid.uuid4()}.json" + + # Generate first trace + torch._dynamo.reset() + with fresh_inductor_cache(): + with torch.profiler.profile(record_shapes=True) as p1: + om1() + p1.export_chrome_trace(trace1) + + # Generate second trace + torch._dynamo.reset() + with fresh_inductor_cache(): + with torch.profiler.profile(record_shapes=True) as p2: + om2() + p2.export_chrome_trace(trace2) + + # Generate third trace + torch._dynamo.reset() + with fresh_inductor_cache(): + with torch.profiler.profile(record_shapes=True) as p3: + om3() + p3.export_chrome_trace(trace3) + + # Combine the three traces + with patch( + "sys.argv", + [ + *prefix, + "--combine", + trace1, + trace2, + trace3, + combined_trace, + ], + ): + main() + + # Verify the combined trace exists and contains expected data + with open(combined_trace) as f: + combined_profile = json.load(f) + + # Load original traces for comparison + with open(trace1) as f: + profile1 = json.load(f) + with open(trace2) as f: + profile2 = json.load(f) + with open(trace3) as f: + profile3 = json.load(f) + + # Verify trace events are combined + expected_event_count = ( + len(profile1["traceEvents"]) + + len(profile2["traceEvents"]) + + len(profile3["traceEvents"]) + ) + self.assertEqual(len(combined_profile["traceEvents"]), expected_event_count) + + # Verify device properties are present + self.assertIn("deviceProperties", combined_profile) + self.assertGreater(len(combined_profile["deviceProperties"]), 0) + + # Verify some trace events from each original profile are present + combined_event_names = { + event["name"] for event in combined_profile["traceEvents"] + } + + # Check that we have events from each original profile + profile1_event_names = {event["name"] for event in profile1["traceEvents"]} + profile2_event_names = {event["name"] for event in profile2["traceEvents"]} + profile3_event_names = {event["name"] for event in profile3["traceEvents"]} + + # At least some events from each profile should be in the combined profile + self.assertTrue(profile1_event_names.intersection(combined_event_names)) + self.assertTrue(profile2_event_names.intersection(combined_event_names)) + self.assertTrue(profile3_event_names.intersection(combined_event_names)) + instantiate_device_type_tests(TestAnalysis, globals()) diff --git a/test/inductor/test_aot_inductor.py b/test/inductor/test_aot_inductor.py index 0889c948de0c4..1767e76f04cf2 100644 --- a/test/inductor/test_aot_inductor.py +++ b/test/inductor/test_aot_inductor.py @@ -60,6 +60,7 @@ MACOS_VERSION, MI300_ARCH, parametrize, + runOnRocm, skipIfMPS, skipIfRocm, skipIfRocmArch, @@ -6416,6 +6417,43 @@ def forward(self, x): 
rtol=1e-3, ) + @runOnRocm + def test_rocm_triton_autotuning(self): + if self.device != GPU_TYPE: + raise unittest.SkipTest("requires GPU") + + class Model(torch.nn.Module): + def forward(self, x, y, m): + _M, K = x.shape + K, N = y.shape + M = torch.abs(m) + out = torch.empty((_M, N), device=x.device, dtype=torch.float32) + grid = lambda META: ( # noqa: E731 + triton.cdiv( + 4096 * 2046, META["BLOCK_SIZE_M"] * META["BLOCK_SIZE_N"] + ), + ) + strange_config_matmul_kernel[grid]( + x, + y, + out, + M, + N, + K, + ) + return out + + x = torch.randn(4096, 1024, device=self.device) + y = torch.randn(1024, 2048, device=self.device) + m = torch.tensor([4096], dtype=torch.int32, device=self.device) + + with config.patch("triton.autotune_with_sample_inputs", True): + # The tuned best config on XPU is different with CUDA. + grid_0 = 32736 if GPU_TYPE == "xpu" else 1023 + self.code_check_count( + Model(), (x, y, m), f"uint32_t grid_0 = {grid_0}L;", 1 + ) + @skipIfRocm # RoCM does not support the config block size in test suite. def test_triton_autotuning(self): if self.device != GPU_TYPE: diff --git a/test/inductor/test_compiled_optimizers.py b/test/inductor/test_compiled_optimizers.py index 3b23e7a51f702..09690243475d5 100644 --- a/test/inductor/test_compiled_optimizers.py +++ b/test/inductor/test_compiled_optimizers.py @@ -1,5 +1,6 @@ # Owner(s): ["module: inductor"] +import random import sys import types import unittest @@ -583,6 +584,9 @@ class CompiledOptimizerParityTests(TestCase): @optims(optim_db, dtypes=[torch.float32]) @parametrize("use_closure", [True, False]) def test_correctness(self, device, dtype, optim_info, use_closure): + torch.cuda.manual_seed_all(0) + torch.manual_seed(0) + random.seed(0) optim_cls = optim_info.optim_cls all_optim_inputs = _get_optim_inputs_including_global_cliquey_kwargs( device, dtype, optim_info, skip=("differentiable",) @@ -604,7 +608,10 @@ def test_correctness(self, device, dtype, optim_info, use_closure): torch._inductor.metrics.reset() input = torch.ones([10, 10], device=device) model_eager = torch.nn.Sequential( - *[torch.nn.Linear(10, 10, device=device) for _ in range(2)] + *[ + torch.nn.Linear(10, 10, device=device, bias=False) + for _ in range(2) + ] ) model_eager(input).sum().backward() model_compiled = deepcopy(model_eager) diff --git a/test/inductor/test_cpu_select_algorithm.py b/test/inductor/test_cpu_select_algorithm.py index 54531bd8b0681..fe1e59bd7f49a 100644 --- a/test/inductor/test_cpu_select_algorithm.py +++ b/test/inductor/test_cpu_select_algorithm.py @@ -2767,6 +2767,33 @@ def forward(self, x, w): self.assertEqual(counters["inductor"]["cpp_templated_kernel_counter"], 1) self.assertEqual(counters["inductor"]["cpp_epilogue_fusion_counter"], 1) + @patches + @torch.no_grad + @parametrize("bs", (1, 50)) + @parametrize("Mdim", (192,)) + @parametrize("Kdim", (196,)) + @parametrize("Ndim", (84, 385)) + @dtypes(torch.float, torch.bfloat16, torch.half) + def test_bmm_with_y_storage_offset(self, dtype, bs, Mdim, Kdim, Ndim): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + # y_with_offset: contiguous, but has non-zero storage offset + y_with_offset = torch.empty( + (3, *y.shape), dtype=y.dtype, device=y.device + )[2].copy_(y) + return x @ y_with_offset + + counters.clear() + u = torch.randn(bs, Mdim, Kdim).to(dtype=dtype) + v = torch.randn(bs, Kdim, Ndim).to(dtype=dtype) + mod = M().to(dtype=dtype).eval() + with verify(dtype) as (atol, rtol): + self.common(mod, (u, v), atol=atol, rtol=rtol) + 
self.assertEqual(counters["inductor"]["cpp_templated_kernel_counter"], 1) + @patches @torch.no_grad @dtypes(torch.float) diff --git a/test/inductor/test_flex_attention.py b/test/inductor/test_flex_attention.py index 8e4746212a0bc..1d365d99e74d0 100644 --- a/test/inductor/test_flex_attention.py +++ b/test/inductor/test_flex_attention.py @@ -5997,6 +5997,56 @@ def bias_func(score, b, h, q_idx, kv_idx): ], ) + @skip_on_cpu + @common_utils.parametrize( + "params", get_params(device_configs["cuda"].dtypes), name_fn=lambda x: f"{x}" + ) + @torch.compile + def test_learnable_bias_global_compiled(self, device, params): + batch_size = 1 + num_heads = 1 + seq_len = 128 + head_dim = 16 + d_model = num_heads * head_dim + + query = torch.randn(batch_size, num_heads, seq_len, head_dim, device=device) + key = torch.randn(batch_size, num_heads, seq_len, head_dim, device=device) + value = torch.randn(batch_size, num_heads, seq_len, head_dim, device=device) + + out_proj = nn.Linear(d_model, d_model, device=device) + + query.requires_grad = True + key.requires_grad = True + value.requires_grad = True + + bias = torch.randn( + batch_size, + num_heads, + seq_len, + seq_len, + device=device, + requires_grad=True, + ) + + def bias_mod(score, b, h, q_idx, kv_idx): + return score + bias[b, h, q_idx, kv_idx] + + out = flex_attention( + query=query, + key=key, + value=value, + score_mod=bias_mod, + ) + out = out.transpose(1, 2).contiguous().view(batch_size, seq_len, d_model) + + attn_output = out_proj(out) + random_target = torch.randn(batch_size, seq_len, d_model, device=device) + loss = torch.nn.functional.mse_loss(attn_output, random_target) + loss.backward() + + assert bias.grad, "No gradient computed for bias" + assert torch.any(bias.grad != 0), "Gradient for bias is 0" + @skip_on_cpu @common_utils.parametrize( "params", get_params(device_configs["cuda"].dtypes), name_fn=lambda x: f"{x}" diff --git a/test/inductor/test_fxir_backend.py b/test/inductor/test_fxir_backend.py index aa569411ffd6e..ba80d5cc965c6 100644 --- a/test/inductor/test_fxir_backend.py +++ b/test/inductor/test_fxir_backend.py @@ -545,6 +545,18 @@ def run(*args, **kwargs): if use_dynamic_shapes: self.assertEqual(type(shape[0]), torch.fx.Node) + def test_output_slice_view(self): + """ + Test when the output is a view of the input. + The sliced strides create a TensorBox in the output IR. 
+ """ + + def foo(x): + return x[0:2:2].T[3:].squeeze(0) + + args = [torch.rand([4, 4, 4, 4], device=self.device)] + self._compile_and_check(foo, args, expected_num_triton_kernels=0) + class AOTFxirTestCase(InductorTestCase): device = GPU_TYPE @@ -557,6 +569,13 @@ def check(self, model, inp, dynamic_shapes=None): ) self.assertTrue(torch.allclose(model(*inp), gm(*inp))) + for node in gm.graph.nodes: + if ( + node.op == "call_function" + and node.target != triton_kernel_wrapper_mutation + ): + self.assertTrue(node.meta.get("val", None) is not None) + def test_aoti_fx_add(self): class M(torch.nn.Module): def forward(self, x, y): diff --git a/test/inductor/test_max_autotune.py b/test/inductor/test_max_autotune.py index 6cb2a16f8dacf..f5c8e532433b8 100644 --- a/test/inductor/test_max_autotune.py +++ b/test/inductor/test_max_autotune.py @@ -98,7 +98,8 @@ def benchmark(self, *args, out): @instantiate_parametrized_tests class TestMaxAutotune(TestCase): @parametrize("dynamic", (False, True)) - def test_max_autotune_mm_plus_mm_zero_size_input(self, dynamic): + @parametrize("search_space", ("DEFAULT", "EXHAUSTIVE")) + def test_max_autotune_mm_plus_mm_zero_size_input(self, dynamic, search_space): """ Make sure autotuning mm_plus_mm with zero-size input works without crashes. """ @@ -112,7 +113,9 @@ def mm_plus_mm(a, b, c, d): c = torch.randn(m, k).to(GPU_TYPE) d = torch.randn(k, n).to(GPU_TYPE) - with config.patch({"max_autotune": True}): + with config.patch( + {"max_autotune": True, "max_autotune_gemm_search_space": search_space} + ): torch.compile(mm_plus_mm, dynamic=dynamic)(a, b, c, d) @unittest.skipIf( @@ -532,7 +535,8 @@ def addmm(x, a, b): with config.patch({"max_autotune": True}): torch.compile(addmm, dynamic=dynamic)(x, a, b) - def test_autotune_conv1x1(self): + @parametrize("search_space", ("DEFAULT", "EXHAUSTIVE")) + def test_autotune_conv1x1(self, search_space): # Assuming input has 3 channels and we want to produce 16 channels as output conv1x1 = ( torch.nn.Conv2d(in_channels=3, out_channels=16, kernel_size=1) @@ -549,7 +553,11 @@ def test_autotune_conv1x1(self): ) with config.patch( - {"max_autotune": True, "max_autotune_gemm_backends": "TRITON"} + { + "max_autotune": True, + "max_autotune_gemm_backends": "TRITON", + "max_autotune_gemm_search_space": search_space, + } ): @torch.compile() @@ -661,7 +669,9 @@ def f(x, y): self.assertTrue(torch.allclose(act, ref, atol=4 * 1e-3, rtol=4 * 1e-3)) @config.patch(max_autotune=True) - def test_empty_conv_input(self, kernel_size=3): + @parametrize("search_space", ("DEFAULT", "EXHAUSTIVE")) + @parametrize("kernel_size", (1, 3)) + def test_empty_conv_input(self, search_space, kernel_size): x = torch.randn(0, 256, 14, 14, device=GPU_TYPE) weight = torch.randn(256, 256, kernel_size, kernel_size, device=GPU_TYPE) @@ -678,17 +688,15 @@ def f(x, weight): groups=1, ) - opt_f = torch.compile(f) - ref = f(x, weight) - act = opt_f(x, weight) - self.assertTrue(torch.allclose(ref, act, atol=4 * 1e-3, rtol=4 * 1e-3)) - - @config.patch(max_autotune=True) - def test_empty_conv_input_with_1x1_kernel(self): - self.test_empty_conv_input(kernel_size=1) + with config.patch({"max_autotune_gemm_search_space": search_space}): + opt_f = torch.compile(f) + ref = f(x, weight) + act = opt_f(x, weight) + self.assertTrue(torch.allclose(ref, act, atol=4 * 1e-3, rtol=4 * 1e-3)) @config.patch(max_autotune_gemm_backends="TRITON") - def test_baddmm(self): + @parametrize("search_space", ("DEFAULT", "EXHAUSTIVE")) + def test_baddmm(self, search_space): class M(torch.nn.Module): def 
__init__(self): super().__init__() @@ -707,11 +715,12 @@ def forward(self, x): ) mod = M().to(GPU_TYPE) - m_c = torch.compile(mode="max-autotune")(mod) - out, code = run_and_get_code(m_c, x) - self.assertEqual(out, mod(x), atol=2e-3, rtol=2e-3) + with config.patch({"max_autotune_gemm_search_space": search_space}): + m_c = torch.compile(mode="max-autotune")(mod) + out, code = run_and_get_code(m_c, x) + self.assertEqual(out, mod(x), atol=2e-3, rtol=1e-3) - FileCheck().check("triton_tem_fused_baddbmm").run(code[0]) + FileCheck().check("triton_tem_fused_baddbmm").run(code[0]) @config.patch(max_autotune=True) def test_conv1x1_with_free_symbols(self): @@ -846,7 +855,8 @@ def test_cat_max_autotune_extern(self): def test_cat_max_autotune_triton(self): self._test_cat_max_autotune_impl(using_triton_mm=True) - def test_conv_cat(self): + @parametrize("search_space", ("DEFAULT", "EXHAUSTIVE")) + def test_conv_cat(self, search_space): class ToyModel(torch.nn.Module): def __init__(self): super().__init__() @@ -858,24 +868,28 @@ def forward(self, x): x = self.conv(x) return torch.cat((x, x + 1)) - with torch.no_grad(): - m = ToyModel().to(device=GPU_TYPE) - input_tensor = torch.randn(32, 3, 64, 64).to(device=GPU_TYPE) + with config.patch({"max_autotune_gemm_search_space": search_space}): + with torch.no_grad(): + m = ToyModel().to(device=GPU_TYPE) + input_tensor = torch.randn(32, 3, 64, 64).to(device=GPU_TYPE) - # convolution is not currently plannable - m = torch.compile(m, mode="max-autotune-no-cudagraphs") - out, code = run_and_get_code(m, input_tensor) - self.assertEqual(out, m(input_tensor)) + # convolution is not currently plannable + m = torch.compile(m, mode="max-autotune-no-cudagraphs") + out, code = run_and_get_code(m, input_tensor) + self.assertEqual(out, m(input_tensor)) - if not TEST_WITH_ROCM: - FileCheck().check("def triton_poi_fused_add_cat_").run(code[0]) + if not TEST_WITH_ROCM: + FileCheck().check("def triton_poi_fused_add_cat_").run(code[0]) - def test_conv3d(self): + @parametrize("search_space", ("DEFAULT", "EXHAUSTIVE")) + def test_conv3d(self, search_space): fn = torch.nn.functional.conv3d image = torch.randn([1, 3, 8, 16, 32]) filt = torch.randn([3, 3, 7, 7, 7]) - with config.patch({"max_autotune": True}): + with config.patch( + {"max_autotune": True, "max_autotune_gemm_search_space": search_space} + ): expected = fn(image, filt) actual = torch.compile(fn)(image, filt) torch.testing.assert_close(actual, expected, atol=6e-5, rtol=0.001) @@ -1380,7 +1394,7 @@ def func_test1(x, y, z, m): 'num_stages':1,'num_warps':2,'prefix_args':0,'suffix_args':0,'call_sizes':[10,30], 'layout':"[[10,30],[30,1],torch.float32,device(type='cuda',index=0),0]", 'num_consumer_groups':0,'num_buffers_warp_spec':0,'epilogue_fn_hash':'identity', - 'kwargs':{'EVEN_K':False,'FLOAT32_PRECISION':'"tf32"','USE_FAST_ACCUM':False,'ACC_TYPE':'tl.float32', + 'kwargs':{'EVEN_K':False,'ALLOW_TF32':True,'USE_FAST_ACCUM':False,'ACC_TYPE':'tl.float32', 'BLOCK_M':16,'BLOCK_N':32,'BLOCK_K':16,'GROUP_M':8},'hint_override':None}""" expected = expected.replace("cuda", GPU_TYPE) @@ -1419,7 +1433,7 @@ def func_test1(x, y, z, m): "[[s27,s94],[s94,1],torch.float32,device(type='cuda',index=0),0]"], 'num_stages':1,'num_warps':2,'prefix_args':0,'suffix_args':0,'call_sizes':[s77,s94], 'layout':"[[s77,s94],[s94,1],torch.float32,device(type='cuda',index=0),0]",'num_consumer_groups':0, - 'num_buffers_warp_spec':0,'epilogue_fn_hash':'identity','kwargs':{'EVEN_K':False,'FLOAT32_PRECISION':'"tf32"', + 
'num_buffers_warp_spec':0,'epilogue_fn_hash':'identity','kwargs':{'EVEN_K':False,'ALLOW_TF32':True, 'USE_FAST_ACCUM':False,'ACC_TYPE':'tl.float32','BLOCK_M':16,'BLOCK_N':32,'BLOCK_K':16,'GROUP_M':8},'hint_override':None}""" expected = expected.replace("cuda", GPU_TYPE) self.assertExpectedInline( @@ -1919,8 +1933,9 @@ def mm(a, b): with config.patch({"max_autotune": True, "autotune_in_subproc": True}): torch.compile(mm, dynamic=dynamic)(a, b) + @parametrize("search_space", ("DEFAULT", "EXHAUSTIVE")) @parametrize("dynamic", (False, True)) - def test_max_autotune_addmm(self, dynamic=False): + def test_max_autotune_addmm(self, search_space, dynamic=False): """ Make sure autotuning addmm in sub processes work without crashes. """ @@ -1933,7 +1948,13 @@ def addmm(x, a, b): x = torch.randn(100).to(GPU_TYPE) a = torch.randn(100, 10).to(GPU_TYPE) b = torch.randn(10, 100).to(GPU_TYPE) - with config.patch({"max_autotune": True, "autotune_in_subproc": True}): + with config.patch( + { + "max_autotune": True, + "autotune_in_subproc": True, + "max_autotune_gemm_search_space": search_space, + } + ): Y_compiled = torch.compile(addmm, dynamic=dynamic)(x, a, b) Y = addmm(x, a, b) torch.testing.assert_close(Y_compiled, Y, atol=1e-2, rtol=1e-2) diff --git a/test/inductor/test_padding.py b/test/inductor/test_padding.py index 41944a9169239..15c1abdf32db2 100644 --- a/test/inductor/test_padding.py +++ b/test/inductor/test_padding.py @@ -49,18 +49,6 @@ def geninp(): return input_dict -def get_padded_stride(shape, alignment_bytes, pad_output, itemsize): - align = alignment_bytes // itemsize - new_strides = [0 for _ in range(len(shape))] - new_strides[len(shape) - 1] = 1 - for i in range(len(shape) - 1, 0, -1): - stride = shape[i] * new_strides[i] - if pad_output and stride % align != 0: - stride = (stride + align - 1) // align * align - new_strides[i - 1] = stride - return tuple(new_strides) - - class LinearAndSoftmax(nn.Module): """ It's very common that a transformer model will do a matmul and then @@ -757,11 +745,20 @@ def get_input(size: tuple[int], alignment_bytes: int) -> torch.Tensor: input_tensors = [get_input(shape, alignment_bytes) for _ in range(num_inputs)] config_patches = { + "compile_threads": 1, "comprehensive_padding": pad_output, "cpu_backend": "triton", + "disable_padding_cpu": False, + "implicit_fallbacks": False, + "inplace_buffers": False, "padding_alignment_bytes": alignment_bytes, + "pad_channels_last": True, "pad_outputs": True, "padding_stride_threshold": 0, + "triton.prefer_nd_tiling": True, + "triton.use_block_ptr": True, + "triton.codegen_upcast_to_fp32": False, + "unroll_reductions_threshold": 1, } with config.patch(config_patches): compiled = torch.compile(torch.cat) @@ -770,89 +767,7 @@ def get_input(size: tuple[int], alignment_bytes: int) -> torch.Tensor: output_shape = (shape[0] * num_inputs, shape[1]) output_stride = input_tensors[0].stride() output_line = f"buf12 = empty_strided_{GPU_TYPE}({output_shape}, {output_stride}, torch.float32)" - self.assertTrue(output_line in code[0]) - - @parametrize( - "shape,alignment_bytes,pad_output", - [ - ((512, 1), 32, False), - ((512, 1), 32, True), - ((32, 30), 64, False), - ((32, 30), 64, True), - ((512, 100, 1), 32, False), - ((512, 100, 1), 32, True), - ((32, 50, 30), 64, False), - ((32, 50, 30), 64, True), - ], - ) - def test_outer_dynamic_shape_padding(self, shape, alignment_bytes, pad_output): - """ - When only the outermost dim is dynamic shape, the output can still be padded up - based on padding configuration. 
- """ - num_inputs = 2 - input_tensors = [ - torch.randn(shape, dtype=torch.float32) for _ in range(num_inputs) - ] - - config_patches = { - "comprehensive_padding": pad_output, - "cpu_backend": "triton", - "padding_alignment_bytes": alignment_bytes, - "pad_outputs": True, - "padding_stride_threshold": 0, - } - with config.patch(config_patches): - torch._dynamo.mark_dynamic(input_tensors[0], 0) - torch._dynamo.mark_dynamic(input_tensors[1], 0) - compiled = torch.compile(torch.add) - result, _ = run_and_get_code(compiled, *input_tensors) - - expected_stride = get_padded_stride( - result.shape, alignment_bytes, pad_output, result.dtype.itemsize - ) - self.assertEqual(result.stride(), expected_stride) - - @parametrize( - "shape,alignment_bytes,pad_output", - [ - ((500, 10, 1), 32, False), - ((500, 20, 1), 32, True), - ((30, 10, 20), 64, True), - ((30, 10, 20), 64, False), - ], - ) - def test_perm_outer_dynamic_shape_padding(self, shape, alignment_bytes, pad_output): - """ - When only the outermost dim is dynamic shape, the output can still be padded up - based on padding configuration. Test when this occurs after a permute op. - """ - - def permute_contig(x): - return torch.transpose(x, 0, 2).contiguous() - - num_inputs = 1 - input_tensors = [ - torch.randn(shape, dtype=torch.float32) for _ in range(num_inputs) - ] - - config_patches = { - "comprehensive_padding": pad_output, - "cpu_backend": "triton", - "padding_alignment_bytes": alignment_bytes, - "pad_outputs": True, - "padding_stride_threshold": 0, - "triton.use_block_ptr": True, - } - with config.patch(config_patches): - torch._dynamo.mark_dynamic(input_tensors[0], 2) - compiled = torch.compile(permute_contig) - result, _ = run_and_get_code(compiled, *input_tensors) - - expected_stride = get_padded_stride( - result.shape, alignment_bytes, pad_output, result.dtype.itemsize - ) - self.assertEqual(result.stride(), expected_stride) + self.assertTrue(any(output_line in line for line in code)) if __name__ == "__main__": diff --git a/test/inductor/test_triton_kernels.py b/test/inductor/test_triton_kernels.py index 8fb22219302bf..fc9f92477c79d 100644 --- a/test/inductor/test_triton_kernels.py +++ b/test/inductor/test_triton_kernels.py @@ -3081,7 +3081,7 @@ def fwd_kernel( # Compute output w = tl.load(w1_block_ptr) b = tl.load(b1_block_ptr) - o = tl.dot(x, w, input_precision="ieee") + o = tl.dot(x, w, allow_tf32=False) o += b[None, :] # Store output diff --git a/test/onnx/exporter/test_api.py b/test/onnx/exporter/test_api.py index 593cc524ebe7e..24a9176bbe5bc 100644 --- a/test/onnx/exporter/test_api.py +++ b/test/onnx/exporter/test_api.py @@ -7,11 +7,9 @@ import logging import os -import numpy as np -from onnxscript import BOOL, FLOAT, ir, opset18 as op +from onnxscript import BOOL, FLOAT, opset18 as op import torch -import torch.onnx._flags from torch.onnx._internal.exporter import _testing as onnx_testing from torch.testing._internal import common_utils @@ -30,6 +28,11 @@ def forward(self, x, b): return (y, z) +class SampleModelReduction(torch.nn.Module): + def forward(self, x): + return x.sum() + + class SampleModelForDynamicShapes(torch.nn.Module): def forward(self, x, b): return x.relu(), b.sigmoid() @@ -67,6 +70,7 @@ def assert_export( ) assert onnx_program is not None onnx_testing.assert_onnx_program(onnx_program, strategy=strategy) + return onnx_program def test_args_normalization_with_no_kwargs(self): self.assert_export( @@ -74,6 +78,18 @@ def test_args_normalization_with_no_kwargs(self): (torch.randn(1, 1, 2), torch.randn(1, 1, 2)), ) + 
def test_lower_opset_support(self): + # First test that opset 18 (torchlib opset works) + onnx_program = self.assert_export( + SampleModelReduction(), (torch.randn(1, 1, 2),), opset_version=18 + ) + self.assertEqual(onnx_program.model.opset_imports[""], 18) + + onnx_program = self.assert_export( + SampleModelReduction(), (torch.randn(1, 1, 2),), opset_version=16 + ) + self.assertEqual(onnx_program.model.opset_imports[""], 16) + def test_symbolic_argument_user_input_is_supported_by_report_and_call(self): class constant_plus_tensor_inputs(torch.nn.Module): def forward(self, a, x): @@ -339,6 +355,47 @@ def test_export_successful_when_dynamic_dimension_is_one(self): ), ) + def test_is_in_onnx_export(self): + class Mod(torch.nn.Module): + def forward(self, x): + def f(x): + return x.sin() if torch.onnx.is_in_onnx_export() else x.cos() + + return f(x) + + self.assertFalse(torch.onnx.is_in_onnx_export()) + onnx_program = torch.onnx.export( + Mod(), + (torch.randn(3, 4),), + dynamo=True, + fallback=False, + ) + self.assertFalse(torch.onnx.is_in_onnx_export()) + + node_names = [n.op_type for n in onnx_program.model.graph] + self.assertIn("Sin", node_names) + + def test_torchscript_exporter_raises_deprecation_warning(self): + # Test that the deprecation warning is raised when using torchscript exporter + with self.assertWarnsRegex( + DeprecationWarning, "You are using the legacy TorchScript-based ONNX export" + ): + torch.onnx.export( + SampleModel(), (torch.randn(1, 1, 2),), io.BytesIO(), dynamo=False + ) + + def test_model_output_can_be_none(self): + class ModelWithNoneOutput(torch.nn.Module): + def forward(self, x): + return x + 1, None + + onnx_program = torch.onnx.export( + ModelWithNoneOutput(), + (torch.randn(1, 1, 2),), + dynamo=True, + ) + onnx_testing.assert_onnx_program(onnx_program) + class TestCustomTranslationTable(common_utils.TestCase): def test_custom_translation_table_overrides_ops(self): @@ -471,147 +528,5 @@ def onnx_add(self: FLOAT, other: FLOAT) -> FLOAT: self.assertNotIn("Sub", all_nodes_decomp) -class TestFakeTensorExport(common_utils.TestCase): - """Test exporting in fake mode.""" - - def test_onnx_program_raises_when_model_defined_in_fake_mode(self): - with torch.onnx.enable_fake_mode(): - - class Model(torch.nn.Module): - def __init__(self): - super().__init__() - self.weight = torch.nn.Parameter(torch.tensor(42.0)) - - def forward(self, x): - return self.weight + x - - onnx_program = torch.onnx.export( - Model(), (torch.tensor(1.0),), dynamo=True, optimize=False - ) - assert onnx_program is not None - # Convert to model proto and back to trigger to_bytes method which serializes the tensor - with self.assertRaises(Exception): - # The tensors need to be replaced with real tensors - _ = onnx_program.model_proto - - # Convert to model proto and back to trigger to_bytes method which serializes the tensor - with self.assertRaises(Exception): - # It doesn't matter if it is called inside or outside of the enable_fake_mode() context - _ = onnx_program.model_proto - - # If we replace with concrete tensors, the serialization will succeed. 
- # This needs to happen outside of the fake context - onnx_program.apply_weights({"weight": torch.tensor(42.0)}) - onnx_model = ir.serde.deserialize_model(onnx_program.model_proto) - np.testing.assert_allclose( - onnx_model.graph.initializers["weight"].const_value.numpy(), 42.0 - ) - - def test_onnx_program_save_raises_when_model_initialized_in_fake_mode(self): - class Model(torch.nn.Module): - def __init__(self): - super().__init__() - self.weight = torch.nn.Parameter(torch.tensor(42.0)) - - def forward(self, x): - return self.weight + x - - with torch.onnx.enable_fake_mode(): - onnx_program = torch.onnx.export( - Model(), (torch.tensor(1.0),), dynamo=True, optimize=False - ) - assert onnx_program is not None - # Convert to model proto and back to trigger to_bytes method which serializes the tensor - with self.assertRaises(Exception): - # The tensors need to be replaced with real tensors - _ = onnx_program.model_proto - - with self.assertRaises(Exception): - # It doesn't matter if it is called inside or outside of the enable_fake_mode() context - _ = onnx_program.model_proto - - # If we replace with concrete tensors, the serialization will succeed - # This needs to happen outside of the fake context - onnx_program.apply_weights({"weight": torch.tensor(42.0)}) - onnx_model = ir.serde.deserialize_model(onnx_program.model_proto) - np.testing.assert_allclose( - onnx_model.graph.initializers["weight"].const_value.numpy(), 42.0 - ) - - def test_onnx_program_save_succeeds_when_export_and_save_in_fake_mode(self): - class Model(torch.nn.Module): - def __init__(self): - super().__init__() - self.weight = torch.nn.Parameter(torch.tensor(42.0)) - - def forward(self, x): - return self.weight + x - - real_model = Model() - - with torch.onnx.enable_fake_mode(): - onnx_program = torch.onnx.export( - real_model, (torch.tensor(1.0),), dynamo=True, optimize=False - ) - - assert onnx_program is not None - # Convert to model proto and back to trigger to_bytes method which serializes the tensor - # Note that even though we are calling .model_proto (equivalently .save()) in fake mode, - # the concrete tensors are maintained. 
- # This is due to the usage of torch._subclasses.fake_tensor.unset_fake_temporarily() in - # TorchTensor.tobytes() - onnx_model = ir.serde.deserialize_model(onnx_program.model_proto) - np.testing.assert_allclose( - onnx_model.graph.initializers["weight"].const_value.numpy(), 42.0 - ) - - # This works inside or outside the fake mode - onnx_model = ir.serde.deserialize_model(onnx_program.model_proto) - np.testing.assert_allclose( - onnx_model.graph.initializers["weight"].const_value.numpy(), 42.0 - ) - - def test_is_in_onnx_export(self): - class Mod(torch.nn.Module): - def forward(self, x): - def f(x): - return x.sin() if torch.onnx.is_in_onnx_export() else x.cos() - - return f(x) - - self.assertFalse(torch.onnx.is_in_onnx_export()) - onnx_program = torch.onnx.export( - Mod(), - (torch.randn(3, 4),), - dynamo=True, - fallback=False, - ) - self.assertFalse(torch.onnx.is_in_onnx_export()) - - node_names = [n.op_type for n in onnx_program.model.graph] - self.assertIn("Sin", node_names) - - def test_torchscript_exporter_raises_deprecation_warning(self): - # Test that the deprecation warning is raised when using torchscript exporter - with self.assertWarnsRegex( - DeprecationWarning, "You are using the legacy TorchScript-based ONNX export" - ): - torch.onnx.export( - SampleModel(), (torch.randn(1, 1, 2),), io.BytesIO(), dynamo=False - ) - - def test_model_output_can_be_none(self): - class ModelWithNoneOutput(torch.nn.Module): - def forward(self, x): - return x + 1, None - - onnx_program = torch.onnx.export( - ModelWithNoneOutput(), - (torch.randn(1, 1, 2),), - dynamo=True, - ) - onnx_testing.assert_onnx_program(onnx_program) - - if __name__ == "__main__": common_utils.run_tests() diff --git a/test/onnx/test_fx_passes.py b/test/onnx/test_fx_passes.py deleted file mode 100644 index 97d255abdcb14..0000000000000 --- a/test/onnx/test_fx_passes.py +++ /dev/null @@ -1,60 +0,0 @@ -# Owner(s): ["module: onnx"] -import torch -import torch._dynamo -import torch.fx -from torch.onnx._internal.fx.passes import _utils as pass_utils -from torch.testing._internal import common_utils - - -class TestFxPasses(common_utils.TestCase): - def test_set_node_name_correctly_renames_when_new_name_collides_recursively(self): - def func(x, y, z): - return x + y + z - - x = torch.randn(3) - y = torch.randn(3) - z = torch.randn(3) - gm, _ = torch._dynamo.export(func)(x, y, z) - torch._dynamo.reset() - - # Purposely name the nodes in a way that will cause a recursive collision later. - # See :func:`set_node_name` for name collision renaming logic. - base_name = "tensor" - nodes = list(gm.graph.nodes) - for i, node in enumerate(nodes[1:]): - if i == 0: - node.name = base_name - else: - node.name = f"{base_name}.{i}" - - # Run `set_node_name` and verify that the names are correct. - name_to_node = {node.name: node for node in gm.graph.nodes} - pass_utils.set_node_name(nodes[0], base_name, name_to_node) - assert nodes[0].name == base_name, f"Expected {base_name}, got {nodes[0].name}" - assert len({node.name for node in nodes}) == len(nodes), ( - f"Expected all names to be unique, got {nodes}" - ) - - def test_set_node_name_succeeds_when_no_name_collisions(self): - def func(x, y, z): - return x + y + z - - x = torch.randn(3) - y = torch.randn(3) - z = torch.randn(3) - gm, _ = torch._dynamo.export(func)(x, y, z) - torch._dynamo.reset() - - # Run `set_node_name` and verify that the names are correct. 
- new_name = "some_tensor" - nodes = list(gm.graph.nodes) - name_to_node = {node.name: node for node in nodes} - pass_utils.set_node_name(nodes[1], new_name, name_to_node) - assert nodes[1].name == new_name, f"Expected {new_name}, got {nodes[0].name}" - assert len({node.name for node in nodes}) == len(nodes), ( - f"Expected all names to be unique, got {nodes}" - ) - - -if __name__ == "__main__": - common_utils.run_tests() diff --git a/test/profiler/test_profiler.py b/test/profiler/test_profiler.py index 36a3743b3757b..46b21cb4dc097 100644 --- a/test/profiler/test_profiler.py +++ b/test/profiler/test_profiler.py @@ -2336,6 +2336,74 @@ def verify_events(events): events = main_with_thread_fn(profile_all_threads) verify_events(events) + @skipIfTorchDynamo("profiler gets ignored if dynamo activated") + @unittest.skipIf(not kineto_available(), "Kineto is required") + def test_python_gc_event(self): + activities = [ProfilerActivity.CPU] + + def payload(): + x = torch.randn(10, 10) + y = torch.randn(10, 10) + with record_function("pre_gc"): + torch.mm(x, y) + gc.collect() + with record_function("post_gc"): + torch.mm(x, y) + + def validate_json(prof, gc_collection_on): + with TemporaryFileName(mode="w+") as fname: + prof.export_chrome_trace(fname) + with open(fname) as f: + events = json.load(f)["traceEvents"] + # Find required events + if gc_collection_on: + pre_gc = next( + (e for e in events if e["name"] == "pre_gc"), None + ) + post_gc = next( + (e for e in events if e["name"] == "post_gc"), None + ) + python_gc_events = [ + e for e in events if e["name"] == "Python GC" + ] + # Assert all required events are present + self.assertIsNotNone(pre_gc, "pre_gc event is missing") + self.assertIsNotNone(post_gc, "post_gc event is missing") + self.assertTrue( + len(python_gc_events) > 0, "No Python GC events found" + ) + # Calculate boundaries + pre_gc_end = pre_gc["ts"] + pre_gc.get("dur", 0) + post_gc_start = post_gc["ts"] + # Assert each Python GC event is correctly placed + for python_gc in python_gc_events: + python_gc_start = python_gc["ts"] + python_gc_end = python_gc["ts"] + python_gc.get("dur", 0) + self.assertTrue( + python_gc_start > pre_gc_end + and python_gc_end < post_gc_start, + f"Python GC event at {python_gc_start} is not correctly placed.", + ) + else: + python_gc_events = [ + e for e in events if e["name"] == "Python GC" + ] + self.assertTrue( + len(python_gc_events) == 0, + "Python GC event found when flag off", + ) + + for gc_flag in [True, False]: + with profile( + activities=activities, + experimental_config=torch._C._profiler._ExperimentalConfig( + record_python_gc_info=gc_flag + ), + with_stack=True, + ) as prof: + payload() + validate_json(prof, gc_flag) + class SimpleNet(nn.Module): def __init__(self) -> None: diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py index b414b687f3d00..346f22c1e477f 100644 --- a/test/quantization/core/test_quantized_op.py +++ b/test/quantization/core/test_quantized_op.py @@ -166,7 +166,8 @@ def _quantize_fp8e4m3(t: torch.Tensor, channelwise: bool, scale: Optional[torch. scale = scale or t.abs().max().reshape([1]) / quant_max scale = torch.max(scale, eps) if isinstance(scale, torch.Tensor) else max(scale, eps.item()) qt = t / scale - qt = qt.to(torch.float8_e4m3fn) + # Clamp to avoid NaN. 
Convert in two steps to align with fp32 -> fp16 -> fp8 + qt = qt.clamp(-448, 448).half().to(torch.float8_e4m3fn) return qt, scale def _dequantize_fp8e4m3(qt: torch.Tensor, scale: torch.Tensor): @@ -4732,7 +4733,7 @@ def _test_qlinear_fp8_helper( use_bias_list = [True, False] weight_quant_per_channel_list = [True, False] output_dtype_list = [None, torch.float32, torch.bfloat16] - y_scale, y_zp = 0.07, 0 + y_scale, y_zp = 0.3, 0 input_dim_list = [2, 3] cases = itertools.product( in_channels_list, out_channels_list, use_bias_list, @@ -4830,6 +4831,7 @@ def _test_qlinear_fp8_helper( self.assertEqual(x.dim(), qy.dim()) self.assertEqual(y_ref.float(), qy.float()) + assert not torch.isnan(qy).any() @unittest.skipIf(IS_FBCODE, "Skip pt2e ops in fbcode") @skipIfNoONEDNN @@ -7883,7 +7885,7 @@ def _test_qconv_impl_cpu_tensor_fp8( strides=(), pads=(), dilations=(), - Y_scale=0.02, + Y_scale=0.002, use_bias=True, post_op=PointwisePostOp(), use_channelwise=True, @@ -7960,9 +7962,7 @@ def _test_qconv_impl_cpu_tensor_fp8( # Quantize reference results for comparison if qconv_output_dtype is None: - Y_scale_t = torch.Tensor([Y_scale]).to(device) - # Align with oneDNN: convert fp32 to fp8 by fp32 -> fp16 -> fp8 - result_ref = result_ref.div(Y_scale_t).half().to(torch.float8_e4m3fn) + result_ref = _quantize_fp8e4m3(result_ref, False, Y_scale)[0] else: result_ref = result_ref.to(qconv_output_dtype) @@ -8039,7 +8039,8 @@ def _test_qconv_impl_cpu_tensor_fp8( if fp32_output or bfloat16_output: self.assertTrue(result.dtype == qconv_output_dtype) - assert torch.allclose(result.float(), result_ref.float(), atol=1e-6) + self.assertEqual(result.float(), result_ref.float(), atol=1e-6, rtol=1e-5) + assert not torch.isnan(result).any() def _test_qconv_fp8_helper(self, nd, pointwise_post_op): # nd = 1,2,3 -> conv1d/2d/3d @@ -8154,6 +8155,7 @@ def test_qconv2d_sum_relu_fp8(self): @skipIfNoONEDNN def test_qconv3d_fp8(self): pointwise_post_op = PointwisePostOp() + torch.manual_seed(0) # For reproducibility in 3D conv tests self._test_qconv_fp8_helper(3, pointwise_post_op) diff --git a/test/test_dynamic_shapes.py b/test/test_dynamic_shapes.py index 6a23915c56efd..7ba466119da85 100644 --- a/test/test_dynamic_shapes.py +++ b/test/test_dynamic_shapes.py @@ -3449,119 +3449,6 @@ def forward(self, arg0_1: "i64[2][1]cpu", arg1_1: "Sym(u2)", arg2_1: "Sym(u3)", self.assertEqual(result_compiled, result_eager) self.assertEqual(cnt.frame_count, 2) - @fresh_cache() - @torch._dynamo.config.patch("capture_scalar_outputs", True) - def test_unbacked_slice(self): - from torch.fx.experimental.symbolic_shapes import statically_known_true - - # standard slice - def f1(x, xs): - u0, u1 = xs.tolist() - torch._check_is_size(u0, max=x.size(0)) - torch._check_is_size(u1, max=x.size(0)) - torch._check(u0 <= u1) - out = x[u0:u1] - assert statically_known_true(out.size(0) == (u1 - u0)) - return out - - x, xs = torch.randn(10), torch.tensor([3, 6]) - fn1 = torch.compile(f1, fullgraph=True, backend="inductor") - self.assertEqual(fn1(x, xs).size(0), 3) - self.assertTrue(torch.allclose(fn1(x, xs), f1(x, xs))) - with self.assertRaises(RuntimeError): - fn1(x, torch.tensor([-1, 5])) - - # known negative slice - def f2(x, n): - u0 = n.item() - torch._check(u0 > 1) - torch._check(u0 <= x.size(0)) - out = x[-u0:] - assert statically_known_true(out.size(0) == u0) - return out - - x, n = torch.randn(10), torch.tensor([5]) - fn2 = torch.compile(f2, fullgraph=True, backend="inductor") - self.assertEqual(fn2(x, n).size(0), 5) - self.assertTrue(torch.allclose(fn2(x, n), 
f2(x, n))) - with self.assertRaises(RuntimeError): - fn2(x, torch.tensor([-5])) - - # general case: no known info - def f3(x, xs): - u0, u1 = xs.tolist() - return x[u0:u1] - - log_stream, ctx = logs_to_string( - "torch._inductor.compile_fx", "post_grad_graphs" - ) - cnts = CompileCounterWithBackend("inductor") - x, xs = torch.randn(10), torch.tensor([3, 6]) - with ctx(): - fn3 = torch.compile(f3, fullgraph=True, backend=cnts) - xs = torch.tensor([-9, -1]) # negative case - self.assertTrue(torch.allclose(fn3(x, xs), f3(x, xs))) - xs = torch.tensor([-1000, 1000]) # out of bounds - self.assertTrue(torch.allclose(fn3(x, xs), f3(x, xs))) - xs = torch.tensor([2, -2]) # mixed - self.assertTrue(torch.allclose(fn3(x, xs), f3(x, xs))) - self.assertEqual(cnts.frame_count, 1) - - aot_graphs = "\n".join(log_stream.getvalue().strip().split("\n")[4:]).strip() - self.assertExpectedInline( - aot_graphs, - """\ - select: "i64[][]cpu" = torch.ops.aten.select.int(arg0_1, 0, 0) - _local_scalar_dense: "Sym(u0)" = torch.ops.aten._local_scalar_dense.default(select); select = None - select_1: "i64[][]cpu" = torch.ops.aten.select.int(arg0_1, 0, 1); arg0_1 = None - _local_scalar_dense_1: "Sym(u1)" = torch.ops.aten._local_scalar_dense.default(select_1); select_1 = None - slice_1: "f32[u2][1]cpu" = torch.ops.aten.slice.Tensor(arg1_1, 0, _local_scalar_dense, _local_scalar_dense_1); arg1_1 = _local_scalar_dense = _local_scalar_dense_1 = None - sym_size_int: "Sym(u2)" = torch.ops.aten.sym_size.int(slice_1, 0) - ge_2: "Sym(u2 >= 0)" = sym_size_int >= 0 - _assert_scalar = torch.ops.aten._assert_scalar.default(ge_2, "Runtime assertion failed for expression u2 >= 0 on node 'ge'"); ge_2 = _assert_scalar = None - le: "Sym(u2 <= 10)" = sym_size_int <= 10; sym_size_int = None - _assert_scalar_1 = torch.ops.aten._assert_scalar.default(le, "Runtime assertion failed for expression u2 <= 10 on node 'le'"); le = _assert_scalar_1 = None - sym_storage_offset_default: "Sym(u3)" = torch.ops.aten.sym_storage_offset.default(slice_1) - ge_3: "Sym(u3 >= 0)" = sym_storage_offset_default >= 0; sym_storage_offset_default = None - _assert_scalar_2 = torch.ops.aten._assert_scalar.default(ge_3, "Runtime assertion failed for expression u3 >= 0 on node 'ge_1'"); ge_3 = _assert_scalar_2 = None - return (slice_1,)""", # noqa: B950 - ignore_comments=True, - ignore_empty_lines=True, - ) - - @fresh_cache() - @torch._dynamo.config.patch("capture_scalar_outputs", True) - @torch._inductor.config.patch("cpp_wrapper", True) - def test_unbacked_slice_cpp_wrapper(self): - self.test_unbacked_slice() - - @fresh_cache() - @torch._dynamo.config.patch("capture_scalar_outputs", True) - def test_tensor_split(self): - def f1(x, xs): - xs = torch.tensor(xs.tolist()) - return torch.tensor_split(x, xs) - - x = torch.randn(20) - xs = torch.tensor([5, 10, 15]) - fn = torch.compile(f1, fullgraph=True, backend="inductor") - - def compare(x, xs): - for i, j in zip(f1(x, xs), fn(x, xs)): - self.assertTrue(torch.allclose(i, j)) - - compare(x, xs) - xs = torch.tensor([-15, 9, 10, 11]) - compare(x, xs) - xs = torch.tensor([-15, -10, -5, -2]) - compare(x, xs) - - @fresh_cache() - @torch._dynamo.config.patch("capture_scalar_outputs", True) - @torch._inductor.config.patch("cpp_wrapper", True) - def test_tensor_split_cpp_wrapper(self): - self.test_tensor_split() - @unittest.skip("this test fails due to inductor/autograd issue #153041") @torch._dynamo.config.patch("capture_scalar_outputs", True) def test_unbacked_non_contigious_reshape_failing(self): diff --git a/test/test_indexing.py 
b/test/test_indexing.py index 73a15130922fa..7a202efbe084f 100644 --- a/test/test_indexing.py +++ b/test/test_indexing.py @@ -25,6 +25,7 @@ skipXLA, ) from torch.testing._internal.common_dtype import ( + all_mps_types_and, all_types_and, all_types_and_complex_and, all_types_complex_float8_and, @@ -903,10 +904,13 @@ def test_list_indices(self, device): # Generate a list of lists, containing overlapping window indices indices = [range(i, i + W) for i in range(0, N - W)] - for i in [len(indices), 100, 32, 31]: + for i in [len(indices), 100, 32]: windowed_data = t[indices[:i]] self.assertEqual(windowed_data.shape, (i, W)) + with self.assertRaisesRegex(IndexError, "too many indices"): + windowed_data = t[indices[:31]] + def test_bool_indices_accumulate(self, device): mask = torch.zeros(size=(10,), dtype=torch.bool, device=device) y = torch.ones(size=(10, 10), device=device) @@ -1202,13 +1206,13 @@ def func1(x, i, v): out_cpu = func1(t, ind, val) self.assertEqual(out_cuda.cpu(), out_cpu) - @expectedFailureMPS # Doubles not supported @onlyNativeDeviceTypes def test_index_put_accumulate_duplicate_indices(self, device): + dtype = torch.float if device.startswith("mps") else torch.double for i in range(1, 512): # generate indices by random walk, this will create indices with # lots of duplicates interleaved with each other - delta = torch.empty(i, dtype=torch.double, device=device).uniform_(-1, 1) + delta = torch.empty(i, dtype=dtype, device=device).uniform_(-1, 1) indices = delta.cumsum(0).long() input = torch.randn(indices.abs().max() + 1, device=device) @@ -1866,7 +1870,7 @@ def test_index_reduce(self, device, dtype, reduce): self.assertEqual(dest, expected) @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) - @expectedFailureMPS # See https://github.com/pytorch/pytorch/issues/160993 + @dtypesIfMPS(*all_mps_types_and(torch.bool, torch.cfloat)) def test_index_copy(self, device, dtype): # We just test for num_copy <= num_dest, as otherwise there are repeated indices # and the behavior is undefined @@ -1909,8 +1913,8 @@ def ref_index_copy(tgt, dim, idx, src): # onlyNativeDeviceTypes due to an XLA error: # https://github.com/pytorch/pytorch/issues/53256 @onlyNativeDeviceTypes - @expectedFailureMPS # See https://github.com/pytorch/pytorch/issues/160737 @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) + @dtypesIfMPS(*all_mps_types_and(torch.bool, torch.cfloat)) def test_index_copy_scalars(self, device, dtype): # Create the 8 possible combinations of scalar sizes for target / index / source scalars = ( @@ -2043,8 +2047,8 @@ def test_index_fill(self, device, dtype): # The test fails for zero-dimensional tensors on XLA @onlyNativeDeviceTypes - @expectedFailureMPS # See https://github.com/pytorch/pytorch/issues/160737 @dtypes(*all_types_complex_float8_and(torch.half, torch.bool, torch.bfloat16)) + @dtypesIfMPS(*all_mps_types_and(torch.bool, torch.cfloat)) def test_index_select(self, device, dtype): num_src, num_out = 3, 5 diff --git a/test/test_mps.py b/test/test_mps.py index deaec2886d325..8333ec0060779 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ -738,6 +738,33 @@ def test_avg_pool2d_ceil_mode(self): padding=(0, 1), stride=2) self.assertFalse(torch.isnan(y).any()) + # Test some cases for avg_pool2d which used to mismatch CPU results. 
+ # Addresses this issue: https://github.com/pytorch/pytorch/issues/160743 + def test_avg_pool2d_ceil_mode_mismatch(self): + sizes = [ + (4, 2, 3), + (5, 2, 3), + (50, 2, 3), + (4, 1, 2, 3), + (4, 4, 2, 3), + (2, 2, 4, 6), + (5, 40, 60), + (2, 2, 40, 60), + ] + + kwargs = dict(kernel_size=[1, 3], + stride=[2, 3], + ceil_mode=True, + divisor_override=7) + + for input_size in sizes: + model = torch.nn.AvgPool2d(**kwargs) + x = torch.arange(math.prod(input_size), dtype=torch.float).reshape(input_size) + out_cpu = model(x) + out_mps = model(x.to("mps")) + msg = f'{input_size=}, {kwargs=}' + self.assertEqual(out_mps, out_cpu, msg=msg) + class TestMPS(TestCaseMPS): def test_exp(self, device="mps", dtype=torch.float): @@ -8904,6 +8931,12 @@ def test_constant_pad_nd_preserves_memory_format(self): nhwc_padded = torch.constant_pad_nd(nhwc_tensor, [1, 2], 0.5) self.assertTrue(nhwc_padded.is_contiguous(memory_format=torch.channels_last)) + def test_constant_pad_nd_with_empty_pad(self): + # Empty constant pad is no-op + # See https://github.com/pytorch/pytorch/issues/161066 + input_mps = torch.randn((2, 3, 4), device="mps") + output_mps = torch.constant_pad_nd(input_mps, []) + self.assertEqual(output_mps, input_mps) class TestLinalgMPS(TestCaseMPS): def _test_addmm_addmv(self, f, t, m, v, *, alpha=None, beta=None, transpose_out=False): diff --git a/test/test_numa_binding.py b/test/test_numa_binding.py index 1f3d3e30e8365..349b89fa95e6c 100644 --- a/test/test_numa_binding.py +++ b/test/test_numa_binding.py @@ -3,12 +3,10 @@ from __future__ import annotations import json -import multiprocessing.spawn as spawn import os -import subprocess import sys -import tempfile from dataclasses import dataclass +from multiprocessing.context import SpawnProcess from typing import Any, Optional from unittest import skipUnless from unittest.mock import mock_open, patch @@ -16,6 +14,9 @@ import torch from torch._utils_internal import signpost_event from torch.distributed.elastic.multiprocessing import DefaultLogsSpecs, start_processes +from torch.distributed.elastic.multiprocessing.subprocess_handler import ( + SubprocessHandler, +) from torch.numa.binding import ( _get_ranges_str_from_ints, _get_set_of_int_from_ranges_str, @@ -40,7 +41,6 @@ class MockDeviceProperties: _real_open = open -_real_mkstemp = tempfile.mkstemp @skipUnless(sys.platform == "linux", "Only linux currently supported") @@ -56,7 +56,6 @@ def setUp(self) -> None: self._mock_num_logical_cpus = 0 self._mock_num_numa_nodes = 0 self._mock_num_sockets = 0 - self._temp_file_paths = [] self._context_managers_to_apply_to_all_tests = [ patch("torch.cuda.device_count", self._mock_device_count), @@ -67,9 +66,6 @@ def setUp(self) -> None: patch("builtins.open", new=self._mock_open), patch("os.listdir", new=self._mock_listdir), patch("os.sched_getaffinity", new=self._mock_sched_getaffinity), - patch("shutil.which", return_value="/usr/bin/numactl"), - patch("torch.numa.binding.run"), - patch("torch.numa.binding.mkstemp", self._mock_mkstemp), patch("torch.numa.binding.signpost_event", self._mock_signpost_event), ] @@ -77,14 +73,6 @@ def setUp(self) -> None: context_manager.__enter__() def tearDown(self) -> None: - # Clean up temporary files - for temp_file_path in self._temp_file_paths: - try: - os.unlink(temp_file_path) - except FileNotFoundError: - # File may have already been deleted or doesn't exist - pass - for context_manager in self._context_managers_to_apply_to_all_tests: context_manager.__exit__(None, None, None) super().tearDown() @@ -94,12 +82,6 
@@ def _mock_signpost_event(self, *args, **kwargs) -> None: json.dumps(kwargs["parameters"]) return signpost_event(*args, **kwargs) - def _mock_mkstemp(self, *args, **kwargs): - # Just keep track of temp files so we can delete them - fd, path = _real_mkstemp(*args, **kwargs) - self._temp_file_paths.append(path) - return fd, path - def _add_mock_hardware( self, *, @@ -249,18 +231,41 @@ def _mock_listdir(self, target_path: str) -> set[str]: def _mock_sched_getaffinity(self, pid: int) -> set[int]: return set(range(self._mock_num_logical_cpus)) - def _start_processes_for_str_entrypoint_and_get_Popen_args( + def _start_processes_for_str_entrypoint_and_get_sched_setaffinity_cpus( self, *, numa_options: Optional[NumaOptions], target_local_rank: int - ) -> tuple[str, ...]: - """ - Calls start_processes like elastic_launch ultimately would - and returns the commandline args tuple input to Popen. + ) -> Optional[set[int]]: + active_local_rank = None + target_sched_setaffinity_logical_cpu_indices = None - Does not actually create the processes. - """ - with patch( - "torch.distributed.elastic.multiprocessing.subprocess_handler.subprocess_handler.Popen" - ) as mock_popen: + real_subprocess_handler_init = SubprocessHandler.__init__ + + def mock_SubprocessHandler__init__(*args, **kwargs) -> None: + nonlocal active_local_rank + active_local_rank = kwargs["local_rank_id"] + return real_subprocess_handler_init(*args, **kwargs) + + def mock_sched_setaffinity(*args, **kwargs) -> None: + nonlocal target_sched_setaffinity_logical_cpu_indices + if ( + active_local_rank == target_local_rank + # We only care about the first call, not the second + # one where it gets reset + and target_sched_setaffinity_logical_cpu_indices is None + ): + target_sched_setaffinity_logical_cpu_indices = args[1] + + with ( + patch( + "os.sched_setaffinity", mock_sched_setaffinity + ) as mock_sched_setaffinity, + patch( + "torch.distributed.elastic.multiprocessing.subprocess_handler.subprocess_handler.Popen" + ), + patch( + "torch.distributed.elastic.multiprocessing.subprocess_handler.SubprocessHandler.__init__", + mock_SubprocessHandler__init__, + ), + ): start_processes( name="test_process", entrypoint="echo", @@ -273,40 +278,40 @@ def _start_processes_for_str_entrypoint_and_get_Popen_args( logs_specs=DefaultLogsSpecs(), numa_options=numa_options, ) - # This will raise an exception if there is no call from the desired local_rank - call_args = next( - call_args - for call_args in mock_popen.call_args_list - if call_args.kwargs.get("env", {}).get("LOCAL_RANK") - == str(target_local_rank) - ) - return call_args.kwargs["args"] - def _start_processes_for_callable_entrypoint_and_get_executable_contents( + return target_sched_setaffinity_logical_cpu_indices + + def _start_processes_for_callable_entrypoint_and_get_sched_setaffinity_cpus( self, *, numa_options: Optional[NumaOptions], target_local_rank: int - ) -> str: + ) -> Optional[set[int]]: active_local_rank = None - executable_path = None + target_sched_setaffinity_logical_cpu_indices = None - def _mock_process_start(self: Any) -> None: - nonlocal active_local_rank - active_local_rank = self._args[1] - spawn.get_command_line() - self._target(*self._args) - - original_get_command_line = spawn.get_command_line + real_process__init__ = SpawnProcess.__init__ - def _mock_get_command_line(*args, **kwargs) -> list[str]: - nonlocal executable_path - result = original_get_command_line(*args, **kwargs) - if active_local_rank == target_local_rank: - executable_path = result[0] - - return result + 
def _mock_process__init__(*args, **kwargs) -> None: + nonlocal active_local_rank + active_local_rank = kwargs["args"][1] + return real_process__init__(*args, **kwargs) + + def mock_sched_setaffinity(*args, **kwargs) -> None: + nonlocal target_sched_setaffinity_logical_cpu_indices + if ( + active_local_rank == target_local_rank + # We only care about the first call, not the second + # one where it gets reset + and target_sched_setaffinity_logical_cpu_indices is None + ): + target_sched_setaffinity_logical_cpu_indices = args[1] with ( - patch("multiprocessing.context.SpawnProcess.start", _mock_process_start), - patch("multiprocessing.spawn.get_command_line", _mock_get_command_line), + patch( + "os.sched_setaffinity", mock_sched_setaffinity + ) as mock_sched_setaffinity, + patch("multiprocessing.context.SpawnProcess.start"), + patch( + "multiprocessing.context.SpawnProcess.__init__", _mock_process__init__ + ), patch("multiprocessing.process.BaseProcess.sentinel", 1), # Prevent hanging patch( @@ -325,9 +330,7 @@ def _mock_get_command_line(*args, **kwargs) -> list[str]: numa_options=numa_options, ) - assert executable_path is not None - with open(executable_path) as executable_file: - return executable_file.read() + return target_sched_setaffinity_logical_cpu_indices def test_node_numa_binding(self) -> None: self._add_mock_hardware( @@ -338,20 +341,19 @@ def test_node_numa_binding(self) -> None: num_physical_core_per_l3_cache=2, ) - command_args = self._start_processes_for_str_entrypoint_and_get_Popen_args( - numa_options=NumaOptions(affinity_mode=AffinityMode.NODE), - target_local_rank=11, + bound_logical_cpu_indices = ( + self._start_processes_for_str_entrypoint_and_get_sched_setaffinity_cpus( + numa_options=NumaOptions(affinity_mode=AffinityMode.NODE), + target_local_rank=11, + ) ) self.assertEqual( - command_args, + bound_logical_cpu_indices, # There are 8 numa nodes and 2 GPUs per numa node, so GPU 11 would be # on numa node 11 // 2 = 5. 
- ( - "numactl", - "--cpunodebind=5", - "echo", - "Hello, world!", - ), + # Each numa node has 4 * 2 * 2 = 16 logical CPUs + # Numa node 5 has CPUs 80-95 + set(range(80, 96)), ) def test_no_numa_binding_if_numa_options_not_provided(self) -> None: @@ -363,15 +365,14 @@ def test_no_numa_binding_if_numa_options_not_provided(self) -> None: num_physical_core_per_l3_cache=2, ) - command_args = self._start_processes_for_str_entrypoint_and_get_Popen_args( - numa_options=None, target_local_rank=11 + bound_logical_cpu_indices = ( + self._start_processes_for_str_entrypoint_and_get_sched_setaffinity_cpus( + numa_options=None, target_local_rank=11 + ) ) self.assertEqual( - command_args, - ( - "echo", - "Hello, world!", - ), + bound_logical_cpu_indices, + None, ) def test_default_numa_binding(self) -> None: @@ -407,7 +408,7 @@ def test_default_numa_binding(self) -> None: def test_fallback(self) -> None: self._add_mock_hardware( - num_sockets=1, + num_sockets=2, num_numa_nodes_per_socket=1, num_gpus_per_numa_node=1, num_l3_caches_per_numa_node=1, @@ -417,28 +418,27 @@ def test_fallback(self) -> None: with ( patch("torch.numa.binding.signpost_event") as signpost_patch, patch( - "torch.numa.binding.run", - side_effect=subprocess.CalledProcessError(1, "numactl"), + "torch.numa.binding._get_numa_node_index_for_gpu_index", + side_effect=Exception("Mock exception!"), ), ): - command_args = self._start_processes_for_str_entrypoint_and_get_Popen_args( - numa_options=NumaOptions( - affinity_mode=AffinityMode.NODE, - should_fall_back_if_binding_fails=True, - ), - target_local_rank=0, + bound_logical_cpu_indices = ( + self._start_processes_for_str_entrypoint_and_get_sched_setaffinity_cpus( + numa_options=NumaOptions( + affinity_mode=AffinityMode.NODE, + should_fall_back_if_binding_fails=True, + ), + target_local_rank=0, + ) ) self.assertIn( - "subprocess.CalledProcessError", + "Mock exception!", signpost_patch.call_args.kwargs["parameters"]["traceback"], ) self.assertEqual( - command_args, - # No numa bindings due to exception - ( - "echo", - "Hello, world!", - ), + bound_logical_cpu_indices, + # We should just reset to the original CPU affinity, which is all the CPUs + set(range(4)), ) def test_explicit_numa_options_overrides_default(self) -> None: @@ -460,7 +460,7 @@ def test_explicit_numa_options_overrides_default(self) -> None: NumaOptions(affinity_mode=AffinityMode.EXCLUSIVE), ) - def test_fork_start_method_does_not_call_get_default_numa_options(self) -> None: + def test_parallel_start_does_not_call_get_default_numa_options(self) -> None: # Inner import to avoid crashing if not torch.distributed.is_available() from torch.distributed.launcher.api import LaunchConfig @@ -475,16 +475,14 @@ def test_fork_start_method_does_not_call_get_default_numa_options(self) -> None: with patch( "torch.distributed.launcher.api.get_default_numa_options" ) as mock_get_default_numa_options: + os.environ["TORCH_MP_PARALLEL_START"] = "1" launch_config = LaunchConfig( min_nodes=1, max_nodes=1, nproc_per_node=2, - start_method="fork", - # Don't provide numa_options + start_method="forkserver", ) - # Verify get_default_numa_options was not called mock_get_default_numa_options.assert_not_called() - # Verify numa_options is None when start_method is fork self.assertIsNone(launch_config.numa_options) def test_nproc_must_equal_cuda_device_count_to_use_default_numa_options( @@ -509,9 +507,7 @@ def test_nproc_must_equal_cuda_device_count_to_use_default_numa_options( max_nodes=1, nproc_per_node=2, ) - # Verify get_default_numa_options was not 
called mock_get_default_numa_options.assert_not_called() - # Verify numa_options is None when start_method is fork self.assertIsNone(launch_config.numa_options) def test_socket_numa_binding_with_multiple_numa_per_socket(self) -> None: @@ -523,18 +519,18 @@ def test_socket_numa_binding_with_multiple_numa_per_socket(self) -> None: num_physical_core_per_l3_cache=2, ) - command_args = self._start_processes_for_str_entrypoint_and_get_Popen_args( - numa_options=NumaOptions(affinity_mode=AffinityMode.SOCKET), - target_local_rank=15, + bound_logical_cpu_indices = ( + self._start_processes_for_str_entrypoint_and_get_sched_setaffinity_cpus( + numa_options=NumaOptions(affinity_mode=AffinityMode.SOCKET), + target_local_rank=15, + ) ) self.assertEqual( - command_args, - ( - "numactl", - "--cpunodebind=6-7", - "echo", - "Hello, world!", - ), + bound_logical_cpu_indices, + # GPU 15 is on numa node 15 // 2 = 7, which is on socket 3 (numa nodes 6 and 7) + # Each numa node has 4 * 2 * 2 = 16 logical CPUs + # Numa nodes 6 and 7 have CPUs 96-111 and 112-127 + set(range(96, 128)), ) def test_socket_numa_binding_with_single_numa_per_socket(self) -> None: @@ -546,18 +542,18 @@ def test_socket_numa_binding_with_single_numa_per_socket(self) -> None: num_physical_core_per_l3_cache=2, ) - command_args = self._start_processes_for_str_entrypoint_and_get_Popen_args( - numa_options=NumaOptions(affinity_mode=AffinityMode.SOCKET), - target_local_rank=7, + bound_logical_cpu_indices = ( + self._start_processes_for_str_entrypoint_and_get_sched_setaffinity_cpus( + numa_options=NumaOptions(affinity_mode=AffinityMode.SOCKET), + target_local_rank=7, + ) ) self.assertEqual( - command_args, - ( - "numactl", - "--cpunodebind=3", - "echo", - "Hello, world!", - ), + bound_logical_cpu_indices, + # GPU 7 is on numa node 7 // 2 = 3, which is socket 3 by itself + # Each numa node has 4 * 2 * 2 = 16 logical CPUs + # Numa node 3 has CPUs 48-63 + set(range(48, 64)), ) def test_exclusive_numa_binding(self) -> None: @@ -569,34 +565,30 @@ def test_exclusive_numa_binding(self) -> None: num_physical_core_per_l3_cache=3, ) - command_args_0 = self._start_processes_for_str_entrypoint_and_get_Popen_args( - numa_options=NumaOptions(affinity_mode=AffinityMode.EXCLUSIVE), - target_local_rank=0, + bound_logical_cpu_indices_0 = ( + self._start_processes_for_str_entrypoint_and_get_sched_setaffinity_cpus( + numa_options=NumaOptions(affinity_mode=AffinityMode.EXCLUSIVE), + target_local_rank=0, + ) ) self.assertEqual( - command_args_0, - ( - "numactl", - # Gets an extra physical core due to odd number of physical cores on numa node - "--physcpubind=0-3", - "echo", - "Hello, world!", - ), + bound_logical_cpu_indices_0, + # Gets an extra physical core due to odd number of physical cores on numa node + # 3 physical cores total, 2 GPUs: GPU 0 gets 2 physical cores (CPUs 0-3) + set(range(0, 4)), ) - command_args_1 = self._start_processes_for_str_entrypoint_and_get_Popen_args( - numa_options=NumaOptions(affinity_mode=AffinityMode.EXCLUSIVE), - target_local_rank=1, + bound_logical_cpu_indices_1 = ( + self._start_processes_for_str_entrypoint_and_get_sched_setaffinity_cpus( + numa_options=NumaOptions(affinity_mode=AffinityMode.EXCLUSIVE), + target_local_rank=1, + ) ) self.assertEqual( - command_args_1, - ( - "numactl", - # Does not get an extra physical core, since the 1st GPU already took the extra. 
- "--physcpubind=4-5", - "echo", - "Hello, world!", - ), + bound_logical_cpu_indices_1, + # Does not get an extra physical core, since the 1st GPU already took the extra. + # GPU 1 gets 1 physical core (CPUs 4-5) + set(range(4, 6)), ) def test_exclusive_raises_if_too_few_physical_cores(self) -> None: @@ -612,7 +604,7 @@ def test_exclusive_raises_if_too_few_physical_cores(self) -> None: RuntimeError, "There are only 1 physical cores on numa_node_index=0, but there are 2 GPUs associated with this NUMA node.", ): - self._start_processes_for_str_entrypoint_and_get_Popen_args( + self._start_processes_for_str_entrypoint_and_get_sched_setaffinity_cpus( numa_options=NumaOptions(affinity_mode=AffinityMode.EXCLUSIVE), target_local_rank=1, ) @@ -626,19 +618,18 @@ def test_core_complex_numa_binding_with_extra_l3(self) -> None: num_physical_core_per_l3_cache=3, ) - command_args = self._start_processes_for_str_entrypoint_and_get_Popen_args( - numa_options=NumaOptions(affinity_mode=AffinityMode.CORE_COMPLEX), - target_local_rank=3, + bound_logical_cpu_indices = ( + self._start_processes_for_str_entrypoint_and_get_sched_setaffinity_cpus( + numa_options=NumaOptions(affinity_mode=AffinityMode.CORE_COMPLEX), + target_local_rank=3, + ) ) self.assertEqual( - command_args, - ( - "numactl", - # The second L3 on the second numa node - "--physcpubind=24-29", - "echo", - "Hello, world!", - ), + bound_logical_cpu_indices, + # GPU 3 is on numa node 3 // 2 = 1, relative GPU index is 3 % 2 = 1 + # The second L3 on the second numa node (numa node 1) + # Second numa node starts at CPU 18, second L3 cache is CPUs 24-29 + set(range(24, 30)), ) def test_core_complex_numa_binding_with_fewer_l3_than_gpu(self) -> None: @@ -650,20 +641,18 @@ def test_core_complex_numa_binding_with_fewer_l3_than_gpu(self) -> None: num_physical_core_per_l3_cache=3, ) - command_args = self._start_processes_for_str_entrypoint_and_get_Popen_args( - numa_options=NumaOptions(affinity_mode=AffinityMode.CORE_COMPLEX), - target_local_rank=3, + bound_logical_cpu_indices = ( + self._start_processes_for_str_entrypoint_and_get_sched_setaffinity_cpus( + numa_options=NumaOptions(affinity_mode=AffinityMode.CORE_COMPLEX), + target_local_rank=3, + ) ) self.assertEqual( - command_args, - ( - "numactl", - # There are only 2 L3 caches, so the 4th GPU shares the same - # cores as the 3rd GPU. - "--physcpubind=6-11", - "echo", - "Hello, world!", - ), + bound_logical_cpu_indices, + # GPU 3 is on numa node 3 // 2 = 1, relative GPU index is 3 % 2 = 1 + # With 1 L3 cache per numa node, GPU 3 uses L3 cache index 1 % 1 = 0 (the only cache) + # Second numa node starts at CPU 6, single L3 cache spans CPUs 6-11 + set(range(6, 12)), ) def test_core_complex_prefers_caches_with_more_cpus(self) -> None: @@ -677,20 +666,17 @@ def test_core_complex_prefers_caches_with_more_cpus(self) -> None: # Only some subset of the CPUs are available this time. 
with patch("os.sched_getaffinity", return_value={0, 4, 6, 7, 9}): - command_args = self._start_processes_for_str_entrypoint_and_get_Popen_args( - numa_options=NumaOptions(affinity_mode=AffinityMode.CORE_COMPLEX), - target_local_rank=0, + bound_logical_cpu_indices = ( + self._start_processes_for_str_entrypoint_and_get_sched_setaffinity_cpus( + numa_options=NumaOptions(affinity_mode=AffinityMode.CORE_COMPLEX), + target_local_rank=0, + ) ) self.assertEqual( - command_args, - ( - "numactl", - # Binds to the second L3 because it has the most available CPUs - "--physcpubind=6-7,9", - "echo", - "Hello, world!", - ), + bound_logical_cpu_indices, + # Binds to the second L3 because it has the most available CPUs + {6, 7, 9}, ) def test_core_complex_tiebreak_prefers_lower_cache_key(self) -> None: @@ -706,36 +692,19 @@ def test_core_complex_tiebreak_prefers_lower_cache_key(self) -> None: num_physical_core_per_l3_cache=1, ) - command_args = self._start_processes_for_str_entrypoint_and_get_Popen_args( - numa_options=NumaOptions(affinity_mode=AffinityMode.CORE_COMPLEX), - target_local_rank=0, + bound_logical_cpu_indices = ( + self._start_processes_for_str_entrypoint_and_get_sched_setaffinity_cpus( + numa_options=NumaOptions(affinity_mode=AffinityMode.CORE_COMPLEX), + target_local_rank=0, + ) ) self.assertEqual( - command_args, - ( - "numactl", - "--physcpubind=0-1", - "echo", - "Hello, world!", - ), - ) - - def test_raises_error_if_numactl_unavailable(self) -> None: - self._add_mock_hardware( - num_sockets=1, - num_numa_nodes_per_socket=1, - num_gpus_per_numa_node=1, - num_l3_caches_per_numa_node=1, - num_physical_core_per_l3_cache=1, + bound_logical_cpu_indices, + # 1 numa node, 2 L3 caches, 1 physical core per L3 cache = 2 logical CPUs per cache + # L3 cache 0: CPUs 0-1, L3 cache 1: CPUs 2-3 + # Both have same number of CPUs, so prefer lower cache key (0) + set(range(0, 2)), ) - with ( - patch("shutil.which", return_value=None), - self.assertRaisesRegex(RuntimeError, r".*numactl.*"), - ): - self._start_processes_for_str_entrypoint_and_get_Popen_args( - numa_options=NumaOptions(affinity_mode=AffinityMode.NODE), - target_local_rank=0, - ) def test_binds_to_node_0_if_node_stored_as_minus_one(self) -> None: self._add_mock_hardware( @@ -755,18 +724,18 @@ def test_binds_to_node_0_if_node_stored_as_minus_one(self) -> None: contents="-1", ) - command_args = self._start_processes_for_str_entrypoint_and_get_Popen_args( - numa_options=NumaOptions(affinity_mode=AffinityMode.NODE), - target_local_rank=0, + bound_logical_cpu_indices = ( + self._start_processes_for_str_entrypoint_and_get_sched_setaffinity_cpus( + numa_options=NumaOptions(affinity_mode=AffinityMode.NODE), + target_local_rank=0, + ) ) self.assertEqual( - command_args, - ( - "numactl", - "--cpunodebind=0", - "echo", - "Hello, world!", - ), + bound_logical_cpu_indices, + # GPU 0 has numa node stored as -1, which is treated as numa node 0 + # Each numa node has 1 * 1 * 2 = 2 logical CPUs + # Numa node 0 has CPUs 0-1 + set(range(0, 2)), ) def test_callable_entrypoint_basic(self) -> None: @@ -778,27 +747,41 @@ def test_callable_entrypoint_basic(self) -> None: num_physical_core_per_l3_cache=2, ) - executable_contents = ( - self._start_processes_for_callable_entrypoint_and_get_executable_contents( - numa_options=NumaOptions(affinity_mode=AffinityMode.NODE), - target_local_rank=11, - ) + bound_logical_cpu_indices = self._start_processes_for_callable_entrypoint_and_get_sched_setaffinity_cpus( + numa_options=NumaOptions(affinity_mode=AffinityMode.NODE), + 
target_local_rank=11, ) self.assertEqual( - executable_contents, + bound_logical_cpu_indices, # There are 8 numa nodes and 2 GPUs per numa node, so GPU 11 would be # on numa node 11 // 2 = 5. - f"""#!/bin/bash - -# If this file is more than a few minutes old and still exists on your machine, -# that is NOT expected. It should have deleted itself. If you are seeing an accumulation of such -# files, that could suggest a bug in pytorch. See https://github.com/pytorch/pytorch/pull/160163. + # Each numa node has 4 * 2 * 2 = 16 logical CPUs + # Numa node 5 has CPUs 80-95 + set(range(80, 96)), + ) -rm -- "$0" -numactl --cpunodebind=5 {sys.executable} "$@" -""", + def test_raises_if_binding_to_empty_set(self) -> None: + self._add_mock_hardware( + num_sockets=1, + num_numa_nodes_per_socket=1, + num_gpus_per_numa_node=1, + num_l3_caches_per_numa_node=1, + num_physical_core_per_l3_cache=1, ) + with ( + patch( + "torch.numa.binding._get_logical_cpus_to_bind_to", return_value=set() + ), + self.assertRaisesRegex( + RuntimeError, "Must bind to a non-empty set of CPU indices" + ), + ): + self._start_processes_for_callable_entrypoint_and_get_sched_setaffinity_cpus( + numa_options=NumaOptions(affinity_mode=AffinityMode.NODE), + target_local_rank=0, + ) + def test_get_set_of_int_from_ranges_str(self) -> None: self.assertEqual( _get_set_of_int_from_ranges_str("0-2,4,6-7"), {0, 1, 2, 4, 6, 7} diff --git a/test/test_optim.py b/test/test_optim.py index 27db6d717954b..6dd23d6328c89 100644 --- a/test/test_optim.py +++ b/test/test_optim.py @@ -187,7 +187,8 @@ def test_forloop_goes_right_direction( ) input = torch.randn(5, device=device, dtype=dtype) - optimizer = optim_cls([weight, bias], **optim_input.kwargs) + params = [weight, bias] if optim_cls.__name__ != "Muon" else [weight] + optimizer = optim_cls(params, **optim_input.kwargs) schedulers = [ s(optimizer) for s in (schedulers_constructor if schedulers_constructor else []) @@ -195,7 +196,12 @@ def test_forloop_goes_right_direction( def closure(): optimizer.zero_grad() - loss = (weight.mv(input) + bias).pow(2).sum() + wo = ( + weight.mv(input) + if optim_cls.__name__ == "Muon" + else weight.mv(input) + bias + ) + loss = wo.pow(2).sum() loss.backward() if optim_info.only_supports_sparse_grads: # For this test, we naively convert the Tensor layout, which we know does @@ -246,7 +252,8 @@ def test_forloop_goes_right_direction_multigpu( bias = Parameter(torch.randn((10), device="cuda:1", dtype=dtype)) inpt = torch.randn(5, device="cuda:0", dtype=dtype) - optimizer = optim_cls([weight, bias], **optim_input.kwargs) + params = [weight, bias] if optim_cls.__name__ != "Muon" else [weight] + optimizer = optim_cls(params, **optim_input.kwargs) schedulers = [ s(optimizer) for s in (schedulers_constructor if schedulers_constructor else []) @@ -254,7 +261,12 @@ def test_forloop_goes_right_direction_multigpu( def closure(): optimizer.zero_grad() - loss = (weight.mv(inpt).cuda(1) + bias).pow(2).sum() + wo = ( + weight.mv(inpt).cuda(1) + if optim_cls.__name__ == "Muon" + else weight.mv(inpt).cuda(1) + bias + ) + loss = wo.pow(2).sum() loss.backward() if optim_info.only_supports_sparse_grads: # For this test, we naively convert the Tensor layout, which we know does @@ -285,23 +297,25 @@ def test_param_group_with_lrscheduler_goes_right_direction( for schedulers_c in optim_info.scheduler_inputs: weight = Parameter(torch.randn((10, 5), device=device, dtype=dtype)) - bias = Parameter(torch.randn((10), device=device, dtype=dtype)) + weight2 = Parameter(torch.randn((10, 5), 
device=device, dtype=dtype)) inpt = torch.randn(5, device=device, dtype=dtype) # avoid endless recompiles by wrapping LR in a tensor if we're compiling lr = torch.tensor(0.01) if torch.compiler.is_compiling() else 0.01 - optimizer = optim_cls([{"params": [weight]}, {"params": [bias], "lr": lr}]) + optimizer = optim_cls( + [{"params": [weight]}, {"params": [weight2], "lr": lr}] + ) schedulers = [scheduler_c(optimizer) for scheduler_c in schedulers_c] def closure(): optimizer.zero_grad() - loss = (weight.mv(inpt) + bias).pow(2).sum() + loss = (weight.mv(inpt) + weight2.mv(inpt)).pow(2).sum() loss.backward() if optim_info.only_supports_sparse_grads: # For this test, we naively convert the Tensor layout, which we know does # NOT represent the expected use case for optims like SparseAdam! weight.grad = weight.grad.to_sparse() - bias.grad = bias.grad.to_sparse() + weight2.grad = weight2.grad.to_sparse() return loss initial_value = closure().item() @@ -339,21 +353,26 @@ def test_tensor_lr(self, device, dtype, optim_info, num_dim): if "lr" in kwargs: del kwargs["lr"] + params = [weight, bias] if optim_cls.__name__ != "Muon" else [weight] kwargs["lr"] = 1.0 if optim_info.step_requires_closure else 1e-3 - optimizer_r = optim_cls([weight, bias], **kwargs) + optimizer_r = optim_cls(params, **kwargs) try: kwargs["lr"] = ( torch.tensor(kwargs["lr"]).reshape([1] * num_dim).to(lr_device) ) - optimizer = optim_cls([weight_c, bias_c], **kwargs) + params_c = [weight_c, bias_c] + if optim_cls.__name__ == "Muon": + params_c = [weight_c] + optimizer = optim_cls(params_c, **kwargs) except ValueError as e: self.assertRegex(str(e), ".*lr as a Tensor is not supported.*") continue def closure(optim, w, b, i): optim.zero_grad() - loss = (w.mv(i) + b).pow(2).sum() + wo = w.mv(i) if optim_cls.__name__ == "Muon" else w.mv(i) + b + loss = wo.pow(2).sum() loss.backward() if optim_info.only_supports_sparse_grads: # For this test, we naively convert the Tensor layout, which we know does @@ -377,7 +396,8 @@ def closure(optim, w, b, i): optimizer.step() self.assertEqual(weight, weight_c) - self.assertEqual(bias, bias_c) + if optim_cls.__name__ != "Muon": + self.assertEqual(bias, bias_c) @parametrize("with_lrsched", [True, False]) @optims( @@ -1217,31 +1237,31 @@ def test_param_groups_weight_decay(self, device, dtype, optim_info): ) for optim_input in all_optim_inputs: weight_kwargs = optim_input.kwargs - bias_kwargs = deepcopy(optim_input.kwargs) - bias_kwargs["weight_decay"] = 0.0 + weight2_kwargs = deepcopy(optim_input.kwargs) + weight2_kwargs["weight_decay"] = 0.0 weight = Parameter(torch.randn((10, 5), device=device, dtype=dtype)) - bias = Parameter(torch.randn((10), device=device, dtype=dtype)) + weight2 = Parameter(torch.randn((10, 5), device=device, dtype=dtype)) input = torch.randn(5, device=device, dtype=dtype) optimizer = optim_cls( [ dict(params=[weight], **weight_kwargs), - dict(params=[bias], **bias_kwargs), + dict(params=[weight2], **weight2_kwargs), ] ) - loss = (weight.mv(input) + bias).pow(2).sum() + loss = (weight.mv(input) + weight2.mv(input)).pow(2).sum() initial_value = loss.item() for _ in range(20): optimizer.zero_grad() - loss = (weight.mv(input) + bias).pow(2).sum() + loss = (weight.mv(input) + weight2.mv(input)).pow(2).sum() loss.backward() if optim_info.only_supports_sparse_grads: # For this test, we naively convert the Tensor layout, which we know does # NOT represent the expected use case for optims like SparseAdam! 
weight.grad = weight.grad.to_sparse() - bias.grad = bias.grad.to_sparse() + weight2.grad = weight2.grad.to_sparse() optimizer.step() # Test that the direction of loss moved appropriately @@ -1268,22 +1288,33 @@ def test_param_groups_lr(self, device, dtype, optim_info): weight = Parameter(torch.randn((10, 5), device=device, dtype=dtype)) bias = Parameter(torch.randn((10), device=device, dtype=dtype)) - irrelevant = Parameter(torch.randn(2, device=device, dtype=dtype)) + irrelevant = Parameter(torch.randn((2, 2), device=device, dtype=dtype)) irrelevant_clone = irrelevant.clone() input = torch.randn(5, device=device, dtype=dtype) + params = [weight, bias] if optim_cls.__name__ != "Muon" else [weight] optimizer = optim_cls( [ - dict(params=[weight, bias], **optim_input.kwargs), + dict(params=params, **optim_input.kwargs), dict(params=[irrelevant]), ], **outer_kwargs, ) - loss = (weight.mv(input) + bias).pow(2).sum() + wo = ( + weight.mv(input) + if optim_cls.__name__ == "Muon" + else weight.mv(input) + bias + ) + loss = wo.pow(2).sum() initial_value = loss.item() for _ in range(20): optimizer.zero_grad() - loss = (weight.mv(input) + bias).pow(2).sum() + wo = ( + weight.mv(input) + if optim_cls.__name__ == "Muon" + else weight.mv(input) + bias + ) + loss = wo.pow(2).sum() loss.backward() irrelevant.grad = torch.rand_like(irrelevant) if optim_info.only_supports_sparse_grads: @@ -1341,8 +1372,8 @@ def closure(): if kwargs.get("weight_decay", 0) != 0: continue - # AdamW params will be updated regardless of grads due to lr, so make lr smaller - if optim_cls.__name__ == "AdamW": + # AdamW/Muon params will be updated regardless of grads due to lr, so make lr smaller + if optim_cls.__name__ == "AdamW" or optim_cls.__name__ == "Muon": kwargs["lr"] = ( torch.tensor(1e-5) if isinstance(kwargs.get("lr", 1e-5), torch.Tensor) @@ -1439,6 +1470,8 @@ def test_state_dict_deterministic( bias = Parameter(torch.randn(2, requires_grad=True, device=device, dtype=dtype)) input = torch.randn(3, requires_grad=True, device=device, dtype=dtype) params = [weight, bias] + if optim_cls.__name__ == "Muon": + params = [weight] def make_named_param(param, is_named): if not is_named: @@ -1453,7 +1486,8 @@ def without_param_names(state_dict): def fwd_bwd(optim, w, b, i): optim.zero_grad() - loss = (w.mv(i) + b).pow(2).sum() + wo = w.mv(i) if optim_cls.__name__ == "Muon" else w.mv(i) + b + loss = wo.pow(2).sum() loss.backward() if optim_info.only_supports_sparse_grads: if w.grad is not None: @@ -1479,7 +1513,10 @@ def fwd_bwd(optim, w, b, i): with torch.no_grad(): weight_c = Parameter(weight.clone()) bias_c = Parameter(bias.clone()) - params_c = make_named_param([weight_c, bias_c], is_named=is_named_optim1) + params_c_list = ( + [weight_c, bias_c] if optim_cls.__name__ != "Muon" else [weight_c] + ) + params_c = make_named_param(params_c_list, is_named=is_named_optim1) optimizer_c = optim_cls(params_c, **optim_input.kwargs) closure_c = functools.partial(fwd_bwd, optimizer_c, weight_c, bias_c, input) @@ -1498,7 +1535,8 @@ def fwd_bwd(optim, w, b, i): optimizer_c.step() self.assertEqual(weight, weight_c) - self.assertEqual(bias, bias_c) + if optim_cls.__name__ != "Muon": + self.assertEqual(bias, bias_c) # Make sure state dict is deterministic with equal (not identical) parameters # Param names are optional and not needed to be the consistent. 
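The test_optim.py hunks above and below repeat the same special case for the Muon optimizer: per the comments in this patch, Muon only accepts 2D parameters, so the 1D bias is dropped from both the optimizer's parameter list and the loss. A minimal sketch of that recurring pattern, assuming the test-local names (optim_cls, weight, bias, inpt) used in these hunks; the helper itself is illustrative and not part of the patch:

def muon_params_and_closure(optim_cls, weight, bias, inpt):
    # Muon only accepts 2D parameters, so exclude the 1D bias everywhere.
    if optim_cls.__name__ == "Muon":
        params = [weight]
        closure = lambda: weight.mv(inpt).pow(2).sum()
    else:
        params = [weight, bias]
        closure = lambda: (weight.mv(inpt) + bias).pow(2).sum()
    return params, closure

The hunks inline this logic at each call site rather than factoring it out; the sketch only makes the shared shape of the change explicit.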
@@ -1522,14 +1560,24 @@ def test_can_load_older_state_dict(self, device, dtype, optim_info): all_optim_inputs = _get_optim_inputs_including_global_cliquey_kwargs( device, dtype, optim_info, skip=("differentiable",) ) + + def _get_model_and_input_tensor(device, dtype, optim_cls): + if optim_cls.__name__ == "Muon": + # Muon only accepts 2D parameter. + model = torch.nn.Linear(10, 4, bias=False) + input = torch.rand(10, device=device, dtype=dtype) + else: + model = torch.nn.Sequential( + torch.nn.Conv2d(4, 2, 1, stride=2), + torch.nn.BatchNorm2d(2, eps=1e-05, momentum=0.1), + ) + input = torch.rand(1, 4, 16, 16, device=device, dtype=dtype) + model.to(dtype=dtype, device=device) + return model, input + for optim_input in all_optim_inputs: torch.manual_seed(1) - model = torch.nn.Sequential( - torch.nn.Conv2d(4, 2, 1, stride=2), - torch.nn.BatchNorm2d(2, eps=1e-05, momentum=0.1), - ) - model.to(dtype=dtype, device=device) - input = torch.rand(1, 4, 16, 16, device=device, dtype=dtype) + model, input = _get_model_and_input_tensor(device, dtype, optim_cls) optimizer = optim_cls(model.parameters(), **optim_input.kwargs) def fwd_bwd(optim, mod, i): @@ -1577,14 +1625,24 @@ def test_can_load_from_to_named_state_dict( all_optim_inputs = _get_optim_inputs_including_global_cliquey_kwargs( device, dtype, optim_info, skip=("differentiable",) ) + + def _get_model_and_input_tensor(device, dtype, optim_cls): + if optim_cls.__name__ == "Muon": + # Muon only accepts 2D parameter. + model = torch.nn.Linear(10, 4, bias=False) + input = torch.rand(10, device=device, dtype=dtype) + else: + model = torch.nn.Sequential( + torch.nn.Conv2d(4, 2, 1, stride=2), + torch.nn.BatchNorm2d(2, eps=1e-05, momentum=0.1), + ) + input = torch.rand(1, 4, 16, 16, device=device, dtype=dtype) + model.to(dtype=dtype, device=device) + return model, input + for optim_input in all_optim_inputs: torch.manual_seed(1) - model = torch.nn.Sequential( - torch.nn.Conv2d(4, 2, 1, stride=2), - torch.nn.BatchNorm2d(2, eps=1e-05, momentum=0.1), - ) - model.to(dtype=dtype, device=device) - input = torch.rand(1, 4, 16, 16, device=device, dtype=dtype) + model, input = _get_model_and_input_tensor(device, dtype, optim_cls) def fwd_bwd(optim, mod, i): optim.zero_grad() @@ -1621,11 +1679,12 @@ def fwd_bwd(optim, mod, i): fwd_bwd(optimizer2, model, input) optimizer2.step() + ref_names = [p[0] for p in model.named_parameters()] # Make sure that param_names are preserved when provided to at least one of the optimizers if is_named_optim0 or is_named_optim1: self.assertEqual( optimizer2.state_dict()["param_groups"][0]["param_names"], - ["0.weight", "0.bias", "1.weight", "1.bias"], + ref_names, ) @parametrize("is_named_optim", [True, False]) @@ -1644,7 +1703,7 @@ def test_save_load_equality_with_weights_only( ) bias = Parameter(torch.randn(2, requires_grad=True, device=device, dtype=dtype)) input = torch.randn(3, requires_grad=True, device=device, dtype=dtype) - params = [weight, bias] + params = [weight, bias] if optim_cls.__name__ != "Muon" else [weight] def make_named_param(param, is_named): if not is_named: @@ -1653,7 +1712,8 @@ def make_named_param(param, is_named): def fwd_bwd(optim, w, b, i): optim.zero_grad() - loss = (w.mv(i) + b).pow(2).sum() + wo = w.mv(i) if optim_cls.__name__ == "Muon" else w.mv(i) + b + loss = wo.pow(2).sum() loss.backward() if optim_info.only_supports_sparse_grads: weight.grad = weight.grad.to_sparse() @@ -1937,7 +1997,7 @@ def post_hook(opt: Optimizer, args: tuple[Any], kwargs: dict[Any, Any]): nonlocal data data += 2 - params = 
[torch.tensor([1, 1], device=device, dtype=dtype)] + params = [torch.tensor([[1, 1]], device=device, dtype=dtype)] def dummy_closure(): return 1 @@ -1969,7 +2029,8 @@ def pre_hook(opt: Optimizer, args: tuple[Any], kwargs: dict[Any, Any]): nonlocal data data += 2 - params = [torch.tensor([1, 1], device=device, dtype=dtype)] + # Create a random 2D tensor for compatibility with Muon. + params = [torch.tensor([[1, 1]], device=device, dtype=dtype)] def dummy_closure(): return 1 @@ -2013,7 +2074,7 @@ def local_post_hook(opt: Optimizer, args: tuple[Any], kwargs: dict[Any, Any]): nonlocal data data.append(2) - params = [torch.tensor([1, 1], device=device, dtype=dtype)] + params = [torch.tensor([[1, 1]], device=device, dtype=dtype)] def dummy_closure(): return 1 @@ -2219,7 +2280,8 @@ def test_defaults_changed_to_foreach(self, device, dtype, optim_info): def test_non_empty_state(self, device, dtype, optim_info): # There are internal tests that check that the state is not empty optim_cls = optim_info.optim_cls - model = torch.nn.Linear(5, 5) + # Muon only accepts 2D parameter. + model = torch.nn.Linear(5, 5, bias=False) model.to(dtype=dtype, device=device) inpt = torch.rand(2, 5, dtype=dtype, device=device) diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py index f278eb33be16e..6d36b36996c4b 100644 --- a/test/test_proxy_tensor.py +++ b/test/test_proxy_tensor.py @@ -1973,6 +1973,7 @@ def f(t): skip('item'), xfail('cov'), xfail('nn.functional.gaussian_nll_loss'), + xfail('tensor_split'), xfail('corrcoef'), xfail('quantile'), xfail('nanquantile'), @@ -1992,12 +1993,10 @@ def f(t): only_real_tensor_failures = { xfail('narrow'), - xfail('tensor_split'), } only_fake_tensor_failures = { xfail('narrow'), - xfail('tensor_split'), } fake_tensor_failures = set() diff --git a/test/test_python_dispatch.py b/test/test_python_dispatch.py index d2b16b61c9035..9faa5ce4b8946 100644 --- a/test/test_python_dispatch.py +++ b/test/test_python_dispatch.py @@ -2124,6 +2124,16 @@ def __torch_dispatch__(cls, func, types, args, kwargs): t = DimImplementedTensor(torch.randn(3, 3), use_wrapper_subclass) self.assertEqual(t.dim(), 2) + def test_maybe_tuple_bug(self): + class T(torch.Tensor): + @classmethod + def __torch_function__(cls, *args, **kwargs): + pass + + a = torch.rand(3) + + a[[T(), T()]] + def test_standard_is_not_subclass(self): # https://github.com/pytorch/pytorch/issues/79079 self.assertFalse(torch._C._dispatch_isTensorSubclassLike(torch.empty(0))) diff --git a/test/test_transformers.py b/test/test_transformers.py index b18af79433ae5..c0641570649c0 100644 --- a/test/test_transformers.py +++ b/test/test_transformers.py @@ -49,6 +49,7 @@ PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, PLATFORM_SUPPORTS_FUSED_ATTENTION, PLATFORM_SUPPORTS_CUDNN_ATTENTION, + PLATFORM_SUPPORTS_CK_SDPA, tf32_on_and_off, tf32_enabled, ) @@ -85,7 +86,6 @@ def use_deterministic_algorithims(mode: bool, warn_only: bool): isSM5xDevice = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] == 5 isLessThanSM80Device = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] < 8 -TEST_WITH_CK = TEST_WITH_ROCM and torch.backends.cuda.preferred_rocm_fa_library() == torch.backends.cuda._ROCmFABackends['ck'] def _check_equal( golden: torch.Tensor, @@ -2656,7 +2656,6 @@ def test_cudnn_attention_gqa(self, device): @skipIfRocm # No cuDNN Attention @unittest.skipIf(not PLATFORM_SUPPORTS_CUDNN_ATTENTION, "cuDNN Attention is not supported on this system") - @unittest.expectedFailure # cuDNN currently doesn't support this 
on SM100+/fails graph validation def test_cudnn_attention_d256_heuristic(self, device): dtype = torch.bfloat16 make_tensor = partial(torch.rand, device=device, dtype=dtype, requires_grad=True) @@ -2667,18 +2666,24 @@ def test_cudnn_attention_d256_heuristic(self, device): v_shape = SdpaShape(batch, num_heads, seq_len, head_dim_v) query, key, value = make_tensor(q_shape), make_tensor(k_shape), make_tensor(v_shape) - with sdpa_kernel(backends=[SDPBackend.CUDNN_ATTENTION], set_priority=True): - actual = torch.nn.functional.scaled_dot_product_attention( - query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False) - actual.backward(torch.randn_like(actual)) - with sdpa_kernel(backends=[SDPBackend.MATH]): - math_ref = torch.nn.functional.scaled_dot_product_attention( - query.contiguous().to(torch.float32), - key.contiguous().to(torch.float32), - value.contiguous().to(torch.float32), - attn_mask=None, dropout_p=0.0, is_causal=False) - - self.assertEqual(actual.contiguous(), math_ref.contiguous().to(dtype), atol=1e-3, rtol=1e-2) + def test(): + with sdpa_kernel(backends=[SDPBackend.CUDNN_ATTENTION], set_priority=True): + actual = torch.nn.functional.scaled_dot_product_attention( + query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False) + actual.backward(torch.randn_like(actual)) + with sdpa_kernel(backends=[SDPBackend.MATH]): + math_ref = torch.nn.functional.scaled_dot_product_attention( + query.contiguous().to(torch.float32), + key.contiguous().to(torch.float32), + value.contiguous().to(torch.float32), + attn_mask=None, dropout_p=0.0, is_causal=False) + self.assertEqual(actual.contiguous(), math_ref.contiguous().to(dtype), atol=1e-3, rtol=1e-2) + + if torch.cuda.get_device_capability() in [(9, 0)]: + test() + else: + with self.assertRaisesRegex(RuntimeError, "No available kernel."): + test() @skipIfRocm(msg="No cuDNN on ROCm") @unittest.skipIf(not PLATFORM_SUPPORTS_CUDNN_ATTENTION, "cuDNN Attention is not supported on this system") @@ -3572,10 +3577,12 @@ def _get_mem_eff_drop_mask(batch_size, n_heads, q_len, kv_len, p, seed, offset, @parametrize("scale", [None, "l1"]) @parametrize("enable_gqa", [True, False]) @parametrize("n_heads", [[16, 8], [10, 2]]) + @parametrize("sdpa_backend", ["aotriton", "ck"] if PLATFORM_SUPPORTS_CK_SDPA else ["aotriton"]) @tf32_enabled() def test_flash_attention_vs_math_ref_grads(self, device, batch_size: int, seq_len_q: int, seq_len_k: int, - head_dim: int, is_causal: bool, dropout_p: float, dtype: torch.dtype, - scale: str, enable_gqa: bool, n_heads: list[int]): + head_dim: int, is_causal: bool, dropout_p: float, + dtype: torch.dtype, scale: str, enable_gqa: bool, + n_heads: list[int], sdpa_backend: str): if isSM8XDevice or isSM120Device and head_dim in range(193, 256 + 1): self.skipTest("Flash attention on sm86, sm87, and sm89 for headdim > 192 currently disabled") if is_causal and seq_len_q != seq_len_k: @@ -3585,8 +3592,14 @@ def test_flash_attention_vs_math_ref_grads(self, device, batch_size: int, seq_le if max(seq_len_q, seq_len_k) >= 2048 and torch.cuda.get_device_properties('cuda').total_memory < 40 * 2**30: unittest.skip("Reference implementation OOM") return - if TEST_WITH_CK and dropout_p != 0: - self.skipTest("CK does not support tensor format dropout masks") + + # ROCm now supports 2 different backends for SDPA that require different set up. 
+ TEST_WITH_CK = False + if TEST_WITH_ROCM: + torch.backends.cuda.preferred_rocm_fa_library(sdpa_backend) + # When no args are given to preferred_rocm_fa_library, it acts as a getter + TEST_WITH_CK = (torch.backends.cuda.preferred_rocm_fa_library() == torch._C._ROCmFABackend.Ck) + if TEST_WITH_CK and head_dim > 128: self.skipTest("CK does not support head dims over 128") @@ -3642,15 +3655,24 @@ def test_flash_attention_vs_math_ref_grads(self, device, batch_size: int, seq_le softmax_mask = self.convert_flash_attn_S_to_softmax( dbug_mask, seq_len_q, seq_len_k, query_padding_mask, key_padding_mask, causal=is_causal)[:, :, :seq_len_q, :seq_len_k] + + # This is the default implementation for the mask but we need to match CK if we are using it dropout_mask = softmax_mask >= 0 + + # This logic matches how CK calculates the dropout mask. + # This is necessary because CK doesn't support passing in custom dropout masks + # So we use this logic to ensure we are comparing apples to apples. + if TEST_WITH_CK: + dropout_mask = (softmax_mask <= int((1.0 - dropout_p) * 255.0)).to(torch.float32) + # High Precision Math Reference out_ref = torch.ops.aten._scaled_dot_product_attention_math( query_ref, key_ref, value_ref, dropout_p=dropout_p, is_causal=is_causal, scale=scale, dropout_mask=dropout_mask, enable_gqa=enable_gqa)[0] # Low Precision Math Reference out_lp_ref = torch.ops.aten._scaled_dot_product_attention_math( - query, key, value, dropout_p=dropout_p, is_causal=is_causal, scale=scale, - dropout_mask=dropout_mask, enable_gqa=enable_gqa)[0] + query, key, value, dropout_mask=dropout_mask, dropout_p=dropout_p, + is_causal=is_causal, scale=scale, enable_gqa=enable_gqa)[0] upstream_grad = torch.rand_like(out, requires_grad=False) @@ -3670,17 +3692,33 @@ def test_flash_attention_vs_math_ref_grads(self, device, batch_size: int, seq_le 'grad_value': 4, } if TEST_WITH_ROCM: - fudge_factors['grad_key'] = 45.0 - fudge_factors['grad_query'] = 360.0 - if seq_len_k >= 1024: - fudge_factors['grad_key'] = 70.0 - if seq_len_k >= 2048: - fudge_factors['grad_key'] = 190.0 - fudge_factors['grad_query'] = 650.0 - if seq_len_q >= 2048: - fudge_factors['grad_query'] = 1100.0 - if dtype == torch.float32: - fudge_factors['grad_key'] = 90.0 + if TEST_WITH_CK: + fudge_factors['out'] = 5 + fudge_factors['grad_key'] = 145.0 + fudge_factors['grad_query'] = 855.0 # ck min = 855.0 + fudge_factors['grad_value'] = 6 + if seq_len_k >= 1024: + fudge_factors['grad_key'] = 70.0 + if seq_len_k >= 2048: + fudge_factors['grad_key'] = 190.0 + fudge_factors['grad_query'] = 1550.0 # NEW CK MIN + if seq_len_q >= 2048: + fudge_factors['grad_query'] = 1100.0 + if dtype == torch.float32: + fudge_factors['grad_key'] = 90.0 + else: + fudge_factors['grad_key'] = 45.0 + fudge_factors['grad_query'] = 360.0 + if seq_len_k >= 1024: + fudge_factors['grad_key'] = 70.0 + if seq_len_k >= 2048: + fudge_factors['grad_key'] = 190.0 + fudge_factors['grad_query'] = 650.0 + if seq_len_q >= 2048: + fudge_factors['grad_query'] = 1100.0 + if dtype == torch.float32: + fudge_factors['grad_key'] = 90.0 + check_out_and_grad( (out_ref, out_lp_ref, out), diff --git a/third_party/xpu.txt b/third_party/xpu.txt index afb0b3f56bead..5dcb7df8802a8 100644 --- a/third_party/xpu.txt +++ b/third_party/xpu.txt @@ -1 +1 @@ -77cc792cd265179745d335579d233e6d4f9a2667 +77cc792cd265179745d335579d233e6d4f9a2667 \ No newline at end of file diff --git a/tools/flight_recorder/components/types.py b/tools/flight_recorder/components/types.py index 2db2e054b98e8..20e093688ba14 100644 --- 
a/tools/flight_recorder/components/types.py +++ b/tools/flight_recorder/components/types.py @@ -388,8 +388,10 @@ def __init__( self, event: dict[Any, Any], memberships: dict[str, set[Any]], pg_name: str ): self.profiling_name = event["profiling_name"] - nccl, name = self.profiling_name.split(":") - assert nccl == "nccl", f"name formatting error? {nccl} != 'nccl'" + comm_lib_backend, name = self.profiling_name.split(":") + assert comm_lib_backend in ["nccl", "xccl"], ( + f"name formatting error? {comm_lib_backend} != 'nccl' or 'xccl'" + ) parts = name.split(" ") type = parts[0] meta = parts[1] if len(parts) == 2 else None diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 94567b08a5cce..0b3a9ab334b80 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -2231,6 +2231,7 @@ def _is_flash_attention_available() -> _bool: ... def _can_use_cudnn_attention(params: _SDPAParams, debug: _bool) -> _bool: ... def _can_use_flash_attention(params: _SDPAParams, debug: _bool) -> _bool: ... def _can_use_mem_efficient_attention(params: _SDPAParams, debug: _bool) -> _bool: ... +def _is_ck_sdpa_available() -> _bool: ... # Defined in torch/csrc/cuda/GdsFile.cpp def _gds_register_buffer(t: Storage) -> None: ... @@ -2335,13 +2336,10 @@ class _MemPool: allocator: _cuda_CUDAAllocator | None = None, is_user_created: _bool = True, use_on_oom: _bool = False, - symmetric: _bool = False, ) -> None: ... @property def id(self) -> tuple[_int, _int]: ... @property - def is_symmetric(self) -> _bool: ... - @property def allocator(self) -> _cuda_CUDAAllocator | None: ... def use_count(self) -> _int: ... diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi index 9007d3fbf5a09..168e4d9dfc841 100644 --- a/torch/_C/_distributed_c10d.pyi +++ b/torch/_C/_distributed_c10d.pyi @@ -298,6 +298,8 @@ class Backend: def _timeout(self) -> timedelta: ... @_timeout.setter def _timeout(self, val: timedelta) -> None: ... + global_ranks_in_group: list[int] + group_name: str def __init__( self, @@ -608,8 +610,6 @@ class ProcessGroupGloo(Backend): class Options(Backend.Options): devices: list[ProcessGroupGloo.Device] threads: int - global_ranks_in_group: list[int] - group_name: str def __init__(self): ... @@ -644,14 +644,13 @@ class ProcessGroupNCCL(Backend): cga_cluster_size: int min_ctas: int max_ctas: int + def unsafe_get_ptr(self) -> int: ... class Options(Backend.Options): config: ProcessGroupNCCL.NCCLConfig is_high_priority_stream: bool split_from: ProcessGroupNCCL split_color: int - global_ranks_in_group: list[int] - group_name: str def __init__(self, is_high_priority_stream: bool = False): ... @@ -805,6 +804,12 @@ class _SymmetricMemory: channel: int = 0, timeout_ms: int = 0, ) -> None: ... + def copy_buffer( + self, + src: torch.Tensor, + dst: torch.Tensor, + size: int, + ) -> None: ... @staticmethod def memset32( tensor: torch.Tensor, offset: int, val: int, count: int = 1 @@ -829,12 +834,18 @@ class _SymmetricMemory: def signal_pad_size(self) -> int: ... class ProcessGroupXCCL(Backend): + class Options(Backend.Options): + def __init__(self): ... + def __init__( self, store: Store, rank: int, size: int, - ): ... + options: Options, + ) -> None: ... + @property + def options(self) -> Options: ... # type: ignore[override] def _set_process_group(pg: ProcessGroup) -> None: ... def _current_process_group() -> ProcessGroup: ... 
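The tools/flight_recorder hunk above widens the accepted collective-backend prefix from "nccl" alone to "nccl" or "xccl". A minimal sketch of the parsing that change implies, assuming the "<backend>:<op> <meta>" event format suggested by the surrounding code; the standalone function is illustrative only:

def parse_profiling_name(profiling_name):
    # e.g. "nccl:broadcast 0" or "xccl:broadcast 0" (format assumed from the hunk above)
    comm_lib_backend, name = profiling_name.split(":")
    assert comm_lib_backend in ("nccl", "xccl"), (
        f"name formatting error? {comm_lib_backend} != 'nccl' or 'xccl'"
    )
    parts = name.split(" ")
    op_type = parts[0]
    meta = parts[1] if len(parts) == 2 else None
    return comm_lib_backend, op_type, meta

Under that assumed format, parse_profiling_name("xccl:broadcast 0") returns ("xccl", "broadcast", "0").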
diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py index 954950318b6a1..ba09c6173c5f3 100644 --- a/torch/_decomp/decompositions.py +++ b/torch/_decomp/decompositions.py @@ -6,7 +6,6 @@ import operator import sys from collections.abc import Iterable -from contextlib import nullcontext from enum import Enum from functools import partial, reduce from itertools import chain, product @@ -722,7 +721,10 @@ def slice_forward( end: Optional[int] = None, step: int = 1, ): - from torch.fx.experimental.symbolic_shapes import statically_known_true + from torch.fx.experimental.symbolic_shapes import ( + guard_size_oblivious, + statically_known_true, + ) ndim = self.dim() if ndim == 0: @@ -737,22 +739,22 @@ def slice_forward( start_val = start if start is not None else 0 end_val = end if end is not None else sys.maxsize # 2^63 - 1 - if start_val < 0: + if guard_size_oblivious(start_val < 0): start_val += sizes[dim] - if end_val < 0: + if guard_size_oblivious(end_val < 0): end_val += sizes[dim] - if start_val < 0: + if guard_size_oblivious(start_val < 0): start_val = 0 - elif start_val > sizes[dim]: + elif guard_size_oblivious(start_val > sizes[dim]): start_val = sizes[dim] if statically_known_true(end_val == sys.maxsize): end_val = sizes[dim] - elif end_val < start_val: + elif guard_size_oblivious(end_val < start_val): end_val = start_val - elif end_val > sizes[dim]: + elif guard_size_oblivious(end_val > sizes[dim]): end_val = sizes[dim] storage_offset = self.storage_offset() + start_val * strides[dim] @@ -1436,17 +1438,7 @@ def tensor_split_tensor_indices_or_sections_py_impl( assert isinstance(sections, IntLike) return self.tensor_split(sections, dim) else: - ctx = nullcontext - if (fake_mode := torch._guards.detect_fake_mode()) and ( - shape_env := fake_mode.shape_env - ): - ctx = shape_env.ignore_fresh_unbacked_symbols # type: ignore[assignment] - # In fake tensor prop, we end up calling slice() with these unbacked indices. - # Because slice has flexible semantics, the unbacked handling generates new output sizes - # for each slice, effectively clobbering over these index symbols. - # To avoid PendingUnbackedSymbolNotFound errors, we tell the compiler it's fine to not bind these. - with ctx(): - indices = [i.item() for i in tensor_indices_or_sections] + indices = [i.item() for i in tensor_indices_or_sections] # WARNING: Tempted to torch._check_is_size on the indices here? You # can't: tensor_split works with negative values in indices: # diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py index b8b7561dde16b..234cac2d75a65 100644 --- a/torch/_dynamo/config.py +++ b/torch/_dynamo/config.py @@ -578,6 +578,8 @@ def default_debug_dir_root() -> str: # Enables automatic DynamoCache save/load caching_precompile = os.environ.get("TORCH_CACHING_PRECOMPILE", "0") == "1" +strict_precompile = os.environ.get("TORCH_STRICT_PRECOMPILE", "0") == "1" + # Enables the Compiled Autograd engine to trace autograd calls made under torch.compile(). # Note: AOTAutograd will still trace and partition an AOT backward graph local to that # compiled region. 
But AOTAutograd traces without knowledge of backward hooks which are diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py index 016d4eefab30c..2d859073f0a82 100644 --- a/torch/_dynamo/convert_frame.py +++ b/torch/_dynamo/convert_frame.py @@ -1272,6 +1272,7 @@ def format_func_info(code: CodeType) -> str: start_time_ns = time.time_ns() fail_type: Optional[str] = None fail_reason: Optional[str] = None + exception_stack_trace: Optional[list[str]] = None fail_user_frame_filename: Optional[str] = None fail_user_frame_lineno: Optional[int] = None torch._dynamo.utils.ReinplaceCounters.clear() @@ -1300,6 +1301,7 @@ def format_func_info(code: CodeType) -> str: # info here and add it to the metrics context below. fail_type = type(e).__qualname__ fail_reason = str(e) + exception_stack_trace = [traceback.format_exc()] exception_handler(e, code, frame, export=export) # NB: this is the post-mutation exception torch._logging.trace_structured( @@ -1420,6 +1422,7 @@ def format_func_info(code: CodeType) -> str: ), "stack_trace": stack_trace, "graph_node_shapes": str(graph_node_shapes), + "exception_stack_trace": exception_stack_trace, } # TODO: replace with CompileEventLogger.compilation_metrics # There are some columns here not in PT2 Compile Events diff --git a/torch/_dynamo/decorators.py b/torch/_dynamo/decorators.py index ab8304cc5f080..3096d840a8db1 100644 --- a/torch/_dynamo/decorators.py +++ b/torch/_dynamo/decorators.py @@ -759,15 +759,6 @@ def mark_static_address(t: Any, guard: bool = True) -> None: is not needed for this input. The data_ptr will be guarded if guard=True. Note: Tensors marked in this way will be kept alive until `torch._dynamo.reset()` is called. """ - if torch._dynamo.config.caching_precompile: - # [Note] Static Addresses and Precompile - # When using precompile, `mark_static_address` is dangerous to use, because - # dynamo saves the addresses directly on the parameters of the graph. These addresses - # are process dependent, so are not serializable, and serializing - # their tensors would be extremely expensive. Instead, by treating mark_static_address - # as a no-op, dynamo will automatically inline them as inputs to the graph instead. 
- # See https://github.com/pytorch/pytorch/issues/159228 - return if not isinstance(t, torch.Tensor): raise TypeError(f"mark_static_address expects a tensor but received {type(t)}") diff --git a/torch/_dynamo/guards.py b/torch/_dynamo/guards.py index 083f62d0284a9..dbb0b1d1260f3 100644 --- a/torch/_dynamo/guards.py +++ b/torch/_dynamo/guards.py @@ -31,6 +31,7 @@ import pickle import sys import textwrap +import traceback import types import warnings import weakref @@ -3438,9 +3439,17 @@ def make_guard_filter_entry(guard: Guard) -> GuardFilterEntry: from torch._dynamo.output_graph import OutputGraph assert isinstance(self.output_graph, OutputGraph) - self.guards_state = self.serialize_guards( - builder, sorted_guards, self.output_graph - ) + try: + self.guards_state = self.serialize_guards( + builder, sorted_guards, self.output_graph + ) + except exc.PackageError as e: + if torch._dynamo.config.strict_precompile: + raise e + self.output_graph.bypass_package( + f"Guard evaluation failed: {str(e)}", + traceback=traceback.format_exc().split("\n"), + ) # TODO: don't do the string rep, do something more structured here torch._logging.trace_structured( diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py index d91e0472807ed..08c9da68afd33 100644 --- a/torch/_dynamo/output_graph.py +++ b/torch/_dynamo/output_graph.py @@ -1584,6 +1584,32 @@ def cleanup_graph(self) -> None: self.graph.erase_node(node1) self.graph.erase_node(node2) + def bypass_package(self, reason: str = "", **kwargs: Any) -> None: + """ + Do not save this output graph to the CompilePackage + """ + if not self.package: + return + if torch._dynamo.config.strict_precompile: + raise torch._dynamo.exc.PackageError( + "Detected a package bypass: %s", reason + ) + log.warning("Detected a package bypass: %s", reason) + torch._logging.trace_structured( + "artifact", + metadata_fn=lambda: { + "name": "precompile_cache_bypass", + "encoding": "json", + }, + payload_fn=lambda: { + # precede with underscore so it always appear first in JSON in tlparse + "_reason": reason, + **kwargs, + }, + ) + self.package.bypass_current_entry() + self.package = None + def get_graph_sizes_structured(self) -> dict[str, list[Union[int, str]]]: ret: dict[str, list[Union[int, str]]] = {} for node in self.graph.nodes: @@ -1740,7 +1766,20 @@ def compile_and_call_fx_graph( for register_finalizer in self.register_finalizer_fns: register_finalizer(gm) - gm._backend_id = name + if next(gm.parameters(), None) is not None: + # If dynamo produces a graph with parameters, skip package stuff + # Bypass output graph + self.bypass_package( + "Graph contains named parameters: either inline_inbuilt_nn_modules=False or there are static addresses.", + inline_builtin_nn_modules=torch._dynamo.config.inline_inbuilt_nn_modules, + gm=gm.print_readable( + print_output=False, include_stride=True, include_device=True + ), + ) + + if self.package is not None: + gm._backend_id = name + gm.compile_subgraph_reason = self.compile_subgraph_reason gm.meta["dynamo_flat_name_to_original_fqn"] = ( self.dynamo_flat_name_to_original_fqn.copy() diff --git a/torch/_dynamo/package.py b/torch/_dynamo/package.py index 0c2e1d0af8bf3..ef7f28c19a12b 100644 --- a/torch/_dynamo/package.py +++ b/torch/_dynamo/package.py @@ -140,6 +140,7 @@ class _DynamoCodeCacheEntry(DynamoCaptureOutput): A code object can be accessed by "{python_module}.{function_name}.{code_source}" . 8. A boolean flag indicating whether the function is installed to global scope. 9. 
A boolean flag indicating whether the function has a compile id. + 10. Whether or not this code entry was bypassed """ python_code: SerializedCode @@ -149,6 +150,7 @@ class _DynamoCodeCacheEntry(DynamoCaptureOutput): code_source: Optional[str] install_to_global: bool has_compile_id: bool = False + bypassed: bool = False def _lookup_code(entry: _DynamoCodeCacheEntry) -> types.CodeType: @@ -322,7 +324,6 @@ def _compile_frame_context( def _ctx() -> Iterator[None]: increment_frame() compile_id = get_compile_id(frame_state={}) - log_dynamo_start(code) with ( compile_context(CompileContext(compile_id)), dynamo_timed( @@ -338,6 +339,7 @@ def _ctx() -> Iterator[None]: }, ), ): + log_dynamo_start(code) yield return _ctx() @@ -488,6 +490,10 @@ def code_context(self, code: types.CodeType) -> Generator[None, None, None]: try: yield finally: + if ( + entry.bypassed + ): # Remove the code from the cache entry if it's been bypassed + del self._codes[code] entry.has_compile_id = True self._current_entry = None @@ -497,6 +503,8 @@ def add_guarded_code( dynamo_code: types.CodeType, ) -> None: assert self._current_entry is not None + if self._current_entry.bypassed: + return guarded_code_entry = _GuardedCodeCacheEntry( guards_state=guards_state, dynamo_code=SerializedCode.from_code_object(dynamo_code), @@ -504,6 +512,9 @@ def add_guarded_code( self._current_entry.guarded_codes.append(guarded_code_entry) def add_inlined_source(self, sources: list[types.CodeType]) -> None: + assert self._current_entry is not None + if self._current_entry.bypassed: + return for code in sources: if code in self._resume_codes: continue @@ -524,6 +535,10 @@ def add_inlined_source(self, sources: list[types.CodeType]) -> None: ) ) + def bypass_current_entry(self) -> None: + assert self._current_entry is not None + self._current_entry.bypassed = True + def add_resume_function( self, python_code: types.CodeType, diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py index 9d33c63e9c64c..c7166aaba1ef5 100644 --- a/torch/_dynamo/symbolic_convert.py +++ b/torch/_dynamo/symbolic_convert.py @@ -3716,9 +3716,10 @@ def create_call_resume_at( package_name = name if self.package is not None: - self.package.add_resume_function( - new_code, self.f_globals["__name__"], function_name=package_name - ) + if self.output.package is not None: + self.package.add_resume_function( + new_code, self.f_globals["__name__"], function_name=package_name + ) cg.extend_output([cg.create_load(k) for k in argnames]) cg.extend_output(create_call_function(nargs, False)) diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py index 2a714cccc983a..3f57f318b2754 100644 --- a/torch/_dynamo/utils.py +++ b/torch/_dynamo/utils.py @@ -1292,6 +1292,7 @@ class CompilationMetrics: restart_reasons: Optional[set[str]] = None dynamo_time_before_restart_s: Optional[float] = None stack_trace: Optional[list[str]] = None + exception_stack_trace: Optional[list[str]] = None graph_node_shapes: Optional[str] = None # Sometimes, we will finish analyzing a frame but conclude we don't want # to install any guarded code. 
True means we actually decided to install diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py index 67ce8db4228c1..e24ca0fc499ea 100644 --- a/torch/_dynamo/variables/builder.py +++ b/torch/_dynamo/variables/builder.py @@ -689,13 +689,10 @@ def from_tensor(): ) and type(value) not in config.nontraceable_tensor_subclasses ): - if type(value).__torch_dispatch__ is torch.Tensor.__torch_dispatch__: - # This case it's either tensor or subclass with default - # torch_dispatch (they might override torch_function or not), - # and we can always trace into them. - return self.wrap_tensor(value) - elif is_traceable_wrapper_subclass(value): - # For non-default torch_dispatch, we have more requirements. + if ( + type(value).__torch_dispatch__ is torch.Tensor.__torch_dispatch__ + or is_traceable_wrapper_subclass(value) + ): return self.wrap_tensor(value) if is_namedtuple(value): @@ -1744,11 +1741,6 @@ def wrap_slice_range(self, value: Union[slice, range]): def mark_static_input(self, value: torch.Tensor, guard: bool): from ..decorators import mark_static_address - # See [Note] Static Addresses and Precompile - # https://github.com/pytorch/pytorch/issues/159228 - if torch._dynamo.config.caching_precompile: - return - static_inputs_log.debug( "Marking static input %s, id: %s)", self.source.name(), id(value) ) @@ -2060,32 +2052,8 @@ def wrap_tensor(self, value: torch.Tensor): return self.tx.output.input_source_to_var[source] options = {} - if type(value) in ( - torch.Tensor, - torch.nn.Parameter, - torch._subclasses.fake_tensor.FakeTensor, - torch._subclasses.functional_tensor.FunctionalTensor, - ) or is_traceable_wrapper_subclass(value): - # Ordinarily, we would fakeify a tensor so that it can get dynamic - # shapes and be computed on without triggering actual operations. - # However, how can we fakeify a tensor subclass? Ordinary - # inheritance (nor multiple inheritance) won't work work. - # - # Instead, our plan is to *manually simulate* the tensor subclass - # inheriting from a fake tensor with dynamo. This means our - # data representation for a tensor subclass will be a fake tensor - # + tensor subclass type + any extra data the subclass may have - # been storing on the tensor. Because all Python accesses are - # mediated through TensorWithTFOverrideVariable, we can ensure - # that we dispatch differently, e.g., according to - # __torch_function__ - # - # To simplify things for now, the __dict__ tracking bits haven't - # been implemented yet, but they can be added into this design at - # a later point in time. - subclass_type = None - else: - subclass_type = type(value) + subclass_type = infer_subclass_type(value) + if subclass_type is not None: self.install_guards(GuardBuilder.TYPE_MATCH) if get_static_address_type(value) == "guarded": @@ -3043,6 +3011,55 @@ def handle_traced_output(example_value, tx, proxy, options, subclass_type, targe ) +def infer_subclass_type(value): + if type(value) in ( + torch.Tensor, + torch.nn.Parameter, + torch._subclasses.fake_tensor.FakeTensor, + torch._subclasses.functional_tensor.FunctionalTensor, + ) or is_traceable_wrapper_subclass(value): + # Ordinarily, we would fakeify a tensor so that it can get dynamic + # shapes and be computed on without triggering actual operations. + # However, how can we fakeify a tensor subclass? Ordinary + # inheritance (nor multiple inheritance) won't work work. + # + # Instead, our plan is to *manually simulate* the tensor subclass + # inheriting from a fake tensor with dynamo. 
This means our + # data representation for a tensor subclass will be a fake tensor + # + tensor subclass type + any extra data the subclass may have + # been storing on the tensor. Because all Python accesses are + # mediated through TensorWithTFOverrideVariable, we can ensure + # that we dispatch differently, e.g., according to + # __torch_function__ + # + # To simplify things for now, the __dict__ tracking bits haven't + # been implemented yet, but they can be added into this design at + # a later point in time. + return None + else: + return type(value) + + +def get_specialized_props(target_cls, tx, example_value, subclass_type): + specialized_props = target_cls.specialize(example_value) + # TODO: not sure about this fake mode test + if ( + isinstance(example_value, torch._subclasses.fake_tensor.FakeTensor) + and example_value.fake_mode is tx.fake_mode + ): + if subclass_type: + tensor_type = subclass_type + elif isinstance(example_value, torch.nn.Parameter): + tensor_type = torch.nn.Parameter + elif isinstance(example_value, torch.nn.Buffer): + tensor_type = torch.nn.Buffer + else: + tensor_type = torch.Tensor + specialized_props["class_type"] = tensor_type + + return specialized_props + + def construct_tensor_variable( target_cls, tx, proxy, example_value, subclass_type, options ): @@ -3060,23 +3077,7 @@ def construct_tensor_variable( # when lifting unbacked symbols of input tensors to subgraph inputs. # We do it lazily because the tensor may not be used in subgraphs. tx.output.current_tracer.track_unbacked_symbols(example_value, proxy) - specialized_props = target_cls.specialize(example_value) - # TODO: not sure about this fake mode test - if ( - isinstance(example_value, torch._subclasses.fake_tensor.FakeTensor) - and example_value.fake_mode is tx.fake_mode - ): - if subclass_type: - tensor_type = subclass_type - elif isinstance(example_value, torch.nn.Parameter): - tensor_type = torch.nn.Parameter - elif isinstance(example_value, torch.nn.Buffer): - tensor_type = torch.nn.Buffer - else: - tensor_type = torch.Tensor - specialized_props["class_type"] = tensor_type - - options.update(specialized_props) + options.update(get_specialized_props(target_cls, tx, example_value, subclass_type)) return target_cls(proxy, **options) diff --git a/torch/_dynamo/variables/builtin.py b/torch/_dynamo/variables/builtin.py index 51c9f2941cebd..74f8864479d4f 100644 --- a/torch/_dynamo/variables/builtin.py +++ b/torch/_dynamo/variables/builtin.py @@ -308,6 +308,7 @@ def _constant_fold_functions(): bool, callable, chr, + complex, divmod, float, getattr, @@ -1478,21 +1479,6 @@ def _call_int_float(self, tx: "InstructionTranslator", arg): call_int = _call_int_float call_float = _call_int_float - def call_complex(self, tx: "InstructionTranslator", *args, **kwargs): - if self.constant_args(*args, **kwargs): - try: - c = complex( - *(arg.as_python_constant() for arg in args), - **{k: kwargs[k].as_python_constant() for k in kwargs}, - ) - except (TypeError, ValueError) as exc: - raise_observed_exception( - type(exc), - tx, - args=list(map(ConstantVariable.create, exc.args)), - ) - return ConstantVariable(c) - def call_bool(self, tx: "InstructionTranslator", arg): # Emulate `PyBool_Type.tp_vectorcall` which boils down to `PyObject_IsTrue`. 
# https://github.com/python/cpython/blob/3.12/Objects/object.c#L1674-L1697 diff --git a/torch/_dynamo/variables/constant.py b/torch/_dynamo/variables/constant.py index 998bef52da4ca..90cbb08f5fc8f 100644 --- a/torch/_dynamo/variables/constant.py +++ b/torch/_dynamo/variables/constant.py @@ -206,6 +206,12 @@ def call_method( elif isinstance(self.value, bytes) and name == "decode": method = getattr(self.value, name) return ConstantVariable.create(method(*const_args, **const_kwargs)) + elif type(self.value) is complex and name in complex.__dict__.keys(): + method = getattr(self.value, name) + try: + return ConstantVariable.create(method(*const_args, **const_kwargs)) + except Exception as e: + raise_observed_exception(type(e), tx) if name == "__len__" and not (args or kwargs): return ConstantVariable.create(len(self.value)) diff --git a/torch/_dynamo/variables/dicts.py b/torch/_dynamo/variables/dicts.py index dc3929c9cce4c..a3c38ffb1e76b 100644 --- a/torch/_dynamo/variables/dicts.py +++ b/torch/_dynamo/variables/dicts.py @@ -286,19 +286,13 @@ def __contains__(self, vt) -> bool: and not isinstance(self.items[Hashable(vt)], variables.DeletedVariable) ) - def len(self): - return len( - [ - x - for x in self.items.values() - if not isinstance(x, variables.DeletedVariable) - ] + def len(self) -> int: + return sum( + not isinstance(x, variables.DeletedVariable) for x in self.items.values() ) - def has_new_items(self): - if self.should_reconstruct_all: - return True - return any( + def has_new_items(self) -> bool: + return self.should_reconstruct_all or any( self.is_new_item(self.original_items.get(key.vt), value) for key, value in self.items.items() ) diff --git a/torch/_dynamo/variables/misc.py b/torch/_dynamo/variables/misc.py index f75f5b180c72d..6c0fdd8c0b73c 100644 --- a/torch/_dynamo/variables/misc.py +++ b/torch/_dynamo/variables/misc.py @@ -657,13 +657,13 @@ def __init__(self, fn_cls, **kwargs) -> None: def call_apply(self, tx: "InstructionTranslator", args, kwargs): requires_grad = False - def visit(node): + def visit(vt): nonlocal requires_grad - if isinstance(node, variables.TensorVariable): - if node.requires_grad is not False: + if isinstance(vt, variables.TensorVariable): + if vt.requires_grad is not False: requires_grad = True - if isinstance(node, variables.NNModuleVariable): - if node.is_training(tx): + if isinstance(vt, variables.NNModuleVariable): + if vt.is_training(tx): requires_grad = True VariableTracker.visit(visit, (args, kwargs)) diff --git a/torch/_dynamo/variables/optimizer.py b/torch/_dynamo/variables/optimizer.py index 025b106880498..499c956843beb 100644 --- a/torch/_dynamo/variables/optimizer.py +++ b/torch/_dynamo/variables/optimizer.py @@ -239,22 +239,10 @@ def map_sources_and_install_guards(self, tx): self.grad_to_source = {} self.tensor_to_source = {} - # Tracing the _init_group is expensive. But we still have to insert the - # necessary guards for _init_group. So, we manually handle insertion of - # guards. We also want to mark all the tensors inside the state dict to - # be static address. - - # Mark all the tensors in the state dict to be static address. This has - # to be done first because the variable builder relies on the static - # address annotation. 
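# Illustrative sketch (not part of the diff): the ConstantVariable branch added for
# `complex` above forwards method calls on a Python complex constant straight to the
# underlying value, so the result can be re-wrapped as a constant at trace time.
# Plain-Python rendering of that folding step, with hypothetical local names:
value = complex(2.0, 3.0)
name, const_args, const_kwargs = "conjugate", (), {}
assert type(value) is complex and name in complex.__dict__
folded = getattr(value, name)(*const_args, **const_kwargs)  # (2-3j)
assert folded == complex(2.0, -3.0)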
- # NB: Caching precompile is incompatible with mark_static_address - # https://github.com/pytorch/pytorch/issues/159228 - if not torch._dynamo.config.caching_precompile: - - def mark_static(x): - mark_static_address(x) - - tree_map_only(torch.Tensor, mark_static, self.value.state) + def mark_static(x): + mark_static_address(x) + + tree_map_only(torch.Tensor, mark_static, self.value.state) # Recursively realize the variable trackers for optim.state and # optim.param_groups, which recursively install the necessary guards. diff --git a/torch/_dynamo/variables/tensor.py b/torch/_dynamo/variables/tensor.py index 62d0542dcab04..08dab47451abf 100644 --- a/torch/_dynamo/variables/tensor.py +++ b/torch/_dynamo/variables/tensor.py @@ -1090,6 +1090,30 @@ def method___setitem__(self, key, value): *proxy_args_kwargs([self, key, value], {}), ) + if isinstance(value, TensorVariable): + # [Note: Tensor.__setitem__ and VariableTracker metadata] + # At this point, we proxied a node representing `self[key] = value` into the graph. + # When executed, this node will mutate `self`'s tensor metadata, so it's important + # even during tracing to propagate. For example: + # value.requires_grad is True => self.requires_grad becomes True + # value.requires_grad is True => self.has_grad_fn becomes True + + # Not sure if __setitem__ can ever save activations, disabling just in case + with torch._dynamo.utils._disable_saved_tensors_hooks_during_tracing(): + get_fake_value(proxy.node, tx, allow_non_graph_fake=False) + + example_value = self.proxy.node.meta.get("example_value") + from .builder import get_specialized_props, infer_subclass_type + + if isinstance(value, variables.lazy.LazyVariableTracker): + value = variables.lazy.LazyVariableTracker.realize_all(value) + + specialized_props = get_specialized_props( + type(value), tx, example_value, infer_subclass_type(example_value) + ) + for k, v in specialized_props.items(): + setattr(self, k, v) + if config.use_graph_deduplication or config.track_nodes_for_deduplication: tx.output.region_tracker.add_node_mutation(proxy.node, 0) diff --git a/torch/_export/passes/_node_metadata_hook.py b/torch/_export/passes/_node_metadata_hook.py index ef49c4f035a56..f1958815293c1 100644 --- a/torch/_export/passes/_node_metadata_hook.py +++ b/torch/_export/passes/_node_metadata_hook.py @@ -3,6 +3,9 @@ from typing import Any, Optional import torch +import torch.utils._pytree as pytree +from torch._dispatch.python import enable_python_dispatcher +from torch._subclasses.fake_tensor import FakeTensorMode from torch.fx.graph_module import GraphModule @@ -10,7 +13,9 @@ def _node_metadata_hook( - node: torch.fx.Node, metadata: Optional[dict[str, Any]] = None + node: torch.fx.Node, + metadata: Optional[dict[str, Any]] = None, + fake_mode: Optional[FakeTensorMode] = None, ) -> None: """ Hook for adding the appropriate metadata to nodes that are created during a @@ -27,11 +32,11 @@ def _node_metadata_hook( that nodes being added are only call_function nodes, and copies over the first argument node's nn_module_stack. 
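# Illustrative sketch (not part of the diff) of the eager behavior that the
# Tensor.__setitem__ note above relies on: writing a value that requires grad into a
# tensor flips that tensor's requires_grad and gives it a grad_fn, which is why the
# VariableTracker's metadata has to be refreshed after tracing the mutation.
import torch

a = torch.zeros(3)
b = torch.ones(3, requires_grad=True)
a[0] = b[0]
assert a.requires_grad and a.grad_fn is not None  # e.g. a CopySlices node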
""" - assert node.op == "call_function" and callable(node.target) + fake_mode = fake_mode or contextlib.nullcontext() - arg_meta = [arg.meta for arg in node.args if isinstance(arg, torch.fx.Node)] - assert len(arg_meta) >= 1 - arg_meta = arg_meta[0] + assert node.op == "call_function" and callable(node.target), ( + f"node: {node}, target: {node.target}" + ) if ( isinstance(node.target, torch._ops.OpOverload) @@ -39,34 +44,48 @@ def _node_metadata_hook( ): node.meta["val"] = None else: - fake_args = [ - arg.meta["val"] if isinstance(arg, torch.fx.Node) else arg - for arg in node.args - ] - fake_res = node.target(*fake_args) + fake_args, fake_kwargs = pytree.tree_map_only( + torch.fx.Node, lambda arg: arg.meta["val"], (node.args, node.kwargs) + ) + with fake_mode, enable_python_dispatcher(): + fake_res = node.target(*fake_args, **fake_kwargs) node.meta["val"] = fake_res - node.meta["nn_module_stack"] = arg_meta.get( + if metadata is not None: + for k, v in metadata.items(): + node.meta[k] = v + + # Copy over metadata from argument nodes + arg_meta = [ + arg.meta + for arg in pytree.tree_flatten((node.args, node.kwargs))[0] + if isinstance(arg, torch.fx.Node) + ] + if len(arg_meta) == 0: + return + arg_meta = arg_meta[0] + + node.meta["nn_module_stack"] = node.meta.get( "nn_module_stack", - { - _EMPTY_NN_MODULE_STACK_KEY: ( - _EMPTY_NN_MODULE_STACK_KEY, - _EMPTY_NN_MODULE_STACK_KEY, - ) - }, + arg_meta.get( + "nn_module_stack", + { + _EMPTY_NN_MODULE_STACK_KEY: ( + _EMPTY_NN_MODULE_STACK_KEY, + _EMPTY_NN_MODULE_STACK_KEY, + ) + }, + ), ) - node.meta["torch_fn"] = ( - f"{node.target.__name__}_0", - f"{node.target.__class__.__name__}.{node.target.__name__}", + node.meta["torch_fn"] = node.meta.get( + "torch_fn", + ( + f"{node.target.__name__}_0", + f"{node.target.__class__.__name__}.{node.target.__name__}", + ), ) - # Hook specified metadata takes precedence over all previously set - # metadata, so this goes last - if metadata is not None: - for k, v in metadata.items(): - node.meta[k] = v - @contextlib.contextmanager def _set_node_metadata_hook(gm: torch.fx.GraphModule, f): diff --git a/torch/_export/serde/export_schema.thrift b/torch/_export/serde/export_schema.thrift index d24053cdce32c..47ab33cc12f18 100644 --- a/torch/_export/serde/export_schema.thrift +++ b/torch/_export/serde/export_schema.thrift @@ -1,5 +1,5 @@ // @generated by update_schema.py -// checksum<<00d94226d15b290b97bd49f9ff12bbfe04b7252c75d2d1bae66d1756fd9b8517>> +// checksum<<8ec417b91fce9bc5d8447e99c26225f653583faf7c12cbaca355bda27f997fa1>> namespace py3 torch._export namespace cpp2 torch._export.schema @@ -336,14 +336,6 @@ struct ExportedProgram { 60: SchemaVersion schema_version; 70: list verifiers; 80: string torch_version; - 90: map tensor_paths; - 100: map constant_paths; -} - -struct Model { - 10: string name; - 80: ExportedProgram program; - 90: map variants; } struct AOTInductorModelPickleData { diff --git a/torch/_export/serde/schema.py b/torch/_export/serde/schema.py index 2cf95d44ade58..a24c63b924c25 100644 --- a/torch/_export/serde/schema.py +++ b/torch/_export/serde/schema.py @@ -9,7 +9,7 @@ # NOTE: Please update this value if any modifications are made to the schema -SCHEMA_VERSION = (8, 10) +SCHEMA_VERSION = (8, 11) TREESPEC_VERSION = 1 @@ -443,37 +443,12 @@ class ExportedProgram: verifiers: Annotated[list[str], 70] = field(default_factory=list) torch_version: Annotated[str, 80] = "<=2.4" - # key is the FQN of tensor in exported program - # value is the archive path of tensor payloads - # e.g. 
"L__self__linear.weight" : "/data/tensor/weight_1" - tensor_paths: Annotated[dict[str, str], 90] = field(default_factory=dict) - - # key is the FQN of constant in exported program (constant tensor or torchbind objs) - # value is the archive path of serialized constants - constant_paths: Annotated[dict[str, str], 100] = field(default_factory=dict) - ######################################################################### # Container types for inference tasks, not being used directly for export. ######################################################################### -# This is the top-level model definition that be will serialized into the package -@dataclass -class Model: - # unique identifier of the model in the package, e.g. local, remote, merge - name: Annotated[str, 10] - - # the main program exported from torch.export() - program: Annotated[ExportedProgram, 80] - - # a collection of ExportedPrograms that are related to the same model - # They can be used for different purposes, e.g. - # - different methods such as "encode" and "decode" for the same model - # - different delegates such as "aoti_sm80" and "aoti_sm90" - variants: Annotated[dict[str, ExportedProgram], 90] - - # # The structure is used to serialize instances of AOTInductorModel to pass # them from the publishing pipeline to the predictor. diff --git a/torch/_export/serde/schema.yaml b/torch/_export/serde/schema.yaml index d53eeaebf7c84..c1708546a8582 100644 --- a/torch/_export/serde/schema.yaml +++ b/torch/_export/serde/schema.yaml @@ -1,5 +1,5 @@ # @generated by update_schema.py -# checksum<> +# checksum<> AOTInductorModelPickleData: kind: struct fields: @@ -131,12 +131,6 @@ ExportedProgram: torch_version: type: str default: <=2.4 - tensor_paths: - type: Dict[str, str] - default: '{}' - constant_paths: - type: Dict[str, str] - default: '{}' ExternKernelNode: kind: struct fields: @@ -299,15 +293,6 @@ MemoryFormat: ChannelsLast: 2 ChannelsLast3d: 3 PreserveFormat: 4 -Model: - kind: struct - fields: - name: - type: str - program: - type: ExportedProgram - variants: - type: Dict[str, ExportedProgram] ModuleCallEntry: kind: struct fields: @@ -538,5 +523,5 @@ UserOutputSpec: type: Argument SCHEMA_VERSION: - 8 -- 10 +- 11 TREESPEC_VERSION: 1 diff --git a/torch/_functorch/_aot_autograd/frontend_utils.py b/torch/_functorch/_aot_autograd/frontend_utils.py index 394f42a04aafb..55b84c12df829 100644 --- a/torch/_functorch/_aot_autograd/frontend_utils.py +++ b/torch/_functorch/_aot_autograd/frontend_utils.py @@ -221,23 +221,10 @@ def _get_attributes(mod): # return any attributes of a module that are not standard attributes return {k: v for k, v in mod.__dict__.items() if k not in STD_ATTRS} - def _get_all_module_attributes(mod): - # return attributes from all modules and submodules - result = {} - for name, submodule in mod.named_modules(): - result[name] = _get_attributes(submodule) - return result - - def _restore_all_module_attributes(mod, snapshot): - # restore attributes to all modules and submodules - for name, submodule in mod.named_modules(): - if name in snapshot: - submodule.__dict__.update(snapshot[name]) - # save state of attributes before enter snapshot = pytree.tree_map( lambda x: x, - _get_all_module_attributes(mod), + _get_attributes(mod), is_leaf=lambda x: type(x) in _pytree_subclasses_that_lose_info, ) try: @@ -249,54 +236,41 @@ def _restore_all_module_attributes(mod, snapshot): def _collect_assigned_tensor_attributes(kp, v, _v): if _v is not v: - module_name, attr, *rest = kp + attr, *rest = kp if isinstance(v, 
torch.Tensor): - module_prefix = f"{module_name.key}." if module_name.key else "" assigned_tensor_attributes.append( - f"self.{module_prefix}{attr.key}{pytree.keystr(rest)}" + f"self.{attr.key}{pytree.keystr(rest)}" ) # TODO(avik): Assigning all other types are allowed right now. # Maybe in the future we want to limit this to primitive types? return v - new_attrs = _get_all_module_attributes(mod) - - # Check for added/deleted attributes across all modules - for module_name in snapshot.keys() | new_attrs.keys(): - old_module_attrs = snapshot.get(module_name, {}) - new_module_attrs = new_attrs.get(module_name, {}) - - if len(new_module_attrs) != len(old_module_attrs): - added_attrs = new_module_attrs.keys() - old_module_attrs.keys() - deleted_attrs = old_module_attrs.keys() - new_module_attrs.keys() - - module_prefix = f"self.{module_name}." if module_name else "self." - - if len(added_attrs) > 0: - formatted_attrs = [f"{module_prefix}{attr}" for attr in added_attrs] - raise ValueError( - f"During torch.export, following attrs were created in the model.forward: {formatted_attrs} " - f"Such attributes must be registered as buffers using the `register_buffer` " - f"API and must be initialized at model.__init__ " - f"(https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.register_buffer)." - ) + new_attrs = _get_attributes(mod) + if len(new_attrs) != len(snapshot): + added_attrs = new_attrs.keys() - snapshot.keys() + deleted_attrs = snapshot.keys() - new_attrs.keys() + + if len(added_attrs) > 0: + raise ValueError( + f"During torch.export, following attrs were created in the model.forward: {added_attrs} " + f"Such attributes must be registered as buffers using the `register_buffer` " + f"API and must be initialized at model.__init__ " + f"(https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.register_buffer)." + ) - if len(deleted_attrs) > 0: - formatted_attrs = [ - f"{module_prefix}{attr}" for attr in deleted_attrs - ] - raise ValueError( - f"During torch.export, following attrs were deleted in the model.forward: {formatted_attrs} " - f"Such attributes must be registered as buffers using the `register_buffer` " - f"API and must be initialized at model.__init__ " - f"(https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.register_buffer)." - ) + if len(deleted_attrs) > 0: + raise ValueError( + f"During torch.export, following attrs were deleted in the model.forward: {deleted_attrs} " + f"Such attributes must be registered as buffers using the `register_buffer` " + f"API and must be initialized at model.__init__ " + f"(https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.register_buffer)." + ) pytree.tree_map_with_path( _collect_assigned_tensor_attributes, snapshot, new_attrs ) # restore state of all attributes (including, e.g., of primitive types) - _restore_all_module_attributes(mod, snapshot) + mod.__dict__.update(snapshot) if assigned_tensor_attributes: if len(assigned_tensor_attributes) > 1: diff --git a/torch/_functorch/partitioners.py b/torch/_functorch/partitioners.py index cb524eae36407..9030cfc3c17ca 100644 --- a/torch/_functorch/partitioners.py +++ b/torch/_functorch/partitioners.py @@ -2598,6 +2598,88 @@ def has_same_nodes(joint_graph): return saved_values +def thread_graphsafe_rng_from_hops(module, is_backward): + """ + Graph-safe RNG lets torch.compile use CUDA Graphs for graphs with RNG ops. 
+ For graphs without HOPs, the partitioner adds placeholder nodes + fwd_rng_state_* and bw_rng_state_* to the forward and backward graphs. At + runtime, the AOTDispatcher retrieves these RNG states and passes them to the + compiled graphs. + + This works well for no-HOP graphs. With HOPs, the partitioner runs + recursively: it first partitions the HOP (producing forward/backward HOP + subgraphs) and then stitches them back into the outer joint graph. For HOPs + that contain RNG ops, the outer joint graph now includes HOP subgraph + modules with extra RNG placeholders. We must thread these placeholders + through the outer module partitioned forward and backward graphs—this + function does exactly that. It collects the RNG placeholder nodes from the + HOPs and creates corresponding placeholders in the outer forward and + backward graphs. + + There is a catch: for a short period, the joint graph is in a “bad” state. + The HOP subgraphs expect additional inputs (because of the new + placeholders), but the outer graph call sites don't yet provide them. We + can't fix this in the joint graph because the joint graph's input signature + is fixed (primals, tangents). As a compromise, we keep the joint graph in + somewhat of a bad state for some time and, once the outer forward and + backward graphs are partitioned, insert the corresponding RNG placeholders + and wire up the calls. + """ + + rng_count = 0 + rng_string = "bwd_rng_state" if is_backward else "fwd_rng_state" + last_input = next(reversed(module.graph.find_nodes(op="placeholder"))) + for hop_node in module.graph.find_nodes( + op="call_function", target=torch.ops.higher_order.invoke_subgraph + ): + subgraph = getattr(module, hop_node.args[0].target) + if isinstance(subgraph, fx.GraphModule): + new_rng_inputs = [] + for idx, placeholder_node in enumerate( + subgraph.graph.find_nodes(op="placeholder") + ): + if rng_string in placeholder_node.name: + # Found a rng state placeholder in the hop graph, lets add + # the corresponding node in the outer graph + with module.graph.inserting_after(last_input): + rng_state = module.graph.placeholder( + f"{rng_string}_{rng_count}" + ) + rng_count += 1 + rng_state.meta["val"] = placeholder_node.meta["val"] + last_input = rng_state + new_rng_inputs.append(rng_state) + + if new_rng_inputs: + # Pass on the new args that include the new_rng_inputs + with module.graph.inserting_after(hop_node): + new_hop_node_with_fixed_args = module.graph.create_node( + "call_function", + torch.ops.higher_order.invoke_subgraph, + (*hop_node.args, *new_rng_inputs), # type: ignore[arg-type] + {}, + ) + hop_node.replace_all_uses_with( + new_hop_node_with_fixed_args, propagate_meta=True + ) + + # Setup the eager_input_vals + eager_vals = hop_node.meta.get("eager_input_vals") + if eager_vals: + eager_args, eager_kwargs = eager_vals + new_eager_args = ( + *eager_args, + *[inp.meta["val"] for inp in new_rng_inputs], + ) + new_hop_node_with_fixed_args.meta["eager_input_vals"] = ( + new_eager_args, + eager_kwargs, + ) + module.graph.erase_node(hop_node) + + return module + + def min_cut_rematerialization_partition( joint_module: fx.GraphModule, _joint_inputs, @@ -2767,6 +2849,9 @@ def classify_nodes(joint_module, static_lifetime_input_indices): fw_module = raise_getitems(fw_module) bw_module = raise_getitems(bw_module) + fw_module = thread_graphsafe_rng_from_hops(fw_module, is_backward=False) + bw_module = thread_graphsafe_rng_from_hops(bw_module, is_backward=True) + if AOT_PARTITIONER_DEBUG: # Calculate sorted sizes of saved values 
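# Hypothetical, self-contained sketch of the graph surgery described in the
# thread_graphsafe_rng_from_hops docstring above: add a placeholder after the last
# existing input and rebuild a call node so it also consumes that placeholder. The
# toy function and the operator.add target are assumptions for illustration only.
import operator
import torch
import torch.fx as fx

def outer(x):
    return x + 0

gm = fx.symbolic_trace(outer)
graph = gm.graph
last_input = next(reversed(graph.find_nodes(op="placeholder")))
with graph.inserting_after(last_input):
    rng_state = graph.placeholder("fwd_rng_state_0")  # name mirrors the pass above

for node in list(graph.find_nodes(op="call_function", target=operator.add)):
    with graph.inserting_after(node):
        new_node = graph.create_node(
            "call_function", operator.add, (node.args[0], rng_state), {}
        )
    node.replace_all_uses_with(new_node)
    graph.erase_node(node)

gm.recompile()
# The outer graph now takes the extra input and threads it to the call site.
assert torch.equal(gm(torch.ones(3), torch.tensor(2.0)), torch.full((3,), 3.0))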
sorted_sizes = sorted([(_size_of(i), str(i)) for i in saved_values]) diff --git a/torch/_higher_order_ops/invoke_subgraph.py b/torch/_higher_order_ops/invoke_subgraph.py index 9b775a03a1460..85a99d93f041d 100644 --- a/torch/_higher_order_ops/invoke_subgraph.py +++ b/torch/_higher_order_ops/invoke_subgraph.py @@ -74,8 +74,11 @@ def __call__( ) assert all( - isinstance(o, (torch.Tensor, int, torch.SymInt)) for o in operands - ), f"invoke_subgraph operands must be a list of tensors/ints/SymInts {operands}" + isinstance(o, (torch.Tensor, int, torch.SymInt, torch.Generator)) + for o in operands + ), ( + f"invoke_subgraph operands must be a list of tensors/ints/SymInts/Generator {operands}" + ) return super().__call__(subgraph, identifier, *operands) diff --git a/torch/_inductor/analysis/profile_analysis.py b/torch/_inductor/analysis/profile_analysis.py index 3a30380a656aa..134d06528c0df 100644 --- a/torch/_inductor/analysis/profile_analysis.py +++ b/torch/_inductor/analysis/profile_analysis.py @@ -670,12 +670,64 @@ def dump(self, out: str) -> None: with open(out, "w") as f: json.dump(self.data, f) + def combine_with(self, other: "JsonProfile") -> "JsonProfile": + """ + Combine this profile with another profile by merging their trace events. + Returns a new JsonProfile object with combined data. + """ + # Create a new combined data structure + combined_data = { + "traceEvents": self.data["traceEvents"] + other.data["traceEvents"], + "deviceProperties": self.data.get("deviceProperties", []), + } + + # Merge device properties, avoiding duplicates + other_device_props = other.data.get("deviceProperties", []) + existing_device_ids = OrderedSet( + [dev["id"] for dev in combined_data["deviceProperties"]] + ) + + for device_prop in other_device_props: + if device_prop["id"] not in existing_device_ids: + combined_data["deviceProperties"].append(device_prop) + + # Copy any other top-level properties from the first profile + for key, value in self.data.items(): + if key not in combined_data: + combined_data[key] = value + + import os + + # Create a temporary file to write the combined data + import tempfile + + with tempfile.NamedTemporaryFile( + mode="w", suffix=".json", delete=False + ) as tmp_file: + json.dump(combined_data, tmp_file) + tmp_path = tmp_file.name + + try: + # Create new JsonProfile from the combined data + combined_profile = JsonProfile( + tmp_path, + benchmark_name=f"{self.benchmark_name or 'Profile1'}_+_{other.benchmark_name or 'Profile2'}", + dtype=self.dtype or other.dtype, + ) + return combined_profile + finally: + # Clean up temporary file + os.unlink(tmp_path) + class ParseException(RuntimeError): pass def main() -> None: + """ + Main function for the profile analysis script. + """ import argparse parser = argparse.ArgumentParser() @@ -709,6 +761,14 @@ def main() -> None: metavar=("input_file", "dtype"), help="Run analysis on a single trace, specified as ", ) + parser.add_argument( + "--combine", + nargs="+", + metavar=("input_files", "output_file"), + help="Combine multiple profiles into a single profile by merging trace events. Specify as \ + [input_file3 ...] . 
The last argument is the output file, all preceding arguments are \ +input files to combine.", + ) args = parser.parse_args() if args.diff: @@ -734,6 +794,24 @@ def main() -> None: p = JsonProfile(args.augment_trace[0], dtype=args.augment_trace[2]) p.augment_trace() p.dump(args.augment_trace[1]) + if args.combine: + input_files = args.combine[:-1] # All arguments except the last one + output_file = args.combine[-1] # Last argument is the output file + + if len(input_files) < 2: + print("Error: At least 2 input files are required for combining") + return + + # Load the first profile + combined = JsonProfile(input_files[0], dtype=None) + + # Iteratively combine with all other profiles + for input_file in input_files[1:]: + profile = JsonProfile(input_file, dtype=None) + combined = combined.combine_with(profile) + + combined.dump(output_file) + print(f"Successfully combined {', '.join(input_files)} into {output_file}") if __name__ == "__main__": diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py index d5e142939c9c5..40c7a1d66c3cb 100644 --- a/torch/_inductor/codecache.py +++ b/torch/_inductor/codecache.py @@ -2398,9 +2398,44 @@ def _pad_to_alignment(raw_bytes: bytes) -> bytes: os.remove(o_file) if use_mmap_weights: - import resource - page_size_ = resource.getpagesize() + def get_page_size() -> int: + # Don't use resource.getpagesize() on Windows, as it is a Unix specific package + # as seen in https://docs.python.org/2/library/resource.html + if _IS_WINDOWS: + from ctypes import ( # type: ignore[attr-defined] + byref, + Structure, + windll, + ) + from ctypes.wintypes import DWORD, LPVOID, WORD + + class SYSTEM_INFO(Structure): + _fields_ = [ + ("wProcessorArchitecture", WORD), + ("wReserved", WORD), + ("dwPageSize", DWORD), + ("lpMinimumApplicationAddress", LPVOID), + ("lpMaximumApplicationAddress", LPVOID), + ("dwActiveProcessorMask", DWORD), + ("dwNumberOfProcessors", DWORD), + ("dwProcessorType", DWORD), + ("dwAllocationGranularity", DWORD), + ("wProcessorLevel", WORD), + ("wProcessorRevision", WORD), + ] + + si = SYSTEM_INFO() + windll.kernel32.GetSystemInfo(byref(si)) + sys_page_size = si.dwPageSize + else: + import resource + + sys_page_size = resource.getpagesize() + + return sys_page_size + + page_size_ = get_page_size() page_size = max(16384, page_size_) with open(output_so, "a+b") as f_so: diff --git a/torch/_inductor/codegen/cpp_gemm_template.py b/torch/_inductor/codegen/cpp_gemm_template.py index 0fedaf12203f2..bfcebbd6a3810 100644 --- a/torch/_inductor/codegen/cpp_gemm_template.py +++ b/torch/_inductor/codegen/cpp_gemm_template.py @@ -1094,6 +1094,18 @@ def get_padded_size(n, block_n, k, should_block_weight): new_size = [padded_n // block_n, k, block_n] return new_size, padded_n + @staticmethod + def _maybe_remove_storage_offset(node: ir.IRNode): + if node.get_layout().offset == 0: + return node + # node may be contiguous but still have a non-zero storage offset. + # GEMM_TEMPLATE emits code like: + # W.data_ptr[node.offset + ...] + # but runtime W.data_ptr (after normalize_shapes()) already includes this offset. + # To avoid double-offsetting, we remove the offset in the node also in the generated code. + # W.data_ptr[...] + return ir.ExternKernel.copy_input(node) + @classmethod def prep_weight( cls, @@ -1149,6 +1161,7 @@ def prep_weight( elif isinstance(W, ir.IRNode): # Require W layout to be fixed & contiguous, happens inplace. 
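# Illustrative sketch (not part of the diff) of the double-offset hazard that
# _maybe_remove_storage_offset guards against: a view's data_ptr() already points at
# element storage_offset(), so adding the node's offset again in generated indexing
# would skip the offset twice. Plain eager check, no Inductor involved:
import torch

base = torch.arange(10, dtype=torch.float32)
w = base[2:]  # contiguous view with a non-zero storage offset
assert w.is_contiguous() and w.storage_offset() == 2
assert w.data_ptr() == base.data_ptr() + w.storage_offset() * w.element_size()
# Indexing from w.data_ptr() with an extra "+ 2" would therefore read base[4:], not base[2:].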
ir.ExternKernel.require_contiguous(W) + new_inputs[1] = cls._maybe_remove_storage_offset(W) if not skip_int8_compensation and _is_int8_gemm(new_inputs): BCompensate = None diff --git a/torch/_inductor/codegen/cpp_wrapper_cpu.py b/torch/_inductor/codegen/cpp_wrapper_cpu.py index ea1cf09c1b8d0..6fa08465ce2b8 100644 --- a/torch/_inductor/codegen/cpp_wrapper_cpu.py +++ b/torch/_inductor/codegen/cpp_wrapper_cpu.py @@ -1456,51 +1456,19 @@ def codegen_dynamic_scalar(self, node): # record in unbacked_symbol_decls so we won't generate a declaration of the symbol again self.unbacked_symbol_decls.add(str(node.sym)) - def codegen_dynamic_select_index(self, node, clamp): + def codegen_dynamic_select_index(self, node): index_cpp_str = self.val_to_arg_str_for_prim_type(node.index, int) - size_cpp_str = self.val_to_arg_str_for_prim_type(node.size, int) - # codegen index - sym = node.unbacked_offset_symbol - index_str = ( + index_compute_str = ( f"{index_cpp_str} < 0 ? {index_cpp_str} + " - f"{self.val_to_arg_str_for_prim_type(node.size, int)}: {index_cpp_str}" + f"{self.val_to_arg_str_for_prim_type(node.size, int)}: {index_cpp_str}" ) - self.writeline(f"auto {sym}_index = {index_str};") - index_str_clamped = ( - f"{sym}_index < 0 ? 0 : ({sym}_index > {size_cpp_str} ? {size_cpp_str} : {sym}_index)" - if clamp - else f"{sym}_index" - ) - self.writeline(f"auto {sym}_index_clamped = {index_str_clamped};") self.writeline( - f"auto {sym} = {self.val_to_arg_str_for_prim_type(node.base_offset, int)} + " - f"{self.val_to_arg_str_for_prim_type(node.base_dim_stride, int)} * {sym}_index_clamped;" + f"auto {node.unbacked_offset_symbol} = {self.val_to_arg_str_for_prim_type(node.base_offset, int)} + " + f"{self.val_to_arg_str_for_prim_type(node.base_dim_stride, int)} * ({index_compute_str});" ) # record in unbacked_symbol_decls so we won't generate a declaration of the symbol again - self.unbacked_symbol_decls.add(str(sym)) - - def codegen_dynamic_slice_size(self, node): - start_cpp_str = self.val_to_arg_str_for_prim_type(node.start, int) - end_cpp_str = self.val_to_arg_str_for_prim_type(node.end, int) - size_cpp_str = self.val_to_arg_str_for_prim_type(node.size, int) - sym = node.unbacked_size_symbol - - def codegen_clamp(index_str, start=True): - suf = "start" if start else "end" - index_ = f"{sym}_{suf}_index" - self.writeline( - f"auto {index_} = {index_str} < 0 ? {index_str} + {size_cpp_str} : {index_str};" - ) - self.writeline( - f"auto {sym}_{suf}_clamped = {index_} < 0 ? 0 : ({index_} > {size_cpp_str} ? {size_cpp_str} : {index_});" - ) - - codegen_clamp(start_cpp_str, start=True) - codegen_clamp(end_cpp_str, start=False) - self.writeline(f"auto {sym}_raw = {sym}_end_clamped - {sym}_start_clamped;") - self.writeline(f"auto {sym} = {sym}_raw < 0 ? 0 : {sym}_raw;") - self.unbacked_symbol_decls.add(str(sym)) + self.unbacked_symbol_decls.add(str(node.unbacked_offset_symbol)) def make_buffer_free(self, buffer): return ( diff --git a/torch/_inductor/codegen/wrapper.py b/torch/_inductor/codegen/wrapper.py index b6b8075e92846..ee63e7e9b085f 100644 --- a/torch/_inductor/codegen/wrapper.py +++ b/torch/_inductor/codegen/wrapper.py @@ -228,11 +228,18 @@ def writeline(line: str, example_grid: Optional[str] = None): key=lambda x: len(x[1].kwargs), reverse=True, ): + guardslist = [] if c.kwargs: - guards = [ - f"meta['{name}'] == {val}" for name, val in c.kwargs.items() - ] - guards = " and ".join(guards) + # Remove AMD specific kwargs. 
+ for kwarg in c.kwargs: + if kwarg not in [ + "matrix_instr_nonkdim", + "waves_per_eu", + "kpack", + ]: + guardslist.append(f"meta['{kwarg}'] == {c.kwargs[kwarg]}") + if guardslist: + guards = " and ".join(guardslist) else: guards = "True" # for configs with empty kwargs grid, example_grid = determine_grid(grid, example_grid) @@ -1887,33 +1894,14 @@ def codegen_multi_output(self, node: ir.MultiOutput): arg_name = node.input_name(0) self.writeline(MultiOutputLine(self, result_name, arg_name, node.indices)) - def codegen_dynamic_select_index(self, node, clamp): + def codegen_dynamic_select_index(self, node): index_str = f"{node.index} + {node.size} if {node.index} < 0 else {node.index}" - if clamp: - index_str = f"max(0, min({node.size}, {index_str}))" self.writeline( f"{node.unbacked_offset_symbol} = {node.base_offset} + {node.base_dim_stride} * ({index_str})" ) # record in unbacked_symbol_decls so we won't generate a declaration of the symbol again self.unbacked_symbol_decls.add(str(node.unbacked_offset_symbol)) - def codegen_dynamic_slice_size(self, node): - def clamp_index(x): - pos = self.codegen_sizevar(sympy.Max(0, sympy.Min(x, node.size))) - neg = self.codegen_sizevar( - sympy.Max(0, sympy.Min(x + node.size, node.size)) - ) - return f"{pos} if {x} >= 0 else {neg}" - - # codegen start, end - sym = node.unbacked_size_symbol - start = clamp_index(node.start) - end = clamp_index(node.end) - self.writeline(f"{sym}_start = {start}") - self.writeline(f"{sym}_end = {end}") - self.writeline(f"{sym} = max(0, {sym}_end - {sym}_start)") - self.unbacked_symbol_decls.add(str(node.unbacked_size_symbol)) - def codegen_dynamic_scalar(self, node): (data,) = (t.codegen_reference() for t in node.inputs) if len(node.keypath) == 0: diff --git a/torch/_inductor/codegen/wrapper_fxir.py b/torch/_inductor/codegen/wrapper_fxir.py index 5cf0340012e52..741ff85f97f80 100644 --- a/torch/_inductor/codegen/wrapper_fxir.py +++ b/torch/_inductor/codegen/wrapper_fxir.py @@ -10,6 +10,11 @@ import sympy import torch +from torch._export.passes._node_metadata_hook import ( + _node_metadata_hook, + _set_node_metadata_hook, +) +from torch._export.utils import _detect_fake_mode_from_gm from torch._higher_order_ops.triton_kernel_wrap import ( TraceableTritonKernelWrapper, tracing_triton_hopifier_singleton, @@ -198,11 +203,6 @@ def _fake_tensor( device=device, ) - def _create_meta_from_buffer( - self, node: torch.fx.Node, buffer: CodegenBuffer - ) -> None: - node.meta["val"] = buffer.get_example() - def _create_as_strided( self, input_node: torch.fx.Node, @@ -266,6 +266,31 @@ def _generate_graph_inputs(self) -> None: Converts graph inputs to FX placeholders. """ + for node in V.graph.module.graph.find_nodes(op="placeholder"): # type: ignore[operator, union-attr] + name = node.name + if name in V.graph.graph_inputs: + ir_node = V.graph.graph_inputs[name] + + # Introduce a new symbol for constant inputs. + buffer = ( + SymbolBuffer(sympy.Symbol(name, is_integer=True)) + if isinstance(ir_node, (int, float, sympy.Integer, sympy.Float)) + else self._get_buffer(ir_node) + ) + placeholder_node = self.gm.graph.placeholder(buffer.get_name()) + placeholder_node.meta["val"] = buffer.get_example() + self._record_allocation(buffer, placeholder_node) + + elif V.aot_compilation: + # Create dummy input nodes to match the input signature + self.gm.graph.placeholder(name) + + def _generate_graph_input_shapes(self) -> None: + """ + Generate nodes creating symints that are part of graph input + shape/strides. 
+ """ + def _codegen_symbol( sym_or_exp: Union[sympy.Symbol, sympy.Expr], base_node: torch.fx.Node, @@ -273,16 +298,12 @@ def _codegen_symbol( dim: int, ) -> None: if isinstance(sym_or_exp, sympy.Symbol): - buffer = SymbolBuffer(sym_or_exp) - - if buffer.get_name() in self.buffer_to_node: + if sym_or_exp in self.expr_to_proxy: return size_node = self.gm.graph.call_function(target, (base_node, dim)) size_proxy = torch.fx.Proxy(size_node, tracer=self.tracer) - self._create_meta_from_buffer(size_node, buffer) - self._record_allocation(buffer, size_node) self.expr_to_proxy[sym_or_exp] = size_proxy elif isinstance(sym_or_exp, sympy.Integer): @@ -295,26 +316,10 @@ def _codegen_symbol( name = node.name if name in V.graph.graph_inputs: ir_node = V.graph.graph_inputs[name] - - # Introduce a new symbol for constant inputs. - buffer = ( - SymbolBuffer(sympy.Symbol(name, is_integer=True)) - if isinstance(ir_node, (int, float, sympy.Integer, sympy.Float)) - else self._get_buffer(ir_node) - ) - placeholder_node = self.gm.graph.placeholder(buffer.get_name()) - self._create_meta_from_buffer(placeholder_node, buffer) - self._record_allocation(buffer, placeholder_node) - - # not sure if this is needed... - if isinstance(ir_node, (sympy.Symbol)): - placeholder_proxy = torch.fx.Proxy( - placeholder_node, tracer=self.tracer - ) - self.expr_to_proxy[ir_node] = placeholder_proxy - - # Generate nodes for dynamic input sizes/strides. if isinstance(ir_node, ir.TensorBox): + buffer = self._get_buffer(ir_node) + placeholder_node = self.buffer_to_node[buffer.get_name()] + for dim, size in enumerate(ir_node.get_size()): _codegen_symbol( size, placeholder_node, torch.ops.aten.sym_size.int, dim @@ -324,10 +329,6 @@ def _codegen_symbol( stride, placeholder_node, torch.ops.aten.sym_stride.int, dim ) - elif V.aot_compilation: - # Create dummy input nodes to match the input signature - self.gm.graph.placeholder(name) - def _generate_graph_constants(self) -> None: for name, value in V.graph.constants.items(): node = self.gm.graph.get_attr(name) @@ -346,7 +347,7 @@ def generate_to_buffer(node: ir.IRNode) -> Optional[BufferLike]: return node elif isinstance(node, ir.NoneAsConstantBuffer): return None - elif isinstance(node, ir.StorageBox): + elif isinstance(node, ir.MutableBox): return generate_to_buffer(node.data) elif isinstance(node, ir.ReinterpretView): # We need to introduce a new symbol if the output is a ReinterpretView. @@ -397,24 +398,32 @@ def generate(self) -> torch.fx.GraphModule: self._generate_graph_inputs() self._generate_graph_constants() - # Generate FX IR from Wrapper IR lines. - for line in self.lines: - if isinstance(line, WrapperLine): - line.codegen_fx(self)(line) - elif isinstance(line, LineContext): - # Ignore line context in FX IR. - pass - else: - raise NotImplementedError( - textwrap.dedent( - f""" - Found line of unrecognized type '{type(line)}': - '{line}' - - FX conversion only supports Wrapper IR lines. - """ + fake_mode = _detect_fake_mode_from_gm(self.gm) + + with _set_node_metadata_hook( + self.gm, + functools.partial(_node_metadata_hook, fake_mode=fake_mode), + ): + self._generate_graph_input_shapes() + + # Generate FX IR from Wrapper IR lines. + for line in self.lines: + if isinstance(line, WrapperLine): + line.codegen_fx(self)(line) + elif isinstance(line, LineContext): + # Ignore line context in FX IR. + pass + else: + raise NotImplementedError( + textwrap.dedent( + f""" + Found line of unrecognized type '{type(line)}': + '{line}' + + FX conversion only supports Wrapper IR lines. 
+ """ + ) ) - ) self._generate_output() self.gm.recompile() @@ -512,7 +521,6 @@ def _generate_allocate(self, line: WrapperLine) -> None: ) assert name node.name = name - self._create_meta_from_buffer(node, buffer) self._record_allocation(buffer, node) def _generate_comment(self, line: WrapperLine) -> None: @@ -583,7 +591,6 @@ def _generate_reinterpret_helper( # Map ReinterpretView to as_strided. result_node = self._create_as_strided(input_node, size, stride, offset) result_node.name = name - result_node.meta["val"] = layout.get_example() self._record_allocation(result_buffer, result_node) def _generate_reuse(self, line: WrapperLine) -> None: @@ -606,7 +613,6 @@ def _generate_reuse(self, line: WrapperLine) -> None: or old.get_offset() != offset ): result_node = self._create_as_strided(old_node, size, stride, offset) - self._create_meta_from_buffer(result_node, new) self._record_allocation(new, result_node) @@ -635,7 +641,6 @@ def _generate_multi_output(self, line: WrapperLine) -> None: idx = inds[0] node = self.gm.graph.call_function(operator.getitem, args=(arg_node, idx)) - node.meta["val"] = arg_node.meta["val"][idx] node.name = line.result_name self.buffer_to_node[line.result_name] = node @@ -778,14 +783,6 @@ def _generate_extern_kernel_common( fx_node.name = result_buffer self.buffer_to_node[result_buffer] = fx_node - arg_tensors = [ - arg.meta["val"] if isinstance(arg, torch.fx.Node) else arg - for arg in args - ] - - # Run the operation to propagate metadata. - fx_node.meta["val"] = op(*arg_tensors, **kwargs) - def _generate_kernel_call(self, line: WrapperLine) -> None: assert isinstance(line, KernelCallLine) if not line.triton: diff --git a/torch/_inductor/cpp_builder.py b/torch/_inductor/cpp_builder.py index 0024322a8e9f5..a209cbf008a42 100644 --- a/torch/_inductor/cpp_builder.py +++ b/torch/_inductor/cpp_builder.py @@ -2,6 +2,7 @@ # The design document please check this RFC: https://github.com/pytorch/pytorch/issues/124245 import copy +import ctypes import errno import functools import json @@ -18,7 +19,7 @@ import textwrap import warnings from collections.abc import Sequence -from ctypes import cdll +from ctypes import cdll, wintypes from ctypes.util import find_library from pathlib import Path from typing import Any, Optional, Union @@ -141,11 +142,201 @@ def check_compiler_exist_windows(compiler: str) -> None: pass +class WinPeFileVersionInfo: + def __init__(self, file_path: str) -> None: + self.file_path = file_path + self.version_dll = ctypes.WinDLL("version.dll") # type: ignore[attr-defined] + self._setup_functions() + self._get_version_info() + + def _setup_functions(self) -> None: + self.version_dll.GetFileVersionInfoSizeW.argtypes = [ + wintypes.LPCWSTR, + wintypes.LPDWORD, + ] + self.version_dll.GetFileVersionInfoSizeW.restype = wintypes.DWORD + + self.version_dll.GetFileVersionInfoW.argtypes = [ + wintypes.LPCWSTR, + wintypes.DWORD, + wintypes.DWORD, + wintypes.LPVOID, + ] + self.version_dll.GetFileVersionInfoW.restype = wintypes.BOOL + + self.version_dll.VerQueryValueW.argtypes = [ + wintypes.LPCVOID, + wintypes.LPCWSTR, + ctypes.POINTER(ctypes.c_void_p), + ctypes.POINTER(wintypes.UINT), + ] + self.version_dll.VerQueryValueW.restype = wintypes.BOOL + + def _get_version_info(self) -> None: + dummy = wintypes.DWORD() + size = self.version_dll.GetFileVersionInfoSizeW( + self.file_path, ctypes.byref(dummy) + ) + + if size == 0: + raise RuntimeError(f"Can't get version info size of {self.file_path}.") + + self.version_info = ctypes.create_string_buffer(size) + success = 
self.version_dll.GetFileVersionInfoW( + self.file_path, 0, size, self.version_info + ) + + if not success: + raise RuntimeError(f"Can't get version info of {self.file_path}.") + + def get_language_id(self) -> int: + lp_buffer = ctypes.c_void_p() + u_len = wintypes.UINT() + + success = self.version_dll.VerQueryValueW( + self.version_info, + r"\VarFileInfo\Translation", + ctypes.byref(lp_buffer), + ctypes.byref(u_len), + ) + + if not success or u_len.value == 0: + return 0 + + translations = [] + lang_id: int = 0 + if lp_buffer.value is not None: + for i in range(u_len.value // 4): + offset = i * 4 + data = ctypes.string_at(lp_buffer.value + offset, 4) + lang_id = int.from_bytes(data[:2], "little") + code_page = int.from_bytes(data[2:4], "little") + translations.append((lang_id, code_page)) + else: + # Handle the case where lp_buffer.value is None + print("Buffer is None") + + return lang_id + + +@functools.cache +def check_msvc_cl_language_id(compiler: str) -> None: + """ + Torch.compile() is only work on MSVC with English language pack well. + Check MSVC's language pack: https://github.com/pytorch/pytorch/issues/157673#issuecomment-3051682766 + """ + + def get_msvc_cl_path() -> tuple[bool, str]: + """ + Finds the path to cl.exe using vswhere.exe. + """ + vswhere_path = os.path.join( + os.environ.get("ProgramFiles(x86)", "C:\\Program Files (x86)"), + "Microsoft Visual Studio", + "Installer", + "vswhere.exe", + ) + if not os.path.exists(vswhere_path): + vswhere_path = os.path.join( + os.environ.get("ProgramFiles", "C:\\Program Files"), + "Microsoft Visual Studio", + "Installer", + "vswhere.exe", + ) + if not os.path.exists(vswhere_path): + return False, "" # vswhere.exe not found + + try: + # Get the Visual Studio installation path + cmd = [ + vswhere_path, + "-latest", + "-prerelease", + "-products", + "*", + "-requires", + "Microsoft.VisualStudio.Component.VC.Tools.x86.x64", + "-property", + "installationPath", + ] + vs_install_path = subprocess.check_output( + cmd, text=True, encoding="utf-8" + ).strip() + + if not vs_install_path: + return False, "" + + # Find the latest MSVC toolset version within the installation + msvc_tools_path = os.path.join(vs_install_path, "VC", "Tools", "MSVC") + if not os.path.exists(msvc_tools_path): + return False, "" + + # Get the latest toolset version directory + toolset_versions = [ + d + for d in os.listdir(msvc_tools_path) + if os.path.isdir(os.path.join(msvc_tools_path, d)) + ] + if not toolset_versions: + return False, "" + latest_toolset_version = sorted(toolset_versions, reverse=True)[0] + + # Construct the full cl.exe path + cl_path = os.path.join( + msvc_tools_path, + latest_toolset_version, + "bin", + "HostX64", + "x64", + "cl.exe", + ) + if os.path.exists(cl_path): + return True, cl_path + else: + # Fallback for older versions or different architectures if needed + cl_path = os.path.join( + msvc_tools_path, + latest_toolset_version, + "bin", + "HostX86", + "x86", + "cl.exe", + ) + if os.path.exists(cl_path): + return True, cl_path + + except (subprocess.CalledProcessError, FileNotFoundError): + return False, "" + + return False, "" + + if not _is_msvc_cl(compiler): + return + + if os.path.exists(compiler): + # Passed compiler with path. + cl_exe_path = compiler + else: + b_ret, cl_exe_path = get_msvc_cl_path() + if b_ret is False: + return + + version_info = WinPeFileVersionInfo(cl_exe_path) + lang_id = version_info.get_language_id() + if lang_id != 1033: + # MSVC English language id is 0x0409, and the DEC value is 1033. 
+ raise RuntimeError( + "Torch.compile() is only support MSVC with English language pack," + "Please reinstall its language pack to English." + ) + + def get_cpp_compiler() -> str: if _IS_WINDOWS: compiler = os.environ.get("CXX", "cl") compiler = normalize_path_separator(compiler) check_compiler_exist_windows(compiler) + check_msvc_cl_language_id(compiler) else: if config.is_fbcode(): return build_paths.cc @@ -565,6 +756,9 @@ def _get_os_related_cpp_cflags(cpp_compiler: str) -> list[str]: "EHsc", # For Intel oneAPI, ref: https://learn.microsoft.com/en-us/cpp/build/reference/zc-cplusplus?view=msvc-170 "Zc:__cplusplus", + # Enable max compatible to msvc for oneAPI headers. + # ref: https://github.com/pytorch/pytorch/blob/db38c44ad639e7ada3e9df2ba026a2cb5e40feb0/cmake/public/utils.cmake#L352-L358 # noqa: B950 + "permissive-", ] else: cflags = ["Wno-unused-variable", "Wno-unknown-pragmas"] @@ -582,23 +776,42 @@ def _get_os_related_cpp_cflags(cpp_compiler: str) -> list[str]: return cflags +def _get_os_related_cpp_definitions(cpp_compiler: str) -> list[str]: + os_definitions: list[str] = [] + if _IS_WINDOWS: + # On Windows, we need disable min/max macro to avoid C2589 error, as PyTorch CMake: + # https://github.com/pytorch/pytorch/blob/9a41570199155eee92ebd28452a556075e34e1b4/CMakeLists.txt#L1118-L1119 + os_definitions.append("NOMINMAX") + else: + pass + return os_definitions + + def _get_ffast_math_flags() -> list[str]: - # ffast-math is equivalent to these flags as in - # https://github.com/gcc-mirror/gcc/blob/4700ad1c78ccd7767f846802fca148b2ea9a1852/gcc/opts.cc#L3458-L3468 - # however gcc<13 sets the FTZ/DAZ flags for runtime on x86 even if we have - # -ffast-math -fno-unsafe-math-optimizations because the flags for runtime - # are added by linking in crtfastmath.o. This is done by the spec file which - # only does globbing for -ffast-math. - flags = [ - "fno-trapping-math", - "funsafe-math-optimizations", - "ffinite-math-only", - "fno-signed-zeros", - "fno-math-errno", - ] + if _IS_WINDOWS: + flags = [] + else: + # ffast-math is equivalent to these flags as in + # https://github.com/gcc-mirror/gcc/blob/4700ad1c78ccd7767f846802fca148b2ea9a1852/gcc/opts.cc#L3458-L3468 + # however gcc<13 sets the FTZ/DAZ flags for runtime on x86 even if we have + # -ffast-math -fno-unsafe-math-optimizations because the flags for runtime + # are added by linking in crtfastmath.o. This is done by the spec file which + # only does globbing for -ffast-math. 
+ flags = [ + "fno-trapping-math", + "funsafe-math-optimizations", + "ffinite-math-only", + "fno-signed-zeros", + "fno-math-errno", + ] - if is_gcc(): - flags.append("fexcess-precision=fast") + flags.append("fno-finite-math-only") + if not config.cpp.enable_unsafe_math_opt_flag: + flags.append("fno-unsafe-math-optimizations") + flags.append(f"ffp-contract={config.cpp.enable_floating_point_contract_flag}") + + if is_gcc(): + flags.append("fexcess-precision=fast") return flags @@ -646,25 +859,24 @@ def _get_optimization_cflags( cflags = [wrapper_opt_level if min_optimize else "O3", "DNDEBUG"] cflags += _get_ffast_math_flags() - cflags.append("fno-finite-math-only") - if not config.cpp.enable_unsafe_math_opt_flag: - cflags.append("fno-unsafe-math-optimizations") - cflags.append(f"ffp-contract={config.cpp.enable_floating_point_contract_flag}") - - if sys.platform != "darwin": - # on macos, unknown argument: '-fno-tree-loop-vectorize' - if _is_gcc(cpp_compiler): - cflags.append("fno-tree-loop-vectorize") - # https://stackoverflow.com/questions/65966969/why-does-march-native-not-work-on-apple-m1 - # `-march=native` is unrecognized option on M1 - if not config.is_fbcode(): - if platform.machine() == "ppc64le": - cflags.append("mcpu=native") - else: - cflags.append("march=native") - if config.aot_inductor.enable_lto and _is_clang(cpp_compiler): - cflags.append("flto=thin") + if _IS_WINDOWS: + pass + else: + if sys.platform != "darwin": + # on macos, unknown argument: '-fno-tree-loop-vectorize' + if _is_gcc(cpp_compiler): + cflags.append("fno-tree-loop-vectorize") + # https://stackoverflow.com/questions/65966969/why-does-march-native-not-work-on-apple-m1 + # `-march=native` is unrecognized option on M1 + if not config.is_fbcode(): + if platform.machine() == "ppc64le": + cflags.append("mcpu=native") + else: + cflags.append("march=native") + + if config.aot_inductor.enable_lto and _is_clang(cpp_compiler): + cflags.append("flto=thin") return cflags, ldflags @@ -709,6 +921,8 @@ def get_cpp_options( + _get_os_related_cpp_cflags(cpp_compiler) ) + definitions += _get_os_related_cpp_definitions(cpp_compiler) + if not _IS_WINDOWS and config.aot_inductor.enable_lto and _is_clang(cpp_compiler): ldflags.append("fuse-ld=lld") ldflags.append("flto=thin") @@ -1384,7 +1598,8 @@ def get_cpp_torch_device_options( ze_root = os.getenv("LEVEL_ZERO_V1_SDK_PATH") if ze_root is None: raise OSError(xpu_error_string) - include_dirs = [os.path.join(ze_root, "include")] + include_dirs += [os.path.join(ze_root, "include")] + libraries_dirs += [os.path.join(ze_root, "lib")] libraries += ["c10_xpu", "sycl", "ze_loader", "torch_xpu"] else: # Suppress multi-line comment warnings in sycl headers diff --git a/torch/_inductor/fx_passes/bucketing.py b/torch/_inductor/fx_passes/bucketing.py index 1b35cf324f5fc..1c4c5f6c3f733 100644 --- a/torch/_inductor/fx_passes/bucketing.py +++ b/torch/_inductor/fx_passes/bucketing.py @@ -7,6 +7,7 @@ import torch.utils._pytree as pytree from torch._dispatch.python import enable_python_dispatcher from torch._dynamo.utils import detect_fake_mode +from torch._inductor.runtime.runtime_utils import dynamo_timed from torch._logging import trace_structured from torch.fx.experimental.proxy_tensor import make_fx from torch.utils._ordered_set import OrderedSet @@ -362,16 +363,17 @@ def all_gather_merge_fn_to_trace_functional( def _trace(fn, inps) -> torch.fx.GraphModule: # type: ignore[no-untyped-def] - fake_mode = detect_fake_mode(inps) - assert fake_mode is not None - with fake_mode, 
enable_python_dispatcher(): - out = make_fx(fn)(*inps) - for node in out.graph.find_nodes( - op="call_function", target=torch.ops.aten.detach.default - ): - node.replace_all_uses_with(node.args[0]) - out.graph.erase_node(node) - return out + with dynamo_timed("fx.bucketing._trace", log_pt2_compile_event=True): + fake_mode = detect_fake_mode(inps) + assert fake_mode is not None + with fake_mode, enable_python_dispatcher(): + out = make_fx(fn)(*inps) + for node in out.graph.find_nodes( + op="call_function", target=torch.ops.aten.detach.default + ): + node.replace_all_uses_with(node.args[0]) + out.graph.erase_node(node) + return out def _insert_fn_trace_before_node( # type: ignore[no-untyped-def] @@ -389,109 +391,113 @@ def _insert_fn_trace_before_node( # type: ignore[no-untyped-def] using :attr:`g_fn_inps` nodes of original graphas inputs of function graph, function graph outputs will replace :attr:`g_fn_outs` in original graph. """ - fn_gm = _trace( - fn_to_trace, - inps, - ) - fn_g = fn_gm.graph - fn_g_ins = fn_g.find_nodes(op="placeholder") - env = {fn_g_ins[idx]: g_fn_inps[idx] for idx in range(len(g_fn_inps))} - g_fn_new_outs: list[torch.fx.Node] = [] - with g.inserting_before(insert_before_node): - for _n in fn_g.nodes: - if _n.op == "placeholder": - continue - _new_n = g.node_copy(_n, lambda x: env[x]) - env[_n] = _new_n - if _n.op == "output": - g_fn_new_outs = _new_n.args[0] # type: ignore[assignment] - g.erase_node(_new_n) - replacements = { # noqa: C416 - orig_out: new_out for orig_out, new_out in zip(g_fn_outs, g_fn_new_outs) - } - for orig_out, new_out in zip(g_fn_outs, g_fn_new_outs): - orig_out.replace_all_uses_with(new_out) - return replacements + with dynamo_timed( + "fx.bucketing._insert_fn_trace_before_node", log_pt2_compile_event=True + ): + fn_gm = _trace( + fn_to_trace, + inps, + ) + fn_g = fn_gm.graph + fn_g_ins = fn_g.find_nodes(op="placeholder") + env = {fn_g_ins[idx]: g_fn_inps[idx] for idx in range(len(g_fn_inps))} + g_fn_new_outs: list[torch.fx.Node] = [] + with g.inserting_before(insert_before_node): + for _n in fn_g.nodes: + if _n.op == "placeholder": + continue + _new_n = g.node_copy(_n, lambda x: env[x]) + env[_n] = _new_n + if _n.op == "output": + g_fn_new_outs = _new_n.args[0] # type: ignore[assignment] + g.erase_node(_new_n) + replacements = { # noqa: C416 + orig_out: new_out for orig_out, new_out in zip(g_fn_outs, g_fn_new_outs) + } + for orig_out, new_out in zip(g_fn_outs, g_fn_new_outs): + orig_out.replace_all_uses_with(new_out) + return replacements def merge_reduce_scatter( gm: torch.fx.GraphModule, rs_buckets: list[list[torch.fx.Node]] ) -> None: - trace_structured( - "artifact", - metadata_fn=lambda: { - "name": "fx_bucketing_passes_reduce_scatter_buckets", - "encoding": "string", - }, - payload_fn=lambda: str(rs_buckets), - ) - n_buckets = len(rs_buckets) - g = gm.graph - rs_ins: list[list[torch.fx.Node]] = [[] for _ in range(n_buckets)] - rs_waits: list[list[torch.fx.Node]] = [[] for _ in range(n_buckets)] - - for bucket_idx, rs_nodes in enumerate(rs_buckets): - rs0 = rs_nodes[0] - rs0_val = rs0.meta["val"] - _, reduce_op, group_size, group_name = rs0.args - reduce_dtype = rs0_val.dtype - device = rs0_val.device - for n in rs_nodes: - rs_val = n.meta["val"] - assert ( - n.args[1] == reduce_op - and n.args[2] == group_size - and n.args[3] == group_name - and rs_val.device == device - and rs_val.dtype == reduce_dtype - ) - assert len(n.users) == 1 - wait_n = next(iter(n.users)) - rs_ins[bucket_idx].append(n.args[0]) # type: ignore[arg-type] - 
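# Hypothetical, minimal sketch of the tracing step that _trace (now wrapped in
# dynamo_timed above) performs: run make_fx under a FakeTensorMode with the python
# dispatcher enabled, so the merge function is lowered to a GraphModule without real
# allocations. toy_merge is an assumption, not the actual all_gather/reduce_scatter
# merge functions used by this pass.
import torch
from torch._dispatch.python import enable_python_dispatcher
from torch._subclasses.fake_tensor import FakeTensorMode
from torch.fx.experimental.proxy_tensor import make_fx

def toy_merge(xs):
    return torch.cat([x.flatten() for x in xs])

fake_mode = FakeTensorMode()
with fake_mode, enable_python_dispatcher():
    inps = ([torch.empty(4, 4), torch.empty(8)],)
    gm = make_fx(toy_merge)(*inps)
print(gm.graph)  # the merged computation, traced over fake tensors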
rs_waits[bucket_idx].append(wait_n) - - for bucket_idx in range(n_buckets): - _rs_ins = rs_ins[bucket_idx] - _rs_waits = rs_waits[bucket_idx] - _rs_ns = rs_buckets[bucket_idx] - - rs0 = _rs_ns[0] - rs0_val = rs0.meta["val"] - _, reduce_op, group_size, group_name = rs0.args - reduce_dtype = rs0_val.dtype - device = rs0_val.device - - replacements = _insert_fn_trace_before_node( - g, - reduce_scatter_merge_fn_to_trace, - ( - pytree.tree_map(lambda node: node.meta["val"], _rs_ins), - group_size, - group_name, - reduce_op, - reduce_dtype, - device, - ), - _rs_ns[-1].next, - _rs_ins, - _rs_waits, + with dynamo_timed("fx.bucketing.merge_reduce_scatter", log_pt2_compile_event=True): + trace_structured( + "artifact", + metadata_fn=lambda: { + "name": "fx_bucketing_passes_reduce_scatter_buckets", + "encoding": "string", + }, + payload_fn=lambda: str(rs_buckets), ) - # [Note: Replacement in bucketing passes] - # After bucketing _rs_waits will be replaced with output nodes of - # fn_to_trace graph that will be inserted in the graph g. - # By this time we already prepared rs_ins, rs_waits. - # rs_ins for following buckets can be replaced _rs_waits with new nodes. - # We apply replacements to rs_ins. + n_buckets = len(rs_buckets) + g = gm.graph + rs_ins: list[list[torch.fx.Node]] = [[] for _ in range(n_buckets)] + rs_waits: list[list[torch.fx.Node]] = [[] for _ in range(n_buckets)] + + for bucket_idx, rs_nodes in enumerate(rs_buckets): + rs0 = rs_nodes[0] + rs0_val = rs0.meta["val"] + _, reduce_op, group_size, group_name = rs0.args + reduce_dtype = rs0_val.dtype + device = rs0_val.device + for n in rs_nodes: + rs_val = n.meta["val"] + assert ( + n.args[1] == reduce_op + and n.args[2] == group_size + and n.args[3] == group_name + and rs_val.device == device + and rs_val.dtype == reduce_dtype + ) + assert len(n.users) == 1 + wait_n = next(iter(n.users)) + rs_ins[bucket_idx].append(n.args[0]) # type: ignore[arg-type] + rs_waits[bucket_idx].append(wait_n) + + for bucket_idx in range(n_buckets): + _rs_ins = rs_ins[bucket_idx] + _rs_waits = rs_waits[bucket_idx] + _rs_ns = rs_buckets[bucket_idx] + + rs0 = _rs_ns[0] + rs0_val = rs0.meta["val"] + _, reduce_op, group_size, group_name = rs0.args + reduce_dtype = rs0_val.dtype + device = rs0_val.device + + replacements = _insert_fn_trace_before_node( + g, + reduce_scatter_merge_fn_to_trace, + ( + pytree.tree_map(lambda node: node.meta["val"], _rs_ins), + group_size, + group_name, + reduce_op, + reduce_dtype, + device, + ), + _rs_ns[-1].next, + _rs_ins, + _rs_waits, + ) + # [Note: Replacement in bucketing passes] + # After bucketing _rs_waits will be replaced with output nodes of + # fn_to_trace graph that will be inserted in the graph g. + # By this time we already prepared rs_ins, rs_waits. + # rs_ins for following buckets can be replaced _rs_waits with new nodes. + # We apply replacements to rs_ins. - def _replace(x: torch.fx.Node) -> torch.fx.Node: - return replacements.get(x, x) + def _replace(x: torch.fx.Node) -> torch.fx.Node: + return replacements.get(x, x) - for j in range(bucket_idx + 1, n_buckets): - rs_ins[j] = pytree.tree_map(_replace, rs_ins[j]) + for j in range(bucket_idx + 1, n_buckets): + rs_ins[j] = pytree.tree_map(_replace, rs_ins[j]) - for rs_n, wait_n in zip(_rs_ns, _rs_waits): - g.erase_node(wait_n) - g.erase_node(rs_n) + for rs_n, wait_n in zip(_rs_ns, _rs_waits): + g.erase_node(wait_n) + g.erase_node(rs_n) def merge_all_gather( @@ -500,78 +506,79 @@ def merge_all_gather( """ Merges specified buckets of all_gather to joint all_gather. 
""" - from torch.distributed.distributed_c10d import _resolve_process_group - - trace_structured( - "artifact", - metadata_fn=lambda: { - "name": "fx_bucketing_passes_all_gather_buckets", - "encoding": "string", - }, - payload_fn=lambda: str(ag_buckets), - ) - n_buckets = len(ag_buckets) - - ag_ins: list[list[torch.fx.Node]] = [[] for _ in range(n_buckets)] - ag_waits: list[list[torch.fx.Node]] = [[] for _ in range(n_buckets)] - for bucket_idx, ag_bucket in enumerate(ag_buckets): - _, group_size, group_name = ag_bucket[0].args - assert isinstance(group_name, str) - dtype = ag_bucket[0].meta["val"].dtype - - for ag_node in ag_bucket: - assert len(ag_node.users) == 1, ( - f"Expect only one user for {ag_node}, but got {ag_node.users}" - ) - wait_node = next(iter(ag_node.users)) - assert ( - ag_node.args[1] == group_size - and ag_node.args[2] == group_name - and ag_node.meta["val"].dtype == dtype - ) - ag_node_in = ag_node.args[0] - - ag_ins[bucket_idx].append(ag_node_in) # type: ignore[union-attr, arg-type] - ag_waits[bucket_idx].append(wait_node) - - g = gm.graph - - for bucket_idx in range(n_buckets): - _ag_ins = ag_ins[bucket_idx] - _ag_waits = ag_waits[bucket_idx] - _ag_ns = ag_buckets[bucket_idx] - - ag0 = _ag_ns[0] - ag0_val = ag0.meta["val"] - _, group_size, group_name = ag0.args - dtype = ag0_val.dtype - assert isinstance(group_name, str) - - rank: int = dist.get_rank(_resolve_process_group(group_name)) - - replacements = _insert_fn_trace_before_node( - g, - all_gather_merge_fn_to_trace, - ( - pytree.tree_map(lambda node: node.meta["val"], _ag_ins), - group_size, - group_name, - dtype, - rank, - ), - ag0.next, - _ag_ins, - _ag_waits, + with dynamo_timed("fx.bucketing.merge_all_gather", log_pt2_compile_event=True): + from torch.distributed.distributed_c10d import _resolve_process_group + + trace_structured( + "artifact", + metadata_fn=lambda: { + "name": "fx_bucketing_passes_all_gather_buckets", + "encoding": "string", + }, + payload_fn=lambda: str(ag_buckets), ) + n_buckets = len(ag_buckets) + + ag_ins: list[list[torch.fx.Node]] = [[] for _ in range(n_buckets)] + ag_waits: list[list[torch.fx.Node]] = [[] for _ in range(n_buckets)] + for bucket_idx, ag_bucket in enumerate(ag_buckets): + _, group_size, group_name = ag_bucket[0].args + assert isinstance(group_name, str) + dtype = ag_bucket[0].meta["val"].dtype + + for ag_node in ag_bucket: + assert len(ag_node.users) == 1, ( + f"Expect only one user for {ag_node}, but got {ag_node.users}" + ) + wait_node = next(iter(ag_node.users)) + assert ( + ag_node.args[1] == group_size + and ag_node.args[2] == group_name + and ag_node.meta["val"].dtype == dtype + ) + ag_node_in = ag_node.args[0] + + ag_ins[bucket_idx].append(ag_node_in) # type: ignore[union-attr, arg-type] + ag_waits[bucket_idx].append(wait_node) + + g = gm.graph + + for bucket_idx in range(n_buckets): + _ag_ins = ag_ins[bucket_idx] + _ag_waits = ag_waits[bucket_idx] + _ag_ns = ag_buckets[bucket_idx] + + ag0 = _ag_ns[0] + ag0_val = ag0.meta["val"] + _, group_size, group_name = ag0.args + dtype = ag0_val.dtype + assert isinstance(group_name, str) + + rank: int = dist.get_rank(_resolve_process_group(group_name)) + + replacements = _insert_fn_trace_before_node( + g, + all_gather_merge_fn_to_trace, + ( + pytree.tree_map(lambda node: node.meta["val"], _ag_ins), + group_size, + group_name, + dtype, + rank, + ), + ag0.next, + _ag_ins, + _ag_waits, + ) - # See Note: [Replacement in bucketing passes] - def _replace(x: torch.fx.Node) -> torch.fx.Node: - return replacements.get(x, x) + # See 
Note: [Replacement in bucketing passes] + def _replace(x: torch.fx.Node) -> torch.fx.Node: + return replacements.get(x, x) - for j in range(bucket_idx + 1, n_buckets): - ag_ins[j] = pytree.tree_map(_replace, ag_ins[j]) + for j in range(bucket_idx + 1, n_buckets): + ag_ins[j] = pytree.tree_map(_replace, ag_ins[j]) - # Erasing old nodes in reverse order - for ag_n, wait_n in zip(ag_buckets[bucket_idx], _ag_waits): - g.erase_node(wait_n) - g.erase_node(ag_n) + # Erasing old nodes in reverse order + for ag_n, wait_n in zip(ag_buckets[bucket_idx], _ag_waits): + g.erase_node(wait_n) + g.erase_node(ag_n) diff --git a/torch/_inductor/fx_passes/fsdp.py b/torch/_inductor/fx_passes/fsdp.py index 086651b9a9d77..e24ebe4037e7a 100644 --- a/torch/_inductor/fx_passes/fsdp.py +++ b/torch/_inductor/fx_passes/fsdp.py @@ -38,7 +38,19 @@ def is_graph_output(node: torch.fx.Node) -> bool: def is_fsdp_reduce_scatter_wait(wait: torch.fx.Node) -> bool: - return is_graph_output(wait) + if is_graph_output(wait): + return True + + if len(wait.users) == 1: + user = next(iter(wait.users)) + assert user is not None + return ( + is_graph_output(user) + and user.op == "call_function" + and user.target == torch.ops.prims.convert_element_type.default + ) + + return False def bucket_fsdp_all_gather( diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py index bd8b6d1e12990..d10dc7a464261 100644 --- a/torch/_inductor/graph.py +++ b/torch/_inductor/graph.py @@ -1112,10 +1112,11 @@ def placeholder( return None # See note: Note: [Generator arguments in AOTDispatcher] elif isinstance(example, torch.Generator): - assert ( - len(V.graph.current_node.users) == 1 - and next(iter(V.graph.current_node.users)).target - is torch._prims.rng_prims.graphsafe_run_with_rng_state + assert len(V.graph.current_node.users) == 1 and next( + iter(V.graph.current_node.users) + ).target in ( + torch._prims.rng_prims.graphsafe_run_with_rng_state, + torch.ops.higher_order.invoke_subgraph, ) gen = ir.GeneratorState(name=target, device=example.device) self.graph_inputs[target] = gen # type: ignore[assignment] diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py index 1fea9a0d01875..622c8f6bd01f3 100644 --- a/torch/_inductor/ir.py +++ b/torch/_inductor/ir.py @@ -3437,6 +3437,7 @@ def clamp_wrap( if val is None: # TODO(rec): can this really happen? return default + val = cls.handle_negative_index(val, dim_size) return clamp(val, lower, upper) start = clamp_wrap(start, 0, dim_size, 0) @@ -3453,6 +3454,14 @@ def create( # type: ignore[override] step: int = 1, clamp: bool = True, ) -> IRNode: + step = sympy.expand(step) + assert isinstance(step, Expr) or step > 0, step + try: + if start == 0 and end >= 2**63 - 1 and step == 1: + return x + except TypeError: + pass + new_size = list(x.get_size()) # NB: Ordinarily we default to clamping. @@ -3739,8 +3748,10 @@ def _pad_strides( # do for dynamic shape. # # Skip padding the strides for dynamic shape for now. 
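Side note on the `_pad_strides` hunk above (padding is now skipped unless both strides and sizes are static): the padding rule itself is a simple round-up. A toy stand-alone version, where `align` and the threshold are made-up stand-ins for `config.padding_stride_threshold`:

```python
# Toy version of the stride-padding rule in _pad_strides: once a (static)
# stride exceeds the threshold, round it up to the next multiple of `align`.
from math import ceil

def pad_stride(stride: int, align: int = 32, threshold: int = 1024) -> int:
    if stride > threshold and stride % align != 0:
        return ceil(stride / align) * align
    return stride

assert pad_stride(100) == 100      # small stride: left untouched
assert pad_stride(2050) == 2080    # 2050 -> next multiple of 32
assert pad_stride(2048) == 2048    # already aligned: left untouched
```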
- # If outermost dim is dynamic, stride still can be fully static - if not all(isinstance(s, (int, sympy.Integer)) for s in in_strides): + if not all( + isinstance(s, (int, sympy.Integer)) + for s in itertools.chain(in_strides, size) + ): return in_strides stride_order = get_stride_order(in_strides) @@ -3755,11 +3766,11 @@ def _pad_strides( for rank, idx in enumerate(fill_order[1:], start=1): prev_idx = fill_order[rank - 1] stride = new_strides[prev_idx] * size[prev_idx] - if isinstance(stride, (int, sympy.Integer)): - if stride > config.padding_stride_threshold and stride % align != 0: - stride = ceildiv(stride, align) * align - padded = True - new_strides[idx] = stride + + if stride > config.padding_stride_threshold and stride % align != 0: + stride = ceildiv(stride, align) * align + padded = True + new_strides[idx] = stride if not padded: # Consider a tensor with shape [256, 1, 5, 5] @@ -7212,7 +7223,6 @@ def __init__( base_offset: Union[sympy.Symbol, int], base_dim_stride: Union[sympy.Symbol, int], size: Union[sympy.Symbol, int], - clamp: bool, ) -> None: super().__init__(None, NoneLayout(device=torch.device("cpu")), []) # This node codegen the following: @@ -7222,7 +7232,6 @@ def __init__( self.base_offset = base_offset self.base_dim_stride = base_dim_stride self.size = size - self.clamp = clamp def get_unbacked_symbol_defs(self) -> OrderedSet[sympy.Symbol]: return OrderedSet([self.unbacked_offset_symbol]) @@ -7233,57 +7242,7 @@ def get_free_symbol_uses( return get_free_symbols(self.index, unbacked_only) def codegen(self, wrapper: PythonWrapperCodegen) -> None: - wrapper.codegen_dynamic_select_index(self, clamp=self.clamp) - - -class DynamicSliceSize(ExternKernel): - """ - Computes the output size of a slice call, handling the correct semantics in codegen. - We do this for flexible handling for unbacked indices (to not data-dependent error). - - Slicing has 4 semantics for indices, i.e. x[start:] could be: - 1) start < -x.size(0) -> x[0:] # negative out-of-bounds - 2) start in [-x.size(0), 0) -> x[x.size(0) + start:] # negative slicing - 3) start in [0, x.size(0)) -> x[start:] # standard slicing - 4) start >= x.size(0) -> empty slice # positive out-of-bounds - - If the appropriate semantics are known beforehand, the output size is computed based on - the start & end indices. If not (with unbacked indices), a new unbacked symbol is created - to represent the output size, and codegen handles computing the correct case. 
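The four `x[start:]` cases enumerated in the (removed) docstring above match ordinary eager slicing behavior, which this self-contained check illustrates for a 1-D tensor of size 5:

```python
import torch

x = torch.arange(5)
assert torch.equal(x[-100:], x)      # 1) start < -size: negative out-of-bounds -> whole tensor
assert torch.equal(x[-2:], x[3:5])   # 2) start in [-size, 0): x[size + start:]
assert torch.equal(x[2:], x[2:5])    # 3) start in [0, size): standard slicing
assert x[100:].numel() == 0          # 4) start >= size: empty slice
```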
- """ - - def get_reads(self) -> OrderedSet[Dep]: - return OrderedSet() - - def should_allocate(self) -> bool: - return False - - def __init__( - self, - unbacked_size_symbol: sympy.Symbol, - start: sympy.Symbol, - end: Union[sympy.Symbol, int], - size: Union[sympy.Symbol, int], - ): - super().__init__(None, NoneLayout(device=torch.device("cpu")), []) - # This node codegen - self.unbacked_size_symbol = unbacked_size_symbol - self.start = start - self.end = end - self.size = size - - def get_unbacked_symbol_defs(self) -> OrderedSet[sympy.Symbol]: - return OrderedSet([self.unbacked_size_symbol]) - - def get_free_symbol_uses( - self, unbacked_only: bool = False - ) -> OrderedSet[sympy.Symbol]: - return get_free_symbols(self.start, unbacked_only).union( - get_free_symbols(self.end, unbacked_only) - ) - - def codegen(self, wrapper: PythonWrapperCodegen) -> None: - wrapper.codegen_dynamic_slice_size(self) + wrapper.codegen_dynamic_select_index(self) class DynamicScalar(ExternKernel): @@ -8339,7 +8298,7 @@ def create( new_operands: list[IRNode] = [] for idx, operand in enumerate(operands): - if isinstance(operand, ShapeAsConstantBuffer): + if isinstance(operand, (ShapeAsConstantBuffer, GeneratorState)): new_operands.append(operand) else: new_operands.append( diff --git a/torch/_inductor/kernel/bmm.py b/torch/_inductor/kernel/bmm.py index 947175af04708..92822ecc310bb 100644 --- a/torch/_inductor/kernel/bmm.py +++ b/torch/_inductor/kernel/bmm.py @@ -95,7 +95,7 @@ def bmm_grid(b, m, n, meta, *, cdiv): else: a = tl.load(A, mask=rk[None, :] < k, other=0.) b = tl.load(B, mask=rk[:, None] < k, other=0.) - acc += tl.dot(a, b, input_precision=FLOAT32_PRECISION) + acc += tl.dot(a, b, allow_tf32=ALLOW_TF32) A += BLOCK_K * stride_ak B += BLOCK_K * stride_bk diff --git a/torch/_inductor/kernel/conv.py b/torch/_inductor/kernel/conv.py index 3b40bfc21b5e8..5ac471e352d60 100644 --- a/torch/_inductor/kernel/conv.py +++ b/torch/_inductor/kernel/conv.py @@ -85,7 +85,7 @@ def conv3d_grid(n, c, d, h, w, meta, *, cdiv): ) mask_w = (idx_x_c[:, None] < GROUP_IN_C) & (idx_y_c[None, :] < GROUP_OUT_C) matrix_w = tl.load(w_ptrs, mask=mask_w, other=0.0) - acc += tl.dot(matrix_x, matrix_w, input_precision=FLOAT32_PRECISION) + acc += tl.dot(matrix_x, matrix_w, allow_tf32=ALLOW_TF32) """ """ @@ -214,7 +214,7 @@ def conv3d_grid(n, c, d, h, w, meta, *, cdiv): ) mask_w = (idx_x_c[:, None] < GROUP_IN_C) & (idx_y_c[None, :] < GROUP_OUT_C) matrix_w = tl.load(w_ptrs, mask=mask_w, other=0.0) - acc += tl.dot(matrix_x, matrix_w, input_precision=FLOAT32_PRECISION) + acc += tl.dot(matrix_x, matrix_w, allow_tf32=ALLOW_TF32) """ conv3d_template = TritonTemplate( @@ -390,11 +390,6 @@ def channels_last_order(rank): return order -def _get_float32_precision(): - result = "tf32" if torch.backends.cuda.matmul.allow_tf32 else "ieee" - return f'"{result}"' - - def convert_1x1_conv_to_mm(x, weight, bias): # special case for 1x1 convolution, which is actually just a matmul rank = len(weight.get_size()) @@ -596,10 +591,12 @@ def channels_last_conv(): conv_configs = V.choices.get_conv_configs(device_type) + dtype_size = x.get_dtype().itemsize for cfg in conv_configs( sympy_product([x.get_size()[0], *x.get_size()[2:]]), out_chan, in_chan, + dtype_size=dtype_size, ): if ndim == 2: conv2d_template.maybe_append_choice( @@ -616,7 +613,7 @@ def channels_last_conv(): # TODO(jansel): try unroll for bigger kernels once fixed: # https://github.com/triton-lang/triton/issues/1254 UNROLL=is_ones(kernel_shape), - FLOAT32_PRECISION=_get_float32_precision(), + 
ALLOW_TF32=torch.backends.cudnn.allow_tf32, num_stages=cfg.num_stages, num_warps=cfg.num_warps, **cfg.kwargs, @@ -639,7 +636,7 @@ def channels_last_conv(): # TODO(jansel): try unroll for bigger kernels once fixed: # https://github.com/triton-lang/triton/issues/1254 UNROLL=is_ones(kernel_shape), - FLOAT32_PRECISION=_get_float32_precision(), + ALLOW_TF32=torch.backends.cudnn.allow_tf32, num_stages=cfg.num_stages, num_warps=cfg.num_warps, **cfg.kwargs, diff --git a/torch/_inductor/kernel/flex/common.py b/torch/_inductor/kernel/flex/common.py index 6cc197a35b9cf..aab25ac0813bb 100644 --- a/torch/_inductor/kernel/flex/common.py +++ b/torch/_inductor/kernel/flex/common.py @@ -125,12 +125,6 @@ def build_subgraph_module_buffer( with V.set_graph_handler(pw_subgraph): # type: ignore[arg-type] pw_subgraph.run(*args) - # Since we are allowing mutations/buffer creation, we need to register any fresh buffers - # creating during the pointwise subgraph lowering - if len(pw_subgraph.buffers) > 0: - for buffer in pw_subgraph.buffers: - V.graph.register_buffer(buffer) - def convert_output_node_to_buffer(output_buffer) -> Optional[ComputedBuffer]: if output_buffer is None: return None diff --git a/torch/_inductor/kernel/mm.py b/torch/_inductor/kernel/mm.py index 0378c0371b179..863212c4b3d72 100644 --- a/torch/_inductor/kernel/mm.py +++ b/torch/_inductor/kernel/mm.py @@ -136,9 +136,9 @@ {{load_input("B", "b", ("idx_m", "idx_n"), mask=None if EVEN_K else "b_mask", indent_width=8)}} {% if USE_FAST_ACCUM %} - acc = tl.dot(a, b, acc, input_precision=FLOAT32_PRECISION, out_dtype=ACC_TYPE) + acc = tl.dot(a, b, acc, allow_tf32=ALLOW_TF32, out_dtype=ACC_TYPE) {% else %} - acc += tl.dot(a, b, input_precision=FLOAT32_PRECISION, out_dtype=ACC_TYPE) + acc += tl.dot(a, b, allow_tf32=ALLOW_TF32, out_dtype=ACC_TYPE) {% endif %} # rematerialize rm and rn to save registers @@ -211,9 +211,9 @@ idx_n = offs_b_n[None, :] {{load_input("B", "b", ("idx_m", "idx_n"), mask=None if EVEN_K else "b_mask", indent_width=8)}} {% if USE_FAST_ACCUM %} - acc = tl.dot(a, b, acc, input_precision=FLOAT32_PRECISION, out_dtype=ACC_TYPE) + acc = tl.dot(a, b, acc, allow_tf32=ALLOW_TF32, out_dtype=ACC_TYPE) {% else %} - acc += tl.dot(a, b, input_precision=FLOAT32_PRECISION, out_dtype=ACC_TYPE) + acc += tl.dot(a, b, allow_tf32=ALLOW_TF32, out_dtype=ACC_TYPE) {% endif %} # rematerialize rm and rn to save registers @@ -347,7 +347,7 @@ acc += tl.dot( a if A_ROW_MAJOR else a.T, b if B_ROW_MAJOR else b.T, - input_precision=FLOAT32_PRECISION, + allow_tf32=ALLOW_TF32, ) if ki == k_tiles - 1: diff --git a/torch/_inductor/kernel/mm_plus_mm.py b/torch/_inductor/kernel/mm_plus_mm.py index d5ab1d2b83e9d..df3e8fcf1e656 100644 --- a/torch/_inductor/kernel/mm_plus_mm.py +++ b/torch/_inductor/kernel/mm_plus_mm.py @@ -90,7 +90,7 @@ else: a = tl.load(A, mask=rk[None, :] < k1, other=0.) b = tl.load(B, mask=rk[:, None] < k1, other=0.) - acc += tl.dot(a, b, input_precision=FLOAT32_PRECISION) + acc += tl.dot(a, b, allow_tf32=ALLOW_TF32) A += BLOCK_K * stride_ak B += BLOCK_K * stride_bk @@ -103,7 +103,7 @@ else: c = tl.load(C, mask=rk[None, :] < k2, other=0.) d = tl.load(D, mask=rk[:, None] < k2, other=0.) 
- acc += tl.dot(c, d, input_precision=FLOAT32_PRECISION) + acc += tl.dot(c, d, allow_tf32=ALLOW_TF32) C += BLOCK_K * stride_ck D += BLOCK_K * stride_dk diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py index e708355e3f629..b29732eb67ef9 100644 --- a/torch/_inductor/lowering.py +++ b/torch/_inductor/lowering.py @@ -1172,130 +1172,9 @@ def permute(x, dims): @register_lowering(aten.slice, type_promotion_kind=None) def slice_(x, dim=0, start=0, end=2**63, step=1, clamp=True): - """ - Lowers a slice call, creating ExternKernels for the output size & storage offset symbols, - if the indices are unbacked and appropriate semantics aren't known. - If they are known (indices are static/backed/unbacked with info), a SliceView is created. - """ - - from torch.fx.experimental.symbolic_shapes import ( - CallMethodKey, - resolve_unbacked_bindings, - ) - assert isinstance(x, TensorBox) dim = _validate_dim(x, dim, 0) - size = x.get_size()[dim] - step = sympy.expand(step) - assert isinstance(step, sympy.Expr) or step > 0, step - - # maybe apply slice optimization - try: - if ( - start == 0 - and V.graph.sizevars.statically_known_leq(size, end) - and step == 1 - ): - return x - except TypeError: - pass - - # try to avoid dynamic slice - def handle_negative_index(idx, size, default): - if idx is None: - return default - idx = sympy.expand(idx) - size = sympy.expand(size) - if V.graph.sizevars.guard_or_false(idx >= 0): - return idx - elif V.graph.sizevars.guard_or_false(idx < 0): - return size + idx - return None - - ambiguous_slice = clamp - if ambiguous_slice: - start_index = handle_negative_index(start, size, 0) - end_index = handle_negative_index(end, size, size) - if start_index is not None and end_index is not None: - start, end = start_index, end_index - ambiguous_slice = False - - # ambiguous_slice=False means we know what semantics this slice call follows, - # and don't need to generate an extern kernel to represent the output size. 
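The "maybe apply slice optimization" fast path removed from this lowering is roughly the same early return added to `SliceView.create` earlier in this diff (`start == 0`, `end >= 2**63 - 1`, `step == 1` returns `x` unchanged). At the ATen level those are the default slice arguments, i.e. an identity view:

```python
import torch

x = torch.randn(4, 8)
y = torch.ops.aten.slice.Tensor(x, 0, 0, 2**63 - 1, 1)  # default slice args
assert y.shape == x.shape and y.data_ptr() == x.data_ptr()  # same view, no copy
```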
- # This is assumed True for clamp=False - # (meant to follow standard indexing semantics: 0 <= index < size) - if not ambiguous_slice: - return TensorBox( - ir.SliceView.create(x.data, dim, start, end, step, clamp=clamp) - ) # go to SliceView/ReinterpretView - - # unbacked territory: create DynamicSlice ExternKernel - # clamp is True, unbacked start / end - assert clamp - unbacked_bindings = resolve_unbacked_bindings( - V.graph.sizevars.shape_env, V.graph.current_node.meta["unbacked_bindings"] - ) - assert unbacked_bindings is not None - assert len(unbacked_bindings) <= 2, unbacked_bindings - sym_size, sym_storage = None, None - for sym, keypath in unbacked_bindings.items(): - if keypath == (CallMethodKey("size"), pytree.SequenceKey(dim)): - sym_size = sym - elif keypath == (CallMethodKey("storage_offset"),): - sym_storage = sym - - def compute_slice_index(index, size): - fn = lambda x: V.graph.sizevars.guard_or_false(x) # noqa: E731 - - if fn(sympy.Ge(index, 0)) and fn(sympy.Le(index, size)): - return index - elif fn(sympy.Lt(index, 0)) and fn(sympy.Ge(index, -size)): - return -index - elif fn(sympy.Gt(index, size)): - return size - elif fn(sympy.Lt(index, -size)): - return 0 - return None - - start_index = compute_slice_index(start, size) - end_index = compute_slice_index(end, size) - if start_index is not None and end_index is not None: - # we shouldn't have allocated size symbol, if output size was determinable from input indices - assert sym_size is None - new_size = sympy.Max(0, end_index - start_index) - else: - b_size = ir.DynamicSliceSize( - sym_size, - start, - end, - x.get_size()[dim], - ) - b_size.name = V.graph.register_buffer(b_size) - V.graph.register_operation(b_size) - new_size = sym_size - - if start_index is not None: - # we shouldn't have allocated storage offset symbol if start index was determinable - assert sym_storage is None - new_storage_offset = x.get_layout().offset + start_index * x.get_stride()[dim] - else: - b_storage = ir.DynamicSelectStorageOffset( - sym_storage, - start, - x.get_layout().offset, - x.get_stride()[dim], - x.get_size()[dim], - clamp=True, - ) - b_storage.name = V.graph.register_buffer(b_storage) - V.graph.register_operation(b_storage) - new_storage_offset = sym_storage - - new_sizes = list(x.get_size()) - new_strides = list(x.get_stride()) - new_sizes[dim] = new_size - new_strides[dim] *= step - return as_strided(x, new_sizes, new_strides, new_storage_offset) + return TensorBox(ir.SliceView.create(x.data, dim, start, end, step, clamp=clamp)) @register_lowering(aten.as_strided, type_promotion_kind=None) @@ -1921,7 +1800,6 @@ def select(x, dim, idx): x.get_layout().offset, new_stride[dim], x.get_size()[dim], - clamp=False, ) buffer.name = V.graph.register_buffer(buffer) V.graph.register_operation(buffer) @@ -3113,8 +2991,6 @@ def slice_scatter(x, src, dim=0, start=None, end=None, step=1): dim = _validate_dim(x, dim, 0) dim_size = x.get_size()[dim] - start = ir.SliceView.handle_negative_index(start, dim_size) - end = ir.SliceView.handle_negative_index(end, dim_size) start, end = ir.SliceView.normalize_start_end(x, dim, start, end) src_size = list(x.get_size()) diff --git a/torch/_inductor/memory.py b/torch/_inductor/memory.py index c28b298835334..27ca4415c8f0e 100644 --- a/torch/_inductor/memory.py +++ b/torch/_inductor/memory.py @@ -88,13 +88,20 @@ def _dep_size_hint(dep: Dep) -> int: collections.defaultdict(OrderedSet) ) dep_name_to_size: dict[str, int] = dict() + for node in nodes: for dep in node.read_writes.reads: - if dep.name in 
graph_inputs and not dep.name.startswith( - ("primals_", "arg", "fwd_rng_state", "bwd_rng_state") - ): - dep_name_to_succ_nodes[dep.name].add(node) - dep_name_to_size[dep.name] = _dep_size_hint(dep) + if dep.name in graph_inputs: + dep_name = dep.name + # Subgraphs have a prefix for the name, cleanup the prefix + # before checking for known strings. + if V.graph.name: + dep_name = dep_name.removeprefix(V.graph.name + "_") + if not dep_name.startswith( + ("primals_", "arg", "fwd_rng_state", "bwd_rng_state") + ): + dep_name_to_succ_nodes[dep.name].add(node) + dep_name_to_size[dep.name] = _dep_size_hint(dep) # create FreeableInputBuffer objects and add them to the returned dictionary name_to_freeable_input_buf: dict[str, FreeableInputBuffer] = dict() @@ -279,6 +286,11 @@ def assign_memory_planning_info_for_scheduler_nodes( for index, node in enumerate(nodes): size_alloc = sum(buffer.mpi_buffer.size_alloc for buffer in node.get_outputs()) succ_nodes = node_to_succ_nodes[node] + pred_nodes = node_to_pred_nodes[node] + + # make sure we do not make node a successor or predecessor of itself + succ_nodes.discard(node) + pred_nodes.discard(node) node.mpi_node = MemoryPlanningInfoForNode( index=index, @@ -395,17 +407,35 @@ def estimate_peak_memory( ) -> tuple[int, list[int]]: """ Given a list of nodes in their execution order, estimate the peak memory, by - keeping track of the liveness of SchedulerBuffers and FreeableInputBuffers. + keeping track of the liveliness of SchedulerBuffers and FreeableInputBuffers. Returns: int: peak memory List[int]: memory usage at each node (or each step). """ - # Use estimate_peak_memory_allocfree to keep one impl. - peak_memory, snodes_curr_memory, snodes_allocfree, buf_to_snode_last_use = ( - estimate_peak_memory_allocfree(nodes, name_to_freeable_input_buf, graph_outputs) + + buf_info_list, _, _ = compute_memory_timeline( + nodes, name_to_freeable_input_buf, graph_outputs ) - return peak_memory, [(curr_mem[0] + curr_mem[1]) for curr_mem in snodes_curr_memory] + + # incremental memory changes at each step + memory = [0 for _ in range(len(nodes) + 1)] + + # for each buffer, update memory when created and when freed + for buf_info in buf_info_list: + memory[buf_info.start_step] += buf_info.size_alloc + memory[buf_info.end_step + 1] -= buf_info.size_free + + # get peak memory by compute the cumulative memories + max_memory = 0 + cur_memory = 0 + memories_at_nodes = [] + for t in range(len(nodes) + 1): + cur_memory += memory[t] + memories_at_nodes.append(cur_memory) + max_memory = max(max_memory, cur_memory) + + return (max_memory, memories_at_nodes) @dataclasses.dataclass @@ -748,6 +778,7 @@ def dfs_visit(node: BaseSchedulerNode) -> None: path.append(node) for pred_node in node.mpi_node.pred_nodes: + assert pred_node != node dfs_visit(pred_node) path.pop() diff --git a/torch/_inductor/select_algorithm.py b/torch/_inductor/select_algorithm.py index dd6c714263909..25f505da5d40e 100644 --- a/torch/_inductor/select_algorithm.py +++ b/torch/_inductor/select_algorithm.py @@ -1699,7 +1699,7 @@ def generate( # type: ignore[override] # patch around it here. See https://github.com/triton-lang/triton/issues/3011 # for one example issue with this problem. 
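Stepping back to the `estimate_peak_memory` rewrite in `torch/_inductor/memory.py` earlier in this section: the peak is obtained by accumulating per-step alloc/free deltas from `compute_memory_timeline` and taking the running maximum. A tiny worked version, with a hypothetical helper name and made-up sizes:

```python
def peak_from_lifetimes(n_nodes: int, lifetimes: list[tuple[int, int, int]]):
    """lifetimes: (start_step, end_step, size) per buffer, steps in [0, n_nodes)."""
    deltas = [0] * (n_nodes + 1)
    for start, end, size in lifetimes:
        deltas[start] += size      # allocated when its first user runs
        deltas[end + 1] -= size    # freed right after its last user
    peak, cur, timeline = 0, 0, []
    for d in deltas:
        cur += d
        timeline.append(cur)
        peak = max(peak, cur)
    return peak, timeline

# buffer A (8 bytes) lives over steps 0..1, buffer B (4 bytes) over steps 1..2
assert peak_from_lifetimes(3, [(0, 1, 8), (1, 2, 4)]) == (12, [8, 12, 4, 0])
```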
if torch.cuda.is_available() and not torch.cuda.is_tf32_supported(): - kwargs["FLOAT32_PRECISION"] = '"ieee"' + kwargs["ALLOW_TF32"] = "False" if call_sizes is None: call_sizes = layout.size @@ -1832,7 +1832,7 @@ def make_kernel_render(out_node, hint_override: Optional[int] = None): "num_stages": num_stages, "num_warps": num_warps, "GROUP_M": kwargs.get("GROUP_M", -1), - "float32_precision": str(kwargs.get("FLOAT32_PRECISION", None)), + "allow_tf32": str(kwargs.get("ALLOW_TF32", None)), "acc_type": str(kwargs.get("ACC_TYPE", None)), "matrix_instr_nonkdim": kwargs.get("matrix_instr_nonkdim", 0), "waves_per_eu": kwargs.get("waves_per_eu", 0), @@ -2464,12 +2464,12 @@ def do_autotuning(choices, precompile_fn, hint_override: Optional[int] = None): important_keys = [ "ACC_TYPE", + "ALLOW_TF32", "BLOCK_K", "BLOCK_M", "BLOCK_N", "EVEN_K", "GROUP_M", - "FLOAT32_PRECISION", "USE_FAST_ACCUM", "num_stages", "num_warps", diff --git a/torch/_inductor/shape_propagation.py b/torch/_inductor/shape_propagation.py index 231f6f85ae0ac..ab3249ea1ba1e 100644 --- a/torch/_inductor/shape_propagation.py +++ b/torch/_inductor/shape_propagation.py @@ -67,7 +67,7 @@ def broadcast_shapes_for_args(args: Sequence[ShapeArg]) -> BlockShapeType: else: from torch._inductor.loop_body import LoopBody, LoopBodyBlock - if isinstance(arg, (LoopBodyBlock, LoopBody)): + if isinstance(arg, (LoopBodyBlock, LoopBody, OpsValue)): # TODO: fix me return None raise TypeError(f"Unknown type: {type(arg)}") diff --git a/torch/_inductor/subgraph_lowering.py b/torch/_inductor/subgraph_lowering.py index 3c8116d402c96..180a9d0eba801 100644 --- a/torch/_inductor/subgraph_lowering.py +++ b/torch/_inductor/subgraph_lowering.py @@ -87,8 +87,7 @@ def mark_buffer_mutated(self, name: str) -> None: def register_buffer(self, buffer: ir.Buffer, *, set_name: bool = False) -> str: if self._approved_mutator(): - name = self.qualify_name(f"buf{len(self.buffers)}") - self.buffers.append(buffer) + name = self.root_graph.register_buffer(buffer, set_name=set_name) return name else: raise SubgraphLoweringException( diff --git a/torch/_inductor/template_heuristics.py b/torch/_inductor/template_heuristics.py index 1b87a61c35d12..68b304fdbc616 100644 --- a/torch/_inductor/template_heuristics.py +++ b/torch/_inductor/template_heuristics.py @@ -1316,19 +1316,6 @@ def get_template_configs( ) yield template_kwargs - @staticmethod - def _get_input_precision( - m: sympy.Integer, n: sympy.Integer, k: sympy.Integer - ) -> str: - allow_tf32 = torch.backends.cuda.matmul.allow_tf32 and ( - not inductor_config.force_same_precision - or ((m % 16) == 0 and (n % 16) == 0 and (k % 8) == 0) - ) - result = "tf32" if allow_tf32 else "ieee" - - # wrap in quotes, because the string will be dropped into the templates - return f'"{result}"' - def _convert_config_to_template_kwargs( self, triton_config: TritonConfig, @@ -1348,10 +1335,16 @@ def _convert_config_to_template_kwargs( == triton_config.kwargs["BLOCK_K"] ) + # Calculate allow_tf32 + allow_tf32 = torch.backends.cuda.matmul.allow_tf32 and ( + not inductor_config.force_same_precision + or ((m % 16) == 0 and (n % 16) == 0 and (k % 8) == 0) + ) + # Build options dict options_dict = dict( EVEN_K=even_k_symbolic, - FLOAT32_PRECISION=MMTemplateConfigMixin._get_input_precision(m, n, k), + ALLOW_TF32=allow_tf32, USE_FAST_ACCUM=False, # Option for _scaled_mm ACC_TYPE=self._get_acc_type(layout.dtype), num_stages=triton_config.num_stages, diff --git a/torch/_subclasses/fake_impls.py b/torch/_subclasses/fake_impls.py index 
10ba37b361171..7ebd2ec92d124 100644 --- a/torch/_subclasses/fake_impls.py +++ b/torch/_subclasses/fake_impls.py @@ -6,7 +6,7 @@ import operator import sys from functools import reduce -from typing import Callable, Optional, Union +from typing import Callable, Union import torch import torch._custom_op @@ -15,7 +15,6 @@ from torch._dispatch.python import no_python_dispatcher from torch._ops import OpOverload from torch._prims_common import ( - canonicalize_dim, contiguous_for_memory_format_or_false, elementwise_dtypes, ELEMENTWISE_TYPE_PROMOTION_KIND, @@ -747,88 +746,6 @@ def _padded_dense_to_jagged_forward(fake_mode, func, padded, offsets, total_L=No return padded.new_empty(output_shape) -def _compute_slice_index(size, index): - from torch.fx.experimental.symbolic_shapes import guard_or_false, sym_and - - if guard_or_false(sym_and(index >= 0, index <= size)): - return index - elif guard_or_false(sym_and(index < 0, index >= -size)): - return index + size - elif guard_or_false(index < -size): - return 0 - elif guard_or_false(index > size): - return size - return None - - -@register_op_impl(torch.ops.aten.slice.Tensor) -def slice_forward( - fake_mode, - func, - self, - dim: int = 0, - start: Optional[int] = None, - end: Optional[int] = None, - step: int = 1, -): - from torch.fx.experimental.symbolic_shapes import ( - guard_or_false, - statically_known_true, - ) - - shape_env = fake_mode.shape_env - - ndim = self.dim() - if ndim == 0: - raise RuntimeError("slice() cannot be applied to a 0-dim tensor.") - dim = canonicalize_dim(self.dim(), dim) - sizes = list(self.size()) - strides = list(self.stride()) - - if step <= 0: - raise RuntimeError("slice step must be positive") - - # start, end - start_index = 0 if start is None else _compute_slice_index(sizes[dim], start) - end_index = ( - sizes[dim] - if statically_known_true(end == sys.maxsize) or end is None - else _compute_slice_index(sizes[dim], end) - ) - - # size - new_size = None - if start_index is not None and end_index is not None: - if guard_or_false(end_index >= start_index): - new_size = (end_index - start_index + step - 1) // step - elif guard_or_false(start_index >= end_index): - new_size = 0 - - # create unbacked if case unknown - if new_size is None: - new_size = shape_env.create_unbacked_symint() - torch._check_is_size(new_size, max=sizes[dim]) - - # stride - new_stride = strides[dim] * step - - # storage offset - if start_index is not None: - storage_offset = self.storage_offset() + start_index * strides[dim] - else: - storage_offset = shape_env.create_unbacked_symint() - torch._check(storage_offset >= 0) - - sizes[dim] = new_size - strides[dim] = new_stride - if self.is_quantized: - raise NotImplementedError( - "Slice decomposition for quantized tensors aren't implemented" - ) - else: - return self.as_strided(sizes, strides, storage_offset) - - @register_op_impl(torch.ops.aten.masked_select.default) def masked_select(fake_mode, func, self, mask): if ( diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py index 6da4bd98eca24..52b776946b361 100644 --- a/torch/_subclasses/fake_tensor.py +++ b/torch/_subclasses/fake_tensor.py @@ -2616,9 +2616,7 @@ def go(t: object, real_t: Tensor) -> None: if ( func not in meta_table and not self.cpp_meta_supports_symint(func) - and not ( - has_symbolic_sizes and func in self._unbacked_special_fake_handling_ops - ) + and not (has_symbolic_sizes and func in self._view_fake_tensor_impl_ops) ): from torch._decomp import decomposition_table @@ -2927,10 +2925,8 @@ def 
create_symbolic_nested_int( aten._sparse_coo_tensor_with_dims_and_tensors.default, ) - _unbacked_special_fake_handling_ops = ordered_set( - aten.view.default, - aten._unsafe_view.default, - aten.slice.Tensor, + _view_fake_tensor_impl_ops = ordered_set( + aten.view.default, aten._unsafe_view.default ) def cpp_meta_supports_symint(self, func: OpOverload) -> bool: diff --git a/torch/backends/cuda/__init__.py b/torch/backends/cuda/__init__.py index 87327428461a2..ee8e8234298ae 100644 --- a/torch/backends/cuda/__init__.py +++ b/torch/backends/cuda/__init__.py @@ -15,6 +15,7 @@ "preferred_linalg_library", "preferred_blas_library", "preferred_rocm_fa_library", + "is_ck_sdpa_available", "cufft_plan_cache", "matmul", "SDPAParams", @@ -332,6 +333,16 @@ def preferred_rocm_fa_library( SDPAParams.__name__ = "SDPAParams" +def is_ck_sdpa_available() -> bool: + r""" + .. warning:: This flag is beta and subject to change. + + Returns whether composable_kernel may be used as the backend for + scaled-dot-product-attention. + """ + return torch._C._is_ck_sdpa_available() + + def flash_sdp_enabled(): r""" .. warning:: This flag is beta and subject to change. diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index 0e4429d637888..1f98b89bbfe58 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -2454,6 +2454,14 @@ Call this whenever a new thread is created in order to propagate values from return at::globalContext().getROCmFAPreferredBackend(); }); + py_module.def("_is_ck_sdpa_available", []() { +#ifdef USE_ROCM + return at::globalContext().ckSupported() && at::globalContext().hasCKSDPA(); +#else + return false; +#endif + }); + py_module.def( "_set_sm_carveout_experimental", [](std::optional val) { at::globalContext()._setSMCarveout_EXPERIMENTAL(val); diff --git a/torch/csrc/autograd/profiler_python.cpp b/torch/csrc/autograd/profiler_python.cpp index 7c6792f5e6986..78a0c6eeec7ac 100644 --- a/torch/csrc/autograd/profiler_python.cpp +++ b/torch/csrc/autograd/profiler_python.cpp @@ -704,7 +704,7 @@ class PythonTracer final : public python_tracer::PythonTracerBase { PyFrameObject* frame, int what, PyObject* arg); - + void register_gc_callback() override; void stop() override; void restart() override; std::vector> getEvents( @@ -723,6 +723,8 @@ class PythonTracer final : public python_tracer::PythonTracerBase { PyFrameObject* frame, bool is_startup_frame); + static PyObject* gc_event_callback(PyObject* self, PyObject* args); + void recordCCall( ThreadLocalResults& tls, PyFrameObject* frame, @@ -733,6 +735,7 @@ class PythonTracer final : public python_tracer::PythonTracerBase { std::atomic active_lock_{false}; bool active_{false}; + bool gc_callback_registered_{false}; torch::profiler::impl::RecordQueue* queue_; PyInterpreterState* interpreter_{nullptr}; @@ -973,6 +976,27 @@ const std::vector PythonTracer::interpreterThreads() const { return out; } +// we are only registering on main thread while holding GIL so this should be +// safe +static PyObject* py_gc_callback = nullptr; +// The C function to be called by Python's GC +PyObject* PythonTracer::gc_event_callback(PyObject* self, PyObject* args) { + const char* phase; + PyObject* info; + if (!PyArg_ParseTuple(args, "sO", &phase, &info)) { + return nullptr; + } + PythonTracer* instance = + reinterpret_cast(PyCapsule_GetPointer(self, nullptr)); + if (!instance) { + PyErr_SetString(PyExc_RuntimeError, "Invalid tracer instance"); + return nullptr; + } + instance->queue_->getSubqueue()->emplace_gc_call( + phase, c10::getApproximateTime()); + 
Py_RETURN_NONE; +} + PythonTracer::PythonTracer(torch::profiler::impl::RecordQueue* queue) : queue_(queue), @@ -1045,8 +1069,74 @@ PythonTracer::PythonTracer(torch::profiler::impl::RecordQueue* queue) #endif } +void unregister_gc_callback() { + PyGILState_STATE gstate = PyGILState_Ensure(); + PyObject* gc_module = PyImport_ImportModule("gc"); + if (!gc_module) { + PyErr_Print(); + PyGILState_Release(gstate); + return; + } + PyObject* callbacks = PyObject_GetAttrString(gc_module, "callbacks"); + if (!callbacks || !PyList_Check(callbacks)) { + PyErr_Print(); + Py_XDECREF(gc_module); + Py_XDECREF(callbacks); + PyGILState_Release(gstate); + return; + } + Py_ssize_t idx = PySequence_Index(callbacks, py_gc_callback); + if (idx >= 0) { + PySequence_DelItem(callbacks, idx); + } else { + // Not found, maybe already removed + } + Py_DECREF(callbacks); + Py_DECREF(gc_module); + Py_XDECREF(py_gc_callback); + py_gc_callback = nullptr; + PyGILState_Release(gstate); +} + +void PythonTracer::register_gc_callback() { + PyGILState_STATE gstate = PyGILState_Ensure(); + PyObject* gc_module = PyImport_ImportModule("gc"); + if (!gc_module) { + PyErr_Print(); + PyGILState_Release(gstate); + return; + } + PyObject* callbacks = PyObject_GetAttrString(gc_module, "callbacks"); + if (!callbacks || !PyList_Check(callbacks)) { + PyErr_Print(); + Py_XDECREF(gc_module); + Py_XDECREF(callbacks); + PyGILState_Release(gstate); + return; + } + static PyMethodDef method_def = { + "gc_event_callback", + (PyCFunction)gc_event_callback, + METH_VARARGS, + nullptr}; + PyObject* capsule = PyCapsule_New(this, nullptr, nullptr); + py_gc_callback = PyCFunction_New(&method_def, capsule); + Py_DECREF(capsule); // PyCFunction_New increments refcount + if (PyList_Append(callbacks, py_gc_callback) < 0) { + PyErr_Print(); + } + gc_callback_registered_ = true; + Py_DECREF(callbacks); + Py_DECREF(gc_module); + PyGILState_Release(gstate); +} + void PythonTracer::stop() { gil_and_restore_thread gil; + if (gc_callback_registered_) { + unregister_gc_callback(); + gc_callback_registered_ = false; + } if (active_) { for (const auto thread_state : interpreterThreads()) { if (thread_state->c_profilefunc == &PythonTracer::pyProfileFn) { diff --git a/torch/csrc/autograd/python_variable_indexing.cpp b/torch/csrc/autograd/python_variable_indexing.cpp index aa2a61d594ba9..9dd811eabe794 100644 --- a/torch/csrc/autograd/python_variable_indexing.cpp +++ b/torch/csrc/autograd/python_variable_indexing.cpp @@ -262,9 +262,76 @@ static Variable applySlicing( return result; } +static bool treatSequenceAsTuple(PyObject* index) { + if (PyTuple_Check(index)) { + return true; + } + if (THPVariable_Check(index)) { + return false; + } + // Allow indexing with ndarray if numpy compilation is enabled. An ndarray + // index should not be treated as a tuple since the indexing has a different + // syntax. +#ifdef USE_NUMPY + if (::torch::utils::is_numpy_available() && PyArray_CheckExact(index)) { + return false; + } +#endif + if (!PySequence_Check(index)) { + return false; + } + // This uses a heuristics from NumPy for determining whether to treat + // non-tuple sequences as if they were a tuple. From the NumPy code comments: + // + // "At this point, we're left with a non-tuple, non-array, sequence: + // typically, a list. We use some somewhat-arbitrary heuristics from here + // onwards to decided whether to treat that list as a single index, or a + // list of indices. 
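At the Python level, the heuristic implemented by `treatSequenceAsTuple` below preserves the NumPy-compatible behavior (with a deprecation warning) for short non-tuple sequences that contain slices, tensors, `None`, or `Ellipsis`:

```python
import torch

x = torch.arange(12).reshape(3, 4)
seq = [slice(None), 2]                       # short list containing a slice
assert torch.equal(x[seq], x[:, 2])          # treated as a tuple index (warns)
assert torch.equal(x[tuple(seq)], x[:, 2])   # the recommended spelling
```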
Backwards compatibility only takes effect for short + // sequences - otherwise we treat it like any other scalar." + auto n = PySequence_Size(index); + if (n < 0) { + // Negative size indicates a Python error in the PySequence_Size call. + PyErr_Clear(); + return false; + } + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + if (n >= 32) { + return false; + } + for (Py_ssize_t i = 0; i < n; i++) { + auto obj = THPObjectPtr{PySequence_GetItem(index, i)}; + if (!obj.get()) { + PyErr_Clear(); + return false; + } + if (THPVariable_Check(obj.get()) || PySequence_Check(obj.get()) || + PySlice_Check(obj.get())) { + TORCH_WARN( + "Using a non-tuple sequence for " + "multidimensional indexing is deprecated and will be changed in " + "pytorch 2.9; use x[tuple(seq)] instead of " + "x[seq]. In pytorch 2.9 this will be interpreted as tensor index, " + "x[torch.tensor(seq)], which will result either in an error or a " + "different result"); + return true; + } + if (obj.get() == Py_Ellipsis || obj.get() == Py_None) { + TORCH_WARN( + "Using a non-tuple sequence for " + "multidimensional indexing is deprecated and will be changed in " + "pytorch 2.9; use x[tuple(seq)] instead of " + "x[seq]. In pytorch 2.9 this will be interpreted as tensor index, " + "x[torch.tensor(seq)], which will result either in an error or a " + "different result"); + return true; + } + } + return false; +} + static THPObjectPtr wrapTuple(PyObject* index) { THPObjectPtr res; - if (PyTuple_Check(index)) { + if (treatSequenceAsTuple(index)) { res = PySequence_Tuple(index); } else { res = PyTuple_Pack(1, index); diff --git a/torch/csrc/cuda/MemPool.cpp b/torch/csrc/cuda/MemPool.cpp index feb22e360bb98..b651a4b5e68aa 100644 --- a/torch/csrc/cuda/MemPool.cpp +++ b/torch/csrc/cuda/MemPool.cpp @@ -16,15 +16,12 @@ void THCPMemPool_init(PyObject* module) { .def( py::init([](c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator, bool is_user_created, - bool use_on_oom, - bool symmetric) { + bool use_on_oom) { torch::utils::device_lazy_init(at::kCUDA); return std::make_shared<::c10::cuda::MemPool>( - allocator, is_user_created, use_on_oom, symmetric); + allocator, is_user_created, use_on_oom); })) .def_property_readonly("id", &::c10::cuda::MemPool::id) - .def_property_readonly( - "is_symmetric", &::c10::cuda::MemPool::is_symmetric) .def_property_readonly("allocator", &::c10::cuda::MemPool::allocator) .def("use_count", &::c10::cuda::MemPool::use_count); } diff --git a/torch/csrc/distributed/c10d/Backend.hpp b/torch/csrc/distributed/c10d/Backend.hpp index 0ae3bb62370f9..655e0a5578c29 100644 --- a/torch/csrc/distributed/c10d/Backend.hpp +++ b/torch/csrc/distributed/c10d/Backend.hpp @@ -47,6 +47,7 @@ class TORCH_API Backend : public torch::CustomClassHolder { // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) const std::string backend; std::string group_name; + std::vector global_ranks_in_group; }; explicit Backend(int rank, int size); diff --git a/torch/csrc/distributed/c10d/FlightRecorder.cpp b/torch/csrc/distributed/c10d/FlightRecorder.cpp index bc47f40c6dc61..3c6af83bde296 100644 --- a/torch/csrc/distributed/c10d/FlightRecorder.cpp +++ b/torch/csrc/distributed/c10d/FlightRecorder.cpp @@ -39,7 +39,7 @@ DebugInfoWriter& DebugInfoWriter::getWriter(int rank) { auto cacheDirPath = std::filesystem::path(homeDir + "/.cache/torch"); // Create the .cache directory if it doesn't exist std::filesystem::create_directories(cacheDirPath); - auto defaultLocation = cacheDirPath / "nccl_trace_rank_"; + auto defaultLocation = 
cacheDirPath / "comm_lib_trace_rank_"; // For internal bc compatibility, we keep the old the ENV check. std::string fileNamePrefix = getCvarString( diff --git a/torch/csrc/distributed/c10d/FlightRecorder.hpp b/torch/csrc/distributed/c10d/FlightRecorder.hpp index 768889015fb75..b0974495a87a9 100644 --- a/torch/csrc/distributed/c10d/FlightRecorder.hpp +++ b/torch/csrc/distributed/c10d/FlightRecorder.hpp @@ -20,10 +20,10 @@ namespace c10d { // (minor when adding fields, major when changing existing fields) // Also update both JSON and Pickle dumps to make use of the newly defined // field(s). -DEFINE_CONSTANT(version_val, "2.9") +DEFINE_CONSTANT(version_val, "2.10") DEFINE_CONSTANT(entries_key, "entries") DEFINE_CONSTANT(nccl_comm_key, "nccl_comm_state") -DEFINE_CONSTANT(nccl_version_key, "nccl_version") +DEFINE_CONSTANT(comm_lib_version_key, "comm_lib_version") DEFINE_CONSTANT(version_key, "version") DEFINE_CONSTANT(pg_config_key, "pg_config") DEFINE_CONSTANT(pg_status_key, "pg_status") @@ -179,7 +179,7 @@ struct FlightRecorder { std::map> all_pg_status_ = {}; std::map, std::vector> pg_name_to_ranks_ = {}; - std::string nccl_version_; + std::string comm_lib_version_; std::optional record( size_t pg_id, @@ -200,7 +200,7 @@ struct FlightRecorder { const std::tuple& pg_name, std::vector ranks); - void record_accelerator_version(const std::string nccl_version); + void record_accelerator_version(const std::string comm_lib_version); void update_state(Entry& r); diff --git a/torch/csrc/distributed/c10d/FlightRecorderDetail.hpp b/torch/csrc/distributed/c10d/FlightRecorderDetail.hpp index 608b9157ac391..473372fd44b4c 100644 --- a/torch/csrc/distributed/c10d/FlightRecorderDetail.hpp +++ b/torch/csrc/distributed/c10d/FlightRecorderDetail.hpp @@ -128,12 +128,12 @@ void FlightRecorder::record_pg_ranks( template void FlightRecorder::record_accelerator_version( - const std::string nccl_version) { + const std::string comm_lib_version) { if (!enabled_) { return; } std::lock_guard guard(mutex_); - nccl_version_ = std::move(nccl_version); + comm_lib_version_ = std::move(comm_lib_version); } template @@ -425,7 +425,7 @@ std::string FlightRecorder::dump_json( bool onlyActive) { json result; result[version_key_str] = version_val_str; - result[nccl_version_key_str] = nccl_version_; + result[comm_lib_version_key_str] = comm_lib_version_; result[pg_config_key_str] = getPgConfigJson(); result[pg_status_key_str] = getPgStatusJson(); @@ -522,7 +522,7 @@ std::string FlightRecorder::dump( // common values result.insert(version_key, version_val); result.insert(pg_config_key, getPgConfig()); - result.insert(nccl_version_key_str, nccl_version_); + result.insert(comm_lib_version_key_str, comm_lib_version_); result.insert(pg_status_key, getPgStatus()); // collective trace diff --git a/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp b/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp index 2bb10b2fecfd6..4297807f2e8b9 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp @@ -255,7 +255,6 @@ class TORCH_API ProcessGroupGloo : public Backend { return c10::make_intrusive(timeout); } - std::vector global_ranks_in_group; std::vector> devices; int threads; }; diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp index 655193e8f3186..339a8c147d5ac 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp @@ -291,8 +291,12 @@ bool 
shouldAllCommunicatorsRegisterAllTensors() { // - This map has also to be maintained as global variable since the register // hooks are called outside the scope of any PG, thus we need traverse // communicators in all PGs. -using MemPoolSet = std:: - unordered_set>; + +// MemPoolSet has ids of mempools used with this communicator, and whether they +// were registered with window APIs or not +using MemPoolSet = std::unordered_set< + std::tuple, + c10::hash>>; static std::unordered_map, MemPoolSet> ncclCommMemPoolMap; static std::mutex ncclCommMemPoolMapMutex; @@ -310,10 +314,23 @@ static void cacheAllocatorRegisterHook( std::lock_guard lock(ncclCommMemPoolMapMutex); for (auto& [ncclComm, memPools] : ncclCommMemPoolMap) { if (te.device_ == ncclComm->getDeviceIndex()) { - if (shouldAllCommunicatorsRegisterAllTensors() || - memPools.find(te.mempool_) != memPools.end()) { + bool symm = false; + bool should_register = shouldAllCommunicatorsRegisterAllTensors(); + auto it = + std::find_if(memPools.begin(), memPools.end(), [&](const auto& tup) { + return std::get<0>(tup) == te.mempool_; + }); + if (it != memPools.end()) { + should_register = true; + symm = std::get<1>(*it); + } + if (should_register) { // NOLINTNEXTLINE(performance-no-int-to-ptr) - ncclComm->registerSegment(reinterpret_cast(te.addr_), te.size_); + ncclComm->registerSegment( + reinterpret_cast(te.addr_), + te.size_, + /*errorOnRereg*/ false, + /*window*/ symm); } } } @@ -330,10 +347,19 @@ static void cacheAllocatorDeregisterHook( std::lock_guard lock(ncclCommMemPoolMapMutex); for (auto& [ncclComm, memPools] : ncclCommMemPoolMap) { if (te.device_ == ncclComm->getDeviceIndex()) { - if (shouldAllCommunicatorsRegisterAllTensors() || - memPools.find(te.mempool_) != memPools.end()) { + bool symm = false; + bool should_register = shouldAllCommunicatorsRegisterAllTensors(); + auto it = + std::find_if(memPools.begin(), memPools.end(), [&](const auto& tup) { + return std::get<0>(tup) == te.mempool_; + }); + if (it != memPools.end()) { + should_register = true; + symm = std::get<1>(*it); + } + if (should_register) { // NOLINTNEXTLINE(performance-no-int-to-ptr) - ncclComm->deregisterSegment(reinterpret_cast(te.addr_)); + ncclComm->deregisterSegment(reinterpret_cast(te.addr_), symm); } } } @@ -968,8 +994,9 @@ ProcessGroupNCCL::ProcessGroupNCCL( const std::string OFF = "OFF"; std::string torch_distributed_debug = getCvarString({"TORCH_DISTRIBUTED_DEBUG"}, OFF.c_str()); - LOG(INFO) << logPrefix() << "ProcessGroupNCCL initialization options: " - << "size: " << size << ", global rank: " << globalRank() + LOG(INFO) << logPrefix() + << "ProcessGroupNCCL initialization options: " << "size: " << size + << ", global rank: " << globalRank() << ", TIMEOUT(ms): " << options_->timeout.count() << ", USE_HIGH_PRIORITY_STREAM: " << options_->is_high_priority_stream @@ -1089,7 +1116,7 @@ ErrorType ProcessGroupNCCL::getError() { return error_; } -void ProcessGroupNCCL::registerMemPool(c10::cuda::MemPool* pool) { +void ProcessGroupNCCL::registerMemPool(c10::cuda::MemPool* pool, bool symm) { const auto key = std::to_string(pool->device()); LOG(INFO) << logPrefix() << "Performing NCCL user buffer registration for all buffers in " @@ -1101,24 +1128,15 @@ void ProcessGroupNCCL::registerMemPool(c10::cuda::MemPool* pool) { DistBackendError, "NCCL communicator has not been initialized before mem pool creation. 
You can pass `device_id` to init_process_group -- one way of eager initialization -- to work around this issue"); } - TORCH_INTERNAL_ASSERT(ncclComm != nullptr); { std::lock_guard lock(ncclCommMemPoolMapMutex); auto iter = ncclCommMemPoolMap.find(ncclComm); - iter->second.insert(pool->id()); + iter->second.insert(std::make_tuple(pool->id(), symm)); } // We must ensure we're listening for allocator trace events in order to // register future segments allocated in this pool (this call is idempotent). attachAllocatorHooks(); auto snapshot = c10::cuda::CUDACachingAllocator::snapshot(pool->id()); - // TODO: - // if(pool->is_symmetric()) { - // Allgather to verify len(mempool.snapshot.segments) matches across GPUs - // Allgather to verify mempool.alloc_request_counter matches across GPUs - // add alloc_request_counter per mempool (How many allocations a mempool has - // served during its lifetime) this should guarantee pool is used in a - // symmetric/SPMD manner - // } for (const auto& segmentInfo : snapshot.segments) { TORCH_INTERNAL_ASSERT( segmentInfo.device == pool->device(), @@ -1128,31 +1146,35 @@ void ProcessGroupNCCL::registerMemPool(c10::cuda::MemPool* pool) { reinterpret_cast(segmentInfo.address), segmentInfo.total_size, /*errorOnRereg=*/false, // ignores reregistration error - /*window=*/pool->is_symmetric()); // whether to use NCCL symmetric - // memory + /*window*/ symm); // whether to use NCCL symmetric memory } } void ProcessGroupNCCL::deregisterMemPool(c10::cuda::MemPool* pool) { const auto key = std::to_string(pool->device()); - auto device = at::Device(at::DeviceType::CUDA, pool->device()); LOG(INFO) << logPrefix() << "Performing NCCL user buffer deregistration for all buffers in " << "MemPool: " << pool->id() << ", device index: " << key << ", i am " << this; auto ncclComm = getNCCLComm(key); if (ncclComm == nullptr) { - // HACK: currently we are using this function for NVLS - // reductions, and that's why using OpType::ALLREDUCE. - // If we end up using this API for zero-copy P2P, we might - // need to refactor and account for different OpType. - ncclComm = initNCCLComm(key, device, OpType::ALLREDUCE); + C10_THROW_ERROR( + DistBackendError, + "NCCL communicator has not been initialized before mem pool creation. 
You can pass `device_id` to init_process_group -- one way of eager initialization -- to work around this issue"); } - TORCH_INTERNAL_ASSERT(ncclComm != nullptr); + bool symm; { std::lock_guard lock(ncclCommMemPoolMapMutex); auto iter = ncclCommMemPoolMap.find(ncclComm); - iter->second.erase(pool->id()); + auto mempool_it = std::find_if( + iter->second.begin(), iter->second.end(), [&](const auto& tup) { + return std::get<0>(tup) == pool->id(); + }); + TORCH_CHECK( + mempool_it != iter->second.end(), + "Trying to unregister not previously registered pool"); + symm = std::get<1>(*mempool_it); + iter->second.erase(mempool_it); } auto snapshot = c10::cuda::CUDACachingAllocator::snapshot(pool->id()); for (const auto& segmentInfo : snapshot.segments) { @@ -1161,7 +1183,7 @@ void ProcessGroupNCCL::deregisterMemPool(c10::cuda::MemPool* pool) { "Mismatch between CUDA memory segment device and pool's device"); // NOLINTNEXTLINE(performance-no-int-to-ptr) ncclComm->deregisterSegment( - reinterpret_cast(segmentInfo.address), pool->is_symmetric()); + reinterpret_cast(segmentInfo.address), symm); } } @@ -5749,7 +5771,7 @@ at::Tensor ProcessGroupNCCL::allocateTensor( // Pool is created memPool_ = std::make_unique(allocator); // Register so that we call ncclCommRegister on all new allocations - registerMemPool(memPool_.get()); + registerMemPool(memPool_.get(), /*symmetric*/ false); LOG(INFO) << logPrefix() << "Created memory pool"; } diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp index 7304a4a21b559..f7a3a28caceb3 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp @@ -545,7 +545,6 @@ class TORCH_API ProcessGroupNCCL : public Backend { // the int value of `NCCL_SPLIT_NOCOLOR` (-1) instead. int split_color{-2}; #endif - std::vector global_ranks_in_group; }; // Helper class related to TORCH_NCCL_DESYNC_DEBUG @@ -1003,7 +1002,7 @@ class TORCH_API ProcessGroupNCCL : public Backend { // Performs NCCL user buffer registration for all buffers in // the given MemPool - void registerMemPool(c10::cuda::MemPool* pool); + void registerMemPool(c10::cuda::MemPool* pool, bool symm = false); // Performs NCCL user buffer de-registration for all buffers in // the given MemPool diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index c39957c2e8386..60714977ef4ba 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -1198,6 +1198,12 @@ This class does not support ``__members__`` property.)"); py::arg("src_rank"), py::arg("channel") = 0, py::arg("timeout_ms") = 0) + .def( + "copy_buffer", + &SymmetricMemory::copy_buffer, + py::arg("src"), + py::arg("dst"), + py::arg("size")) // Util functions that are often used together with symmetric memory but // not necessarily directly on symmetric memory. .def_static( @@ -3086,7 +3092,11 @@ options :class:`~torch.distributed.ProcessGroupNCCL.Options`). 
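With the pybind changes just below, `group_name` and `global_ranks_in_group` are exposed on the base `Backend::Options` binding rather than on the per-backend (Gloo/NCCL) option classes, so they remain reachable from the NCCL options object through inheritance. A hedged usage sketch, assuming a CUDA/NCCL build; field names follow the `def_readwrite` calls in this diff:

```python
# Sketch only: how the relocated option fields would be set from Python.
import torch.distributed as dist

opts = dist.ProcessGroupNCCL.Options(is_high_priority_stream=True)
opts.group_name = "tp_group"               # now defined on Backend.Options
opts.global_ranks_in_group = [0, 1, 2, 3]  # likewise inherited from the base
# opts can then be passed as pg_options= to new_group()/init_process_group().
```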
py::arg("backend"), py::arg("timeout") = kProcessGroupDefaultTimeout) .def_readonly("backend", &::c10d::Backend::Options::backend) - .def_readwrite("_timeout", &::c10d::Backend::Options::timeout); + .def_readwrite("_timeout", &::c10d::Backend::Options::timeout) + .def_readwrite( + "global_ranks_in_group", + &::c10d::Backend::Options::global_ranks_in_group) + .def_readwrite("group_name", &::c10d::Backend::Options::group_name); #ifdef USE_C10D_GLOO static const std::string GLOO_SOCKET_IFNAME_ENV = "GLOO_SOCKET_IFNAME"; @@ -3102,12 +3112,7 @@ options :class:`~torch.distributed.ProcessGroupNCCL.Options`). processGroupGloo, "_Options", backendOptions) .def(py::init<>()) .def_readwrite("_devices", &::c10d::ProcessGroupGloo::Options::devices) - .def_readwrite("_threads", &::c10d::ProcessGroupGloo::Options::threads) - .def_readwrite( - "global_ranks_in_group", - &::c10d::ProcessGroupGloo::Options::global_ranks_in_group) - .def_readwrite( - "group_name", &::c10d::ProcessGroupGloo::Options::group_name); + .def_readwrite("_threads", &::c10d::ProcessGroupGloo::Options::threads); processGroupGloo .def_static( @@ -3335,7 +3340,11 @@ options :class:`~torch.distributed.ProcessGroupNCCL.Options`). .def( "perform_nocolor_split", &::c10d::ProcessGroupNCCL::performNocolorSplit) - .def("register_mem_pool", &::c10d::ProcessGroupNCCL::registerMemPool) + .def( + "register_mem_pool", + &::c10d::ProcessGroupNCCL::registerMemPool, + py::arg("pool"), + py::arg("symm") = false) .def( "deregister_mem_pool", &::c10d::ProcessGroupNCCL::deregisterMemPool) @@ -3400,6 +3409,11 @@ for details. #ifdef NCCL_HAS_NVLS_CTAS .def_readwrite("nvls_ctas", &ncclConfig_t::nvlsCTAs) #endif + .def( + "unsafe_get_ptr", + [](const ncclConfig_t& self) { + return reinterpret_cast(&self); + }) .def_property( "net_name", [](const ncclConfig_t& self) { return self.netName; }, @@ -3464,11 +3478,6 @@ Example:: "split_from", &::c10d::ProcessGroupNCCL::Options::split_from) .def_readwrite( "split_color", &::c10d::ProcessGroupNCCL::Options::split_color) - .def_readwrite( - "global_ranks_in_group", - &::c10d::ProcessGroupNCCL::Options::global_ranks_in_group) - .def_readwrite( - "group_name", &::c10d::ProcessGroupNCCL::Options::group_name) .def( "__copy__", [](const ::c10d::ProcessGroupNCCL::Options& self) { @@ -3507,17 +3516,49 @@ Example:: .def( py::init([](const c10::intrusive_ptr<::c10d::Store>& store, int rank, - int size) { + int size, + c10::intrusive_ptr<::c10d::ProcessGroupXCCL::Options> + options) { // gil_scoped_release is not safe as a call_guard in init. 
// https://github.com/pybind/pybind11/issues/5473 py::gil_scoped_release nogil{}; - return c10::make_intrusive<::c10d::ProcessGroupXCCL>( - store, rank, size); + store, rank, size, std::move(options)); }), py::arg("store"), py::arg("rank"), - py::arg("size")); + py::arg("size"), + py::arg("options"), + R"(Create a new ProcessGroupXCCL instance.)"); + + intrusive_ptr_class_<::c10d::ProcessGroupXCCL::Options>( + processGroupXCCL, "Options", backendOptions) + .def(py::init<>()); + module + .def( + "_dump_xccl_trace", + [](std::optional includeCollectives, + std::optional includeStackTraces, + std::optional onlyActive) { + return py::bytes(::c10d::dump_xccl_trace( + includeCollectives.value_or(true), + includeStackTraces.value_or(true), + onlyActive.value_or(false))); + }, + py::arg("includeCollectives") = std::optional(), + py::arg("includeStackTraces") = std::optional(), + py::arg("onlyActive") = std::optional(), + R"( +Arguments: + includeCollectives(bool, optional): Whether to include collective work traces. Default is True. + includeStackTraces(bool, optional): Whether to include stacktraces in the collective work traces. Default is True. + onlyActive (bool, optional): Whether to only include active collective work traces. Default is False. +Returns: + Stringified pickle work traces. + Default settings return everything - i.e. contains XCCL comm dumps and collective traces. + )") + .def("get_xccl_version", [] { return ::c10d::getXcclVersion(); }); + #endif #ifdef USE_C10D_UCC diff --git a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu index b3b48b35dee32..110ff4606a019 100644 --- a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu +++ b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu @@ -812,16 +812,8 @@ c10::intrusive_ptr make_symm_mem( } // namespace c10::intrusive_ptr CUDASymmetricMemoryAllocator::rendezvous( - const at::Tensor& tensor, + void* ptr, const std::optional& group_name) { - // TODO: currently using `storage().data_ptr()` to maintain the same behavior - // as before, but we should use `data_ptr()` instead - auto ptr = tensor.storage().data_ptr().get(); - - // Today this would still find the ptr in the map because one allocation - // matches one tensor. But will break once we enable MemPool. - // TODO: implement a customized `find` that searches for the allocation that - // contains ptr. 
auto block = find_block(ptr); if (block == nullptr) { return nullptr; diff --git a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.hpp b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.hpp index e047cbd24af6c..f61d8f9622a7b 100644 --- a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.hpp +++ b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.hpp @@ -115,7 +115,7 @@ class CUDASymmetricMemoryAllocator : public SymmetricMemoryAllocator { void free(void* ptr) override; size_t get_alloc_size(void* ptr) override; c10::intrusive_ptr rendezvous( - const at::Tensor& tensor, + void* ptr, const std::optional& group_name) override; bool has_multicast_support(int device_idx) override; c10::DeviceType supported_device_type() override; diff --git a/torch/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu b/torch/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu index 44d0c99faa7cd..55695ca27c8ec 100644 --- a/torch/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu +++ b/torch/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu @@ -40,32 +40,17 @@ struct NCCLAllocation { class NCCLSymmetricMemory : public SymmetricMemory { public: NCCLSymmetricMemory( - const at::Tensor& tensor, std::shared_ptr allocation, const std::string& group_name, ncclWindow_t handle, ncclWindow_t signal_handle) - : tensor_weak_ptr_(tensor.getIntrusivePtr()), - allocation_(allocation), + : allocation_(allocation), + buffer_size_(allocation->buffer_size), device_idx_(allocation->device_idx), group_name_(group_name), handle_(handle), signal_handle_(signal_handle) { c10::cuda::CUDAGuard guard(device_idx_); - // `ptr` is tensor data's starting address - auto ptr = tensor.data_ptr(); - // Buffer size is rest of space available after ptr (this field may not be - // important in future thus subject to removal) - buffer_size_ = allocation->buffer_size - - (reinterpret_cast(ptr) - reinterpret_cast(allocation->ptr)); - - GroupInfo& group_info = get_group_info(group_name_); - rank_ = group_info.rank; - world_size_ = group_info.world_size; - - buffers_.reserve(world_size_); - buffers_[rank_] = ptr; - // TODO: Fill in `buffers_[peer]` once NCCL API is ready. // We need some API like nvshmem_extension::nvshmem_ptr() // put API to get the reference of remote memory. @@ -90,7 +75,6 @@ class NCCLSymmetricMemory : public SymmetricMemory { return signal_pads_dev_; } - // This API is subject to removal size_t get_buffer_size() override { return buffer_size_; } @@ -213,13 +197,7 @@ class NCCLSymmetricMemory : public SymmetricMemory { return rank_to_global_rank_dev_; }; - bool expired() const { - // True if the tensor has been deallocated - return tensor_weak_ptr_.expired(); - } - private: - c10::weak_intrusive_ptr tensor_weak_ptr_; std::shared_ptr allocation_; size_t buffer_size_; // TODO: We need to finalize what booking variables we need for nccl backend. @@ -278,37 +256,19 @@ class NCCLSymmetricMemoryAllocator : public SymmetricMemoryAllocator { }; c10::intrusive_ptr rendezvous( - const at::Tensor& tensor, + void* ptr, const std::optional& group_name) override { TORCH_CHECK(group_name.has_value(), "group_name must be provided"); - - // Using raw address of TensorImpl as a unique key of tensor in `symm_mems_` - // map, because other addresses such as `tensor.data_ptr()` or - // `tensor.storage().data_ptr()` may been shared by views and slices. 
- auto tensor_raw_ptr = (void*)tensor.unsafeGetTensorImpl(); - auto symm_mem_key = std::make_tuple(tensor_raw_ptr, *group_name); { - auto it = symm_mems_.find(symm_mem_key); + auto it = symm_mems_.find(std::make_tuple(ptr, *group_name)); if (it != symm_mems_.end()) { - auto symm_mem = it->second; - if (!symm_mem->expired()) { - return symm_mem; - } - // Otherwise, the tensor in `symm_mems_` map must have been deallocated, - // and we are facing a new tensor that happens to have the same raw - // TensorImpl* address. We would go thru a new insert below. + return it->second; } } - - // `ptr` is tensor data's starting address - auto ptr = tensor.data_ptr(); - // Today this would still find the ptr in the map because one allocation - // matches one tensor. But will break once we enable MemPool. - // TODO: implement a customized `find` that searches for the allocation that - // contains ptr. auto it = allocations_.find(ptr); TORCH_CHECK(it != allocations_.end(), "memory needs to be first allocated before calling rendezvous."); + auto group = resolve_process_group(group_name.value()); auto alloc = it->second; c10::cuda::CUDAGuard guard(alloc->device_idx); @@ -353,9 +313,9 @@ class NCCLSymmetricMemoryAllocator : public SymmetricMemoryAllocator { comm)); auto symm_mem = - c10::make_intrusive(tensor, alloc, *group_name, std::move(handle), std::move(signal_handle)); + c10::make_intrusive(alloc, *group_name, std::move(handle), std::move(signal_handle)); - symm_mems_[symm_mem_key] = symm_mem; + symm_mems_[std::make_tuple(ptr, *group_name)] = symm_mem; return symm_mem; }; @@ -377,7 +337,7 @@ class NCCLSymmetricMemoryAllocator : public SymmetricMemoryAllocator { ptr_to_symm_mem_; std::unordered_map> allocations_; - std::map, c10::intrusive_ptr> + std::map, c10::intrusive_ptr> symm_mems_; }; diff --git a/torch/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu b/torch/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu index 2f4e982eb4a3d..d9f71e4cddf08 100644 --- a/torch/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu +++ b/torch/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu @@ -1,5 +1,3 @@ -#include - #include #include #include @@ -48,21 +46,14 @@ struct NVSHMEMAllocation { class NVSHMEMSymmetricMemory : public SymmetricMemory { public: NVSHMEMSymmetricMemory( - const at::Tensor& tensor, std::shared_ptr allocation, const std::string& group_name) - : tensor_weak_ptr_(tensor.getIntrusivePtr()), - allocation_(allocation), + : allocation_(allocation), + buffer_size_(allocation->buffer_size), device_idx_(allocation->device_idx), group_name_(group_name) { // For logging only static int exchanged_n_times = 0; - // `ptr` is tensor data's starting address - auto ptr = tensor.data_ptr(); - // Buffer size is rest of space available after ptr (this field may not be - // important in future thus subject to removal) - buffer_size_ = allocation->buffer_size - - (reinterpret_cast(ptr) - reinterpret_cast(allocation->ptr)); c10::cuda::CUDAGuard guard(device_idx_); auto global_rank = get_group_info("0").rank; @@ -87,7 +78,7 @@ class NVSHMEMSymmetricMemory : public SymmetricMemory { rank_to_global_rank_ = group_info.rank_to_global_rank; for (int r = 0; r < world_size_; ++r) { buffers_.push_back(nvshmem_ptr( - ptr, rank_to_global_rank_[r])); + allocation->ptr, rank_to_global_rank_[r])); } // TODO: use the same allocation for signal pad @@ -143,7 +134,6 @@ class NVSHMEMSymmetricMemory : public SymmetricMemory { return signal_pads_dev_; } - // This API is subject to removal size_t 
get_buffer_size() override { return buffer_size_; } @@ -264,13 +254,7 @@ class NVSHMEMSymmetricMemory : public SymmetricMemory { return rank_to_global_rank_dev_; }; - bool expired() const { - // True if the tensor has been deallocated - return tensor_weak_ptr_.expired(); - } - private: - c10::weak_intrusive_ptr tensor_weak_ptr_; std::shared_ptr allocation_; size_t buffer_size_; std::vector buffers_; @@ -386,53 +370,21 @@ class NVSHMEMSymmetricMemoryAllocator : public SymmetricMemoryAllocator { }; c10::intrusive_ptr rendezvous( - const at::Tensor& tensor, + void* ptr, const std::optional& group_name) override { TORCH_CHECK(group_name.has_value()); - - // Using raw address of TensorImpl as a unique key of tensor in `symm_mems_` - // map, because other addresses such as `tensor.data_ptr()` or - // `tensor.storage().data_ptr()` may been shared by views and slices. - auto tensor_raw_ptr = (void*)tensor.unsafeGetTensorImpl(); - auto symm_mem_key = std::make_tuple(tensor_raw_ptr, *group_name); { - auto it = symm_mems_.find(symm_mem_key); + auto it = symm_mems_.find(std::make_tuple(ptr, *group_name)); if (it != symm_mems_.end()) { - auto symm_mem = it->second; - if (!symm_mem->expired()) { - return symm_mem; - } - // Otherwise, the tensor in `symm_mems_` map must have been deallocated, - // and we are facing a new tensor that happens to have the same raw - // TensorImpl* address. We would go thru a new insert below. + return it->second; } } - - // This is the first time the tenosr gets rendezvous'ed. We need to first - // search for an allocations that backs it (below). - LOG(INFO) << tensor.device() << ": rendezvousing tensor " << tensor_raw_ptr - << ", size " << tensor.sizes() << ", over group " << *group_name; - - // `ptr` is tensor data's starting address - auto ptr = tensor.data_ptr(); - // [Note] In case of MemPool or when the tensor is a slice of another, the - // tensor's data_ptr() may not match exactly with an allocation's base - // address. Thus we perform the search by testing if the tensor's data_ptr - // is within an allocation's range. 
- auto it = std::find_if(allocations_.begin(), allocations_.end(), - [&](const auto& pair){ - auto& allocation = pair.second; - auto ptr_int = reinterpret_cast(ptr); - auto base_ptr = reinterpret_cast(allocation->ptr); - return ptr_int >= base_ptr && ptr_int < base_ptr + allocation->buffer_size; }); - TORCH_CHECK(it != allocations_.end(), - "Pointer not within any SymmetricMemory allocation, " - "is the tensor allocated from SymmetricMemory?"); - + auto it = allocations_.find(ptr); + TORCH_CHECK(it != allocations_.end()); auto symm_mem = - c10::make_intrusive(tensor, it->second /*allocation*/, *group_name); + c10::make_intrusive(it->second, *group_name); - symm_mems_[symm_mem_key] = symm_mem; + symm_mems_[std::make_tuple(ptr, *group_name)] = symm_mem; return symm_mem; }; @@ -451,7 +403,7 @@ class NVSHMEMSymmetricMemoryAllocator : public SymmetricMemoryAllocator { private: std::unordered_map> allocations_; - std::map, c10::intrusive_ptr> + std::map, c10::intrusive_ptr> symm_mems_; }; diff --git a/torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.cpp b/torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.cpp index 6dcff136d7c85..2831a4416de9d 100644 --- a/torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.cpp +++ b/torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.cpp @@ -253,7 +253,7 @@ TORCH_API c10::intrusive_ptr rendezvous( const at::Tensor& tensor, const std::optional& group_name) { auto allocator = get_allocator(tensor.device().type()); - return allocator->rendezvous(tensor, group_name); + return allocator->rendezvous(tensor.storage().data_ptr().get(), group_name); } TORCH_API bool has_multicast_support( diff --git a/torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp b/torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp index 556a772431373..cd45572ceaefa 100644 --- a/torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp +++ b/torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp @@ -68,6 +68,7 @@ class TORCH_API SymmetricMemory : public c10::intrusive_ptr_target { virtual void barrier(int channel, size_t timeout_ms) = 0; virtual void put_signal(int dst_rank, int channel, size_t timeout_ms) = 0; virtual void wait_signal(int src_rank, int channel, size_t timeout_ms) = 0; + virtual void copy_buffer(at::Tensor src, at::Tensor dst , size_t size) = 0; virtual int get_rank() = 0; virtual int get_world_size() = 0; @@ -93,7 +94,7 @@ class SymmetricMemoryAllocator : public c10::intrusive_ptr_target { virtual void free(void* ptr) = 0; virtual size_t get_alloc_size(void* ptr) = 0; virtual c10::intrusive_ptr rendezvous( - const at::Tensor& tensor, + void* ptr, const std::optional& group_name) = 0; virtual bool has_multicast_support(int device_idx) = 0; virtual c10::DeviceType supported_device_type() = 0; diff --git a/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp b/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp index 01d56f45f6db4..0d53d100cee7d 100644 --- a/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp +++ b/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp @@ -219,11 +219,7 @@ bool IntraNodeComm::rendezvous() { groupName, static_cast(rank_), static_cast(worldSize_), store_); auto allocator = get_allocator(c10::DeviceType::CUDA); symmetricMemoryPtr_ = allocator->alloc(bufferSize_, deviceIdx_, groupName); - // Rendezvous API now takes a tensor instead of raw pointer, thus we create a - // temporary wrapper here - auto tensor_wrap = at::from_blob( - symmetricMemoryPtr_, {static_cast(bufferSize_)}, at::kByte); - symmetricMemory_ = 
allocator->rendezvous(tensor_wrap, std::nullopt); + symmetricMemory_ = allocator->rendezvous(symmetricMemoryPtr_, std::nullopt); isInitialized_ = true; return true; } diff --git a/torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu b/torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu index 55ebebb28e244..84b4eade99eb2 100644 --- a/torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu +++ b/torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu @@ -10,7 +10,11 @@ #include // NVSHMEM minimum SM arch +#if CUDA_VERSION >= 13000 +#define _NVSHMEM_MIN_SM_ARCH 800 +#else #define _NVSHMEM_MIN_SM_ARCH 700 +#endif // Some NVSHMEM device APIs do not compile on older SM archs #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < _NVSHMEM_MIN_SM_ARCH) diff --git a/torch/csrc/profiler/collection.cpp b/torch/csrc/profiler/collection.cpp index b9abd5ae508f3..c7f759cd077c9 100644 --- a/torch/csrc/profiler/collection.cpp +++ b/torch/csrc/profiler/collection.cpp @@ -613,6 +613,7 @@ std::string Result::name() const { ATTRIBUTE(OutOfMemory, std::string("[OutOfMemory]")), ATTRIBUTE(PyCall, toString(e)), ATTRIBUTE(PyCCall, std::string(e.function_name_.str())), + ATTRIBUTE(PythonGC, std::string("Python GC")), [](const auto& e) -> std::string { return e.name_; })); } @@ -631,6 +632,7 @@ libkineto::ActivityType Result::kinetoType() const { ATTRIBUTE(OutOfMemory, libkineto::ActivityType::CPU_INSTANT_EVENT), ATTRIBUTE(PyCall, libkineto::ActivityType::PYTHON_FUNCTION), ATTRIBUTE(PyCCall, libkineto::ActivityType::PYTHON_FUNCTION), + ATTRIBUTE(PythonGC, libkineto::ActivityType::PYTHON_FUNCTION), ATTRIBUTE(Kineto, e.activity_type_))); } @@ -650,6 +652,7 @@ int64_t Result::endTimeNS() const { ATTRIBUTE(Allocation, start_time_ns_), ATTRIBUTE(OutOfMemory, start_time_ns_), ATTRIBUTE(Kineto, start_time_ns_ + e.duration_ns_), + ATTRIBUTE(PythonGC, start_time_ns_ + e.duration_ns_), [&](const auto& e) -> int64_t { return e.end_time_ns_; })); // In rare cases we're willing to tolerate ops which are missing an end time @@ -700,6 +703,9 @@ RecordQueue::RecordQueue( activities_{std::move(activities)} { if (tracePython()) { python_tracer_ = python_tracer::PythonTracerBase::make(this); + if (getPythonGcEvents()) { + python_tracer_->register_gc_callback(); + } } } @@ -707,6 +713,10 @@ bool RecordQueue::tracePython() const { return config_.with_stack && activities_.count(ActivityType::CPU); } +bool RecordQueue::getPythonGcEvents() const { + return config_.experimental_config.record_python_gc_info; +} + ThreadLocalSubqueue* RecordQueue::getSubqueue() { // In the most common case, a thread will want to write to the same sub-queue // that it wrote to last call. 
The only time that isn't true is if: @@ -1488,6 +1498,31 @@ RecordQueue::getRecords( queue.allocations_.clear(); materialize(queue.ooms_); + std::optional pending_start; + for (auto& e : queue.pythongc_) { + if (e.first.find("start") != std::string::npos) { + pending_start = e.second; + } else if (e.first.find("stop") != std::string::npos) { + if (pending_start.has_value()) { + out.emplace_back(Result::create( + /*start_time_ns_=*/converter(pending_start.value()), + /*start_tid_=*/queue.tid(), + /*kineto_info_=*/queue.kineto_info(), + /*extra_fields_=*/ + // NOLINTNEXTLINE + ExtraFields{ + e.first, + converter(e.second) - converter(pending_start.value())})); + pending_start.reset(); + } else { + // Handle the case where "stop" is found without a matching "start" + // For example, you might want to log a warning or take other action: + LOG(WARNING) << R"("stop" event found without a matching "start": )" + << e.first; + } + } + } + for (auto& i : queue.py_calls_) { python_enters.push_back( {i.first, queue.tid(), queue.kineto_info(), converter(i.second)}); diff --git a/torch/csrc/profiler/collection.h b/torch/csrc/profiler/collection.h index 59ebda87a176e..847819f971957 100644 --- a/torch/csrc/profiler/collection.h +++ b/torch/csrc/profiler/collection.h @@ -34,7 +34,8 @@ enum class EventType : uint8_t { OutOfMemory, PyCall, PyCCall, - Kineto + Kineto, + PythonGC }; // ============================================================================ @@ -191,6 +192,12 @@ struct ExtraFields { jit_modules_t jit_modules_; }; +template <> +struct ExtraFields { + std::string phase; + int64_t duration_ns_; +}; + template <> struct ExtraFields { using raw_event_t = std::pair; @@ -415,7 +422,8 @@ struct TORCH_API Result : public std::enable_shared_from_this { ExtraFields, ExtraFields, ExtraFields, - ExtraFields> + ExtraFields, + ExtraFields> extra_fields_; std::weak_ptr parent_; @@ -549,6 +557,11 @@ class TORCH_API ThreadLocalSubqueue { py_calls_.emplace_back(std::forward(args)...); } + template + void emplace_gc_call(Args&&... 
args) { + pythongc_.emplace_back(std::forward(args)...); + } + uint64_t tid() const { return tid_; } @@ -639,6 +652,9 @@ class TORCH_API ThreadLocalSubqueue { std::pair, BlockSize> py_calls_; + // gc with_stack (Python) + AppendOnlyList, BlockSize> + pythongc_; }; class TORCH_API RecordQueue { @@ -646,6 +662,7 @@ class TORCH_API RecordQueue { RecordQueue(ProfilerConfig config, std::set activities); bool tracePython() const; + bool getPythonGcEvents() const; ThreadLocalSubqueue* getSubqueue(); void stop(); void restart(); diff --git a/torch/csrc/profiler/orchestration/observer.cpp b/torch/csrc/profiler/orchestration/observer.cpp index 18b792a1abe97..5ef0690d18115 100644 --- a/torch/csrc/profiler/orchestration/observer.cpp +++ b/torch/csrc/profiler/orchestration/observer.cpp @@ -21,6 +21,7 @@ ExperimentalConfig::ExperimentalConfig( bool disable_external_correlation, bool profile_all_threads, bool capture_overload_names, + bool record_python_gc_info, std::string custom_profiler_config, bool adjust_timestamps) : profiler_metrics{std::move(profiler_metrics)}, @@ -32,6 +33,7 @@ ExperimentalConfig::ExperimentalConfig( disable_external_correlation{disable_external_correlation}, profile_all_threads{profile_all_threads}, capture_overload_names{capture_overload_names}, + record_python_gc_info{record_python_gc_info}, custom_profiler_config(std::move(custom_profiler_config)), adjust_timestamps{adjust_timestamps} {} diff --git a/torch/csrc/profiler/orchestration/observer.h b/torch/csrc/profiler/orchestration/observer.h index 427736e6c6359..ba62e9b56b5c6 100644 --- a/torch/csrc/profiler/orchestration/observer.h +++ b/torch/csrc/profiler/orchestration/observer.h @@ -62,6 +62,7 @@ struct TORCH_API ExperimentalConfig { bool disable_external_correlation = false, bool profile_all_threads = false, bool capture_overload_names = false, + bool record_python_gc_info = false, std::string custom_profiler_config = "", bool adjust_timestamps = false); explicit operator bool() const; @@ -102,6 +103,12 @@ struct TORCH_API ExperimentalConfig { * function schema and stored in the profile */ bool capture_overload_names; + /* + * Controls whether or not python gc info is recorded. This is used to + * determine if gc collect is slowing down your profile. + */ + bool record_python_gc_info; + /* * A custom_profiler_config option is introduced to allow custom backends * to apply custom configurations as needed. 
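Editor's note: a minimal sketch of how the record_python_gc_info option added above could be exercised from Python once these bindings land. The keyword name comes from the py::arg added in torch/csrc/profiler/python/init.cpp; everything else uses the existing torch.profiler API. Per the RecordQueue changes, GC events are only collected when the Python tracer is active, i.e. with_stack=True and a CPU activity.

import gc
import torch
from torch._C._profiler import _ExperimentalConfig
from torch.profiler import profile, ProfilerActivity

# record_python_gc_info is the flag introduced by this patch (hypothetical until merged).
cfg = _ExperimentalConfig(record_python_gc_info=True)
with profile(
    activities=[ProfilerActivity.CPU],
    with_stack=True,  # Python GC events piggyback on the Python tracer
    experimental_config=cfg,
) as prof:
    torch.randn(1024, 1024) @ torch.randn(1024, 1024)
    gc.collect()  # should surface as a "Python GC" event in the trace
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))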
diff --git a/torch/csrc/profiler/orchestration/python_tracer.cpp b/torch/csrc/profiler/orchestration/python_tracer.cpp index d5d120d376f25..0d1ad389f8896 100644 --- a/torch/csrc/profiler/orchestration/python_tracer.cpp +++ b/torch/csrc/profiler/orchestration/python_tracer.cpp @@ -11,6 +11,7 @@ struct NoOpPythonTracer : public PythonTracerBase { void stop() override {} void restart() override {} + void register_gc_callback() override {} std::vector> getEvents( std::function, std::vector&, diff --git a/torch/csrc/profiler/orchestration/python_tracer.h b/torch/csrc/profiler/orchestration/python_tracer.h index 52387e92e562b..1011f75b82308 100644 --- a/torch/csrc/profiler/orchestration/python_tracer.h +++ b/torch/csrc/profiler/orchestration/python_tracer.h @@ -48,6 +48,7 @@ struct TORCH_API PythonTracerBase { virtual void stop() = 0; virtual void restart() = 0; + virtual void register_gc_callback() = 0; virtual std::vector> getEvents( std::function time_converter, std::vector& enters, diff --git a/torch/csrc/profiler/python/init.cpp b/torch/csrc/profiler/python/init.cpp index 062f87a465ccb..aa7abe9433fe1 100644 --- a/torch/csrc/profiler/python/init.cpp +++ b/torch/csrc/profiler/python/init.cpp @@ -341,6 +341,7 @@ void initPythonBindings(PyObject* module) { bool /* disable_external_correlation*/, bool /* profile_all_threads */, bool /* capture_overload_names */, + bool /* record_python_gc_info */, std::string /* custom_profiler_config*/ >(), "An experimental config for Kineto features. Please note that" @@ -360,6 +361,7 @@ void initPythonBindings(PyObject* module) { " disable_external_correlation (bool) : whether to disable external correlation\n" " profile_all_threads (bool) : whether to profile all threads\n" " capture_overload_names (bool) : whether to include ATen overload names in the profile\n" + " record_python_gc_info (bool) : adds python gc events to profile\n" " custom_profiler_config (string) : Used to pass some configurations to the custom profiler backend.\n", py::arg("profiler_metrics") = std::vector(), py::arg("profiler_measure_per_kernel") = false, @@ -370,6 +372,7 @@ void initPythonBindings(PyObject* module) { py::arg("disable_external_correlation") = false, py::arg("profile_all_threads") = false, py::arg("capture_overload_names") = false, + py::arg("record_python_gc_info") = false, py::arg("custom_profiler_config") = "") .def(py::pickle( [](const ExperimentalConfig& p) { // __getstate__ @@ -393,6 +396,7 @@ void initPythonBindings(PyObject* module) { p.disable_external_correlation, p.profile_all_threads, p.capture_overload_names, + p.record_python_gc_info, p.custom_profiler_config, p.performance_events); }, diff --git a/torch/csrc/stable/tensor_inl.h b/torch/csrc/stable/tensor_inl.h index 5e1944e202da3..cbc6f30ed6562 100644 --- a/torch/csrc/stable/tensor_inl.h +++ b/torch/csrc/stable/tensor_inl.h @@ -5,9 +5,8 @@ // implementations of the Tensor methods can depend on APIs in library.h // without circular dependencies. 
-#pragma once #include -#include +#include #include #include diff --git a/torch/csrc/utils/generated_serialization_types.h b/torch/csrc/utils/generated_serialization_types.h index 0347fbcafd745..41f2371bc2627 100644 --- a/torch/csrc/utils/generated_serialization_types.h +++ b/torch/csrc/utils/generated_serialization_types.h @@ -1,5 +1,5 @@ // @generated by update_schema.py -// checksum<> +// checksum<> // clang-format off #pragma once @@ -149,7 +149,6 @@ class InputToParameterSpec; class InputToTensorConstantSpec; class InputTokenSpec; class LossOutputSpec; -class Model; class ModuleCallEntry; class ModuleCallSignature; class NamedArgument; @@ -3061,8 +3060,6 @@ class ExportedProgram { SchemaVersion schema_version; std::vector verifiers = {}; std::string torch_version = "<=2.4"; - std::unordered_map tensor_paths = {}; - std::unordered_map constant_paths = {}; public: @@ -3114,62 +3111,10 @@ class ExportedProgram { torch_version = std::move(def); } - const std::unordered_map& get_tensor_paths() const { - return tensor_paths; - } - - void set_tensor_paths(std::unordered_map def) { - tensor_paths = std::move(def); - } - - const std::unordered_map& get_constant_paths() const { - return constant_paths; - } - - void set_constant_paths(std::unordered_map def) { - constant_paths = std::move(def); - } - friend void to_json(nlohmann::json& nlohmann_json_j, const ExportedProgram& nlohmann_json_t); friend void from_json(const nlohmann::json& nlohmann_json_j, ExportedProgram& nlohmann_json_t); }; -class Model { - private: - std::string name; - ExportedProgram program; - std::unordered_map variants; - - public: - - const std::string& get_name() const { - return name; - } - - void set_name(std::string def) { - name = std::move(def); - } - - const ExportedProgram& get_program() const { - return program; - } - - void set_program(ExportedProgram def) { - program = std::move(def); - } - - const std::unordered_map& get_variants() const { - return variants; - } - - void set_variants(std::unordered_map def) { - variants = std::move(def); - } - - friend void to_json(nlohmann::json& nlohmann_json_j, const Model& nlohmann_json_t); - friend void from_json(const nlohmann::json& nlohmann_json_j, Model& nlohmann_json_t); -}; - class AOTInductorModelPickleData { private: std::string library_basename; @@ -3337,8 +3282,6 @@ inline void to_json(nlohmann::json& nlohmann_json_j, const ExportedProgram& nloh nlohmann_json_j["schema_version"] = nlohmann_json_t.schema_version; nlohmann_json_j["verifiers"] = nlohmann_json_t.verifiers; nlohmann_json_j["torch_version"] = nlohmann_json_t.torch_version; - nlohmann_json_j["tensor_paths"] = nlohmann_json_t.tensor_paths; - nlohmann_json_j["constant_paths"] = nlohmann_json_t.constant_paths; } inline void from_json(const nlohmann::json& nlohmann_json_j, ExportedProgram& nlohmann_json_t) { @@ -3349,8 +3292,6 @@ inline void from_json(const nlohmann::json& nlohmann_json_j, ExportedProgram& nl nlohmann_json_t.schema_version = nlohmann_json_j.value("schema_version", nlohmann_json_default_obj.schema_version); nlohmann_json_t.verifiers = nlohmann_json_j.value("verifiers", nlohmann_json_default_obj.verifiers); nlohmann_json_t.torch_version = nlohmann_json_j.value("torch_version", nlohmann_json_default_obj.torch_version); - nlohmann_json_t.tensor_paths = nlohmann_json_j.value("tensor_paths", nlohmann_json_default_obj.tensor_paths); - nlohmann_json_t.constant_paths = nlohmann_json_j.value("constant_paths", nlohmann_json_default_obj.constant_paths); } inline void to_json(nlohmann::json& 
nlohmann_json_j, const ExternKernelNode& nlohmann_json_t) { @@ -3534,19 +3475,6 @@ inline void from_json(const nlohmann::json& nlohmann_json_j, LossOutputSpec& nlo nlohmann_json_t.arg = nlohmann_json_j.value("arg", nlohmann_json_default_obj.arg); } -inline void to_json(nlohmann::json& nlohmann_json_j, const Model& nlohmann_json_t) { - nlohmann_json_j["name"] = nlohmann_json_t.name; - nlohmann_json_j["program"] = nlohmann_json_t.program; - nlohmann_json_j["variants"] = nlohmann_json_t.variants; -} - -inline void from_json(const nlohmann::json& nlohmann_json_j, Model& nlohmann_json_t) { - Model nlohmann_json_default_obj; - nlohmann_json_t.name = nlohmann_json_j.value("name", nlohmann_json_default_obj.name); - nlohmann_json_t.program = nlohmann_json_j.value("program", nlohmann_json_default_obj.program); - nlohmann_json_t.variants = nlohmann_json_j.value("variants", nlohmann_json_default_obj.variants); -} - inline void to_json(nlohmann::json& nlohmann_json_j, const ModuleCallEntry& nlohmann_json_t) { nlohmann_json_j["fqn"] = nlohmann_json_t.fqn; nlohmann_json_j["signature"] = nlohmann_json_t.signature; diff --git a/torch/cuda/memory.py b/torch/cuda/memory.py index 1bd6f9edc0319..54b75d4611bac 100644 --- a/torch/cuda/memory.py +++ b/torch/cuda/memory.py @@ -1169,28 +1169,21 @@ class MemPool(_MemPool): use_on_oom(bool): a bool that indicates if this pool can be used as a last resort if a memory allocation outside of the pool fails due to Out Of Memory. This is False by default. - symmetric(bool): a bool that indicates if this pool is symmetrical - across ranks. This is False by default. + """ def __init__( self, allocator: Optional[_cuda_CUDAAllocator] = None, use_on_oom: bool = False, - symmetric: bool = False, ): - super().__init__(allocator, True, use_on_oom, symmetric) + super().__init__(allocator, True, use_on_oom) @property def id(self) -> tuple[int, int]: r"""Returns the ID of this pool as a tuple of two ints.""" return super().id - @property - def is_symmetric(self) -> bool: - r"""Returns whether this pool is used for NCCL's symmetric memory.""" - return super().is_symmetric - @property def allocator(self) -> Optional[_cuda_CUDAAllocator]: r"""Returns the allocator this MemPool routes allocations to.""" diff --git a/torch/distributed/_symmetric_memory/__init__.py b/torch/distributed/_symmetric_memory/__init__.py index 4b0e9acc19bd7..c3f54db0b8e9b 100644 --- a/torch/distributed/_symmetric_memory/__init__.py +++ b/torch/distributed/_symmetric_memory/__init__.py @@ -10,6 +10,7 @@ from typing import Any, Callable, Literal, Optional import torch +import torch.distributed as dist import torch.distributed._functional_collectives as funcol import torch.distributed.distributed_c10d as c10d from torch._C._autograd import DeviceType @@ -105,7 +106,7 @@ def get_symm_mem_workspace(group_name: str, min_size: int) -> _SymmetricMemory: tensor = _group_name_to_workspace_tensor.get(group_name) size = tensor.numel() * tensor.element_size() if tensor is not None else 0 if tensor is None or size < min_size: - if torch.cuda.is_current_stream_capturing(): + if False: # torch.cuda.is_current_stream_capturing(): curr_size = 0 if tensor is None else tensor.numel() * tensor.element_size() raise RuntimeError( f"get_symm_mem_workspace(): the requested size ({min_size} bytes) " @@ -120,19 +121,19 @@ def get_symm_mem_workspace(group_name: str, min_size: int) -> _SymmetricMemory: (max(size, min_size),), [1], torch.uint8, - torch.device(f"cuda:{torch.cuda.current_device()}"), + 
torch.device(f"xpu:{torch.xpu.current_device()}"), group_name, ) _group_name_to_workspace_tensor[group_name] = tensor return _SymmetricMemory.rendezvous(tensor) -_backend_streams: dict[int, torch.cuda.Stream] = {} +_backend_streams: dict[int, torch.xpu.Stream] = {} -def _get_backend_stream(priority: int = 0) -> torch.cuda.Stream: +def _get_backend_stream(priority: int = 0) -> torch.xpu.Stream: if priority not in _backend_streams: - _backend_streams[priority] = torch.cuda.Stream(priority=priority) + _backend_streams[priority] = torch.xpu.Stream(priority=priority) return _backend_streams[priority] @@ -168,8 +169,9 @@ def _pipelined_multi_all_gather_and_consume( rank = symm_mem.rank symm_mem.barrier(channel=0) + # dist.barrier() backend_stream = _get_backend_stream() - backend_stream.wait_stream(torch.cuda.current_stream()) + backend_stream.wait_stream(torch.xpu.current_stream()) for x, y in zip(shard, ag_out): assert x.is_contiguous(), ( @@ -185,7 +187,8 @@ def _pipelined_multi_all_gather_and_consume( def copy_shard(dst: list[torch.Tensor], src: list[torch.Tensor]) -> None: for d, s in zip(dst, src): - d.copy_(s) + symm_mem.copy_buffer(s, d, s.numel()) + # d.copy_(s) def get_p2p_bufs(remote_rank: int) -> list[torch.Tensor]: offset_bytes = 0 @@ -249,8 +252,11 @@ def get_p2p_bufs(remote_rank: int) -> list[torch.Tensor]: # prevent suboptimal scenarios, we are giving up the chance to overlap "mv" # and "b" with the first shard_consumer for now. copy_shard(dst=local_p2p_bufs, src=shard) - symm_mem.barrier(channel=1) - backend_stream.wait_stream(torch.cuda.current_stream()) + # torch.xpu.synchronize() + # print(f"[Python] zl_debug copy shard tensor to local done {local_p2p_bufs} with shape {local_p2p_bufs.shape}", flush=True) + # symm_mem.barrier(channel=1) + dist.barrier() + backend_stream.wait_stream(torch.xpu.current_stream()) # At this point, all ranks have copied their local shard to # their local p2p buffer. Each rank can now copy and consume @@ -259,7 +265,7 @@ def get_p2p_bufs(remote_rank: int) -> list[torch.Tensor]: for step in range(1, group_size): if step % 2 == 0: - stream = torch.cuda.current_stream() + stream = torch.xpu.current_stream() else: stream = backend_stream remote_rank = (step + rank) % group_size @@ -268,18 +274,32 @@ def get_p2p_bufs(remote_rank: int) -> list[torch.Tensor]: copy_shard(dst=shards[remote_rank], src=remote_p2p_bufs) shard_consumer(shards[remote_rank], remote_rank) + # stage_size = 2 + # for stage in range(1, group_size, stage_size): + # stream = torch.xpu.current_stream() if (stage // stage_size) % 2 == 0 else backend_stream + # with stream: + # for i in range(stage_size): + # step = stage + i + # if step >= group_size: + # break + # remote_rank = (step + rank) % group_size + # remote_p2p_bufs = get_p2p_bufs(remote_rank) + # copy_shard(dst=shards[remote_rank], src=remote_p2p_bufs) + # shard_consumer(shards[remote_rank], remote_rank) + if ag_out_needed: # Copy from input to the all-gather output. Opportunistically overlap # it with the last shard_consumer. 
if group_size % 2 == 0: - stream = torch.cuda.current_stream() + stream = torch.xpu.current_stream() else: stream = backend_stream with stream: copy_shard(dst=shards[rank], src=shard) - torch.cuda.current_stream().wait_stream(backend_stream) - symm_mem.barrier(channel=0) + torch.xpu.current_stream().wait_stream(backend_stream) + # symm_mem.barrier(channel=0) + dist.barrier() def _pipelined_all_gather_and_consume( @@ -327,102 +347,55 @@ def _pipelined_produce_and_all2all( dist.all_to_all_single(output=output, input=torch.cat(chunks)) """ out_chunks = output.chunk(c10d._get_group_size_by_name(group_name)) - p2p_workspace_size_req = out_chunks[0].numel() * out_chunks[0].element_size() * 2 + p2p_workspace_size_req = ( + out_chunks[0].numel() * out_chunks[0].element_size() * dist.get_world_size() + ) symm_mem = get_symm_mem_workspace(group_name, min_size=p2p_workspace_size_req) group_size = symm_mem.world_size rank = symm_mem.rank - symm_mem.barrier(channel=0) + # symm_mem.barrier(channel=0) + dist.barrier() backend_stream = _get_backend_stream() - backend_stream.wait_stream(torch.cuda.current_stream()) + backend_stream.wait_stream(torch.xpu.current_stream()) def get_p2p_buf(rank: int, idx: int) -> torch.Tensor: - assert idx in (0, 1) - offset = 0 if idx == 0 else out_chunks[0].numel() + offset = out_chunks[0].numel() * idx return symm_mem.get_buffer( rank, out_chunks[0].shape, out_chunks[0].dtype, offset ) - # Prepare two local p2p buffers, so that a remote rank can pull the result - # of step [i] in one p2p buffer while the local rank can compute the - # result of step [i+1] and write it directly the other p2p buffer. - local_p2p_buf_0 = get_p2p_buf(rank, 0) - local_p2p_buf_1 = get_p2p_buf(rank, 1) - for step in range(1, group_size): remote_rank = (rank - step) % group_size + producer_rank = (rank + step) % group_size + p2p_buf = get_p2p_buf(rank, producer_rank) + remote_p2p_buf = get_p2p_buf(remote_rank, rank) if step % 2 == 0: - stream = torch.cuda.current_stream() - p2p_buf = local_p2p_buf_1 - remote_p2p_buf = get_p2p_buf(remote_rank, 1) + stream = torch.xpu.current_stream() else: stream = backend_stream - p2p_buf = local_p2p_buf_0 - remote_p2p_buf = get_p2p_buf(remote_rank, 0) with stream: - # Parallelization strategy: every rank issues independent compute - # -> barrier -> p2p copy sequences on two streams. In addition to - # computation/communication overlapping, the strategy allows for - # computation/computation overlapping, greatly reducing - # quantization inefficiency. - # - # Ideally, stream activities would look like this ("b" for - # barriers, "cp" for p2p copies): - # - # [rank 0] - # stream 0: [ chunk_producer ][b][ cp ][ chunk_producer ][b][ cp ] - # stream 1: [ chunk_producer ][b][ cp ][ chunk_producer ][b][ cp ] - # - # [rank 1] - # stream 0: [ chunk_producer ][b][ cp ][ chunk_producer ][b][ cp ] - # stream 1: [ chunk_producer ][b][ cp ][ chunk_producer ][b][ cp ] - # - # Note that the barriers synchronize streams with the same ID - # across ranks. They don't synchronize streams on the same rank. - # - # Since the work on both streams is independent, there's no - # guarantee that the chunk_producer from stream 0 or stream 1 will - # be scheduled first. If there is a scheduling mismatch across - # ranks, the barrier forces all ranks to wait for the slowest. 
- # - # When scheduling mismatches occur among ranks, the stream - # activities might look like this (note that p2p copies from - # different streams cannot overlap with each other): - # - # [rank 0] - # stream 0: [ chunk_producer ][b ][ cp ][ chunk_producer ][b ][ cp ] - # stream 1: [ chunk_producer ][b] [ cp ][ chunk_producer ][b] [ cp ] - # - # [rank 1] - # stream 0: [ chunk_producer ][b] [ cp ][ chunk_producer ][b] [ cp ] - # stream 1: [ chunk_producer ][b ][ cp ][ chunk_producer ][b ][ cp ] - # - # To prevent this, we need to ensure that the chunk_producer on - # stream 1 gets scheduled first on every rank. Without access to - # the underlying kernels, CUDA offers no API to control the - # scheduling order of two independent, overlapping kernels. Our - # solution is to issue a small sleep kernel in stream 0. The sleep - # duration is insignificant, but having an extra task in stream 0 - # will almost guarantee that the chunk_producer on stream 1 gets - # scheduled first. Once the first chunk_producer is scheduled in - # the correct order, there's very little room for the scheduling - # order of subsequent kernels to be inconsistent across ranks. - if step == 2: - torch.cuda._sleep(100) - chunk_producer((rank + step) % group_size, p2p_buf) - symm_mem.barrier(channel=step % 2) - out_chunks[remote_rank].copy_(remote_p2p_buf) + # if step == 2: + # torch.xpu._sleep(100) + chunk_producer(producer_rank, p2p_buf) + # symm_mem.barrier(channel=step % 2) + dist.barrier() + # replaced with copy_buffer + # out_chunks[remote_rank].copy_(remote_p2p_buf) + symm_mem.copy_buffer( + remote_p2p_buf, out_chunks[remote_rank], remote_p2p_buf.numel() + ) # The local P2P buffer can only be overwritten by the next # chunk_producer after all peers have finished reading from it. - symm_mem.barrier(channel=step % 2) + # symm_mem.barrier(channel=step % 2) # If the sleep wasn't issued in the above loop, do it now. 
- if group_size == 2: - torch.cuda._sleep(100) - + # if group_size == 2: + # torch.xpu._sleep(100) chunk_producer(rank, out_chunks[rank]) - torch.cuda.current_stream().wait_stream(backend_stream) - symm_mem.barrier(channel=0) + torch.xpu.current_stream().wait_stream(backend_stream) + # symm_mem.barrier(channel=0) + dist.barrier() lib = torch.library.Library("symm_mem", "DEF") # noqa: TOR901 @@ -545,6 +518,7 @@ def unflatten(t: torch.Tensor) -> torch.Tensor: scale_mode = _check_and_verify_fp8_all_gather_scale_mode( shard=A_shard, scale=A_scale, gather_dim=gather_dim, group_size=group.size() ) + print(f"zl_debug get scaled mode = {scale_mode} of allgather+matmul", flush=True) # Computing block-wise matmul along the first dim of A if scale_mode == _ScaleMode.ROW_WISE_SHARDED: @@ -642,6 +616,7 @@ def _fused_all_gather_matmul_fallback( @torch.library.impl(lib, "fused_all_gather_matmul", "CUDA") +@torch.library.impl(lib, "fused_all_gather_matmul", "XPU") def _fused_all_gather_matmul( A_shard: torch.Tensor, Bs: list[torch.Tensor], @@ -886,6 +861,7 @@ def scaled_matmul( @torch.library.impl(lib, "fused_all_gather_scaled_matmul", "CUDA") +@torch.library.impl(lib, "fused_all_gather_scaled_matmul", "XPU") def _fused_all_gather_scaled_matmul( A_shard: torch.Tensor, Bs: list[torch.Tensor], @@ -941,7 +917,8 @@ def _fused_all_gather_scaled_matmul( with torch.profiler.record_function("fused_all_gather_scaled_matmul"): A, res = _fused_all_gather_matmul_impl( - torch.ops.aten._scaled_mm.out, + # torch.ops.aten._scaled_mm.out, + torch.ops.vllm.fp8_gemm.out, A_shard, Bs, A_scale, @@ -993,6 +970,7 @@ def restride_A_shard_for_fused_all_gather_matmul( @torch.library.impl(lib, "fused_matmul_reduce_scatter", "CUDA") +@torch.library.impl(lib, "fused_matmul_reduce_scatter", "XPU") def _fused_matmul_reduce_scatter( A: torch.Tensor, B: torch.Tensor, @@ -1098,6 +1076,7 @@ def chunk_producer(rank: int, out: torch.Tensor) -> None: @torch.library.impl(lib, "fused_scaled_matmul_reduce_scatter", "CUDA") +@torch.library.impl(lib, "fused_scaled_matmul_reduce_scatter", "XPU") def _fused_scaled_matmul_reduce_scatter( A: torch.Tensor, B: torch.Tensor, @@ -1131,7 +1110,8 @@ def _fused_scaled_matmul_reduce_scatter( ) with torch.profiler.record_function("fused_scaled_matmul_reduce_scatter"): return _fused_scaled_matmul_reduce_scatter_impl( - mm_out_op=torch.ops.aten._scaled_mm.out, + # mm_out_op=torch.ops.aten._scaled_mm.out, + mm_out_op=torch.ops.vllm.fp8_gemm.out, A=A, B=B, A_scale=A_scale, @@ -1270,11 +1250,6 @@ def _fused_scaled_matmul_reduce_scatter_impl( .flatten(0, -2) ) A_scale_shards = list(A_scale.chunk(group.size())) - # cuBLAS's row-wise kernel requires scales to be aligned to 16 bytes. - # When we slice them we might break this and need to reallocate them. 
- A_scale_shards = [ - t if t.data_ptr() % 16 == 0 else t.clone() for t in A_scale_shards - ] else: raise ValueError("A_scale cannot be none for scaled_mm") @@ -1396,7 +1371,7 @@ def _maybe_convert_scalar_types_to_dtypes( class Work(_Work): def __init__(self) -> None: super().__init__() - self.event = torch.cuda.Event() + self.event = torch.xpu.Event() self.event.record() def wait(self, timeout: timedelta = timedelta(seconds=0)) -> bool: @@ -1441,7 +1416,7 @@ def _low_contention_all_gather_meta( group_size = c10d._get_group_size_by_name(group_name) return tensor.new_empty(tensor.shape[0] * group_size, *tensor.shape[1:]) - +@torch.library.impl(lib, "_low_contention_all_gather", "XPU") @torch.library.impl(lib, "_low_contention_all_gather", "CUDA") def _low_contention_all_gather( tensor: torch.Tensor, @@ -1474,7 +1449,7 @@ def _low_contention_all_gather( output = tensor.new_empty(tensor.shape[0] * world_size, *tensor.shape[1:]) chunks = output.chunk(world_size) - _get_backend_stream().wait_stream(torch.cuda.current_stream()) + _get_backend_stream().wait_stream(torch.xpu.current_stream()) with _get_backend_stream(): if not input_is_symm_mem: local_buf = symm_mem.get_buffer(rank, tensor.shape, tensor.dtype) @@ -1512,7 +1487,7 @@ def _low_contention_reduce_scatter_with_symm_mem_input( a2a_res = torch.empty_like(tensor) chunks = a2a_res.chunk(world_size) - _get_backend_stream().wait_stream(torch.cuda.current_stream()) + _get_backend_stream().wait_stream(torch.xpu.current_stream()) with _get_backend_stream(): # pull + offline reduction symm_mem.barrier() @@ -1549,7 +1524,7 @@ def _low_contention_reduce_scatter_with_workspace( assert tensor.shape[0] % world_size == 0 chunks = tensor.chunk(world_size) - _get_backend_stream().wait_stream(torch.cuda.current_stream()) + _get_backend_stream().wait_stream(torch.xpu.current_stream()) with _get_backend_stream(): # push + offline reduction workspace.barrier() @@ -1572,7 +1547,7 @@ def _low_contention_reduce_scatter_with_workspace( torch._C._distributed_c10d._register_work(ret, Work()) return ret - +@torch.library.impl(lib, "_low_contention_reduce_scatter", "XPU") @torch.library.impl(lib, "_low_contention_reduce_scatter", "CUDA") def _low_contention_reduce_scatter( tensor: torch.Tensor, @@ -1755,7 +1730,7 @@ def is_nvshmem_available() -> bool: return _is_nvshmem_available() -def set_backend(name: Literal["NVSHMEM", "CUDA", "NCCL"]) -> None: +def set_backend(name: Literal["NVSHMEM", "CUDA", "NCCL", "XCCL"]) -> None: r""" Set the backend for symmetric memory allocation. 
This is a global setting and affects all subsequent calls to diff --git a/torch/distributed/checkpoint/_consolidate_hf_safetensors.py b/torch/distributed/checkpoint/_consolidate_hf_safetensors.py index 9ed98f3968d88..db5b8aa6f96c9 100644 --- a/torch/distributed/checkpoint/_consolidate_hf_safetensors.py +++ b/torch/distributed/checkpoint/_consolidate_hf_safetensors.py @@ -691,27 +691,21 @@ def consolidate_safetensors_files_on_every_rank( if idx in indices_for_this_rank } - if not filtered_mapping: - logger.info("Rank %d: No files to process, exiting early", rank) - # Wait for all ranks to complete - if dist.is_available() and dist.is_initialized(): - dist.barrier() - return - - # Convert index mapping to filename mapping - max_index = max(unique_indices) - filtered_filename_mapping = {} - for fqn, idx in filtered_mapping.items(): - filename = _gen_file_name(idx, max_index) - filtered_filename_mapping[fqn] = filename - - # Call the existing consolidation function with the filtered mapping - _consolidate_safetensors_files( - input_dir=input_dir, - output_dir=output_dir, - fqn_to_file_mapping=filtered_filename_mapping, - num_threads=num_threads, - ) + if filtered_mapping: + # Convert index mapping to filename mapping + max_index = max(unique_indices) + filtered_filename_mapping = {} + for fqn, idx in filtered_mapping.items(): + filename = _gen_file_name(idx, max_index) + filtered_filename_mapping[fqn] = filename + + # Call the existing consolidation function with the filtered mapping + _consolidate_safetensors_files( + input_dir=input_dir, + output_dir=output_dir, + fqn_to_file_mapping=filtered_filename_mapping, + num_threads=num_threads, + ) logger.info( "Rank %d: Done consolidating. Processed %d unique indices in %.2f secs.", diff --git a/torch/distributed/checkpoint/hf_storage.py b/torch/distributed/checkpoint/hf_storage.py index 23a4cc1f877ab..17db989727d4a 100644 --- a/torch/distributed/checkpoint/hf_storage.py +++ b/torch/distributed/checkpoint/hf_storage.py @@ -3,6 +3,7 @@ import json import logging import queue +import threading from typing import Any, Optional import torch @@ -203,15 +204,52 @@ class HuggingFaceStorageReader(FileSystemReader): A reader that reads a checkpoint in the huggingface safetensors format. """ - def __init__(self, path: str) -> None: + def __init__(self, path: str, thread_count: int = 1) -> None: """ Initialize the huggingface reader pointing to path. Args: path: directory where the checkpoint will be read from. + thread_count: Number of threads to use to read distributed checkpoint. Default to 1. 
""" super().__init__(path=path) + self.thread_count = thread_count + + def _process_read_request(self, f, req: ReadItem, planner: LoadPlanner) -> None: + """Helper function to process a single read request.""" + # Create slices for each dimension based on offsets and lengths + slices = tuple( + slice(offset, offset + length) + for offset, length in zip(req.storage_offsets, req.lengths) + ) + tensor = f.get_slice(req.storage_index.fqn)[slices] + target_tensor = planner.resolve_tensor(req).detach() + + assert target_tensor.size() == tensor.size(), ( + f"req {req.storage_index} mismatch sizes {target_tensor.size()} vs {tensor.size()}" + ) + + target_tensor.copy_(tensor) + planner.commit_tensor(req, target_tensor) + + def _read_files_from_queue( + self, + file_queue: queue.Queue, + result_queue: queue.Queue, + planner: LoadPlanner, + ) -> None: + from safetensors import safe_open # type: ignore[import] + + try: + while True: + file_name, reqs = file_queue.get_nowait() + with safe_open(filename=file_name, framework="pt") as f: + for req in reqs: + self._process_read_request(f, req, planner) + result_queue.put(True) # Signal that this file has been processed + except queue.Empty: + pass def read_data(self, plan: LoadPlan, planner: LoadPlanner) -> Future[None]: from safetensors import safe_open # type: ignore[import] @@ -223,25 +261,47 @@ def read_data(self, plan: LoadPlan, planner: LoadPlanner) -> Future[None]: file_name = item_md.relative_path per_file.setdefault(file_name, []).append(read_item) - for file_name, reqs in per_file.items(): - with safe_open(filename=file_name, framework="pt") as f: - for req in reqs: - item_md = self.storage_data[req.storage_index] - - # Create slices for each dimension based on offsets and lengths - slices = tuple( - slice(offset, offset + length) - for offset, length in zip(req.storage_offsets, req.lengths) - ) - tensor = f.get_slice(req.storage_index.fqn)[slices] - target_tensor = planner.resolve_tensor(req).detach() - - assert target_tensor.size() == tensor.size(), ( - f"req {req.storage_index} mismatch sizes {target_tensor.size()} vs {tensor.size()}" - ) - - target_tensor.copy_(tensor) - planner.commit_tensor(req, target_tensor) + if self.thread_count <= 1 or len(per_file) <= 1: + for file_name, reqs in per_file.items(): + with safe_open(filename=file_name, framework="pt") as f: + for req in reqs: + self._process_read_request(f, req, planner) + else: + # Use parallel implementation with thread pool + file_queue: queue.Queue = queue.Queue() + result_queue: queue.Queue = queue.Queue() + + # Fill the queue with files to process + for file_name, reqs in per_file.items(): + file_queue.put((file_name, reqs)) + + # Create and start worker threads + threads = [] + num_threads = min(self.thread_count, len(per_file)) + for _ in range(num_threads): + t = threading.Thread( + target=self._read_files_from_queue, + args=(file_queue, result_queue, planner), + ) + t.start() + threads.append(t) + + # Wait for all threads to complete + for t in threads: + t.join() + + # Check if all files were processed + processed_count = 0 + try: + while True: + result_queue.get_nowait() + processed_count += 1 + except queue.Empty: + pass + + assert processed_count == len(per_file), ( + f"Not all files were processed: {processed_count} out of {len(per_file)}" + ) fut: Future = Future() fut.set_result(None) diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index a7ca2453b251f..a2409cce969aa 100644 --- a/torch/distributed/distributed_c10d.py +++ 
b/torch/distributed/distributed_c10d.py @@ -2035,8 +2035,12 @@ def _new_process_group_helper( elif backend_str == Backend.XCCL: if not is_xccl_available(): raise RuntimeError("Distributed package doesn't have XCCL built in") + backend_options = ProcessGroupXCCL.Options() + backend_options.global_ranks_in_group = global_ranks_in_group + backend_options.group_name = group_name + backend_options._timeout = timeout backend_class = ProcessGroupXCCL( - backend_prefix_store, group_rank, group_size + backend_prefix_store, group_rank, group_size, backend_options ) backend_type = ProcessGroup.BackendType.XCCL else: diff --git a/torch/distributed/elastic/multiprocessing/subprocess_handler/subprocess_handler.py b/torch/distributed/elastic/multiprocessing/subprocess_handler/subprocess_handler.py index c2327e1cd3cf3..c48f75ad331ff 100644 --- a/torch/distributed/elastic/multiprocessing/subprocess_handler/subprocess_handler.py +++ b/torch/distributed/elastic/multiprocessing/subprocess_handler/subprocess_handler.py @@ -11,7 +11,10 @@ from subprocess import Popen from typing import Any, Optional -from torch.numa.binding import maybe_wrap_command_with_numa_bindings, NumaOptions +from torch.numa.binding import ( + maybe_temporarily_apply_numa_binding_to_current_process, + NumaOptions, +) __all__ = ["SubprocessHandler"] @@ -50,22 +53,20 @@ def __init__( env_vars.update(env) args_str = (entrypoint, *[str(e) for e in args]) - args_str = ( - maybe_wrap_command_with_numa_bindings( - command_args=args_str, - gpu_index=local_rank_id, - numa_options=numa_options, - ) - or args_str - ) self.local_rank_id = local_rank_id - self.proc: Popen = self._popen(args_str, env_vars) + + # See HACK [NUMA inheritance] in spawn.py for context. + with maybe_temporarily_apply_numa_binding_to_current_process( + gpu_index=local_rank_id, numa_options=numa_options + ): + self.proc: Popen = self._popen(args_str, env_vars) def _popen(self, args: tuple, env: dict[str, str]) -> Popen: kwargs: dict[str, Any] = {} if not IS_WINDOWS: kwargs["start_new_session"] = True + return Popen( # pyre-fixme[6]: Expected `Union[typing.Sequence[Union[_PathLike[bytes], # _PathLike[str], bytes, str]], bytes, str]` for 1st param but got diff --git a/torch/distributed/launcher/api.py b/torch/distributed/launcher/api.py index ff496fb2d58f1..76edc14ef1f18 100644 --- a/torch/distributed/launcher/api.py +++ b/torch/distributed/launcher/api.py @@ -26,6 +26,7 @@ from torch.distributed.elastic.rendezvous import RendezvousParameters from torch.distributed.elastic.rendezvous.utils import parse_rendezvous_endpoint from torch.distributed.elastic.utils.logging import get_logger +from torch.multiprocessing.spawn import should_use_parallel_start from torch.numa.binding import NumaOptions @@ -109,9 +110,11 @@ def __post_init__(self): if ( self.numa_options is None - # NOTE: This filter isn't relevant for str entrypoints, - # but it's the default anyway. - and self.start_method == "spawn" + # The way we apply NUMA bindings currently depends + # on the processes being started sequentially. + # Technically, this filter does not matter for str entrypoints, + # but we ignore that nuance for now. + and not should_use_parallel_start(self.start_method) and torch.cuda.is_available() # We assume local_rank n uses cuda device n. 
and torch.cuda.device_count() == self.nproc_per_node diff --git a/torch/distributed/tensor/_random.py b/torch/distributed/tensor/_random.py index 70ea7e9ce97aa..dc3a1fb10e4b3 100644 --- a/torch/distributed/tensor/_random.py +++ b/torch/distributed/tensor/_random.py @@ -2,16 +2,17 @@ # Copyright (c) Meta Platforms, Inc. and affiliates import contextlib import warnings +from logging import getLogger from typing import Optional, Union import torch -import torch.distributed as dist -from torch import Tensor from torch.distributed.device_mesh import _get_device_handle, DeviceMesh from torch.distributed.tensor._dtensor_spec import DTensorSpec from torch.distributed.tensor.placement_types import Shard +logger = getLogger(__name__) + __all__ = [ "is_rng_supported_mesh", "manual_seed", @@ -75,22 +76,69 @@ def manual_seed(seed: int, device_mesh: DeviceMesh) -> None: ) return + # TODO: deprecate this API, but also need to ensure we disable broadcast for PP case, and that's currently + # bundled together with this API. See torchtitan/distributed/utils.py:set_determinism + # warnings.warn( + # "DTensor manual_seed() is deprecated, since DTensor no longer maintains a separate copy of generator state. " + # "Use `torch.manual_seed` instead" + # ) + # Note: we still need to ensure setting `run_state_sync=False` to support the the pp case + # instantiate a RNG tracker if haven't. By default DTensor uses an # OffsetBasedRNGTracker to perform random operators. global _rng_tracker if not _rng_tracker: _rng_tracker = OffsetBasedRNGTracker(device_mesh, run_state_sync=False) - # the current rank is in mesh - if device_mesh.get_coordinate() is not None: - _rng_tracker._manual_seed(seed) - else: + if device_mesh.get_coordinate() is None: raise RuntimeError( "manual_seed requires the current rank to be a part of the device mesh " "otherwise DTensor RNG state on the rank will not be initialized and " "the behavior of DTensor random ops is undefined." ) + # DTensor no longer maintains a copy of rng state. manual seed on dtensor is the same thing + # as manual seed on torch. + torch.manual_seed(seed) + + +class _PhiloxState: + """ + Convenience accessor for interpreting the packed bits of (seed: uint64, offset: uint64) in the philox state, + which for some reason is actually exposed as a size-16 uint8 tensor. + + The state is always moved to .cpu since it is necessary for it to be on CPU before applying it back to a generator. + """ + + def __init__(self, state: torch.Tensor): + self._state = state.to("cpu") + + @property + def state(self): + return self._state + + @property + def offset(self) -> int: + return int(self._state[8:].view(dtype=torch.int64).item()) + + @offset.setter + def offset(self, offset: int) -> None: + offset_tensor = torch.tensor([offset], dtype=torch.uint64, device="cpu").view( + torch.uint8 + ) + self._state[8:] = offset_tensor + + @property + def seed(self) -> int: + return int(self._state[:8].view(dtype=torch.int64).item()) + + @seed.setter + def seed(self, seed: int) -> None: + seed_tensor = torch.tensor([seed], dtype=torch.uint64, device="cpu").view( + torch.uint8 + ) + self._state[:8] = seed_tensor + class _RNGStateTracker: """ @@ -109,14 +157,8 @@ def __init__(self, device: torch.device): f"{self.__class__.__name__} instantiation requires the presence of " f"{device.type} device but couldn't find." 
) - - self._states: dict[str, Tensor] = {} self._use_distribute_region = True - @property - def rng_states(self) -> dict[str, Tensor]: - return self._states - @property def distribute_region_enabled(self) -> bool: return self._use_distribute_region @@ -125,27 +167,6 @@ def distribute_region_enabled(self) -> bool: def distribute_region_enabled(self, value) -> None: self._use_distribute_region = value - def rng_state_is_sync(self, name) -> bool: - return name in self.rng_states - - def get_seed(self, name: str) -> int: - if name not in self.rng_states: - raise RuntimeError( - f"{self.__class__.__name__} does not have random state for {name}" - ) - - seed_tensor = (self.rng_states[name])[0:8].view(dtype=torch.int64) - return int(seed_tensor.item()) - - def set_seed(self, name: str, seed: int) -> None: - seed_tensor = torch.tensor([seed], dtype=torch.uint64, device="cpu").view( - torch.uint8 - ) - offset_tensor = torch.tensor([0], dtype=torch.uint64, device="cpu").view( - torch.uint8 - ) - self.rng_states[name] = torch.cat([seed_tensor, offset_tensor]) - def _distribute_region( self, spec: DTensorSpec, generator: Optional[torch.Generator] = None ): @@ -178,54 +199,66 @@ def __init__( f"CUDA/CUDA-like/XPU device. Got {self._device.type} instead." ) + rng_state = self._get_device_state() + if run_state_sync: + # synchronize RNG state using rank 0's current one + torch.distributed.broadcast(rng_state, 0) + my_rng_state = self._get_device_state() + if not all(my_rng_state == rng_state): + logger.warning( + "DTensor is synchronizing RNG states of every rank with the state from rank 0. " + "This behavior is deprecated. " + "Please call `torch.manual_seed()` on every rank that participates in SPMD DTensor Operations with " + "the same seed. If using Pipeline Parallelism, each pipeling state would use a different seed, " + "but all ranks belonging to one pipeline stage would use the same seed." + ) + self._set_device_state(rng_state) + + def _get_device_state(self) -> torch.Tensor: if self._device.type == "hpu": self._device_handle.set_rng_ctx("philox") rng_state = self._device_handle.get_rng_state().to(self._device) if self._device.type == "hpu": self._device_handle.unset_rng_ctx("philox") - if run_state_sync: - # synchronize RNG state using rank 0's current one - dist.broadcast(rng_state, 0) - - self.rng_states["parallel-rng"] = rng_state.to("cpu") + return rng_state - def _manual_seed(self, parallel_seed: int) -> None: - self.set_seed("parallel-rng", parallel_seed) + def _set_device_state(self, state: torch.Tensor): + # It seems that the underlying generator wants a cpu tensor but the dtensor code expects `_get_device_state` + # to convert to a 'device' tensor, probably because we may use it with our backend comms for sync/debug + # for now, we just convert back to cpu here to make sure it always works. + if self._device.type == "hpu": + self._device_handle.set_rng_ctx("philox") + self._device_handle.set_rng_state(state.to("cpu")) + if self._device.type == "hpu": + self._device_handle.unset_rng_ctx("philox") @contextlib.contextmanager def _distribute_region( self, spec: DTensorSpec, generator: Optional[torch.Generator] = None ): - g_name = "parallel-rng" if generator is not None: # This is a little hacky, but for any user-passed generator, we store its state under a unique key, # not because we need to keep a copy of it but because its the easiest way to make it work with the # existing set/get APIs. We also ensure we remove it from rng_states after each _distribute_region. 
- g_name = "user-passed-generator" - assert g_name not in self.rng_states - self.rng_states[g_name] = generator.get_state() - # check if the parallel rng state has been synchronized or not - if not self.rng_state_is_sync("parallel-rng"): - raise RuntimeError( - "OffsetBasedRNGTracker requires the random state to be synchronized " - "before entering into a distribute region!" - ) + state = _PhiloxState(generator.get_state()) + else: + state = _PhiloxState(self._get_device_state()) if self.distribute_region_enabled: if self._device.type == "hpu": self._device_handle.set_rng_ctx("philox") - old_offset = self.get_offset(g_name) - self._set_pre_op_offset(g_name, spec) + old_offset = state.offset + self._set_pre_op_offset(state, spec) with torch.random.fork_rng( devices=[self._device], device_type=self._device.type ): assert self._device_handle is not None - self._device_handle.set_rng_state(self.rng_states[g_name]) + self._device_handle.set_rng_state(state.state) try: yield # execute the region code finally: # update offset to synchronize among ranks - self._set_post_op_offset(g_name, spec, old_offset) + self._set_post_op_offset(state, spec, old_offset) if self._device.type == "hpu": self._device_handle.unset_rng_ctx("philox") else: @@ -235,30 +268,11 @@ def _distribute_region( # ensure we (a) propagate the state advancement back to the user's RNG so its visible and impacts any future # usage of that RNG (dtensor or non-dtensor), (b) drop it from our own cache so that if the user updates # the seed value in their rng and uses it with DTensor again, we always use the latest value - generator.set_state(self.rng_states.pop(g_name)) - - def get_offset(self, name: str) -> int: - if name not in self.rng_states: - raise RuntimeError( - f"{self.__class__.__name__} does not have random state for {name}" - ) - - offset_tensor = (self.rng_states[name])[8:].view(dtype=torch.int64) - return int(offset_tensor.item()) - - def set_offset(self, name: str, offset: int) -> None: - if name not in self.rng_states: - raise RuntimeError( - f"{self.__class__.__name__} does not have random state for {name}" - ) - - seed_tensor = (self.rng_states[name])[0:8] - offset_tensor = torch.tensor([offset], dtype=torch.uint64, device="cpu").view( - torch.uint8 - ) - self.rng_states[name] = torch.cat([seed_tensor, offset_tensor]) + generator.set_state(state.state) + else: + self._set_device_state(state.state) - def _set_pre_op_offset(self, name: str, spec: DTensorSpec) -> None: + def _set_pre_op_offset(self, state: _PhiloxState, spec: DTensorSpec) -> None: """Set the starting RNG offset for current device's local shard before actual op execution. The pre_op_offset value should start from the current RNG offset and increment by the size of local shard until it reaches the size of the whole @@ -266,7 +280,7 @@ def _set_pre_op_offset(self, name: str, spec: DTensorSpec) -> None: will be the same. Args: - name (str): The name of the generator to use (should be a key in self.rng_states) + state (:class:`Tensor`): The generator state to modify spec (:class:`DTensorSpec`): the spec of the DTensor object on which we prepare the offset for running random ops. 
@@ -369,15 +383,15 @@ def _set_pre_op_offset(self, name: str, spec: DTensorSpec) -> None: local_size = prod(local_size_on_rank_0) # get current RNG offset - current_offset = self.get_offset(name) + current_offset = state.offset # pytorch: offset must be multiple of 4 # source: aten/src/ATen/cuda/CUDAGeneratorImpl.cpp offset_incr = (shard_linear_idx * local_size + 3) // 4 * 4 - self.set_offset(name, current_offset + offset_incr) + state.offset = current_offset + offset_incr def _set_post_op_offset( - self, name: str, spec: DTensorSpec, old_offset: int + self, state: _PhiloxState, spec: DTensorSpec, old_offset: int ) -> None: """Sets the RNG to a synchronized state after running the local random op. Every rank should set its RNG offset to `old_offset + DTensor.numel()` where old_offset is @@ -385,7 +399,7 @@ def _set_post_op_offset( random ops. Args: - name (str): The name of the generator to use (should be a key in self.rng_states) + state (:class:`Tensor`): The generator state to modify. spec (:class:`DTensorSpec`): the spec of the DTensor object on which we post-process the offset for running random ops. @@ -400,7 +414,7 @@ def _set_post_op_offset( # pytorch: offset must be multiple of 4 # source: aten/src/ATen/cuda/CUDAGeneratorImpl.cpp numel = (numel + 3) // 4 * 4 - self.set_offset(name, old_offset + numel) + state.offset = old_offset + numel def _calc_shard_linear_idx( self, shard_coord: list[int], shard_size: list[int] diff --git a/torch/export/_trace.py b/torch/export/_trace.py index a2ac60c212026..2522e6f8a90a3 100644 --- a/torch/export/_trace.py +++ b/torch/export/_trace.py @@ -1850,14 +1850,6 @@ def _find_node(gm: torch.fx.GraphModule, name: str) -> torch.fx.Node: return next(iter(node for node in gm.graph.nodes if node.name == name)) -def _is_bogus_const_name(name: str): - splitted_names = name.split(".") - if len(splitted_names) < 1: - return True - - return splitted_names[-1].startswith("lifted_tensor") - - def _non_strict_export( mod: torch.nn.Module, args: tuple[Any, ...], @@ -2057,11 +2049,6 @@ def _export_for_training( original_state_dict = _get_original_state_dict(mod) - has_ambient_mode = False - if not strict: - flat_args, _ = pytree.tree_flatten((args, kwargs)) - has_ambient_mode = torch._guards.detect_fake_mode(flat_args) is not None - # Call the appropriate export function based on the strictness of tracing. export_func = _strict_export if strict else _non_strict_export @@ -2076,21 +2063,6 @@ def _export_for_training( _to_aten_func=_export_to_aten_ir_make_fx, ) - # If we are tracing with fake inputs, it is expected to - # see fake tensor constants. - if not strict and not has_ambient_mode: - for const, val in export_artifact.aten.constants.items(): - if isinstance( - val, torch._subclasses.fake_tensor.FakeTensor - ) and _is_bogus_const_name(const): - raise RuntimeError( - f"We found a fake tensor in the exported program constant's list. " - f"This typically means our tracing system encountered an op that " - f"we can't trace through. For the potential source, you can refer to " - f"following model attribute: {const}. " - f"Please file an issue on github. 
" - ) - export_graph_signature = export_artifact.aten.sig forward_arg_names = _get_forward_arg_names(mod, args, kwargs) diff --git a/torch/export/pt2_archive/_package.py b/torch/export/pt2_archive/_package.py index 323253a1501b8..fd76070391e05 100644 --- a/torch/export/pt2_archive/_package.py +++ b/torch/export/pt2_archive/_package.py @@ -400,6 +400,7 @@ def package_pt2( if not ( (isinstance(f, (io.IOBase, IO)) and f.writable() and f.seekable()) or (isinstance(f, (str, os.PathLike)) and os.fspath(f).endswith(".pt2")) + or (isinstance(f, tempfile._TemporaryFileWrapper) and f.name.endswith(".pt2")) ): # TODO: turn this into an error logger.warning( diff --git a/torch/export/unflatten.py b/torch/export/unflatten.py index 0a0266f3a189c..436de6913e638 100644 --- a/torch/export/unflatten.py +++ b/torch/export/unflatten.py @@ -280,6 +280,10 @@ def adapt( """NOTE: This adapter may mutate given ``input_args_with_path``.""" ... + def get_flat_arg_paths(self) -> list[str]: + """Returns a list of paths that are used to access the flat args.""" + return [] + class UnflattenedModule(torch.nn.Module): def __init__( @@ -577,12 +581,25 @@ def process_forward_inputs(self, *args, **kwargs): from torch._export.utils import _check_input_constraints_for_graph if self.adapted is True: - # TODO(suo): The FlatArgsAdapter returns a list of flat args, - # which we don't have keypaths for. For now, just create a dummy - # keypath to associate with the arg. + flat_arg_paths = ( + self.flat_args_adapter.get_flat_arg_paths() + if self.flat_args_adapter + else [] + ) + assert not flat_arg_paths or len(flat_arg_paths) == len(flat_args) new_flat_args_with_path = [ # type: ignore[var-annotated] - ((SequenceKey(idx=0), GetAttrKey(name="")), arg) - for arg in flat_args + ( + ( + SequenceKey(idx=idx), + GetAttrKey( + name=flat_arg_paths[idx] + if flat_arg_paths + else "" + ), + ), + arg, + ) + for idx, arg in enumerate(flat_args) ] else: new_flat_args_with_path = flat_args_with_path # type: ignore[assignment] diff --git a/torch/multiprocessing/spawn.py b/torch/multiprocessing/spawn.py index eb5f885acc194..0b522591c63e6 100644 --- a/torch/multiprocessing/spawn.py +++ b/torch/multiprocessing/spawn.py @@ -2,7 +2,6 @@ import logging import multiprocessing import multiprocessing.connection -import multiprocessing.spawn as mp_spawn import os import pickle import signal @@ -14,7 +13,7 @@ from typing import Optional from torch.numa.binding import ( - maybe_get_temporary_python_executable_with_numa_bindings, + maybe_temporarily_apply_numa_binding_to_current_process, NumaOptions, ) @@ -30,6 +29,7 @@ "ProcessException", "ProcessExitedException", "ProcessRaisedException", + "should_use_parallel_start", "spawn", "SpawnContext", "start_processes", @@ -227,6 +227,17 @@ def __init__(self, processes, error_files): super().__init__(processes, error_files) +def should_use_parallel_start(start_method: str) -> bool: + """ + Returns: + Whether we will start subprocesses in parallel. + """ + return ( + start_method == "forkserver" + and os.environ.get(ENV_VAR_PARALLEL_START, "0") == "1" + ) + + # Note: [start_processes] # mp.start_processes handles both start_method='spawn' and 'fork'. It's supposed to be a # more generalized API than mp.spawn. Currently we only document mp.spawn as it's the @@ -248,53 +259,21 @@ def start_processes( # this func will start processes in parallel if start_method is 'forkserver'. # Please opt in to this perf optimization by setting env var (TORCH_MP_PARALLEL_START) to 1. 
# todo: investigate why spawn does not work with threadpool and raises SIGINT - if ( - start_method == "forkserver" - and os.environ.get(ENV_VAR_PARALLEL_START, "0") == "1" - ): + if should_use_parallel_start(start_method): log.info("Starting processes in parallel.") start_parallel = True else: # Set env var TORCH_MP_PARALLEL_START to 0 to disable parallel start start_parallel = False - if numa_options is not None and start_method != "spawn": - raise ValueError("NUMA binding is only compatible with spawn") - if numa_options is not None and start_parallel: raise ValueError("NUMA binding is not compatible with parallel start") mp = multiprocessing.get_context(start_method) error_files = [None] * nprocs processes = [None] * nprocs - original_executable = mp_spawn.get_executable() def start_process(i): - # HACK: We want to force Process.start() to kick off the subprocess - # using a custom numactl command per rank. However, the API exposed - # by multiprocessing only allows us to override the executable for - # the entire context, and only with a single str rather than a tuple. - # Furthermore, there is no API for passing additional options, e.g. - # to make LOCAL_RANK available to the executable. - # - # In order to get around these limitations, we pre-compute - # the appropriate command containing NUMA bindings and store it in a - # temporary executable which passes Python args on to the original - # executable. Then, we call set_executable before and after each - # Process.start() call. - # - # This assumes that, under the hood, Process.start() for rank n - # will not call get_executable after start_process for rank n+1 - # calls set_executable again. We guarantee this by - # raising an exception if `start_parallel`, above. (Not clear - # if there would be a race condition otherwise, but we want to be safe.) - temporary_executable_path = ( - maybe_get_temporary_python_executable_with_numa_bindings( - python_executable_path=original_executable, - gpu_index=i, - numa_options=numa_options, - ) - ) # Each process is assigned a file to write tracebacks to. We # use the file being non-empty to indicate an exception # occurred (vs an expected shutdown). Note: this previously @@ -307,18 +286,29 @@ def start_process(i): tf.close() os.unlink(tf.name) - try: - if temporary_executable_path is not None: - mp.set_executable(temporary_executable_path) - process = mp.Process( - target=_wrap, - args=(fn, i, args, tf.name), - daemon=daemon, - ) + process = mp.Process( + target=_wrap, + args=(fn, i, args, tf.name), + daemon=daemon, + ) + + # HACK [NUMA inheritance]: Subprocesses inherit the parent process's CPU + # affinity. So, we temporarily apply the bindings to the current process, + # and then immediately undo them. + # This is necessary because the alternatives would be to + # either + # 1. Use numactl CLI. However, Python's multiprocessing library + # does not provide an API which would allow us to prepend + # the command it runs with numactl options. + # 2. Wrap the provided function such that it first applies + # NUMA bindings, and then executes as expected. However, this + # can result in worse memory locality, because torch and CUDA + # initialization would occur before applying the bindings, thus + # allowing some memory to be allocated on the wrong NUMA nodes. 
+ with maybe_temporarily_apply_numa_binding_to_current_process( + gpu_index=i, numa_options=numa_options + ): process.start() - finally: - if temporary_executable_path is not None: - mp.set_executable(original_executable) return i, process, tf.name if not start_parallel: diff --git a/torch/numa/binding.py b/torch/numa/binding.py index 73484fdc8b6ea..1995f58f05853 100644 --- a/torch/numa/binding.py +++ b/torch/numa/binding.py @@ -1,15 +1,11 @@ import os -import shutil -import stat -import subprocess import traceback from collections import defaultdict -from collections.abc import Iterable +from collections.abc import Iterable, Iterator +from contextlib import contextmanager from dataclasses import asdict, dataclass from enum import Enum from logging import getLogger -from subprocess import run -from tempfile import mkstemp from typing import Callable, Optional, TypeVar import torch @@ -18,13 +14,10 @@ __all__ = [ "AffinityMode", - "maybe_get_temporary_python_executable_with_numa_bindings", - "maybe_wrap_command_with_numa_bindings", + "maybe_temporarily_apply_numa_binding_to_current_process", "NumaOptions", ] -_NUMACTL_COMMAND = "numactl" - logger = getLogger(__name__) @@ -54,248 +47,136 @@ class NumaOptions: should_fall_back_if_binding_fails: bool = False -def maybe_get_temporary_python_executable_with_numa_bindings( - *, python_executable_path: str, gpu_index: int, numa_options: Optional[NumaOptions] -) -> Optional[str]: +@contextmanager +def maybe_temporarily_apply_numa_binding_to_current_process( + *, gpu_index: int, numa_options: Optional[NumaOptions] +) -> Iterator[None]: """ - Args: - python_executable_path: E.g., "/usr/local/bin/python" - Returns: - Path to a temporary file. This file can be executed just like the original python - executable, except it will first apply NUMA bindings. + 1. Applies NUMA binding to the current process, suitable for the process + which will be interacting with GPU gpu_index. + 2. Resets to the original CPU affinity before exiting the context manager. """ if numa_options is None: - logger.info("Received numa_options=None, not creating numa executable.") - return None - - if isinstance(python_executable_path, bytes): - python_executable_path = python_executable_path.decode() - - full_numactl_command = maybe_wrap_command_with_numa_bindings( - # "$@", i.e. pass through any args the python executable would have - # received. - command_args=(python_executable_path, '"$@"'), - gpu_index=gpu_index, - numa_options=numa_options, - ) - - if full_numactl_command is None: - return None + yield + return - executable_path = _get_temporary_executable_for_command( - command_args=full_numactl_command + original_logical_cpu_indices = _get_allowed_cpu_indices_for_current_process() + _apply_numa_binding_to_current_process( + gpu_index=gpu_index, numa_options=numa_options + ) + yield + _bind_current_process_to_logical_cpus( + logical_cpu_indices=original_logical_cpu_indices ) - logger.info("Returning python executable with NUMA bindings %s", executable_path) - - return executable_path - - -def maybe_wrap_command_with_numa_bindings( - *, - command_args: tuple[str, ...], - gpu_index: int, - numa_options: Optional[NumaOptions], -) -> Optional[tuple[str, ...]]: - """ - Args: - command_args: Full shell command, like ("/usr/local/bin/python", "train.py") - gpu_index: The index of the GPU which command_args should bind to - Returns: - command_args, but wrapped so that it runs with NUMA bindings corresponding to - gpu_index and numa_options. 
- E.g., ("numactl", "--cpunodebind=0", "/usr/local/bin/python", "train.py") - """ - if not numa_options: - logger.info("Received numa_options=None, not applying bindings.") - return None +def _apply_numa_binding_to_current_process( + *, gpu_index: int, numa_options: NumaOptions +) -> None: kwargs = { - "command_args": command_args, "gpu_index": gpu_index, "numa_options": asdict(numa_options), } - logger.info("Attempting to wrap command with NUMA bindings, given input %r", kwargs) + logger.info("Attempting to apply NUMA binding, given input %r", kwargs) try: - _raise_if_numactl_not_available() - - numactl_options = _get_numactl_cli_options( - command_args=command_args, gpu_index=gpu_index, numa_options=numa_options + logical_cpu_indices = _get_logical_cpus_to_bind_to( + gpu_index=gpu_index, numa_options=numa_options + ) + logger.info( + "Computed logical_cpu_indices=%s for NUMA binding", + _get_ranges_str_from_ints(logical_cpu_indices), ) - logger.info("Computed numactl_options=%r", numactl_options) - - _raise_if_numactl_fails_dry_run(numactl_options=numactl_options) - logger.info("Validated numactl_options=%r", numactl_options) - full_numactl_command = _get_assembled_command_from_pieces( - command_args=command_args, numactl_options=numactl_options + _raise_if_logical_cpu_indices_invalid(logical_cpu_indices=logical_cpu_indices) + logger.info( + "Validated logical_cpu_indices=%s for NUMA binding", + _get_ranges_str_from_ints(logical_cpu_indices), ) + + _bind_current_process_to_logical_cpus(logical_cpu_indices=logical_cpu_indices) logger.info( - "Successfully wrapped command with numa_bindings. Returning %r", - full_numactl_command, + "Successfully bound to logical_cpu_indices=%r for NUMA binding", + _get_ranges_str_from_ints(logical_cpu_indices), ) + signpost_event( category="numa_binding", - name="wrap_command_success", - parameters={**kwargs, "result": full_numactl_command}, + name="apply_success", + parameters={ + **kwargs, + "logical_cpu_indices": _get_ranges_str_from_ints(logical_cpu_indices), + }, ) - return full_numactl_command except Exception: signpost_event( category="numa_binding", - name="wrap_command_exception", + name="apply_exception", parameters={ **kwargs, "traceback": traceback.format_exc(), }, ) - logger.exception( - "Failed to wrap command with NUMA bindings for input = %r", kwargs - ) + logger.exception("Failed to apply NUMA binding for input=%r", kwargs) if numa_options.should_fall_back_if_binding_fails: - logger.warning("Falling back to original command without NUMA bindings.") + logger.warning( + "Continuing executing without applying NUMA binding, despite exception %s", + traceback.format_exc(), + ) return None raise -def _get_temporary_executable_for_command( - *, - command_args: tuple[str, ...], -) -> str: - """ - Returns: - Path to a temporary file which executes the specified command. The executable - deletes itself the first time it runs, so do not try to run it multiple times. - """ - fd, path = mkstemp( - prefix="pytorch-numa-bind", - suffix=".sh", - ) - - # We do rm first to guarantee the file deletes itself. The rest of the file - # will still run as intended. - contents = f"""#!/bin/bash - -# If this file is more than a few minutes old and still exists on your machine, -# that is NOT expected. It should have deleted itself. If you are seeing an accumulation of such -# files, that could suggest a bug in pytorch. See https://github.com/pytorch/pytorch/pull/160163. 
+def _raise_if_logical_cpu_indices_invalid(*, logical_cpu_indices: set[int]) -> None: + if not logical_cpu_indices: + raise RuntimeError("Must bind to a non-empty set of CPU indices") -rm -- "$0" -{" ".join(command_args)} -""" - with os.fdopen(fd, "w") as file: - file.write(contents) +def _bind_current_process_to_logical_cpus(*, logical_cpu_indices: set[int]) -> None: + # 0 represents the current process + os.sched_setaffinity(0, logical_cpu_indices) - # Ensure the file is fully synced, in order to avoid race condition - # from trying to execute it too early. - file.flush() - os.fsync(fd) - # Make the script executable - os.chmod(path, stat.S_IRWXU) - - logger.info( - "Created temporary executable at path %s, with contents\n%s", path, contents - ) - - return path - - -def _get_numactl_cli_options( +def _get_logical_cpus_to_bind_to( *, - command_args: tuple[str, ...], gpu_index: int, numa_options: NumaOptions, -) -> tuple[str, ...]: +) -> set[int]: """ Args: - command_args: The args for a command, such as might be input to Popen. - Example: ("python", "trainer.py") - gpu_index: The index of the GPU that will be used by the subprocess which executes command_args. + gpu_index: The index of the GPU that will be used by the subprocess. Example: 0 numa_options: See NumaOptions for details. Returns: - Depending on numa_options, something like - ("--cpunodebind=0") + Set of logical CPU indices to bind to. """ if numa_options.affinity_mode == AffinityMode.NODE: - numactl_command_options = _get_node_numactl_options(gpu_index=gpu_index) + logical_cpus = _node_get_logical_cpus_to_bind_to(gpu_index=gpu_index) elif numa_options.affinity_mode == AffinityMode.SOCKET: - numactl_command_options = _get_socket_numactl_options(gpu_index=gpu_index) + logical_cpus = _socket_get_logical_cpus_to_bind_to(gpu_index=gpu_index) elif numa_options.affinity_mode == AffinityMode.EXCLUSIVE: - numactl_command_options = _get_exclusive_numactl_options(gpu_index=gpu_index) + logical_cpus = _exclusive_get_logical_cpus_to_bind_to(gpu_index=gpu_index) elif numa_options.affinity_mode == AffinityMode.CORE_COMPLEX: - numactl_command_options = _get_core_complex_numactl_options(gpu_index=gpu_index) + logical_cpus = _core_complex_get_logical_cpus_to_bind_to(gpu_index=gpu_index) else: raise ValueError(f"Affinity mode {numa_options.affinity_mode} not supported.") - return numactl_command_options - - -def _raise_if_numactl_fails_dry_run(*, numactl_options: tuple[str, ...]) -> None: - noop_args = _get_assembled_command_from_pieces( - # Execute arbitrary noop - command_args=("true",), - numactl_options=numactl_options, - ) - - temporary_executable_path = _get_temporary_executable_for_command( - command_args=noop_args - ) - - try: - run( - (temporary_executable_path,), - stdout=subprocess.DEVNULL, - # These allow us to capture the stderr as text - stderr=subprocess.PIPE, - text=True, - # Raise exception if nonzero exit status. - check=True, - ) - except subprocess.CalledProcessError as e: - raise RuntimeError( - f"""Our binding logic inferred to prepend your command with options {noop_args[:-1]}. - Before doing that, we did a noop dry run with args {noop_args}, but that command failed. - This should NOT happen, and likely suggests a bug in pytorch's numa binding logic. - - The {_NUMACTL_COMMAND} command itself had this stderr: - - {e.stderr} - """ - ) from e - - -def _get_assembled_command_from_pieces( - *, command_args: tuple[str, ...], numactl_options: tuple[str, ...] 
-) -> tuple[str, ...]: - # Syntax for invoking a command but with numactl activated is numactl command - return (_NUMACTL_COMMAND, *numactl_options, *command_args) + return logical_cpus -def _raise_if_numactl_not_available() -> None: - if not shutil.which(_NUMACTL_COMMAND): - raise RuntimeError( - f"{_NUMACTL_COMMAND} shell command is required for NUMA bindings." - ) - - -def _get_node_numactl_options(*, gpu_index: int) -> tuple[str, ...]: +def _node_get_logical_cpus_to_bind_to(*, gpu_index: int) -> set[int]: """ Core logic of 'node' numa strategy. - - Returns options to be used with numactl. E.g., - ("--cpunodebind=0"). """ numa_node_index = _get_numa_node_index_for_gpu_index(gpu_index=gpu_index) - return (f"--cpunodebind={numa_node_index}",) + return _get_allowed_logical_cpu_indices_for_numa_node( + numa_node_index=numa_node_index + ) -def _get_socket_numactl_options(*, gpu_index: int) -> tuple[str, ...]: +def _socket_get_logical_cpus_to_bind_to(*, gpu_index: int) -> set[int]: """ Core logic of 'socket' numa strategy. """ @@ -306,12 +187,19 @@ def _get_socket_numactl_options(*, gpu_index: int) -> tuple[str, ...]: numa_node_indices = _get_numa_node_indices_for_socket_index( socket_index=socket_index ) - numa_node_indices_str = _get_ranges_str_from_ints(numa_node_indices) - return (f"--cpunodebind={numa_node_indices_str}",) + logical_cpus = set() + for numa_node_index in numa_node_indices: + logical_cpus.update( + _get_allowed_logical_cpu_indices_for_numa_node( + numa_node_index=numa_node_index + ) + ) + + return logical_cpus -def _get_exclusive_numactl_options(*, gpu_index: int) -> tuple[str, ...]: +def _exclusive_get_logical_cpus_to_bind_to(*, gpu_index: int) -> set[int]: """ Core logic of 'exclusive' numa strategy. """ @@ -370,20 +258,18 @@ def _get_exclusive_numactl_options(*, gpu_index: int) -> tuple[str, ...]: ) # Slice and flatten the logical CPUs from the selected physical cores - logical_cpu_indices_for_original_gpu = ( + logical_cpu_indices_for_original_gpu = { logical_cpu_index for logical_cpu_indices in list( physical_core_to_allowed_logical_cpu_indices.values() )[start:end] for logical_cpu_index in logical_cpu_indices - ) + } - return ( - f"--physcpubind={_get_ranges_str_from_ints(logical_cpu_indices_for_original_gpu)}", - ) + return logical_cpu_indices_for_original_gpu -def _get_core_complex_numactl_options(*, gpu_index: int) -> tuple[str, ...]: +def _core_complex_get_logical_cpus_to_bind_to(*, gpu_index: int) -> set[int]: """ Core logic of 'core-complex' numa strategy. 
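For context on the numactl-free strategy used throughout this file: each affinity mode now resolves to a set of logical CPU indices, and binding happens in-process via `os.sched_setaffinity`, applied before the child process starts and then restored. A minimal sketch of that save/apply/restore pattern, assuming Linux and an illustrative CPU set:

    import os
    from contextlib import contextmanager

    @contextmanager
    def temporarily_bind_to(logical_cpu_indices):
        original = os.sched_getaffinity(0)        # 0 means the current process
        os.sched_setaffinity(0, logical_cpu_indices)
        try:
            yield
        finally:
            os.sched_setaffinity(0, original)     # restore the parent's original affinity

    with temporarily_bind_to({0, 1, 2, 3}):
        pass  # a subprocess started here inherits the temporary affinity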
@@ -427,9 +313,7 @@ def _get_core_complex_numactl_options(*, gpu_index: int) -> tuple[str, ...]: max_level_cache_to_allowed_logical_cpu_indices.values() )[cache_index_for_original_gpu] - return ( - f"--physcpubind={_get_ranges_str_from_ints(logical_cpu_indices_for_original_gpu)}", - ) + return logical_cpu_indices_for_original_gpu K = TypeVar("K") diff --git a/torch/onnx/__init__.py b/torch/onnx/__init__.py index 6c301ef294eb1..7eaa0a5677c4b 100644 --- a/torch/onnx/__init__.py +++ b/torch/onnx/__init__.py @@ -37,7 +37,6 @@ # Base error "OnnxExporterError", "ONNXProgram", - "enable_fake_mode", ] from typing import Any, Callable, TYPE_CHECKING @@ -47,7 +46,6 @@ from torch._C import _onnx as _C_onnx from torch._C._onnx import OperatorExportTypes, TensorProtoDataType, TrainingMode -from ._internal._exporter_legacy import enable_fake_mode from ._internal.exporter._onnx_program import ONNXProgram from ._type_utils import JitScalarType from .errors import OnnxExporterError @@ -90,7 +88,6 @@ JitScalarType.__module__ = "torch.onnx" ONNXProgram.__module__ = "torch.onnx" OnnxExporterError.__module__ = "torch.onnx" -enable_fake_mode.__module__ = "torch.onnx" producer_name = "pytorch" producer_version = _C_onnx.PRODUCER_VERSION @@ -169,7 +166,10 @@ def export( output_names: names to assign to the output nodes of the graph, in order. opset_version: The version of the `default (ai.onnx) opset `_ - to target. Must be >= 7. + to target. You should set ``opset_version`` according to the supported opset versions + of the runtime backend or compiler you want to run the exported model with. + Leave as default (``None``) to use the recommended version, or refer to + the ONNX operators documentation for more information. dynamic_axes: By default the exported model will have the shapes of all input and output tensors diff --git a/torch/onnx/_internal/_exporter_legacy.py b/torch/onnx/_internal/_exporter_legacy.py deleted file mode 100644 index f9ae42b26b84f..0000000000000 --- a/torch/onnx/_internal/_exporter_legacy.py +++ /dev/null @@ -1,118 +0,0 @@ -# mypy: allow-untyped-defs -from __future__ import annotations - - -__all__ = [ - "enable_fake_mode", -] - - -import contextlib -import dataclasses -import logging -from typing import Any, TYPE_CHECKING - -import torch -import torch._ops -from torch.onnx._internal.fx import patcher as patcher - - -# We can only import onnx from this module in a type-checking context to ensure that -# 'import torch.onnx' continues to work without having 'onnx' installed. We fully -# 'import onnx' inside of dynamo_export (by way of _assert_dependencies). -if TYPE_CHECKING: - import io - - from torch._subclasses import fake_tensor - -log = logging.getLogger(__name__) - - -@dataclasses.dataclass -class ONNXFakeContext: - """A dataclass used to store context for model export using FakeTensor. - - This dataclass stores the FakeTensorMode instance used to convert - real tensors and model parameters into fake tensors. This :attr:`ONNXFakeContext.fake_mode` is - reused internally during tracing of a :class:`torch.nn.Module` into a FX :class:`GraphModule`. - """ - - fake_mode: fake_tensor.FakeTensorMode - """The fake tensor mode used for tracing model using fake tensors and parameters.""" - - state_dict_paths: tuple[str | io.BytesIO | dict[str, Any]] | None = None - """List of paths of files that contain the model :meth:`state_dict`""" - - -@contextlib.contextmanager -def enable_fake_mode(): - """Enable fake mode for the duration of the context. 
- - Internally it instantiates a :class:`torch._subclasses.fake_tensor.FakeTensorMode` context manager - that converts user input and model parameters into :class:`torch._subclasses.fake_tensor.FakeTensor`. - - A :class:`torch._subclasses.fake_tensor.FakeTensor` - is a :class:`torch.Tensor` with the ability to run PyTorch code without having to - actually do computation through tensors allocated on a ``meta`` device. Because - there is no actual data being allocated on the device, this API allows for - initializing and exporting large models without the actual memory footprint needed for executing it. - - It is highly recommended to initialize the model in fake mode when exporting models that - are too large to fit into memory. - - .. note:: - This function does not support torch.onnx.export(..., dynamo=True, optimize=True). - Please call ONNXProgram.optimize() outside of the function after the model is exported. - - Example:: - - # xdoctest: +REQUIRES(env:TORCH_DOCTEST_ONNX) - >>> import torch - >>> class MyModel(torch.nn.Module): # Model with a parameter - ... def __init__(self) -> None: - ... super().__init__() - ... self.weight = torch.nn.Parameter(torch.tensor(42.0)) - ... def forward(self, x): - ... return self.weight + x - >>> with torch.onnx.enable_fake_mode(): - ... # When initialized in fake mode, the model's parameters are fake tensors - ... # They do not take up memory so we can initialize large models - ... my_nn_module = MyModel() - ... arg1 = torch.randn(2, 2, 2) - >>> onnx_program = torch.onnx.export(my_nn_module, (arg1,), dynamo=True, optimize=False) - >>> # Saving model WITHOUT initializers (only the architecture) - >>> onnx_program.save( - ... "my_model_without_initializers.onnx", - ... include_initializers=False, - ... keep_initializers_as_inputs=True, - ... ) - >>> # Saving model WITH initializers after applying concrete weights - >>> onnx_program.apply_weights({"weight": torch.tensor(42.0)}) - >>> onnx_program.save("my_model_with_initializers.onnx") - - .. warning:: - This API is experimental and is *NOT* backward-compatible. - - """ - from torch._subclasses import fake_tensor - from torch.fx.experimental.symbolic_shapes import ShapeEnv - - # This overrides the internal `FakeTensorMode` instance created by `torch._dynamo.export`[1]. 
- # It is a good idea to keep them in sync (constructor args) to maintain the same default behavior - # [1] `torch/_dynamo/output_graph.py::InstructionTranslator::OutputGraph.__init__` - # Mixed fake/real tensors are only allowed when `torch.onnx.dynamo_export` is not called within `FakeTensorMode` - # This is needed because models can create new parameters during `forward(self, *args, **kwargs)` run - fake_mode = fake_tensor.FakeTensorMode( - allow_non_fake_inputs=not torch._guards.detect_fake_mode(), - shape_env=ShapeEnv( - allow_scalar_outputs=False, allow_dynamic_output_shape_ops=False - ), - ) - # The patcher is needed for when user calls `fake_model.load_state_dict(...)` within fake mode - patcher_context = patcher.ONNXTorchPatcher() - fake_context = ONNXFakeContext(fake_mode=fake_mode) - with fake_mode, patcher_context: - yield fake_context - fake_context.state_dict_paths = tuple( - patcher_context.paths, - ) # type: ignore[assignment] diff --git a/torch/onnx/_internal/_lazy_import.py b/torch/onnx/_internal/_lazy_import.py index 3557ef099309e..5e2340fe4c42d 100644 --- a/torch/onnx/_internal/_lazy_import.py +++ b/torch/onnx/_internal/_lazy_import.py @@ -30,7 +30,7 @@ def __getattr__(self, attr: str) -> object: import onnx import onnx_ir # type: ignore[import-untyped] import onnxscript - import onnxscript._framework_apis.torch_2_8 as onnxscript_apis + import onnxscript._framework_apis.torch_2_9 as onnxscript_apis onnxscript_ir = onnx_ir @@ -38,4 +38,4 @@ def __getattr__(self, attr: str) -> object: onnx = _LazyModule("onnx") onnxscript = _LazyModule("onnxscript") onnxscript_ir = _LazyModule("onnx_ir") - onnxscript_apis = _LazyModule("onnxscript._framework_apis.torch_2_8") + onnxscript_apis = _LazyModule("onnxscript._framework_apis.torch_2_9") diff --git a/torch/onnx/_internal/exporter/_compat.py b/torch/onnx/_internal/exporter/_compat.py index cf83aa4061543..2e25730adca23 100644 --- a/torch/onnx/_internal/exporter/_compat.py +++ b/torch/onnx/_internal/exporter/_compat.py @@ -13,6 +13,7 @@ from torch.onnx import _constants as onnx_constants from torch.onnx._internal._lazy_import import onnxscript_apis, onnxscript_ir as ir from torch.onnx._internal.exporter import ( + _constants, _core, _dynamic_shapes, _onnx_program, @@ -107,7 +108,27 @@ def export_compat( dynamic_shapes_with_export_dim, need_axis_mapping = ( _dynamic_shapes.convert_str_to_export_dim(dynamic_shapes) ) - registry = _registration.ONNXRegistry().from_torchlib(opset_version=opset_version) + + if opset_version < _constants.TORCHLIB_OPSET: + logger.warning( + "Setting ONNX exporter to use operator set version %s because " + "the requested opset_version %s is a lower version than we have implementations for. " + "Automatic version conversion will be performed, which may not be successful " + "at converting to the requested version. If version conversion is unsuccessful, " + "the opset version of the exported model will be kept at %s. 
" + "Please consider setting opset_version >=%s to leverage latest ONNX features", + _constants.TORCHLIB_OPSET, + opset_version, + _constants.TORCHLIB_OPSET, + _constants.TORCHLIB_OPSET, + ) + registry_opset_version = _constants.TORCHLIB_OPSET + else: + registry_opset_version = opset_version + + registry = _registration.ONNXRegistry().from_torchlib( + opset_version=registry_opset_version + ) if custom_translation_table is not None: for torch_op, onnx_ops in custom_translation_table.items(): # TODO(justinchuby): Support complex inputs with annotations diff --git a/torch/onnx/_internal/fx/__init__.py b/torch/onnx/_internal/fx/__init__.py index b5716bdafced7..e69de29bb2d1d 100644 --- a/torch/onnx/_internal/fx/__init__.py +++ b/torch/onnx/_internal/fx/__init__.py @@ -1,8 +0,0 @@ -from .patcher import ONNXTorchPatcher -from .serialization import save_model_with_external_data - - -__all__ = [ - "save_model_with_external_data", - "ONNXTorchPatcher", -] diff --git a/torch/onnx/_internal/fx/passes/_utils.py b/torch/onnx/_internal/fx/passes/_utils.py deleted file mode 100644 index a7b05786ab171..0000000000000 --- a/torch/onnx/_internal/fx/passes/_utils.py +++ /dev/null @@ -1,114 +0,0 @@ -# mypy: allow-untyped-defs -"""Common utility functions for FX passes. - -These functions should NOT be directly invoked outside of `passes` package. -""" - -from __future__ import annotations - -import collections -import re -from typing import Callable - -import torch.fx -import torch.fx.traceback as fx_traceback - - -def wrap_graph_module_for_node_meta_preservation( - graph_module: torch.fx.GraphModule, -) -> Callable: - """Wrap a GraphModule with contexts to preserve node meta information, such as stacktrace info. - - This is typically useful before calling `make_fx`. Without this wrapper, the - stacktrace information will be lost afterwards. - """ - - def wrapped(*args): - with fx_traceback.preserve_node_meta(): - return torch.fx.Interpreter(graph_module).run(*args) - - return wrapped - - -def _get_node_base_name(node_name: str) -> tuple[str, int | None]: - pattern = r"(.*)\.(\d+)" - match = re.match(pattern, node_name) - if match is not None: - base_name, count_str = match.groups() - return base_name, int(count_str) - return node_name, None - - -def set_node_name( - node: torch.fx.Node, - new_name: str, - name_to_node_cache: dict[str, torch.fx.Node], -): - """Safely set the unique name of a node. - - If the new name is already taken by another node, the name of the other node will be - updated. If `new_name` is a string of format f"{base_name}.{count}", where `count` - is an integer, the other node will be renamed as f"{base_name}.{count+1}". If not, - the other node will be renamed as "{new_name}.1". This function will iteratively - update the names until there is no conflict. - - ``name_to_node_cache`` is required as an argument to avoid recomputation. The caller - is responsible for ensuring the cache is accurate and in sync with the owning module - of the node. The values in the cache will be updated accordingly. - - Args: - node: The node to update. - new_name: The new name to use. - name_to_node_cache: A cache of node names to nodes. 
- """ - node_name_to_set = collections.deque([(node, new_name)]) - - while node_name_to_set: - node, new_name = node_name_to_set.pop() - if new_name in name_to_node_cache and name_to_node_cache[new_name] != node: - base_name, postfix_count = _get_node_base_name(new_name) - if postfix_count is None: - postfix_count = 0 - node_name_to_set.append( - (name_to_node_cache[new_name], f"{base_name}.{postfix_count + 1}") - ) - node.name = new_name - name_to_node_cache[new_name] = node - - -def replace_placeholder_name_and_target( - module: torch.fx.GraphModule, reference_module: torch.fx.GraphModule -): - """Replace the argument names in module with those in reference_module. - - This function assumes the two modules have the same signature structure. - The caller is responsible for ensuring this. Otherwise, the behavior of this - function is undefined. This function only does minimal sanity check that the two - modules have the same number of arguments. - - Name conflicts between new names and existing node names in the graph are handled. - Check the documentation of :func:`set_node_name` for more details. - - Raises: - RuntimeError: If the two modules have different number of arguments. - """ - placeholders = [node for node in module.graph.nodes if node.op == "placeholder"] - reference_placeholders = [ - node for node in reference_module.graph.nodes if node.op == "placeholder" - ] - - if len(placeholders) != len(reference_placeholders): - raise RuntimeError( - "The two modules have different number of arguments. " - f"module: {len(placeholders)}, reference_module: {len(reference_placeholders)}" - ) - - name_to_node: dict[str, torch.fx.Node] = {} - for node in module.graph.nodes: - name_to_node[node.name] = node - - for placeholder, reference_placeholder in zip(placeholders, reference_placeholders): - placeholder.target = reference_placeholder.target - set_node_name(placeholder, reference_placeholder.name, name_to_node) - - module.recompile() diff --git a/torch/onnx/_internal/fx/patcher.py b/torch/onnx/_internal/fx/patcher.py deleted file mode 100644 index 6c9724e9f5a73..0000000000000 --- a/torch/onnx/_internal/fx/patcher.py +++ /dev/null @@ -1,143 +0,0 @@ -# mypy: allow-untyped-defs -import copy -import functools -from typing import TYPE_CHECKING, Union - -import torch - - -if TYPE_CHECKING: - import io - - -# TODO: Remove after https://github.com/huggingface/safetensors/pull/318 -@functools.cache -def has_safetensors_and_transformers(): - try: - # safetensors is not an exporter requirement, but needed for some huggingface models - import safetensors # type: ignore[import] # noqa: F401 - import transformers # type: ignore[import] # noqa: F401 - from safetensors import torch as safetensors_torch # noqa: F401 - - return True - except ImportError: - return False - - -class ONNXTorchPatcher: - """Context manager to temporarily patch PyTorch during FX-to-ONNX export. - - This class is a collection of "patches" required by FX-to-ONNX exporter. - - This context overrides several torch functions to support symbolic - export of large scale models. - - torch.load: - This function is patched to record the files PyTorch stores model - parameters and buffers. Downstream FX-to-ONNX exporter can create - initializers from these files. - torch.fx._symbolic_trace._wrapped_methods_to_patch: - This list is extended with (torch.Tensor, "__getitem__") so that - weight[x, :, y] becomes exportable with torch.fx.symbolic_trace. 
- safetensors.torch.load_file: - This function is patched to allow safetensors to be loaded within - FakeTensorMode. Remove after https://github.com/huggingface/safetensors/pull/318 - - Search for ONNXTorchPatcher in test_fx_to_onnx_with_onnxruntime.py for - example usage. - - TODO: Should this really be a global patcher? Can we make it a local patcher? - A reason for splitting this into several patchers is to patch one part of the code - as a collateral damage of patching another part of the code. For example, we - for tracing model with torch._dynamo.export, we don't need to patch - `torch.fx._symbolic_trace._wrapped_methods_to_patch` - """ - - def __init__(self) -> None: - # List of file paths processed by torch.load. - self.paths: list[Union[str, io.BufferedIOBase]] = [] - - def torch_load_wrapper(f, *args, **kwargs): - # Record path for later serialization into ONNX proto - self.paths.append(f) - # Then, call the original torch.load. - return self.torch_load(f, *args, **kwargs) - - # Original version of torch.load. - self.torch_load = torch.load - - # Wrapper or modified version of torch functions. - self.torch_load_wrapper = torch_load_wrapper - - if has_safetensors_and_transformers(): - import safetensors - import transformers - - def safetensors_load_file_wrapper(filename, device="cpu"): - # Record path for later serialization into ONNX proto - self.paths.append(filename) - result = {} - with safetensors.torch.safe_open( # type: ignore[attr-defined] - filename, framework="pt", device=device - ) as f: - for k in f.keys(): - fake_mode = torch._guards.detect_fake_mode() - if not fake_mode: - result[k] = f.get_tensor(k) - else: - empty_tensor = f.get_slice(k) - result[k] = torch.empty( - tuple(empty_tensor.get_shape()), - dtype=safetensors.torch._getdtype( - empty_tensor.get_dtype() - ), - ) - return result - - self.safetensors_torch_load_file = safetensors.torch.load_file - self.safetensors_torch_load_file_wrapper = safetensors_load_file_wrapper - self.transformers_modeling_utils_safe_load_file = ( - transformers.modeling_utils.safe_load_file - ) - - def __enter__(self): - torch.load = self.torch_load_wrapper - - self.torch_fx__symbolic_trace__wrapped_methods_to_patch = ( - torch.fx._symbolic_trace._wrapped_methods_to_patch - ) - desired_wrapped_methods = copy.deepcopy( - torch.fx._symbolic_trace._wrapped_methods_to_patch - ) - if (torch.Tensor, "__getitem__") not in desired_wrapped_methods: - # Adding `__getitem__` to the patching list will make tensor indexing traceable via - # torch.fx.symbolic_trace. Otherwise, `tensor[x, :, y]` cannot be traced. - # This happens because `__getitem__` is neither under torch domain nor an aten operator, - # so the patching (or similar Proxy-generating mechanism) doesn't happen automatically. - # Note that torch.fx.symbolic_trace defines FX_PATCH_GETITEM environment variable for - # enabling the line below for patching. 
- desired_wrapped_methods.append((torch.Tensor, "__getitem__")) - torch.fx._symbolic_trace._wrapped_methods_to_patch = desired_wrapped_methods - - if has_safetensors_and_transformers(): - import safetensors - import transformers - - safetensors.torch.load_file = self.safetensors_torch_load_file_wrapper - transformers.modeling_utils.safe_load_file = ( - self.safetensors_torch_load_file_wrapper - ) - - def __exit__(self, exc_type, exc_value, traceback): - torch.load = self.torch_load - torch.fx._symbolic_trace._wrapped_methods_to_patch = ( - self.torch_fx__symbolic_trace__wrapped_methods_to_patch - ) - if has_safetensors_and_transformers(): - import safetensors - import transformers - - safetensors.torch.load_file = self.safetensors_torch_load_file - transformers.modeling_utils.safe_load_file = ( - self.transformers_modeling_utils_safe_load_file - ) diff --git a/torch/onnx/_internal/fx/serialization.py b/torch/onnx/_internal/fx/serialization.py deleted file mode 100644 index cda71e465758d..0000000000000 --- a/torch/onnx/_internal/fx/serialization.py +++ /dev/null @@ -1,250 +0,0 @@ -# mypy: allow-untyped-defs -from __future__ import annotations - -import io -import logging -import os -from typing import IO, TYPE_CHECKING - -import torch -from torch.onnx import _type_utils as jit_type_utils - - -if TYPE_CHECKING: - import onnx - - from torch.types import FileLike - -log = logging.getLogger(__name__) - - -def _create_tensor_proto_with_external_data( - tensor: torch.Tensor, - name: str, - location: str, - basepath: str, - dtype_override: onnx.TypeProto | None = None, # type: ignore[name-defined] -) -> onnx.TensorProto: # type: ignore[name-defined] - """Create a TensorProto with external data from a PyTorch tensor. - The external data is saved to os.path.join(basepath, location). - - Args: - tensor: Tensor to be saved. - name: Name of the tensor (i.e., initializer name in ONNX graph). - location: Relative location of the external data file - (e.g., "/tmp/initializers/weight_0" when model is "/tmp/model_name.onnx"). - basepath: Base path of the external data file (e.g., "/tmp/external_data" while model must be in "/tmp"). - - - Reference for ONNX's external data format: - How to load? - https://github.com/onnx/onnx/blob/5dac81ac0707bdf88f56c35c0a5e8855d3534673/onnx/external_data_helper.py#L187 - How to save? - https://github.com/onnx/onnx/blob/5dac81ac0707bdf88f56c35c0a5e8855d3534673/onnx/external_data_helper.py#L43 - How to set ONNX fields? - https://github.com/onnx/onnx/blob/5dac81ac0707bdf88f56c35c0a5e8855d3534673/onnx/external_data_helper.py#L88 - """ - # FIXME: Avoid importing onnx into torch.onnx. - import onnx - - scalar_type = ( - jit_type_utils.JitScalarType.from_onnx_type( - dtype_override.tensor_type.elem_type - ) - if dtype_override is not None - else jit_type_utils.JitScalarType.from_dtype(tensor.dtype) - ) - - # Checkpoints can be stored with a different dtype as the model expects because - # the user script can explicitly cast the original type to something or maybe - # PyTorch's type promotion might do it - if dtype_override is not None and scalar_type.dtype() != tensor.dtype: - tensor = tensor.to(scalar_type.dtype()) - - tensor_proto = onnx.TensorProto() # type: ignore[attr-defined] - tensor_proto.name = name - tensor_proto.data_type = scalar_type.onnx_type() # type: ignore[assignment] - - tensor_proto.dims.extend(tensor.shape) - tensor_proto.data_location = onnx.TensorProto.EXTERNAL # type: ignore[attr-defined] - - # Settings for saving one tensor per file. 
- # Offset is zero because there is no other tensor in the same file. - key_value_pairs = { - "location": location, - "offset": 0, - "length": tensor.untyped_storage().nbytes(), - } - for k, v in key_value_pairs.items(): - entry = tensor_proto.external_data.add() - entry.key = k - entry.value = str(v) - - # Actual path to write content of tensor. - external_data_file_path = os.path.join(basepath, location) - if os.path.exists(external_data_file_path): - os.remove(external_data_file_path) - - # Create external data's folder if not exists. - external_data_dir_path = os.path.dirname(external_data_file_path) - if not os.path.exists(external_data_dir_path): - # if the demo_folder directory is not present - # then create it. - os.makedirs(external_data_dir_path) - - # Create a fresh file. - with open(external_data_file_path, "xb") as data_file: - # No need to call "seek" because offset is 0. - # data_file.seek(0) - # Write tensor content to the file. - data_file.write(tensor.numpy(force=True).tobytes()) - - return tensor_proto - - -def _convert_safetensors_to_torch_format(safetensors_file): - # It this function is called, safetensors is guaranteed to exist - # because the HF model with safetensors was already loaded and exported to ONNX - from safetensors import safe_open # type: ignore[import-not-found, import-untyped] - - tensors = {} - with safe_open(safetensors_file, framework="pt", device="cpu") as f: # type: ignore[attr-defined] - for k in f.keys(): - tensors[k] = f.get_tensor(k).cpu() - return tensors - - -# TODO: generalize to allow more checkpoints formats (torch or gguf) -def save_model_with_external_data( - basepath: str, - model_location: str, - initializer_location: str, - torch_state_dicts: tuple[dict | FileLike, ...], - onnx_model: onnx.ModelProto, # type: ignore[name-defined] - rename_initializer: bool = False, -) -> None: - """Load PyTorch tensors from files and add to "onnx_model" as external initializers. - - Output files: - ONNX model file path: - ONNX initializer folder: os.path.join(basepath, initializer_location) - - After running this function, you can do - ort_sess = onnxruntime.InferenceSession(os.path.join(basepath, model_location)) - to execute the model. - - Arguments: - basepath: Base path of the ONNX external data file (e.g., "/path/to/large_model/"). - model_location: Relative location of the ONNX model file. - E.g., "model.onnx" so that the model file is saved to - "/model.onnx". - initializer_location: Relative location of the ONNX initializer folder. - E.g., "initializers" so that the initializers are saved to - "/initializers/". - Note: When initializers are >2GB, must be the same as `model_location`. - torch_state_dicts: Dictionaries or files which contain PyTorch tensors to be saved - as ONNX initializers. For non-dict arguments, `torch.load` will be used to load them from file-like objects. - onnx_model: ONNX model to be saved with external initializers. - If an input name matches a tensor loaded from "torch_state_dicts", - the tensor will be saved as that input's external initializer. - rename_initializer: Replaces "." by "_" for all ONNX initializer names. - Not needed by the official torch.onnx.dynamo_export. This is a hack - for supporting `FXSymbolicTracer` tracer with fake tensor mode. - In short, `FXSymbolicTracer` lifts FX parameters (self.linear_weight) - as inputs (`def forward(self, linear_weight)`) and therefore, `.` cannot be used. - """ - # FIXME: Avoid importing onnx into torch.onnx. 
- import onnx - - initializers_to_be_deleted = {} # Using dict because it is **ordered** - existing_initializers = { - k.name: idx for idx, k in enumerate(onnx_model.graph.initializer) - } - onnx_input_names = {input.name for input in onnx_model.graph.input} - for el in torch_state_dicts: - if isinstance(el, dict): - # Useful for when state_dict is loaded with torch.load(..., mmap=True, map_location="cpu") by the user - # Using torch.save wouldn't leverage mmap, leading to higher memory usage - state_dict = el - else: - if isinstance(el, (str, os.PathLike)) and os.fspath(el).endswith( - ".safetensors" - ): - state_dict = _convert_safetensors_to_torch_format(el) - else: - try: - # Loads checkpoint using memory-map on CPU to support really large models - # The underlying torch.UntypedStorage is memory mapped, so state_dict is lazy loaded - state_dict = torch.load(el, map_location="cpu", mmap=True) - except (RuntimeError, ValueError) as e: - if "mmap can only be used with files saved with" in str(e) or ( - isinstance(el, (io.IOBase, IO)) - and el.readable() - and el.seekable() - ): - log.warning( - "Failed to load the checkpoint with memory-map enabled, retrying without memory-map." - "Consider updating the checkpoint with mmap by using torch.save() on PyTorch version >= 1.6." - ) - if isinstance(el, (io.IOBase, IO)): - el.seek(0) # torch.load from `try:` has read the file. - state_dict = torch.load(el, map_location="cpu") - else: - raise e - - for name, tensor in state_dict.items(): - if rename_initializer: - # Basically, "transformer.attention.self.query.weight" is mapped - # to "transformer_attention_self_query_weight" for mimicking the - # name-modifying code in FX-to-ONNX exporter. - # See function _replace_get_attr_with_placeholder for details. - name = name.replace(".", "_") - - # This block tries to match the onnx initializer name with torch parameter/buffer - # e.g. A pytorch buffer 'transformer.h.0.attn.bias' can be named 'h.0.attn.bias' in a ONNX initializer - # For each PyTorch tensor name loaded by torch.load, - # 1. Search its best match in ONNX model. E.g., the match of - # "transformer_attention_weight" could be "attention_weight". - # 2. Set "tensor" as the initializer of the matched ONNX input. - # E.g., "tensor" is stored as the initializer of "attention_weight". - # Step 1 is required because sometimes, tensor names are stored with prefix the dictionary - # loaded by torch.load. - if name in onnx_input_names: - # Same input name shouldn't be matched again - onnx_input_names.remove(name) - else: - for onnx_input_name in onnx_input_names: - if onnx_input_name.endswith(name) or name.endswith(onnx_input_name): - # Find a match. Change name to the matched ONNX input name, so that we - # create initializer with the right ONNX name. - name = onnx_input_name - onnx_input_names.remove(onnx_input_name) - break - - relative_tensor_file_path = os.path.join(initializer_location, name) - # Create one file per tensor. - # tensor_proto.raw_data is stored to external file at - # os.path.join(basepath, relative_tensor_file_path). 
- model_input_types = {k.name: k.type for k in onnx_model.graph.input} - - # Mark for deletion - a replacement will be appended next - if name in existing_initializers: - initializers_to_be_deleted[existing_initializers[name]] = name - tensor_proto = _create_tensor_proto_with_external_data( - tensor, - name, - relative_tensor_file_path, - basepath, - model_input_types.pop(name, None), - ) - # Add the tensor_proto to the ONNX model as an initializer with external data. - onnx_model.graph.initializer.append(tensor_proto) - # Remove old duplicated initializers, if any. delete in desc order to not invalidate deletion indices - initializers_to_be_deleted = dict( - sorted(initializers_to_be_deleted.items(), reverse=True) - ) - for idx in initializers_to_be_deleted.keys(): - del onnx_model.graph.initializer[idx] - - # model_location should be a pure file name such as "file_name.onnx", not "folder/file_name.onnx". - onnx.save(onnx_model, os.path.join(basepath, model_location)) # type: ignore[attr-defined] diff --git a/torch/onnx/_onnx_supported_ops.py b/torch/onnx/_onnx_supported_ops.py deleted file mode 100644 index f3d703ffc227f..0000000000000 --- a/torch/onnx/_onnx_supported_ops.py +++ /dev/null @@ -1,98 +0,0 @@ -# mypy: allow-untyped-defs -import inspect -from typing import Union - -from torch import _C -from torch.onnx import _constants -from torch.onnx._internal import registration - - -class _TorchSchema: - def __init__(self, schema: Union[_C.FunctionSchema, str]) -> None: - if isinstance(schema, _C.FunctionSchema): - self.name: str = schema.name - self.overload_name: str = schema.overload_name - self.arguments: list[str] = [arg.name for arg in schema.arguments] - self.optional_arguments: list[str] = [] - self.returns: list[str] = [ret.name for ret in schema.returns] - self.opsets: list[int] = [] - else: - self.name = schema - self.overload_name = "" - self.arguments = [] - self.optional_arguments = [] - self.returns = [] - self.opsets = [] - - def __str__(self) -> str: - s = ( - f"{self.name}.{self.overload_name}(" - + ", ".join(self.arguments) - + ") -> (" - + ", ".join(self.returns) - + ")" - + " in opsets " - + ", ".join(str(opset) for opset in self.opsets) - ) - return s - - def __hash__(self): - # TODO(thiagocrepaldi): handle overload_name? - return hash(self.name) - - def __eq__(self, other) -> bool: - if not isinstance(other, _TorchSchema): - return False - # TODO(thiagocrepaldi): handle overload_name? 
- return self.name == other.name - - def is_aten(self) -> bool: - return self.name.startswith("aten::") - - def is_backward(self) -> bool: - return "backward" in self.name - - -def _symbolic_argument_count(func): - params = [] - signature = inspect.signature(func) - optional_params = [] - for name, parameter in signature.parameters.items(): - if name in {"_outputs", "g"}: - continue - if parameter.default is parameter.empty: - optional_params.append(parameter) - else: - params.append(str(parameter)) - return params - - -def all_forward_schemas() -> dict[str, _TorchSchema]: - """Returns schemas for all TorchScript forward ops.""" - torch_schemas = [_TorchSchema(s) for s in _C._jit_get_all_schemas()] - return {schema.name: schema for schema in torch_schemas if not schema.is_backward()} - - -def all_symbolics_schemas() -> dict[str, _TorchSchema]: - """Returns schemas for all onnx supported ops.""" - symbolics_schemas = {} - - for name in registration.registry.all_functions(): - func_group = registration.registry.get_function_group(name) - assert func_group is not None - symbolics_schema = _TorchSchema(name) - func = func_group.get(_constants.ONNX_MAX_OPSET) - if func is not None: - symbolics_schema.arguments = _symbolic_argument_count(func) - symbolics_schema.opsets = list( - range(func_group.get_min_supported(), _constants.ONNX_MAX_OPSET + 1) - ) - else: - # Only support opset < 9 - func = func_group.get(7) - symbolics_schema.arguments = _symbolic_argument_count(func) - symbolics_schema.opsets = list(range(7, _constants.ONNX_BASE_OPSET)) - - symbolics_schemas[name] = symbolics_schema - - return symbolics_schemas diff --git a/torch/optim/__init__.py b/torch/optim/__init__.py index 7354092dda4e0..1060a6287a8e6 100644 --- a/torch/optim/__init__.py +++ b/torch/optim/__init__.py @@ -8,6 +8,7 @@ from torch.optim import lr_scheduler as lr_scheduler, swa_utils as swa_utils from torch.optim._adafactor import Adafactor as Adafactor +from torch.optim._muon import Muon as Muon from torch.optim.adadelta import Adadelta as Adadelta from torch.optim.adagrad import Adagrad as Adagrad from torch.optim.adam import Adam as Adam @@ -25,6 +26,7 @@ Adafactor.__module__ = "torch.optim" +Muon.__module__ = "torch.optim" del adadelta # type: ignore[name-defined] # noqa: F821 @@ -52,6 +54,7 @@ "ASGD", "LBFGS", "lr_scheduler", + "Muon", "NAdam", "Optimizer", "RAdam", diff --git a/torch/optim/_muon.py b/torch/optim/_muon.py new file mode 100644 index 0000000000000..cc320143db7ab --- /dev/null +++ b/torch/optim/_muon.py @@ -0,0 +1,360 @@ +# mypy: allow-untyped-defs +# mypy: disable-error-code=arg-type +"""Implementation of the Muon optimizer.""" + +import math +from collections.abc import MutableMapping +from typing import Optional + +import torch +from torch import Tensor + +from .optimizer import ( + _disable_dynamo_if_unsupported, + _params_doc, + _to_scalar, + Optimizer, + ParamsT, +) + + +__all__ = ["Muon"] + +# Constants from Keller Jordan's Muon post: https://kellerjordan.github.io/posts/muon/ +# github permlink: https://github.com/KellerJordan/Muon/blob/f90a42b28e00b8d9d2d05865fe90d9f39abcbcbd/muon.py#L16 +EPS = 1e-7 +DEFAULT_A = 3.4445 +DEFAULT_B = -4.7750 +DEFAULT_C = 2.0315 +DEFAULT_NS_STEPS = 5 + + +def _zeropower_via_newtonschulz( + grad: Tensor, ns_coefficients: tuple[float, float, float], ns_steps: int, eps: float +) -> Tensor: + """ + Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. 
We opt to use a + quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose + of minimizing steps, it turns out to be empirically effective to keep increasing the slope at + zero even beyond the point where the iteration no longer converges all the way to one everywhere + on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T + where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model + performance at all relative to UV^T, where USV^T = G is the SVD. + + Implementation reference: https://github.com/KellerJordan/Muon/blob/master/muon.py + with suggestions by @jxbz, @leloykun, and @YouJiacheng. + """ + if ns_steps >= 100: + raise ValueError( + "Number of steps must be less than 100 for computational efficiency" + ) + if len(grad.shape) != 2: + raise ValueError("Input tensor gradient must be a 2D matrix") + if len(ns_coefficients) != 3: + raise ValueError("Coefficients must be a tuple of exactly 3 values") + a, b, c = ns_coefficients + ortho_grad = grad.bfloat16() + if grad.size(0) > grad.size(1): + ortho_grad = ortho_grad.T + # Ensure spectral norm is at most 1 + ortho_grad.div_(ortho_grad.norm().clamp(min=eps)) + # Perform the NS iterations + for _ in range(ns_steps): + gram_matrix = ortho_grad @ ortho_grad.T + gram_update = b * gram_matrix + c * gram_matrix @ gram_matrix + ortho_grad = a * ortho_grad + gram_update @ ortho_grad + + if grad.size(0) > grad.size(1): + ortho_grad = ortho_grad.T + return ortho_grad + + +def _adjust_lr( + lr: float, adjust_lr_fn: Optional[str], param_shape: torch.Size +) -> float: + """Default learning rate adjustment used by Muon.""" + A, B = param_shape[:2] + + if adjust_lr_fn is None or adjust_lr_fn == "original": + adjusted_ratio = math.sqrt(max(1, A / B)) + elif adjust_lr_fn == "match_rms_adamw": + adjusted_ratio = 0.2 * math.sqrt(max(A, B)) + else: + adjusted_ratio = 1.0 + return lr * adjusted_ratio + + +class Muon(Optimizer): + def __init__( + self, + params: ParamsT, + lr: float = 1e-3, + weight_decay: float = 0.1, + momentum: float = 0.95, + nesterov: bool = True, + ns_coefficients: tuple[float, float, float] = (DEFAULT_A, DEFAULT_B, DEFAULT_C), + eps: float = EPS, + ns_steps: int = DEFAULT_NS_STEPS, + adjust_lr_fn: Optional[str] = None, + ) -> None: + if isinstance(lr, Tensor) and lr.numel() != 1: + raise ValueError("Tensor lr must be 1-element") + if not 0.0 <= lr: + raise ValueError(f"Learning rate should be >= 0 but is: {lr}") + if not 0.0 <= momentum: + raise ValueError(f"momentum should be >= 0 but is: {momentum}") + if not 0.0 <= weight_decay: + raise ValueError(f"weight decay should be >= 0 but is: {weight_decay}") + if adjust_lr_fn is not None and adjust_lr_fn not in [ + "original", + "match_rms_adamw", + ]: + raise ValueError( + f"Adjust learning rate function {adjust_lr_fn} is not supported" + ) + + defaults = { + "lr": lr, + "weight_decay": weight_decay, + "momentum": momentum, + "nesterov": nesterov, + "ns_coefficients": ns_coefficients, + "eps": eps, + "ns_steps": ns_steps, + "adjust_lr_fn": adjust_lr_fn, + } + super().__init__(params, defaults) + + for group in self.param_groups: + for p in group["params"]: + if p.ndim != 2: + raise ValueError( + f"Muon only supports 2D parameters whereas we found a parameter with size: {p.size()}" + ) + + def _init_group( + self, + group: MutableMapping, + params_with_grad: list[Tensor], + grads: list[Tensor], + muon_momentum_bufs: list[Tensor], + ): + for p in group["params"]: + if p.grad 
is None: + continue + + if torch.is_complex(p): + raise RuntimeError("Muon does not support complex parameters") + if p.grad.is_sparse: + raise RuntimeError("Muon does not support sparse gradients") + + params_with_grad.append(p) + grads.append(p.grad) + + state = self.state[p] + + if "momentum_buffer" not in state: + state["momentum_buffer"] = torch.zeros_like( + p.grad, memory_format=torch.preserve_format + ) + muon_momentum_bufs.append(state["momentum_buffer"]) + + return False # has_complex + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step.""" + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + lr = group["lr"] + weight_decay = group["weight_decay"] + momentum = group["momentum"] + + params_with_grad: list[Tensor] = [] + grads: list[Tensor] = [] + muon_momentum_bufs: list[Tensor] = [] + + has_complex = self._init_group( + group, + params_with_grad, + grads, + muon_momentum_bufs, + ) + + muon( + params_with_grad, + grads, + muon_momentum_bufs, + lr=lr, + weight_decay=weight_decay, + momentum=momentum, + nesterov=group["nesterov"], + ns_coefficients=group["ns_coefficients"], + eps=group["eps"], + ns_steps=group["ns_steps"], + adjust_lr_fn=group["adjust_lr_fn"], + has_complex=has_complex, + ) + return loss + + +Muon.__doc__ = ( + r"""Implements Muon algorithm. + + .. math:: + \begin{aligned} + &\rule{110mm}{0.4pt} \\ + &\textbf{input} : \gamma \text{ (lr)},\ \lambda \text{ (weight decay)},\ + \mu \text{ (momentum)},\ \textit{nesterov}\in\{True,False\},\\ + &\hspace{13mm}(a,b,c)\ \text{ (NS coefficients)},\ + \varepsilon \text{ (epsilon)},\ k \text{ (NS steps)},\ + \theta_0 \text{ (params)},\ f(\theta) \text{ (objective)} \\ + &\textbf{initialize} : B_0 \leftarrow 0 \text{ (momentum buffer)} \\[-1.ex] + &\rule{110mm}{0.4pt} \\ + &\textbf{for}\ t=1\ \textbf{to}\ \ldots\ \textbf{do} \\[0.25ex] + &\hspace{5mm} g_t \leftarrow \nabla_{\theta} f_t(\theta_{t-1}) \\[0.25ex] + &\hspace{5mm} B_t \leftarrow \mu B_{t-1} + g_t \\[0.25ex] + &\hspace{5mm} \widetilde{B}_t \leftarrow + \begin{cases} + g_t + \mu B_t, & \text{if nesterov}=True \\ + B_t, & \text{if nesterov}=False + \end{cases} \\[1.0ex] + &\hspace{5mm} O_t \leftarrow \mathrm{NS}^{(a,b,c)}_{k}\!\big(\widetilde{B}_t;\ \varepsilon\big) \\[0.5ex] + &\hspace{5mm} \theta_t \leftarrow \theta_{t-1} - \gamma\,\lambda\,\theta_{t-1} + \quad\text{(decoupled weight decay)} \\[0.25ex] + + &\hspace{5mm} \gamma \leftarrow \mathrm{AdjustLR}\!\big(\gamma;\ \mathrm{shape}\!\big(\theta_t \big) \big) \\[0.25ex] + &\hspace{5mm} \theta_t \leftarrow \theta_t - \gamma\, O_t \\ + &\rule{110mm}{0.4pt} \\[-1.ex] + &\mathbf{return}\ \theta_t \\[-1.ex] + &\rule{110mm}{0.4pt}s + \end{aligned} + + Here, :math:`\mathrm{NS}^{(a,b,c)}_{k}(\cdot;\varepsilon)` denotes :math:`k` iterations of the + Newton–Schulz orthogonalization operator parameterized by coefficients :math:`(a,b,c)` + with numerical stabilization :math:`\varepsilon`. + + The purpose for :math:`\mathrm{AdjustLR}\!\big(\gamma;\ \mathrm{shape}\!\big(\theta_t \big) \big)` + is to make the orthogonalized update have a consistent :math:`RMS` across rectangular matrices. + + Keller's original implementation scales the update by :math:`\sqrt{\max\!\left(1, \frac{A}{B}\right)}`, + where :math:`A` and :math:`B` are dimension of the matrix being optimized. + + Moonshot's implementation also focuses on matching :math:`RMS` of AdamW. 
The adjustment is computed as: + :math:`\gamma \leftarrow {0.2}\gamma\,\sqrt{\max\!\left({A}, {B}\right)}` + The method is adopted from `Muon is Scalable for LLM Training`_. Research + results show that with this adjustment Muon can directly reuse the learning rate + and weight decay tuned for AdamW. + + We provide two options for the learning rate adjustment: "original", which follows Keller's + implementation, and "match_rms_adamw", which refers to Moonshot's implementation. This gives users the + flexibility to choose between the two. If `adjust_lr_fn` is not specified, the default is "original". + + For further details regarding the algorithm we refer to `Muon: An optimizer for hidden layers in neural networks`_ + and `Muon is Scalable for LLM Training`_. + """ + + rf""" + Args: + {_params_doc}. Note that Muon is an optimizer for 2D parameters of neural network hidden layers. Other + parameters, such as bias, and embedding, should be optimized by a standard method such as AdamW. + lr (float, Tensor, optional): learning rate (default: 1e-3). + weight_decay (float, optional): weight decay (L2 penalty). (default: 0.1) + momentum (float, optional): momentum factor (default: 0.95) + nesterov (bool, optional): enables Nesterov momentum. Only applicable + when momentum is non-zero + ns_coefficients (tuple of three floats, optional): coefficients \(a,b,c\) for the + Newton–Schulz orthogonalization polynomial (default: ({DEFAULT_A}, {DEFAULT_B}, {DEFAULT_C})) + eps (float, optional): term added to the denominator for numerical stability. (default: {EPS}) + ns_steps (int, optional): number of Newton–Schulz iteration steps. (default: {DEFAULT_NS_STEPS}) + adjust_lr_fn (str, optional): function to adjust learning rate. One of "original" and "match_rms_adamw". + If not specified, we will default to use "original". (default: None) + + .. _Muon\: An optimizer for hidden layers in neural networks: + https://kellerjordan.github.io/posts/muon/ + .. _Muon is Scalable for LLM Training: + https://arxiv.org/pdf/2502.16982 + + """ +) + + +def _single_tensor_muon( + params: list[Tensor], + grads: list[Tensor], + muon_momentum_bufs: list[Tensor], + *, + lr: float, + weight_decay: float, + momentum: float, + nesterov: bool, + ns_coefficients: tuple[float, float, float], + ns_steps: int, + eps: float, + adjust_lr_fn: Optional[str], + has_complex: bool, +) -> None: + lr = _to_scalar(lr) + if has_complex: + raise ValueError("Complex parameters are not supported") + + for i, param in enumerate(params): + grad = grads[i] + if grad.ndim != 2: + raise ValueError("Param gradient must be a 2D matrix") + + buf = muon_momentum_bufs[i] + buf.lerp_(grad, 1 - momentum) + update = grad.lerp(buf, momentum) if nesterov else buf + + update = _zeropower_via_newtonschulz(update, ns_coefficients, ns_steps, eps) + + adjusted_lr = _adjust_lr(lr, adjust_lr_fn, param.shape) + + param.mul_(1 - lr * weight_decay) + param.add_(update, alpha=-adjusted_lr) + + +@_disable_dynamo_if_unsupported(single_tensor_fn=_single_tensor_muon) +def muon( + params: list[Tensor], + grads: list[Tensor], + muon_momentum_bufs: list[Tensor], + *, + foreach: Optional[bool] = None, + lr: float, + weight_decay: float, + momentum: float, + nesterov: bool, + ns_coefficients: tuple[float, float, float], + ns_steps: int, + eps: float, + adjust_lr_fn: Optional[str], + has_complex: bool, +): + r"""Functional API that performs Muon algorithm computation. + + See :class:`~torch.optim.Muon` for details. 
+ """ + if foreach is not None and foreach: + raise RuntimeError("Foreach is not supported for Muon yet") + + func = _single_tensor_muon + + func( + params, + grads, + muon_momentum_bufs, + lr=lr, + weight_decay=weight_decay, + momentum=momentum, + nesterov=nesterov, + ns_coefficients=ns_coefficients, + ns_steps=ns_steps, + eps=eps, + adjust_lr_fn=adjust_lr_fn, + has_complex=has_complex, + ) diff --git a/torch/optim/optimizer.py b/torch/optim/optimizer.py index 2dc95eb555574..28a41b7c714e3 100644 --- a/torch/optim/optimizer.py +++ b/torch/optim/optimizer.py @@ -998,16 +998,18 @@ def zero_grad(self, set_to_none: bool = True) -> None: r"""Reset the gradients of all optimized :class:`torch.Tensor` s. Args: - set_to_none (bool): instead of setting to zero, set the grads to None. + set_to_none (bool, optional): Instead of setting to zero, set the grads to None. Default: ``True`` + This will in general have lower memory footprint, and can modestly improve performance. However, it changes certain behaviors. For example: + 1. When the user tries to access a gradient and perform manual ops on it, - a None attribute or a Tensor full of 0s will behave differently. + a None attribute or a Tensor full of 0s will behave differently. 2. If the user requests ``zero_grad(set_to_none=True)`` followed by a backward pass, ``.grad``\ s - are guaranteed to be None for params that did not receive a gradient. + are guaranteed to be None for params that did not receive a gradient. 3. ``torch.optim`` optimizers have a different behavior if the gradient is 0 or None - (in one case it does the step with a gradient of 0 and in the other it skips - the step altogether). + (in one case it does the step with a gradient of 0 and in the other it skips + the step altogether). """ foreach = self.defaults.get("foreach", False) or self.defaults.get( "fused", False diff --git a/torch/testing/_internal/common_cuda.py b/torch/testing/_internal/common_cuda.py index 3175439628208..991caa9ecb074 100644 --- a/torch/testing/_internal/common_cuda.py +++ b/torch/testing/_internal/common_cuda.py @@ -66,6 +66,12 @@ def evaluate_platform_supports_flash_attention(): return not IS_WINDOWS and SM80OrLater return False +def evaluate_platform_supports_ck_sdpa(): + if TEST_WITH_ROCM: + return torch.backends.cuda.is_ck_sdpa_available() + else: + return False + def evaluate_platform_supports_efficient_attention(): if TEST_WITH_ROCM: arch_list = ["gfx90a", "gfx942", "gfx1100", "gfx1201", "gfx950"] @@ -91,6 +97,8 @@ def evaluate_platform_supports_cudnn_attention(): PLATFORM_SUPPORTS_BF16: bool = LazyVal(lambda: TEST_CUDA and SM80OrLater) +PLATFORM_SUPPORTS_CK_SDPA: bool = LazyVal(lambda: evaluate_platform_supports_ck_sdpa()) + def evaluate_platform_supports_fp8(): if torch.cuda.is_available(): if torch.version.hip: diff --git a/torch/testing/_internal/common_optimizers.py b/torch/testing/_internal/common_optimizers.py index 96bab4a084c4f..eb594bbe50155 100644 --- a/torch/testing/_internal/common_optimizers.py +++ b/torch/testing/_internal/common_optimizers.py @@ -20,6 +20,7 @@ AdamW, ASGD, LBFGS, + Muon, NAdam, Optimizer, RAdam, @@ -245,8 +246,9 @@ def test_wrapper(*args, **kwargs): # Helper function for generating error inputs for all optimizers, used below. def get_error_inputs_for_all_optims(device, dtype): if _get_device_type(device) == "cpu": - sample_param = Parameter(torch.randn(1, device=device, dtype=dtype)) - sample_param2 = Parameter(torch.randn(1, device=device, dtype=dtype)) + # Creating 2D parameters for compatibility with Muon. 
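[Editorial aside, not part of the patch.] Since torch/optim/_muon.py above only defines the optimizer, a minimal usage sketch may help: it shows the split the class docstring recommends, with the 2D weight matrices handed to Muon and the remaining 1D parameters left to AdamW. The toy model and the parameter split are hypothetical; only the constructor arguments come from the Muon signature added in this patch.

    import torch
    from torch import nn

    # Hypothetical two-layer model; Muon only accepts 2D parameters.
    model = nn.Sequential(nn.Linear(64, 128), nn.ReLU(), nn.Linear(128, 10))

    matrix_params = [p for p in model.parameters() if p.ndim == 2]
    other_params = [p for p in model.parameters() if p.ndim != 2]

    muon = torch.optim.Muon(matrix_params, lr=1e-3, momentum=0.95, nesterov=True)
    adamw = torch.optim.AdamW(other_params, lr=1e-3)

    x, y = torch.randn(32, 64), torch.randint(0, 10, (32,))
    nn.functional.cross_entropy(model(x), y).backward()
    muon.step()
    adamw.step()
    muon.zero_grad()
    adamw.zero_grad()

For a sense of the learning-rate adjustment described in the docstring: the first layer's (128, 64) weight scales the step by sqrt(max(1, 128/64)) ≈ 1.41 under the default "original" rule, versus 0.2 * sqrt(128) ≈ 2.26 under "match_rms_adamw".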
+ sample_param = Parameter(torch.randn(1, 1, device=device, dtype=dtype)) + sample_param2 = Parameter(torch.randn(1, 1, device=device, dtype=dtype)) return [ ErrorOptimizerInput( OptimizerInput( @@ -833,6 +835,81 @@ def optim_error_inputs_func_lbfgs(device, dtype): return error_inputs +def optim_inputs_func_muon(device, dtype=None): + return [ + OptimizerInput(params=None, kwargs={}, desc="default"), + OptimizerInput(params=None, kwargs={"lr": 0.01}, desc="non-default lr"), + OptimizerInput( + params=None, kwargs={"lr": torch.tensor(0.001)}, desc="Tensor lr" + ), + OptimizerInput( + params=None, + kwargs={"weight_decay": 0.2}, + desc="non-default weight_decay", + ), + OptimizerInput( + params=None, + kwargs={"momentum": 0.8}, + desc="non-default momentum", + ), + OptimizerInput( + params=None, + kwargs={"ns_steps": 6}, + desc="passing alternative ns_steps", + ), + OptimizerInput( + params=None, + kwargs={ + "ns_coefficients": (3.4, -4.7, 2.0), + }, + desc="passing alternative ns_coefficients", + ), + ] + + +def optim_error_inputs_func_muon(device, dtype): + error_inputs = get_error_inputs_for_all_optims(device, dtype) + complex_param = torch.rand(2, 3, device=device, dtype=torch.complex64) + complex_param.grad = torch.rand_like(complex_param) + non_2d_param = torch.rand(2, 3, 4, device=device, dtype=dtype) + non_2d_param.grad = torch.rand_like(non_2d_param) + param = torch.rand(2, 3, device=device, dtype=dtype) + param.grad = torch.rand_like(param) + error_inputs += [ + ErrorOptimizerInput( + OptimizerInput( + params=[non_2d_param], + kwargs=dict(), + desc="only support 2D parameters", + ), + error_type=ValueError, + error_regex="Muon only supports 2D parameters", + error_on=OptimizerErrorEnum.CONSTRUCTION_ERROR, + ), + ErrorOptimizerInput( + OptimizerInput( + params=[param], + kwargs={"adjust_lr_fn": "arbitrary"}, + desc="only support `original` and `match_rms_adamw`", + ), + error_type=ValueError, + error_regex="Adjust learning rate function arbitrary is not supported", + error_on=OptimizerErrorEnum.CONSTRUCTION_ERROR, + ), + ErrorOptimizerInput( + OptimizerInput( + params=[complex_param], + kwargs=dict(), + desc="does not support complex parameters", + ), + error_type=RuntimeError, + error_regex="Muon does not support complex parameters", + error_on=OptimizerErrorEnum.STEP_ERROR, + ), + ] + return error_inputs + + def optim_inputs_func_nadam(device, dtype=None): cuda_supported_configs = [ OptimizerInput(params=None, kwargs={"capturable": True}, desc="capturable"), @@ -1869,6 +1946,35 @@ def _get_optim_inputs_including_global_cliquey_kwargs( ), ), ), + OptimizerInfo( + Muon, + optim_inputs_func=optim_inputs_func_muon, + optim_error_inputs_func=optim_error_inputs_func_muon, + supported_impls=(), + not_og_supported_flags=(), + supports_complex=False, + skips=( + # Note on tolerances: + # test_correctness_Muon_use_closure_True_cuda_float32 + # Mismatched elements: 2 / 100 (2.0%) + # Greatest absolute difference: 0.0006124898791313171 at index (2, 1) (up to 0.0002 allowed) + # Greatest relative difference: 0.026825083419680595 at index (2, 6) (up to 0.01 allowed) + # This is due compile uses addmm for matmul in the orthogonalization function, + # creating a small numerical difference compared to the plain matmul op used in eager. 
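[Editorial aside, not part of the patch.] The tolerance note above refers to the Newton-Schulz orthogonalization added earlier in this patch. A small standalone check, with illustrative variable names, shows the property the _zeropower_via_newtonschulz docstring claims: the quintic iteration with the default coefficients pushes the singular values of a normalized random matrix into roughly the (0.5, 1.5) band rather than exactly to 1. The sketch stays in float32 for clarity, whereas the patch casts to bfloat16.

    import torch

    # Default quintic Newton-Schulz coefficients from torch/optim/_muon.py.
    a, b, c = 3.4445, -4.7750, 2.0315
    g = torch.randn(128, 256)            # wide matrix, so the Gram matrix stays 128x128
    x = g / g.norm().clamp(min=1e-7)     # Frobenius norm bounds the spectral norm by 1
    for _ in range(5):                   # DEFAULT_NS_STEPS
        gram = x @ x.T
        x = a * x + (b * gram + c * gram @ gram) @ x

    s = torch.linalg.svdvals(x)
    print(s.min().item(), s.max().item())  # roughly within (0.5, 1.5), not exactly 1.0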
+ DecorateInfo( + toleranceOverride( + { + torch.float: tol( + rtol=0.08, + atol=0.001, + ), + } + ), + "CompiledOptimizerParityTests", + "test_correctness", + ), + ), + ), OptimizerInfo( NAdam, optim_inputs_func=optim_inputs_func_nadam, diff --git a/torch/testing/_internal/triton_utils.py b/torch/testing/_internal/triton_utils.py index 40687995470b4..e9700097aa981 100644 --- a/torch/testing/_internal/triton_utils.py +++ b/torch/testing/_internal/triton_utils.py @@ -15,6 +15,59 @@ import triton from triton import language as tl + import torch + + def _get_strange_configs() -> list[triton.Config]: + if torch.version.hip: + configs = [ + triton.Config( + { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "matrix_instr_nonkdim": 16, + "waves_per_eu": 3, + "kpack": 2, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "matrix_instr_nonkdim": 16, + "waves_per_eu": 3, + "kpack": 2, + }, + num_stages=4, + num_warps=4, + ), + ] + else: + configs = [ + triton.Config( + { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 16, + "GROUP_SIZE_M": 4, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + ] + return configs + # Define here so that multiple tests can take advantage of it @triton.jit def add_kernel( @@ -786,28 +839,7 @@ def add_kernel_out_of_order_fn2( tl.store(out_ptr + offsets, output, mask=mask) @triton.autotune( - configs=[ - triton.Config( - { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 16, - "GROUP_SIZE_M": 4, - }, - num_stages=4, - num_warps=4, - ), - triton.Config( - { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 32, - "GROUP_SIZE_M": 8, - }, - num_stages=4, - num_warps=4, - ), - ], + configs=_get_strange_configs(), key=["M_ptr", "N", "K"], ) @triton.jit diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py index f3dfa4631cb3d..88d3026de9a17 100644 --- a/torch/utils/hipify/cuda_to_hip_mappings.py +++ b/torch/utils/hipify/cuda_to_hip_mappings.py @@ -8904,6 +8904,302 @@ API_PYTORCH, ), ), + ( + "cuda::CUDACachingAllocator::raw_alloc", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::raw_alloc", API_PYTORCH), + ), + ( + "CUDACachingAllocator::raw_alloc", + ("HIPCachingAllocatorMasqueradingAsCUDA::raw_alloc", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::raw_alloc_with_stream", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::raw_alloc_with_stream", API_PYTORCH), + ), + ( + "CUDACachingAllocator::raw_alloc_with_stream", + ("HIPCachingAllocatorMasqueradingAsCUDA::raw_alloc_with_stream", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::raw_delete", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::raw_delete", API_PYTORCH), + ), + ( + "CUDACachingAllocator::raw_delete", + ("HIPCachingAllocatorMasqueradingAsCUDA::raw_delete", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::init", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::init", API_PYTORCH), + ), + ( + "CUDACachingAllocator::init", + ("HIPCachingAllocatorMasqueradingAsCUDA::init", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::getMemoryFraction", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::getMemoryFraction", API_PYTORCH), + ), + ( + "CUDACachingAllocator::getMemoryFraction", + ("HIPCachingAllocatorMasqueradingAsCUDA::getMemoryFraction", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::setMemoryFraction", + 
("hip::HIPCachingAllocatorMasqueradingAsCUDA::setMemoryFraction", API_PYTORCH), + ), + ( + "CUDACachingAllocator::setMemoryFraction", + ("HIPCachingAllocatorMasqueradingAsCUDA::setMemoryFraction", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::emptyCache", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::emptyCache", API_PYTORCH), + ), + ( + "CUDACachingAllocator::emptyCache", + ("HIPCachingAllocatorMasqueradingAsCUDA::emptyCache", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::enable", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::enable", API_PYTORCH), + ), + ( + "CUDACachingAllocator::enable", + ("HIPCachingAllocatorMasqueradingAsCUDA::enable", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::isEnabled", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::isEnabled", API_PYTORCH), + ), + ( + "CUDACachingAllocator::isEnabled", + ("HIPCachingAllocatorMasqueradingAsCUDA::isEnabled", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::cacheInfo", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::cacheInfo", API_PYTORCH), + ), + ( + "CUDACachingAllocator::cacheInfo", + ("HIPCachingAllocatorMasqueradingAsCUDA::cacheInfo", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::getBaseAllocation", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::getBaseAllocation", API_PYTORCH), + ), + ( + "CUDACachingAllocator::getBaseAllocation", + ("HIPCachingAllocatorMasqueradingAsCUDA::getBaseAllocation", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::getDeviceStats", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::getDeviceStats", API_PYTORCH), + ), + ( + "CUDACachingAllocator::getDeviceStats", + ("HIPCachingAllocatorMasqueradingAsCUDA::getDeviceStats", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::resetAccumulatedStats", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::resetAccumulatedStats", API_PYTORCH), + ), + ( + "CUDACachingAllocator::resetAccumulatedStats", + ("HIPCachingAllocatorMasqueradingAsCUDA::resetAccumulatedStats", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::resetPeakStats", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::resetPeakStats", API_PYTORCH), + ), + ( + "CUDACachingAllocator::resetPeakStats", + ("HIPCachingAllocatorMasqueradingAsCUDA::resetPeakStats", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::snapshot", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::snapshot", API_PYTORCH), + ), + ( + "CUDACachingAllocator::snapshot", + ("HIPCachingAllocatorMasqueradingAsCUDA::snapshot", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::getCheckpointState", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::getCheckpointState", API_PYTORCH), + ), + ( + "CUDACachingAllocator::getCheckpointState", + ("HIPCachingAllocatorMasqueradingAsCUDA::getCheckpointState", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::setCheckpointState", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::setCheckpointState", API_PYTORCH), + ), + ( + "CUDACachingAllocator::setCheckpointState", + ("HIPCachingAllocatorMasqueradingAsCUDA::setCheckpointState", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::setCheckpointPoolState", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::setCheckpointPoolState", API_PYTORCH), + ), + ( + "CUDACachingAllocator::setCheckpointPoolState", + ("HIPCachingAllocatorMasqueradingAsCUDA::setCheckpointPoolState", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::beginAllocateToPool", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::beginAllocateToPool", API_PYTORCH), + ), + ( + "CUDACachingAllocator::beginAllocateToPool", + 
("HIPCachingAllocatorMasqueradingAsCUDA::beginAllocateToPool", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::endAllocateToPool", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::endAllocateToPool", API_PYTORCH), + ), + ( + "CUDACachingAllocator::endAllocateToPool", + ("HIPCachingAllocatorMasqueradingAsCUDA::endAllocateToPool", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::recordHistory", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::recordHistory", API_PYTORCH), + ), + ( + "CUDACachingAllocator::recordHistory", + ("HIPCachingAllocatorMasqueradingAsCUDA::recordHistory", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::recordAnnotation", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::recordAnnotation", API_PYTORCH), + ), + ( + "CUDACachingAllocator::recordAnnotation", + ("HIPCachingAllocatorMasqueradingAsCUDA::recordAnnotation", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::pushCompileContext", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::pushCompileContext", API_PYTORCH), + ), + ( + "CUDACachingAllocator::pushCompileContext", + ("HIPCachingAllocatorMasqueradingAsCUDA::pushCompileContext", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::popCompileContext", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::popCompileContext", API_PYTORCH), + ), + ( + "CUDACachingAllocator::popCompileContext", + ("HIPCachingAllocatorMasqueradingAsCUDA::popCompileContext", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::isHistoryEnabled", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::isHistoryEnabled", API_PYTORCH), + ), + ( + "CUDACachingAllocator::isHistoryEnabled", + ("HIPCachingAllocatorMasqueradingAsCUDA::isHistoryEnabled", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::checkPoolLiveAllocations", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::checkPoolLiveAllocations", API_PYTORCH), + ), + ( + "CUDACachingAllocator::checkPoolLiveAllocations", + ("HIPCachingAllocatorMasqueradingAsCUDA::checkPoolLiveAllocations", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::attachOutOfMemoryObserver", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::attachOutOfMemoryObserver", API_PYTORCH), + ), + ( + "CUDACachingAllocator::attachOutOfMemoryObserver", + ("HIPCachingAllocatorMasqueradingAsCUDA::attachOutOfMemoryObserver", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::attachAllocatorTraceTracker", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::attachAllocatorTraceTracker", API_PYTORCH), + ), + ( + "CUDACachingAllocator::attachAllocatorTraceTracker", + ("HIPCachingAllocatorMasqueradingAsCUDA::attachAllocatorTraceTracker", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::releasePool", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::releasePool", API_PYTORCH), + ), + ( + "CUDACachingAllocator::releasePool", + ("HIPCachingAllocatorMasqueradingAsCUDA::releasePool", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::createOrIncrefPool", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::createOrIncrefPool", API_PYTORCH), + ), + ( + "CUDACachingAllocator::createOrIncrefPool", + ("HIPCachingAllocatorMasqueradingAsCUDA::createOrIncrefPool", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::setUseOnOOM", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::setUseOnOOM", API_PYTORCH), + ), + ( + "CUDACachingAllocator::setUseOnOOM", + ("HIPCachingAllocatorMasqueradingAsCUDA::setUseOnOOM", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::getPoolUseCount", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::getPoolUseCount", API_PYTORCH), + ), + ( + 
"CUDACachingAllocator::getPoolUseCount", + ("HIPCachingAllocatorMasqueradingAsCUDA::getPoolUseCount", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::getIpcDevPtr", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::getIpcDevPtr", API_PYTORCH), + ), + ( + "CUDACachingAllocator::getIpcDevPtr", + ("HIPCachingAllocatorMasqueradingAsCUDA::getIpcDevPtr", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::shareIpcHandle", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::shareIpcHandle", API_PYTORCH), + ), + ( + "CUDACachingAllocator::shareIpcHandle", + ("HIPCachingAllocatorMasqueradingAsCUDA::shareIpcHandle", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::name", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::name", API_PYTORCH), + ), + ( + "CUDACachingAllocator::name", + ("HIPCachingAllocatorMasqueradingAsCUDA::name", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::memcpyAsync", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::memcpyAsync", API_PYTORCH), + ), + ( + "CUDACachingAllocator::memcpyAsync", + ("HIPCachingAllocatorMasqueradingAsCUDA::memcpyAsync", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::enablePeerAccess", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::enablePeerAccess", API_PYTORCH), + ), + ( + "CUDACachingAllocator::enablePeerAccess", + ("HIPCachingAllocatorMasqueradingAsCUDA::enablePeerAccess", API_PYTORCH), + ), ( "cuda::CUDAAllocator::recordStream", (