
Commit f75c8ec

pdesupinski authored and markc-614 committed
Support NUMA Binding for Callable Entrypoints (pytorch#160163)
# Context

This is an extension of pytorch#149334.

# This PR

Add support for NUMA bindings with `Callable` entrypoints, such as `do_train`, rather than only `str` entrypoints such as `/usr/local/bin/python`.

Most notably, we utilize a hack in order to force `Process.start()` to use custom NUMA bindings for each subprocess. Please search for `HACK:` in the code for a description of the chosen implementation, and see pytorch#160006 for discussion of alternatives and why this is necessary.

Other changes:
* Remove the unnecessary `--preferred` option from all binding strategies. By default, Linux already allocates memory on the NUMA node local to the CPU which triggered the allocation. (See [MPOL_LOCAL](https://man7.org/linux/man-pages/man2/set_mempolicy.2.html).)
* Refactor so that the main API is `maybe_wrap_command_with_numa_bindings`, which computes bindings for a single rank at a time, rather than `maybe_wrap_with_numa_bindings`, which computed bindings for all ranks at once. This allows more code sharing between `Callable` and `str` entrypoints.

# Test Plan

## Automated

`$ pytest test/test_numa_binding.py`

## Manual

Using [this benchmark](https://gist.github.com/pdesupinski/bbe01ade455d86e989794f2c612e2d91), ran

```
$ PYTHONUNBUFFERED=1 LOGLEVEL=INFO perf stat -e ls_dmnd_fills_from_sys.dram_io_far,ls_dmnd_fills_from_sys.dram_io_near -- python -m torch.distributed.run --standalone --nproc-per-node=8 --numa-binding=node --run-path mlp_train.py 2>&1 | tee node_callable.txt &&
  PYTHONUNBUFFERED=1 LOGLEVEL=INFO perf stat -e ls_dmnd_fills_from_sys.dram_io_far,ls_dmnd_fills_from_sys.dram_io_near -- python -u -m torch.distributed.run --standalone --nproc-per-node=8 --run-path mlp_train.py 2>&1 | tee none_callable.txt
```

and observed:

* 6.6% remote memory accesses with `node` bindings
* 11.6% remote memory accesses without bindings

I also ran a similar check with `str` entrypoints, as before, just to be sure they still work.

NOTE: [`--run-path` triggers the code to be run inside a `Callable`.](https://github.com/pytorch/pytorch/blob/017259f9c65b6fad55fb9597d7077e2543eaae46/torch/distributed/run.py#L870)

Pull Request resolved: pytorch#160163
Approved by: https://github.com/d4l3k
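For orientation, here is a minimal sketch of the kind of per-rank wrapping described above, assuming `node` affinity. The `numa_node_of_gpu` helper and the exact `numactl` flags are illustrative assumptions, not the actual `torch.numa.binding` implementation:

```python
# Illustrative sketch only -- not the torch.numa.binding implementation.
# `numa_node_of_gpu` is a hypothetical lookup standing in for real topology
# discovery (e.g. reading the GPU's sysfs numa_node entry).


def numa_node_of_gpu(gpu_index: int) -> int:
    # Assume a machine with 4 GPUs per NUMA node, purely for illustration.
    return gpu_index // 4


def wrap_command_for_rank(command: tuple[str, ...], gpu_index: int) -> tuple[str, ...]:
    node = numa_node_of_gpu(gpu_index)
    # No --preferred/--membind is needed: with Linux's default MPOL_LOCAL
    # policy, memory is already allocated on the node of the CPU that touches it.
    return ("numactl", f"--cpunodebind={node}", *command)


print(wrap_command_for_rank(("/usr/local/bin/python", "train.py"), gpu_index=5))
# ('numactl', '--cpunodebind=1', '/usr/local/bin/python', 'train.py')
```

The key point, reflected in the removal of `--preferred`, is that only the CPU binding needs to be specified; memory placement then follows the local-allocation default.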
1 parent cc9f8d6 · commit f75c8ec

12 files changed: +424 −215 lines


docs/source/elastic/numa.rst

Lines changed: 2 additions & 2 deletions
@@ -3,8 +3,8 @@
 NUMA Binding Utilities
 ======================
 
-.. automodule:: torch.distributed.numa
+.. automodule:: torch.numa
    :members:
 
-.. automodule:: torch.distributed.numa.binding
+.. automodule:: torch.numa.binding
    :members:

test/test_numa_binding.py

Lines changed: 169 additions & 81 deletions
Large diffs are not rendered by default.

torch/distributed/elastic/agent/server/api.py

Lines changed: 1 addition & 8 deletions
@@ -27,7 +27,7 @@
 from torch.distributed.elastic.multiprocessing import ProcessFailure, SignalException
 from torch.distributed.elastic.rendezvous import RendezvousGracefulExitError
 from torch.distributed.elastic.utils.logging import get_logger
-from torch.distributed.numa.binding import NumaOptions
+from torch.numa.binding import NumaOptions
 
 
 __all__ = [
@@ -104,13 +104,6 @@ def __post_init__(self):
             self.entrypoint = self.fn
         assert self.entrypoint
 
-        if (
-            self.numa_options is not None
-            and not self.numa_options.should_fall_back_if_binding_fails
-            and not isinstance(self.entrypoint, str)
-        ):
-            raise ValueError("numa_options is only supported for str entrypoints.")
-
     def get_entrypoint_name(self):
         """Get the entry point name.

torch/distributed/elastic/multiprocessing/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -80,7 +80,7 @@ def trainer(a, b, c):
     to_map,
 )
 from torch.distributed.elastic.utils.logging import get_logger
-from torch.distributed.numa.binding import NumaOptions
+from torch.numa.binding import NumaOptions
 
 
 __all__ = [
@@ -227,6 +227,7 @@ def start_processes(
        log_line_prefixes=log_line_prefixes,
        start_method=start_method,
        logs_specs=logs_specs,
+       numa_options=numa_options,
    )
 
    try:

torch/distributed/elastic/multiprocessing/api.py

Lines changed: 7 additions & 5 deletions
@@ -37,7 +37,7 @@
     SubprocessHandler,
 )
 from torch.distributed.elastic.multiprocessing.tail_log import TailLog
-from torch.distributed.numa.binding import maybe_wrap_with_numa_bindings, NumaOptions
+from torch.numa.binding import NumaOptions
 
 
 IS_WINDOWS = sys.platform == "win32"
@@ -631,6 +631,7 @@ def __init__(
         start_method: str,
         logs_specs: LogsSpecs,
         log_line_prefixes: Optional[dict[int, str]] = None,
+        numa_options: Optional[NumaOptions] = None,
     ):
         super().__init__(
             name,
@@ -655,6 +656,8 @@ def __init__(
         # successfully. If any process died on event.wait() calling set() method will deadlock.
         self._worker_finished_event = mp.get_context(self.start_method).Event()
 
+        self._numa_options: Optional[NumaOptions] = numa_options
+
     def _start(self):
         if self._pc:
             raise ValueError(
@@ -676,6 +679,7 @@ def _start(self):
             join=False,
             daemon=False,
             start_method=self.start_method,
+            numa_options=self._numa_options,
         )
 
     def _is_done(self) -> bool:
@@ -814,10 +818,6 @@ def __init__(
         log_line_prefixes: Optional[dict[int, str]] = None,
         numa_options: Optional[NumaOptions] = None,
     ):
-        entrypoint, args = maybe_wrap_with_numa_bindings(
-            entrypoint=entrypoint, local_rank_to_args=args, numa_options=numa_options
-        )
-
         super().__init__(
             name,
             entrypoint,
@@ -831,6 +831,7 @@ def __init__(
         self._running_local_ranks: set[int] = set(range(self.nprocs))
         self._failures: dict[int, ProcessFailure] = {}
         self.subprocess_handlers: dict[int, SubprocessHandler] = {}
+        self._numa_options: Optional[NumaOptions] = numa_options
 
     def _start(self):
         if self.subprocess_handlers:
@@ -845,6 +846,7 @@ def _start(self):
                 stdout=self.stdouts[local_rank],
                 stderr=self.stderrs[local_rank],
                 local_rank_id=local_rank,
+                numa_options=self._numa_options,
             )
             for local_rank in range(self.nprocs)
         }

torch/distributed/elastic/multiprocessing/subprocess_handler/handlers.py

Lines changed: 4 additions & 0 deletions
@@ -3,10 +3,12 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+from typing import Optional
 
 from torch.distributed.elastic.multiprocessing.subprocess_handler.subprocess_handler import (
     SubprocessHandler,
 )
+from torch.numa.binding import NumaOptions
 
 
 __all__ = ["get_subprocess_handler"]
@@ -19,6 +21,7 @@ def get_subprocess_handler(
     stdout: str,
     stderr: str,
     local_rank_id: int,
+    numa_options: Optional[NumaOptions] = None,
 ) -> SubprocessHandler:
     return SubprocessHandler(
         entrypoint=entrypoint,
@@ -27,4 +30,5 @@ def get_subprocess_handler(
         stdout=stdout,
         stderr=stderr,
         local_rank_id=local_rank_id,
+        numa_options=numa_options,
     )

torch/distributed/elastic/multiprocessing/subprocess_handler/subprocess_handler.py

Lines changed: 12 additions & 0 deletions
@@ -11,6 +11,8 @@
 from subprocess import Popen
 from typing import Any, Optional
 
+from torch.numa.binding import maybe_wrap_command_with_numa_bindings, NumaOptions
+
 
 __all__ = ["SubprocessHandler"]
 
@@ -39,6 +41,7 @@ def __init__(
         stdout: Optional[str],
         stderr: Optional[str],
         local_rank_id: int,
+        numa_options: Optional[NumaOptions],
     ):
         self._stdout = open(stdout, "w") if stdout else None
         self._stderr = open(stderr, "w") if stderr else None
@@ -47,6 +50,15 @@ def __init__(
         env_vars.update(env)
 
         args_str = (entrypoint, *[str(e) for e in args])
+        args_str = (
+            maybe_wrap_command_with_numa_bindings(
+                command_args=args_str,
+                gpu_index=local_rank_id,
+                numa_options=numa_options,
+            )
+            or args_str
+        )
+
         self.local_rank_id = local_rank_id
         self.proc: Popen = self._popen(args_str, env_vars)
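As a usage sketch for the `str`-entrypoint path above: the `... or args_str` fallback implies that the wrapper returns `None` when no bindings apply and otherwise returns a new argument tuple. The exact `numactl` prefix below is an assumption for illustration, not the helper's documented output:

```python
# Assumed before/after shape of args_str in SubprocessHandler.__init__ (sketch).
args_str = ("/usr/local/bin/python", "-u", "train.py")

# With numa_options=None the wrapper is assumed to return None, so the
# original tuple is kept unchanged by `... or args_str`.
unwrapped = None or args_str
assert unwrapped == ("/usr/local/bin/python", "-u", "train.py")

# With bindings enabled and local_rank_id == 0, the tuple handed to Popen
# would look roughly like this (prefix illustrative):
wrapped = ("numactl", "--cpunodebind=0", "/usr/local/bin/python", "-u", "train.py")
```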

torch/distributed/launcher/api.py

Lines changed: 8 additions & 2 deletions
@@ -26,7 +26,7 @@
 from torch.distributed.elastic.rendezvous import RendezvousParameters
 from torch.distributed.elastic.rendezvous.utils import parse_rendezvous_endpoint
 from torch.distributed.elastic.utils.logging import get_logger
-from torch.distributed.numa.binding import NumaOptions
+from torch.numa.binding import NumaOptions
 
 
 __all__ = ["LaunchConfig", "elastic_launch", "launch_agent"]
@@ -107,7 +107,13 @@ def __post_init__(self):
         if self.logs_specs is None:
             self.logs_specs = DefaultLogsSpecs()
 
-        if self.numa_options is None and torch.cuda.is_available():
+        if (
+            self.numa_options is None
+            # NOTE: This filter isn't relevant for str entrypoints,
+            # but it's the default anyway.
+            and self.start_method == "spawn"
+            and torch.cuda.is_available()
+        ):
             self.numa_options = get_default_numa_options()
             logger.info("Using default numa options = %r", self.numa_options)
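For clarity, the condition above means default bindings are auto-enabled only when the caller passed no `numa_options`, the start method is `spawn` (the `Callable` path), and CUDA is available. A minimal restatement of just that rule, assuming nothing beyond stock `torch`:

```python
# Simplified restatement of the default-enable rule in LaunchConfig.__post_init__.
import torch


def should_apply_default_numa_options(numa_options, start_method: str) -> bool:
    # Auto-enable only when nothing was requested explicitly, the spawn
    # (Callable) path is in use, and GPUs are actually present.
    return (
        numa_options is None
        and start_method == "spawn"
        and torch.cuda.is_available()
    )
```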

torch/distributed/run.py

Lines changed: 1 addition & 1 deletion
@@ -382,7 +382,7 @@ def main():
 from torch.distributed.elastic.utils import macros
 from torch.distributed.elastic.utils.logging import get_logger
 from torch.distributed.launcher.api import elastic_launch, LaunchConfig
-from torch.distributed.numa.binding import (
+from torch.numa.binding import (
     AffinityMode as _AffinityMode,  # Signify as private with _
     NumaOptions as _NumaOptions,
 )

torch/multiprocessing/spawn.py

Lines changed: 52 additions & 6 deletions
@@ -2,6 +2,7 @@
 import logging
 import multiprocessing
 import multiprocessing.connection
+import multiprocessing.spawn as mp_spawn
 import os
 import pickle
 import signal
@@ -12,6 +13,11 @@
 from concurrent.futures import as_completed, ThreadPoolExecutor
 from typing import Optional
 
+from torch.numa.binding import (
+    maybe_get_temporary_python_executable_with_numa_bindings,
+    NumaOptions,
+)
+
 from . import _prctl_pr_set_pdeathsig  # type: ignore[attr-defined]
 
 
@@ -236,6 +242,7 @@ def start_processes(
    join=True,
    daemon=False,
    start_method="spawn",
+   numa_options: Optional[NumaOptions] = None,
 ):
    # To speed up performance in certain cases (see https://github.com/pytorch/pytorch/issues/133010),
    # this func will start processes in parallel if start_method is 'forkserver'.
@@ -251,11 +258,43 @@ def start_processes(
        # Set env var TORCH_MP_PARALLEL_START to 0 to disable parallel start
        start_parallel = False
 
+   if numa_options is not None and start_method != "spawn":
+       raise ValueError("NUMA binding is only compatible with spawn")
+
+   if numa_options is not None and start_parallel:
+       raise ValueError("NUMA binding is not compatible with parallel start")
+
    mp = multiprocessing.get_context(start_method)
    error_files = [None] * nprocs
    processes = [None] * nprocs
+   original_executable = mp_spawn.get_executable()
 
    def start_process(i):
+       # HACK: We want to force Process.start() to kick off the subprocess
+       # using a custom numactl command per rank. However, the API exposed
+       # by multiprocessing only allows us to override the executable for
+       # the entire context, and only with a single str rather than a tuple.
+       # Furthermore, there is no API for passing additional options, e.g.
+       # to make LOCAL_RANK available to the executable.
+       #
+       # In order to get around these limitations, we pre-compute
+       # the appropriate command containing NUMA bindings and store it in a
+       # temporary executable which passes Python args on to the original
+       # executable. Then, we call set_executable before and after each
+       # Process.start() call.
+       #
+       # This assumes that, under the hood, Process.start() for rank n
+       # will not call get_executable after start_process for rank n+1
+       # calls set_executable again. We guarantee this by
+       # raising an exception if `start_parallel`, above. (Not clear
+       # if there would be a race condition otherwise, but we want to be safe.)
+       temporary_executable_path = (
+           maybe_get_temporary_python_executable_with_numa_bindings(
+               python_executable_path=original_executable,
+               gpu_index=i,
+               numa_options=numa_options,
+           )
+       )
        # Each process is assigned a file to write tracebacks to. We
        # use the file being non-empty to indicate an exception
        # occurred (vs an expected shutdown). Note: this previously
@@ -267,12 +306,19 @@ def start_process(i):
        )
        tf.close()
        os.unlink(tf.name)
-       process = mp.Process(
-           target=_wrap,
-           args=(fn, i, args, tf.name),
-           daemon=daemon,
-       )
-       process.start()
+
+       try:
+           if temporary_executable_path is not None:
+               mp.set_executable(temporary_executable_path)
+           process = mp.Process(
+               target=_wrap,
+               args=(fn, i, args, tf.name),
+               daemon=daemon,
+           )
+           process.start()
+       finally:
+           if temporary_executable_path is not None:
+               mp.set_executable(original_executable)
        return i, process, tf.name
 
    if not start_parallel:
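To make the `HACK` above concrete, here is a self-contained sketch of the temporary-executable trick for a single rank. The shell-shim approach, its contents, and the helper names below are assumptions for illustration; the real logic lives in `torch.numa.binding.maybe_get_temporary_python_executable_with_numa_bindings`:

```python
# Illustrative sketch of the set_executable() dance for one rank.
import multiprocessing.spawn as mp_spawn
import os
import stat
import tempfile


def make_numa_shim(python_executable_path: str, numa_node: int) -> str:
    # Write a tiny executable shell shim that forwards all Python args to
    # the original interpreter, prefixed with a per-rank numactl binding.
    fd, path = tempfile.mkstemp(prefix="numa_shim_", suffix=".sh")
    with os.fdopen(fd, "w") as f:
        f.write(
            "#!/bin/sh\n"
            f'exec numactl --cpunodebind={numa_node} "{python_executable_path}" "$@"\n'
        )
    os.chmod(path, os.stat(path).st_mode | stat.S_IEXEC)
    return path


def start_rank(ctx, fn, rank: int, numa_node: int):
    # ctx is a spawn context, e.g. multiprocessing.get_context("spawn").
    original = mp_spawn.get_executable()
    shim = make_numa_shim(original, numa_node)
    try:
        # Point the spawn machinery at the shim just for this Process.start().
        ctx.set_executable(shim)
        process = ctx.Process(target=fn, args=(rank,))
        process.start()
    finally:
        # Restore immediately so later ranks get their own shim and
        # unrelated callers see the original interpreter.
        ctx.set_executable(original)
    return process
```

In the actual change, the helper returns `None` when `numa_options` is `None`, so `set_executable` is never touched and the original interpreter is used; the parallel-start path is rejected up front precisely because this swap is not safe to interleave across ranks.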
