From 67780a3f6a5d50c759e089164a7c9d493e8d3802 Mon Sep 17 00:00:00 2001 From: Anay Dongre Date: Wed, 11 Jun 2025 15:31:43 -0700 Subject: [PATCH 1/5] DOC: Clarify DeviceStatsMonitor logged metrics (#20807) --- .../pytorch/callbacks/device_stats_monitor.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/lightning/pytorch/callbacks/device_stats_monitor.py b/src/lightning/pytorch/callbacks/device_stats_monitor.py index 6279dd13be4af..a8566bb4a7e76 100644 --- a/src/lightning/pytorch/callbacks/device_stats_monitor.py +++ b/src/lightning/pytorch/callbacks/device_stats_monitor.py @@ -34,6 +34,23 @@ class DeviceStatsMonitor(Callback): r"""Automatically monitors and logs device stats during training, validation and testing stage. ``DeviceStatsMonitor`` is a special callback as it requires a ``logger`` to passed as argument to the ``Trainer``. + Logged Metrics: + Device statistics are logged with keys prefixed as + ``DeviceStatsMonitor.{hook_name}/{base_metric_name}`` (e.g., + ``DeviceStatsMonitor.on_train_batch_start/cpu_percent``). + The source of these metrics depends on the ``cpu_stats`` flag + and the active accelerator. + + CPU (via ``psutil``): Logs ``cpu_percent``, ``cpu_vm_percent``, ``cpu_swap_percent``. + All are percentages (%). + CUDA GPU (via :func:`torch.cuda.memory_stats`): Logs detailed memory statistics from + PyTorch's allocator (e.g., ``allocated_bytes.all.current``, ``num_ooms``; all in Bytes). + GPU compute utilization is not logged by default. + Other Accelerators (e.g., TPU, MPS): Logs device-specific stats. + - TPU example: ``avg. free memory (MB)``. + - MPS example: ``mps.current_allocated_bytes``. + Observe logs or check accelerator documentation for details. + Args: cpu_stats: if ``None``, it will log CPU stats only if the accelerator is CPU. If ``True``, it will log CPU stats regardless of the accelerator. 
@@ -45,6 +62,7 @@ class DeviceStatsMonitor(Callback): ModuleNotFoundError: If ``psutil`` is not installed and CPU stats are monitored. + Example:: from lightning import Trainer From 4cb878428fed683d5f3677e8db43e0b5ca7ced28 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 17 Jun 2025 03:43:14 +0000 Subject: [PATCH 2/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/lightning/pytorch/callbacks/device_stats_monitor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lightning/pytorch/callbacks/device_stats_monitor.py b/src/lightning/pytorch/callbacks/device_stats_monitor.py index a8566bb4a7e76..0fb3876913d6e 100644 --- a/src/lightning/pytorch/callbacks/device_stats_monitor.py +++ b/src/lightning/pytorch/callbacks/device_stats_monitor.py @@ -38,8 +38,8 @@ class DeviceStatsMonitor(Callback): Device statistics are logged with keys prefixed as ``DeviceStatsMonitor.{hook_name}/{base_metric_name}`` (e.g., ``DeviceStatsMonitor.on_train_batch_start/cpu_percent``). - The source of these metrics depends on the ``cpu_stats`` flag - and the active accelerator. + The source of these metrics depends on the ``cpu_stats`` flag + and the active accelerator. CPU (via ``psutil``): Logs ``cpu_percent``, ``cpu_vm_percent``, ``cpu_swap_percent``. All are percentages (%). 
From 8dee98a9d298b8d8b542193122b20860bc5c4e39 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 18 Jun 2025 13:28:28 +0200 Subject: [PATCH 3/5] DOC: Reflow DeviceStatsMonitor logged-metrics docstring --- .../pytorch/callbacks/device_stats_monitor.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/lightning/pytorch/callbacks/device_stats_monitor.py b/src/lightning/pytorch/callbacks/device_stats_monitor.py index 0fb3876913d6e..41206c6755cf2 100644 --- a/src/lightning/pytorch/callbacks/device_stats_monitor.py +++ b/src/lightning/pytorch/callbacks/device_stats_monitor.py @@ -34,21 +34,20 @@ class DeviceStatsMonitor(Callback): r"""Automatically monitors and logs device stats during training, validation and testing stage. ``DeviceStatsMonitor`` is a special callback as it requires a ``logger`` to passed as argument to the ``Trainer``. - Logged Metrics: - Device statistics are logged with keys prefixed as - ``DeviceStatsMonitor.{hook_name}/{base_metric_name}`` (e.g., - ``DeviceStatsMonitor.on_train_batch_start/cpu_percent``). - The source of these metrics depends on the ``cpu_stats`` flag - and the active accelerator. + Device statistics are logged with keys prefixed as ``DeviceStatsMonitor.{hook_name}/{base_metric_name}`` (e.g., + ``DeviceStatsMonitor.on_train_batch_start/cpu_percent``). + The source of these metrics depends on the ``cpu_stats`` flag and the active accelerator. CPU (via ``psutil``): Logs ``cpu_percent``, ``cpu_vm_percent``, ``cpu_swap_percent``. All are percentages (%). CUDA GPU (via :func:`torch.cuda.memory_stats`): Logs detailed memory statistics from PyTorch's allocator (e.g., ``allocated_bytes.all.current``, ``num_ooms``; all in Bytes). GPU compute utilization is not logged by default. - Other Accelerators (e.g., TPU, MPS): Logs device-specific stats. + Other Accelerators (e.g., TPU, MPS): Logs device-specific stats: + - TPU example: ``avg. free memory (MB)``. - MPS example: ``mps.current_allocated_bytes``. 
+ Observe logs or check accelerator documentation for details. Args: From fd323edc91090cb633d1548a8043e15b5eb7bc8a Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 18 Jun 2025 13:28:28 +0200 Subject: [PATCH 4/5] DOC: List DeviceStatsMonitor metrics per accelerator (CPU, CUDA, TPU) --- .../pytorch/callbacks/device_stats_monitor.py | 71 +++++++++++++++---- 1 file changed, 59 insertions(+), 12 deletions(-) diff --git a/src/lightning/pytorch/callbacks/device_stats_monitor.py b/src/lightning/pytorch/callbacks/device_stats_monitor.py index 41206c6755cf2..7768be03eb68e 100644 --- a/src/lightning/pytorch/callbacks/device_stats_monitor.py +++ b/src/lightning/pytorch/callbacks/device_stats_monitor.py @@ -34,21 +34,68 @@ class DeviceStatsMonitor(Callback): r"""Automatically monitors and logs device stats during training, validation and testing stage. ``DeviceStatsMonitor`` is a special callback as it requires a ``logger`` to passed as argument to the ``Trainer``. - Device statistics are logged with keys prefixed as ``DeviceStatsMonitor.{hook_name}/{base_metric_name}`` (e.g., - ``DeviceStatsMonitor.on_train_batch_start/cpu_percent``). - The source of these metrics depends on the ``cpu_stats`` flag and the active accelerator. - CPU (via ``psutil``): Logs ``cpu_percent``, ``cpu_vm_percent``, ``cpu_swap_percent``. - All are percentages (%). - CUDA GPU (via :func:`torch.cuda.memory_stats`): Logs detailed memory statistics from - PyTorch's allocator (e.g., ``allocated_bytes.all.current``, ``num_ooms``; all in Bytes). - GPU compute utilization is not logged by default. - Other Accelerators (e.g., TPU, MPS): Logs device-specific stats: + **Logged Metrics** - - TPU example: ``avg. free memory (MB)``. - - MPS example: ``mps.current_allocated_bytes``. + Logs device statistics with keys prefixed as ``DeviceStatsMonitor.{hook_name}/{base_metric_name}``. - Observe logs or check accelerator documentation for details. + The actual metrics depend on the active accelerator and the ``cpu_stats`` flag. 
+ + **CPU (via `psutil`)** + + - ``cpu_percent``: System-wide CPU utilization (%) + - ``cpu_vm_percent``: System-wide virtual memory (RAM) utilization (%) + - ``cpu_swap_percent``: System-wide swap memory utilization (%) + + **CUDA GPU (via `torch.cuda.memory_stats`)** + + Logs memory statistics from PyTorch caching allocator (all in Bytes). + GPU compute utilization is not logged by default. + + *General Memory Usage:* + + - ``allocated_bytes.all.current``: Current allocated GPU memory + - ``allocated_bytes.all.peak``: Peak allocated GPU memory + - ``reserved_bytes.all.current``: Current reserved GPU memory (allocated + cached) + - ``reserved_bytes.all.peak``: Peak reserved GPU memory + - ``active_bytes.all.current``: Current GPU memory in active use + - ``active_bytes.all.peak``: Peak GPU memory in active use + - ``inactive_split_bytes.all.current``: Memory in inactive, splittable blocks + + *Allocator Pool Statistics* (for ``small_pool`` and ``large_pool``): + + - ``allocated_bytes.{pool_type}.current`` / ``.peak`` + - ``reserved_bytes.{pool_type}.current`` / ``.peak`` + - ``active_bytes.{pool_type}.current`` / ``.peak`` + + *Allocator Events:* + + - ``num_ooms``: Cumulative out-of-memory errors + - ``num_alloc_retries``: Number of allocation retries + - ``num_device_alloc``: Number of device allocations + - ``num_device_free``: Number of device deallocations + + For a full list of CUDA memory stats, see: + https://pytorch.org/docs/stable/generated/torch.cuda.memory_stats.html + + **TPU (via `torch_xla`)** + + *Memory Metrics* (per device, e.g. ``xla:0``): + + - ``memory.free.xla:0``: Free HBM memory (MB) + - ``memory.used.xla:0``: Used HBM memory (MB) + - ``memory.percent.xla:0``: Percentage of HBM memory used (%) + + *XLA Operation Counters:* + + - ``CachedCompile.xla`` + - ``CreateXlaTensor.xla`` + - ``DeviceDataCacheMiss.xla`` + - ``UncachedCompile.xla`` + - ``xla::add.xla``, ``xla::addmm.xla``, etc. 
+ + These counters can be retrieved using: + ``torch_xla.debug.metrics.counter_names()`` Args: cpu_stats: if ``None``, it will log CPU stats only if the accelerator is CPU. From df20e4bcc9dcf0df2fee99e8ad0e35fe70abbcf5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 26 Jun 2025 02:24:14 +0000 Subject: [PATCH 5/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/lightning/pytorch/callbacks/device_stats_monitor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/lightning/pytorch/callbacks/device_stats_monitor.py b/src/lightning/pytorch/callbacks/device_stats_monitor.py index 7768be03eb68e..9b03431be29e4 100644 --- a/src/lightning/pytorch/callbacks/device_stats_monitor.py +++ b/src/lightning/pytorch/callbacks/device_stats_monitor.py @@ -34,7 +34,6 @@ class DeviceStatsMonitor(Callback): r"""Automatically monitors and logs device stats during training, validation and testing stage. ``DeviceStatsMonitor`` is a special callback as it requires a ``logger`` to passed as argument to the ``Trainer``. - **Logged Metrics** Logs device statistics with keys prefixed as ``DeviceStatsMonitor.{hook_name}/{base_metric_name}``. @@ -49,7 +48,7 @@ class DeviceStatsMonitor(Callback): **CUDA GPU (via `torch.cuda.memory_stats`)** - Logs memory statistics from PyTorch caching allocator (all in Bytes). + Logs memory statistics from PyTorch caching allocator (all in Bytes). GPU compute utilization is not logged by default. *General Memory Usage:*