diff --git a/docs/source/conf.py b/docs/source/conf.py index b26108561..cee51966b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -82,6 +82,7 @@ def __call__(self, filename): "approximate_mode.py", "sampling.py", "parallel_decoding.py", + "performance_tips.py", "custom_frame_mappings.py", ] else: diff --git a/docs/source/index.rst b/docs/source/index.rst index 4311d2e0d..39e5948b9 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -84,6 +84,14 @@ Decoding How to sample regular and random clips from a video + .. grid-item-card:: :octicon:`file-code;1em` + Performance Tips + :img-top: _static/img/card-background.svg + :link: generated_examples/decoding/performance_tips.html + :link-type: url + + Tips for optimizing video decoding performance + Encoding ^^^^^^^^ diff --git a/examples/decoding/basic_cuda_example.py b/examples/decoding/basic_cuda_example.py index 8f82940c0..d97b13e0d 100644 --- a/examples/decoding/basic_cuda_example.py +++ b/examples/decoding/basic_cuda_example.py @@ -18,28 +18,6 @@ running the transform steps. Encoded packets are often much smaller than decoded frames so CUDA decoding also uses less PCI-e bandwidth. -When to and when not to use CUDA Decoding ------------------------------------------ - -CUDA Decoding can offer speed-up over CPU Decoding in a few scenarios: - -#. You are decoding a large resolution video -#. You are decoding a large batch of videos that's saturating the CPU -#. You want to do whole-image transforms like scaling or convolutions on the decoded tensors - after decoding -#. Your CPU is saturated and you want to free it up for other work - - -Here are situations where CUDA Decoding may not make sense: - -#. You want bit-exact results compared to CPU Decoding -#. You have small resolution videos and the PCI-e transfer latency is large -#. Your GPU is already busy and CPU is not - -It's best to experiment with CUDA Decoding to see if it improves your use-case. 
With -TorchCodec you can simply pass in a device parameter to the -:class:`~torchcodec.decoders.VideoDecoder` class to use CUDA Decoding. - Installing TorchCodec with CUDA Enabled --------------------------------------- @@ -113,6 +91,25 @@ print(frame.data.device) +# %% +# Checking for CPU Fallback +# ------------------------------------- +# +# In some cases, CUDA decoding may fall back to CPU decoding. This can happen +# when the video codec or format is not supported by the NVDEC hardware decoder, or when NVCUVID wasn't found. +# TorchCodec provides the :class:`~torchcodec.decoders.CpuFallbackStatus` class +# to help you detect when this fallback occurs. +# +# You can access the fallback status via the +# :attr:`~torchcodec.decoders.VideoDecoder.cpu_fallback` attribute: + +with set_cuda_backend("beta"): + decoder = VideoDecoder(video_file, device="cuda") + +# Check and print the CPU fallback status +print(decoder.cpu_fallback) + + # %% # Visualizing Frames # ------------------------------------- diff --git a/examples/decoding/performance_tips.py b/examples/decoding/performance_tips.py new file mode 100644 index 000000000..92aa0ac95 --- /dev/null +++ b/examples/decoding/performance_tips.py @@ -0,0 +1,213 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +.. meta:: + :description: Learn how to optimize TorchCodec video decoding performance with batch APIs, approximate seeking, multi-threading, and CUDA acceleration. + +============================================== +TorchCodec Performance Tips and Best Practices +============================================== + +This tutorial consolidates performance optimization techniques for video +decoding with TorchCodec. Learn when and how to apply various strategies +to increase performance. 
+""" + + +# %% +# Overview +# -------- +# +# When decoding videos with TorchCodec, several techniques can significantly +# improve performance depending on your use case. This guide covers: +# +# 1. **Batch APIs** - Decode multiple frames at once +# 2. **Approximate Mode & Keyframe Mappings** - Trade accuracy for speed +# 3. **Multi-threading** - Parallelize decoding across videos or chunks +# 4. **CUDA Acceleration** - Use GPU decoding for supported formats +# +# We'll explore each technique and when to use it. + +# %% +# 1. Use Batch APIs When Possible +# -------------------------------- +# +# If you need to decode multiple frames at once, the batch methods are faster than calling single-frame decoding methods multiple times. +# For example, :meth:`~torchcodec.decoders.VideoDecoder.get_frames_at` is faster than calling :meth:`~torchcodec.decoders.VideoDecoder.get_frame_at` multiple times. +# TorchCodec's batch APIs reduce overhead and can leverage internal optimizations. +# +# **Key Methods:** +# +# For index-based frame retrieval: +# +# - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_at` for specific indices +# - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_in_range` for ranges +# +# For timestamp-based frame retrieval: +# +# - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_played_at` for timestamps +# - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_played_in_range` for time ranges +# +# %% +# **When to use:** +# +# - Decoding multiple frames + +# %% +# .. note:: +# +# For complete examples with runnable code demonstrating batch decoding, +# iteration, and frame retrieval, see :ref:`sphx_glr_generated_examples_decoding_basic_example.py` + +# %% +# 2. Approximate Mode & Keyframe Mappings +# ---------------------------------------- +# +# By default, TorchCodec uses ``seek_mode="exact"``, which performs a :term:`scan` when +# you create the decoder to build an accurate internal index of frames. 
This +# ensures frame-accurate seeking but takes longer for decoder initialization, +# especially on long videos. + +# %% +# **Approximate Mode** +# ~~~~~~~~~~~~~~~~~~~~ +# +# Setting ``seek_mode="approximate"`` skips the initial :term:`scan` and relies on the +# video file's metadata headers. This dramatically speeds up +# :class:`~torchcodec.decoders.VideoDecoder` creation, particularly for long +# videos, but may result in slightly less accurate seeking in some cases. +# +# +# **Which mode should you use:** +# +# - If you care about exactness of frame seeking, use ``"exact"``. +# - If the video is long and you're only decoding a small number of frames, approximate mode should be faster. + +# %% +# **Custom Frame Mappings** +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# For advanced use cases, you can pre-compute a custom mapping between desired +# frame indices and actual keyframe locations. This allows you to speed up :class:`~torchcodec.decoders.VideoDecoder` +# instantiation while maintaining the frame seeking accuracy of ``seek_mode="exact"``. +# +# **When to use:** +# +# - Frame accuracy is critical, so you cannot use approximate mode +# - You can preprocess videos once and then decode them many times +# +# **Performance impact:** speeds up decoder instantiation, similarly to ``seek_mode="approximate"``. + +# %% +# .. note:: +# +# For complete benchmarks showing actual speedup numbers, accuracy comparisons, +# and implementation examples, see :ref:`sphx_glr_generated_examples_decoding_approximate_mode.py` +# and :ref:`sphx_glr_generated_examples_decoding_custom_frame_mappings.py` + +# %% +# 3. 
Multi-threading for Parallel Decoding +# ----------------------------------------- +# +# When decoding multiple videos or decoding a large number of frames from a single video, there are a few parallelization strategies to speed up the decoding process: +# +# - **FFmpeg-based parallelism** - Using FFmpeg's internal threading capabilities for intra-frame parallelism, where parallelization happens within individual frames rather than across frames. For that, use the ``num_ffmpeg_threads`` parameter of the :class:`~torchcodec.decoders.VideoDecoder`. +# - **Multiprocessing** - Distributing work across multiple processes +# - **Multithreading** - Using multiple threads within a single process +# +# You can use both multiprocessing and multithreading to decode multiple videos in parallel, or to decode a single long video in parallel by splitting it into chunks. + +# %% +# .. note:: +# +# For complete examples comparing +# sequential, ffmpeg-based parallelism, multi-process, and multi-threaded approaches, see +# :ref:`sphx_glr_generated_examples_decoding_parallel_decoding.py` + +# %% +# 4. CUDA Acceleration +# -------------------- +# +# TorchCodec supports GPU-accelerated decoding using NVIDIA's hardware decoder +# (NVDEC) on supported hardware. This keeps decoded tensors in GPU memory, +# avoiding expensive CPU-GPU transfers for downstream GPU operations. +# +# %% +# **Recommended: use the Beta Interface!** +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# We recommend you use the new "beta" CUDA interface which is significantly faster than the previous one, and supports the same features: +# +# .. 
code-block:: python +# +# with set_cuda_backend("beta"): +# decoder = VideoDecoder("file.mp4", device="cuda") +# +# %% +# **When to use:** +# +# - Decoding large resolution videos +# - Large batch of videos saturating the CPU +# +# **When NOT to use:** +# +# - You need bit-exact results with CPU decoding +# - Small resolution videos and the PCI-e transfer latency is large +# - GPU is already busy and CPU is idle +# +# **Performance impact:** CUDA decoding can significantly outperform CPU decoding, +# especially for high-resolution videos and when decoding a lot of frames. +# Actual speedup varies by hardware, resolution, and codec. + +# %% +# **Checking for CPU Fallback** +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# In some cases, CUDA decoding may silently fall back to CPU decoding when the +# video codec or format is not supported by NVDEC. You can detect this using +# the :attr:`~torchcodec.decoders.VideoDecoder.cpu_fallback` attribute: +# +# .. code-block:: python +# +# with set_cuda_backend("beta"): +# decoder = VideoDecoder("file.mp4", device="cuda") +# +# # Print detailed fallback status +# print(decoder.cpu_fallback) +# +# .. note:: +# +# The timing of when you can detect CPU fallback differs between backends: +# with the **FFmpeg backend**, you can only check fallback status after decoding at +# least one frame, because FFmpeg determines codec support lazily during decoding; +# with the **BETA backend**, you can check fallback status immediately after +# decoder creation, as the backend checks codec support upfront. +# +# For installation instructions, detailed examples, and visual comparisons +# between CPU and CUDA decoding, see :ref:`sphx_glr_generated_examples_decoding_basic_cuda_example.py` + +# %% +# Conclusion +# ---------- +# +# TorchCodec offers multiple performance optimization strategies, each suited to +# different scenarios. 
Use batch APIs for multi-frame decoding, approximate mode +# for faster initialization, parallel processing for high throughput, and CUDA +# acceleration to offload the CPU. +# +# The best results often come from combining techniques. Profile your specific +# use case and apply optimizations incrementally, using the benchmarks in the +# linked examples as a guide. +# +# For more information, see: +# +# - :ref:`sphx_glr_generated_examples_decoding_basic_example.py` - Basic decoding examples +# - :ref:`sphx_glr_generated_examples_decoding_approximate_mode.py` - Approximate mode benchmarks +# - :ref:`sphx_glr_generated_examples_decoding_custom_frame_mappings.py` - Custom frame mappings +# - :ref:`sphx_glr_generated_examples_decoding_parallel_decoding.py` - Parallel decoding strategies +# - :ref:`sphx_glr_generated_examples_decoding_basic_cuda_example.py` - CUDA acceleration guide +# - :class:`torchcodec.decoders.VideoDecoder` - Full API reference