diff --git a/recipes_source/recipes/tuning_guide.py b/recipes_source/recipes/tuning_guide.py
new file mode 100644
index 00000000000..9d9726ae7e1
--- /dev/null
+++ b/recipes_source/recipes/tuning_guide.py
@@ -0,0 +1,370 @@
+"""
+Performance Tuning Guide
+*************************
+**Author**: `Szymon Migacz `_
+
+Performance Tuning Guide is a set of optimizations and best practices which can
+accelerate training and inference of deep learning models in PyTorch. The
+presented techniques can often be implemented by changing only a few lines of
+code and can be applied to a wide range of deep learning models across all
+domains.
+
+General optimizations
+---------------------
+"""
+
+import torch
+
+###############################################################################
+# Enable async data loading and augmentation
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# `torch.utils.data.DataLoader `_
+# supports asynchronous data loading and data augmentation in separate worker
+# subprocesses. The default setting for ``DataLoader`` is ``num_workers=0``,
+# which means that data loading is synchronous and done in the main process.
+# As a result, the main training process has to wait for the data to become
+# available before it can continue execution.
+#
+# Setting ``num_workers > 0`` enables asynchronous data loading and overlap
+# between training and data loading. ``num_workers`` should be tuned depending
+# on the workload, CPU, GPU, and location of the training data.
+#
+# ``DataLoader`` accepts a ``pin_memory`` argument, which defaults to
+# ``False``. When using a GPU it's better to set ``pin_memory=True``; this
+# instructs ``DataLoader`` to use pinned memory and enables faster and
+# asynchronous memory copies from the host to the GPU.
+
+###############################################################################
+# Disable gradient calculation for validation or inference
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# PyTorch saves intermediate buffers from all operations which involve tensors
+# that require gradients. Typically gradients aren't needed for validation or
+# inference. The
+# `torch.no_grad() `_
+# context manager can be applied to disable gradient calculation within a
+# specified block of code; this accelerates execution and reduces the amount
+# of required memory.
+# `torch.no_grad() `_
+# can also be used as a function decorator.
+
+###############################################################################
+# Disable bias for convolutions directly followed by a batch norm
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# `torch.nn.Conv2d() `_
+# has a ``bias`` parameter which defaults to ``True`` (the same is true for
+# `Conv1d `_
+# and
+# `Conv3d `_
+# ).
+#
+# If an ``nn.Conv2d`` layer is directly followed by an ``nn.BatchNorm2d``
+# layer, then the bias in the convolution is not needed; instead use
+# ``nn.Conv2d(..., bias=False, ...)``. Bias is not needed because in the first
+# step ``BatchNorm`` subtracts the mean, which effectively cancels out the
+# effect of the bias.
+#
+# This is also applicable to 1d and 3d convolutions as long as ``BatchNorm``
+# (or another normalization layer) normalizes on the same dimension as the
+# convolution's bias.
+#
+# Models available from `torchvision `_
+# already implement this optimization.
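+
+###############################################################################
+# As a minimal illustrative sketch (the channel sizes and kernel size below are
+# arbitrary example values), a convolution directly followed by a batch norm
+# can be defined with the bias disabled:
+
+import torch.nn as nn
+
+conv_bn = nn.Sequential(
+    nn.Conv2d(3, 64, kernel_size=3, padding=1, bias=False),  # bias omitted
+    nn.BatchNorm2d(64),  # the batch norm's affine shift takes over the role of the bias
+)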
+
+###############################################################################
+# Use parameter.grad = None instead of model.zero_grad() or optimizer.zero_grad()
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# Instead of calling:
+model.zero_grad()
+# or
+optimizer.zero_grad()
+
+###############################################################################
+# to zero out gradients, use the following method instead:
+
+for param in model.parameters():
+    param.grad = None
+
+###############################################################################
+# The second code snippet does not zero the memory of each individual
+# parameter, and the subsequent backward pass uses assignment instead of
+# addition to store gradients, which reduces the number of memory operations.
+#
+# Setting the gradient to ``None`` has slightly different numerical behavior
+# than setting it to zero; for more details refer to the
+# `documentation `_.
+#
+# Alternatively, starting from PyTorch 1.7, call
+# ``model.zero_grad(set_to_none=True)`` or
+# ``optimizer.zero_grad(set_to_none=True)``.
+
+###############################################################################
+# Fuse pointwise operations
+# ~~~~~~~~~~~~~~~~~~~~~~~~~
+# Pointwise operations (elementwise addition, multiplication, math functions
+# such as ``sin()``, ``cos()``, ``sigmoid()``, etc.) can be fused into a single
+# kernel to amortize memory access time and kernel launch time.
+#
+# `PyTorch JIT `_ can fuse kernels
+# automatically, although there could be additional fusion opportunities not
+# yet implemented in the compiler, and not all device types are supported
+# equally.
+#
+# Pointwise operations are memory-bound; for each operation PyTorch launches a
+# separate kernel. Each kernel loads data from memory, performs the computation
+# (this step is usually inexpensive) and stores the results back into memory.
+#
+# A fused operator launches only one kernel for multiple fused pointwise ops
+# and loads/stores data only once to memory. This makes JIT very useful for
+# activation functions, optimizers, custom RNN cells etc.
+#
+# In the simplest case fusion can be enabled by applying the
+# `torch.jit.script `_
+# decorator to the function definition, for example:
+
+@torch.jit.script
+def fused_gelu(x):
+    return x * 0.5 * (1.0 + torch.erf(x / 1.41421))
+
+###############################################################################
+# Refer to the
+# `TorchScript documentation `_
+# for more advanced use cases.
+
+###############################################################################
+# Enable channels_last memory format for computer vision models
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# PyTorch 1.5 introduced support for the ``channels_last`` memory format for
+# convolutional networks. This format is meant to be used in conjunction with
+# `AMP `_ to further accelerate
+# convolutional neural networks with
+# `Tensor Cores `_.
+#
+# Support for ``channels_last`` is experimental, but it's expected to work for
+# standard computer vision models (e.g. ResNet-50, SSD). To convert models to
+# the ``channels_last`` format follow the
+# `Channels Last Memory Format Tutorial `_.
+# The tutorial includes a section on
+# `converting existing models `_.
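+
+###############################################################################
+# As a minimal sketch (using torchvision's ResNet-50 as an example model and an
+# arbitrary input shape), both the model and its input are converted to
+# ``channels_last``; the benefit shows up on Tensor Core GPUs with AMP enabled:
+
+import torchvision
+
+model_cl = torchvision.models.resnet50()
+model_cl = model_cl.to(memory_format=torch.channels_last)  # convert the weights
+input_cl = torch.randn(2, 3, 224, 224).to(memory_format=torch.channels_last)
+output_cl = model_cl(input_cl)  # convolutions propagate the channels_last format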
+
+###############################################################################
+# Checkpoint intermediate buffers
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# Buffer checkpointing is a technique to mitigate the memory capacity burden of
+# model training. Instead of storing the inputs of all layers to compute
+# upstream gradients in backward propagation, it stores the inputs of a few
+# layers and recomputes the others during the backward pass. The reduced memory
+# requirements enable increasing the batch size, which can improve utilization.
+#
+# Checkpointing targets should be selected carefully. It is best not to store
+# large layer outputs that have a small re-computation cost. Example target
+# layers are activation functions (e.g. ``ReLU``, ``Sigmoid``, ``Tanh``),
+# up/down sampling and matrix-vector operations with small accumulation depth.
+#
+# PyTorch supports a native
+# `torch.utils.checkpoint `_
+# API to automatically perform checkpointing and recomputation.
+
+###############################################################################
+# Disable debugging APIs
+# ~~~~~~~~~~~~~~~~~~~~~~
+# Many PyTorch APIs are intended for debugging and should be disabled for
+# regular training runs:
+#
+# * anomaly detection:
+#   `torch.autograd.detect_anomaly `_
+#   or
+#   `torch.autograd.set_detect_anomaly(True) `_
+# * profiler related:
+#   `torch.autograd.profiler.emit_nvtx `_,
+#   `torch.autograd.profiler.profile `_
+# * autograd gradcheck:
+#   `torch.autograd.gradcheck `_
+#   or
+#   `torch.autograd.gradgradcheck `_
+#
+
+###############################################################################
+# GPU specific optimizations
+# --------------------------
+
+###############################################################################
+# Enable cuDNN auto-tuner
+# ~~~~~~~~~~~~~~~~~~~~~~~
+# `NVIDIA cuDNN `_ supports many algorithms
+# to compute a convolution. The autotuner runs a short benchmark and selects
+# the kernel with the best performance on the given hardware for a given input
+# size.
+#
+# For convolutional networks (other network types are currently not supported),
+# enable the cuDNN autotuner before launching the training loop by setting:

torch.backends.cudnn.benchmark = True
+###############################################################################
+#
+# * the auto-tuner decisions may be non-deterministic; a different algorithm
+#   may be selected for different runs. For more details see
+#   `PyTorch: Reproducibility `_
+# * in some rare cases, such as with highly variable input sizes, it's better
+#   to run convolutional networks with the autotuner disabled to avoid the
+#   overhead associated with algorithm selection for each input size.
+#

+###############################################################################
+# Avoid unnecessary CPU-GPU synchronization
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# Avoid unnecessary synchronizations, and let the CPU run ahead of the
+# accelerator as much as possible to make sure that the accelerator work queue
+# contains many operations.
+#
+# When possible, avoid operations which require synchronization, for example:
+#
+# * ``print(cuda_tensor)``
+# * ``cuda_tensor.item()``
+# * memory copies: ``tensor.cuda()``, ``cuda_tensor.cpu()`` and equivalent
+#   ``tensor.to(device)`` calls
+# * ``cuda_tensor.nonzero()``
+# * Python control flow which depends on results of operations performed on
+#   CUDA tensors, e.g. ``if (cuda_tensor != 0).all()``
+#
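+
+###############################################################################
+# As an illustrative sketch (guarded so it also runs on CPU-only machines, with
+# a placeholder loss), keep reductions on the GPU and avoid calling ``.item()``
+# inside the loop; transfer the value to the host only when it is needed:
+
+if torch.cuda.is_available():
+    device = torch.device('cuda')
+    running_loss = torch.zeros(1, device=device)  # accumulator stays on the GPU
+    for _ in range(10):
+        loss = torch.randn(1, device=device).abs()  # placeholder for a real loss
+        # running_loss += loss.item() would synchronize on every iteration
+        running_loss += loss.detach()  # stays asynchronous, no host transfer
+    print(running_loss.item())  # a single synchronization at the end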
+
+###############################################################################
+# Create tensors directly on the target device
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# Instead of calling ``torch.rand(size).cuda()`` to generate a random tensor,
+# produce the output directly on the target device:
+# ``torch.rand(size, device=torch.device('cuda'))``.
+#
+# This is applicable to all functions which create new tensors and accept a
+# ``device`` argument:
+# `torch.rand() `_,
+# `torch.zeros() `_,
+# `torch.full() `_
+# and similar.
+
+###############################################################################
+# Use mixed precision and AMP
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# Mixed precision leverages
+# `Tensor Cores `_
+# and offers up to 3x overall speedup on Volta and newer GPU architectures. To
+# use Tensor Cores, AMP should be enabled and matrix/tensor dimensions should
+# satisfy the requirements for calling kernels that use Tensor Cores.
+#
+# To use Tensor Cores:
+#
+# * set sizes to multiples of 8 (to map onto dimensions of Tensor Cores)
+#
+#   * see
+#     `Deep Learning Performance Documentation
+#     `_
+#     for more details and guidelines specific to layer type
+#   * if the layer size is derived from other parameters rather than fixed, it
+#     can still be explicitly padded, e.g. the vocabulary size in NLP models
+#
+# * enable AMP
+#
+#   * Introduction to Mixed Precision Training and AMP:
+#     `video `_,
+#     `slides `_
+#   * native PyTorch AMP is available starting from PyTorch 1.6:
+#     `documentation `_,
+#     `examples `_,
+#     `tutorial `_
+#

+###############################################################################
+# Pre-allocate memory in case of variable input length
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# Models for speech recognition or for NLP are often trained on input tensors
+# with variable sequence length. Variable length can be problematic for the
+# PyTorch caching allocator and can lead to reduced performance or to
+# unexpected out-of-memory errors. If a batch with a short sequence length is
+# followed by another batch with a longer sequence length, then PyTorch is
+# forced to release the intermediate buffers from the previous iteration and to
+# re-allocate new buffers. This process is time consuming and causes
+# fragmentation in the caching allocator, which may result in out-of-memory
+# errors.
+#
+# A typical solution is to implement pre-allocation. It consists of the
+# following steps (a sketch of the warm-up iteration follows this list):
+#
+# #. generate a (usually random) batch of inputs with maximum sequence length
+#    (either corresponding to the max length in the training dataset or to some
+#    predefined threshold)
+# #. execute a forward and a backward pass with the generated batch; do not
+#    execute an optimizer or a learning rate scheduler. This step pre-allocates
+#    buffers of maximum size, which can be reused in subsequent
+#    training iterations
+# #. zero out gradients
+# #. proceed to regular training
+#
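+
+###############################################################################
+# A minimal sketch of such a warm-up iteration, assuming a hypothetical
+# sequence model and arbitrary batch size, feature size and maximum length:
+
+max_seq_len, batch_size, num_features = 128, 32, 64
+warmup_model = nn.LSTM(num_features, 256)  # stand-in for the real model
+
+# step 1: random batch with the maximum sequence length
+warmup_batch = torch.randn(max_seq_len, batch_size, num_features)
+
+# step 2: forward and backward pass only, no optimizer step
+warmup_output, _ = warmup_model(warmup_batch)
+warmup_output.sum().backward()
+
+# step 3: zero out gradients before regular training starts
+for param in warmup_model.parameters():
+    param.grad = None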
+
+###############################################################################
+# Distributed optimizations
+# -------------------------
+
+###############################################################################
+# Use efficient data-parallel backend
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# PyTorch has two ways to implement data-parallel training:
+#
+# * `torch.nn.DataParallel `_
+# * `torch.nn.parallel.DistributedDataParallel `_
+#
+# ``DistributedDataParallel`` offers much better performance and scaling to
+# multiple GPUs. For more information refer to the
+# `relevant section of CUDA Best Practices `_
+# in the PyTorch documentation.
+
+###############################################################################
+# Skip unnecessary all-reduce if training with DistributedDataParallel and gradient accumulation
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# By default
+# `torch.nn.parallel.DistributedDataParallel `_
+# executes gradient all-reduce after every backward pass to compute the average
+# gradient over all workers participating in the training. If training uses
+# gradient accumulation over N steps, then all-reduce is not necessary after
+# every training step; it's only required to perform all-reduce after the last
+# call to backward, just before the execution of the optimizer.
+#
+# ``DistributedDataParallel`` provides the
+# `no_sync() `_
+# context manager which disables gradient all-reduce for a particular
+# iteration. ``no_sync()`` should be applied to the first ``N-1`` iterations of
+# gradient accumulation; the last iteration should follow the default execution
+# and perform the required gradient all-reduce.
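+
+###############################################################################
+# A minimal sketch of this pattern, assuming an already constructed
+# ``DistributedDataParallel`` model, an initialized process group and
+# hypothetical ``batches``, ``targets``, ``loss_fn`` and ``opt`` objects:
+
+import contextlib
+
+def accumulate_gradients(ddp_model, batches, targets, loss_fn, opt, accum_steps):
+    for i, (data, target) in enumerate(zip(batches, targets)):
+        # skip the all-reduce for all but the last accumulation step
+        is_last_step = (i + 1) % accum_steps == 0
+        context = contextlib.nullcontext() if is_last_step else ddp_model.no_sync()
+        with context:
+            loss = loss_fn(ddp_model(data), target) / accum_steps
+            loss.backward()
+        if is_last_step:
+            opt.step()  # gradients were all-reduced during the last backward
+            for param in ddp_model.parameters():
+                param.grad = None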
+
+###############################################################################
+# Match the order of layers in constructors and during the execution if using DistributedDataParallel(find_unused_parameters=True)
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# `torch.nn.parallel.DistributedDataParallel `_
+# with ``find_unused_parameters=True`` uses the order of layers and parameters
+# from model constructors to build buckets for ``DistributedDataParallel``
+# gradient all-reduce. ``DistributedDataParallel`` overlaps all-reduce with the
+# backward pass. All-reduce for a particular bucket is asynchronously triggered
+# only when all gradients for parameters in a given bucket are available.
+#
+# To maximize the amount of overlap, the order in model constructors should
+# roughly match the order during the execution. If the order doesn't match,
+# then all-reduce for the entire bucket waits for the gradient which is the
+# last to arrive. This may reduce the overlap between the backward pass and
+# all-reduce, and all-reduce may end up being exposed, which slows down the
+# training.
+#
+# ``DistributedDataParallel`` with ``find_unused_parameters=False`` (which is
+# the default setting) relies on automatic bucket formation based on the order
+# of operations encountered during the backward pass. With
+# ``find_unused_parameters=False`` it's not necessary to reorder layers or
+# parameters to achieve optimal performance.
+
+###############################################################################
+# Load-balance workload in a distributed setting
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# Load imbalance typically happens for models processing sequential data
+# (speech recognition, translation, language models etc.). If one device
+# receives a batch of data with a sequence length longer than the sequence
+# lengths for the remaining devices, then all devices wait for the worker which
+# finishes last. The backward pass functions as an implicit synchronization
+# point in a distributed setting with the
+# `DistributedDataParallel `_
+# backend.
+#
+# There are multiple ways to solve the load balancing problem. The core idea is
+# to distribute the workload over all workers as uniformly as possible within
+# each global batch. For example, Transformer solves the imbalance by forming
+# batches with an approximately constant number of tokens (and a variable
+# number of sequences in a batch); other models solve the imbalance by
+# bucketing samples with similar sequence lengths or even by sorting the
+# dataset by sequence length.
diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst
index f8986363092..a18febd7310
--- a/recipes_source/recipes_index.rst
+++ b/recipes_source/recipes_index.rst
@@ -176,6 +176,15 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu
    :link: ../recipes/recipes/amp_recipe.html
    :tags: Model-Optimization
 
+.. Performance
+
+.. customcarditem::
+   :header: Performance Tuning Guide
+   :card_description: Tips for achieving optimal performance.
+   :image: ../_static/img/thumbnails/cropped/profiler.png
+   :link: ../recipes/recipes/tuning_guide.html
+   :tags: Model-Optimization
+
 .. End of tutorial card section
 
 .. raw:: html
@@ -209,6 +218,7 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu
    /recipes/recipes/tensorboard_with_pytorch
    /recipes/recipes/dynamic_quantization
    /recipes/recipes/amp_recipe
+   /recipes/recipes/tuning_guide
    /recipes/torchscript_inference
    /recipes/deployment_with_flask
    /recipes/distributed_rpc_profiling