From 42c044dc4d4c04f5ca6a561c4193f9c4f260fb5e Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Wed, 19 Aug 2020 13:44:38 -0600 Subject: [PATCH 01/22] fdsa --- index.rst | 1 + .../mixed_precision_tutorial.py | 53 +++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 intermediate_source/mixed_precision_tutorial.py diff --git a/index.rst b/index.rst index a5ad877b0f4..98d8d275503 100644 --- a/index.rst +++ b/index.rst @@ -500,6 +500,7 @@ Additional Resources :hidden: :caption: Frontend APIs + intermediate/mixed_precision_tutorial intermediate/named_tensor_tutorial intermediate/memory_format_tutorial advanced/cpp_frontend diff --git a/intermediate_source/mixed_precision_tutorial.py b/intermediate_source/mixed_precision_tutorial.py new file mode 100644 index 00000000000..8090cb3dfd0 --- /dev/null +++ b/intermediate_source/mixed_precision_tutorial.py @@ -0,0 +1,53 @@ +# -*- coding: utf-8 -*- +""" +Automatic Mixed Precision in PyTorch +******************************************************* +**Author**: `Michael Carilli `_ + +``torch.cuda.amp`` provides convenience methods for mixed precision, +where some operations use the ``torch.float32`` (``float``) datatype and other operations +use ``torch.float16`` (``half``). Some ops, like linear layers and convolutions, +are much faster in ``float16``. Other ops, like reductions, often require the dynamic +range of ``float32``. Mixed precision tries to match each op to its appropriate datatype. +which can reduce your network's runtime and memory footprint. + +Ordinarily, "automatic mixed precision training" uses :class:`torch.cuda.amp.autocast` and +:class:`torch.cuda.amp.GradScaler` together. +Here we'll walk through adding ``autocast`` and ``GradScaler`` to a toy network. +First we'll cover typical use, then describe more advanced cases. + +.. contents:: :local: +""" + +###################################################################### +# Without torch.cuda.amp, the following simple network executes all +# ops in default precision (torch.float32): + +import torch + +###################################################################### +# Adding autocast +# --------------- +# + + +###################################################################### +# Adding GradScaler +# ----------------- +# + + + + + + + +###################################################################### +# Advanced topics +# --------------- +# + + + +# +# know by creating `an issue `_. From 1122d9b9021da7f898b32cf619f28b046e61662b Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Tue, 25 Aug 2020 11:14:56 -0600 Subject: [PATCH 02/22] Tutorial runs --- .../mixed_precision_tutorial.py | 182 +++++++++++++++++- 1 file changed, 174 insertions(+), 8 deletions(-) diff --git a/intermediate_source/mixed_precision_tutorial.py b/intermediate_source/mixed_precision_tutorial.py index 8090cb3dfd0..54b6c89e528 100644 --- a/intermediate_source/mixed_precision_tutorial.py +++ b/intermediate_source/mixed_precision_tutorial.py @@ -4,50 +4,216 @@ ******************************************************* **Author**: `Michael Carilli `_ -``torch.cuda.amp`` provides convenience methods for mixed precision, +`torch.cuda.amp `_ provides convenience methods for mixed precision, where some operations use the ``torch.float32`` (``float``) datatype and other operations use ``torch.float16`` (``half``). Some ops, like linear layers and convolutions, are much faster in ``float16``. Other ops, like reductions, often require the dynamic range of ``float32``. 
Mixed precision tries to match each op to its appropriate datatype. which can reduce your network's runtime and memory footprint. -Ordinarily, "automatic mixed precision training" uses :class:`torch.cuda.amp.autocast` and -:class:`torch.cuda.amp.GradScaler` together. +Ordinarily, "automatic mixed precision training" uses `torch.cuda.amp.autocast `_ and +`torch.cuda.amp.GradScaler `_ together. Here we'll walk through adding ``autocast`` and ``GradScaler`` to a toy network. First we'll cover typical use, then describe more advanced cases. .. contents:: :local: """ +import time, gc + +_start_time = None + +def start_timer(): + global _start_time + gc.collect() + torch.cuda.empty_cache() + torch.cuda.reset_max_memory_allocated() + torch.cuda.synchronize() + _start_time = time.time() + +def end_timer_and_print(local_msg): + torch.cuda.synchronize() + print(local_msg) + print("Total execution time = {:.3f} sec".format(time.time() - _start_time)) + print("Max memory used by tensors = {} bytes".format(torch.cuda.max_memory_allocated())) + +def make_model(in_size, out_size, num_layers): + layers = [] + for _ in range(num_layers - 1): + layers.append(torch.nn.Linear(in_size, in_size)) + layers.append(torch.nn.ReLU()) + layers.append(torch.nn.Linear(in_size, out_size)) + return torch.nn.Sequential(*tuple(layers)).cuda() + ###################################################################### -# Without torch.cuda.amp, the following simple network executes all +# Without torch.cuda.amp, the following simple "network" executes all # ops in default precision (torch.float32): import torch +# batch_size, in_size, out_size, and num_layers are chosen to be large enough to saturate the GPU. +# Typically, mixed precision provides the greatest speedup when GPU is working hard. +# Small networks may be CPU bound, in which case mixed precision won't improve performance. +# Sizes are also chosen such that the linear layers' participating dimensions are multiples of 8, +# to permit Tensor Core usage on Tensor Core-capable GPUs. +# See :ref:`Troubleshooting `. +# Exercise: Vary participating sizes and see how the mixed precision speedup changes. +batch_size = 256 # Try, for example, 32, 33 +in_size = 4096 +out_size = 4096 +num_layers = 6 +num_batches = 128 +epochs = 3 + +data = [torch.randn(batch_size, in_size, device="cuda") for _ in range(num_batches)] +targets = [torch.randn(batch_size, out_size, device="cuda") for _ in range(num_batches)] +loss_fn = torch.nn.MSELoss().cuda() + +net = make_model(in_size, out_size, num_layers) +opt = torch.optim.SGD(net.parameters(), lr=0.001) + +start_timer() +for epoch in range(epochs): + for input, target in zip(data, targets): + output = net(input) + loss = loss_fn(output, target) + loss.backward() + opt.step() + opt.zero_grad() # set_to_none=True here can modestly improve performance +end_timer_and_print("With default precision:") + ###################################################################### # Adding autocast # --------------- # - +for epoch in range(epochs): + for input, target in zip(data, targets): + # Runs the forward pass under autocast + with torch.cuda.amp.autocast(): + output = net(input) + # Linear layers with ``float32`` inputs `autocast to float16 `_ + assert output.dtype is torch.float16 + + loss = loss_fn(output, target) + # ``mse_loss`` layers with ``float16`` inputs `autocast to float32 `_ + assert loss.dtype is torch.float32 + + # Exits autocast before backward(). + # Backward passes under autocast are not recommended. 
+ # Backward ops run in the same dtype autocast chose for corresponding forward ops. + loss.backward() + opt.step() + opt.zero_grad() # set_to_none=True here can modestly improve performance ###################################################################### # Adding GradScaler # ----------------- -# +# +# See `Gradient Scaling `_ +# for a full explanation of each step. +# Constructs scaler with default args, which are effective for most networks. +# If your network fails to converge with default GradScaler args, please file an issue. +scaler = torch.cuda.amp.GradScaler() +for epoch in range(epochs): + for input, target in zip(data, targets): + with torch.cuda.amp.autocast(): + output = net(input) + loss = loss_fn(output, target) + # Scales loss. Calls backward() on scaled loss to create scaled gradients. + scaler.scale(loss).backward() + # scaler.step() first unscales the gradients of the optimizer's assigned params. + # If these gradients do not contain infs or NaNs, optimizer.step() is then called, + # otherwise, optimizer.step() is skipped. + scaler.step(opt) + # Updates the scale for next iteration. + scaler.update() + opt.zero_grad() ###################################################################### -# Advanced topics +# All together +# ------------ + +net = make_model(in_size, out_size, num_layers) +opt = torch.optim.SGD(net.parameters(), lr=0.001) +scaler = torch.cuda.amp.GradScaler() + +start_timer() +for epoch in range(epochs): + for input, target in zip(data, targets): + with torch.cuda.amp.autocast(): + output = net(input) + loss = loss_fn(output, target) + scaler.scale(loss).backward() + scaler.step(opt) + scaler.update() + opt.zero_grad() +end_timer_and_print("With mixed precision:") + + +###################################################################### +# Inspecting/modifying gradients (e.g., gradient clipping) # --------------- # +# All gradients produced by ``scaler.scale(loss).backward()`` are scaled. If you wish to modify or inspect +# the parameters' ``.grad`` attributes between ``backward()`` and ``scaler.step(optimizer)``, you should +# unscale them first using `scaler.unscale_(optimizer)`. +for epoch in range(epochs): + for input, target in zip(data, targets): + with torch.cuda.amp.autocast(): + output = net(input) + loss = loss_fn(output, target) + scaler.scale(loss).backward() + # Unscales the gradients of optimizer's assigned params in-place + scaler.unscale_(opt) + # Since the gradients of optimizer's assigned params are now unscaled, clips as usual. + # You may use the same value for max_norm here as you would without gradient scaling. + torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=0.1) + + scaler.step(opt) + scaler.update() + opt.zero_grad() + +###################################################################### +# Advanced topics +# --------------- +# +# See the `Automatic Mixed Precision Examples `_ for advanced use cases including: +# * Gradient penalty/double backward +# * Networks with multiple models, optimizers, or losses +# * Multiple GPUs (``torch.nn.DataParallel`` or ``torch.nn.parallel.DistributedDataParallel``) +# * Custom autograd functions (subclasses of ``torch.autograd.Function``) + +###################################################################### +# Troubleshooting +# --------------- +# +# Speedup with Amp is minor +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. Your network may not be saturating the GPU(s) with work, and is therefore CPU bound. Amp's effect on GPU performance +# won't matter. 
A rough rule of thumb to saturate the GPU is to increase batch and/or network size(s) +# as much as you can without running OOM. Also, try to avoid excessive CPU-GPU synchronization (``.item()`` calls, or +# printing values from CUDA tensors), and try to avoid sequences of many small CUDA ops (coalesce these into a few +# large CUDA ops if you can). +# 2. Your network may be compute bound (lots of matmuls/convolutions) but your GPU does not have Tensor Cores. +# In this case a more modest speedup is expected. +# 3. Matmul dimensions are not Tensor Core-friendly. Make sure matmuls' participating sizes are multiples of 8. +# (For NLP models with encoders/decoders, this can be subtle. Also. convolutions used to have similar size constraints +# for Tensor Core use, but for CuDNN versions 7.3 and later, no such constraints exist. See `here ` for details). +# # -# know by creating `an issue `_. +# Loss is inf/NaN +# ~~~~~~~~~~~~~~~ +# First, check if your network fits an advanced use case in the `Automatic Mixed Precision Examples `_. +# If you're confident your Amp usage is correct, you may need to file an issue, but before doing so, it's helpful to gather the following information: +# 1. Try disabling ``autocast`` or ``GradScaler`` individually (by passing ``enabled=False`` to their constructor) and see if inf/NaN persist. +# 2. ??? +# 3. profit From e7426143e8e4639c2344ca53193437a1b5a0a7ea Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Tue, 25 Aug 2020 11:30:58 -0600 Subject: [PATCH 03/22] clarify one scaler per convergence run --- intermediate_source/mixed_precision_tutorial.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/intermediate_source/mixed_precision_tutorial.py b/intermediate_source/mixed_precision_tutorial.py index 54b6c89e528..41ed77ebd84 100644 --- a/intermediate_source/mixed_precision_tutorial.py +++ b/intermediate_source/mixed_precision_tutorial.py @@ -112,8 +112,11 @@ def make_model(in_size, out_size, num_layers): # See `Gradient Scaling `_ # for a full explanation of each step. -# Constructs scaler with default args, which are effective for most networks. +# Constructs scaler once, at the beginning of the convergence run, using default args. # If your network fails to converge with default GradScaler args, please file an issue. +# The same GradScaler instance should be used for the entire convergence run. +# If you perform multiple convergence runs in the same script, each run should use +# a dedicated fresh GradScaler instance. GradScaler instances are lightweight. scaler = torch.cuda.amp.GradScaler() for epoch in range(epochs): From d5890ae5204017a53b75740400cc566d532fa77d Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Tue, 25 Aug 2020 17:37:53 -0600 Subject: [PATCH 04/22] adjust sizes, dont run illustrative sections --- .../mixed_precision_tutorial.py | 47 ++++++++++--------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/intermediate_source/mixed_precision_tutorial.py b/intermediate_source/mixed_precision_tutorial.py index 41ed77ebd84..710f00e3552 100644 --- a/intermediate_source/mixed_precision_tutorial.py +++ b/intermediate_source/mixed_precision_tutorial.py @@ -19,22 +19,23 @@ .. 
contents:: :local: """ -import time, gc +import torch, time, gc -_start_time = None +start_time = None def start_timer(): - global _start_time + global start_time gc.collect() torch.cuda.empty_cache() torch.cuda.reset_max_memory_allocated() torch.cuda.synchronize() - _start_time = time.time() + start_time = time.time() def end_timer_and_print(local_msg): torch.cuda.synchronize() - print(local_msg) - print("Total execution time = {:.3f} sec".format(time.time() - _start_time)) + end_time = time.time() + print("\n" + local_msg) + print("Total execution time = {:.3f} sec".format(end_time - start_time)) print("Max memory used by tensors = {} bytes".format(torch.cuda.max_memory_allocated())) def make_model(in_size, out_size, num_layers): @@ -45,30 +46,32 @@ def make_model(in_size, out_size, num_layers): layers.append(torch.nn.Linear(in_size, out_size)) return torch.nn.Sequential(*tuple(layers)).cuda() -###################################################################### -# Without torch.cuda.amp, the following simple "network" executes all -# ops in default precision (torch.float32): - -import torch - # batch_size, in_size, out_size, and num_layers are chosen to be large enough to saturate the GPU. # Typically, mixed precision provides the greatest speedup when GPU is working hard. # Small networks may be CPU bound, in which case mixed precision won't improve performance. -# Sizes are also chosen such that the linear layers' participating dimensions are multiples of 8, -# to permit Tensor Core usage on Tensor Core-capable GPUs. -# See :ref:`Troubleshooting `. +# Sizes are also chosen such that linear layers' participating dimensions are multiples of 8, +# to permit Tensor Core usage on Tensor Core-capable GPUs (see :ref:`Troubleshooting `). +# # Exercise: Vary participating sizes and see how the mixed precision speedup changes. -batch_size = 256 # Try, for example, 32, 33 +batch_size = 512 # Try, for example, 128, 256, 513. in_size = 4096 out_size = 4096 -num_layers = 6 -num_batches = 128 +num_layers = 3 +num_batches = 50 epochs = 3 +# Creates data in default precision. The same data is used for both default and mixed precision trials below. +# You don't need to manually change the type of input data when enabling mixed precision. data = [torch.randn(batch_size, in_size, device="cuda") for _ in range(num_batches)] targets = [torch.randn(batch_size, out_size, device="cuda") for _ in range(num_batches)] loss_fn = torch.nn.MSELoss().cuda() +###################################################################### +# Default Precision (Baseline) +# ---------------------------- +# +# Without torch.cuda.amp, the following simple network executes all ops in default precision (torch.float32): + net = make_model(in_size, out_size, num_layers) opt = torch.optim.SGD(net.parameters(), lr=0.001) @@ -86,7 +89,7 @@ def make_model(in_size, out_size, num_layers): # Adding autocast # --------------- # -for epoch in range(epochs): +for epoch in range(0): # 0 epochs, this section is for illustration only for input, target in zip(data, targets): # Runs the forward pass under autocast with torch.cuda.amp.autocast(): @@ -119,7 +122,7 @@ def make_model(in_size, out_size, num_layers): # a dedicated fresh GradScaler instance. GradScaler instances are lightweight. 
scaler = torch.cuda.amp.GradScaler() -for epoch in range(epochs): +for epoch in range(0): # 0 epochs, this section is for illustration only for input, target in zip(data, targets): with torch.cuda.amp.autocast(): output = net(input) @@ -161,13 +164,13 @@ def make_model(in_size, out_size, num_layers): ###################################################################### # Inspecting/modifying gradients (e.g., gradient clipping) -# --------------- +# -------------------------------------------------------- # # All gradients produced by ``scaler.scale(loss).backward()`` are scaled. If you wish to modify or inspect # the parameters' ``.grad`` attributes between ``backward()`` and ``scaler.step(optimizer)``, you should # unscale them first using `scaler.unscale_(optimizer)`. -for epoch in range(epochs): +for epoch in range(0): # 0 epochs, this section is for illustration only for input, target in zip(data, targets): with torch.cuda.amp.autocast(): output = net(input) From 4c0bdc5e3ede901c853b99da4631d5837f2cda22 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Wed, 26 Aug 2020 20:51:48 -0600 Subject: [PATCH 05/22] satisfying ocd --- .../mixed_precision_tutorial.py | 55 +++++++++++-------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/intermediate_source/mixed_precision_tutorial.py b/intermediate_source/mixed_precision_tutorial.py index 710f00e3552..457d3ac8200 100644 --- a/intermediate_source/mixed_precision_tutorial.py +++ b/intermediate_source/mixed_precision_tutorial.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ Automatic Mixed Precision in PyTorch -******************************************************* +************************************ **Author**: `Michael Carilli `_ `torch.cuda.amp `_ provides convenience methods for mixed precision, @@ -66,10 +66,9 @@ def make_model(in_size, out_size, num_layers): targets = [torch.randn(batch_size, out_size, device="cuda") for _ in range(num_batches)] loss_fn = torch.nn.MSELoss().cuda() -###################################################################### +############################## # Default Precision (Baseline) # ---------------------------- -# # Without torch.cuda.amp, the following simple network executes all ops in default precision (torch.float32): net = make_model(in_size, out_size, num_layers) @@ -85,20 +84,27 @@ def make_model(in_size, out_size, num_layers): opt.zero_grad() # set_to_none=True here can modestly improve performance end_timer_and_print("With default precision:") -###################################################################### +################# # Adding autocast # --------------- +# Instances of `torch.cuda.amp.autocast `_ serve as context managers that allow regions of your script to run +# in mixed precision. # +# In these regions, CUDA ops run in a dtype chosen by autocast +# to improve performance while maintaining accuracy. +# See the :ref:`Autocast Op Reference` for details on what precision +# autocast chooses for each op, and under what circumstances. + for epoch in range(0): # 0 epochs, this section is for illustration only for input, target in zip(data, targets): - # Runs the forward pass under autocast - with torch.cuda.amp.autocast(): + # Runs the forward pass under autocast. + with torch.cuda.amp.autocast(enabled=try_amp): output = net(input) - # Linear layers with ``float32`` inputs `autocast to float16 `_ + # output is float16 because linear layers autocast to float16. 
assert output.dtype is torch.float16 loss = loss_fn(output, target) - # ``mse_loss`` layers with ``float16`` inputs `autocast to float32 `_ + # loss is float32 because mse_loss layers autocast to float32. assert loss.dtype is torch.float32 # Exits autocast before backward(). @@ -108,12 +114,14 @@ def make_model(in_size, out_size, num_layers): opt.step() opt.zero_grad() # set_to_none=True here can modestly improve performance -###################################################################### +################### # Adding GradScaler # ----------------- +# `Gradient scaling `_ +# helps prevent gradients with small magnitudes from flushing to zero +# ("underflowing") when training with mixed precision. # -# See `Gradient Scaling `_ -# for a full explanation of each step. +# ``torch.cuda.amp.GradScaler`` performs the steps of gradient scaling conveniently. # Constructs scaler once, at the beginning of the convergence run, using default args. # If your network fails to converge with default GradScaler args, please file an issue. @@ -141,7 +149,7 @@ def make_model(in_size, out_size, num_layers): opt.zero_grad() -###################################################################### +############## # All together # ------------ @@ -162,13 +170,13 @@ def make_model(in_size, out_size, num_layers): end_timer_and_print("With mixed precision:") -###################################################################### +########################################################## # Inspecting/modifying gradients (e.g., gradient clipping) # -------------------------------------------------------- # # All gradients produced by ``scaler.scale(loss).backward()`` are scaled. If you wish to modify or inspect # the parameters' ``.grad`` attributes between ``backward()`` and ``scaler.step(optimizer)``, you should -# unscale them first using `scaler.unscale_(optimizer)`. +# unscale them first using ``scaler.unscale_(optimizer)``. for epoch in range(0): # 0 epochs, this section is for illustration only for input, target in zip(data, targets): @@ -188,33 +196,36 @@ def make_model(in_size, out_size, num_layers): scaler.update() opt.zero_grad() -###################################################################### +################# # Advanced topics # --------------- # # See the `Automatic Mixed Precision Examples `_ for advanced use cases including: +# # * Gradient penalty/double backward # * Networks with multiple models, optimizers, or losses # * Multiple GPUs (``torch.nn.DataParallel`` or ``torch.nn.parallel.DistributedDataParallel``) # * Custom autograd functions (subclasses of ``torch.autograd.Function``) -###################################################################### +################# # Troubleshooting # --------------- # # Speedup with Amp is minor # ~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. Your network may not be saturating the GPU(s) with work, and is therefore CPU bound. Amp's effect on GPU performance -# won't matter. A rough rule of thumb to saturate the GPU is to increase batch and/or network size(s) -# as much as you can without running OOM. Also, try to avoid excessive CPU-GPU synchronization (``.item()`` calls, or -# printing values from CUDA tensors), and try to avoid sequences of many small CUDA ops (coalesce these into a few -# large CUDA ops if you can). +# won't matter. +# +# * A rough rule of thumb to saturate the GPU is to increase batch and/or network size(s) +# as much as you can without running OOM. 
+# * Try to avoid excessive CPU-GPU synchronization (``.item()`` calls, or printing values from CUDA tensors). +# * Try to avoid sequences of many small CUDA ops (coalesce these into a few large CUDA ops if you can). # 2. Your network may be compute bound (lots of matmuls/convolutions) but your GPU does not have Tensor Cores. # In this case a more modest speedup is expected. # 3. Matmul dimensions are not Tensor Core-friendly. Make sure matmuls' participating sizes are multiples of 8. # (For NLP models with encoders/decoders, this can be subtle. Also. convolutions used to have similar size constraints -# for Tensor Core use, but for CuDNN versions 7.3 and later, no such constraints exist. See `here ` for details). -# +# for Tensor Core use, but for CuDNN versions 7.3 and later, no such constraints exist. See +# `here ` for details). # # Loss is inf/NaN # ~~~~~~~~~~~~~~~ From a5e5e2a5a81e8e590ec718253065e78a2b30a9d2 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Thu, 27 Aug 2020 02:43:31 -0600 Subject: [PATCH 06/22] MORE --- .../mixed_precision_tutorial.py | 86 +++++++++++++------ 1 file changed, 62 insertions(+), 24 deletions(-) diff --git a/intermediate_source/mixed_precision_tutorial.py b/intermediate_source/mixed_precision_tutorial.py index 457d3ac8200..f2a28404b54 100644 --- a/intermediate_source/mixed_precision_tutorial.py +++ b/intermediate_source/mixed_precision_tutorial.py @@ -13,14 +13,19 @@ Ordinarily, "automatic mixed precision training" uses `torch.cuda.amp.autocast `_ and `torch.cuda.amp.GradScaler `_ together. -Here we'll walk through adding ``autocast`` and ``GradScaler`` to a toy network. -First we'll cover typical use, then describe more advanced cases. +This tutorial measures the performance of a simple network in default precision, +then walks through adding ``autocast`` and ``GradScaler`` to run the same network in +mixed precision with improved performance. + +You may download and run this tutorial as a standalone Python script. +The only requirements are Pytorch 1.6+ and a CUDA-capable GPU. .. contents:: :local: """ import torch, time, gc +# Timing utilities start_time = None def start_timer(): @@ -38,6 +43,12 @@ def end_timer_and_print(local_msg): print("Total execution time = {:.3f} sec".format(end_time - start_time)) print("Max memory used by tensors = {} bytes".format(torch.cuda.max_memory_allocated())) +########################################################## +# A simple network +# ---------------- +# +# The following sequence of linear layers and ReLUs should show a nice speedup with mixed precision. + def make_model(in_size, out_size, num_layers): layers = [] for _ in range(num_layers - 1): @@ -46,13 +57,15 @@ def make_model(in_size, out_size, num_layers): layers.append(torch.nn.Linear(in_size, out_size)) return torch.nn.Sequential(*tuple(layers)).cuda() -# batch_size, in_size, out_size, and num_layers are chosen to be large enough to saturate the GPU. -# Typically, mixed precision provides the greatest speedup when GPU is working hard. +########################################################## +# ``batch_size``, ``in_size``, ``out_size``, and ``num_layers`` are chosen to be large enough to saturate the GPU with work. +# Typically, mixed precision provides the greatest speedup when GPU is saturated. # Small networks may be CPU bound, in which case mixed precision won't improve performance. 
# Sizes are also chosen such that linear layers' participating dimensions are multiples of 8, -# to permit Tensor Core usage on Tensor Core-capable GPUs (see :ref:`Troubleshooting `). +# to permit Tensor Core usage on Tensor Core-capable GPUs (see :ref:`Troubleshooting` below). # # Exercise: Vary participating sizes and see how the mixed precision speedup changes. + batch_size = 512 # Try, for example, 128, 256, 513. in_size = 4096 out_size = 4096 @@ -60,16 +73,18 @@ def make_model(in_size, out_size, num_layers): num_batches = 50 epochs = 3 -# Creates data in default precision. The same data is used for both default and mixed precision trials below. +# Creates data in default precision. +# The same data is used for both default and mixed precision trials below. # You don't need to manually change the type of input data when enabling mixed precision. data = [torch.randn(batch_size, in_size, device="cuda") for _ in range(num_batches)] targets = [torch.randn(batch_size, out_size, device="cuda") for _ in range(num_batches)] + loss_fn = torch.nn.MSELoss().cuda() -############################## -# Default Precision (Baseline) -# ---------------------------- -# Without torch.cuda.amp, the following simple network executes all ops in default precision (torch.float32): +########################################################## +# Default Precision +# ----------------- +# Without torch.cuda.amp, the following simple network executes all ops in default precision (``torch.float32``): net = make_model(in_size, out_size, num_layers) opt = torch.optim.SGD(net.parameters(), lr=0.001) @@ -84,7 +99,7 @@ def make_model(in_size, out_size, num_layers): opt.zero_grad() # set_to_none=True here can modestly improve performance end_timer_and_print("With default precision:") -################# +########################################################## # Adding autocast # --------------- # Instances of `torch.cuda.amp.autocast `_ serve as context managers that allow regions of your script to run @@ -114,7 +129,7 @@ def make_model(in_size, out_size, num_layers): opt.step() opt.zero_grad() # set_to_none=True here can modestly improve performance -################### +########################################################## # Adding GradScaler # ----------------- # `Gradient scaling `_ @@ -128,6 +143,7 @@ def make_model(in_size, out_size, num_layers): # The same GradScaler instance should be used for the entire convergence run. # If you perform multiple convergence runs in the same script, each run should use # a dedicated fresh GradScaler instance. GradScaler instances are lightweight. + scaler = torch.cuda.amp.GradScaler() for epoch in range(0): # 0 epochs, this section is for illustration only @@ -149,18 +165,24 @@ def make_model(in_size, out_size, num_layers): opt.zero_grad() -############## +########################################################## # All together # ------------ +# +# The following also demonstrates ``enabled``, an optional convenience argument to ``autocast`` and ``GradScaler``. +# If False, ``autocast`` and ``GradScaler``\ 's calls become no-ops. +# This allows switching between default precision and mixed precision without if/else statements. 
+ +use_amp = True net = make_model(in_size, out_size, num_layers) opt = torch.optim.SGD(net.parameters(), lr=0.001) -scaler = torch.cuda.amp.GradScaler() +scaler = torch.cuda.amp.GradScaler(enabled=use_amp) start_timer() for epoch in range(epochs): for input, target in zip(data, targets): - with torch.cuda.amp.autocast(): + with torch.cuda.amp.autocast(enabled=use_amp): output = net(input) loss = loss_fn(output, target) scaler.scale(loss).backward() @@ -169,7 +191,6 @@ def make_model(in_size, out_size, num_layers): opt.zero_grad() end_timer_and_print("With mixed precision:") - ########################################################## # Inspecting/modifying gradients (e.g., gradient clipping) # -------------------------------------------------------- @@ -196,24 +217,26 @@ def make_model(in_size, out_size, num_layers): scaler.update() opt.zero_grad() -################# +########################################################## # Advanced topics # --------------- # # See the `Automatic Mixed Precision Examples `_ for advanced use cases including: # +# * Gradient accumulation # * Gradient penalty/double backward # * Networks with multiple models, optimizers, or losses # * Multiple GPUs (``torch.nn.DataParallel`` or ``torch.nn.parallel.DistributedDataParallel``) # * Custom autograd functions (subclasses of ``torch.autograd.Function``) - -################# +# +# .. _troubleshooting: +# # Troubleshooting # --------------- # # Speedup with Amp is minor # ~~~~~~~~~~~~~~~~~~~~~~~~~ -# 1. Your network may not be saturating the GPU(s) with work, and is therefore CPU bound. Amp's effect on GPU performance +# 1. Your network may fail to saturate the GPU(s) with work, and is therefore CPU bound. Amp's effect on GPU performance # won't matter. # # * A rough rule of thumb to saturate the GPU is to increase batch and/or network size(s) @@ -225,12 +248,27 @@ def make_model(in_size, out_size, num_layers): # 3. Matmul dimensions are not Tensor Core-friendly. Make sure matmuls' participating sizes are multiples of 8. # (For NLP models with encoders/decoders, this can be subtle. Also. convolutions used to have similar size constraints # for Tensor Core use, but for CuDNN versions 7.3 and later, no such constraints exist. See -# `here ` for details). +# `here `_ for guidance.) # # Loss is inf/NaN # ~~~~~~~~~~~~~~~ # First, check if your network fits an advanced use case in the `Automatic Mixed Precision Examples `_. +# See also `Prefer binary_cross_entropy_with_logits over binary_cross_entropy `_. +# # If you're confident your Amp usage is correct, you may need to file an issue, but before doing so, it's helpful to gather the following information: -# 1. Try disabling ``autocast`` or ``GradScaler`` individually (by passing ``enabled=False`` to their constructor) and see if inf/NaN persist. -# 2. ??? -# 3. profit +# +# 1. Try disabling ``autocast`` or ``GradScaler`` individually (by passing ``enabled=False`` to their constructor) and see if infs/NaNs persist. +# 2. If you suspect some region of your network overflows (e.g., a complex loss function), run that forward region in ``float32``. +# `The autocast docstring `_'s last code snippet +# shows running a subregion in ``float32`` (by locally disabling autocast and casting the subregion's inputs). +# +# Type mismatch error (may manifest as CUDNN_STATUS_BAD_PARAM) +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Autocast tries to cover all ops that benefit from or require casting. 
The +# `ops that receive explicit coverage `_ +# are based on reasoning about numerical properties, but also on experience. +# If you see a type mismatch error in an autocast-enabled forward region or a backward pass following that region, +# it's possible autocast missed an op. +# +# Please file an issue with the error backtrace. ``export TORCH_SHOW_CPP_STACKTRACES=1`` before running your script to provide +# more fine-grained information on which backend op is failing. From 5072dd32d6e14ae6ad8316c7dc7febd4e5f3cc47 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Thu, 27 Aug 2020 10:32:27 -0600 Subject: [PATCH 07/22] fdsa --- intermediate_source/mixed_precision_tutorial.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/intermediate_source/mixed_precision_tutorial.py b/intermediate_source/mixed_precision_tutorial.py index f2a28404b54..497dcb2dd70 100644 --- a/intermediate_source/mixed_precision_tutorial.py +++ b/intermediate_source/mixed_precision_tutorial.py @@ -97,7 +97,7 @@ def make_model(in_size, out_size, num_layers): loss.backward() opt.step() opt.zero_grad() # set_to_none=True here can modestly improve performance -end_timer_and_print("With default precision:") +end_timer_and_print("Default precision:") ########################################################## # Adding autocast @@ -189,7 +189,7 @@ def make_model(in_size, out_size, num_layers): scaler.step(opt) scaler.update() opt.zero_grad() -end_timer_and_print("With mixed precision:") +end_timer_and_print("Mixed precision:") ########################################################## # Inspecting/modifying gradients (e.g., gradient clipping) From 38a0a0d0188bdce5b345b2afaa1564d16598aef6 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Thu, 27 Aug 2020 23:56:52 -0600 Subject: [PATCH 08/22] details --- .../mixed_precision_tutorial.py | 33 +++++++++++-------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/intermediate_source/mixed_precision_tutorial.py b/intermediate_source/mixed_precision_tutorial.py index 497dcb2dd70..e7fb23d6d32 100644 --- a/intermediate_source/mixed_precision_tutorial.py +++ b/intermediate_source/mixed_precision_tutorial.py @@ -46,8 +46,7 @@ def end_timer_and_print(local_msg): ########################################################## # A simple network # ---------------- -# -# The following sequence of linear layers and ReLUs should show a nice speedup with mixed precision. +# The following sequence of linear layers and ReLUs should show a speedup with mixed precision. def make_model(in_size, out_size, num_layers): layers = [] @@ -75,7 +74,7 @@ def make_model(in_size, out_size, num_layers): # Creates data in default precision. # The same data is used for both default and mixed precision trials below. -# You don't need to manually change the type of input data when enabling mixed precision. +# You don't need to manually change inputs' dtype when enabling mixed precision. data = [torch.randn(batch_size, in_size, device="cuda") for _ in range(num_batches)] targets = [torch.randn(batch_size, out_size, device="cuda") for _ in range(num_batches)] @@ -102,8 +101,8 @@ def make_model(in_size, out_size, num_layers): ########################################################## # Adding autocast # --------------- -# Instances of `torch.cuda.amp.autocast `_ serve as context managers that allow regions of your script to run -# in mixed precision. +# Instances of `torch.cuda.amp.autocast `_ +# serve as context managers that allow regions of your script to run in mixed precision. 
# # In these regions, CUDA ops run in a dtype chosen by autocast # to improve performance while maintaining accuracy. @@ -166,9 +165,8 @@ def make_model(in_size, out_size, num_layers): opt.zero_grad() ########################################################## -# All together -# ------------ -# +# All together ("Automatic Mixed Precision") +# ------------------------------------------ # The following also demonstrates ``enabled``, an optional convenience argument to ``autocast`` and ``GradScaler``. # If False, ``autocast`` and ``GradScaler``\ 's calls become no-ops. # This allows switching between default precision and mixed precision without if/else statements. @@ -192,9 +190,8 @@ def make_model(in_size, out_size, num_layers): end_timer_and_print("Mixed precision:") ########################################################## -# Inspecting/modifying gradients (e.g., gradient clipping) +# Inspecting/modifying gradients (e.g., clipping) # -------------------------------------------------------- -# # All gradients produced by ``scaler.scale(loss).backward()`` are scaled. If you wish to modify or inspect # the parameters' ``.grad`` attributes between ``backward()`` and ``scaler.step(optimizer)``, you should # unscale them first using ``scaler.unscale_(optimizer)``. @@ -217,6 +214,11 @@ def make_model(in_size, out_size, num_layers): scaler.update() opt.zero_grad() +########################################################## +# Inference/Evaluation +# -------------------- +# ``autocast`` may be used by itself to wrap inference or evaluation forward passes. ``GradScaler`` is not necessary. +# ########################################################## # Advanced topics # --------------- @@ -229,6 +231,10 @@ def make_model(in_size, out_size, num_layers): # * Multiple GPUs (``torch.nn.DataParallel`` or ``torch.nn.parallel.DistributedDataParallel``) # * Custom autograd functions (subclasses of ``torch.autograd.Function``) # +# If you're registering a custom C++ op with the dispatcher, see the +# `autocast section `_ +# of the dispatcher tutorial. +# # .. _troubleshooting: # # Troubleshooting @@ -257,10 +263,11 @@ def make_model(in_size, out_size, num_layers): # # If you're confident your Amp usage is correct, you may need to file an issue, but before doing so, it's helpful to gather the following information: # -# 1. Try disabling ``autocast`` or ``GradScaler`` individually (by passing ``enabled=False`` to their constructor) and see if infs/NaNs persist. -# 2. If you suspect some region of your network overflows (e.g., a complex loss function), run that forward region in ``float32``. +# 1. Disable ``autocast`` or ``GradScaler`` individually (by passing ``enabled=False`` to their constructor) and see if infs/NaNs persist. +# 2. If you suspect part of your network (e.g., a complicated loss function) overflows , run that forward region in ``float32`` +# and see if infs/NaNs persist. # `The autocast docstring `_'s last code snippet -# shows running a subregion in ``float32`` (by locally disabling autocast and casting the subregion's inputs). +# shows forcing a subregion to run in ``float32`` (by locally disabling autocast and casting the subregion's inputs). 
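# For instance, a minimal sketch of running a suspect subregion in ``float32`` (illustrative only;
# ``complicated_loss_fn`` is a hypothetical stand-in for whatever region you suspect of overflowing)::
#
#     with torch.cuda.amp.autocast():
#         out = net(input)
#         # Locally disable autocast and cast the subregion's inputs to float32.
#         with torch.cuda.amp.autocast(enabled=False):
#             loss = complicated_loss_fn(out.float(), target.float())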
# # Type mismatch error (may manifest as CUDNN_STATUS_BAD_PARAM) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From fb10b93651a15052019f5060f96ab0b87ddf2ba7 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Fri, 28 Aug 2020 00:15:44 -0600 Subject: [PATCH 09/22] rephrase --- intermediate_source/mixed_precision_tutorial.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/intermediate_source/mixed_precision_tutorial.py b/intermediate_source/mixed_precision_tutorial.py index e7fb23d6d32..6688f0da95d 100644 --- a/intermediate_source/mixed_precision_tutorial.py +++ b/intermediate_source/mixed_precision_tutorial.py @@ -271,9 +271,9 @@ def make_model(in_size, out_size, num_layers): # # Type mismatch error (may manifest as CUDNN_STATUS_BAD_PARAM) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Autocast tries to cover all ops that benefit from or require casting. The -# `ops that receive explicit coverage `_ -# are based on reasoning about numerical properties, but also on experience. +# Autocast tries to cover all ops that benefit from or require casting. +# `Ops that receive explicit coverage `_ +# are chosen based on numerical properties, but also on experience. # If you see a type mismatch error in an autocast-enabled forward region or a backward pass following that region, # it's possible autocast missed an op. # From e432d5f0b8198b81baf6c46f0ed57533532f1348 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Fri, 28 Aug 2020 00:23:49 -0600 Subject: [PATCH 10/22] fix formatting --- intermediate_source/mixed_precision_tutorial.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/intermediate_source/mixed_precision_tutorial.py b/intermediate_source/mixed_precision_tutorial.py index 6688f0da95d..4e3e97e512b 100644 --- a/intermediate_source/mixed_precision_tutorial.py +++ b/intermediate_source/mixed_precision_tutorial.py @@ -13,6 +13,7 @@ Ordinarily, "automatic mixed precision training" uses `torch.cuda.amp.autocast `_ and `torch.cuda.amp.GradScaler `_ together. + This tutorial measures the performance of a simple network in default precision, then walks through adding ``autocast`` and ``GradScaler`` to run the same network in mixed precision with improved performance. @@ -106,8 +107,8 @@ def make_model(in_size, out_size, num_layers): # # In these regions, CUDA ops run in a dtype chosen by autocast # to improve performance while maintaining accuracy. -# See the :ref:`Autocast Op Reference` for details on what precision -# autocast chooses for each op, and under what circumstances. +# See the `Autocast Op Reference `_ +# for details on what precision autocast chooses for each op, and under what circumstances. for epoch in range(0): # 0 epochs, this section is for illustration only for input, target in zip(data, targets): @@ -218,11 +219,10 @@ def make_model(in_size, out_size, num_layers): # Inference/Evaluation # -------------------- # ``autocast`` may be used by itself to wrap inference or evaluation forward passes. ``GradScaler`` is not necessary. -# + ########################################################## # Advanced topics # --------------- -# # See the `Automatic Mixed Precision Examples `_ for advanced use cases including: # # * Gradient accumulation @@ -234,12 +234,12 @@ def make_model(in_size, out_size, num_layers): # If you're registering a custom C++ op with the dispatcher, see the # `autocast section `_ # of the dispatcher tutorial. 
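#
# As a quick illustration of the Inference/Evaluation note above, a minimal sketch that reuses
# ``net`` and ``data`` from this recipe (``autocast`` alone, no ``GradScaler``)::
#
#     net.eval()
#     with torch.no_grad():
#         with torch.cuda.amp.autocast():
#             predictions = [net(input) for input in data]
#     net.train()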
-# + +########################################################## # .. _troubleshooting: # # Troubleshooting # --------------- -# # Speedup with Amp is minor # ~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. Your network may fail to saturate the GPU(s) with work, and is therefore CPU bound. Amp's effect on GPU performance @@ -250,7 +250,7 @@ def make_model(in_size, out_size, num_layers): # * Try to avoid excessive CPU-GPU synchronization (``.item()`` calls, or printing values from CUDA tensors). # * Try to avoid sequences of many small CUDA ops (coalesce these into a few large CUDA ops if you can). # 2. Your network may be compute bound (lots of matmuls/convolutions) but your GPU does not have Tensor Cores. -# In this case a more modest speedup is expected. +# In this case a reduced speedup is expected. # 3. Matmul dimensions are not Tensor Core-friendly. Make sure matmuls' participating sizes are multiples of 8. # (For NLP models with encoders/decoders, this can be subtle. Also. convolutions used to have similar size constraints # for Tensor Core use, but for CuDNN versions 7.3 and later, no such constraints exist. See From d082a38207c17e0f27301e1e41763ef8ac6798a1 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Tue, 8 Sep 2020 09:45:34 -0600 Subject: [PATCH 11/22] move script to recipes --- .../mixed_precision_tutorial.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) rename {intermediate_source => recipes_source}/mixed_precision_tutorial.py (96%) diff --git a/intermediate_source/mixed_precision_tutorial.py b/recipes_source/mixed_precision_tutorial.py similarity index 96% rename from intermediate_source/mixed_precision_tutorial.py rename to recipes_source/mixed_precision_tutorial.py index 4e3e97e512b..732a3a867e0 100644 --- a/intermediate_source/mixed_precision_tutorial.py +++ b/recipes_source/mixed_precision_tutorial.py @@ -20,8 +20,6 @@ You may download and run this tutorial as a standalone Python script. The only requirements are Pytorch 1.6+ and a CUDA-capable GPU. - -.. contents:: :local: """ import torch, time, gc @@ -113,7 +111,7 @@ def make_model(in_size, out_size, num_layers): for epoch in range(0): # 0 epochs, this section is for illustration only for input, target in zip(data, targets): # Runs the forward pass under autocast. - with torch.cuda.amp.autocast(enabled=try_amp): + with torch.cuda.amp.autocast(): output = net(input) # output is float16 because linear layers autocast to float16. assert output.dtype is torch.float16 @@ -143,7 +141,6 @@ def make_model(in_size, out_size, num_layers): # The same GradScaler instance should be used for the entire convergence run. # If you perform multiple convergence runs in the same script, each run should use # a dedicated fresh GradScaler instance. GradScaler instances are lightweight. - scaler = torch.cuda.amp.GradScaler() for epoch in range(0): # 0 epochs, this section is for illustration only @@ -221,6 +218,8 @@ def make_model(in_size, out_size, num_layers): # ``autocast`` may be used by itself to wrap inference or evaluation forward passes. ``GradScaler`` is not necessary. ########################################################## +# .. _advanced-topics: +# # Advanced topics # --------------- # See the `Automatic Mixed Precision Examples `_ for advanced use cases including: @@ -249,7 +248,7 @@ def make_model(in_size, out_size, num_layers): # as much as you can without running OOM. # * Try to avoid excessive CPU-GPU synchronization (``.item()`` calls, or printing values from CUDA tensors). 
# * Try to avoid sequences of many small CUDA ops (coalesce these into a few large CUDA ops if you can). -# 2. Your network may be compute bound (lots of matmuls/convolutions) but your GPU does not have Tensor Cores. +# 2. Your network may be GPU compute bound (lots of matmuls/convolutions) but your GPU does not have Tensor Cores. # In this case a reduced speedup is expected. # 3. Matmul dimensions are not Tensor Core-friendly. Make sure matmuls' participating sizes are multiples of 8. # (For NLP models with encoders/decoders, this can be subtle. Also. convolutions used to have similar size constraints @@ -258,7 +257,7 @@ def make_model(in_size, out_size, num_layers): # # Loss is inf/NaN # ~~~~~~~~~~~~~~~ -# First, check if your network fits an advanced use case in the `Automatic Mixed Precision Examples `_. +# First, check if your network fits an :ref:`advanced use case`. # See also `Prefer binary_cross_entropy_with_logits over binary_cross_entropy `_. # # If you're confident your Amp usage is correct, you may need to file an issue, but before doing so, it's helpful to gather the following information: @@ -278,4 +277,4 @@ def make_model(in_size, out_size, num_layers): # it's possible autocast missed an op. # # Please file an issue with the error backtrace. ``export TORCH_SHOW_CPP_STACKTRACES=1`` before running your script to provide -# more fine-grained information on which backend op is failing. +# fine-grained information on which backend op is failing. From 831d503584a223298be91ff180c4d6e22e529600 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Tue, 8 Sep 2020 14:26:12 -0600 Subject: [PATCH 12/22] hopefully moved to recipes --- _static/img/thumbnails/cropped/amp.png | Bin 0 -> 14849 bytes index.rst | 1 - recipes_source/recipes/README.txt | 4 ++++ .../amp_tutorial.py} | 4 ++-- recipes_source/recipes_index.rst | 9 +++++++++ 5 files changed, 15 insertions(+), 3 deletions(-) create mode 100644 _static/img/thumbnails/cropped/amp.png rename recipes_source/{mixed_precision_tutorial.py => recipes/amp_tutorial.py} (99%) diff --git a/_static/img/thumbnails/cropped/amp.png b/_static/img/thumbnails/cropped/amp.png new file mode 100644 index 0000000000000000000000000000000000000000..a6916ce5605e99d5168c7d52aa157f913b8e7526 GIT binary patch literal 14849 zcmc(`bx<9_w=PP8TX1(taCZw%a0{}tad&qoxVyXS#wEDJ#x-o*osA^81s=ci&V5zy z&wJ}uovG^nzSX@}P0vhq_gcMbB2|^Yp`#F^z`($u%gIWq!@zt5{HGwnzPCiSSzf@v z5Z!re09@6LJSZKV9n7t4%_v>H9L*@rJgv-OU_4h3vUQwk+nuG}8cc z96&v_c*4(?6cw(ixRmr7ms3+S#X`cvg%Xx>-=1cmufcEDmn#SJpgW+*+d-ZW>5I|p ztwDf4Vt)SCOu(?OZ@|-m%e@ojmD;XI_owE+-Wb6%69#m?o!>GHL_O}lVxRjJ-v4wU zQ0!>3ZxO{vn%RS1Cu|X%9^HOBhkV6w>lry&dXAn^-4um(nPYA8E%a_2VK~#scmAot z-IMcud1+ljnul8aS;*fSxtAD!A^YyymHK;=hKkal?EZY?dFI0t_P#?v`(^Rl-zAU! 
z+7t>$d-hKjwq?{>4Y1`PNrc>wQ#n%&T2F(QK3YUD@aZwQK<_;0#Uw}WWq5U!bbxIb zF0|g0vXH#n#H+A|MVEffbn~AAMAO5FTKB9>x07Ua#>E}9{SdT$WQ^P7FaA2bQ@ZsS ziq5CrW9F|-EBJ}{sq9`i4v$m}^FkMDo|PgVZ2;=#ILsPRdKp~yP+M&wi)Pc}aUFo`oCD2lUh?OThV@xh7!Ohlcru2c_WeKkQ}sm{82Ao^Wi zLTLeL?yk;*0_n`~#O|Iv6VgXrH$0KWubLi|2fWYK%ZuJ;)6^}xXaUieGi6~^1UGAqteu1P zpL}@90O=$95{%<#_jKc1-b-J+_#r@GQ*?u;L|Q!8k%ez^p|kV1V6R`#&m$z~r5P|z zCK;92)qVcG)ekzhBJmq|#9Y3>H(n>1+Cgr(F2MApD-R=GuO$q)c;TyJx=8omyE zKL}uO&b`tAPoAS#Am6$>F_YT3$d9L8pt~QGVuW4F1NgP>;tnP&VmF?5HWb0Hxb*L* z2j{~MN)3Bzm$^Sp%q{@E0n+jtMh&p<9n$!Hx?_Atov#}{0o#D~{RV_z{dzYe@!j`G zz;UMdmS#|;S*-UZE-BO7q);5Z@G$v*$4>76^S?#MBM@nSnjG|hlrtSbVi-f4mHNV* zkTgk?H42ByFy=ZZ^@UJY?(n0M=>R4JLky-gelsB+w&?+uaUoMGa@A-ct3=fP<_RiS zpOh?CZ34W>fG)F5Bs)8E5lC`BYF5wY$0Wf87`9{Hq;bfI?NtWAk;WrK>f>lc1L8Ns zGCxlokkCRN?~tZEX3XDA;J2asa%>&k2#i7ZAjF89Ta+=wtG!Sgcc1*RkRT6iz?erM zi#Px9z9+L>pNb-V!HaH>tZH?xfo$|quge=-iRjg6gx0odq@BND7&`%xOaCWwgfK8& b?yrbNC+vqew=cv08G)R%vQ({vVbK2r7|~Lg literal 0 HcmV?d00001 diff --git a/index.rst b/index.rst index 98d8d275503..a5ad877b0f4 100644 --- a/index.rst +++ b/index.rst @@ -500,7 +500,6 @@ Additional Resources :hidden: :caption: Frontend APIs - intermediate/mixed_precision_tutorial intermediate/named_tensor_tutorial intermediate/memory_format_tutorial advanced/cpp_frontend diff --git a/recipes_source/recipes/README.txt b/recipes_source/recipes/README.txt index f93ee92c2c6..8ca89860ba4 100644 --- a/recipes_source/recipes/README.txt +++ b/recipes_source/recipes/README.txt @@ -56,3 +56,7 @@ PyTorch Recipes 14. mobile_perf.py PyTorch Mobile Performance Recipes https://pytorch.org/tutorials/recipes/mobile_perf.html + +15. amp_tutorial.py + Automatic Mixed Precision + https://pytorch.org/tutorials/recipes/amp_tutorial.html diff --git a/recipes_source/mixed_precision_tutorial.py b/recipes_source/recipes/amp_tutorial.py similarity index 99% rename from recipes_source/mixed_precision_tutorial.py rename to recipes_source/recipes/amp_tutorial.py index 732a3a867e0..d09b9ddbe23 100644 --- a/recipes_source/mixed_precision_tutorial.py +++ b/recipes_source/recipes/amp_tutorial.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ -Automatic Mixed Precision in PyTorch -************************************ +Automatic Mixed Precision +************************* **Author**: `Michael Carilli `_ `torch.cuda.amp `_ provides convenience methods for mixed precision, diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst index e842c19bae5..0e8f0d875cc 100644 --- a/recipes_source/recipes_index.rst +++ b/recipes_source/recipes_index.rst @@ -167,6 +167,15 @@ Recipes are bite-sized bite-sized, actionable examples of how to use specific Py :link: ../recipes/android_native_app_with_custom_op.html :tags: Mobile +.. Automatic Mixed Precision + +.. customcarditem:: + :header: Automatic Mixed Precision + :card_description: Use `torch.cuda.amp` to reduce runtime and save memory on NVIDIA GPUs. + :image: ../_static/img/thumbnails/cropped/amp.png + :link: ../recipes/amp_tutorial.html + :tags: Model-Optimization + .. End of tutorial card section .. 
raw:: html From d48288ac85c935d44c592c1c9b8ae04c62df631b Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Tue, 8 Sep 2020 14:37:58 -0600 Subject: [PATCH 13/22] fdsa --- recipes_source/recipes_index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst index 0e8f0d875cc..99574c1bc4f 100644 --- a/recipes_source/recipes_index.rst +++ b/recipes_source/recipes_index.rst @@ -171,7 +171,7 @@ Recipes are bite-sized bite-sized, actionable examples of how to use specific Py .. customcarditem:: :header: Automatic Mixed Precision - :card_description: Use `torch.cuda.amp` to reduce runtime and save memory on NVIDIA GPUs. + :card_description: Use ``torch.cuda.amp`` to reduce runtime and save memory on NVIDIA GPUs. :image: ../_static/img/thumbnails/cropped/amp.png :link: ../recipes/amp_tutorial.html :tags: Model-Optimization From 58a403d3624b505d0bc0d553b9da42a43975ffb4 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Tue, 8 Sep 2020 15:07:36 -0600 Subject: [PATCH 14/22] add amp_tutorial to toctree --- recipes_source/recipes_index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst index cb93a4dd00e..9e379ec4361 100644 --- a/recipes_source/recipes_index.rst +++ b/recipes_source/recipes_index.rst @@ -208,6 +208,7 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu /recipes/recipes/Captum_Recipe /recipes/recipes/tensorboard_with_pytorch /recipes/recipes/dynamic_quantization + /recipes/recipes/amp_tutorial /recipes/torchscript_inference /recipes/deployment_with_flask /recipes/distributed_rpc_profiling From 641e7a593a4d88758d929c5d97c2891c6003a923 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Tue, 8 Sep 2020 15:10:41 -0600 Subject: [PATCH 15/22] amp_tutorial -> amp_recipe --- recipes_source/recipes/README.txt | 4 ++-- recipes_source/recipes/{amp_tutorial.py => amp_recipe.py} | 0 recipes_source/recipes_index.rst | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) rename recipes_source/recipes/{amp_tutorial.py => amp_recipe.py} (100%) diff --git a/recipes_source/recipes/README.txt b/recipes_source/recipes/README.txt index 8ca89860ba4..a182b0a11c5 100644 --- a/recipes_source/recipes/README.txt +++ b/recipes_source/recipes/README.txt @@ -57,6 +57,6 @@ PyTorch Recipes PyTorch Mobile Performance Recipes https://pytorch.org/tutorials/recipes/mobile_perf.html -15. amp_tutorial.py +15. amp_recipe.py Automatic Mixed Precision - https://pytorch.org/tutorials/recipes/amp_tutorial.html + https://pytorch.org/tutorials/recipes/amp_recipe.html diff --git a/recipes_source/recipes/amp_tutorial.py b/recipes_source/recipes/amp_recipe.py similarity index 100% rename from recipes_source/recipes/amp_tutorial.py rename to recipes_source/recipes/amp_recipe.py diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst index 9e379ec4361..6f4368d63e8 100644 --- a/recipes_source/recipes_index.rst +++ b/recipes_source/recipes_index.rst @@ -173,7 +173,7 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu :header: Automatic Mixed Precision :card_description: Use ``torch.cuda.amp`` to reduce runtime and save memory on NVIDIA GPUs. :image: ../_static/img/thumbnails/cropped/amp.png - :link: ../recipes/amp_tutorial.html + :link: ../recipes/amp_recipe.html :tags: Model-Optimization .. 
End of tutorial card section @@ -208,7 +208,7 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu /recipes/recipes/Captum_Recipe /recipes/recipes/tensorboard_with_pytorch /recipes/recipes/dynamic_quantization - /recipes/recipes/amp_tutorial + /recipes/recipes/amp_recipe /recipes/torchscript_inference /recipes/deployment_with_flask /recipes/distributed_rpc_profiling From 3f5f6cadc82770f6b6079764dfb579c24808a2bf Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Tue, 8 Sep 2020 15:20:30 -0600 Subject: [PATCH 16/22] looks like backtick highlights dont render in card_description --- recipes_source/recipes_index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst index 6f4368d63e8..cb857ce53c6 100644 --- a/recipes_source/recipes_index.rst +++ b/recipes_source/recipes_index.rst @@ -171,7 +171,7 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu .. customcarditem:: :header: Automatic Mixed Precision - :card_description: Use ``torch.cuda.amp`` to reduce runtime and save memory on NVIDIA GPUs. + :card_description: Use torch.cuda.amp to reduce runtime and save memory on NVIDIA GPUs. :image: ../_static/img/thumbnails/cropped/amp.png :link: ../recipes/amp_recipe.html :tags: Model-Optimization From 8820068788bebc23397daa6c45a3ba83169dd155 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Tue, 8 Sep 2020 15:29:23 -0600 Subject: [PATCH 17/22] correct path for amp_recipe.html --- recipes_source/recipes_index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst index cb857ce53c6..f8986363092 100644 --- a/recipes_source/recipes_index.rst +++ b/recipes_source/recipes_index.rst @@ -173,7 +173,7 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu :header: Automatic Mixed Precision :card_description: Use torch.cuda.amp to reduce runtime and save memory on NVIDIA GPUs. :image: ../_static/img/thumbnails/cropped/amp.png - :link: ../recipes/amp_recipe.html + :link: ../recipes/recipes/amp_recipe.html :tags: Model-Optimization .. End of tutorial card section From ac602d647ab4e5e5728cca383665d779a55add78 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Fri, 11 Sep 2020 12:37:29 -0600 Subject: [PATCH 18/22] arch notes and saving/restoring --- recipes_source/recipes/amp_recipe.py | 37 ++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/recipes_source/recipes/amp_recipe.py b/recipes_source/recipes/amp_recipe.py index d09b9ddbe23..8dcd9b2097a 100644 --- a/recipes_source/recipes/amp_recipe.py +++ b/recipes_source/recipes/amp_recipe.py @@ -20,6 +20,11 @@ You may download and run this tutorial as a standalone Python script. The only requirements are Pytorch 1.6+ and a CUDA-capable GPU. + +Mixed precision primarily benefits Tensor Core-enabled architectures (Volta, Turing, Ampere). +This recipe should show significant (2-3X) speedup on those architectures. +On earlier architectures (Kepler, Maxwell, Pascal), you may observe a modest speedup. +Run ``nvidia-smi`` to display your GPU's architecture. 
""" import torch, time, gc @@ -212,6 +217,38 @@ def make_model(in_size, out_size, num_layers): scaler.update() opt.zero_grad() +########################################################## +# Saving/Resuming +# ---------------- +# To save/resume Amp-enabled runs with bitwise accuracy, use +# `scaler.state_dict `_ and +# `scaler.load_state_dict `_. +# +# When saving, save the scaler state dict alongside the usual model and optimizer state dicts. +# Do this either at the beginning of an iteration before any forward passes, or at the end of +# an iteration after ``scaler.update()``. + +checkpoint = {"model": net.state_dict(), + "optimizer": opt.state_dict(), + "scaler": scaler.state_dict()} + +# (write checkpoint as desired, e.g., ``torch.save(checkpoint, "filename")``.) +# +# When resuming, load the scaler state dict alongside the model and optimizer state dicts. +# (read checkpoint as desired, e.g., +# ``checkpoint = torch.load(args.resume, map_location = lambda storage, loc: storage.cuda(torch.cuda.current_device()))``) + +net.load_state_dict(checkpoint["model"]) +opt.load_state_dict(checkpoint["optimizer"]) +scaler.load_state_dict(checkpoint["scaler"]) + +# If a checkpoint was created from a run _without_ mixed precision, and you want to resume training _with_ mixed precision, +# load model and optimizer states from the checkpoint as usual. The checkpoint won't contain a saved scaler state, so +# use a fresh instance of ``GradScaler``. +# +# If a checkpoint was created from a run _with_ mixed precision and you want to resume training _without_ mixed precision, +# load model and optimizer states from the checkpoint as usual, and ignore the saved scaler state. + ########################################################## # Inference/Evaluation # -------------------- From 00b83bfab7d1cd0cd7c53fbba59db65c2e021bb6 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Fri, 11 Sep 2020 12:58:44 -0600 Subject: [PATCH 19/22] formatting --- recipes_source/recipes/amp_recipe.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/recipes_source/recipes/amp_recipe.py b/recipes_source/recipes/amp_recipe.py index 8dcd9b2097a..911bd336010 100644 --- a/recipes_source/recipes/amp_recipe.py +++ b/recipes_source/recipes/amp_recipe.py @@ -8,17 +8,17 @@ where some operations use the ``torch.float32`` (``float``) datatype and other operations use ``torch.float16`` (``half``). Some ops, like linear layers and convolutions, are much faster in ``float16``. Other ops, like reductions, often require the dynamic -range of ``float32``. Mixed precision tries to match each op to its appropriate datatype. +range of ``float32``. Mixed precision tries to match each op to its appropriate datatype, which can reduce your network's runtime and memory footprint. Ordinarily, "automatic mixed precision training" uses `torch.cuda.amp.autocast `_ and `torch.cuda.amp.GradScaler `_ together. -This tutorial measures the performance of a simple network in default precision, +This recipe measures the performance of a simple network in default precision, then walks through adding ``autocast`` and ``GradScaler`` to run the same network in mixed precision with improved performance. -You may download and run this tutorial as a standalone Python script. +You may download and run this recipe as a standalone Python script. The only requirements are Pytorch 1.6+ and a CUDA-capable GPU. Mixed precision primarily benefits Tensor Core-enabled architectures (Volta, Turing, Ampere). 
@@ -62,7 +62,7 @@ def make_model(in_size, out_size, num_layers): ########################################################## # ``batch_size``, ``in_size``, ``out_size``, and ``num_layers`` are chosen to be large enough to saturate the GPU with work. -# Typically, mixed precision provides the greatest speedup when GPU is saturated. +# Typically, mixed precision provides the greatest speedup when the GPU is saturated. # Small networks may be CPU bound, in which case mixed precision won't improve performance. # Sizes are also chosen such that linear layers' participating dimensions are multiples of 8, # to permit Tensor Core usage on Tensor Core-capable GPUs (see :ref:`Troubleshooting` below). @@ -87,7 +87,7 @@ def make_model(in_size, out_size, num_layers): ########################################################## # Default Precision # ----------------- -# Without torch.cuda.amp, the following simple network executes all ops in default precision (``torch.float32``): +# Without ``torch.cuda.amp``, the following simple network executes all ops in default precision (``torch.float32``): net = make_model(in_size, out_size, num_layers) opt = torch.optim.SGD(net.parameters(), lr=0.001) @@ -139,7 +139,8 @@ def make_model(in_size, out_size, num_layers): # helps prevent gradients with small magnitudes from flushing to zero # ("underflowing") when training with mixed precision. # -# ``torch.cuda.amp.GradScaler`` performs the steps of gradient scaling conveniently. +# `torch.cuda.amp.GradScaler `_ +# performs the steps of gradient scaling conveniently. # Constructs scaler once, at the beginning of the convergence run, using default args. # If your network fails to converge with default GradScaler args, please file an issue. @@ -170,9 +171,9 @@ def make_model(in_size, out_size, num_layers): ########################################################## # All together ("Automatic Mixed Precision") # ------------------------------------------ -# The following also demonstrates ``enabled``, an optional convenience argument to ``autocast`` and ``GradScaler``. +# (The following also demonstrates ``enabled``, an optional convenience argument to ``autocast`` and ``GradScaler``. # If False, ``autocast`` and ``GradScaler``\ 's calls become no-ops. -# This allows switching between default precision and mixed precision without if/else statements. +# This allows switching between default precision and mixed precision without if/else statements.) use_amp = True @@ -196,8 +197,8 @@ def make_model(in_size, out_size, num_layers): # Inspecting/modifying gradients (e.g., clipping) # -------------------------------------------------------- # All gradients produced by ``scaler.scale(loss).backward()`` are scaled. If you wish to modify or inspect -# the parameters' ``.grad`` attributes between ``backward()`` and ``scaler.step(optimizer)``, you should -# unscale them first using ``scaler.unscale_(optimizer)``. +# the parameters' ``.grad`` attributes between ``backward()`` and ``scaler.step(optimizer)``, you should +# unscale them first using `scaler.unscale_(optimizer) `_. for epoch in range(0): # 0 epochs, this section is for illustration only for input, target in zip(data, targets): @@ -232,6 +233,7 @@ def make_model(in_size, out_size, num_layers): "optimizer": opt.state_dict(), "scaler": scaler.state_dict()} +########################################################## # (write checkpoint as desired, e.g., ``torch.save(checkpoint, "filename")``.) 
# # When resuming, load the scaler state dict alongside the model and optimizer state dicts. @@ -242,11 +244,12 @@ def make_model(in_size, out_size, num_layers): opt.load_state_dict(checkpoint["optimizer"]) scaler.load_state_dict(checkpoint["scaler"]) -# If a checkpoint was created from a run _without_ mixed precision, and you want to resume training _with_ mixed precision, +########################################################## +# If a checkpoint was created from a run *without* Amp, and you want to resume training *with* Amp, # load model and optimizer states from the checkpoint as usual. The checkpoint won't contain a saved scaler state, so # use a fresh instance of ``GradScaler``. # -# If a checkpoint was created from a run _with_ mixed precision and you want to resume training _without_ mixed precision, +# If a checkpoint was created from a run *with* Amp and you want to resume training *without* Amp, # load model and optimizer states from the checkpoint as usual, and ignore the saved scaler state. ########################################################## From 85ab17c3d6485e8e1b118de33549fd79623d23ec Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Fri, 11 Sep 2020 14:55:39 -0600 Subject: [PATCH 20/22] fdsa --- recipes_source/recipes/amp_recipe.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/recipes_source/recipes/amp_recipe.py b/recipes_source/recipes/amp_recipe.py index 911bd336010..90f5ab2c77a 100644 --- a/recipes_source/recipes/amp_recipe.py +++ b/recipes_source/recipes/amp_recipe.py @@ -169,7 +169,7 @@ def make_model(in_size, out_size, num_layers): opt.zero_grad() ########################################################## -# All together ("Automatic Mixed Precision") +# All together: "Automatic Mixed Precision" # ------------------------------------------ # (The following also demonstrates ``enabled``, an optional convenience argument to ``autocast`` and ``GradScaler``. # If False, ``autocast`` and ``GradScaler``\ 's calls become no-ops. @@ -270,6 +270,9 @@ def make_model(in_size, out_size, num_layers): # * Multiple GPUs (``torch.nn.DataParallel`` or ``torch.nn.parallel.DistributedDataParallel``) # * Custom autograd functions (subclasses of ``torch.autograd.Function``) # +# If you perform multiple convergence runs in the same script, each run should use +# a dedicated fresh GradScaler instance. GradScaler instances are lightweight. +# # If you're registering a custom C++ op with the dispatcher, see the # `autocast section `_ # of the dispatcher tutorial. From a824b85100e8200f426c87a254c55e56ec668e63 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Fri, 11 Sep 2020 15:38:34 -0600 Subject: [PATCH 21/22] Clarify autograd-autocast interaction for custom ops --- advanced_source/dispatcher.rst | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/advanced_source/dispatcher.rst b/advanced_source/dispatcher.rst index 23ba0f96be1..4e895507cd6 100644 --- a/advanced_source/dispatcher.rst +++ b/advanced_source/dispatcher.rst @@ -105,6 +105,8 @@ speaking, the structure of your registrations will look like this: that provides implementations for all basic operators on the XLA dispatch key. +.. _autograd-support: + Adding autograd support ----------------------- @@ -299,6 +301,28 @@ the safest choice for the execution type: at::autocast::cached_cast(exec_type, t1)); } +If your custom op is :ref:`autograd-enabled`, you only need to write and register +an autocast wrapper for same name onto which the autograd wrapper is registered. 
+For example, if you wanted an autocast wrapper for the ``myadd`` function shown +in the autograd section, all you'd need is + +.. code-block:: cpp + + Tensor myadd_autocast(const Tensor& self, const Tensor& other) { + c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::Autocast); + return myadd(at::autocast::cached_cast(, self), + at::autocast::cached_cast(, other)); + } + + TORCH_LIBRARY_IMPL(myops, Autocast, m) { + m.impl("myadd", myadd_autocast); + } + +There are no separate gymnastics to make the backward method autocast compatible. +However, the backward method defined in your custom autograd function will run in the same +dtype as autocast sets for the forward method, so you should choose a ```` +suitable for both your forward and backward methods. + Batched ^^^^^^^ From 3e9815a77a8740795838a003b1279298970d8459 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Sat, 12 Sep 2020 15:20:45 -0600 Subject: [PATCH 22/22] touchups --- advanced_source/dispatcher.rst | 2 +- recipes_source/recipes/amp_recipe.py | 18 ++++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/advanced_source/dispatcher.rst b/advanced_source/dispatcher.rst index 4e895507cd6..4f3b52fea32 100644 --- a/advanced_source/dispatcher.rst +++ b/advanced_source/dispatcher.rst @@ -302,7 +302,7 @@ the safest choice for the execution type: } If your custom op is :ref:`autograd-enabled`, you only need to write and register -an autocast wrapper for same name onto which the autograd wrapper is registered. +an autocast wrapper for the same name onto which the autograd wrapper is registered. For example, if you wanted an autocast wrapper for the ``myadd`` function shown in the autograd section, all you'd need is diff --git a/recipes_source/recipes/amp_recipe.py b/recipes_source/recipes/amp_recipe.py index 90f5ab2c77a..c1ec52a3883 100644 --- a/recipes_source/recipes/amp_recipe.py +++ b/recipes_source/recipes/amp_recipe.py @@ -166,7 +166,7 @@ def make_model(in_size, out_size, num_layers): # Updates the scale for next iteration. scaler.update() - opt.zero_grad() + opt.zero_grad() # set_to_none=True here can modestly improve performance ########################################################## # All together: "Automatic Mixed Precision" @@ -190,7 +190,7 @@ def make_model(in_size, out_size, num_layers): scaler.scale(loss).backward() scaler.step(opt) scaler.update() - opt.zero_grad() + opt.zero_grad() # set_to_none=True here can modestly improve performance end_timer_and_print("Mixed precision:") ########################################################## @@ -216,7 +216,7 @@ def make_model(in_size, out_size, num_layers): scaler.step(opt) scaler.update() - opt.zero_grad() + opt.zero_grad() # set_to_none=True here can modestly improve performance ########################################################## # Saving/Resuming @@ -232,14 +232,16 @@ def make_model(in_size, out_size, num_layers): checkpoint = {"model": net.state_dict(), "optimizer": opt.state_dict(), "scaler": scaler.state_dict()} +# Write checkpoint as desired, e.g., +# torch.save(checkpoint, "filename") ########################################################## -# (write checkpoint as desired, e.g., ``torch.save(checkpoint, "filename")``.) -# # When resuming, load the scaler state dict alongside the model and optimizer state dicts. 
-# (read checkpoint as desired, e.g., -# ``checkpoint = torch.load(args.resume, map_location = lambda storage, loc: storage.cuda(torch.cuda.current_device()))``) +# Read checkpoint as desired, e.g., +# dev = torch.cuda.current_device() +# checkpoint = torch.load("filename", +# map_location = lambda storage, loc: storage.cuda(dev)) net.load_state_dict(checkpoint["model"]) opt.load_state_dict(checkpoint["optimizer"]) scaler.load_state_dict(checkpoint["scaler"]) @@ -294,7 +296,7 @@ def make_model(in_size, out_size, num_layers): # 2. Your network may be GPU compute bound (lots of matmuls/convolutions) but your GPU does not have Tensor Cores. # In this case a reduced speedup is expected. # 3. Matmul dimensions are not Tensor Core-friendly. Make sure matmuls' participating sizes are multiples of 8. -# (For NLP models with encoders/decoders, this can be subtle. Also. convolutions used to have similar size constraints +# (For NLP models with encoders/decoders, this can be subtle. Also, convolutions used to have similar size constraints # for Tensor Core use, but for CuDNN versions 7.3 and later, no such constraints exist. See # `here `_ for guidance.) #
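
Taken together, the changes in this series amount to the following usage pattern: run the forward pass under autocast, scale the loss for backward, unscale before clipping, step/update through the scaler, and checkpoint the scaler state alongside the model and optimizer. The snippet below is a minimal, self-contained sketch, not part of any patch above; the toy model, sizes, and checkpoint filename are illustrative assumptions, and it requires PyTorch 1.6+ and a CUDA-capable GPU.

    import torch

    device = "cuda"
    model = torch.nn.Linear(4096, 4096).to(device)      # toy model, illustrative only
    opt = torch.optim.SGD(model.parameters(), lr=0.001)
    loss_fn = torch.nn.MSELoss()
    scaler = torch.cuda.amp.GradScaler()

    data = [torch.randn(256, 4096, device=device) for _ in range(8)]
    targets = [torch.randn(256, 4096, device=device) for _ in range(8)]

    for input, target in zip(data, targets):
        # Forward pass runs in mixed precision under autocast.
        with torch.cuda.amp.autocast():
            output = model(input)
            loss = loss_fn(output, target)

        # Backward pass on the scaled loss; unscale before clipping so the
        # threshold applies to the true gradient magnitudes.
        scaler.scale(loss).backward()
        scaler.unscale_(opt)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # step() skips the update if the gradients contain infs/NaNs;
        # update() then adjusts the scale for the next iteration.
        scaler.step(opt)
        scaler.update()
        opt.zero_grad()

    # Checkpoint the scaler alongside the model and optimizer so a resumed
    # run continues with the same scale. The filename is a placeholder.
    checkpoint = {"model": model.state_dict(),
                  "optimizer": opt.state_dict(),
                  "scaler": scaler.state_dict()}
    torch.save(checkpoint, "amp_checkpoint.pt")

    checkpoint = torch.load("amp_checkpoint.pt", map_location=device)
    model.load_state_dict(checkpoint["model"])
    opt.load_state_dict(checkpoint["optimizer"])
    scaler.load_state_dict(checkpoint["scaler"])

Calling scaler.unscale_(opt) once per iteration, just before scaler.step(opt), is what lets gradient clipping see unscaled gradients while GradScaler still skips any step whose gradients overflowed.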