From 42c044dc4d4c04f5ca6a561c4193f9c4f260fb5e Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Wed, 19 Aug 2020 13:44:38 -0600 Subject: [PATCH 01/22] fdsa --- index.rst | 1 + .../mixed_precision_tutorial.py | 53 +++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 intermediate_source/mixed_precision_tutorial.py diff --git a/index.rst b/index.rst index a5ad877b0f4..98d8d275503 100644 --- a/index.rst +++ b/index.rst @@ -500,6 +500,7 @@ Additional Resources :hidden: :caption: Frontend APIs + intermediate/mixed_precision_tutorial intermediate/named_tensor_tutorial intermediate/memory_format_tutorial advanced/cpp_frontend diff --git a/intermediate_source/mixed_precision_tutorial.py b/intermediate_source/mixed_precision_tutorial.py new file mode 100644 index 00000000000..8090cb3dfd0 --- /dev/null +++ b/intermediate_source/mixed_precision_tutorial.py @@ -0,0 +1,53 @@ +# -*- coding: utf-8 -*- +""" +Automatic Mixed Precision in PyTorch +******************************************************* +**Author**: `Michael Carilli `_ + +``torch.cuda.amp`` provides convenience methods for mixed precision, +where some operations use the ``torch.float32`` (``float``) datatype and other operations +use ``torch.float16`` (``half``). Some ops, like linear layers and convolutions, +are much faster in ``float16``. Other ops, like reductions, often require the dynamic +range of ``float32``. Mixed precision tries to match each op to its appropriate datatype. +which can reduce your network's runtime and memory footprint. + +Ordinarily, "automatic mixed precision training" uses :class:`torch.cuda.amp.autocast` and +:class:`torch.cuda.amp.GradScaler` together. +Here we'll walk through adding ``autocast`` and ``GradScaler`` to a toy network. +First we'll cover typical use, then describe more advanced cases. + +.. contents:: :local: +""" + +###################################################################### +# Without torch.cuda.amp, the following simple network executes all +# ops in default precision (torch.float32): + +import torch + +###################################################################### +# Adding autocast +# --------------- +# + + +###################################################################### +# Adding GradScaler +# ----------------- +# + + + + + + + +###################################################################### +# Advanced topics +# --------------- +# + + + +# +# know by creating `an issue `_. From 1122d9b9021da7f898b32cf619f28b046e61662b Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Tue, 25 Aug 2020 11:14:56 -0600 Subject: [PATCH 02/22] Tutorial runs --- .../mixed_precision_tutorial.py | 182 +++++++++++++++++- 1 file changed, 174 insertions(+), 8 deletions(-) diff --git a/intermediate_source/mixed_precision_tutorial.py b/intermediate_source/mixed_precision_tutorial.py index 8090cb3dfd0..54b6c89e528 100644 --- a/intermediate_source/mixed_precision_tutorial.py +++ b/intermediate_source/mixed_precision_tutorial.py @@ -4,50 +4,216 @@ ******************************************************* **Author**: `Michael Carilli `_ -``torch.cuda.amp`` provides convenience methods for mixed precision, +`torch.cuda.amp `_ provides convenience methods for mixed precision, where some operations use the ``torch.float32`` (``float``) datatype and other operations use ``torch.float16`` (``half``). Some ops, like linear layers and convolutions, are much faster in ``float16``. Other ops, like reductions, often require the dynamic range of ``float32``. 
Mixed precision tries to match each op to its appropriate datatype. which can reduce your network's runtime and memory footprint. -Ordinarily, "automatic mixed precision training" uses :class:`torch.cuda.amp.autocast` and -:class:`torch.cuda.amp.GradScaler` together. +Ordinarily, "automatic mixed precision training" uses `torch.cuda.amp.autocast `_ and +`torch.cuda.amp.GradScaler `_ together. Here we'll walk through adding ``autocast`` and ``GradScaler`` to a toy network. First we'll cover typical use, then describe more advanced cases. .. contents:: :local: """ +import time, gc + +_start_time = None + +def start_timer(): + global _start_time + gc.collect() + torch.cuda.empty_cache() + torch.cuda.reset_max_memory_allocated() + torch.cuda.synchronize() + _start_time = time.time() + +def end_timer_and_print(local_msg): + torch.cuda.synchronize() + print(local_msg) + print("Total execution time = {:.3f} sec".format(time.time() - _start_time)) + print("Max memory used by tensors = {} bytes".format(torch.cuda.max_memory_allocated())) + +def make_model(in_size, out_size, num_layers): + layers = [] + for _ in range(num_layers - 1): + layers.append(torch.nn.Linear(in_size, in_size)) + layers.append(torch.nn.ReLU()) + layers.append(torch.nn.Linear(in_size, out_size)) + return torch.nn.Sequential(*tuple(layers)).cuda() + ###################################################################### -# Without torch.cuda.amp, the following simple network executes all +# Without torch.cuda.amp, the following simple "network" executes all # ops in default precision (torch.float32): import torch +# batch_size, in_size, out_size, and num_layers are chosen to be large enough to saturate the GPU. +# Typically, mixed precision provides the greatest speedup when GPU is working hard. +# Small networks may be CPU bound, in which case mixed precision won't improve performance. +# Sizes are also chosen such that the linear layers' participating dimensions are multiples of 8, +# to permit Tensor Core usage on Tensor Core-capable GPUs. +# See :ref:`Troubleshooting `. +# Exercise: Vary participating sizes and see how the mixed precision speedup changes. +batch_size = 256 # Try, for example, 32, 33 +in_size = 4096 +out_size = 4096 +num_layers = 6 +num_batches = 128 +epochs = 3 + +data = [torch.randn(batch_size, in_size, device="cuda") for _ in range(num_batches)] +targets = [torch.randn(batch_size, out_size, device="cuda") for _ in range(num_batches)] +loss_fn = torch.nn.MSELoss().cuda() + +net = make_model(in_size, out_size, num_layers) +opt = torch.optim.SGD(net.parameters(), lr=0.001) + +start_timer() +for epoch in range(epochs): + for input, target in zip(data, targets): + output = net(input) + loss = loss_fn(output, target) + loss.backward() + opt.step() + opt.zero_grad() # set_to_none=True here can modestly improve performance +end_timer_and_print("With default precision:") + ###################################################################### # Adding autocast # --------------- # - +for epoch in range(epochs): + for input, target in zip(data, targets): + # Runs the forward pass under autocast + with torch.cuda.amp.autocast(): + output = net(input) + # Linear layers with ``float32`` inputs `autocast to float16 `_ + assert output.dtype is torch.float16 + + loss = loss_fn(output, target) + # ``mse_loss`` layers with ``float16`` inputs `autocast to float32 `_ + assert loss.dtype is torch.float32 + + # Exits autocast before backward(). + # Backward passes under autocast are not recommended. 
+ # Backward ops run in the same dtype autocast chose for corresponding forward ops. + loss.backward() + opt.step() + opt.zero_grad() # set_to_none=True here can modestly improve performance ###################################################################### # Adding GradScaler # ----------------- -# +# +# See `Gradient Scaling `_ +# for a full explanation of each step. +# Constructs scaler with default args, which are effective for most networks. +# If your network fails to converge with default GradScaler args, please file an issue. +scaler = torch.cuda.amp.GradScaler() +for epoch in range(epochs): + for input, target in zip(data, targets): + with torch.cuda.amp.autocast(): + output = net(input) + loss = loss_fn(output, target) + # Scales loss. Calls backward() on scaled loss to create scaled gradients. + scaler.scale(loss).backward() + # scaler.step() first unscales the gradients of the optimizer's assigned params. + # If these gradients do not contain infs or NaNs, optimizer.step() is then called, + # otherwise, optimizer.step() is skipped. + scaler.step(opt) + # Updates the scale for next iteration. + scaler.update() + opt.zero_grad() ###################################################################### -# Advanced topics +# All together +# ------------ + +net = make_model(in_size, out_size, num_layers) +opt = torch.optim.SGD(net.parameters(), lr=0.001) +scaler = torch.cuda.amp.GradScaler() + +start_timer() +for epoch in range(epochs): + for input, target in zip(data, targets): + with torch.cuda.amp.autocast(): + output = net(input) + loss = loss_fn(output, target) + scaler.scale(loss).backward() + scaler.step(opt) + scaler.update() + opt.zero_grad() +end_timer_and_print("With mixed precision:") + + +###################################################################### +# Inspecting/modifying gradients (e.g., gradient clipping) # --------------- # +# All gradients produced by ``scaler.scale(loss).backward()`` are scaled. If you wish to modify or inspect +# the parameters' ``.grad`` attributes between ``backward()`` and ``scaler.step(optimizer)``, you should +# unscale them first using `scaler.unscale_(optimizer)`. +for epoch in range(epochs): + for input, target in zip(data, targets): + with torch.cuda.amp.autocast(): + output = net(input) + loss = loss_fn(output, target) + scaler.scale(loss).backward() + # Unscales the gradients of optimizer's assigned params in-place + scaler.unscale_(opt) + # Since the gradients of optimizer's assigned params are now unscaled, clips as usual. + # You may use the same value for max_norm here as you would without gradient scaling. + torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=0.1) + + scaler.step(opt) + scaler.update() + opt.zero_grad() + +###################################################################### +# Advanced topics +# --------------- +# +# See the `Automatic Mixed Precision Examples `_ for advanced use cases including: +# * Gradient penalty/double backward +# * Networks with multiple models, optimizers, or losses +# * Multiple GPUs (``torch.nn.DataParallel`` or ``torch.nn.parallel.DistributedDataParallel``) +# * Custom autograd functions (subclasses of ``torch.autograd.Function``) + +###################################################################### +# Troubleshooting +# --------------- +# +# Speedup with Amp is minor +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. Your network may not be saturating the GPU(s) with work, and is therefore CPU bound. Amp's effect on GPU performance +# won't matter. 
A rough rule of thumb to saturate the GPU is to increase batch and/or network size(s) +# as much as you can without running OOM. Also, try to avoid excessive CPU-GPU synchronization (``.item()`` calls, or +# printing values from CUDA tensors), and try to avoid sequences of many small CUDA ops (coalesce these into a few +# large CUDA ops if you can). +# 2. Your network may be compute bound (lots of matmuls/convolutions) but your GPU does not have Tensor Cores. +# In this case a more modest speedup is expected. +# 3. Matmul dimensions are not Tensor Core-friendly. Make sure matmuls' participating sizes are multiples of 8. +# (For NLP models with encoders/decoders, this can be subtle. Also. convolutions used to have similar size constraints +# for Tensor Core use, but for CuDNN versions 7.3 and later, no such constraints exist. See `here ` for details). +# # -# know by creating `an issue `_. +# Loss is inf/NaN +# ~~~~~~~~~~~~~~~ +# First, check if your network fits an advanced use case in the `Automatic Mixed Precision Examples `_. +# If you're confident your Amp usage is correct, you may need to file an issue, but before doing so, it's helpful to gather the following information: +# 1. Try disabling ``autocast`` or ``GradScaler`` individually (by passing ``enabled=False`` to their constructor) and see if inf/NaN persist. +# 2. ??? +# 3. profit From e7426143e8e4639c2344ca53193437a1b5a0a7ea Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Tue, 25 Aug 2020 11:30:58 -0600 Subject: [PATCH 03/22] clarify one scaler per convergence run --- intermediate_source/mixed_precision_tutorial.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/intermediate_source/mixed_precision_tutorial.py b/intermediate_source/mixed_precision_tutorial.py index 54b6c89e528..41ed77ebd84 100644 --- a/intermediate_source/mixed_precision_tutorial.py +++ b/intermediate_source/mixed_precision_tutorial.py @@ -112,8 +112,11 @@ def make_model(in_size, out_size, num_layers): # See `Gradient Scaling `_ # for a full explanation of each step. -# Constructs scaler with default args, which are effective for most networks. +# Constructs scaler once, at the beginning of the convergence run, using default args. # If your network fails to converge with default GradScaler args, please file an issue. +# The same GradScaler instance should be used for the entire convergence run. +# If you perform multiple convergence runs in the same script, each run should use +# a dedicated fresh GradScaler instance. GradScaler instances are lightweight. scaler = torch.cuda.amp.GradScaler() for epoch in range(epochs): From d5890ae5204017a53b75740400cc566d532fa77d Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Tue, 25 Aug 2020 17:37:53 -0600 Subject: [PATCH 04/22] adjust sizes, dont run illustrative sections --- .../mixed_precision_tutorial.py | 47 ++++++++++--------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/intermediate_source/mixed_precision_tutorial.py b/intermediate_source/mixed_precision_tutorial.py index 41ed77ebd84..710f00e3552 100644 --- a/intermediate_source/mixed_precision_tutorial.py +++ b/intermediate_source/mixed_precision_tutorial.py @@ -19,22 +19,23 @@ .. 
contents:: :local: """ -import time, gc +import torch, time, gc -_start_time = None +start_time = None def start_timer(): - global _start_time + global start_time gc.collect() torch.cuda.empty_cache() torch.cuda.reset_max_memory_allocated() torch.cuda.synchronize() - _start_time = time.time() + start_time = time.time() def end_timer_and_print(local_msg): torch.cuda.synchronize() - print(local_msg) - print("Total execution time = {:.3f} sec".format(time.time() - _start_time)) + end_time = time.time() + print("\n" + local_msg) + print("Total execution time = {:.3f} sec".format(end_time - start_time)) print("Max memory used by tensors = {} bytes".format(torch.cuda.max_memory_allocated())) def make_model(in_size, out_size, num_layers): @@ -45,30 +46,32 @@ def make_model(in_size, out_size, num_layers): layers.append(torch.nn.Linear(in_size, out_size)) return torch.nn.Sequential(*tuple(layers)).cuda() -###################################################################### -# Without torch.cuda.amp, the following simple "network" executes all -# ops in default precision (torch.float32): - -import torch - # batch_size, in_size, out_size, and num_layers are chosen to be large enough to saturate the GPU. # Typically, mixed precision provides the greatest speedup when GPU is working hard. # Small networks may be CPU bound, in which case mixed precision won't improve performance. -# Sizes are also chosen such that the linear layers' participating dimensions are multiples of 8, -# to permit Tensor Core usage on Tensor Core-capable GPUs. -# See :ref:`Troubleshooting `. +# Sizes are also chosen such that linear layers' participating dimensions are multiples of 8, +# to permit Tensor Core usage on Tensor Core-capable GPUs (see :ref:`Troubleshooting `). +# # Exercise: Vary participating sizes and see how the mixed precision speedup changes. -batch_size = 256 # Try, for example, 32, 33 +batch_size = 512 # Try, for example, 128, 256, 513. in_size = 4096 out_size = 4096 -num_layers = 6 -num_batches = 128 +num_layers = 3 +num_batches = 50 epochs = 3 +# Creates data in default precision. The same data is used for both default and mixed precision trials below. +# You don't need to manually change the type of input data when enabling mixed precision. data = [torch.randn(batch_size, in_size, device="cuda") for _ in range(num_batches)] targets = [torch.randn(batch_size, out_size, device="cuda") for _ in range(num_batches)] loss_fn = torch.nn.MSELoss().cuda() +###################################################################### +# Default Precision (Baseline) +# ---------------------------- +# +# Without torch.cuda.amp, the following simple network executes all ops in default precision (torch.float32): + net = make_model(in_size, out_size, num_layers) opt = torch.optim.SGD(net.parameters(), lr=0.001) @@ -86,7 +89,7 @@ def make_model(in_size, out_size, num_layers): # Adding autocast # --------------- # -for epoch in range(epochs): +for epoch in range(0): # 0 epochs, this section is for illustration only for input, target in zip(data, targets): # Runs the forward pass under autocast with torch.cuda.amp.autocast(): @@ -119,7 +122,7 @@ def make_model(in_size, out_size, num_layers): # a dedicated fresh GradScaler instance. GradScaler instances are lightweight. 
scaler = torch.cuda.amp.GradScaler() -for epoch in range(epochs): +for epoch in range(0): # 0 epochs, this section is for illustration only for input, target in zip(data, targets): with torch.cuda.amp.autocast(): output = net(input) @@ -161,13 +164,13 @@ def make_model(in_size, out_size, num_layers): ###################################################################### # Inspecting/modifying gradients (e.g., gradient clipping) -# --------------- +# -------------------------------------------------------- # # All gradients produced by ``scaler.scale(loss).backward()`` are scaled. If you wish to modify or inspect # the parameters' ``.grad`` attributes between ``backward()`` and ``scaler.step(optimizer)``, you should # unscale them first using `scaler.unscale_(optimizer)`. -for epoch in range(epochs): +for epoch in range(0): # 0 epochs, this section is for illustration only for input, target in zip(data, targets): with torch.cuda.amp.autocast(): output = net(input) From 4c0bdc5e3ede901c853b99da4631d5837f2cda22 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Wed, 26 Aug 2020 20:51:48 -0600 Subject: [PATCH 05/22] satisfying ocd --- .../mixed_precision_tutorial.py | 55 +++++++++++-------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/intermediate_source/mixed_precision_tutorial.py b/intermediate_source/mixed_precision_tutorial.py index 710f00e3552..457d3ac8200 100644 --- a/intermediate_source/mixed_precision_tutorial.py +++ b/intermediate_source/mixed_precision_tutorial.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ Automatic Mixed Precision in PyTorch -******************************************************* +************************************ **Author**: `Michael Carilli `_ `torch.cuda.amp `_ provides convenience methods for mixed precision, @@ -66,10 +66,9 @@ def make_model(in_size, out_size, num_layers): targets = [torch.randn(batch_size, out_size, device="cuda") for _ in range(num_batches)] loss_fn = torch.nn.MSELoss().cuda() -###################################################################### +############################## # Default Precision (Baseline) # ---------------------------- -# # Without torch.cuda.amp, the following simple network executes all ops in default precision (torch.float32): net = make_model(in_size, out_size, num_layers) @@ -85,20 +84,27 @@ def make_model(in_size, out_size, num_layers): opt.zero_grad() # set_to_none=True here can modestly improve performance end_timer_and_print("With default precision:") -###################################################################### +################# # Adding autocast # --------------- +# Instances of `torch.cuda.amp.autocast `_ serve as context managers that allow regions of your script to run +# in mixed precision. # +# In these regions, CUDA ops run in a dtype chosen by autocast +# to improve performance while maintaining accuracy. +# See the :ref:`Autocast Op Reference` for details on what precision +# autocast chooses for each op, and under what circumstances. + for epoch in range(0): # 0 epochs, this section is for illustration only for input, target in zip(data, targets): - # Runs the forward pass under autocast - with torch.cuda.amp.autocast(): + # Runs the forward pass under autocast. + with torch.cuda.amp.autocast(enabled=try_amp): output = net(input) - # Linear layers with ``float32`` inputs `autocast to float16 `_ + # output is float16 because linear layers autocast to float16. 
assert output.dtype is torch.float16 loss = loss_fn(output, target) - # ``mse_loss`` layers with ``float16`` inputs `autocast to float32 `_ + # loss is float32 because mse_loss layers autocast to float32. assert loss.dtype is torch.float32 # Exits autocast before backward(). @@ -108,12 +114,14 @@ def make_model(in_size, out_size, num_layers): opt.step() opt.zero_grad() # set_to_none=True here can modestly improve performance -###################################################################### +################### # Adding GradScaler # ----------------- +# `Gradient scaling `_ +# helps prevent gradients with small magnitudes from flushing to zero +# ("underflowing") when training with mixed precision. # -# See `Gradient Scaling `_ -# for a full explanation of each step. +# ``torch.cuda.amp.GradScaler`` performs the steps of gradient scaling conveniently. # Constructs scaler once, at the beginning of the convergence run, using default args. # If your network fails to converge with default GradScaler args, please file an issue. @@ -141,7 +149,7 @@ def make_model(in_size, out_size, num_layers): opt.zero_grad() -###################################################################### +############## # All together # ------------ @@ -162,13 +170,13 @@ def make_model(in_size, out_size, num_layers): end_timer_and_print("With mixed precision:") -###################################################################### +########################################################## # Inspecting/modifying gradients (e.g., gradient clipping) # -------------------------------------------------------- # # All gradients produced by ``scaler.scale(loss).backward()`` are scaled. If you wish to modify or inspect # the parameters' ``.grad`` attributes between ``backward()`` and ``scaler.step(optimizer)``, you should -# unscale them first using `scaler.unscale_(optimizer)`. +# unscale them first using ``scaler.unscale_(optimizer)``. for epoch in range(0): # 0 epochs, this section is for illustration only for input, target in zip(data, targets): @@ -188,33 +196,36 @@ def make_model(in_size, out_size, num_layers): scaler.update() opt.zero_grad() -###################################################################### +################# # Advanced topics # --------------- # # See the `Automatic Mixed Precision Examples `_ for advanced use cases including: +# # * Gradient penalty/double backward # * Networks with multiple models, optimizers, or losses # * Multiple GPUs (``torch.nn.DataParallel`` or ``torch.nn.parallel.DistributedDataParallel``) # * Custom autograd functions (subclasses of ``torch.autograd.Function``) -###################################################################### +################# # Troubleshooting # --------------- # # Speedup with Amp is minor # ~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. Your network may not be saturating the GPU(s) with work, and is therefore CPU bound. Amp's effect on GPU performance -# won't matter. A rough rule of thumb to saturate the GPU is to increase batch and/or network size(s) -# as much as you can without running OOM. Also, try to avoid excessive CPU-GPU synchronization (``.item()`` calls, or -# printing values from CUDA tensors), and try to avoid sequences of many small CUDA ops (coalesce these into a few -# large CUDA ops if you can). +# won't matter. +# +# * A rough rule of thumb to saturate the GPU is to increase batch and/or network size(s) +# as much as you can without running OOM. 
+# * Try to avoid excessive CPU-GPU synchronization (``.item()`` calls, or printing values from CUDA tensors). +# * Try to avoid sequences of many small CUDA ops (coalesce these into a few large CUDA ops if you can). # 2. Your network may be compute bound (lots of matmuls/convolutions) but your GPU does not have Tensor Cores. # In this case a more modest speedup is expected. # 3. Matmul dimensions are not Tensor Core-friendly. Make sure matmuls' participating sizes are multiples of 8. # (For NLP models with encoders/decoders, this can be subtle. Also. convolutions used to have similar size constraints -# for Tensor Core use, but for CuDNN versions 7.3 and later, no such constraints exist. See `here ` for details). -# +# for Tensor Core use, but for CuDNN versions 7.3 and later, no such constraints exist. See +# `here ` for details). # # Loss is inf/NaN # ~~~~~~~~~~~~~~~ From a5e5e2a5a81e8e590ec718253065e78a2b30a9d2 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Thu, 27 Aug 2020 02:43:31 -0600 Subject: [PATCH 06/22] MORE --- .../mixed_precision_tutorial.py | 86 +++++++++++++------ 1 file changed, 62 insertions(+), 24 deletions(-) diff --git a/intermediate_source/mixed_precision_tutorial.py b/intermediate_source/mixed_precision_tutorial.py index 457d3ac8200..f2a28404b54 100644 --- a/intermediate_source/mixed_precision_tutorial.py +++ b/intermediate_source/mixed_precision_tutorial.py @@ -13,14 +13,19 @@ Ordinarily, "automatic mixed precision training" uses `torch.cuda.amp.autocast `_ and `torch.cuda.amp.GradScaler `_ together. -Here we'll walk through adding ``autocast`` and ``GradScaler`` to a toy network. -First we'll cover typical use, then describe more advanced cases. +This tutorial measures the performance of a simple network in default precision, +then walks through adding ``autocast`` and ``GradScaler`` to run the same network in +mixed precision with improved performance. + +You may download and run this tutorial as a standalone Python script. +The only requirements are Pytorch 1.6+ and a CUDA-capable GPU. .. contents:: :local: """ import torch, time, gc +# Timing utilities start_time = None def start_timer(): @@ -38,6 +43,12 @@ def end_timer_and_print(local_msg): print("Total execution time = {:.3f} sec".format(end_time - start_time)) print("Max memory used by tensors = {} bytes".format(torch.cuda.max_memory_allocated())) +########################################################## +# A simple network +# ---------------- +# +# The following sequence of linear layers and ReLUs should show a nice speedup with mixed precision. + def make_model(in_size, out_size, num_layers): layers = [] for _ in range(num_layers - 1): @@ -46,13 +57,15 @@ def make_model(in_size, out_size, num_layers): layers.append(torch.nn.Linear(in_size, out_size)) return torch.nn.Sequential(*tuple(layers)).cuda() -# batch_size, in_size, out_size, and num_layers are chosen to be large enough to saturate the GPU. -# Typically, mixed precision provides the greatest speedup when GPU is working hard. +########################################################## +# ``batch_size``, ``in_size``, ``out_size``, and ``num_layers`` are chosen to be large enough to saturate the GPU with work. +# Typically, mixed precision provides the greatest speedup when GPU is saturated. # Small networks may be CPU bound, in which case mixed precision won't improve performance. 
# Sizes are also chosen such that linear layers' participating dimensions are multiples of 8, -# to permit Tensor Core usage on Tensor Core-capable GPUs (see :ref:`Troubleshooting `). +# to permit Tensor Core usage on Tensor Core-capable GPUs (see :ref:`Troubleshooting` below). # # Exercise: Vary participating sizes and see how the mixed precision speedup changes. + batch_size = 512 # Try, for example, 128, 256, 513. in_size = 4096 out_size = 4096 @@ -60,16 +73,18 @@ def make_model(in_size, out_size, num_layers): num_batches = 50 epochs = 3 -# Creates data in default precision. The same data is used for both default and mixed precision trials below. +# Creates data in default precision. +# The same data is used for both default and mixed precision trials below. # You don't need to manually change the type of input data when enabling mixed precision. data = [torch.randn(batch_size, in_size, device="cuda") for _ in range(num_batches)] targets = [torch.randn(batch_size, out_size, device="cuda") for _ in range(num_batches)] + loss_fn = torch.nn.MSELoss().cuda() -############################## -# Default Precision (Baseline) -# ---------------------------- -# Without torch.cuda.amp, the following simple network executes all ops in default precision (torch.float32): +########################################################## +# Default Precision +# ----------------- +# Without torch.cuda.amp, the following simple network executes all ops in default precision (``torch.float32``): net = make_model(in_size, out_size, num_layers) opt = torch.optim.SGD(net.parameters(), lr=0.001) @@ -84,7 +99,7 @@ def make_model(in_size, out_size, num_layers): opt.zero_grad() # set_to_none=True here can modestly improve performance end_timer_and_print("With default precision:") -################# +########################################################## # Adding autocast # --------------- # Instances of `torch.cuda.amp.autocast `_ serve as context managers that allow regions of your script to run @@ -114,7 +129,7 @@ def make_model(in_size, out_size, num_layers): opt.step() opt.zero_grad() # set_to_none=True here can modestly improve performance -################### +########################################################## # Adding GradScaler # ----------------- # `Gradient scaling `_ @@ -128,6 +143,7 @@ def make_model(in_size, out_size, num_layers): # The same GradScaler instance should be used for the entire convergence run. # If you perform multiple convergence runs in the same script, each run should use # a dedicated fresh GradScaler instance. GradScaler instances are lightweight. + scaler = torch.cuda.amp.GradScaler() for epoch in range(0): # 0 epochs, this section is for illustration only @@ -149,18 +165,24 @@ def make_model(in_size, out_size, num_layers): opt.zero_grad() -############## +########################################################## # All together # ------------ +# +# The following also demonstrates ``enabled``, an optional convenience argument to ``autocast`` and ``GradScaler``. +# If False, ``autocast`` and ``GradScaler``\ 's calls become no-ops. +# This allows switching between default precision and mixed precision without if/else statements. 
+ +use_amp = True net = make_model(in_size, out_size, num_layers) opt = torch.optim.SGD(net.parameters(), lr=0.001) -scaler = torch.cuda.amp.GradScaler() +scaler = torch.cuda.amp.GradScaler(enabled=use_amp) start_timer() for epoch in range(epochs): for input, target in zip(data, targets): - with torch.cuda.amp.autocast(): + with torch.cuda.amp.autocast(enabled=use_amp): output = net(input) loss = loss_fn(output, target) scaler.scale(loss).backward() @@ -169,7 +191,6 @@ def make_model(in_size, out_size, num_layers): opt.zero_grad() end_timer_and_print("With mixed precision:") - ########################################################## # Inspecting/modifying gradients (e.g., gradient clipping) # -------------------------------------------------------- @@ -196,24 +217,26 @@ def make_model(in_size, out_size, num_layers): scaler.update() opt.zero_grad() -################# +########################################################## # Advanced topics # --------------- # # See the `Automatic Mixed Precision Examples `_ for advanced use cases including: # +# * Gradient accumulation # * Gradient penalty/double backward # * Networks with multiple models, optimizers, or losses # * Multiple GPUs (``torch.nn.DataParallel`` or ``torch.nn.parallel.DistributedDataParallel``) # * Custom autograd functions (subclasses of ``torch.autograd.Function``) - -################# +# +# .. _troubleshooting: +# # Troubleshooting # --------------- # # Speedup with Amp is minor # ~~~~~~~~~~~~~~~~~~~~~~~~~ -# 1. Your network may not be saturating the GPU(s) with work, and is therefore CPU bound. Amp's effect on GPU performance +# 1. Your network may fail to saturate the GPU(s) with work, and is therefore CPU bound. Amp's effect on GPU performance # won't matter. # # * A rough rule of thumb to saturate the GPU is to increase batch and/or network size(s) @@ -225,12 +248,27 @@ def make_model(in_size, out_size, num_layers): # 3. Matmul dimensions are not Tensor Core-friendly. Make sure matmuls' participating sizes are multiples of 8. # (For NLP models with encoders/decoders, this can be subtle. Also. convolutions used to have similar size constraints # for Tensor Core use, but for CuDNN versions 7.3 and later, no such constraints exist. See -# `here ` for details). +# `here `_ for guidance.) # # Loss is inf/NaN # ~~~~~~~~~~~~~~~ # First, check if your network fits an advanced use case in the `Automatic Mixed Precision Examples `_. +# See also `Prefer binary_cross_entropy_with_logits over binary_cross_entropy `_. +# # If you're confident your Amp usage is correct, you may need to file an issue, but before doing so, it's helpful to gather the following information: -# 1. Try disabling ``autocast`` or ``GradScaler`` individually (by passing ``enabled=False`` to their constructor) and see if inf/NaN persist. -# 2. ??? -# 3. profit +# +# 1. Try disabling ``autocast`` or ``GradScaler`` individually (by passing ``enabled=False`` to their constructor) and see if infs/NaNs persist. +# 2. If you suspect some region of your network overflows (e.g., a complex loss function), run that forward region in ``float32``. +# `The autocast docstring `_'s last code snippet +# shows running a subregion in ``float32`` (by locally disabling autocast and casting the subregion's inputs). +# +# Type mismatch error (may manifest as CUDNN_STATUS_BAD_PARAM) +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Autocast tries to cover all ops that benefit from or require casting. 
The +# `ops that receive explicit coverage `_ +# are based on reasoning about numerical properties, but also on experience. +# If you see a type mismatch error in an autocast-enabled forward region or a backward pass following that region, +# it's possible autocast missed an op. +# +# Please file an issue with the error backtrace. ``export TORCH_SHOW_CPP_STACKTRACES=1`` before running your script to provide +# more fine-grained information on which backend op is failing. From 5072dd32d6e14ae6ad8316c7dc7febd4e5f3cc47 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Thu, 27 Aug 2020 10:32:27 -0600 Subject: [PATCH 07/22] fdsa --- intermediate_source/mixed_precision_tutorial.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/intermediate_source/mixed_precision_tutorial.py b/intermediate_source/mixed_precision_tutorial.py index f2a28404b54..497dcb2dd70 100644 --- a/intermediate_source/mixed_precision_tutorial.py +++ b/intermediate_source/mixed_precision_tutorial.py @@ -97,7 +97,7 @@ def make_model(in_size, out_size, num_layers): loss.backward() opt.step() opt.zero_grad() # set_to_none=True here can modestly improve performance -end_timer_and_print("With default precision:") +end_timer_and_print("Default precision:") ########################################################## # Adding autocast @@ -189,7 +189,7 @@ def make_model(in_size, out_size, num_layers): scaler.step(opt) scaler.update() opt.zero_grad() -end_timer_and_print("With mixed precision:") +end_timer_and_print("Mixed precision:") ########################################################## # Inspecting/modifying gradients (e.g., gradient clipping) From 38a0a0d0188bdce5b345b2afaa1564d16598aef6 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Thu, 27 Aug 2020 23:56:52 -0600 Subject: [PATCH 08/22] details --- .../mixed_precision_tutorial.py | 33 +++++++++++-------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/intermediate_source/mixed_precision_tutorial.py b/intermediate_source/mixed_precision_tutorial.py index 497dcb2dd70..e7fb23d6d32 100644 --- a/intermediate_source/mixed_precision_tutorial.py +++ b/intermediate_source/mixed_precision_tutorial.py @@ -46,8 +46,7 @@ def end_timer_and_print(local_msg): ########################################################## # A simple network # ---------------- -# -# The following sequence of linear layers and ReLUs should show a nice speedup with mixed precision. +# The following sequence of linear layers and ReLUs should show a speedup with mixed precision. def make_model(in_size, out_size, num_layers): layers = [] @@ -75,7 +74,7 @@ def make_model(in_size, out_size, num_layers): # Creates data in default precision. # The same data is used for both default and mixed precision trials below. -# You don't need to manually change the type of input data when enabling mixed precision. +# You don't need to manually change inputs' dtype when enabling mixed precision. data = [torch.randn(batch_size, in_size, device="cuda") for _ in range(num_batches)] targets = [torch.randn(batch_size, out_size, device="cuda") for _ in range(num_batches)] @@ -102,8 +101,8 @@ def make_model(in_size, out_size, num_layers): ########################################################## # Adding autocast # --------------- -# Instances of `torch.cuda.amp.autocast `_ serve as context managers that allow regions of your script to run -# in mixed precision. +# Instances of `torch.cuda.amp.autocast `_ +# serve as context managers that allow regions of your script to run in mixed precision. 
# # In these regions, CUDA ops run in a dtype chosen by autocast # to improve performance while maintaining accuracy. @@ -166,9 +165,8 @@ def make_model(in_size, out_size, num_layers): opt.zero_grad() ########################################################## -# All together -# ------------ -# +# All together ("Automatic Mixed Precision") +# ------------------------------------------ # The following also demonstrates ``enabled``, an optional convenience argument to ``autocast`` and ``GradScaler``. # If False, ``autocast`` and ``GradScaler``\ 's calls become no-ops. # This allows switching between default precision and mixed precision without if/else statements. @@ -192,9 +190,8 @@ def make_model(in_size, out_size, num_layers): end_timer_and_print("Mixed precision:") ########################################################## -# Inspecting/modifying gradients (e.g., gradient clipping) +# Inspecting/modifying gradients (e.g., clipping) # -------------------------------------------------------- -# # All gradients produced by ``scaler.scale(loss).backward()`` are scaled. If you wish to modify or inspect # the parameters' ``.grad`` attributes between ``backward()`` and ``scaler.step(optimizer)``, you should # unscale them first using ``scaler.unscale_(optimizer)``. @@ -217,6 +214,11 @@ def make_model(in_size, out_size, num_layers): scaler.update() opt.zero_grad() +########################################################## +# Inference/Evaluation +# -------------------- +# ``autocast`` may be used by itself to wrap inference or evaluation forward passes. ``GradScaler`` is not necessary. +# ########################################################## # Advanced topics # --------------- @@ -229,6 +231,10 @@ def make_model(in_size, out_size, num_layers): # * Multiple GPUs (``torch.nn.DataParallel`` or ``torch.nn.parallel.DistributedDataParallel``) # * Custom autograd functions (subclasses of ``torch.autograd.Function``) # +# If you're registering a custom C++ op with the dispatcher, see the +# `autocast section `_ +# of the dispatcher tutorial. +# # .. _troubleshooting: # # Troubleshooting @@ -257,10 +263,11 @@ def make_model(in_size, out_size, num_layers): # # If you're confident your Amp usage is correct, you may need to file an issue, but before doing so, it's helpful to gather the following information: # -# 1. Try disabling ``autocast`` or ``GradScaler`` individually (by passing ``enabled=False`` to their constructor) and see if infs/NaNs persist. -# 2. If you suspect some region of your network overflows (e.g., a complex loss function), run that forward region in ``float32``. +# 1. Disable ``autocast`` or ``GradScaler`` individually (by passing ``enabled=False`` to their constructor) and see if infs/NaNs persist. +# 2. If you suspect part of your network (e.g., a complicated loss function) overflows , run that forward region in ``float32`` +# and see if infs/NaNs persist. # `The autocast docstring `_'s last code snippet -# shows running a subregion in ``float32`` (by locally disabling autocast and casting the subregion's inputs). +# shows forcing a subregion to run in ``float32`` (by locally disabling autocast and casting the subregion's inputs). 
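# For instance, a minimal sketch of running a suspect subregion in ``float32`` (illustrative only;
# ``complicated_loss_fn`` is a hypothetical stand-in for whatever region you suspect of overflowing)::
#
#     with torch.cuda.amp.autocast():
#         out = net(input)
#         # Locally disable autocast and cast the subregion's inputs to float32.
#         with torch.cuda.amp.autocast(enabled=False):
#             loss = complicated_loss_fn(out.float(), target.float())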
# # Type mismatch error (may manifest as CUDNN_STATUS_BAD_PARAM) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From fb10b93651a15052019f5060f96ab0b87ddf2ba7 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Fri, 28 Aug 2020 00:15:44 -0600 Subject: [PATCH 09/22] rephrase --- intermediate_source/mixed_precision_tutorial.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/intermediate_source/mixed_precision_tutorial.py b/intermediate_source/mixed_precision_tutorial.py index e7fb23d6d32..6688f0da95d 100644 --- a/intermediate_source/mixed_precision_tutorial.py +++ b/intermediate_source/mixed_precision_tutorial.py @@ -271,9 +271,9 @@ def make_model(in_size, out_size, num_layers): # # Type mismatch error (may manifest as CUDNN_STATUS_BAD_PARAM) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Autocast tries to cover all ops that benefit from or require casting. The -# `ops that receive explicit coverage `_ -# are based on reasoning about numerical properties, but also on experience. +# Autocast tries to cover all ops that benefit from or require casting. +# `Ops that receive explicit coverage `_ +# are chosen based on numerical properties, but also on experience. # If you see a type mismatch error in an autocast-enabled forward region or a backward pass following that region, # it's possible autocast missed an op. # From e432d5f0b8198b81baf6c46f0ed57533532f1348 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Fri, 28 Aug 2020 00:23:49 -0600 Subject: [PATCH 10/22] fix formatting --- intermediate_source/mixed_precision_tutorial.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/intermediate_source/mixed_precision_tutorial.py b/intermediate_source/mixed_precision_tutorial.py index 6688f0da95d..4e3e97e512b 100644 --- a/intermediate_source/mixed_precision_tutorial.py +++ b/intermediate_source/mixed_precision_tutorial.py @@ -13,6 +13,7 @@ Ordinarily, "automatic mixed precision training" uses `torch.cuda.amp.autocast `_ and `torch.cuda.amp.GradScaler `_ together. + This tutorial measures the performance of a simple network in default precision, then walks through adding ``autocast`` and ``GradScaler`` to run the same network in mixed precision with improved performance. @@ -106,8 +107,8 @@ def make_model(in_size, out_size, num_layers): # # In these regions, CUDA ops run in a dtype chosen by autocast # to improve performance while maintaining accuracy. -# See the :ref:`Autocast Op Reference` for details on what precision -# autocast chooses for each op, and under what circumstances. +# See the `Autocast Op Reference `_ +# for details on what precision autocast chooses for each op, and under what circumstances. for epoch in range(0): # 0 epochs, this section is for illustration only for input, target in zip(data, targets): @@ -218,11 +219,10 @@ def make_model(in_size, out_size, num_layers): # Inference/Evaluation # -------------------- # ``autocast`` may be used by itself to wrap inference or evaluation forward passes. ``GradScaler`` is not necessary. -# + ########################################################## # Advanced topics # --------------- -# # See the `Automatic Mixed Precision Examples `_ for advanced use cases including: # # * Gradient accumulation @@ -234,12 +234,12 @@ def make_model(in_size, out_size, num_layers): # If you're registering a custom C++ op with the dispatcher, see the # `autocast section `_ # of the dispatcher tutorial. 
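#
# As a quick illustration of the Inference/Evaluation note above, a minimal sketch that reuses
# ``net`` and ``data`` from this recipe (``autocast`` alone, no ``GradScaler``)::
#
#     net.eval()
#     with torch.no_grad():
#         with torch.cuda.amp.autocast():
#             predictions = [net(input) for input in data]
#     net.train()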
-# + +########################################################## # .. _troubleshooting: # # Troubleshooting # --------------- -# # Speedup with Amp is minor # ~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. Your network may fail to saturate the GPU(s) with work, and is therefore CPU bound. Amp's effect on GPU performance @@ -250,7 +250,7 @@ def make_model(in_size, out_size, num_layers): # * Try to avoid excessive CPU-GPU synchronization (``.item()`` calls, or printing values from CUDA tensors). # * Try to avoid sequences of many small CUDA ops (coalesce these into a few large CUDA ops if you can). # 2. Your network may be compute bound (lots of matmuls/convolutions) but your GPU does not have Tensor Cores. -# In this case a more modest speedup is expected. +# In this case a reduced speedup is expected. # 3. Matmul dimensions are not Tensor Core-friendly. Make sure matmuls' participating sizes are multiples of 8. # (For NLP models with encoders/decoders, this can be subtle. Also. convolutions used to have similar size constraints # for Tensor Core use, but for CuDNN versions 7.3 and later, no such constraints exist. See From d082a38207c17e0f27301e1e41763ef8ac6798a1 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Tue, 8 Sep 2020 09:45:34 -0600 Subject: [PATCH 11/22] move script to recipes --- .../mixed_precision_tutorial.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) rename {intermediate_source => recipes_source}/mixed_precision_tutorial.py (96%) diff --git a/intermediate_source/mixed_precision_tutorial.py b/recipes_source/mixed_precision_tutorial.py similarity index 96% rename from intermediate_source/mixed_precision_tutorial.py rename to recipes_source/mixed_precision_tutorial.py index 4e3e97e512b..732a3a867e0 100644 --- a/intermediate_source/mixed_precision_tutorial.py +++ b/recipes_source/mixed_precision_tutorial.py @@ -20,8 +20,6 @@ You may download and run this tutorial as a standalone Python script. The only requirements are Pytorch 1.6+ and a CUDA-capable GPU. - -.. contents:: :local: """ import torch, time, gc @@ -113,7 +111,7 @@ def make_model(in_size, out_size, num_layers): for epoch in range(0): # 0 epochs, this section is for illustration only for input, target in zip(data, targets): # Runs the forward pass under autocast. - with torch.cuda.amp.autocast(enabled=try_amp): + with torch.cuda.amp.autocast(): output = net(input) # output is float16 because linear layers autocast to float16. assert output.dtype is torch.float16 @@ -143,7 +141,6 @@ def make_model(in_size, out_size, num_layers): # The same GradScaler instance should be used for the entire convergence run. # If you perform multiple convergence runs in the same script, each run should use # a dedicated fresh GradScaler instance. GradScaler instances are lightweight. - scaler = torch.cuda.amp.GradScaler() for epoch in range(0): # 0 epochs, this section is for illustration only @@ -221,6 +218,8 @@ def make_model(in_size, out_size, num_layers): # ``autocast`` may be used by itself to wrap inference or evaluation forward passes. ``GradScaler`` is not necessary. ########################################################## +# .. _advanced-topics: +# # Advanced topics # --------------- # See the `Automatic Mixed Precision Examples `_ for advanced use cases including: @@ -249,7 +248,7 @@ def make_model(in_size, out_size, num_layers): # as much as you can without running OOM. # * Try to avoid excessive CPU-GPU synchronization (``.item()`` calls, or printing values from CUDA tensors). 
# * Try to avoid sequences of many small CUDA ops (coalesce these into a few large CUDA ops if you can). -# 2. Your network may be compute bound (lots of matmuls/convolutions) but your GPU does not have Tensor Cores. +# 2. Your network may be GPU compute bound (lots of matmuls/convolutions) but your GPU does not have Tensor Cores. # In this case a reduced speedup is expected. # 3. Matmul dimensions are not Tensor Core-friendly. Make sure matmuls' participating sizes are multiples of 8. # (For NLP models with encoders/decoders, this can be subtle. Also. convolutions used to have similar size constraints @@ -258,7 +257,7 @@ def make_model(in_size, out_size, num_layers): # # Loss is inf/NaN # ~~~~~~~~~~~~~~~ -# First, check if your network fits an advanced use case in the `Automatic Mixed Precision Examples `_. +# First, check if your network fits an :ref:`advanced use case`. # See also `Prefer binary_cross_entropy_with_logits over binary_cross_entropy `_. # # If you're confident your Amp usage is correct, you may need to file an issue, but before doing so, it's helpful to gather the following information: @@ -278,4 +277,4 @@ def make_model(in_size, out_size, num_layers): # it's possible autocast missed an op. # # Please file an issue with the error backtrace. ``export TORCH_SHOW_CPP_STACKTRACES=1`` before running your script to provide -# more fine-grained information on which backend op is failing. +# fine-grained information on which backend op is failing. From 831d503584a223298be91ff180c4d6e22e529600 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Tue, 8 Sep 2020 14:26:12 -0600 Subject: [PATCH 12/22] hopefully moved to recipes --- _static/img/thumbnails/cropped/amp.png | Bin 0 -> 14849 bytes index.rst | 1 - recipes_source/recipes/README.txt | 4 ++++ .../amp_tutorial.py} | 4 ++-- recipes_source/recipes_index.rst | 9 +++++++++ 5 files changed, 15 insertions(+), 3 deletions(-) create mode 100644 _static/img/thumbnails/cropped/amp.png rename recipes_source/{mixed_precision_tutorial.py => recipes/amp_tutorial.py} (99%) diff --git a/_static/img/thumbnails/cropped/amp.png b/_static/img/thumbnails/cropped/amp.png new file mode 100644 index 0000000000000000000000000000000000000000..a6916ce5605e99d5168c7d52aa157f913b8e7526 GIT binary patch literal 14849 zcmc(`bx<9_w=PP8TX1(taCZw%a0{}tad&qoxVyXS#wEDJ#x-o*osA^81s=ci&V5zy z&wJ}uovG^nzSX@}P0vhq_gcMbB2|^Yp`#F^z`($u%gIWq!@zt5{HGwnzPCiSSzf@v z5Z!re09@6LJSZKV9n7t4%_v>H9L*@rJgv-OU_4h3vUQwk+nuG}8cc z96&v_c*4(?6cw(ixRmr7ms3+S#X`cvg%Xx>-=1cmufcEDmn#SJpgW+*+d-ZW>5I|p ztwDf4Vt)SCOu(?OZ@|-m%e@ojmD;XI_owE+-Wb6%69#m?o!>GHL_O}lVxRjJ-v4wU zQ0!>3ZxO{vn%RS1Cu|X%9^HOBhkV6w>lry&dXAn^-4um(nPYA8E%a_2VK~#scmAot z-IMcud1+ljnul8aS;*fSxtAD!A^YyymHK;=hKkal?EZY?dFI0t_P#?v`(^Rl-zAU! 
z+7t>$d-hKjwq?{>4Y1`PNrc>wQ#n%&T2F(QK3YUD@aZwQK<_;0#Uw}WWq5U!bbxIb zF0|g0vXH#n#H+A|MVEffbn~AAMAO5FTKB9>x07Ua#>E}9{SdT$WQ^P7FaA2bQ@ZsS ziq5CrW9F|-EBJ}{sq9`i4v$m}^FkMDo|PgVZ2;=#ILsPRdKp~yP+M&wi)Pc}aUFo`oCD2lUh?OThV@xh7!Ohlcru2c_WeKkQ}sm{82Ao^Wi zLTLeL?yk;*0_n`~#O|Iv6VgXrH$0KWubLi|2fWYK%ZuJ;)6^}xXaUieGi6~^1UGAqteu1P zpL}@90O=$95{%<#_jKc1-b-J+_#r@GQ*?u;L|Q!8k%ez^p|kV1V6R`#&m$z~r5P|z zCK;92)qVcG)ekzhBJmq|#9Y3>H(n>1+Cgr(F2MApD-R=GuO$q)c;TyJx=8omyE zKL}uO&b`tAPoAS#Am6$>F_YT3$d9L8pt~QGVuW4F1NgP>;tnP&VmF?5HWb0Hxb*L* z2j{~MN)3Bzm$^Sp%q{@E0n+jtMh&p<9n$!Hx?_Atov#}{0o#D~{RV_z{dzYe@!j`G zz;UMdmS#|;S*-UZE-BO7q);5Z@G$v*$4>76^S?#MBM@nSnjG|hlrtSbVi-f4mHNV* zkTgk?H42ByFy=ZZ^@UJY?(n0M=>R4JLky-gelsB+w&?+uaUoMGa@A-ct3=fP<_RiS zpOh?CZ34W>fG)F5Bs)8E5lC`BYF5wY$0Wf87`9{Hq;bfI?NtWAk;WrK>f>lc1L8Ns zGCxlokkCRN?~tZEX3XDA;J2asa%>&k2#i7ZAjF89Ta+=wtG!Sgcc1*RkRT6iz?erM zi#Px9z9+L>pNb-V!HaH>tZH?xfo$|quge=-iRjg6gx0odq@BND7&`%xOaCWwgfK8& b?yrbNC+vqew=cv08G)R%vQ({vVbK2r7|~Lg literal 0 HcmV?d00001 diff --git a/index.rst b/index.rst index 98d8d275503..a5ad877b0f4 100644 --- a/index.rst +++ b/index.rst @@ -500,7 +500,6 @@ Additional Resources :hidden: :caption: Frontend APIs - intermediate/mixed_precision_tutorial intermediate/named_tensor_tutorial intermediate/memory_format_tutorial advanced/cpp_frontend diff --git a/recipes_source/recipes/README.txt b/recipes_source/recipes/README.txt index f93ee92c2c6..8ca89860ba4 100644 --- a/recipes_source/recipes/README.txt +++ b/recipes_source/recipes/README.txt @@ -56,3 +56,7 @@ PyTorch Recipes 14. mobile_perf.py PyTorch Mobile Performance Recipes https://pytorch.org/tutorials/recipes/mobile_perf.html + +15. amp_tutorial.py + Automatic Mixed Precision + https://pytorch.org/tutorials/recipes/amp_tutorial.html diff --git a/recipes_source/mixed_precision_tutorial.py b/recipes_source/recipes/amp_tutorial.py similarity index 99% rename from recipes_source/mixed_precision_tutorial.py rename to recipes_source/recipes/amp_tutorial.py index 732a3a867e0..d09b9ddbe23 100644 --- a/recipes_source/mixed_precision_tutorial.py +++ b/recipes_source/recipes/amp_tutorial.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ -Automatic Mixed Precision in PyTorch -************************************ +Automatic Mixed Precision +************************* **Author**: `Michael Carilli `_ `torch.cuda.amp `_ provides convenience methods for mixed precision, diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst index e842c19bae5..0e8f0d875cc 100644 --- a/recipes_source/recipes_index.rst +++ b/recipes_source/recipes_index.rst @@ -167,6 +167,15 @@ Recipes are bite-sized bite-sized, actionable examples of how to use specific Py :link: ../recipes/android_native_app_with_custom_op.html :tags: Mobile +.. Automatic Mixed Precision + +.. customcarditem:: + :header: Automatic Mixed Precision + :card_description: Use `torch.cuda.amp` to reduce runtime and save memory on NVIDIA GPUs. + :image: ../_static/img/thumbnails/cropped/amp.png + :link: ../recipes/amp_tutorial.html + :tags: Model-Optimization + .. End of tutorial card section .. 
raw:: html From d48288ac85c935d44c592c1c9b8ae04c62df631b Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Tue, 8 Sep 2020 14:37:58 -0600 Subject: [PATCH 13/22] fdsa --- recipes_source/recipes_index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst index 0e8f0d875cc..99574c1bc4f 100644 --- a/recipes_source/recipes_index.rst +++ b/recipes_source/recipes_index.rst @@ -171,7 +171,7 @@ Recipes are bite-sized bite-sized, actionable examples of how to use specific Py .. customcarditem:: :header: Automatic Mixed Precision - :card_description: Use `torch.cuda.amp` to reduce runtime and save memory on NVIDIA GPUs. + :card_description: Use ``torch.cuda.amp`` to reduce runtime and save memory on NVIDIA GPUs. :image: ../_static/img/thumbnails/cropped/amp.png :link: ../recipes/amp_tutorial.html :tags: Model-Optimization From 58a403d3624b505d0bc0d553b9da42a43975ffb4 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Tue, 8 Sep 2020 15:07:36 -0600 Subject: [PATCH 14/22] add amp_tutorial to toctree --- recipes_source/recipes_index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst index cb93a4dd00e..9e379ec4361 100644 --- a/recipes_source/recipes_index.rst +++ b/recipes_source/recipes_index.rst @@ -208,6 +208,7 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu /recipes/recipes/Captum_Recipe /recipes/recipes/tensorboard_with_pytorch /recipes/recipes/dynamic_quantization + /recipes/recipes/amp_tutorial /recipes/torchscript_inference /recipes/deployment_with_flask /recipes/distributed_rpc_profiling From 641e7a593a4d88758d929c5d97c2891c6003a923 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Tue, 8 Sep 2020 15:10:41 -0600 Subject: [PATCH 15/22] amp_tutorial -> amp_recipe --- recipes_source/recipes/README.txt | 4 ++-- recipes_source/recipes/{amp_tutorial.py => amp_recipe.py} | 0 recipes_source/recipes_index.rst | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) rename recipes_source/recipes/{amp_tutorial.py => amp_recipe.py} (100%) diff --git a/recipes_source/recipes/README.txt b/recipes_source/recipes/README.txt index 8ca89860ba4..a182b0a11c5 100644 --- a/recipes_source/recipes/README.txt +++ b/recipes_source/recipes/README.txt @@ -57,6 +57,6 @@ PyTorch Recipes PyTorch Mobile Performance Recipes https://pytorch.org/tutorials/recipes/mobile_perf.html -15. amp_tutorial.py +15. amp_recipe.py Automatic Mixed Precision - https://pytorch.org/tutorials/recipes/amp_tutorial.html + https://pytorch.org/tutorials/recipes/amp_recipe.html diff --git a/recipes_source/recipes/amp_tutorial.py b/recipes_source/recipes/amp_recipe.py similarity index 100% rename from recipes_source/recipes/amp_tutorial.py rename to recipes_source/recipes/amp_recipe.py diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst index 9e379ec4361..6f4368d63e8 100644 --- a/recipes_source/recipes_index.rst +++ b/recipes_source/recipes_index.rst @@ -173,7 +173,7 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu :header: Automatic Mixed Precision :card_description: Use ``torch.cuda.amp`` to reduce runtime and save memory on NVIDIA GPUs. :image: ../_static/img/thumbnails/cropped/amp.png - :link: ../recipes/amp_tutorial.html + :link: ../recipes/amp_recipe.html :tags: Model-Optimization .. 
End of tutorial card section @@ -208,7 +208,7 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu /recipes/recipes/Captum_Recipe /recipes/recipes/tensorboard_with_pytorch /recipes/recipes/dynamic_quantization - /recipes/recipes/amp_tutorial + /recipes/recipes/amp_recipe /recipes/torchscript_inference /recipes/deployment_with_flask /recipes/distributed_rpc_profiling From 3f5f6cadc82770f6b6079764dfb579c24808a2bf Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Tue, 8 Sep 2020 15:20:30 -0600 Subject: [PATCH 16/22] looks like backtick highlights dont render in card_description --- recipes_source/recipes_index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst index 6f4368d63e8..cb857ce53c6 100644 --- a/recipes_source/recipes_index.rst +++ b/recipes_source/recipes_index.rst @@ -171,7 +171,7 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu .. customcarditem:: :header: Automatic Mixed Precision - :card_description: Use ``torch.cuda.amp`` to reduce runtime and save memory on NVIDIA GPUs. + :card_description: Use torch.cuda.amp to reduce runtime and save memory on NVIDIA GPUs. :image: ../_static/img/thumbnails/cropped/amp.png :link: ../recipes/amp_recipe.html :tags: Model-Optimization From 8820068788bebc23397daa6c45a3ba83169dd155 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Tue, 8 Sep 2020 15:29:23 -0600 Subject: [PATCH 17/22] correct path for amp_recipe.html --- recipes_source/recipes_index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst index cb857ce53c6..f8986363092 100644 --- a/recipes_source/recipes_index.rst +++ b/recipes_source/recipes_index.rst @@ -173,7 +173,7 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu :header: Automatic Mixed Precision :card_description: Use torch.cuda.amp to reduce runtime and save memory on NVIDIA GPUs. :image: ../_static/img/thumbnails/cropped/amp.png - :link: ../recipes/amp_recipe.html + :link: ../recipes/recipes/amp_recipe.html :tags: Model-Optimization .. End of tutorial card section From ac602d647ab4e5e5728cca383665d779a55add78 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Fri, 11 Sep 2020 12:37:29 -0600 Subject: [PATCH 18/22] arch notes and saving/restoring --- recipes_source/recipes/amp_recipe.py | 37 ++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/recipes_source/recipes/amp_recipe.py b/recipes_source/recipes/amp_recipe.py index d09b9ddbe23..8dcd9b2097a 100644 --- a/recipes_source/recipes/amp_recipe.py +++ b/recipes_source/recipes/amp_recipe.py @@ -20,6 +20,11 @@ You may download and run this tutorial as a standalone Python script. The only requirements are Pytorch 1.6+ and a CUDA-capable GPU. + +Mixed precision primarily benefits Tensor Core-enabled architectures (Volta, Turing, Ampere). +This recipe should show significant (2-3X) speedup on those architectures. +On earlier architectures (Kepler, Maxwell, Pascal), you may observe a modest speedup. +Run ``nvidia-smi`` to display your GPU's architecture. 
""" import torch, time, gc @@ -212,6 +217,38 @@ def make_model(in_size, out_size, num_layers): scaler.update() opt.zero_grad() +########################################################## +# Saving/Resuming +# ---------------- +# To save/resume Amp-enabled runs with bitwise accuracy, use +# `scaler.state_dict `_ and +# `scaler.load_state_dict `_. +# +# When saving, save the scaler state dict alongside the usual model and optimizer state dicts. +# Do this either at the beginning of an iteration before any forward passes, or at the end of +# an iteration after ``scaler.update()``. + +checkpoint = {"model": net.state_dict(), + "optimizer": opt.state_dict(), + "scaler": scaler.state_dict()} + +# (write checkpoint as desired, e.g., ``torch.save(checkpoint, "filename")``.) +# +# When resuming, load the scaler state dict alongside the model and optimizer state dicts. +# (read checkpoint as desired, e.g., +# ``checkpoint = torch.load(args.resume, map_location = lambda storage, loc: storage.cuda(torch.cuda.current_device()))``) + +net.load_state_dict(checkpoint["model"]) +opt.load_state_dict(checkpoint["optimizer"]) +scaler.load_state_dict(checkpoint["scaler"]) + +# If a checkpoint was created from a run _without_ mixed precision, and you want to resume training _with_ mixed precision, +# load model and optimizer states from the checkpoint as usual. The checkpoint won't contain a saved scaler state, so +# use a fresh instance of ``GradScaler``. +# +# If a checkpoint was created from a run _with_ mixed precision and you want to resume training _without_ mixed precision, +# load model and optimizer states from the checkpoint as usual, and ignore the saved scaler state. + ########################################################## # Inference/Evaluation # -------------------- From 00b83bfab7d1cd0cd7c53fbba59db65c2e021bb6 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Fri, 11 Sep 2020 12:58:44 -0600 Subject: [PATCH 19/22] formatting --- recipes_source/recipes/amp_recipe.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/recipes_source/recipes/amp_recipe.py b/recipes_source/recipes/amp_recipe.py index 8dcd9b2097a..911bd336010 100644 --- a/recipes_source/recipes/amp_recipe.py +++ b/recipes_source/recipes/amp_recipe.py @@ -8,17 +8,17 @@ where some operations use the ``torch.float32`` (``float``) datatype and other operations use ``torch.float16`` (``half``). Some ops, like linear layers and convolutions, are much faster in ``float16``. Other ops, like reductions, often require the dynamic -range of ``float32``. Mixed precision tries to match each op to its appropriate datatype. +range of ``float32``. Mixed precision tries to match each op to its appropriate datatype, which can reduce your network's runtime and memory footprint. Ordinarily, "automatic mixed precision training" uses `torch.cuda.amp.autocast `_ and `torch.cuda.amp.GradScaler `_ together. -This tutorial measures the performance of a simple network in default precision, +This recipe measures the performance of a simple network in default precision, then walks through adding ``autocast`` and ``GradScaler`` to run the same network in mixed precision with improved performance. -You may download and run this tutorial as a standalone Python script. +You may download and run this recipe as a standalone Python script. The only requirements are Pytorch 1.6+ and a CUDA-capable GPU. Mixed precision primarily benefits Tensor Core-enabled architectures (Volta, Turing, Ampere). 
@@ -62,7 +62,7 @@ def make_model(in_size, out_size, num_layers): ########################################################## # ``batch_size``, ``in_size``, ``out_size``, and ``num_layers`` are chosen to be large enough to saturate the GPU with work. -# Typically, mixed precision provides the greatest speedup when GPU is saturated. +# Typically, mixed precision provides the greatest speedup when the GPU is saturated. # Small networks may be CPU bound, in which case mixed precision won't improve performance. # Sizes are also chosen such that linear layers' participating dimensions are multiples of 8, # to permit Tensor Core usage on Tensor Core-capable GPUs (see :ref:`Troubleshooting` below). @@ -87,7 +87,7 @@ def make_model(in_size, out_size, num_layers): ########################################################## # Default Precision # ----------------- -# Without torch.cuda.amp, the following simple network executes all ops in default precision (``torch.float32``): +# Without ``torch.cuda.amp``, the following simple network executes all ops in default precision (``torch.float32``): net = make_model(in_size, out_size, num_layers) opt = torch.optim.SGD(net.parameters(), lr=0.001) @@ -139,7 +139,8 @@ def make_model(in_size, out_size, num_layers): # helps prevent gradients with small magnitudes from flushing to zero # ("underflowing") when training with mixed precision. # -# ``torch.cuda.amp.GradScaler`` performs the steps of gradient scaling conveniently. +# `torch.cuda.amp.GradScaler `_ +# performs the steps of gradient scaling conveniently. # Constructs scaler once, at the beginning of the convergence run, using default args. # If your network fails to converge with default GradScaler args, please file an issue. @@ -170,9 +171,9 @@ def make_model(in_size, out_size, num_layers): ########################################################## # All together ("Automatic Mixed Precision") # ------------------------------------------ -# The following also demonstrates ``enabled``, an optional convenience argument to ``autocast`` and ``GradScaler``. +# (The following also demonstrates ``enabled``, an optional convenience argument to ``autocast`` and ``GradScaler``. # If False, ``autocast`` and ``GradScaler``\ 's calls become no-ops. -# This allows switching between default precision and mixed precision without if/else statements. +# This allows switching between default precision and mixed precision without if/else statements.) use_amp = True @@ -196,8 +197,8 @@ def make_model(in_size, out_size, num_layers): # Inspecting/modifying gradients (e.g., clipping) # -------------------------------------------------------- # All gradients produced by ``scaler.scale(loss).backward()`` are scaled. If you wish to modify or inspect -# the parameters' ``.grad`` attributes between ``backward()`` and ``scaler.step(optimizer)``, you should -# unscale them first using ``scaler.unscale_(optimizer)``. +# the parameters' ``.grad`` attributes between ``backward()`` and ``scaler.step(optimizer)``, you should +# unscale them first using `scaler.unscale_(optimizer) `_. for epoch in range(0): # 0 epochs, this section is for illustration only for input, target in zip(data, targets): @@ -232,6 +233,7 @@ def make_model(in_size, out_size, num_layers): "optimizer": opt.state_dict(), "scaler": scaler.state_dict()} +########################################################## # (write checkpoint as desired, e.g., ``torch.save(checkpoint, "filename")``.) 
# # When resuming, load the scaler state dict alongside the model and optimizer state dicts. @@ -242,11 +244,12 @@ def make_model(in_size, out_size, num_layers): opt.load_state_dict(checkpoint["optimizer"]) scaler.load_state_dict(checkpoint["scaler"]) -# If a checkpoint was created from a run _without_ mixed precision, and you want to resume training _with_ mixed precision, +########################################################## +# If a checkpoint was created from a run *without* Amp, and you want to resume training *with* Amp, # load model and optimizer states from the checkpoint as usual. The checkpoint won't contain a saved scaler state, so # use a fresh instance of ``GradScaler``. # -# If a checkpoint was created from a run _with_ mixed precision and you want to resume training _without_ mixed precision, +# If a checkpoint was created from a run *with* Amp and you want to resume training *without* Amp, # load model and optimizer states from the checkpoint as usual, and ignore the saved scaler state. ########################################################## From 85ab17c3d6485e8e1b118de33549fd79623d23ec Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Fri, 11 Sep 2020 14:55:39 -0600 Subject: [PATCH 20/22] fdsa --- recipes_source/recipes/amp_recipe.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/recipes_source/recipes/amp_recipe.py b/recipes_source/recipes/amp_recipe.py index 911bd336010..90f5ab2c77a 100644 --- a/recipes_source/recipes/amp_recipe.py +++ b/recipes_source/recipes/amp_recipe.py @@ -169,7 +169,7 @@ def make_model(in_size, out_size, num_layers): opt.zero_grad() ########################################################## -# All together ("Automatic Mixed Precision") +# All together: "Automatic Mixed Precision" # ------------------------------------------ # (The following also demonstrates ``enabled``, an optional convenience argument to ``autocast`` and ``GradScaler``. # If False, ``autocast`` and ``GradScaler``\ 's calls become no-ops. @@ -270,6 +270,9 @@ def make_model(in_size, out_size, num_layers): # * Multiple GPUs (``torch.nn.DataParallel`` or ``torch.nn.parallel.DistributedDataParallel``) # * Custom autograd functions (subclasses of ``torch.autograd.Function``) # +# If you perform multiple convergence runs in the same script, each run should use +# a dedicated fresh GradScaler instance. GradScaler instances are lightweight. +# # If you're registering a custom C++ op with the dispatcher, see the # `autocast section `_ # of the dispatcher tutorial. From a824b85100e8200f426c87a254c55e56ec668e63 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Fri, 11 Sep 2020 15:38:34 -0600 Subject: [PATCH 21/22] Clarify autograd-autocast interaction for custom ops --- advanced_source/dispatcher.rst | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/advanced_source/dispatcher.rst b/advanced_source/dispatcher.rst index 23ba0f96be1..4e895507cd6 100644 --- a/advanced_source/dispatcher.rst +++ b/advanced_source/dispatcher.rst @@ -105,6 +105,8 @@ speaking, the structure of your registrations will look like this: that provides implementations for all basic operators on the XLA dispatch key. +.. _autograd-support: + Adding autograd support ----------------------- @@ -299,6 +301,28 @@ the safest choice for the execution type: at::autocast::cached_cast(exec_type, t1)); } +If your custom op is :ref:`autograd-enabled`, you only need to write and register +an autocast wrapper for same name onto which the autograd wrapper is registered. 
+For example, if you wanted an autocast wrapper for the ``myadd`` function shown +in the autograd section, all you'd need is + +.. code-block:: cpp + + Tensor myadd_autocast(const Tensor& self, const Tensor& other) { + c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::Autocast); + return myadd(at::autocast::cached_cast(, self), + at::autocast::cached_cast(, other)); + } + + TORCH_LIBRARY_IMPL(myops, Autocast, m) { + m.impl("myadd", myadd_autocast); + } + +There are no separate gymnastics to make the backward method autocast compatible. +However, the backward method defined in your custom autograd function will run in the same +dtype as autocast sets for the forward method, so you should choose a ```` +suitable for both your forward and backward methods. + Batched ^^^^^^^ From 3e9815a77a8740795838a003b1279298970d8459 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Sat, 12 Sep 2020 15:20:45 -0600 Subject: [PATCH 22/22] touchups --- advanced_source/dispatcher.rst | 2 +- recipes_source/recipes/amp_recipe.py | 18 ++++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/advanced_source/dispatcher.rst b/advanced_source/dispatcher.rst index 4e895507cd6..4f3b52fea32 100644 --- a/advanced_source/dispatcher.rst +++ b/advanced_source/dispatcher.rst @@ -302,7 +302,7 @@ the safest choice for the execution type: } If your custom op is :ref:`autograd-enabled`, you only need to write and register -an autocast wrapper for same name onto which the autograd wrapper is registered. +an autocast wrapper for the same name onto which the autograd wrapper is registered. For example, if you wanted an autocast wrapper for the ``myadd`` function shown in the autograd section, all you'd need is diff --git a/recipes_source/recipes/amp_recipe.py b/recipes_source/recipes/amp_recipe.py index 90f5ab2c77a..c1ec52a3883 100644 --- a/recipes_source/recipes/amp_recipe.py +++ b/recipes_source/recipes/amp_recipe.py @@ -166,7 +166,7 @@ def make_model(in_size, out_size, num_layers): # Updates the scale for next iteration. scaler.update() - opt.zero_grad() + opt.zero_grad() # set_to_none=True here can modestly improve performance ########################################################## # All together: "Automatic Mixed Precision" @@ -190,7 +190,7 @@ def make_model(in_size, out_size, num_layers): scaler.scale(loss).backward() scaler.step(opt) scaler.update() - opt.zero_grad() + opt.zero_grad() # set_to_none=True here can modestly improve performance end_timer_and_print("Mixed precision:") ########################################################## @@ -216,7 +216,7 @@ def make_model(in_size, out_size, num_layers): scaler.step(opt) scaler.update() - opt.zero_grad() + opt.zero_grad() # set_to_none=True here can modestly improve performance ########################################################## # Saving/Resuming @@ -232,14 +232,16 @@ def make_model(in_size, out_size, num_layers): checkpoint = {"model": net.state_dict(), "optimizer": opt.state_dict(), "scaler": scaler.state_dict()} +# Write checkpoint as desired, e.g., +# torch.save(checkpoint, "filename") ########################################################## -# (write checkpoint as desired, e.g., ``torch.save(checkpoint, "filename")``.) -# # When resuming, load the scaler state dict alongside the model and optimizer state dicts. 
-# (read checkpoint as desired, e.g., -# ``checkpoint = torch.load(args.resume, map_location = lambda storage, loc: storage.cuda(torch.cuda.current_device()))``) +# Read checkpoint as desired, e.g., +# dev = torch.cuda.current_device() +# checkpoint = torch.load("filename", +# map_location = lambda storage, loc: storage.cuda(dev)) net.load_state_dict(checkpoint["model"]) opt.load_state_dict(checkpoint["optimizer"]) scaler.load_state_dict(checkpoint["scaler"]) @@ -294,7 +296,7 @@ def make_model(in_size, out_size, num_layers): # 2. Your network may be GPU compute bound (lots of matmuls/convolutions) but your GPU does not have Tensor Cores. # In this case a reduced speedup is expected. # 3. Matmul dimensions are not Tensor Core-friendly. Make sure matmuls' participating sizes are multiples of 8. -# (For NLP models with encoders/decoders, this can be subtle. Also. convolutions used to have similar size constraints +# (For NLP models with encoders/decoders, this can be subtle. Also, convolutions used to have similar size constraints # for Tensor Core use, but for CuDNN versions 7.3 and later, no such constraints exist. See # `here `_ for guidance.) #
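
Taken together, the changes in this series amount to the following usage pattern: run the forward pass under autocast, scale the loss for backward, unscale before clipping, step/update through the scaler, and checkpoint the scaler state alongside the model and optimizer. The snippet below is a minimal, self-contained sketch, not part of any patch above; the toy model, sizes, and checkpoint filename are illustrative assumptions, and it requires PyTorch 1.6+ and a CUDA-capable GPU.

    import torch

    device = "cuda"
    model = torch.nn.Linear(4096, 4096).to(device)      # toy model, illustrative only
    opt = torch.optim.SGD(model.parameters(), lr=0.001)
    loss_fn = torch.nn.MSELoss()
    scaler = torch.cuda.amp.GradScaler()

    data = [torch.randn(256, 4096, device=device) for _ in range(8)]
    targets = [torch.randn(256, 4096, device=device) for _ in range(8)]

    for input, target in zip(data, targets):
        # Forward pass runs in mixed precision under autocast.
        with torch.cuda.amp.autocast():
            output = model(input)
            loss = loss_fn(output, target)

        # Backward pass on the scaled loss; unscale before clipping so the
        # threshold applies to the true gradient magnitudes.
        scaler.scale(loss).backward()
        scaler.unscale_(opt)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # step() skips the update if the gradients contain infs/NaNs;
        # update() then adjusts the scale for the next iteration.
        scaler.step(opt)
        scaler.update()
        opt.zero_grad()

    # Checkpoint the scaler alongside the model and optimizer so a resumed
    # run continues with the same scale. The filename is a placeholder.
    checkpoint = {"model": model.state_dict(),
                  "optimizer": opt.state_dict(),
                  "scaler": scaler.state_dict()}
    torch.save(checkpoint, "amp_checkpoint.pt")

    checkpoint = torch.load("amp_checkpoint.pt", map_location=device)
    model.load_state_dict(checkpoint["model"])
    opt.load_state_dict(checkpoint["optimizer"])
    scaler.load_state_dict(checkpoint["scaler"])

Calling scaler.unscale_(opt) once per iteration, just before scaler.step(opt), is what lets gradient clipping see unscaled gradients while GradScaler still skips any step whose gradients overflowed.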