diff --git a/.gitignore b/.gitignore
index 929998c3d3..2899f2fb19 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,6 +32,7 @@ docsrc/_build
 docsrc/_notebooks
 docsrc/_cpp_api
 docsrc/_tmp
+docsrc/tutorials/_rendered_examples
 *.so
 __pycache__
 *.egg-info
@@ -66,4 +67,4 @@ bazel-tensorrt
 *.cache
 *cifar-10-batches-py*
 bazel-project
-build/
\ No newline at end of file
+build/
diff --git a/docsrc/conf.py b/docsrc/conf.py
index 9f9af43dd4..f11f6a5050 100644
--- a/docsrc/conf.py
+++ b/docsrc/conf.py
@@ -47,6 +47,7 @@
     "sphinx.ext.coverage",
     "sphinx.ext.mathjax",
     "sphinx.ext.viewcode",
+    "sphinx_gallery.gen_gallery",
 ]

 napoleon_use_ivar = True
@@ -79,6 +80,15 @@
 # so a file named "default.css" will overwrite the builtin "default.css".
 html_static_path = ["_static"]

+# sphinx-gallery configuration
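+# Scripts under examples_dirs are executed at docs-build time and the
+# rendered pages are written to gallery_dirs; sphinx-gallery also expects
+# a README.rst in each examples directory (added below for examples/dynamo)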
+sphinx_gallery_conf = {
+    "examples_dirs": "../examples/dynamo",
+    "gallery_dirs": "tutorials/_rendered_examples/",
+}
+
 # Setup the breathe extension
 breathe_projects = {"Torch-TensorRT": "./_tmp/xml"}
 breathe_default_project = "Torch-TensorRT"
diff --git a/docsrc/index.rst b/docsrc/index.rst
index e5da81d2a5..e12c9e6f83 100644
--- a/docsrc/index.rst
+++ b/docsrc/index.rst
@@ -36,30 +36,43 @@ Getting Started
     getting_started/getting_started_with_windows


-Tutorials
+User Guide
 ------------
 * :ref:`creating_a_ts_mod`
 * :ref:`getting_started_with_fx`
 * :ref:`ptq`
 * :ref:`runtime`
-* :ref:`serving_torch_tensorrt_with_triton`
 * :ref:`use_from_pytorch`
 * :ref:`using_dla`
+
+.. toctree::
+   :caption: User Guide
+   :maxdepth: 1
+   :hidden:
+
+   user_guide/creating_torchscript_module_in_python
+   user_guide/getting_started_with_fx_path
+   user_guide/ptq
+   user_guide/runtime
+   user_guide/use_from_pytorch
+   user_guide/using_dla
+
+Tutorials
+------------
+* :ref:`serving_torch_tensorrt_with_triton`
 * :ref:`notebooks`
+* :ref:`dynamo_compile`

 .. toctree::
    :caption: Tutorials
-   :maxdepth: 1
+   :maxdepth: 3
    :hidden:

-   tutorials/creating_torchscript_module_in_python
-   tutorials/getting_started_with_fx_path
-   tutorials/ptq
-   tutorials/runtime
    tutorials/serving_torch_tensorrt_with_triton
-   tutorials/use_from_pytorch
-   tutorials/using_dla
    tutorials/notebooks
+   tutorials/_rendered_examples/dynamo_compile_resnet_example
+   tutorials/_rendered_examples/dynamo_compile_transformers_example
+   tutorials/_rendered_examples/dynamo_compile_advanced_usage

 Python API Documenation
 ------------------------
diff --git a/docsrc/requirements.txt b/docsrc/requirements.txt
index ccbe311f0f..ac75bf5632 100644
--- a/docsrc/requirements.txt
+++ b/docsrc/requirements.txt
@@ -1,4 +1,5 @@
 sphinx==4.5.0
+sphinx-gallery==0.13.0
 breathe==4.33.1
 exhale==0.3.1
 -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
diff --git a/examples/dynamo/README.rst b/examples/dynamo/README.rst
new file mode 100644
index 0000000000..7d407f73d0
--- /dev/null
+++ b/examples/dynamo/README.rst
@@ -0,0 +1,12 @@
+.. _dynamo_compile:
+
+Dynamo Compile Examples
+=======================
+
+This document contains examples of using the `torch_tensorrt.dynamo.compile` API, which integrates with the `torch.compile` functionality.
+
+Overview of Available Scripts
+-----------------------------------------------
+- `dynamo_compile_resnet_example.py <./dynamo_compile_resnet_example.html>`_: Example showcasing compilation of a ResNet model
+- `dynamo_compile_transformers_example.py <./dynamo_compile_transformers_example.html>`_: Example showcasing compilation of a transformer-based model
+- `dynamo_compile_advanced_usage.py <./dynamo_compile_advanced_usage.html>`_: Advanced usage, including creating a custom backend to use directly with the `torch.compile` API
diff --git a/examples/dynamo/dynamo_compile_advanced_usage.py b/examples/dynamo/dynamo_compile_advanced_usage.py
new file mode 100644
index 0000000000..4e1e67f816
--- /dev/null
+++ b/examples/dynamo/dynamo_compile_advanced_usage.py
@@ -0,0 +1,92 @@
+"""
+Dynamo Compile Advanced Usage
+=============================
+
+This interactive script is intended as an overview of the process by which `torch_tensorrt.dynamo.compile` works, and how it integrates with the new `torch.compile` API."""
+
+# %%
+# Imports and Model Definition
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+import torch
+from torch_tensorrt.dynamo.backend import create_backend
+from torch_tensorrt.fx.lower_setting import LowerPrecision
+
+# %%
+
+# We begin by defining a model
+class Model(torch.nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.relu = torch.nn.ReLU()
+
+    def forward(self, x: torch.Tensor, y: torch.Tensor):
+        x_out = self.relu(x)
+        y_out = self.relu(y)
+        x_y_out = x_out + y_out
+        return torch.mean(x_y_out)
+
+
+# %%
+# Compilation with `torch.compile` Using Default Settings
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+# Define sample float inputs and initialize model
+sample_inputs = [torch.rand((5, 7)).cuda(), torch.rand((5, 7)).cuda()]
+model = Model().eval().cuda()
+
+# %%
+
+# Next, we compile the model using torch.compile
+# For the default settings, we can simply call torch.compile
+# with the backend "tensorrt", and run the model on an
+# input to cause compilation, like so:
+optimized_model = torch.compile(model, backend="tensorrt")
+optimized_model(*sample_inputs)
+
+# %%
+# Compilation with `torch.compile` Using Custom Settings
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+# Define sample half inputs and initialize model
+sample_inputs_half = [
+    torch.rand((5, 7)).half().cuda(),
+    torch.rand((5, 7)).half().cuda(),
+]
+model_half = Model().eval().cuda()
+
+# %%
+
+# If we want to customize certain options in the backend,
+# but still use the torch.compile call directly, we can call the
+# convenience/helper function create_backend to create a custom backend
+# which has been pre-populated with certain options
+custom_backend = create_backend(
+    lower_precision=LowerPrecision.FP16,
+    debug=True,
+    min_block_size=2,
+    torch_executed_ops=set(),
+)
+
+# Run the model on an input to cause compilation, like so:
+optimized_model_custom = torch.compile(model_half, backend=custom_backend)
+optimized_model_custom(*sample_inputs_half)
+
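+# %%
+
+# As a quick sanity check (an illustrative addition, not part of the
+# original example), we can compare the compiled model's output against
+# eager execution; FP16 results should agree within a loose tolerance
+trt_output = optimized_model_custom(*sample_inputs_half)
+eager_output = model_half(*sample_inputs_half)
+assert torch.allclose(trt_output, eager_output, atol=1e-2)
+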
+# %%
+# Cleanup
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+# Finally, we use Torch utilities to clean up the workspace
+torch._dynamo.reset()
+
+with torch.no_grad():
+    torch.cuda.empty_cache()
diff --git a/examples/dynamo/dynamo_compile_resnet_example.py b/examples/dynamo/dynamo_compile_resnet_example.py
new file mode 100644
index 0000000000..737d188c54
--- /dev/null
+++ b/examples/dynamo/dynamo_compile_resnet_example.py
@@ -0,0 +1,95 @@
+"""
+Dynamo Compile ResNet Example
+=============================
+
+This interactive script is intended as a sample of the `torch_tensorrt.dynamo.compile` workflow on a ResNet model."""
+
+# %%
+# Imports and Model Definition
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+import torch
+import torch_tensorrt
+import torchvision.models as models
+
+# %%
+
+# Initialize model with half precision and sample inputs
+model = models.resnet18(pretrained=True).half().eval().to("cuda")
+inputs = [torch.randn((1, 3, 224, 224)).to("cuda").half()]
+
+# %%
+# Optional Input Arguments to `torch_tensorrt.dynamo.compile`
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+# Enabled precision for TensorRT optimization
+enabled_precisions = {torch.half}
+
+# Whether to print verbose logs
+debug = True
+
+# Workspace size for TensorRT (20 GiB)
+workspace_size = 20 << 30
+
+# Minimum number of operators per TRT-Engine block
+# (a lower value allows more graph segmentation)
+min_block_size = 3
+
+# Operations to run in Torch, regardless of converter support
+torch_executed_ops = set()
+
+# %%
+# Compilation with `torch_tensorrt.dynamo.compile`
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+# Build and compile the model with torch_tensorrt.dynamo.compile, using the Torch-TensorRT backend
+optimized_model = torch_tensorrt.dynamo.compile(
+    model,
+    inputs,
+    enabled_precisions=enabled_precisions,
+    debug=debug,
+    workspace_size=workspace_size,
+    min_block_size=min_block_size,
+    torch_executed_ops=torch_executed_ops,
+)
+
+# %%
+# Equivalently, we could have run the above via the convenience frontend, like so:
+# `torch_tensorrt.compile(model, ir="dynamo_compile", inputs=inputs, ...)`
+
+# %%
+# Inference
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+# Does not cause recompilation (same batch size as input)
+new_inputs = [torch.randn((1, 3, 224, 224)).half().to("cuda")]
+new_outputs = optimized_model(*new_inputs)
+
+# %%
+
+# Does cause recompilation (new batch size)
+new_batch_size_inputs = [torch.randn((8, 3, 224, 224)).half().to("cuda")]
+new_batch_size_outputs = optimized_model(*new_batch_size_inputs)
+
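+# %%
+
+# As an illustrative addition (not part of the original example), we can
+# roughly time optimized inference; a proper benchmark would use many
+# iterations and CUDA events
+import time
+
+torch.cuda.synchronize()
+start = time.perf_counter()
+timed_outputs = optimized_model(*new_inputs)
+torch.cuda.synchronize()
+print(f"Optimized inference time: {time.perf_counter() - start:.4f} s")
+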
+# %%
+# Cleanup
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+# Finally, we use Torch utilities to clean up the workspace
+torch._dynamo.reset()
+
+with torch.no_grad():
+    torch.cuda.empty_cache()
diff --git a/examples/dynamo/dynamo_compile_transformers_example.py b/examples/dynamo/dynamo_compile_transformers_example.py
new file mode 100644
index 0000000000..bcff5f946b
--- /dev/null
+++ b/examples/dynamo/dynamo_compile_transformers_example.py
@@ -0,0 +1,98 @@
+"""
+Dynamo Compile Transformers Example
+===================================
+
+This interactive script is intended as a sample of the `torch_tensorrt.dynamo.compile` workflow on a transformer-based model."""
+
+# %%
+# Imports and Model Definition
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+import torch
+import torch_tensorrt
+from transformers import BertModel
+
+# %%
+
+# Initialize model with float precision and sample inputs
+model = BertModel.from_pretrained("bert-base-uncased").eval().to("cuda")
+inputs = [
+    torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda"),
+    torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda"),
+]
+
+
+# %%
+# Optional Input Arguments to `torch_tensorrt.dynamo.compile`
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+# Enabled precision for TensorRT optimization
+enabled_precisions = {torch.float}
+
+# Whether to print verbose logs
+debug = True
+
+# Workspace size for TensorRT (20 GiB)
+workspace_size = 20 << 30
+
+# Minimum number of operators per TRT-Engine block
+# (a lower value allows more graph segmentation)
+min_block_size = 3
+
+# Operations to run in Torch, regardless of converter support
+torch_executed_ops = set()
+
+# %%
+# Compilation with `torch_tensorrt.dynamo.compile`
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+# Build and compile the model with torch_tensorrt.dynamo.compile, using the Torch-TensorRT backend
+optimized_model = torch_tensorrt.dynamo.compile(
+    model,
+    inputs,
+    enabled_precisions=enabled_precisions,
+    debug=debug,
+    workspace_size=workspace_size,
+    min_block_size=min_block_size,
+    torch_executed_ops=torch_executed_ops,
+)
+
+# %%
+# Equivalently, we could have run the above via the convenience frontend, like so:
+# `torch_tensorrt.compile(model, ir="dynamo_compile", inputs=inputs, ...)`
+
+# %%
+# Inference
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+# Does not cause recompilation (same batch size as input)
+new_inputs = [
+    torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda"),
+    torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda"),
+]
+new_outputs = optimized_model(*new_inputs)
+
+# %%
+
+# Does cause recompilation (new batch size)
+new_inputs = [
+    torch.randint(0, 2, (4, 14), dtype=torch.int32).to("cuda"),
+    torch.randint(0, 2, (4, 14), dtype=torch.int32).to("cuda"),
+]
+new_outputs = optimized_model(*new_inputs)
+
+# %%
+# Cleanup
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+# Finally, we use Torch utilities to clean up the workspace
+torch._dynamo.reset()
+
+with torch.no_grad():
+    torch.cuda.empty_cache()
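+
+# %%
+
+# Note (editorial addition): the recompilation above is triggered by the
+# change in batch size from 1 to 4; with static shapes, each new input
+# shape seen by torch.compile produces a fresh TensorRT compilation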