
Commit 1f2ffc4

Merge branch 'master' into aten_to

2 parents: 69aabee + 8f1f9cd

File tree: 15 files changed, +137 -77 lines


.gitignore

Lines changed: 4 additions & 1 deletion

@@ -40,4 +40,7 @@ py/wheelhouse
 py/.eggs
 notebooks/.ipynb_checkpoints/
 *.cache
-tests/py/data
+tests/py/data
+examples/**/deps/**/*
+!examples/**/deps/.gitkeep
+examples/trtorchrt_example/trtorchrt_example

core/plugins/README.md

Lines changed: 2 additions & 2 deletions

@@ -6,7 +6,7 @@ On a high level, TRTorch plugin library interface does the following :
 
 - Uses TensorRT plugin registry as the main data structure to access all plugins.
 
-- Automatically registers TensorRT plugins with empty namepsace.
+- Automatically registers TensorRT plugins with empty namepsace.
 
 - Automatically registers TRTorch plugins with `"trtorch"` namespace.
 

@@ -37,4 +37,4 @@ If you'd like to compile your plugin with TRTorch,
 
 Once you've completed the above steps, upon successful compilation of TRTorch library, your plugin should be available in `libtrtorch_plugins.so`.
 
-A sample runtime application on how to run a network with plugins can be found <a href="https://github.com/NVIDIA/TRTorch/tree/master/examples/sample_rt_app" >here</a>
+A sample runtime application on how to run a network with plugins can be found <a href="https://github.com/NVIDIA/TRTorch/tree/master/examples/trtorchrt_example" >here</a>

cpp/trtorchc/README.md

Lines changed: 2 additions & 2 deletions

@@ -59,8 +59,8 @@ trtorchc [input_file_path] [output_file_path]
     --dla-core=[dla_core]             DLACore id if running on available DLA
                                       (defaults to 0)
     --engine-capability=[capability]  The type of device the engine should be
-                                      built for [ default | safe_gpu |
-                                      safe_dla ]
+                                      built for [ standard | safety |
+                                      dla_standalone ]
     --calibration-cache-file=[file_path]
                                       Path to calibration cache file to use
                                       for post training quantization

cpp/trtorchc/main.cpp

Lines changed: 7 additions & 7 deletions

@@ -264,7 +264,7 @@ int main(int argc, char** argv) {
   args::ValueFlag<std::string> engine_capability(
       parser,
       "capability",
-      "The type of device the engine should be built for [ default | safe_gpu | safe_dla ]",
+      "The type of device the engine should be built for [ standard | safety | dla_standalone ]",
       {"engine-capability"});
 
   args::ValueFlag<std::string> calibration_cache_file(

@@ -537,12 +537,12 @@ int main(int argc, char** argv) {
     auto capability = args::get(engine_capability);
     std::transform(
         capability.begin(), capability.end(), capability.begin(), [](unsigned char c) { return std::tolower(c); });
-    if (capability == "default") {
-      compile_settings.capability = trtorch::CompileSpec::EngineCapability::kDEFAULT;
-    } else if (capability == "safe_gpu") {
-      compile_settings.capability = trtorch::CompileSpec::EngineCapability::kSAFE_GPU;
-    } else if (capability == "safe_dla") {
-      compile_settings.capability = trtorch::CompileSpec::EngineCapability::kSAFE_DLA;
+    if (capability == "standard") {
+      compile_settings.capability = trtorch::CompileSpec::EngineCapability::kSTANDARD;
+    } else if (capability == "safety") {
+      compile_settings.capability = trtorch::CompileSpec::EngineCapability::kSAFETY;
+    } else if (capability == "dla_standalone") {
+      compile_settings.capability = trtorch::CompileSpec::EngineCapability::kDLA_STANDALONE;
     } else {
       trtorch::logging::log(
           trtorch::logging::Level::kERROR, "Invalid engine capability, options are [ default | safe_gpu | safe_dla ]");
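For reference, the same rename is what user code sees when setting the capability through the C++ API. Below is a minimal sketch, assuming the TRTorch 0.x C++ API (the `trtorch/trtorch.h` header, `trtorch::CompileGraph`, and a `CompileSpec` built from fixed input shapes); the module path and input size are placeholders:

```cpp
#include <torch/script.h>
#include "trtorch/trtorch.h"

int main() {
  // Load a TorchScript module (placeholder path) and move it to the GPU.
  auto mod = torch::jit::load("model.jit");
  mod.to(torch::kCUDA);

  // One fixed-size input; the shape is a placeholder.
  trtorch::CompileSpec spec({{1, 3, 224, 224}});

  // New name for what used to be kDEFAULT (likewise kSAFE_GPU -> kSAFETY,
  // kSAFE_DLA -> kDLA_STANDALONE), per the diff above.
  spec.capability = trtorch::CompileSpec::EngineCapability::kSTANDARD;

  // Compile the TorchScript module into one with embedded TRT engines.
  auto trt_mod = trtorch::CompileGraph(mod, spec);
  trt_mod.save("model_trt.jit");
  return 0;
}
```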

docsrc/RELEASE_CHECKLIST.md

Lines changed: 25 additions & 3 deletions

@@ -21,7 +21,7 @@ will result in a minor version bump and sigificant bug fixes will result in a pa
     - All checked in applications (cpp and python) should compile and work
 3. Generate new index of converters and evalutators
     - `bazel run //tools/supportedops -- <PATH TO TRTORCH>/docsrc/indices/supported_ops.rst`
-3. Version bump PR
+4. Version bump PR
     - There should be a PR which will be the PR that bumps the actual version of the library, this PR should contain the following
         - Bump version in `py/setup.py`
         - Make sure dependency versions are updated in `py/requirements.txt`, `tests/py/requirements.txt` and `py/setup.py`

@@ -34,7 +34,29 @@ will result in a minor version bump and sigificant bug fixes will result in a pa
         - `make html`
     - Generate changelog
         - `conventional-changelog -p angular -s -i CHANGELOG.md -t <last version tag> -a`
-4. Once PR is merged tag commit and start creating release on GitHub
+
+5. Run performance tests:
+    - Models:
+        - Torchbench BERT
+            - `[2, 128], [2, 128]`
+        - EfficientNet B0
+            - `[3, 224, 224]`
+            - `[3, 1920, 1080]` (P2)
+        - ViT
+            - `[3, 224, 224]`
+            - `[3, 1920, 1080]` (P2)
+        - ResNet50 (v1.5 ?)
+            - `[3, 224, 224]`
+            - `[3, 1920, 1080]` (P2)
+    - Batch Sizes: 1, 4, 8, 16, 32
+    - Frameworks: PyTorch, TRTorch, ONNX + TRT
+        - If any models do not convert to ONNX / TRT, that is fine. Mark them as failling / no result
+    - Devices:
+        - A100 (P0)
+        - T4 is a nice to have (P2) (Add batch sizes 64, 128, 256, 512, 1024 if so)
+        - Jetson also nice to have (P4)
+
+6. Once PR is merged tag commit and start creating release on GitHub
     - Paste in Milestone information and Changelog information into release notes
     - Generate libtrtorch.tar.gz for the following platforms:
         - x86_64 cxx11-abi

@@ -43,4 +65,4 @@ will result in a minor version bump and sigificant bug fixes will result in a pa
     - Generate Python packages for Python 3.6/3.7/3.8/3.9 for x86_64
         - TODO: Build a manylinux container for aarch64
        - `docker run -it -v$(pwd)/..:/workspace/TRTorch build_trtorch_wheel /bin/bash /workspace/TRTorch/py/build_whl.sh` generates all wheels
-    - To build container `docker build -t build_trtorch_wheel .`
+    - To build container `docker build -t build_trtorch_wheel .`

docsrc/tutorials/runtime.rst

Lines changed: 12 additions & 1 deletion

@@ -22,4 +22,15 @@ link ``libtrtorchrt.so`` in your deployment programs or use ``DL_OPEN`` or ``LD_
 you can load the runtime with ``torch.ops.load_library("libtrtorchrt.so")``. You can then continue to use
 programs just as you would otherwise via PyTorch API.
 
-.. note:: If you are using the standard distribution of PyTorch in Python on x86, likely you will need the pre-cxx11-abi variant of ``libtrtorchrt.so``, check :ref:`Installation` documentation for more details.
+.. note:: If you are using the standard distribution of PyTorch in Python on x86, likely you will need the pre-cxx11-abi variant of ``libtrtorchrt.so``, check :ref:`Installation` documentation for more details.
+
+.. note:: If you are linking ``libtrtorchrt.so``, likely using the following flags will help ``-Wl,--no-as-needed -ltrtorchrt -Wl,--as-needed`` as theres no direct symbol dependency to anything in the TRTorch runtime for most TRTorch runtime applications
+
+An example of how to use ``libtrtorchrt.so`` can be found here: https://github.com/NVIDIA/TRTorch/tree/master/examples/trtorchrt_example
+
+Plugin Library
+---------------
+
+In the case you use TRTorch as a converter to a TensorRT engine and your engine uses plugins provided by TRTorch, TRTorch
+ships the library ``libtrtorch_plugins.so`` which contains the implementation of the TensorRT plugins used by TRTorch during
+compilation. This library can be ``DL_OPEN`` or ``LD_PRELOAD`` similar to other TensorRT plugin libraries.
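The new Plugin Library section above says `libtrtorch_plugins.so` can be loaded via `DL_OPEN` or `LD_PRELOAD` like other TensorRT plugin libraries. A minimal sketch of the `dlopen` route (plain POSIX; nothing beyond the library name is assumed, and the same pattern applies to `libtrtorchrt.so` itself):

```cpp
#include <dlfcn.h>
#include <iostream>

int main() {
  // RTLD_GLOBAL exposes the registration symbols process-wide so TensorRT
  // can find the plugin creators when deserializing engines.
  void* handle = dlopen("libtrtorch_plugins.so", RTLD_NOW | RTLD_GLOBAL);
  if (!handle) {
    std::cerr << "Failed to load plugin library: " << dlerror() << std::endl;
    return 1;
  }

  // ... deserialize and run TensorRT engines that use TRTorch plugins ...

  dlclose(handle);
  return 0;
}
```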

docsrc/tutorials/trtorchc.rst

Lines changed: 4 additions & 4 deletions

@@ -45,7 +45,7 @@ to standard TorchScript. Load with ``torch.jit.load()`` and run like you would r
     --ffo,
     --forced-fallback-ops             List of operators in the graph that
                                       should be forced to fallback to Pytorch for execution
-
+
     --disable-tf32                    Prevent Float32 layers from using the
                                       TF32 data format
     -p[precision...],

@@ -55,16 +55,16 @@ to standard TorchScript. Load with ``torch.jit.load()`` and run like you would r
                                       calibration-cache argument) [ float |
                                       float32 | f32 | half | float16 | f16 |
                                       int8 | i8 ] (default: float)
-
+
     -d[type], --device-type=[type]    The type of device the engine should be
                                       built for [ gpu | dla ] (default: gpu)
     --gpu-id=[gpu_id]                 GPU id if running on multi-GPU platform
                                       (defaults to 0)
     --dla-core=[dla_core]             DLACore id if running on available DLA
                                       (defaults to 0)
     --engine-capability=[capability]  The type of device the engine should be
-                                      built for [ default | safe_gpu |
-                                      safe_dla ]
+                                      built for [ standard | safety |
+                                      dla_standalone ]
     --calibration-cache-file=[file_path]
                                       Path to calibration cache file to use
                                       for post training quantization

examples/sample_rt_app/BUILD

Lines changed: 0 additions & 21 deletions
This file was deleted.

examples/sample_rt_app/README.md

Lines changed: 0 additions & 36 deletions
This file was deleted.

examples/trtorchrt_example/BUILD

Lines changed: 14 additions & 0 deletions

@@ -0,0 +1,14 @@
+package(default_visibility = ["//visibility:public"])
+
+cc_binary(
+    name = "trtorchrt_example",
+    srcs = [
+        "main.cpp"
+    ],
+    deps = [
+        "//core/runtime:runtime",
+        "@libtorch//:libtorch",
+        "@libtorch//:caffe2",
+        "@tensorrt//:nvinfer",
+    ],
+)

examples/trtorchrt_example/Makefile

Lines changed: 14 additions & 0 deletions

@@ -0,0 +1,14 @@
+CXX=g++
+DEP_DIR=$(PWD)/deps
+INCLUDE_DIRS=-I$(DEP_DIR)/libtorch/include -I$(DEP_DIR)/trtorch/include
+LIB_DIRS=-L$(DEP_DIR)/trtorch/lib -L$(DEP_DIR)/libtorch/lib # -Wl,-rpath $(DEP_DIR)/tensorrt/lib -Wl,-rpath $(DEP_DIR)/cudnn/lib64
+LIBS=-Wl,--no-as-needed -ltrtorchrt -Wl,--as-needed -ltorch -ltorch_cuda -ltorch_cpu -ltorch_global_deps -lbackend_with_compiler -lc10 -lc10_cuda
+SRCS=main.cpp
+
+TARGET=trtorchrt_example
+
+$(TARGET):
+	$(CXX) $(SRCS) $(INCLUDE_DIRS) $(LIB_DIRS) $(LIBS) -o $(TARGET)
+
+clean:
+	$(RM) $(TARGET)

examples/trtorchrt_example/README.md

Lines changed: 53 additions & 0 deletions

@@ -0,0 +1,53 @@
+# trtorchrt_example
+
+## Sample application which uses TRTorch runtime library and plugin library.
+
+This sample is a demonstration on how to use TRTorch runtime library `libtrtorchrt.so` along with plugin library `libtrtorch_plugins.so`
+
+In this demo, we convert two models `ConvGelu` and `Norm` to TensorRT using TRTorch python API and perform inference using `samplertapp`. In these models, `Gelu` and `Norm` layer are expressed as plugins in the network.
+
+### Generating Torch script modules with TRT Engines
+
+The following command will generate `conv_gelu.jit` and `norm.jit` torchscript modules which contain TensorRT engines.
+
+```sh
+python network.py
+```
+
+### `trtorchrt_example`
+The main goal is to use TRTorch runtime library `libtrtorchrt.so`, a lightweight library sufficient enough to deploy your Torchscript programs containing TRT engines.
+
+1) Download releases of LibTorch and TRTorch from https://pytorch.org and the TRTorch github repo and unpack both in the deps directory.
+
+```sh
+cd examples/trtorchrt_example/deps
+// Download latest TRTorch release tar file (libtrtorch.tar.gz) from https://github.com/NVIDIA/TRTorch/releases
+tar -xvzf libtrtorch.tar.gz
+unzip libtorch-cxx11-abi-shared-with-deps-1.9.0+cu111.zip
+```
+
+> If cuDNN and TensorRT are not installed on your system / in your LD_LIBRARY_PATH then do the following as well
+
+```sh
+cd deps
+mkdir cudnn && tar -xvzf <cuDNN TARBALL> --directory cudnn --strip-components=1
+mkdir tensorrt && tar -xvzf <TensorRT TARBALL> --directory tensorrt --strip-components=1
+cd ..
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$(pwd)/deps/trtorch/lib:$(pwd)/deps/libtorch/lib:$(pwd)/deps/tensorrt/lib:$(pwd)/deps/cudnn/lib64:/usr/local/cuda/lib
+```
+
+This gives maximum compatibility with system configurations for running this example but in general you are better off adding `-Wl,-rpath $(DEP_DIR)/tensorrt/lib -Wl,-rpath $(DEP_DIR)/cudnn/lib64` to your linking command for actual applications
+
+2) Build and run `trtorchrt_example`
+
+`trtorchrt_example` is a binary which loads the torchscript modules `conv_gelu.jit` or `norm.jit` and runs the TRT engines on a random input using TRTorch runtime components. Checkout the `main.cpp` and `Makefile ` file for necessary code and compilation dependencies.
+
+To build and run the app
+
+```sh
+cd examples/trtorchrt_example
+make
+# If paths are different than the ones below, change as necessary
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$(pwd)/deps/trtorch/lib:$(pwd)/deps/libtorch/lib:$(pwd)/deps/tensorrt/lib:$(pwd)/deps/cudnn/lib64:/usr/local/cuda/lib
+./trtorchrt_example $PWD/examples/trtorchrt_example/norm.jit
+```
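The README above defers to `main.cpp` for the actual loading code. As a rough sketch of what a runtime-only loader can look like (this uses only the public libtorch API; the input shape is a placeholder, and `libtrtorchrt.so` is assumed to be linked in so its TRT-engine executor ops register at load time):

```cpp
#include <iostream>
#include <vector>

#include <torch/script.h>

int main(int argc, char* argv[]) {
  if (argc < 2) {
    std::cerr << "usage: trtorchrt_example <path-to-module.jit>" << std::endl;
    return 1;
  }

  // Load the TorchScript module with the embedded TRT engine (e.g. norm.jit).
  torch::jit::Module module = torch::jit::load(argv[1]);
  module.to(torch::kCUDA);

  // Run the module on a random input; the shape here is a placeholder.
  std::vector<torch::jit::IValue> inputs;
  inputs.emplace_back(torch::randn({1, 1, 5, 5}, torch::kCUDA));

  auto out = module.forward(inputs).toTensor();
  std::cout << out << std::endl;
  return 0;
}
```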

examples/trtorchrt_example/deps/.gitkeep

Whitespace-only changes.
File renamed without changes.
