refactor(//cpp/bin/torchtrtc): Address review comments

narendasan · narendasan · commit a182c0e112a2 · 2022-02-23T17:22:14.000-08:00
Signed-off-by: Naren Dasan <naren@narendasan.com>
Signed-off-by: Naren Dasan <narens@nvidia.com>
diff --git a/cpp/bin/torchtrtc/README.md b/cpp/bin/torchtrtc/README.md
@@ -31,17 +31,12 @@ torchtrtc [input_file_path] [output_file_path]
         --i, --info                       Dumps info messages generated during
                                           compilation onto the console
       --build-debuggable-engine         Creates a debuggable engine
-      --use-strict-types                Restrict operating type to only use set
-                                        operation precision
       --allow-gpu-fallback              (Only used when targeting DLA
                                         (device-type)) Lets engine run layers on
                                         GPU if they are not supported on DLA
       --require-full-compilation        Require that the model should be fully
                                         compiled to TensorRT or throw an error
-      --is-supported=[method_name],
-      --supported=[method_name],
-      --check-support=[method_name],
-      --check-method-op-support=[method_name]
+      --check-method-support=[method_name]
                                         Check the support for end to end
                                         compilation of a specified method in the
                                         TorchScript module
@@ -79,8 +74,8 @@ torchtrtc [input_file_path] [output_file_path]
                                         (Repeatable) Module that should always
                                         be run in Pytorch for execution (partial
                                         compilation must be enabled)
-      --mbs=[min-block-size],
-      --min-block-size=[min-block-size] Minimum number of contiguous TensorRT
+      --mbs=[num_ops],
+      --min-block-size=[num_ops]        Minimum number of contiguous TensorRT
                                         supported ops to compile a subgraph to
                                         TensorRT
       --embed-engine                    Whether to treat input file as a
@@ -122,6 +117,7 @@ torchtrtc [input_file_path] [output_file_path]
                                         32)@f16%NHWC"
       "--" can be used to terminate flag options and force all following
       arguments to be treated as positional options
+
 ```
 
 e.g.
diff --git a/cpp/bin/torchtrtc/luts.h b/cpp/bin/torchtrtc/luts.h
@@ -8,7 +8,7 @@
 namespace torchtrtc {
 namespace luts {
 
-at::ScalarType to_torch_dtype(torchtrt::DataType dtype) {
+inline at::ScalarType to_torch_dtype(torchtrt::DataType dtype) {
   switch (dtype) {
     case torchtrt::DataType::kHalf:
       return at::kHalf;
diff --git a/cpp/bin/torchtrtc/main.cpp b/cpp/bin/torchtrtc/main.cpp
@@ -5,7 +5,6 @@
 #include "NvInfer.h"
 #include "third_party/args/args.hpp"
 #include "torch/script.h"
-#include "torch/torch.h"
 
 #include "torch_tensorrt/logging.h"
 #include "torch_tensorrt/ptq.h"
@@ -38,8 +37,7 @@ int main(int argc, char** argv) {
 
   args::Flag build_debuggable_engine(
       parser, "build-debuggable-engine", "Creates a debuggable engine", {"build-debuggable-engine"});
-  args::Flag use_strict_types(
-      parser, "use-strict-types", "Restrict operating type to only use set operation precision", {"use-strict-types"});
+
   args::Flag allow_gpu_fallback(
       parser,
       "allow-gpu-fallback",
@@ -56,7 +54,7 @@ int main(int argc, char** argv) {
       parser,
       "method_name",
       "Check the support for end to end compilation of a specified method in the TorchScript module",
-      {"supported", "is-supported", "check-support", "check-method-op-support"});
+      {"check-method-support"});
 
   args::Flag disable_tf32(
       parser, "disable-tf32", "Prevent Float32 layers from using the TF32 data format", {"disable-tf32"});
@@ -105,7 +103,7 @@ int main(int argc, char** argv) {
 
   args::ValueFlag<uint64_t> min_block_size(
       parser,
-      "min-block-size",
+      "num_ops",
       "Minimum number of contiguous TensorRT supported ops to compile a subgraph to TensorRT",
       {"mbs", "min-block-size"});
 
@@ -239,7 +237,6 @@ int main(int argc, char** argv) {
     compile_settings.debug = true;
   }
 
-
   if (allow_gpu_fallback) {
     compile_settings.device.allow_gpu_fallback = true;
   }
diff --git a/docsrc/tutorials/torchtrtc.rst b/docsrc/tutorials/torchtrtc.rst
@@ -39,13 +39,16 @@ to standard TorchScript. Load with ``torch.jit.load()`` and run like you would r
                                           GPU if they are not supported on DLA
         --require-full-compilation        Require that the model should be fully
                                           compiled to TensorRT or throw an error
+        --check-method-support=[method_name]
+                                          Check the support for end to end
+                                          compilation of a specified method in the
+                                          TorchScript module
         --disable-tf32                    Prevent Float32 layers from using the
                                           TF32 data format
         --sparse-weights                  Enable sparsity for weights of conv and
                                           FC layers
         -p[precision...],
-        --enabled-precision=[precision...]
-                                          (Repeatable) Enabling an operating
+        --enable-precision=[precision...] (Repeatable) Enabling an operating
                                           precision for kernels to use when
                                           building the engine (Int8 requires a
                                           calibration-cache argument) [ float |
@@ -64,20 +67,18 @@ to standard TorchScript. Load with ``torch.jit.load()`` and run like you would r
         --calibration-cache-file=[file_path]
                                           Path to calibration cache file to use
                                           for post training quantization
-        --teo=[torch-executed-ops...],
-        --torch-executed-ops=[torch-executed-ops...]
-                                          (Repeatable) Operator in the graph that
+        --teo=[op_name...],
+        --torch-executed-op=[op_name...]  (Repeatable) Operator in the graph that
                                           should always be run in PyTorch for
                                           execution (partial compilation must be
                                           enabled)
-        --tem=[torch-executed-mods...],
-        --torch-executed-mods=[torch-executed-mods...]
+        --tem=[module_name...],
+        --torch-executed-mod=[module_name...]
                                           (Repeatable) Module that should always
                                           be run in Pytorch for execution (partial
                                           compilation must be enabled)
-        --mbs=[torch-executed-mods...],
-        --min-block-size=[torch-executed-mods...]
-                                          Minimum number of contiguous TensorRT
+        --mbs=[num_ops],
+        --min-block-size=[num_ops]        Minimum number of contiguous TensorRT
                                           supported ops to compile a subgraph to
                                           TensorRT
         --embed-engine                    Whether to treat input file as a
@@ -119,114 +120,6 @@ to standard TorchScript. Load with ``torch.jit.load()`` and run like you would r
                                           32)@f16%NHWC"
         "--" can be used to terminate flag options and force all following
         arguments to be treated as positional options
-    [input_specs...] {OPTIONS}
-
-    torchtrtc is a compiler for TorchScript, it will compile and optimize
-    TorchScript programs to run on NVIDIA GPUs using TensorRT
-
-  OPTIONS:
-
-      -h, --help                        Display this help menu
-      Verbiosity of the compiler
-        -v, --verbose                     Dumps debugging information about the
-                                          compilation process onto the console
-        -w, --warnings                    Disables warnings generated during
-                                          compilation onto the console (warnings
-                                          are on by default)
-        --i, --info                       Dumps info messages generated during
-                                          compilation onto the console
-      --build-debuggable-engine         Creates a debuggable engine
-      --use-strict-types                Restrict operating type to only use set
-                                        operation precision
-      --allow-gpu-fallback              (Only used when targeting DLA
-                                        (device-type)) Lets engine run layers on
-                                        GPU if they are not supported on DLA
-      --require-full-compilation        Require that the model should be fully
-                                        compiled to TensorRT or throw an error
-      --is-supported=[method_name],
-      --supported=[method_name],
-      --check-support=[method_name],
-      --check-method-op-support=[method_name]
-                                        Check the support for end to end
-                                        compilation of a specified method in the
-                                        TorchScript module
-      --disable-tf32                    Prevent Float32 layers from using the
-                                        TF32 data format
-      --sparse-weights                  Enable sparsity for weights of conv and
-                                        FC layers
-      -p[precision...],
-      --enable-precision=[precision...] (Repeatable) Enabling an operating
-                                        precision for kernels to use when
-                                        building the engine (Int8 requires a
-                                        calibration-cache argument) [ float |
-                                        float32 | f32 | fp32 | half | float16 |
-                                        f16 | fp16 | int8 | i8 | char ]
-                                        (default: float)
-      -d[type], --device-type=[type]    The type of device the engine should be
-                                        built for [ gpu | dla ] (default: gpu)
-      --gpu-id=[gpu_id]                 GPU id if running on multi-GPU platform
-                                        (defaults to 0)
-      --dla-core=[dla_core]             DLACore id if running on available DLA
-                                        (defaults to 0)
-      --engine-capability=[capability]  The type of device the engine should be
-                                        built for [ standard | safety |
-                                        dla_standalone ]
-      --calibration-cache-file=[file_path]
-                                        Path to calibration cache file to use
-                                        for post training quantization
-      --teo=[op_name...],
-      --torch-executed-op=[op_name...]  (Repeatable) Operator in the graph that
-                                        should always be run in PyTorch for
-                                        execution (partial compilation must be
-                                        enabled)
-      --tem=[module_name...],
-      --torch-executed-mod=[module_name...]
-                                        (Repeatable) Module that should always
-                                        be run in Pytorch for execution (partial
-                                        compilation must be enabled)
-      --mbs=[min-block-size],
-      --min-block-size=[min-block-size] Minimum number of contiguous TensorRT
-                                        supported ops to compile a subgraph to
-                                        TensorRT
-      --embed-engine                    Whether to treat input file as a
-                                        serialized TensorRT engine and embed it
-                                        into a TorchScript module (device spec
-                                        must be provided)
-      --num-min-timing-iter=[num_iters] Number of minimization timing iterations
-                                        used to select kernels
-      --num-avg-timing-iters=[num_iters]
-                                        Number of averaging timing iterations
-                                        used to select kernels
-      --workspace-size=[workspace_size] Maximum size of workspace given to
-                                        TensorRT
-      -t[threshold],
-      --threshold=[threshold]           Maximum acceptable numerical deviation
-                                        from standard torchscript output
-                                        (default 2e-5)
-      --no-threshold-check              Skip checking threshold compliance
-      --truncate-long-double,
-      --truncate, --truncate-64bit      Truncate weights that are provided in
-                                        64bit to 32bit (Long, Double to Int,
-                                        Float)
-      --save-engine                     Instead of compiling a full a
-                                        TorchScript program, save the created
-                                        engine to the path specified as the
-                                        output path
-      input_file_path                   Path to input TorchScript file
-      output_file_path                  Path for compiled TorchScript (or
-                                        TensorRT engine) file
-      input_specs...                    Specs for inputs to engine, can either
-                                        be a single size or a range defined by
-                                        Min, Optimal, Max sizes, e.g.
-                                        "(N,..,C,H,W)"
-                                        "[(MIN_N,..,MIN_C,MIN_H,MIN_W);(OPT_N,..,OPT_C,OPT_H,OPT_W);(MAX_N,..,MAX_C,MAX_H,MAX_W)]".
-                                        Data Type and format can be specified by
-                                        adding an "@" followed by dtype and "%"
-                                        followed by format to the end of the
-                                        shape spec. e.g. "(3, 3, 32,
-                                        32)@f16%NHWC"
-      "--" can be used to terminate flag options and force all following
-      arguments to be treated as positional options
 
 e.g.