
Commit 248d8aa

Merge branch 'master' into fix_renaming_itensor
2 parents: 74d4df1 + 84bad88

File tree

262 files changed: 7137 additions & 1736 deletions


.circleci/config.yml

Lines changed: 639 additions & 53 deletions
Large diffs are not rendered by default.

.github/code-owners.yml

Lines changed: 1 addition & 1 deletion
@@ -110,7 +110,7 @@
   - "peri044"
   - "bowang007"

-"component: docker":
+"channel: docker":
   - "andi4191"
   - "narendasan"


.gitignore

Lines changed: 3 additions & 0 deletions
@@ -62,3 +62,6 @@ bazel-Torch-TensorRT-Preview
 docsrc/src/
 bazel-TensorRT
 bazel-tensorrt
+.pytest_cache
+*.cache
+*cifar-10-batches-py*

README.md

Lines changed: 8 additions & 7 deletions
@@ -2,13 +2,14 @@

 [![Documentation](https://img.shields.io/badge/docs-master-brightgreen)](https://nvidia.github.io/Torch-TensorRT/)

-> Ahead of Time (AOT) compiling for PyTorch JIT
+> Ahead of Time (AOT) compiling for PyTorch JIT and FX

-Torch-TensorRT is a compiler for PyTorch/TorchScript, targeting NVIDIA GPUs via NVIDIA's TensorRT Deep Learning Optimizer and Runtime. Unlike PyTorch's Just-In-Time (JIT) compiler, Torch-TensorRT is an Ahead-of-Time (AOT) compiler, meaning that before you deploy your TorchScript code, you go through an explicit compile step to convert a standard TorchScript program into an module targeting a TensorRT engine. Torch-TensorRT operates as a PyTorch extention and compiles modules that integrate into the JIT runtime seamlessly. After compilation using the optimized graph should feel no different than running a TorchScript module. You also have access to TensorRT's suite of configurations at compile time, so you are able to specify operating precision (FP32/FP16/INT8) and other settings for your module.
+Torch-TensorRT is a compiler for PyTorch/TorchScript/FX, targeting NVIDIA GPUs via NVIDIA's TensorRT Deep Learning Optimizer and Runtime. Unlike PyTorch's Just-In-Time (JIT) compiler, Torch-TensorRT is an Ahead-of-Time (AOT) compiler, meaning that before you deploy your TorchScript code, you go through an explicit compile step to convert a standard TorchScript or FX program into an module targeting a TensorRT engine. Torch-TensorRT operates as a PyTorch extention and compiles modules that integrate into the JIT runtime seamlessly. After compilation using the optimized graph should feel no different than running a TorchScript module. You also have access to TensorRT's suite of configurations at compile time, so you are able to specify operating precision (FP32/FP16/INT8) and other settings for your module.

 Resources:
 - [Documentation](https://nvidia.github.io/Torch-TensorRT/)
-- [Torch-TensorRT Explained in 2 minutes!](https://www.youtube.com/watch?v=TU5BMU6iYZ0&ab_channel=NVIDIADeveloper)
+- [FX path Documentation](https://github.com/pytorch/TensorRT/blob/master/docsrc/tutorials/getting_started_with_fx_path.rst)
+- [Torch-TensorRT Explained in 2 minutes!](https://www.youtube.com/watch?v=TU5BMU6iYZ0&ab_channel=NVIDIADeveloper)
 - [Comprehensive Discusion (GTC Event)](https://www.nvidia.com/en-us/on-demand/session/gtcfall21-a31107/)
 - [Pre-built Docker Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch). To use this container, make an NGC account and sign in to NVIDIA's registry with an API key. Refer to [this guide](https://docs.nvidia.com/ngc/ngc-catalog-user-guide/index.html#registering-activating-ngc-account) for the same.

@@ -111,10 +112,10 @@ torch.jit.save(trt_ts_module, "trt_torchscript_module.ts") # save the TRT embedd
 These are the following dependencies used to verify the testcases. Torch-TensorRT can work with other versions, but the tests are not guaranteed to pass.

 - Bazel 5.1.1
-- Libtorch 1.11.0 (built with CUDA 11.3)
+- Libtorch 1.12.0 (built with CUDA 11.3)
 - CUDA 11.3
-- cuDNN 8.2.1
-- TensorRT 8.2.4.2
+- cuDNN 8.4.1
+- TensorRT 8.4.1.5

 ## Prebuilt Binaries and Wheel files

@@ -213,7 +214,7 @@ bazel build //:libtorchtrt --compilation_mode opt
 ```

 ### FX path (Python only) installation
-If the user plan to try FX path (Python only) and would like to avoid bazel build. Please follow the steps below.
+If the user plans to try FX path (Python only) and would like to avoid bazel build. Please follow the steps below.
 ``` shell
 cd py && python3 setup.py install --fx-only
 ```

WORKSPACE

Lines changed: 10 additions & 10 deletions
@@ -56,17 +56,17 @@ new_local_repository(
 http_archive(
     name = "libtorch",
     build_file = "@//third_party/libtorch:BUILD",
-    sha256 = "8d9e829ce9478db4f35bdb7943308cf02e8a2f58cf9bb10f742462c1d57bf287",
+    sha256 = "80f089939de20e68e3fcad4dfa72a26c8bf91b5e77b11042f671f39ebac35865",
     strip_prefix = "libtorch",
-    urls = ["https://download.pytorch.org/libtorch/cu113/libtorch-cxx11-abi-shared-with-deps-1.11.0%2Bcu113.zip"],
+    urls = ["https://download.pytorch.org/libtorch/cu113/libtorch-cxx11-abi-shared-with-deps-1.12.0%2Bcu113.zip"],
 )

 http_archive(
     name = "libtorch_pre_cxx11_abi",
     build_file = "@//third_party/libtorch:BUILD",
-    sha256 = "90159ecce3ff451f3ef3f657493b6c7c96759c3b74bbd70c1695f2ea2f81e1ad",
+    sha256 = "8e35371403f7052d9e9b43bcff383980dbde4df028986dc1dab539953481d55f",
     strip_prefix = "libtorch",
-    urls = ["https://download.pytorch.org/libtorch/cu113/libtorch-shared-with-deps-1.11.0%2Bcu113.zip"],
+    urls = ["https://download.pytorch.org/libtorch/cu113/libtorch-shared-with-deps-1.12.0%2Bcu113.zip"],
 )

 # Download these tarballs manually from the NVIDIA website
@@ -76,20 +76,20 @@ http_archive(
 http_archive(
     name = "cudnn",
     build_file = "@//third_party/cudnn/archive:BUILD",
-    sha256 = "0e5d2df890b9967efa6619da421310d97323565a79f05a1a8cb9b7165baad0d7",
-    strip_prefix = "cuda",
+    sha256 = "ec96d2376d81fca42bdd3d4c3d705a99b29a065bab57f920561c763e29c67d01",
+    strip_prefix = "cudnn-linux-x86_64-8.4.1.50_cuda11.6-archive",
     urls = [
-        "https://developer.nvidia.com/compute/machine-learning/cudnn/secure/8.2.4/11.4_20210831/cudnn-11.4-linux-x64-v8.2.4.15.tgz",
+        "https://developer.nvidia.com/compute/cudnn/secure/8.4.1/local_installers/11.6/cudnn-linux-x86_64-8.4.1.50_cuda11.6-archive.tar.xz",
     ],
 )

 http_archive(
     name = "tensorrt",
     build_file = "@//third_party/tensorrt/archive:BUILD",
-    sha256 = "826180eaaecdf9a7e76116855b9f1f3400ea9b06e66b06a3f6a0747ba6f863ad",
-    strip_prefix = "TensorRT-8.2.4.2",
+    sha256 = "8107861af218694130f170e071f49814fa3e27f1386ce7cb6d807ac05a7fcf0e",
+    strip_prefix = "TensorRT-8.4.1.5",
     urls = [
-        "https://developer.nvidia.com/compute/machine-learning/tensorrt/secure/8.2.4/tars/tensorrt-8.2.4.2.linux.x86_64-gnu.cuda-11.4.cudnn8.2.tar.gz",
+        "https://developer.nvidia.com/compute/machine-learning/tensorrt/secure/8.4.1/tars/tensorrt-8.4.1.5.linux.x86_64-gnu.cuda-11.6.cudnn8.4.tar.gz",
     ],
 )

core/compiler.cpp

Lines changed: 14 additions & 22 deletions
@@ -359,14 +359,6 @@ void MapInputsAndDetermineDTypes(
   }
 }

-uint64_t GetRecommendedWorkspaceSize(const runtime::CudaDevice& device) {
-  if (device.major < 6) {
-    return 256 * (1 << 20);
-  } else {
-    return 1 << 30;
-  }
-}
-
 std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::string method_name, CompileSpec cfg) {
   // Go through Lowering to simplify graph and extract weight parameters
   auto graph_and_parameters = lowering::Lower(mod, method_name, cfg.lower_info);
@@ -380,14 +372,14 @@ std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::
   // Infer the type of an input from the weights of the calculation
   auto first_use_types = ir::get_block_first_calc_dtypes_opt(g->block());

-  // GPU default WS size : 1 GB
-  // Set WS = 256 Mb for Jetson nano/TX1 like platforms whose compute capability is 5.X.
-  auto workspace_size = cfg.convert_info.engine_settings.workspace_size;
-  auto device_spec = cfg.convert_info.engine_settings.device;
-  auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
-  if (workspace_size == 0) {
-    cfg.convert_info.engine_settings.workspace_size = GetRecommendedWorkspaceSize(cuda_device);
-  }
+  // // GPU default WS size : 1 GB
+  // // Set WS = 256 Mb for Jetson nano/TX1 like platforms whose compute capability is 5.X.
+  // auto workspace_size = cfg.convert_info.engine_settings.workspace_size;
+  // auto device_spec = cfg.convert_info.engine_settings.device;
+  // auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
+  // if (workspace_size == 0) {
+  //   cfg.convert_info.engine_settings.workspace_size = GetRecommendedWorkspaceSize(cuda_device);
+  // }

   MapInputsAndDetermineDTypes(cfg, g, static_params, first_use_types);

@@ -399,14 +391,14 @@ std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::
 torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) {
   torch::jit::Module new_mod(mod._ivalue()->name() + "_trt");

-  // GPU default WS size : 1 GB
-  // Set WS = 256 Mb for Jetson nano/TX1 like platforms whose compute capability is 5.X.
-  auto workspace_size = cfg.convert_info.engine_settings.workspace_size;
+  // // GPU default WS size : 1 GB
+  // // Set WS = 256 Mb for Jetson nano/TX1 like platforms whose compute capability is 5.X.
+  // auto workspace_size = cfg.convert_info.engine_settings.workspace_size;
   auto device_spec = cfg.convert_info.engine_settings.device;
   auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
-  if (workspace_size == 0) {
-    cfg.convert_info.engine_settings.workspace_size = GetRecommendedWorkspaceSize(cuda_device);
-  }
+  // if (workspace_size == 0) {
+  //   cfg.convert_info.engine_settings.workspace_size = GetRecommendedWorkspaceSize(cuda_device);
+  // }

   for (const torch::jit::Method& method : mod.get_methods()) {
     if (method.name().compare("forward") == 0) {
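
With the workspace-size heuristic commented out, a `workspace_size` of 0 in the compile spec is no longer rewritten to a device-dependent default (1 GiB, or 256 MiB on compute capability 5.x devices); it now means "leave TensorRT's own default limit in place", since the builder config is only touched when the value is non-zero (see the ConversionCtx change below). A minimal caller-side sketch of setting an explicit limit; the `torch_tensorrt::ts` namespace and `workspace_size` field are assumed from the Torch-TensorRT 1.x public C++ API and are not part of this commit:

```cpp
// Sketch only: explicit workspace limit via the public TorchScript frontend.
// torch_tensorrt::ts::CompileSpec and its workspace_size field are assumed
// from the Torch-TensorRT 1.x API; adjust to your installed version.
#include "torch/script.h"
#include "torch_tensorrt/torch_tensorrt.h"

torch::jit::Module compile_with_workspace(torch::jit::Module& mod) {
  auto spec = torch_tensorrt::ts::CompileSpec({torch_tensorrt::Input({1, 3, 224, 224})});
  spec.workspace_size = 1ULL << 30;  // 1 GiB; leaving it at 0 now defers to TensorRT's default
  return torch_tensorrt::ts::compile(mod, spec);
}
```
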

core/conversion/conversionctx/ConversionCtx.cpp

Lines changed: 17 additions & 4 deletions
@@ -20,9 +20,11 @@ std::ostream& operator<<(std::ostream& os, const BuilderSettings& s) {
        << "\n    Debuggable Engine: " << s.debug \
        << "\n    GPU ID: " << s.device.gpu_id \
        << "\n    Allow GPU Fallback (if running on DLA): " << s.device.allow_gpu_fallback \
-       << "\n    Min Timing Iterations: " << s.num_min_timing_iters \
        << "\n    Avg Timing Iterations: " << s.num_avg_timing_iters \
-       << "\n    Max Workspace Size: " << s.workspace_size;
+       << "\n    Max Workspace Size: " << s.workspace_size \
+       << "\n    DLA SRAM Size: " << s.dla_sram_size \
+       << "\n    DLA Local DRAM Size: " << s.dla_local_dram_size \
+       << "\n    DLA Global DRAM Size: " << s.dla_global_dram_size;

   os << "\n    Device Type: " << s.device.device_type \
      << "\n    GPU ID: " << s.device.gpu_id;
@@ -104,9 +106,11 @@ ConversionCtx::ConversionCtx(BuilderSettings build_settings)
     cfg->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK);
   }

-  cfg->setMinTimingIterations(settings.num_min_timing_iters);
   cfg->setAvgTimingIterations(settings.num_avg_timing_iters);
-  cfg->setMaxWorkspaceSize(settings.workspace_size);
+  if (settings.workspace_size != 0) {
+    cfg->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, settings.workspace_size);
+  }
+
   cfg->setDefaultDeviceType(settings.device.device_type);
   cfg->setEngineCapability(settings.capability);

@@ -120,6 +124,15 @@ ConversionCtx::ConversionCtx(BuilderSettings build_settings)
         settings.enabled_precisions.find(nvinfer1::DataType::kFLOAT) == settings.enabled_precisions.end(),
         "DLA supports only fp16 or int8 precision");
     cfg->setDLACore(settings.device.dla_core);
+    if (settings.dla_sram_size != 1048576) {
+      cfg->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_MANAGED_SRAM, settings.dla_sram_size);
+    }
+    if (settings.dla_local_dram_size != 1073741824) {
+      cfg->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_LOCAL_DRAM, settings.dla_local_dram_size);
+    }
+    if (settings.dla_global_dram_size != 536870912) {
+      cfg->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_GLOBAL_DRAM, settings.dla_global_dram_size);
+    }
   }
 }

core/conversion/conversionctx/ConversionCtx.h

Lines changed: 3 additions & 1 deletion
@@ -33,9 +33,11 @@ struct BuilderSettings {
   Device device;
   nvinfer1::EngineCapability capability = TRT_ENGINE_CAPABILITY_STANDARD;
   nvinfer1::IInt8Calibrator* calibrator = nullptr;
-  uint64_t num_min_timing_iters = 2;
   uint64_t num_avg_timing_iters = 1;
   uint64_t workspace_size = 0;
+  uint64_t dla_sram_size = 1048576;
+  uint64_t dla_local_dram_size = 1073741824;
+  uint64_t dla_global_dram_size = 536870912;

   BuilderSettings() = default;
   BuilderSettings(const BuilderSettings& other) = default;
core/conversion/converters/converter_util.cpp

Lines changed: 128 additions & 2 deletions
@@ -135,9 +135,10 @@ nvinfer1::ITensor* castITensor(ConversionCtx* ctx, nvinfer1::ITensor* tensor, nv

   auto id_layer = ctx->net->addIdentity(*tensor);
   TORCHTRT_CHECK(id_layer, "Unable to create identity layer for ITensor: " << tensor_id.str());
-  auto casted_tensor = id_layer->getOutput(0);
-  casted_tensor->setType(dtype);
+  // layer->setOutputType should be used for casting and not manually setting output_tensor->setType()
+  id_layer->setOutputType(0, dtype);

+  auto casted_tensor = id_layer->getOutput(0);
   LOG_DEBUG(ctx->logger, "Casting ITensor " << tensor_id.str() << " from " << tensor->getType() << " to " << dtype);

   std::stringstream ss;
@@ -199,6 +200,131 @@ nvinfer1::ITensor* tensor_to_const(ConversionCtx* ctx, at::Tensor t, const std::
   return out;
 }

+// clamp x to [lower_bound, upper_bound]
+nvinfer1::ITensor* clamp(
+    ConversionCtx* ctx,
+    nvinfer1::ITensor* x,
+    nvinfer1::ITensor* lower_bound,
+    nvinfer1::ITensor* upper_bound,
+    std::string const& name) {
+
+  auto max_layer = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kMAX, x, lower_bound, "max layer for " + name);
+  TORCHTRT_CHECK(max_layer, "Unable to create max layer for clamp");
+  LOG_DEBUG(ctx->logger, "Create " << max_layer->getName() << " for clamp");
+  auto max_itensor = max_layer->getOutput(0);
+
+  auto min_layer = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kMIN, max_itensor, upper_bound, "min layer for " + name);
+  TORCHTRT_CHECK(min_layer, "Unable to create min layer for clamp");
+  LOG_DEBUG(ctx->logger, "Create " << min_layer->getName() << " for clamp");
+  auto min_itensor = min_layer->getOutput(0);
+  return min_itensor;
+}
+
+// clamp x to [0, input_dim]
+nvinfer1::ITensor* clamp_to_input_dim(
+    ConversionCtx* ctx,
+    nvinfer1::ITensor* x,
+    nvinfer1::ITensor* input_dim,
+    int nbdims,
+    std::string const& name) {
+
+  auto zero = torch::zeros({nbdims}).to(torch::kI32);
+  auto zero_itensor = tensor_to_const(ctx, zero);
+  auto one = torch::ones({nbdims}).to(torch::kI32);
+  auto one_itensor = tensor_to_const(ctx, one);
+
+  auto upper_bound_layer = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kSUB, input_dim, one_itensor, "sub layer for " + name);
+  TORCHTRT_CHECK(upper_bound_layer, "Unable to create sub layer for clamp to inputDim");
+  LOG_DEBUG(ctx->logger, "Create " << upper_bound_layer->getName() << " for clamp to inputDim");
+  auto upper_bound = upper_bound_layer->getOutput(0);
+
+  auto max_layer = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kMAX, x, zero_itensor, "max layer for " + name);
+  TORCHTRT_CHECK(max_layer, "Unable to create max_layer for clamp to inputDim");
+  LOG_DEBUG(ctx->logger, "Create " << max_layer->getName() << " for clamp to inputDim");
+  auto max_itensor = max_layer->getOutput(0);
+
+  auto min_layer = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kMIN, max_itensor, upper_bound, "min layer for " + name);
+  TORCHTRT_CHECK(min_layer, "Unable to create min_layer for clamp to inputDim");
+  LOG_DEBUG(ctx->logger, "Create " << min_layer->getName() << " for clamp to inputDim");
+  auto min_itensor = min_layer->getOutput(0);
+  return min_itensor;
+}
+
+// return indices < 0 ? inputDims + indices : indices
+nvinfer1::ITensor* normalize_indices(
+    ConversionCtx* ctx,
+    nvinfer1::ITensor* input_dim,
+    nvinfer1::ITensor* indices,
+    int nbdims,
+    std::string const& name) {
+
+  auto zero = torch::zeros({nbdims}).to(torch::kI32);
+  auto neg = -torch::ones({nbdims}).to(torch::kI32);
+  auto zero_itensor = tensor_to_const(ctx, zero);
+  auto neg_itensor = tensor_to_const(ctx, neg);
+  // find the indices that = -1
+  auto signs = clamp(ctx, indices, neg_itensor, zero_itensor, "clamp layer for " + name);
+
+  // get the inputDim value where indices == -1, else 0
+  auto mul = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kPROD, signs, input_dim, "prod layer for " + name);
+  TORCHTRT_CHECK(mul, "Unable to create mul layer in normalize_indices");
+  LOG_DEBUG(ctx->logger, "Create " << mul->getName() << " for normalize_indices");
+  auto mul_itensor = mul->getOutput(0);
+
+  // add the inputDim value to indices where indices == -1
+  auto sub = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kSUB, indices, mul_itensor, "sub layer for " + name);
+  TORCHTRT_CHECK(sub, "Unable to create sub layer in normalize_indices");
+  LOG_DEBUG(ctx->logger, "Create " << sub->getName() << " for normalize_indices");
+  auto sub_itensor = sub->getOutput(0);
+  return sub_itensor;
+}
+
+std::vector<nvinfer1::ITensor*> normalize_start_and_end(
+    ConversionCtx* ctx,
+    nvinfer1::ITensor* in_shape,
+    nvinfer1::ITensor* in_start,
+    nvinfer1::ITensor* in_end,
+    int nbdims,
+    std::string const& name) {
+  auto start = normalize_indices(ctx, in_shape, in_start, nbdims, "normalize start of " + name);
+  auto out_start = clamp_to_input_dim(ctx, start, in_shape, nbdims, "clamp start to inputDim for " + name);
+  auto end = normalize_indices(ctx, in_shape, in_end, nbdims, "normalize end of " + name);
+  auto out_end = clamp_to_input_dim(ctx, end, in_shape, nbdims, "clamp end to inputDim for " + name);
+  std::vector<nvinfer1::ITensor*> outputs;
+  outputs.push_back(out_start);
+  outputs.push_back(out_end);
+  return outputs;
+}
+
+// size = (end - start) / stride + 1, where range is [start, end], end is included
+nvinfer1::ITensor* get_slice_size(
+    ConversionCtx* ctx,
+    nvinfer1::ITensor* start,
+    nvinfer1::ITensor* end,
+    nvinfer1::ITensor* stride,
+    int nbdims,
+    std::string const& name) {
+  at::Tensor one_tensor = torch::ones({nbdims}).to(torch::kI32);
+  auto one_itensor = tensor_to_const(ctx, one_tensor);
+
+  auto sub_layer = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kSUB, end, start, "get_slice_size sub layer for " + name);
+  TORCHTRT_CHECK(sub_layer, "Unable to create sub layer in calculate_output_size");
+  LOG_DEBUG(ctx->logger, "Create " << sub_layer->getName() << " for calculate_output_size");
+  auto sub_itensor = sub_layer->getOutput(0);
+
+  auto div_layer = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kDIV, sub_itensor, stride, "get_slice_size div layer for " + name);
+  TORCHTRT_CHECK(div_layer, "Unable to create div layer in calculate_output_size");
+  LOG_DEBUG(ctx->logger, "Create " << div_layer->getName() << " for calculate_output_size");
+  auto div_itensor = div_layer->getOutput(0);
+
+  auto add_layer = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kSUM, div_itensor, one_itensor, "get_slice_size sum layer for " + name);
+  TORCHTRT_CHECK(add_layer, "Unable to create add layer in calculate_output_size");
+  LOG_DEBUG(ctx->logger, "Create " << add_layer->getName() << " for calculate_output_size");
+  auto size_itensor = add_layer->getOutput(0);
+
+  return size_itensor;
+}
+
 } // namespace converters
 } // namespace conversion
 } // namespace core
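
These helpers give converters shape-tensor arithmetic for dynamic slicing: `normalize_start_and_end` wraps negative indices and clamps them into range, and `get_slice_size` computes the per-dimension output size. Separately, `castITensor` now requests the cast through `IIdentityLayer::setOutputType` instead of mutating the output tensor's type, which is the mechanism TensorRT expects for precision conversion. Below is a hypothetical sketch of how a converter might wire the new helpers into an `ISliceLayer` with runtime start/size/stride inputs; the surrounding function and variable names are illustrative, and only the helper signatures come from the code above:

```cpp
// Illustrative only: a dynamic slice built from the helpers added above.
// Assumes it lives in the same namespace, so clamp/normalize_start_and_end/
// get_slice_size and tensor_to_const/add_elementwise are visible.
nvinfer1::ITensor* dynamic_slice_example(
    ConversionCtx* ctx,
    nvinfer1::ITensor* in,         // tensor with (possibly) dynamic dims
    nvinfer1::ITensor* start_raw,  // per-dim start indices, may be negative
    nvinfer1::ITensor* end_raw,    // per-dim end indices, may be negative
    nvinfer1::ITensor* stride,     // per-dim strides
    int nbdims,
    const std::string& name) {
  // Runtime shape of the input as an Int32 tensor
  auto shape = ctx->net->addShape(*in)->getOutput(0);

  // Wrap negatives (e.g. -1 -> dim - 1) and clamp into [0, dim - 1]
  auto bounds = normalize_start_and_end(ctx, shape, start_raw, end_raw, nbdims, name);
  auto start = bounds[0];
  auto end = bounds[1];

  // size = (end - start) / stride + 1
  auto size = get_slice_size(ctx, start, end, stride, nbdims, name);

  // Static dims are placeholders; the real values come from the tensor inputs.
  nvinfer1::Dims dummy;
  dummy.nbDims = nbdims;
  for (int i = 0; i < nbdims; i++) {
    dummy.d[i] = 1;
  }
  auto slice = ctx->net->addSlice(*in, dummy, dummy, dummy);
  slice->setInput(1, *start);   // dynamic start
  slice->setInput(2, *size);    // dynamic size
  slice->setInput(3, *stride);  // dynamic stride
  return slice->getOutput(0);
}
```
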
