From 7393fa878c1904fb92f01b0da7255e5ef9053dce Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Sat, 23 Jul 2022 16:34:56 -0700 Subject: [PATCH 01/16] feat: support for grouped inputs Signed-off-by: Naren Dasan --- core/compiler.cpp | 120 +++--- core/compiler.h | 6 +- core/conversion/conversion.cpp | 26 +- core/conversion/conversion.h | 3 + core/conversion/evaluators/aten.cpp | 15 - core/ir/BUILD | 3 +- core/ir/GraphInputs.cpp | 76 ++++ core/ir/StaticParams.cpp | 5 +- core/ir/ir.cpp | 93 ++++- core/ir/ir.h | 32 +- core/lowering/lowering.cpp | 1 - core/partitioning/partitioning.cpp | 48 ++- core/partitioning/shape_analysis.cpp | 62 ++- core/partitioning/shape_analysis.h | 5 +- cpp/include/torch_tensorrt/torch_tensorrt.h | 32 +- cpp/src/compile_spec.cpp | 55 ++- cpp/src/torch_tensorrt.cpp | 3 + .../csrc/register_tensorrt_classes.cpp | 8 + py/torch_tensorrt/csrc/tensorrt_classes.cpp | 84 +++- py/torch_tensorrt/csrc/tensorrt_classes.h | 11 + py/torch_tensorrt/csrc/torch_tensorrt_py.cpp | 11 + py/torch_tensorrt/ts/_compile_spec.py | 64 ++- py/torch_tensorrt/ts/_compiler.py | 2 + .../test_resolve_nontensor_inputs.cpp | 16 +- .../core/partitioning/test_shape_analysis.cpp | 16 +- tests/cpp/BUILD | 20 +- tests/cpp/test_collection.cpp | 363 ++++++++++++++++++ tests/modules/custom_models.py | 61 +++ tests/modules/hub.py | 28 +- tests/modules/requirements.txt | 1 + tests/py/api/test_collections.py | 147 +++++++ tests/py/model_test_case.py | 2 + tests/py/requirements.txt | 3 +- 33 files changed, 1257 insertions(+), 165 deletions(-) create mode 100644 core/ir/GraphInputs.cpp create mode 100644 tests/cpp/test_collection.cpp create mode 100644 tests/py/api/test_collections.py diff --git a/core/compiler.cpp b/core/compiler.cpp index fc1cc66aee..214443a9c6 100644 --- a/core/compiler.cpp +++ b/core/compiler.cpp @@ -256,6 +256,7 @@ GraphAndMapping ConstructFallbackGraph( // update the input ranges for each segments convert_cfg.inputs = ir::associate_specs_with_inputs(seg_block.g(), inputs, static_params); + // TODO mapping Inputs Ivalue to flatten one here auto engine = conversion::ConvertBlockToEngine(seg_block.block(), convert_cfg, static_params); auto temp_g = std::make_shared(); auto device_spec = convert_cfg.engine_settings.device; @@ -306,57 +307,72 @@ void MapInputsAndDetermineDTypes( CompileSpec& cfg, std::shared_ptr& g, ir::StaticParams& static_params, - ir::TypeMap& first_use_type_map) { - // Associate input specs with inputs - cfg.convert_info.inputs = std::move(ir::associate_specs_with_inputs(g, cfg.inputs, static_params)); - - for (auto& in : g->inputs()) { - if (static_params.find(in) == static_params.end()) { - ir::Input& spec = cfg.convert_info.inputs.find(in)->second; - auto est_type_opt = first_use_type_map.find(in)->second; - if (est_type_opt && !spec.dtype_is_user_defined) { - // If we can calculate the type from the graph and the type was not defined by the user then use the calculated - // type - LOG_INFO( - "Since input type is not explicitly defined, infering using first tensor calculation\n Found input " - << in->debugName() << " has type " << est_type_opt.value() - << ". If this is incorrect explicitly set dtype for input and file a bug"); - spec.dtype = util::ScalarTypeToTRTDataType(est_type_opt.value()); - } else if (!est_type_opt && !spec.dtype_is_user_defined) { - // If we cannot calculate the type and the user did not define the type, then default to FP32 - LOG_WARNING( - "Cannot infer input type from calcuations in graph for input " - << in->debugName() << ". 
Assuming it is Float32. If not, specify input type explicity");
-        spec.dtype = nvinfer1::DataType::kFLOAT;
-      } else if (spec.dtype_is_user_defined && cfg.partition_info.enabled) {
-        if (!est_type_opt) {
-          LOG_INFO("Cannot infer input tensor dtype in graph. Using user provided input dtype settings");
-          first_use_type_map[in] = {util::TRTDataTypeToScalarType(cfg.convert_info.inputs.find(in)->second.dtype)};
-        } else {
-          if (util::TRTDataTypeToScalarType(cfg.convert_info.inputs.find(in)->second.dtype) != est_type_opt.value()) {
+    ir::CollectionTypeMap& first_use_type_map) {
+  cfg.convert_info.collection_input_spec_map = std::move(ir::associate_specs_with_collection_inputs(g, cfg.graph_inputs, static_params));
+
+  auto collection_inputs = ir::get_collection_inputs(g, static_params);
+  LOG_DEBUG("In MapInputsAndDetermineDTypes, the g->inputs() size is " << g->inputs().size() << ", CollectionInputSpecMap size is " << collection_inputs.size());
+
+  for (auto in : collection_inputs) {
+    std::vector<ir::Input>& spec = cfg.convert_info.collection_input_spec_map.find(in)->second;
+    std::vector<c10::optional<at::ScalarType>> est_type_opt;
+
+    auto est_it = first_use_type_map.find(in);
+    if (est_it != first_use_type_map.end()) {
+      est_type_opt = first_use_type_map.find(in)->second;
+    }
+    // traverse elements in est_type_opt and spec
+    for (int i = 0; i < est_type_opt.size(); i++) {
+      if (est_type_opt[i] && !spec[i].dtype_is_user_defined) {
+        // If we can calculate the type from the graph and the type was not defined by the user then use the calculated
+        // type
+        LOG_INFO(
+            "Since input type is not explicitly defined, inferring using first tensor calculation\n Inferred input "
+            << in->debugName() << " has type " << est_type_opt[i].value());
+        spec[i].dtype = util::ScalarTypeToTRTDataType(est_type_opt[i].value());
+      } else if (!est_type_opt[i] && !spec[i].dtype_is_user_defined) {
+        // If we cannot calculate the type and the user did not define the type, then default to FP32
+        LOG_WARNING(
+            "Cannot infer input type from calculations in graph for input "
+                << in->debugName() << ". Assuming it is Float32. If not, specify input type explicitly");
+        spec[i].dtype = nvinfer1::DataType::kFLOAT;
+      } else if (spec[i].dtype_is_user_defined && cfg.partition_info.enabled) {
+        if (!est_type_opt[i]) {
+          LOG_INFO("Cannot infer input tensor dtype in graph, compiler is going to use the user setting");
           std::stringstream ss;
           ss << "For input " << in->debugName() << ", found user specified input dtype as ";
-          ss << cfg.convert_info.inputs.find(in)->second.dtype;
-          ss << ", however when inspecting the graph, the input type expected was inferred to be ";
-          ss << est_type_opt.value() << std::endl;
-          ss << "The compiler is going to use the user setting " << cfg.convert_info.inputs.find(in)->second.dtype;
-          ss << "\nThis conflict may cause an error at runtime due to partial compilation being enabled and therefore\n";
-          ss << "compatibility with PyTorch's data type convention is required.\n";
-          ss << "If you do indeed see errors at runtime either:\n";
-          ss << "- Remove the dtype spec for " << in->debugName() << std::endl;
-          ss << "- Disable partial compilation by setting require_full_compilation to True";
+          ss << cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype;
+          ss << ". 
The compiler is going to use the user setting " << cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype; auto warn_str = ss.str(); LOG_WARNING(warn_str); + // Overwrite type map with user settings + first_use_type_map[in][i] = {util::TRTDataTypeToScalarType(cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype)}; + + } else { + if (util::TRTDataTypeToScalarType(cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype) != est_type_opt[i].value()) { + std::stringstream ss; + ss << "For input " << in->debugName() << ", found user specified input dtype as "; + ss << cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype; + ss << ", however when inspecting the graph, the input type expected was inferred to be "; + ss << est_type_opt[i].value() << std::endl; + ss << "The compiler is going to use the user setting " << cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype; + ss << "\nThis conflict may cause an error at runtime due to partial compilation being enabled and therefore\n"; + ss << "compatibility with PyTorch's data type convention is required.\n"; + ss << "If you do indeed see errors at runtime either:\n"; + ss << "- Remove the dtype spec for " << in->debugName() << std::endl; + ss << "- Disable partial compilation by setting require_full_compilation to True"; + auto warn_str = ss.str(); + LOG_WARNING(warn_str); + // Overwrite type map with user settings + first_use_type_map[in][i] = {util::TRTDataTypeToScalarType(cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype)}; + } } - // Overwrite type map with user settings - // We use this map for partitiioning since we need c10::ScalarTypes not nvinfer::DataTypes - first_use_type_map[in] = {util::TRTDataTypeToScalarType(cfg.convert_info.inputs.find(in)->second.dtype)}; + } else { + // The user defined the type so no changes are necessary } - } else { - // The user defined the type so no changes are necessary } } - } + // } } std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::string method_name, CompileSpec cfg) { @@ -370,7 +386,8 @@ std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std:: auto params = graph_and_parameters.second; auto static_params = ir::get_static_params(g->inputs(), params); // Infer the type of an input from the weights of the calculation - auto first_use_types = ir::get_block_first_calc_dtypes_opt(g->block()); + // auto first_use_types = ir::get_block_first_calc_dtypes_opt(g->block()); + auto first_use_types = ir::get_block_first_calc_dtypes_opt_collection(g->block()); MapInputsAndDetermineDTypes(cfg, g, static_params, first_use_types); @@ -395,10 +412,11 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) auto params = graph_and_parameters.second; auto static_params = ir::get_static_params(g->inputs(), params); // Infer the type of an input from the weights of the calculation - auto first_use_types = ir::get_block_first_calc_dtypes_opt(g->block()); + auto first_use_types = ir::get_block_first_calc_dtypes_opt_collection(g->block()); MapInputsAndDetermineDTypes(cfg, g, static_params, first_use_types); auto isBlockConvertible = conversion::VerifyConverterSupportForBlock(g->block(), true); + auto outputIsCollection = conversion::OutputIsCollection(g->block()); if (cfg.partition_info.enabled && (cfg.lower_info.forced_fallback_modules.size() == 0 && cfg.partition_info.forced_fallback_operators.size() == 0 && isBlockConvertible)) { @@ -406,12 +424,13 @@ 
torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) } if (cfg.partition_info.enabled && - !(cfg.lower_info.forced_fallback_modules.size() == 0 && - cfg.partition_info.forced_fallback_operators.size() == 0 && isBlockConvertible)) { - auto input_ivalues_map = partitioning::generateRandomInputs(cfg.convert_info.inputs, first_use_types); + (!(cfg.lower_info.forced_fallback_modules.size() == 0 && + cfg.partition_info.forced_fallback_operators.size() == 0 && isBlockConvertible) + || outputIsCollection)) { + std::unordered_map fallback_nodes; - auto graph_and_mapping = - ConstructFallbackGraph(new_mod, g->block(), input_ivalues_map, cfg, static_params, fallback_nodes); + auto collection_input_ivalues_map = partitioning::generateRandomInputs(cfg.convert_info.collection_input_spec_map, first_use_types); + auto graph_and_mapping = ConstructFallbackGraph(new_mod, g->block(), collection_input_ivalues_map, cfg, static_params, fallback_nodes); new_g = graph_and_mapping.first; // renaming the input name of graph after fallback to ensure pytorch deserialize it correctly for (size_t i = 0; i < new_g->inputs().size(); ++i) { @@ -429,6 +448,7 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) TORCHTRT_CHECK( conversion::VerifyConverterSupportForBlock(g->block()), "Not all operations in graph are supported by the compiler"); + // TODO find the right auto engine = conversion::ConvertBlockToEngine(g->block(), cfg.convert_info, static_params); AddEngineToGraph(new_mod, new_g, engine, cuda_device); } diff --git a/core/compiler.h b/core/compiler.h index c1bb85aa3b..c8dc85020b 100644 --- a/core/compiler.h +++ b/core/compiler.h @@ -8,13 +8,15 @@ #include "core/partitioning/partitioning.h" #include "core/runtime/runtime.h" #include "torch/csrc/jit/api/module.h" +#include "torch/csrc/jit/ir/ir.h" namespace torch_tensorrt { namespace core { struct CompileSpec { - CompileSpec(std::vector inputs) : inputs(inputs) {} - std::vector inputs; + CompileSpec(std::vector inputs) : graph_inputs(inputs) {} + CompileSpec(torch::jit::IValue& input_signature) : graph_inputs(input_signature) {} + ir::GraphInputs graph_inputs; conversion::ConversionInfo convert_info; lowering::LowerInfo lower_info; partitioning::PartitionInfo partition_info; diff --git a/core/conversion/conversion.cpp b/core/conversion/conversion.cpp index 3211e7dd98..914f1ddb9d 100644 --- a/core/conversion/conversion.cpp +++ b/core/conversion/conversion.cpp @@ -138,7 +138,10 @@ void AddLayer(ConversionCtx* ctx, const torch::jit::Node* n) { void AddInputs( ConversionCtx* ctx, c10::ArrayRef inputs, - std::unordered_map& input_specs) { + ConversionInfo& conversion_info) { + std::unordered_map& input_specs = conversion_info.inputs; + std::unordered_map> collection_input_spec = conversion_info.collection_input_spec_map; + std::vector input_tensors; for (auto in : inputs) { // Disregarding inputs that are not tensors @@ -166,9 +169,15 @@ void AddInputs( for (auto input : input_tensors) { const torch::jit::Value* in = input; TORCHTRT_CHECK( - input_specs.find(in) != input_specs.end(), + input_specs.find(in) != input_specs.end() || collection_input_spec.find(in) != collection_input_spec.end(), "Cannot find an input spec associated with input: " << in->debugName()); - ir::Input& spec = input_specs.find(in)->second; + ir::Input spec; + if (input_specs.find(in) != input_specs.end()) { + spec = input_specs.find(in)->second; + } else { + spec = collection_input_spec.find(in)->second[0]; // assume input is tensor + } + // 
ir::Input& spec = input_specs.find(in)->second; std::string name = std::string("input_") + std::to_string(ctx->num_inputs); LOG_INFO( @@ -408,7 +417,7 @@ void ConvertBlockToNetDef( auto inputs = b->inputs(); AddParamsToCtxValueMap(ctx, static_params); - AddInputs(ctx, inputs, build_info.inputs); + AddInputs(ctx, inputs, build_info); auto nodes = b->nodes(); @@ -549,6 +558,15 @@ std::set ConvertableOpsInBlock(const torch::jit::Block* b) { return convertable_ops; } +bool OutputIsCollection(const torch::jit::Block* b) { + for (auto out: b->outputs()) { + if(out->type()->kind() == torch::jit::TypeKind::TupleType || out->type()->kind() == torch::jit::TypeKind::ListType) { + return true; + } + } + return false; +} + bool VerifyConverterSupportForBlock(const torch::jit::Block* b, bool suppress_errors) { auto unsupported_ops = GetUnsupportedOpsInBlock(b); if (unsupported_ops.size() != 0) { diff --git a/core/conversion/conversion.h b/core/conversion/conversion.h index 58c06b42a3..a578c4288e 100644 --- a/core/conversion/conversion.h +++ b/core/conversion/conversion.h @@ -13,6 +13,7 @@ namespace conversion { struct ConversionInfo { ir::InputSpecMap inputs; + ir::CollectionInputSpecMap collection_input_spec_map; BuilderSettings engine_settings; }; @@ -25,6 +26,8 @@ std::string ConvertBlockToEngine( bool OpSupported(const torch::jit::Node* n); +bool OutputIsCollection(const torch::jit::Block* b); + bool VerifyConverterSupportForBlock(const torch::jit::Block* b, bool suppress_errors = false); c10::optional EvaluateNode( diff --git a/core/conversion/evaluators/aten.cpp b/core/conversion/evaluators/aten.cpp index 4632744790..7bb1f6d202 100644 --- a/core/conversion/evaluators/aten.cpp +++ b/core/conversion/evaluators/aten.cpp @@ -264,21 +264,6 @@ auto aten_registrations TORCHTRT_UNUSED = }, EvalOptions().validSchemas( {"aten::size(Tensor self) -> (int[])", "aten::size.int(Tensor self, int dim) -> (int)"})}) - .evaluator({c10::Symbol::fromQualString("aten::__getitem__"), - [](const torch::jit::Node* n, kwargs& args) -> c10::optional { - auto list = args.at(n->input(0)).IValue()->to>(); - auto idx = args.at(n->input(1)).unwrapToInt(); - - const int64_t list_size = list.size(); - const int64_t normalized_idx = normalizeIndex(idx, list_size); - TORCHTRT_CHECK( - normalized_idx >= 0 || normalized_idx < list_size, - "List index out of range (aten::__getitem__)"); - return list.get(normalized_idx); - }, - EvalOptions().validSchemas({ - "aten::__getitem__.t(t[](a) list, int idx) -> (t(*))", - })}) .evaluator({c10::Symbol::fromQualString("aten::append"), [](const torch::jit::Node* n, kwargs& args) -> c10::optional { auto list = args.at(n->input(0)).IValue()->to>(); diff --git a/core/ir/BUILD b/core/ir/BUILD index a613aaf489..2e9ef7e6a8 100644 --- a/core/ir/BUILD +++ b/core/ir/BUILD @@ -15,7 +15,8 @@ cc_library( srcs = [ "ir.cpp", "Input.cpp", - "StaticParams.cpp" + "StaticParams.cpp", + "GraphInputs.cpp" ], deps = [ "@tensorrt//:nvinfer", diff --git a/core/ir/GraphInputs.cpp b/core/ir/GraphInputs.cpp new file mode 100644 index 0000000000..792189137a --- /dev/null +++ b/core/ir/GraphInputs.cpp @@ -0,0 +1,76 @@ +#include "core/ir/ir.h" +#include "core/util/prelude.h" + +namespace torch_tensorrt { +namespace core { +namespace ir { + +void flatten_dfs(std::vector& flattened_inputs, std::vector>& collection_inputs, + torch::jit::IValue input_ivalue, int level, int index) { + if (input_ivalue.isTuple()) { + auto input_tuple = input_ivalue.toTuple(); + int idx = 0; + if (level == 0) { + 
collection_inputs.resize(input_tuple->elements().size()); + } + for (auto item: input_tuple->elements()) { + torch::jit::IValue converted_item; + int cur_idx = level < 1 ? idx: index; + flatten_dfs(flattened_inputs, collection_inputs, item, level+1, cur_idx); + idx++; + } + } else if(input_ivalue.isList()) { + auto input_list = input_ivalue.toList().vec(); + if (level == 0) { + collection_inputs.resize(input_list.size()); + } + c10::TypePtr type = input_list[0].type(); + auto converted_elements = c10::impl::GenericList(type); + int idx = 0; + for (auto item: input_list) { + int cur_idx = level < 1 ? idx: index; + flatten_dfs(flattened_inputs, collection_inputs, item, level+1, cur_idx); + idx++; + } + } else if(input_ivalue.isCustomClass()) { + torch_tensorrt::core::ir::Input cur_input = *(input_ivalue.toCustomClass()); + flattened_inputs.push_back(cur_input); + if (level == 0) { // a single value like A + collection_inputs.resize(1); + collection_inputs[0].push_back(cur_input); + } else if (level == 1) { // like A in [A, A] or [(B, B), A] + collection_inputs[index].push_back(cur_input); + } else if (level == 2) { // like A in [(A, A), C] + collection_inputs[index].push_back(cur_input); + } else {// only support 2 level + LOG_ERROR("Input nesting depth exceeds currently supported depth (3), use 1 level: [A, B], or 2 level: [A, (B, C)]"); + } + } +} + + +GraphInputs::GraphInputs(std::vector inputs_) { + LOG_DEBUG("Construct GraphInput with ir::Input"); + inputs = inputs_; + collection_inputs.resize(inputs_.size()); + for (int i = 0; i < inputs_.size(); i++) { + collection_inputs[i].push_back(inputs_[i]); + } +} + +GraphInputs::GraphInputs(torch::jit::IValue& input_signature_) { + LOG_DEBUG("Construct GraphInput with IValue"); + + std::vector flattened_inputs; + std::vector> collection_inputs_; + + flatten_dfs(flattened_inputs, collection_inputs_, input_signature_, 0, 0); + inputs = flattened_inputs; + input_signature = input_signature_; + collection_inputs = collection_inputs_; + LOG_DEBUG("Collection Input Size: " << collection_inputs_.size()); +} + +} // namespace ir +} // namespace core +} // namespace torch_tensorrt \ No newline at end of file diff --git a/core/ir/StaticParams.cpp b/core/ir/StaticParams.cpp index ac16c72d9f..0073ad2888 100644 --- a/core/ir/StaticParams.cpp +++ b/core/ir/StaticParams.cpp @@ -11,7 +11,10 @@ StaticParams get_static_params(c10::ArrayRef inputs, std::ve StaticParams static_params; auto param_it = params.begin(); for (auto in : inputs) { - if (in->type() != c10::TensorType::get() && param_it != params.end()) { + // handle TensorType, TupleType and ListType + if (in->type() != c10::TensorType::get() && + in->type()->kind() != torch::jit::TypeKind::TupleType && + in->type()->kind() != torch::jit::TypeKind::ListType && param_it != params.end()) { static_params[in] = *param_it; ++param_it; } diff --git a/core/ir/ir.cpp b/core/ir/ir.cpp index fcca3df33c..cc82fe09b4 100644 --- a/core/ir/ir.cpp +++ b/core/ir/ir.cpp @@ -13,6 +13,14 @@ InputSpecMap associate_specs_with_inputs( return pair_input_vals_with_specs(tensor_inputs, specs); } +CollectionInputSpecMap associate_specs_with_collection_inputs( + std::shared_ptr& g, + ir::GraphInputs graph_inputs, + StaticParams& static_params) { + auto tensor_inputs = get_collection_inputs(g, static_params); + return pair_input_vals_with_specs_collection(tensor_inputs, graph_inputs.collection_inputs); +} + InputSpecMap pair_input_vals_with_specs(std::vector vals, std::vector specs) { TORCHTRT_CHECK( vals.size() == specs.size(), @@ 
-27,12 +35,28 @@ InputSpecMap pair_input_vals_with_specs(std::vector va return a; } +CollectionInputSpecMap pair_input_vals_with_specs_collection(std::vector vals, std::vector>& specs) { + TORCHTRT_CHECK( + vals.size() == specs.size(), + "Expected dimension specifications for all input tensors" + << ", but found " << vals.size() << " input tensors and " << specs.size() << " dimension specs"); + + CollectionInputSpecMap a; + for (size_t i = 0; i < vals.size(); i++) { + LOG_DEBUG("Paring " << i << ": " << vals[i]->debugName() << " : " << specs[i]); + a.insert({vals[i], specs[i]}); + } + return a; +} + std::vector get_tensor_inputs( std::shared_ptr& g, StaticParams& static_params) { std::vector input_tensors; auto inputs = g->inputs(); + LOG_DEBUG("Raw inputs size of get_tensor_inputs: " << inputs.size()); for (auto in : inputs) { + LOG_DEBUG("Handle input of debug name: " << in->debugName()); // Disregarding inputs that are not tensors or are static // // Ex. @@ -40,6 +64,29 @@ std::vector get_tensor_inputs( // input.1:Tensor -> used if (in->type()->isSubtypeOf(c10::TensorType::get()) && static_params.find(in) == static_params.end()) { input_tensors.push_back(in); + } + } + return input_tensors; +} + +std::vector get_collection_inputs( + std::shared_ptr& g, + StaticParams& static_params) { + std::vector input_tensors; + auto inputs = g->inputs(); + LOG_DEBUG("Raw inputs size of get_collection_inputs: " << inputs.size()); + for (auto in : inputs) { + LOG_DEBUG("Handle input of debug name: " << in->debugName()); + if (in->type()->isSubtypeOf(c10::TensorType::get()) && static_params.find(in) == static_params.end()) { + input_tensors.push_back(in); + } else if (in->type()->kind() == torch::jit::TypeKind::TupleType && static_params.find(in) == static_params.end()) { + // } else if (in->type()->isSubtypeOf(c10::TupleType::create()) && static_params.find(in) == static_params.end()) { + input_tensors.push_back(in); // push original tuple + at::ArrayRef unpack_tuple = torch::jit::createTupleUnpack(in); + LOG_DEBUG("get_collection_inputs, tuple size " << unpack_tuple.size()); + } else if (in->type()->kind() == torch::jit::TypeKind::ListType && static_params.find(in) == static_params.end()) { + LOG_DEBUG("get_collection_inputs, list use size " << in->uses().size()); + input_tensors.push_back(in); // push original list } } return input_tensors; @@ -52,9 +99,6 @@ c10::optional get_value_first_calc_dtype_opt(torch::jit::Block* auto b_ins = b->inputs(); std::unordered_set b_in_set(b_ins.begin(), b_ins.end()); - TORCHTRT_ASSERT( - in->type() == c10::TensorType::get(), "Input is not a tensor, cannot check for dtype based on calculation"); - auto consumers = in->uses(); auto search_list = std::vector(consumers.begin(), consumers.end()); @@ -142,16 +186,57 @@ c10::optional get_value_first_calc_dtype_opt(torch::jit::Block* TypeMap get_block_first_calc_dtypes_opt(torch::jit::Block* b) { TypeMap types; - for (auto i : b->inputs()) { if (i->type() == c10::TensorType::get()) { torch::jit::Value* in = i; types.insert({in, get_value_first_calc_dtype_opt(b, i)}); + } else if(i->type()->cast()) { + // make sure very time get the same ptr + at::ArrayRef unpack_tuple = torch::jit::createTupleUnpack(i); + LOG_DEBUG("Tuple size " << unpack_tuple.size()); + for (auto item: unpack_tuple) { + torch::jit::Value* in = item; + types.insert({in, get_value_first_calc_dtype_opt(b, i)}); + } + } else if(i->type()->isSubtypeOf(c10::ListType::ofTensors())) { + LOG_INFO("Unsupported type of c10::ListType::ofTensors()"); } } return 
types;
 }
 
+CollectionTypeMap get_block_first_calc_dtypes_opt_collection(torch::jit::Block* b) {
+  CollectionTypeMap types;
+  for (auto i : b->inputs()) {
+    if (i->type() == c10::TensorType::get()) {
+      torch::jit::Value* in = i;
+      types.insert({in, {get_value_first_calc_dtype_opt(b, i)}});
+
+    } else if (i->type()->kind() == torch::jit::TypeKind::TupleType) {
+      // TODO: evaluate the data type of each tuple element
+      // make sure we get the same ptr every time
+      // c10::optional<at::ScalarType> tp = get_value_first_calc_dtype_opt(b, i);
+      at::ArrayRef<torch::jit::Value*> unpack_tuple = torch::jit::createTupleUnpack(i);
+      // TODO: calculate the tuple element type, currently we use {} as default datatype
+      // std::vector<c10::optional<at::ScalarType>> dtypes(unpack_tuple.size(), tp);
+      std::vector<c10::optional<at::ScalarType>> dtypes(unpack_tuple.size());
+      types.insert({i, dtypes}); // insert empty (unknown) dtypes for the tuple elements
+
+    } else if (i->type()->kind() == torch::jit::TypeKind::ListType) {
+      // TODO: determine the size of the list and the type of its elements
+      LOG_DEBUG("get_block_first_calc_dtypes_opt ListType: use size " << i->uses().size());
+      c10::optional<at::ScalarType> tp = get_value_first_calc_dtype_opt(b, i);
+      // std::vector<c10::optional<at::ScalarType>> dtypes(i->uses().size());
+      std::vector<c10::optional<at::ScalarType>> dtypes(i->uses().size(), tp);
+      types.insert({i, dtypes}); // insert the inferred dtype for each use of the list
+    }
+  }
+  return types;
+}
+
+static auto core_input_container =
+    torch::class_<Input>("_torch_tensorrt_core_ir", "Input").def(torch::init<>());
+
 } // namespace ir
 } // namespace core
 } // namespace torch_tensorrt
diff --git a/core/ir/ir.h b/core/ir/ir.h
index 2d9acccc69..966c747176 100644
--- a/core/ir/ir.h
+++ b/core/ir/ir.h
@@ -11,9 +11,8 @@ namespace torch_tensorrt {
 namespace core {
 namespace ir {
 
-struct Input {
-  // Input(std::vector<int64_t> shape);
-  // Input(std::vector<int64_t> min_shape, std::vector<int64_t> opt_shape, std::vector<int64_t> max_shape);
+struct Input : torch::CustomClassHolder {
+  Input() {};
   Input(
       std::vector<int64_t> shape,
       nvinfer1::DataType dtype = nvinfer1::DataType::kFLOAT,
@@ -36,27 +35,52 @@ struct Input {
   nvinfer1::Dims opt;
   nvinfer1::DataType dtype;
   nvinfer1::TensorFormat format;
+  int id;
 };
 
+// Add to spec
+struct GraphInputs {
+  GraphInputs(std::vector<ir::Input> inputs);
+  GraphInputs(torch::jit::IValue& input_signature);
+  torch::jit::IValue input_signature; // nested Input, full input spec
+  std::vector<ir::Input> inputs; // flattened Input
+  std::vector<std::vector<ir::Input>> collection_inputs; // only support two layer nesting, e.g. 
((a, b), [c, d], e) +}; + +typedef std::pair GraphIO; // Graph input output mapping + using StaticParams = std::map; StaticParams get_static_params(c10::ArrayRef inputs, std::vector params); using InputSpecMap = std::unordered_map; +using CollectionInputSpecMap = std::unordered_map>; +std::vector get_tensor_inputs( + std::shared_ptr& g, + StaticParams& static_params); InputSpecMap associate_specs_with_inputs( std::shared_ptr& g, std::vector specs, StaticParams& static_params); +CollectionInputSpecMap associate_specs_with_collection_inputs( + std::shared_ptr& g, + ir::GraphInputs graph_inputs, + StaticParams& static_params); InputSpecMap pair_input_vals_with_specs(std::vector vals, std::vector specs); +CollectionInputSpecMap pair_input_vals_with_specs_collection(std::vector vals, std::vector>& specs); std::vector get_tensor_inputs( std::shared_ptr& g, StaticParams& static_params); +std::vector get_collection_inputs( + std::shared_ptr& g, + StaticParams& static_params); using TypeMap = std::unordered_map>; +using CollectionTypeMap = std::unordered_map>>; c10::optional get_value_first_calc_dtype_opt(torch::jit::Block* b, torch::jit::Value* in); ir::TypeMap get_block_first_calc_dtypes_opt(torch::jit::Block* b); - +ir::CollectionTypeMap get_block_first_calc_dtypes_opt_collection(torch::jit::Block* b); } // namespace ir } // namespace core } // namespace torch_tensorrt diff --git a/core/lowering/lowering.cpp b/core/lowering/lowering.cpp index d3296c347c..8bbae296c3 100644 --- a/core/lowering/lowering.cpp +++ b/core/lowering/lowering.cpp @@ -33,7 +33,6 @@ void LowerGraph(std::shared_ptr& g, LowerInfo lower_info) { torch::jit::InlineFunctionalGraphs(g); torch::jit::PeepholeOptimize(g, false); torch::jit::FuseLinear(g); - torch::jit::LowerAllTuples(g); if (!lower_info.disable_cse) { torch::jit::EliminateCommonSubexpression(g); } diff --git a/core/partitioning/partitioning.cpp b/core/partitioning/partitioning.cpp index 8fcd29f7a8..f14d5438c6 100644 --- a/core/partitioning/partitioning.cpp +++ b/core/partitioning/partitioning.cpp @@ -242,6 +242,36 @@ bool check_node_fallback(torch::jit::Node* n, const std::unordered_mapoutputs()) { + if(out->type()->kind() == torch::jit::TypeKind::TupleType || out->type()->kind() == torch::jit::TypeKind::ListType) { + return true; + } + } + return false; +} + +bool should_run_in_trt(torch::jit::Node* n, const std::unordered_set& torch_ops) { + // If the op is not supported by the conversion phase it should run in PyTorch + if (!conversion::OpSupported(n)) { + LOG_GRAPH("Node not supported by conversion: " << util::node_info(n)); + return false; + } + + // If the user specifies the op to run in Torch it should run in PyTorch + if (torch_ops.find(n->kind().toQualString()) != torch_ops.end()) { + LOG_GRAPH("Node explicitly set to run in torch: " << util::node_info(n)); + return false; + } + + // If the user specifies the module containing this op to run in torch it should run in PyTorch + const auto to_compile_sym = c10::Symbol::attr("to_compile"); + if (n->hasAttribute(to_compile_sym) && n->i(to_compile_sym) == (int64_t) false) { + LOG_GRAPH("Node is within a module set to run in torch: " << util::node_info(n)); return false; } @@ -360,19 +390,25 @@ PartitionedGraph segment_graph( find_min_block_size_fallback_nodes(block, global_fallback_nodes, min_block_size); auto nodes = block->nodes(); - + auto reverse_nodes = nodes.reverse(); // merge from output side to input side PartitionedGraph segmented_blocks; // segment the nodes std::vector in_prog_trt_blk_nodes, 
in_prog_pyt_blk_nodes; - for (const auto n : nodes) { + for (const auto n : reverse_nodes) { // Skip constant nodes as they are resources for both kinds of modules if (n->kind() == torch::jit::prim::Constant) { continue; } +<<<<<<< HEAD if (check_node_fallback(n, global_fallback_nodes)) { in_prog_trt_blk_nodes.push_back(n); +======= + // the outputs of trt subgraph shouldn't be collections + if (should_run_in_trt(n, forced_fallback_ops) && !(in_prog_trt_blk_nodes.size() == 0 && is_collection(n))) { + in_prog_trt_blk_nodes.insert(in_prog_trt_blk_nodes.begin(), n); +>>>>>>> feat: support for grouped inputs // If there is an active PyTorch block and we have passed the threshold for a valid TRT // block then segment and reset the active PyTorch block @@ -388,7 +424,7 @@ PartitionedGraph segment_graph( LOG_DEBUG( "In progress TRT block does not meet minimum block size requirements, therefore folding into in progress PyTorch block"); in_prog_pyt_blk_nodes.insert( - in_prog_pyt_blk_nodes.end(), in_prog_trt_blk_nodes.begin(), in_prog_trt_blk_nodes.end()); + in_prog_pyt_blk_nodes.begin(), in_prog_trt_blk_nodes.begin(), in_prog_trt_blk_nodes.end()); } in_prog_trt_blk_nodes.clear(); // if there is a prim::If then this if node will be encapsulated in a SegmentedBlock @@ -407,14 +443,14 @@ PartitionedGraph segment_graph( finalize_block(segmented_blocks, SegmentedBlock::kTorch, in_prog_pyt_blk_nodes); } if (checkLoopEvaluatable(n)) { - in_prog_trt_blk_nodes.push_back(n); + in_prog_trt_blk_nodes.insert(in_prog_trt_blk_nodes.begin(), n); } else { auto loop_node = std::vector{n}; finalize_block(segmented_blocks, SegmentedBlock::kTorch, loop_node); } continue; } - in_prog_pyt_blk_nodes.push_back(n); + in_prog_pyt_blk_nodes.insert(in_prog_pyt_blk_nodes.begin(), n); } } @@ -429,7 +465,7 @@ PartitionedGraph segment_graph( in_prog_pyt_blk_nodes.end(), in_prog_trt_blk_nodes.begin(), in_prog_trt_blk_nodes.end()); finalize_block(segmented_blocks, SegmentedBlock::kTorch, in_prog_pyt_blk_nodes); } - + std::reverse(segmented_blocks.begin(), segmented_blocks.end()); return segmented_blocks; } diff --git a/core/partitioning/shape_analysis.cpp b/core/partitioning/shape_analysis.cpp index d24b1f980a..22c3ea104f 100644 --- a/core/partitioning/shape_analysis.cpp +++ b/core/partitioning/shape_analysis.cpp @@ -8,27 +8,56 @@ namespace torch_tensorrt { namespace core { namespace partitioning { +at::Tensor generateSingleInput(ir::Input& input, c10::optional& type_opt) { + auto cur_shape = input.input_shape; + std::vector shape; + shape.insert(shape.begin(), std::begin(cur_shape.d), std::begin(cur_shape.d) + cur_shape.nbDims); + // auto type_opt = types[input.first][i]; + auto type = at::kFloat; + if (type_opt) { + type = type_opt.value(); + } else { + LOG_WARNING("Input type for doing shape analysis could not be determined, defaulting to F32"); + } + auto in = at::randint(5, shape, {at::kCUDA}).to(type); + // ivalue_map[input.first] = in.clone(); + return in; +} + std::unordered_map generateRandomInputs( - std::unordered_map& inputs, - std::unordered_map>& types) { + std::unordered_map>& inputs, + std::unordered_map>>& types) { + // generate random inputs for running pytorch segments std::unordered_map ivalue_map; - uint64_t in_i = 0; + for (auto& input : inputs) { - auto cur_shape = input.second.input_shape; - std::vector shape; - shape.insert(shape.begin(), std::begin(cur_shape.d), std::begin(cur_shape.d) + cur_shape.nbDims); - auto type_opt = types[input.first]; - auto type = at::kFloat; - if (type_opt) { - type = 
type_opt.value(); + + if (input.first->type()->kind() == torch::jit::TypeKind::ListType) { + // create list + std::vector list; + c10::TypePtr elementType = c10::TensorType::get(); + auto generic_list = c10::impl::GenericList(elementType); + for (int i = 0; i < input.second.size(); i++) { + auto in = generateSingleInput(input.second[i], types[input.first][i]); + generic_list.push_back(in.clone()); + } + ivalue_map[input.first] = c10::IValue(generic_list); + } else if (input.first->type()->kind() == torch::jit::TypeKind::TupleType) { + // create tuple + std::vector list; + for (int i = 0; i < input.second.size(); i++) { + auto in = generateSingleInput(input.second[i], types[input.first][i]); + list.push_back(in.clone()); + } + auto tuple = c10::ivalue::Tuple::create(list); // create tuple ptr + ivalue_map[input.first] = c10::IValue(tuple); } else { - LOG_WARNING("Input type for doing shape analysis could not be determined, defaulting to F32"); + auto in = generateSingleInput(input.second[0], types[input.first][0]); + ivalue_map[input.first] = in.clone(); + } - auto in = at::randint(5, shape, {at::kCUDA}).to(type); - ivalue_map[input.first] = in.clone(); - in_i++; } return ivalue_map; } @@ -79,8 +108,10 @@ void getSegmentsOutputByRunning( } else if (input->type()->isSubtypeOf(torch::jit::BoolType::get())) { jit_inputs_ivalues.push_back(ivalues_maps[input].toBool()); } else if (input->type()->kind() == torch::jit::TypeKind::ListType) { - jit_inputs_ivalues.push_back(ivalues_maps[input].toList()); + // create list + jit_inputs_ivalues.push_back(ivalues_maps[input].toList());; } else if (input->type()->kind() == torch::jit::TypeKind::TupleType) { + // create tuple jit_inputs_ivalues.push_back(ivalues_maps[input].toTuple()); } else if (input->type()->kind() == torch::jit::TypeKind::NumberType) { jit_inputs_ivalues.push_back(ivalues_maps[input].toScalar()); @@ -145,6 +176,7 @@ void getSegmentsOutputByRunning( } input_types.push_back(cur_ivalue.toTensor().scalar_type()); } + // TODO: tuple and list inputs in subgraph } seg_block.register_inshapes(input_shapes); diff --git a/core/partitioning/shape_analysis.h b/core/partitioning/shape_analysis.h index 0626490222..2654699a1d 100644 --- a/core/partitioning/shape_analysis.h +++ b/core/partitioning/shape_analysis.h @@ -6,9 +6,10 @@ namespace torch_tensorrt { namespace core { namespace partitioning { + std::unordered_map generateRandomInputs( - std::unordered_map& input_ranges, - std::unordered_map>& input_types); + std::unordered_map>& input_ranges, + std::unordered_map>>& input_types); void runShapeAnalysis( std::vector& segmented_blocks, diff --git a/cpp/include/torch_tensorrt/torch_tensorrt.h b/cpp/include/torch_tensorrt/torch_tensorrt.h index 66706db791..70dea51bc7 100644 --- a/cpp/include/torch_tensorrt/torch_tensorrt.h +++ b/cpp/include/torch_tensorrt/torch_tensorrt.h @@ -14,6 +14,7 @@ #include #include #include +#include "torch/custom_class.h" // Just include the .h? #ifndef DOXYGEN_SHOULD_SKIP_THIS @@ -363,7 +364,7 @@ class TORCHTRT_API TensorFormat { * signifying a static input shape or a set of three input shapes representing * the min, optiminal and max input shapes allowed for the engine. 
*/ -struct TORCHTRT_API Input { +struct TORCHTRT_API Input : torch::CustomClassHolder{ /// Minimum acceptable input size into the engine std::vector min_shape; /// Optimal input size into the engine (size optimized for given kernels accept any size in min max range) @@ -378,6 +379,7 @@ struct TORCHTRT_API Input { /// Expected tensor format for the input TensorFormat format; + Input() {} /** * @brief Construct a new Input spec object for static input size from * vector, optional arguments allow the user to configure expected input shape @@ -512,6 +514,16 @@ struct TORCHTRT_API Input { bool input_is_dynamic; }; +/** + * @brief A struct to hold complex inputs + * + * This struct can either hold a complex inputs of shape or a flattened one, + */ +struct TORCHTRT_API GraphInputs { + torch::jit::IValue input_signature; // nested Input, full input spec + std::vector inputs; // flatten input spec +}; + /** * @brief Get the build information for the library including the dependency * versions @@ -579,18 +591,22 @@ struct TORCHTRT_API CompileSpec { * * @param inputs */ - CompileSpec(std::vector inputs) : inputs(std::move(inputs)) {} - - // Defaults should reflect TensorRT defaults for BuilderConfig + CompileSpec(std::vector inputs); /** - * @brief Specifications for inputs to the engine, can either be a single size or a range defined by min, opt and max - * sizes Users can also specify expected input type as well as tensor memory format + * @brief Construct a new Extra Info object from IValue. + * The IValue store a complex Input * - * Order in vector should match call order for the function + * @param input_signature */ - std::vector inputs; + CompileSpec(torch::jit::IValue input_signature); + // Defaults should reflect TensorRT defaults for BuilderConfig + /** + * @brief Specifications for inputs to the engine, can store a IValue which has stored complex Input + * or a flatened Input + */ + GraphInputs graph_inputs; /** * @brief The set of precisions TensorRT is allowed to use for kernels during compilation * diff --git a/cpp/src/compile_spec.cpp b/cpp/src/compile_spec.cpp index 2881887aea..1fb4c56a98 100644 --- a/cpp/src/compile_spec.cpp +++ b/cpp/src/compile_spec.cpp @@ -18,18 +18,67 @@ torchtrt::core::runtime::CudaDevice to_internal_cuda_device(Device device); namespace torchscript { CompileSpec::CompileSpec(std::vector> fixed_sizes) { for (auto in : fixed_sizes) { - inputs.push_back(Input(in)); + graph_inputs.inputs.push_back(Input(in)); } } CompileSpec::CompileSpec(std::vector> fixed_sizes) { for (auto in : fixed_sizes) { - inputs.push_back(Input(in)); + graph_inputs.inputs.push_back(Input(in)); + } +} + +CompileSpec::CompileSpec(std::vector inputs) { + graph_inputs.inputs = std::move(inputs); +} + +CompileSpec::CompileSpec(torch::jit::IValue input_signature) { + graph_inputs.input_signature = input_signature; +} + + + +void to_internal_input_signature(torch::jit::IValue input_ivalue, torch::jit::IValue& converted_ivalue) { + if (input_ivalue.isTuple()) { + auto input_tuple = input_ivalue.toTuple(); + std::vector converted_elements; + for (auto item: input_tuple->elements()) { + torch::jit::IValue converted_item; + to_internal_input_signature(item, converted_item); + converted_elements.push_back(converted_item); + auto tuple_ptr = c10::ivalue::Tuple::create(converted_elements); + converted_ivalue = torch::jit::IValue(tuple_ptr); + } + } else if(input_ivalue.isList()) { + auto input_list = input_ivalue.toList().vec(); + c10::TypePtr type = input_list[0].type(); + auto converted_elements = 
c10::impl::GenericList(type); + for (auto item: input_list) { + torch::jit::IValue converted_item; + to_internal_input_signature(item, converted_item); + converted_elements.push_back(converted_item); + } + converted_ivalue = torch::jit::IValue(converted_elements); + } else if(input_ivalue.isCustomClass()) { + torchtrt::core::ir::Input cur_input = to_internal_input(*(input_ivalue.toCustomClass())); + converted_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(cur_input))); + } +} + +torchtrt::core::CompileSpec init_compile_spec(CompileSpec external) { + if (external.graph_inputs.inputs.size() > 0) { + torchtrt::core::CompileSpec internal(to_vec_internal_inputs(external.graph_inputs.inputs)); + return internal; + } else { + torch::jit::IValue converted_input_signature; + to_internal_input_signature(external.graph_inputs.input_signature, converted_input_signature); + torchtrt::core::CompileSpec internal(converted_input_signature); + return internal; } } torchtrt::core::CompileSpec to_internal_compile_spec(CompileSpec external) { - torchtrt::core::CompileSpec internal(to_vec_internal_inputs(external.inputs)); + torchtrt::core::CompileSpec internal = init_compile_spec(external); for (auto p : external.enabled_precisions) { internal.convert_info.engine_settings.enabled_precisions.insert(toTRTDataType(p)); diff --git a/cpp/src/torch_tensorrt.cpp b/cpp/src/torch_tensorrt.cpp index 42b44833de..93813190ab 100644 --- a/cpp/src/torch_tensorrt.cpp +++ b/cpp/src/torch_tensorrt.cpp @@ -52,4 +52,7 @@ void set_device(const int gpu_id) { // Want to export a much simpler (non CUDA header dependent) API torch_tensorrt::core::set_device(gpu_id); } + +static auto tensorrt_input_container = + torch::class_("_torch_tensorrt", "Input").def(torch::init<>()); } // namespace torch_tensorrt diff --git a/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp b/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp index 9165b21185..0eb6fba2de 100644 --- a/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp +++ b/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp @@ -23,6 +23,13 @@ void RegisterTRTCompileSpec() { ADD_FIELD_GET_SET_REGISTRATION(TRTInputRangeTSRegistration, torch_tensorrt::pyapi::Input, input_is_dynamic); ADD_FIELD_GET_SET_REGISTRATION(TRTInputRangeTSRegistration, torch_tensorrt::pyapi::Input, explicit_set_dtype); + static auto TORCHTRT_UNUSED TRTInputSignatureTSRegistration = + torch::class_("tensorrt", "_InputSignature") + .def(torch::init<>()) + .def("__str__", &torch_tensorrt::pyapi::InputSignature::to_str); + + ADD_FIELD_GET_SET_REGISTRATION(TRTInputSignatureTSRegistration, torch_tensorrt::pyapi::InputSignature, signature_ivalue); + static auto TORCHTRT_UNUSED TRTDeviceTSRegistration = torch::class_("tensorrt", "_Device") .def(torch::init<>()) @@ -49,6 +56,7 @@ void RegisterTRTCompileSpec() { torch::class_("tensorrt", "CompileSpec") .def(torch::init<>()) .def("_append_input", &torch_tensorrt::pyapi::CompileSpec::appendInput) + .def("_set_input_signature", &torch_tensorrt::pyapi::CompileSpec::setInputSignature) .def("_set_precisions", &torch_tensorrt::pyapi::CompileSpec::setPrecisions) .def("_set_device", &torch_tensorrt::pyapi::CompileSpec::setDeviceIntrusive) .def("_set_torch_fallback", &torch_tensorrt::pyapi::CompileSpec::setTorchFallbackIntrusive) diff --git a/py/torch_tensorrt/csrc/tensorrt_classes.cpp b/py/torch_tensorrt/csrc/tensorrt_classes.cpp index 5aeac3b6d6..9eb58b3e73 100644 --- a/py/torch_tensorrt/csrc/tensorrt_classes.cpp +++ b/py/torch_tensorrt/csrc/tensorrt_classes.cpp @@ 
-104,6 +104,12 @@ std::string Input::to_str() { return ss.str(); } +std::string InputSignature::to_str() { + std::stringstream ss; + ss << signature_ivalue; + return ss.str(); +} + std::string to_str(DeviceType value) { switch (value) { case DeviceType::kDLA: @@ -184,13 +190,63 @@ std::string TorchFallback::to_str() { return ss.str(); } -core::CompileSpec CompileSpec::toInternalCompileSpec() { - std::vector internal_inputs; - for (auto i : inputs) { - internal_inputs.push_back(i.toInternalInput()); +void to_internal_input_signature(torch::jit::IValue input_ivalue, torch::jit::IValue& converted_ivalue) { + if (input_ivalue.isTuple()) { + auto input_tuple = input_ivalue.toTuple(); + std::vector converted_elements; + for (auto item: input_tuple->elements()) { + torch::jit::IValue converted_item; + to_internal_input_signature(item, converted_item); + converted_elements.push_back(converted_item); + auto tuple_ptr = c10::ivalue::Tuple::create(converted_elements); + converted_ivalue = torch::jit::IValue(tuple_ptr); + } + } else if(input_ivalue.isList()) { + auto input_list = input_ivalue.toList().vec(); + c10::TypePtr type = input_list[0].type(); + auto converted_elements = c10::impl::GenericList(type); + for (auto item: input_list) { + torch::jit::IValue converted_item; + to_internal_input_signature(item, converted_item); + converted_elements.push_back(converted_item); + } + converted_ivalue = torch::jit::IValue(converted_elements); + } else if(input_ivalue.isCustomClass()) { + core::ir::Input cur_input = (*(input_ivalue.toCustomClass())).toInternalInput(); + converted_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(cur_input))); + } else if(input_ivalue.isPyObject()) { + auto py_object_holder = input_ivalue.toPyObjectHolder(); + auto infer_type = py_object_holder->tryToInferType(); + auto type = infer_type.type(); + torch::jit::IValue ival = py_object_holder->toIValue(type); + torch::jit::IValue converted_item; + to_internal_input_signature(ival, converted_item); + converted_ivalue = torch::jit::IValue(converted_item); + } else { + LOG_ERROR("Unknown input spec type"); + } +} + +core::CompileSpec init_compile_spec(CompileSpec external) { + if (external.inputs.size() > 0) { + LOG_DEBUG("init_compile_spec with input vector"); + std::vector internal_inputs; + for (auto i : external.inputs) { + internal_inputs.push_back(i.toInternalInput()); + } + core::CompileSpec internal(internal_inputs); + return internal; + } else { + LOG_DEBUG("init_compile_spec with input signature"); + torch::jit::IValue converted_input_signature; + to_internal_input_signature(external.input_signature.signature_ivalue, converted_input_signature); + core::CompileSpec internal(converted_input_signature); + return internal; } +} - auto info = core::CompileSpec(internal_inputs); +core::CompileSpec CompileSpec::toInternalCompileSpec() { + core::CompileSpec info = init_compile_spec(*this); for (auto p : enabled_precisions) { info.convert_info.engine_settings.enabled_precisions.insert(toTRTDataType(p)); @@ -237,16 +293,20 @@ core::CompileSpec CompileSpec::toInternalCompileSpec() { std::string CompileSpec::stringify() { std::stringstream ss; ss << "TensorRT Compile Spec: {" << std::endl; - ss << " \"Inputs\": [" << std::endl; - for (auto i : inputs) { - ss << i.to_str(); + if (inputs.size() > 0) { + ss << " \"Inputs\": [" << std::endl; + for (auto i : inputs) { + ss << i.to_str(); + } + ss << " ]" << std::endl; + } else { + ss << " \"Input Signature\": " << input_signature.to_str() << std::endl; } - ss << " ]" << 
std::endl; - ss << " \"Enabled Precision\": [" << std::endl; + ss << " \"Enabled Precision\": ["; for (auto p : enabled_precisions) { - ss << to_str(p); + ss << to_str(p) << ", " ; } - ss << " ]" << std::endl; + ss << "]" << std::endl; ss << " \"TF32 Disabled\": " << disable_tf32 << std::endl; ss << " \"Sparsity\": " << sparse_weights << std::endl; ss << " \"Refit\": " << refit << std::endl; diff --git a/py/torch_tensorrt/csrc/tensorrt_classes.h b/py/torch_tensorrt/csrc/tensorrt_classes.h index b615022bd0..d3b22740c2 100644 --- a/py/torch_tensorrt/csrc/tensorrt_classes.h +++ b/py/torch_tensorrt/csrc/tensorrt_classes.h @@ -57,6 +57,12 @@ struct Input : torch::CustomClassHolder { std::string to_str(); }; +struct InputSignature : torch::CustomClassHolder { + torch::jit::IValue signature_ivalue; // nested Input, full input spec + ADD_FIELD_GET_SET(signature_ivalue, torch::jit::IValue); + std::string to_str(); +}; + enum DeviceType : int8_t { kGPU, kDLA, @@ -119,6 +125,10 @@ struct CompileSpec : torch::CustomClassHolder { inputs.push_back(*ir); } + void setInputSignature(const c10::intrusive_ptr& is) { + input_signature = *is; + } + void setPrecisions(const std::vector& precisions_raw) { for (auto p : precisions_raw) { TORCHTRT_CHECK(p >= 0 && p <= static_cast(DataType::kBool), "Invalid enum value for field"); @@ -158,6 +168,7 @@ struct CompileSpec : torch::CustomClassHolder { ADD_FIELD_GET_SET(ptq_calibrator, nvinfer1::IInt8Calibrator*); std::vector inputs; + InputSignature input_signature; nvinfer1::IInt8Calibrator* ptq_calibrator = nullptr; std::set enabled_precisions = {}; bool sparse_weights = false; diff --git a/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp b/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp index 74a8b72711..6247789a93 100644 --- a/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp +++ b/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp @@ -2,6 +2,7 @@ #include "pybind11/stl.h" #include "Python.h" +#include "ATen/core/jit_type.h" #include "core/compiler.h" #include "core/conversion/conversion.h" #include "tensorrt_classes.h" @@ -178,6 +179,15 @@ PYBIND11_MODULE(_C, m) { .def_readwrite("dtype", &Input::dtype) .def_readwrite("format", &Input::format); + py::class_(m, "InputSignature") + .def(pybind11::init([](py::object py_obj) { + InputSignature input_signature; + input_signature.signature_ivalue = torch::jit::toIValue(std::move(py_obj), c10::PyObjectType::get(), c10::nullopt); + return input_signature; + })) + .def("__str__", &InputSignature::to_str) + .def_readwrite("_signature_ivalue", &InputSignature::signature_ivalue); + py::enum_(m, "dtype", "Enum to specifiy operating precision for engine execution") .value("float", DataType::kFloat, "32 bit floating point number") .value("float32", DataType::kFloat, "32 bit floating point number") @@ -292,6 +302,7 @@ PYBIND11_MODULE(_C, m) { .def("__str__", &torch_tensorrt::pyapi::CompileSpec::stringify) .def("_get_calibrator_handle", &CompileSpec::getPTQCalibratorHandle, "[Internal] gets a handle from a calibrator") .def_readwrite("inputs", &CompileSpec::inputs) + .def_readwrite("input_signature", &CompileSpec::input_signature) .def_readwrite("enabled_precisions", &CompileSpec::enabled_precisions) .def_readwrite("ptq_calibrator", &CompileSpec::ptq_calibrator) .def_readwrite("refit", &CompileSpec::refit) diff --git a/py/torch_tensorrt/ts/_compile_spec.py b/py/torch_tensorrt/ts/_compile_spec.py index 4c7b8b5b5d..0eb8a1cdce 100644 --- a/py/torch_tensorrt/ts/_compile_spec.py +++ b/py/torch_tensorrt/ts/_compile_spec.py @@ -5,10 +5,22 @@ from 
torch_tensorrt import _enums from torch_tensorrt._Input import Input from torch_tensorrt._Device import Device - +from torch_tensorrt.logging import Level, log +from typing import Tuple, List, Dict import warnings +def _internal_input_to_torch_class_input(i: _C.Input) -> torch.classes.tensorrt._Input: + clone = torch.classes.tensorrt._Input() + clone._set_min(i.min) + clone._set_opt(i.opt) + clone._set_max(i.max) + clone._set_dtype(i.dtype) + clone._set_format(i.format) + clone._set_input_is_dynamic(i.input_is_dynamic) + clone._set_explicit_set_dtype(i._explicit_set_dtype) + return clone + def _supported_input_size_type(input_size: Any) -> bool: if isinstance(input_size, torch.Size): return True @@ -156,15 +168,30 @@ def _parse_torch_fallback(fallback_info: Dict[str, Any]) -> _ts_C.TorchFallback: return info +def _parse_input_signature(input_signature: Any): + if isinstance(input_signature, tuple): + input_list = [] + for item in input_signature: + input = _parse_input_signature(item) + input_list.append(input) + return tuple(input_list) + elif isinstance(input_signature, list): + input_list = [] + for item in input_signature: + input = _parse_input_signature(item) + input_list.append(input) + return input_list + elif isinstance(input_signature, Input) or isinstance(input_signature, torch.Tensor): + i = Input._from_tensor(input_signature) if isinstance(input_signature, torch.Tensor) else input_signature + clone = _internal_input_to_torch_class_input(i._to_internal()) + return clone + else: + raise KeyError("Input signature contains an unsupported type {}".format(type(input_signature))) def _parse_compile_spec(compile_spec: Dict[str, Any]) -> _ts_C.CompileSpec: info = _ts_C.CompileSpec() - if "inputs" not in compile_spec: - raise KeyError( - "Module input definitions are requried to compile module. Provide a list of torch_tensorrt.Input keyed to \"inputs\" in the compile spec" - ) - if "inputs" in compile_spec: + if len(compile_spec["inputs"]) > 0: if not all([isinstance(i, torch.Tensor) or isinstance(i, Input) for i in compile_spec["inputs"]]): raise KeyError("Input specs should be either torch_tensorrt.Input or torch.Tensor, found types: {}".format( [type(i) for i in compile_spec["inputs"]])) @@ -172,7 +199,15 @@ def _parse_compile_spec(compile_spec: Dict[str, Any]) -> _ts_C.CompileSpec: inputs = [Input._from_tensor(i) if isinstance(i, torch.Tensor) else i for i in compile_spec["inputs"]] info.inputs = [i._to_internal() for i in inputs] - assert (len(info.inputs) > 0), "Require at least one input definition to compile model" + elif compile_spec["input_signature"] is not None: + log(Level.Warning, "Input signature parsing is an experimental feature, behavior and APIs may change") + signature = _parse_input_signature(compile_spec["input_signature"]) + info.input_signature = _C.InputSignature(signature) # py_object + + else: + raise KeyError( + "Module input definitions are requried to compile module. 
Provide a list of torch_tensorrt.Input keyed to \"inputs\" in the compile spec" + ) if "enabled_precisions" in compile_spec: info.enabled_precisions = _parse_enabled_precisions(compile_spec["enabled_precisions"]) @@ -230,10 +265,13 @@ def _parse_compile_spec(compile_spec: Dict[str, Any]) -> _ts_C.CompileSpec: if "torch_fallback" in compile_spec: info.torch_fallback = _parse_torch_fallback(compile_spec["torch_fallback"]) + log(Level.Debug, str(info)) + return info def TensorRTCompileSpec(inputs=[], + input_signature=None, device=Device._current_device(), disable_tf32=False, sparse_weights=False, @@ -288,6 +326,7 @@ def TensorRTCompileSpec(inputs=[], compile_spec = { "inputs": inputs, + "input_signature": input_signature, "device": device, "disable_tf32": disable_tf32, # Force FP32 layers to use traditional as FP32 format vs the default behavior of rounding the inputs to 10-bit mantissas before multiplying, but accumulates the sum using 23-bit mantissas @@ -310,16 +349,11 @@ def TensorRTCompileSpec(inputs=[], backend_spec = torch.classes.tensorrt.CompileSpec() for i in parsed_spec.inputs: - clone = torch.classes.tensorrt._Input() - clone._set_min(i.min) - clone._set_opt(i.opt) - clone._set_max(i.max) - clone._set_dtype(i.dtype) - clone._set_format(i.format) - clone._set_input_is_dynamic(i.input_is_dynamic) - clone._set_explicit_set_dtype(i._explicit_set_dtype) + clone = _internal_input_to_torch_class_input(i) backend_spec._append_input(clone) + backend_spec._set_input_signature(parsed_spec.input_signature) + d = torch.classes.tensorrt._Device() d._set_device_type(int(parsed_spec.device.device_type)) d._set_gpu_id(parsed_spec.device.gpu_id) diff --git a/py/torch_tensorrt/ts/_compiler.py b/py/torch_tensorrt/ts/_compiler.py index 83704a4b6c..508cb8fdd0 100644 --- a/py/torch_tensorrt/ts/_compiler.py +++ b/py/torch_tensorrt/ts/_compiler.py @@ -11,6 +11,7 @@ def compile(module: torch.jit.ScriptModule, inputs=[], + input_signature=None, device=Device._current_device(), disable_tf32=False, sparse_weights=False, @@ -94,6 +95,7 @@ def compile(module: torch.jit.ScriptModule, spec = { "inputs": inputs, + "input_signature": input_signature, "device": device, "disable_tf32": disable_tf32, # Force FP32 layers to use traditional as FP32 format "sparse_weights": sparse_weights, #Enable sparsity for convolution and fully connected layers. 
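A minimal usage sketch of the experimental input_signature argument added to the Python frontend above, assuming a scripted module whose forward() takes a tuple of two tensors. TupleInputModule is a placeholder name, and the nesting convention (an outer tuple over forward()'s positional arguments) follows the pattern used in the new collection tests further down; treat the exact structure as illustrative rather than definitive.

import torch
import torch_tensorrt

# Placeholder module: forward(self, z: Tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor
mod = torch.jit.script(TupleInputModule().eval().cuda())

trt_mod = torch_tensorrt.ts.compile(
    mod,
    # Describe grouped inputs with a nested spec instead of a flat "inputs" list;
    # at most two levels of nesting of torch_tensorrt.Input specs are supported.
    input_signature=((torch_tensorrt.Input(shape=(1, 3, 512, 512), dtype=torch.half),
                      torch_tensorrt.Input(shape=(1, 3, 512, 512), dtype=torch.half)),),
    enabled_precisions={torch.half},
    require_full_compilation=False,
    min_block_size=3,
)

x = torch.randn(1, 3, 512, 512, device="cuda").half()
out = trt_mod((x, x))
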
diff --git a/tests/core/partitioning/test_resolve_nontensor_inputs.cpp b/tests/core/partitioning/test_resolve_nontensor_inputs.cpp index fea202fc65..2d0255f130 100644 --- a/tests/core/partitioning/test_resolve_nontensor_inputs.cpp +++ b/tests/core/partitioning/test_resolve_nontensor_inputs.cpp @@ -116,11 +116,11 @@ TEST(Partitioning, ResolveNonTensorInputsCorrectly) { inputs.push_back(torch_tensorrt::core::ir::Input({16, 3, 3, 3})); inputs.push_back(torch_tensorrt::core::ir::Input({16})); - std::unordered_map inputs_map; - std::unordered_map> input_types; + std::unordered_map> inputs_map; + std::unordered_map>> input_types; for (size_t i = 0; i < g->inputs().size(); ++i) { - inputs_map.insert({g->inputs()[i], inputs[i]}); - input_types.insert({g->inputs()[i], {at::kFloat}}); + inputs_map.insert({g->inputs()[i], {inputs[i]}}); + input_types.insert({g->inputs()[i], {{at::kFloat}}}); } auto input_ivalues_map = torch_tensorrt::core::partitioning::generateRandomInputs(inputs_map, input_types); std::unordered_map fallback_nodes; @@ -175,11 +175,11 @@ TEST(Partitioning, ResolveTensorListInputsInTrtCorrectly) { inputs.push_back(torch_tensorrt::core::ir::Input({16, 6, 3, 3})); inputs.push_back(torch_tensorrt::core::ir::Input({16})); - std::unordered_map inputs_map; - std::unordered_map> input_types; + std::unordered_map> inputs_map; + std::unordered_map>> input_types; for (size_t i = 0; i < g->inputs().size(); ++i) { - inputs_map.insert({g->inputs()[i], inputs[i]}); - input_types.insert({g->inputs()[i], {at::kFloat}}); + inputs_map.insert({g->inputs()[i], {inputs[i]}}); + input_types.insert({g->inputs()[i], {{at::kFloat}}}); } auto input_ivalues_map = torch_tensorrt::core::partitioning::generateRandomInputs(inputs_map, input_types); std::unordered_map fallback_nodes; diff --git a/tests/core/partitioning/test_shape_analysis.cpp b/tests/core/partitioning/test_shape_analysis.cpp index 7bcabc0d51..151a6e75ad 100644 --- a/tests/core/partitioning/test_shape_analysis.cpp +++ b/tests/core/partitioning/test_shape_analysis.cpp @@ -59,11 +59,11 @@ TEST(Partitioning, InferSequentialModelSegmentedBlockShapeCorrectly) { inputs.push_back(torch_tensorrt::core::ir::Input({8, 16, 3, 3})); inputs.push_back(torch_tensorrt::core::ir::Input({8})); - std::unordered_map inputs_map; - std::unordered_map> input_types; + std::unordered_map> inputs_map; + std::unordered_map>> input_types; for (size_t i = 0; i < g->inputs().size(); ++i) { - inputs_map.insert({g->inputs()[i], inputs[i]}); - input_types.insert({g->inputs()[i], {at::kFloat}}); + inputs_map.insert({g->inputs()[i], {inputs[i]}}); + input_types.insert({g->inputs()[i], {{at::kFloat}}}); } auto input_ivalues_map = torch_tensorrt::core::partitioning::generateRandomInputs(inputs_map, input_types); std::unordered_map fallback_nodes; @@ -110,11 +110,11 @@ TEST(Partitioning, InferBranchModelSegmentedBlockShapeCorrectly) { inputs.push_back(torch_tensorrt::core::ir::Input({16, 32, 3, 3})); inputs.push_back(torch_tensorrt::core::ir::Input({16})); - std::unordered_map inputs_map; - std::unordered_map> input_types; + std::unordered_map> inputs_map; + std::unordered_map>> input_types; for (size_t i = 0; i < g->inputs().size(); ++i) { - inputs_map.insert({g->inputs()[i], inputs[i]}); - input_types.insert({g->inputs()[i], {at::kFloat}}); + inputs_map.insert({g->inputs()[i], {inputs[i]}}); + input_types.insert({g->inputs()[i], {{at::kFloat}}}); } auto input_ivalues_map = torch_tensorrt::core::partitioning::generateRandomInputs(inputs_map, input_types); std::unordered_map 
fallback_nodes; diff --git a/tests/cpp/BUILD b/tests/cpp/BUILD index 3d69afba95..2d545dc8f1 100644 --- a/tests/cpp/BUILD +++ b/tests/cpp/BUILD @@ -18,7 +18,8 @@ test_suite( ":test_multiple_registered_engines", ":test_serialization", ":test_module_fallback", - ":test_example_tensors" + ":test_example_tensors", + ":test_collection" ], ) @@ -32,7 +33,8 @@ test_suite( ":test_multiple_registered_engines", ":test_serialization", ":test_module_fallback", - ":test_example_tensors" + ":test_example_tensors", + ":test_collection" ], ) @@ -122,6 +124,20 @@ cc_test( }) ) +cc_test( + name = "test_collection", + srcs = ["test_collection.cpp"], + data = [ + "//tests/modules:jit_models", + ], + deps = [ + "//tests/util", + "@googletest//:gtest_main", + ] + select({ + ":use_pre_cxx11_abi": ["@libtorch_pre_cxx11_abi//:libtorch"], + "//conditions:default": ["@libtorch//:libtorch"], + }) +) cc_test( name = "test_compiled_modules", srcs = ["test_compiled_modules.cpp"], diff --git a/tests/cpp/test_collection.cpp b/tests/cpp/test_collection.cpp new file mode 100644 index 0000000000..c269ebac17 --- /dev/null +++ b/tests/cpp/test_collection.cpp @@ -0,0 +1,363 @@ +#include +#include +#include "gtest/gtest.h" +#include "tests/util/util.h" +#include "torch/script.h" +#include "torch_tensorrt/torch_tensorrt.h" + + +TEST(CppAPITests, TestCollectionStandardTensorInput) { + + std::string path = "tests/modules/standard_tensor_input.jit.pt"; + torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); + std::vector inputs; + inputs.push_back(in0); + inputs.push_back(in0); + + torch::jit::Module mod; + try { + // Deserialize the ScriptModule from a file using torch::jit::load(). + mod = torch::jit::load(path); + } catch (const c10::Error& e) { + std::cerr << "error loading the model\n"; + } + mod.eval(); + mod.to(torch::kCUDA); + + + std::vector inputs_; + + for (auto in : inputs) { + inputs_.push_back(torch::jit::IValue(in.clone())); + } + + auto out = mod.forward(inputs_); + LOG_DEBUG("Finish torchscirpt forward"); + + std::vector input_range; + input_range.push_back({in0.sizes(), torch::kF16}); + input_range.push_back({in0.sizes(), torch::kF16}); + torch_tensorrt::ts::CompileSpec compile_settings(input_range); + compile_settings.require_full_compilation = true; + compile_settings.min_block_size = 3; + + // // FP16 execution + compile_settings.enabled_precisions = {torch::kHalf}; + // // Compile module + auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); + LOG_DEBUG("Finish compile"); + auto trt_out = trt_mod.forward(inputs_); + + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTensor(), trt_out.toTensor(), 1e-5)); +} + +TEST(CppAPITests, TestCollectionTupleInput) { + + std::string path = "tests/modules/tuple_input.jit.pt"; + torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); + + torch::jit::Module mod; + try { + // Deserialize the ScriptModule from a file using torch::jit::load(). 
+ mod = torch::jit::load(path); + } catch (const c10::Error& e) { + std::cerr << "error loading the model\n"; + } + mod.eval(); + mod.to(torch::kCUDA); + + std::vector complex_inputs, complex_inputs_list; + std::tuple input_tuple(in0, in0); + + complex_inputs.push_back(input_tuple); + + auto out = mod.forward(complex_inputs); + LOG_DEBUG("Finish torchscirpt forward"); + + auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kHalf); + + auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); + + + std::tuple input_shape_tuple(input_shape_ivalue, input_shape_ivalue); + + torch::jit::IValue complex_input_shape(input_shape_tuple); + std::tuple input_tuple2(complex_input_shape); + torch::jit::IValue complex_input_shape2(input_tuple2); + + + auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); + compile_settings.require_full_compilation = false; + compile_settings.min_block_size = 3; + + // // FP16 execution + compile_settings.enabled_precisions = {torch::kHalf}; + // // Compile module + auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); + LOG_DEBUG("Finish compile"); + auto trt_out = trt_mod.forward(complex_inputs); + + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTensor(), trt_out.toTensor(), 1e-5)); +} + + +TEST(CppAPITests, TestCollectionListInput) { + + std::string path = "tests/modules/list_input.jit.pt"; + torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); + std::vector inputs; + inputs.push_back(in0); + + torch::jit::Module mod; + try { + // Deserialize the ScriptModule from a file using torch::jit::load(). + mod = torch::jit::load(path); + } catch (const c10::Error& e) { + std::cerr << "error loading the model\n"; + } + mod.eval(); + mod.to(torch::kCUDA); + + + std::vector inputs_; + + for (auto in : inputs) { + inputs_.push_back(torch::jit::IValue(in.clone())); + } + + std::vector complex_inputs; + auto input_list = c10::impl::GenericList(c10::TensorType::get()); + input_list.push_back(inputs_[0]); + input_list.push_back(inputs_[0]); + + torch::jit::IValue input_list_ivalue = torch::jit::IValue(input_list); + + complex_inputs.push_back(input_list_ivalue); + + + auto out = mod.forward(complex_inputs); + LOG_DEBUG("Finish torchscirpt forward"); + + auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kHalf); + auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); + + c10::TypePtr elementType = input_shape_ivalue.type(); + auto list = c10::impl::GenericList(elementType); + list.push_back(input_shape_ivalue); + list.push_back(input_shape_ivalue); + + + torch::jit::IValue complex_input_shape(list); + std::tuple input_tuple2(complex_input_shape); + torch::jit::IValue complex_input_shape2(input_tuple2); + + auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); + compile_settings.require_full_compilation = false; + compile_settings.min_block_size = 3; + compile_settings.torch_executed_ops.push_back("aten::__getitem__"); + + // // FP16 execution + compile_settings.enabled_precisions = {torch::kHalf}; + // // Compile module + auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); + LOG_DEBUG("Finish compile"); + auto trt_out = trt_mod.forward(complex_inputs); + + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTensor(), trt_out.toTensor(), 1e-5)); +} + + +TEST(CppAPITests, TestCollectionTupleInputOutput) { + + 
std::string path = "tests/modules/tuple_input_output.jit.pt"; + + torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); + + torch::jit::Module mod; + try { + // Deserialize the ScriptModule from a file using torch::jit::load(). + mod = torch::jit::load(path); + } catch (const c10::Error& e) { + std::cerr << "error loading the model\n"; + } + mod.eval(); + mod.to(torch::kCUDA); + + + std::vector complex_inputs, complex_inputs_list; + std::tuple input_tuple(in0, in0); + + complex_inputs.push_back(input_tuple); + + auto out = mod.forward(complex_inputs); + LOG_DEBUG("Finish torchscirpt forward"); + + auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kHalf); + + auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); + + + std::tuple input_shape_tuple(input_shape_ivalue, input_shape_ivalue); + + torch::jit::IValue complex_input_shape(input_shape_tuple); + std::tuple input_tuple2(complex_input_shape); + torch::jit::IValue complex_input_shape2(input_tuple2); + // torch::jit::IValue complex_input_shape(list); + + auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); + compile_settings.require_full_compilation = false; + compile_settings.min_block_size = 3; + + // compile_settings.torch_executed_ops.push_back("prim::TupleConstruct"); + + // // FP16 execution + compile_settings.enabled_precisions = {torch::kHalf}; + // // Compile module + auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); + LOG_DEBUG("Finish compile"); + auto trt_out = trt_mod.forward(complex_inputs); + + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTuple()->elements()[0].toTensor(), trt_out.toTuple()->elements()[0].toTensor(), 1e-5)); + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTuple()->elements()[1].toTensor(), trt_out.toTuple()->elements()[1].toTensor(), 1e-5)); +} + + +TEST(CppAPITests, TestCollectionListInputOutput) { + + std::string path = "tests/modules/list_input_output.jit.pt"; + torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); + std::vector inputs; + inputs.push_back(in0); + + torch::jit::Module mod; + try { + // Deserialize the ScriptModule from a file using torch::jit::load(). 
+ mod = torch::jit::load(path); + } catch (const c10::Error& e) { + std::cerr << "error loading the model\n"; + } + mod.eval(); + mod.to(torch::kCUDA); + + + std::vector inputs_; + + for (auto in : inputs) { + inputs_.push_back(torch::jit::IValue(in.clone())); + } + + std::vector complex_inputs; + auto input_list = c10::impl::GenericList(c10::TensorType::get()); + input_list.push_back(inputs_[0]); + input_list.push_back(inputs_[0]); + + torch::jit::IValue input_list_ivalue = torch::jit::IValue(input_list); + + complex_inputs.push_back(input_list_ivalue); + + + auto out = mod.forward(complex_inputs); + LOG_DEBUG("Finish torchscirpt forward"); + + auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kHalf); + + auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); + + + c10::TypePtr elementType = input_shape_ivalue.type(); + auto list = c10::impl::GenericList(elementType); + list.push_back(input_shape_ivalue); + list.push_back(input_shape_ivalue); + + + torch::jit::IValue complex_input_shape(list); + std::tuple input_tuple2(complex_input_shape); + torch::jit::IValue complex_input_shape2(input_tuple2); + + auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); + compile_settings.require_full_compilation = false; + compile_settings.min_block_size = 3; + + // Need to skip the conversion of __getitem__ and ListConstruct + compile_settings.torch_executed_ops.push_back("aten::__getitem__"); + + // // FP16 execution + compile_settings.enabled_precisions = {torch::kHalf}; + // // Compile module + auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); + LOG_DEBUG("Finish compile"); + auto trt_out = trt_mod.forward(complex_inputs); + + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toList().vec()[0].toTensor(), trt_out.toList().vec()[0].toTensor(), 1e-5)); + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toList().vec()[1].toTensor(), trt_out.toList().vec()[1].toTensor(), 1e-5)); +} + + +TEST(CppAPITests, TestCollectionComplexModel) { + + std::string path = "tests/modules/complex_model.jit.pt"; + torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); + std::vector inputs; + inputs.push_back(in0); + + torch::jit::Module mod; + try { + // Deserialize the ScriptModule from a file using torch::jit::load(). 
+ mod = torch::jit::load(path); + } catch (const c10::Error& e) { + std::cerr << "error loading the model\n"; + } + mod.eval(); + mod.to(torch::kCUDA); + + + std::vector inputs_; + + for (auto in : inputs) { + inputs_.push_back(torch::jit::IValue(in.clone())); + } + + std::vector complex_inputs; + auto input_list = c10::impl::GenericList(c10::TensorType::get()); + input_list.push_back(inputs_[0]); + input_list.push_back(inputs_[0]); + + torch::jit::IValue input_list_ivalue = torch::jit::IValue(input_list); + + complex_inputs.push_back(input_list_ivalue); + + + auto out = mod.forward(complex_inputs); + LOG_DEBUG("Finish torchscirpt forward"); + + auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kHalf); + + auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); + + c10::TypePtr elementType = input_shape_ivalue.type(); + auto list = c10::impl::GenericList(elementType); + list.push_back(input_shape_ivalue); + list.push_back(input_shape_ivalue); + + + torch::jit::IValue complex_input_shape(list); + std::tuple input_tuple2(complex_input_shape); + torch::jit::IValue complex_input_shape2(input_tuple2); + + auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); + compile_settings.require_full_compilation = false; + compile_settings.min_block_size = 3; + + // Need to skip the conversion of __getitem__ and ListConstruct + compile_settings.torch_executed_ops.push_back("aten::__getitem__"); + + // // FP16 execution + compile_settings.enabled_precisions = {torch::kHalf}; + // // Compile module + auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); + LOG_DEBUG("Finish compile"); + auto trt_out = trt_mod.forward(complex_inputs); + + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTuple()->elements()[0].toTensor(), trt_out.toTuple()->elements()[0].toTensor(), 1e-5)); + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTuple()->elements()[1].toTensor(), trt_out.toTuple()->elements()[1].toTensor(), 1e-5)); +} \ No newline at end of file diff --git a/tests/modules/custom_models.py b/tests/modules/custom_models.py index 20d501045f..443dcdd11f 100644 --- a/tests/modules/custom_models.py +++ b/tests/modules/custom_models.py @@ -2,6 +2,7 @@ import torch.nn as nn from transformers import BertModel, BertTokenizer, BertConfig import torch.nn.functional as F +from typing import Tuple, List, Dict # Sample Pool Model (for testing plugin serialization) @@ -100,6 +101,66 @@ def forward(self, x, y): z = torch.cat(mod_list) return z +# Collection input/output models +class StandardTensorInput(nn.Module): + def __init__(self): + super(StandardTensorInput, self).__init__() + + def forward(self, x, y): + r = x + y + return r + +class TupleInput(nn.Module): + def __init__(self): + super(TupleInput, self).__init__() + + def forward(self, z: Tuple[torch.Tensor, torch.Tensor]): + r = z[0] + z[1] + return r + +class ListInput(nn.Module): + def __init__(self): + super(ListInput, self).__init__() + + def forward(self, z: List[torch.Tensor]): + r = z[0] + z[1] + return r + +class TupleInputOutput(nn.Module): + def __init__(self): + super(TupleInputOutput, self).__init__() + + def forward(self, z: Tuple[torch.Tensor, torch.Tensor]): + r1 = z[0] + z[1] + r2 = z[0] - z[1] + r = (r1, r2) + return r + +class ListInputOutput(nn.Module): + def __init__(self): + super(ListInputOutput, self).__init__() + + def forward(self, z: List[torch.Tensor]): + r1 = z[0] + z[1] + r2 = z[0] - z[1] + r = [r1, r2] + return r 
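The typing annotations on forward are what let torch.jit.script preserve the tuple and list structure these models exercise. A minimal sketch of how one of them becomes the serialized TorchScript module the new tests load (tests/modules/hub.py performs the actual export; the output file name here is illustrative):

    import torch
    import custom_models as cm  # the file being modified above

    scripted = torch.jit.script(cm.ListInputOutput().eval())
    scripted.save("list_input_output_scripted.jit.pt")
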
+ +class ListInputTupleOutput(nn.Module): + def __init__(self): + super(ListInputTupleOutput, self).__init__() + self.list_model = ListInputOutput() + self.tuple_model = TupleInputOutput() + + def forward(self, z: List[torch.Tensor]): + r1 = z[0] + z[1] + r2 = z[0] - z[1] + r3 = (r1, r2) + r4 = [r2, r1] + tuple_out = self.tuple_model(r3) + list_out = self.list_model(r4) + r = (tuple_out[1], list_out[0]) + return r def BertModule(): model_name = "bert-base-uncased" diff --git a/tests/modules/hub.py b/tests/modules/hub.py index 48e6b519cb..7d3e03e395 100644 --- a/tests/modules/hub.py +++ b/tests/modules/hub.py @@ -104,6 +104,30 @@ "model": cm.FallbackInplaceOPIf(), "path": "script" }, + "standard_tensor_input": { + "model": cm.StandardTensorInput(), + "path": "script" + }, + "tuple_input": { + "model": cm.TupleInput(), + "path": "script" + }, + "list_input": { + "model": cm.ListInput(), + "path": "script" + }, + "tuple_input_output": { + "model": cm.TupleInputOutput(), + "path": "script" + }, + "list_input_output": { + "model": cm.ListInputOutput(), + "path": "script" + }, + "list_input_tuple_output": { + "model": cm.ListInputTupleOutput(), + "path": "script" + }, "bert_base_uncased": { "model": cm.BertModule(), "path": "trace" @@ -193,5 +217,5 @@ def main(): f.write(record) f.truncate() - -main() +if __name__ == "__main__": + main() diff --git a/tests/modules/requirements.txt b/tests/modules/requirements.txt index d4b5105850..00acec5861 100644 --- a/tests/modules/requirements.txt +++ b/tests/modules/requirements.txt @@ -1,2 +1,3 @@ +torchvision timm==v0.4.12 transformers==4.17.0 diff --git a/tests/py/api/test_collections.py b/tests/py/api/test_collections.py new file mode 100644 index 0000000000..09cb6c4e70 --- /dev/null +++ b/tests/py/api/test_collections.py @@ -0,0 +1,147 @@ +import unittest +import torch_tensorrt as torchtrt +import torch +import torchvision.models as models + +def find_repo_root(max_depth=10): + dir_path = os.path.dirname(os.path.realpath(__file__)) + for i in range(max_depth): + files = os.listdir(dir_path) + if "WORKSPACE" in files: + return dir_path + else: + dir_path = os.path.dirname(dir_path) + + raise RuntimeError("Could not find repo root") + +MODULE_DIR = find_repo_root() + "/tests/modules" + +class TestStandardTensorInput(unittest.TestCase): + + + def test_compile(self): + + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + self.model = torch.jit.load(MODULE_DIR + "/standard_tensor_input.jit.pt").eval().to("cuda") + + compile_spec = { + "inputs": [torchtrt.Input(self.input.shape), + torchtrt.Input(self.input.shape)], + "device": torchtrt.Device("gpu:0"), + "enabled_precisions": {torch.float} + } + + trt_mod = torchtrt.ts.compile(self.model, **compile_spec) + same = (trt_mod(self.input, self.input) - self.model(self.input, self.input)).abs().max() + self.assertTrue(same < 2e-2) + +class TestTupleInput(unittest.TestCase): + + + def test_compile(self): + + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + self.model = torch.jit.load(MODULE_DIR + "/tuple_input.jit.pt").eval().to("cuda") + + compile_spec = { + "input_signature": ((torchtrt.Input(self.input.shape), torchtrt.Input(self.input.shape)),), + "device": torchtrt.Device("gpu:0"), + "enabled_precisions": {torch.float}, + "require_full_compilation": False, + "min_block_size": 3 + } + + trt_mod = torchtrt.ts.compile(self.model, **compile_spec) + same = (trt_mod((self.input, self.input)) - self.model((self.input, self.input))).abs().max() + self.assertTrue(same < 2e-2) + +class 
TestListInput(unittest.TestCase): + + + def test_compile(self): + + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + self.model = torch.jit.load(MODULE_DIR + "/list_input.jit.pt").eval().to("cuda") + + + compile_spec = { + "input_signature": ([torchtrt.Input(self.input.shape), torchtrt.Input(self.input.shape)],), + "device": torchtrt.Device("gpu:0"), + "enabled_precisions": {torch.float}, + "require_full_compilation": False, + "min_block_size": 3 + } + + trt_mod = torchtrt.ts.compile(self.model, **compile_spec) + same = (trt_mod([self.input, self.input]) - self.model([self.input, self.input])).abs().max() + self.assertTrue(same < 2e-2) + +class TestTupleInputOutput(unittest.TestCase): + + def test_compile(self): + + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + self.model = torch.jit.load(MODULE_DIR + "/tuple_input_output.jit.pt").eval().to("cuda") + + + compile_spec = { + "input_signature": ((torchtrt.Input(self.input.shape), torchtrt.Input(self.input.shape)),), + "device": torchtrt.Device("gpu:0"), + "enabled_precisions": {torch.float}, + "require_full_compilation": False, + "min_block_size": 3 + } + + trt_mod = torchtrt.ts.compile(self.model, **compile_spec) + trt_out = trt_mod((self.input, self.input)) + pyt_out = self.model((self.input, self.input)) + results = [(t - p).abs().max() < 2e-2 for (t, p) in zip(trt_out, pyt_out)] + self.assertTrue(all(results)) + +class TestListInputOutput(unittest.TestCase): + + def test_compile(self): + + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + self.model = torch.jit.load(MODULE_DIR + "/list_input_output.jit.pt").eval().to("cuda") + + + compile_spec = { + "input_signature": ([torchtrt.Input(self.input.shape), torchtrt.Input(self.input.shape)],), + "device": torchtrt.Device("gpu:0"), + "enabled_precisions": {torch.float}, + "require_full_compilation": False, + "min_block_size": 3 + } + + trt_mod = torchtrt.ts.compile(self.model, **compile_spec) + trt_out = trt_mod((self.input, self.input)) + pyt_out = self.model((self.input, self.input)) + results = [(t - p).abs().max() < 2e-2 for (t, p) in zip(trt_out, pyt_out)] + self.assertTrue(all(results)) + + +class TestListInputTupleOutput(unittest.TestCase): + + def test_compile(self): + + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + self.model = torch.jit.load(MODULE_DIR + "/list_input_tuple_output.jit.pt").eval().to("cuda") + + + compile_spec = { + "input_signature": ([torchtrt.Input(self.input.shape), torchtrt.Input(self.input.shape)],), + "device": torchtrt.Device("gpu:0"), + "enabled_precisions": {torch.float}, + "require_full_compilation": False, + "min_block_size": 3 + } + + trt_mod = torchtrt.ts.compile(self.model, **compile_spec) + trt_out = trt_mod((self.input, self.input)) + pyt_out = self.model((self.input, self.input)) + results = [(t - p).abs().max() < 2e-2 for (t, p) in zip(trt_out, pyt_out)] + self.assertTrue(all(results)) + +if __name__ == "__main__": + unittest.main() diff --git a/tests/py/model_test_case.py b/tests/py/model_test_case.py index e529f05013..1c772c1faf 100644 --- a/tests/py/model_test_case.py +++ b/tests/py/model_test_case.py @@ -1,7 +1,9 @@ import unittest import torch import torchvision.models as models +import os +REPO_ROOT = os.path.abspath(os.getcwd()) + "/../../" class ModelTestCase(unittest.TestCase): diff --git a/tests/py/requirements.txt b/tests/py/requirements.txt index 0ea1c76a29..e35531e566 100644 --- a/tests/py/requirements.txt +++ b/tests/py/requirements.txt @@ -1,2 +1 @@ -torchvision==0.13.0+cu113 --f 
https://download.pytorch.org/whl/torch_stable.html +torchvision From b26d768605619bd30e9b4c9eb0b88d6566b39a75 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Sat, 23 Jul 2022 18:16:15 -0700 Subject: [PATCH 02/16] tests: fix test model paths Signed-off-by: Naren Dasan --- tests/py/api/test_collections.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/py/api/test_collections.py b/tests/py/api/test_collections.py index 09cb6c4e70..603d44aebb 100644 --- a/tests/py/api/test_collections.py +++ b/tests/py/api/test_collections.py @@ -2,6 +2,7 @@ import torch_tensorrt as torchtrt import torch import torchvision.models as models +import os def find_repo_root(max_depth=10): dir_path = os.path.dirname(os.path.realpath(__file__)) @@ -22,7 +23,7 @@ class TestStandardTensorInput(unittest.TestCase): def test_compile(self): self.input = torch.randn((1, 3, 224, 224)).to("cuda") - self.model = torch.jit.load(MODULE_DIR + "/standard_tensor_input.jit.pt").eval().to("cuda") + self.model = torch.jit.load(MODULE_DIR + "/standard_tensor_input_scripted.jit.pt").eval().to("cuda") compile_spec = { "inputs": [torchtrt.Input(self.input.shape), @@ -41,7 +42,7 @@ class TestTupleInput(unittest.TestCase): def test_compile(self): self.input = torch.randn((1, 3, 224, 224)).to("cuda") - self.model = torch.jit.load(MODULE_DIR + "/tuple_input.jit.pt").eval().to("cuda") + self.model = torch.jit.load(MODULE_DIR + "/tuple_input_scripted.jit.pt").eval().to("cuda") compile_spec = { "input_signature": ((torchtrt.Input(self.input.shape), torchtrt.Input(self.input.shape)),), @@ -61,7 +62,7 @@ class TestListInput(unittest.TestCase): def test_compile(self): self.input = torch.randn((1, 3, 224, 224)).to("cuda") - self.model = torch.jit.load(MODULE_DIR + "/list_input.jit.pt").eval().to("cuda") + self.model = torch.jit.load(MODULE_DIR + "/list_input_scripted.jit.pt").eval().to("cuda") compile_spec = { @@ -81,7 +82,7 @@ class TestTupleInputOutput(unittest.TestCase): def test_compile(self): self.input = torch.randn((1, 3, 224, 224)).to("cuda") - self.model = torch.jit.load(MODULE_DIR + "/tuple_input_output.jit.pt").eval().to("cuda") + self.model = torch.jit.load(MODULE_DIR + "/tuple_input_output_scripted.jit.pt").eval().to("cuda") compile_spec = { @@ -103,7 +104,7 @@ class TestListInputOutput(unittest.TestCase): def test_compile(self): self.input = torch.randn((1, 3, 224, 224)).to("cuda") - self.model = torch.jit.load(MODULE_DIR + "/list_input_output.jit.pt").eval().to("cuda") + self.model = torch.jit.load(MODULE_DIR + "/list_input_output_scripted.jit.pt").eval().to("cuda") compile_spec = { @@ -126,7 +127,7 @@ class TestListInputTupleOutput(unittest.TestCase): def test_compile(self): self.input = torch.randn((1, 3, 224, 224)).to("cuda") - self.model = torch.jit.load(MODULE_DIR + "/list_input_tuple_output.jit.pt").eval().to("cuda") + self.model = torch.jit.load(MODULE_DIR + "/list_input_tuple_output_scripted.jit.pt").eval().to("cuda") compile_spec = { From b2a518383cc043e33bdcb650d35f97fddfff670b Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Sat, 23 Jul 2022 21:35:59 -0700 Subject: [PATCH 03/16] tests: Fix tests Signed-off-by: Naren Dasan --- .circleci/config.yml | 4 ++-- cpp/include/torch_tensorrt/torch_tensorrt.h | 7 +++---- tests/cpp/BUILD | 8 ++++---- .../{test_collection.cpp => test_collections.cpp} | 12 ++++++------ tests/cpp/test_example_tensors.cpp | 4 +++- tests/modules/hub.py | 8 ++++---- 6 files changed, 22 insertions(+), 21 deletions(-) rename tests/cpp/{test_collection.cpp => 
test_collections.cpp} (96%) diff --git a/.circleci/config.yml b/.circleci/config.yml index 77a1fd036f..3eda95d4f0 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -674,7 +674,7 @@ workflows: requires: - build-x86_64-pyt-release - - test-py-ts-x86_64: + - test-py-fx-x86_64: name: test-py-fx-x86_64-pyt-release channel: "release" torch-build: << pipeline.parameters.torch-release-build >> @@ -752,7 +752,7 @@ workflows: requires: - build-x86_64-pyt-release - - test-py-ts-x86_64: + - test-py-fx-x86_64: name: test-py-fx-x86_64-pyt-release channel: "release" torch-build: << pipeline.parameters.torch-release-build >> diff --git a/cpp/include/torch_tensorrt/torch_tensorrt.h b/cpp/include/torch_tensorrt/torch_tensorrt.h index 70dea51bc7..11dc5d74c6 100644 --- a/cpp/include/torch_tensorrt/torch_tensorrt.h +++ b/cpp/include/torch_tensorrt/torch_tensorrt.h @@ -569,7 +569,7 @@ struct TORCHTRT_API CompileSpec { CompileSpec(std::vector> fixed_sizes); /** - * @brief Construct a new Extra Info object + * @brief Construct a new Compile Spec object * Convienence constructor to set fixed input size from c10::ArrayRef's (the * output of tensor.sizes()) describing size of input tensors. Each entry in * the vector represents a input and should be provided in call order. @@ -583,7 +583,7 @@ struct TORCHTRT_API CompileSpec { CompileSpec(std::vector> fixed_sizes); /** - * @brief Construct a new Extra Info object from input ranges. + * @brief Construct a new Compile Spec object from input ranges. * Each entry in the vector represents a input and should be provided in call * order. * @@ -594,8 +594,7 @@ struct TORCHTRT_API CompileSpec { CompileSpec(std::vector inputs); /** - * @brief Construct a new Extra Info object from IValue. - * The IValue store a complex Input + * @brief Construct a new Compile Spec object from IValue which represents the nesting of input tensors for a module. 
* * @param input_signature */ diff --git a/tests/cpp/BUILD b/tests/cpp/BUILD index 2d545dc8f1..8e479e2e0a 100644 --- a/tests/cpp/BUILD +++ b/tests/cpp/BUILD @@ -19,7 +19,7 @@ test_suite( ":test_serialization", ":test_module_fallback", ":test_example_tensors", - ":test_collection" + ":test_collections" ], ) @@ -34,7 +34,7 @@ test_suite( ":test_serialization", ":test_module_fallback", ":test_example_tensors", - ":test_collection" + ":test_collections" ], ) @@ -125,8 +125,8 @@ cc_test( ) cc_test( - name = "test_collection", - srcs = ["test_collection.cpp"], + name = "test_collections", + srcs = ["test_collections.cpp"], data = [ "//tests/modules:jit_models", ], diff --git a/tests/cpp/test_collection.cpp b/tests/cpp/test_collections.cpp similarity index 96% rename from tests/cpp/test_collection.cpp rename to tests/cpp/test_collections.cpp index c269ebac17..df2280b947 100644 --- a/tests/cpp/test_collection.cpp +++ b/tests/cpp/test_collections.cpp @@ -8,7 +8,7 @@ TEST(CppAPITests, TestCollectionStandardTensorInput) { - std::string path = "tests/modules/standard_tensor_input.jit.pt"; + std::string path = "tests/modules/standard_tensor_input_scripted.jit.pt"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); std::vector inputs; inputs.push_back(in0); @@ -53,7 +53,7 @@ TEST(CppAPITests, TestCollectionStandardTensorInput) { TEST(CppAPITests, TestCollectionTupleInput) { - std::string path = "tests/modules/tuple_input.jit.pt"; + std::string path = "tests/modules/tuple_input_scripted.jit.pt"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); torch::jit::Module mod; @@ -103,7 +103,7 @@ TEST(CppAPITests, TestCollectionTupleInput) { TEST(CppAPITests, TestCollectionListInput) { - std::string path = "tests/modules/list_input.jit.pt"; + std::string path = "tests/modules/list_input_scripted.jit.pt"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); std::vector inputs; inputs.push_back(in0); @@ -169,7 +169,7 @@ TEST(CppAPITests, TestCollectionListInput) { TEST(CppAPITests, TestCollectionTupleInputOutput) { - std::string path = "tests/modules/tuple_input_output.jit.pt"; + std::string path = "tests/modules/tuple_input_output_scripted.jit.pt"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); @@ -224,7 +224,7 @@ TEST(CppAPITests, TestCollectionTupleInputOutput) { TEST(CppAPITests, TestCollectionListInputOutput) { - std::string path = "tests/modules/list_input_output.jit.pt"; + std::string path = "tests/modules/list_input_output_scripted.jit.pt"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); std::vector inputs; inputs.push_back(in0); @@ -295,7 +295,7 @@ TEST(CppAPITests, TestCollectionListInputOutput) { TEST(CppAPITests, TestCollectionComplexModel) { - std::string path = "tests/modules/complex_model.jit.pt"; + std::string path = "tests/modules/list_input_tuple_output_scripted.jit.pt"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); std::vector inputs; inputs.push_back(in0); diff --git a/tests/cpp/test_example_tensors.cpp b/tests/cpp/test_example_tensors.cpp index 6561cd16a0..3ec8831f9d 100644 --- a/tests/cpp/test_example_tensors.cpp +++ b/tests/cpp/test_example_tensors.cpp @@ -9,7 +9,9 @@ TEST_P(CppAPITests, InputsFromTensors) { trt_inputs_ivalues.push_back(in.clone()); } - auto spec = torch_tensorrt::ts::CompileSpec({trt_inputs_ivalues[0].toTensor()}); + + auto inputs = 
std::vector{trt_inputs_ivalues[0].toTensor()}; + auto spec = torch_tensorrt::ts::CompileSpec(inputs); auto trt_mod = torch_tensorrt::ts::compile(mod, spec); torch::jit::IValue trt_results_ivalues = torch_tensorrt::tests::util::RunModuleForward(trt_mod, trt_inputs_ivalues); diff --git a/tests/modules/hub.py b/tests/modules/hub.py index 7d3e03e395..3ad92ff79a 100644 --- a/tests/modules/hub.py +++ b/tests/modules/hub.py @@ -128,10 +128,10 @@ "model": cm.ListInputTupleOutput(), "path": "script" }, - "bert_base_uncased": { - "model": cm.BertModule(), - "path": "trace" - } + #"bert_base_uncased": { + # "model": cm.BertModule(), + # "path": "trace" + #} } From 8385253db173d2898d7dd0b934c798860e1cbd8a Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Tue, 26 Jul 2022 11:45:00 -0700 Subject: [PATCH 04/16] chore: Update generateRandomTensors uses Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/compiler.cpp | 2 +- core/conversion/evaluators/aten.cpp | 15 +++++++++++++++ core/ir/GraphInputs.cpp | 4 ++-- core/partitioning/partitioning.cpp | 6 ------ core/partitioning/shape_analysis.cpp | 6 +++--- .../test_resolve_nontensor_inputs.cpp | 10 +++++----- tests/modules/hub.py | 8 ++++---- 7 files changed, 30 insertions(+), 21 deletions(-) diff --git a/core/compiler.cpp b/core/compiler.cpp index 214443a9c6..e44ece5c27 100644 --- a/core/compiler.cpp +++ b/core/compiler.cpp @@ -322,7 +322,7 @@ void MapInputsAndDetermineDTypes( est_type_opt = first_use_type_map.find(in)->second; } // traverse elements in est_type_out and spec - for (int i = 0; i < est_type_opt.size(); i++) { + for (size_t i = 0; i < est_type_opt.size(); i++) { if (est_type_opt[i] && !spec[i].dtype_is_user_defined) { // If we can calculate the type from the graph and the type was not defined by the user then use the calculated // type diff --git a/core/conversion/evaluators/aten.cpp b/core/conversion/evaluators/aten.cpp index 7bb1f6d202..4632744790 100644 --- a/core/conversion/evaluators/aten.cpp +++ b/core/conversion/evaluators/aten.cpp @@ -264,6 +264,21 @@ auto aten_registrations TORCHTRT_UNUSED = }, EvalOptions().validSchemas( {"aten::size(Tensor self) -> (int[])", "aten::size.int(Tensor self, int dim) -> (int)"})}) + .evaluator({c10::Symbol::fromQualString("aten::__getitem__"), + [](const torch::jit::Node* n, kwargs& args) -> c10::optional { + auto list = args.at(n->input(0)).IValue()->to>(); + auto idx = args.at(n->input(1)).unwrapToInt(); + + const int64_t list_size = list.size(); + const int64_t normalized_idx = normalizeIndex(idx, list_size); + TORCHTRT_CHECK( + normalized_idx >= 0 || normalized_idx < list_size, + "List index out of range (aten::__getitem__)"); + return list.get(normalized_idx); + }, + EvalOptions().validSchemas({ + "aten::__getitem__.t(t[](a) list, int idx) -> (t(*))", + })}) .evaluator({c10::Symbol::fromQualString("aten::append"), [](const torch::jit::Node* n, kwargs& args) -> c10::optional { auto list = args.at(n->input(0)).IValue()->to>(); diff --git a/core/ir/GraphInputs.cpp b/core/ir/GraphInputs.cpp index 792189137a..007a7279e7 100644 --- a/core/ir/GraphInputs.cpp +++ b/core/ir/GraphInputs.cpp @@ -5,7 +5,7 @@ namespace torch_tensorrt { namespace core { namespace ir { -void flatten_dfs(std::vector& flattened_inputs, std::vector>& collection_inputs, +void flatten_dfs(std::vector& flattened_inputs, std::vector>& collection_inputs, torch::jit::IValue input_ivalue, int level, int index) { if (input_ivalue.isTuple()) { auto input_tuple = input_ivalue.toTuple(); @@ -53,7 +53,7 @@ 
GraphInputs::GraphInputs(std::vector inputs_) { LOG_DEBUG("Construct GraphInput with ir::Input"); inputs = inputs_; collection_inputs.resize(inputs_.size()); - for (int i = 0; i < inputs_.size(); i++) { + for (size_t i = 0; i < inputs_.size(); i++) { collection_inputs[i].push_back(inputs_[i]); } } diff --git a/core/partitioning/partitioning.cpp b/core/partitioning/partitioning.cpp index f14d5438c6..dc7ef1f7ac 100644 --- a/core/partitioning/partitioning.cpp +++ b/core/partitioning/partitioning.cpp @@ -400,15 +400,9 @@ PartitionedGraph segment_graph( if (n->kind() == torch::jit::prim::Constant) { continue; } -<<<<<<< HEAD - - if (check_node_fallback(n, global_fallback_nodes)) { - in_prog_trt_blk_nodes.push_back(n); -======= // the outputs of trt subgraph shouldn't be collections if (should_run_in_trt(n, forced_fallback_ops) && !(in_prog_trt_blk_nodes.size() == 0 && is_collection(n))) { in_prog_trt_blk_nodes.insert(in_prog_trt_blk_nodes.begin(), n); ->>>>>>> feat: support for grouped inputs // If there is an active PyTorch block and we have passed the threshold for a valid TRT // block then segment and reset the active PyTorch block diff --git a/core/partitioning/shape_analysis.cpp b/core/partitioning/shape_analysis.cpp index 22c3ea104f..1221318647 100644 --- a/core/partitioning/shape_analysis.cpp +++ b/core/partitioning/shape_analysis.cpp @@ -39,7 +39,7 @@ std::unordered_map generateRandomI std::vector list; c10::TypePtr elementType = c10::TensorType::get(); auto generic_list = c10::impl::GenericList(elementType); - for (int i = 0; i < input.second.size(); i++) { + for (size_t i = 0; i < input.second.size(); i++) { auto in = generateSingleInput(input.second[i], types[input.first][i]); generic_list.push_back(in.clone()); } @@ -47,7 +47,7 @@ std::unordered_map generateRandomI } else if (input.first->type()->kind() == torch::jit::TypeKind::TupleType) { // create tuple std::vector list; - for (int i = 0; i < input.second.size(); i++) { + for (size_t i = 0; i < input.second.size(); i++) { auto in = generateSingleInput(input.second[i], types[input.first][i]); list.push_back(in.clone()); } @@ -56,7 +56,7 @@ std::unordered_map generateRandomI } else { auto in = generateSingleInput(input.second[0], types[input.first][0]); ivalue_map[input.first] = in.clone(); - + } } return ivalue_map; diff --git a/tests/core/partitioning/test_resolve_nontensor_inputs.cpp b/tests/core/partitioning/test_resolve_nontensor_inputs.cpp index 2d0255f130..1f3ee3b051 100644 --- a/tests/core/partitioning/test_resolve_nontensor_inputs.cpp +++ b/tests/core/partitioning/test_resolve_nontensor_inputs.cpp @@ -116,7 +116,7 @@ TEST(Partitioning, ResolveNonTensorInputsCorrectly) { inputs.push_back(torch_tensorrt::core::ir::Input({16, 3, 3, 3})); inputs.push_back(torch_tensorrt::core::ir::Input({16})); - std::unordered_map> inputs_map; + torch_tensorrt::core::ir::CollectionInputSpecMap inputs_map; std::unordered_map>> input_types; for (size_t i = 0; i < g->inputs().size(); ++i) { inputs_map.insert({g->inputs()[i], {inputs[i]}}); @@ -367,11 +367,11 @@ TEST(Partitioning, ResolveOnlyNeccessaryNonTensorInputs) { inputs.push_back(torch_tensorrt::core::ir::Input({4, 4})); inputs.push_back(torch_tensorrt::core::ir::Input({4, 4})); - std::unordered_map inputs_map; - std::unordered_map> input_types; + torch_tensorrt::core::ir::CollectionInputSpecMap inputs_map; + std::unordered_map>> input_types; for (size_t i = 0; i < g->inputs().size(); ++i) { - inputs_map.insert({g->inputs()[i], inputs[i]}); - input_types.insert({g->inputs()[i], 
{at::kFloat}}); + inputs_map.insert({g->inputs()[i], {inputs[i]}}); + input_types.insert({g->inputs()[i], {{at::kFloat}}}); } auto input_ivalues_map = torch_tensorrt::core::partitioning::generateRandomInputs(inputs_map, input_types); std::unordered_map fallback_nodes; diff --git a/tests/modules/hub.py b/tests/modules/hub.py index 3ad92ff79a..7d3e03e395 100644 --- a/tests/modules/hub.py +++ b/tests/modules/hub.py @@ -128,10 +128,10 @@ "model": cm.ListInputTupleOutput(), "path": "script" }, - #"bert_base_uncased": { - # "model": cm.BertModule(), - # "path": "trace" - #} + "bert_base_uncased": { + "model": cm.BertModule(), + "path": "trace" + } } From d479c9854a2976b6620a7c7e1e020bf89f333702 Mon Sep 17 00:00:00 2001 From: Bo Wang Date: Tue, 26 Jul 2022 17:02:02 -0700 Subject: [PATCH 05/16] fix: fix the fallback related issue after merging collection Signed-off-by: Bo Wang --- core/partitioning/partitioning.cpp | 70 ++++++++---------------------- 1 file changed, 19 insertions(+), 51 deletions(-) diff --git a/core/partitioning/partitioning.cpp b/core/partitioning/partitioning.cpp index dc7ef1f7ac..1a7a4777de 100644 --- a/core/partitioning/partitioning.cpp +++ b/core/partitioning/partitioning.cpp @@ -17,22 +17,17 @@ struct usage_info { std::vector tensorrt_use_id; // ids of segmented blocks which are of type TensorRT }; -inline bool isTensorOrTensorList(torch::jit::Value* val) { - return val->type()->isSubtypeOf(torch::jit::TensorType::get()) || - val->type()->isSubtypeOf(torch::jit::ListType::ofTensors()); -} - -inline bool isTensorList(torch::jit::Value* val) { - return val->type()->isSubtypeOf(torch::jit::ListType::ofTensors()); -} - inline bool isTensor(torch::jit::Value* val) { return val->type()->isSubtypeOf(torch::jit::TensorType::get()); } +inline bool isListOrTuple(torch::jit::Value* val) { + return val->type()->kind() == torch::jit::TypeKind::TupleType || val->type()->kind() == torch::jit::TypeKind::ListType; +} + bool containNonTensorOutputs(torch::jit::Node* n) { for (auto output : n->outputs()) { - if (!isTensorOrTensorList(output)) { + if (!isTensor(output)) { return true; } } @@ -68,6 +63,7 @@ std::vector findModifyingNodes( return modifying_nodes; } +// this function is only used when a TRT segment produces nonTensor values which are used by later TRT segment std::vector getDependencyNodes( const std::vector& vals, const SegmentedBlock& seg_block) { @@ -88,7 +84,7 @@ std::vector getDependencyNodes( stk.insert(stk.end(), modifying_nodes.rbegin(), modifying_nodes.rend()); stk.push_back(node); for (auto input : node->inputs()) { - if (!isTensorOrTensorList(input)) { + if (!isTensor(input)) { q.push(input); } } @@ -113,6 +109,8 @@ void find_all_fallback_nodes( auto cur_node = q.front(); q.pop(); // for every node that produces this fallback node's NonTensor input, they should fallback too + // Even collection feature is supported, since TRT List/Tuple output is not supported yet, the nodes + // that produce List/Tuple still cannot be in TRT segment for (auto input : cur_node->inputs()) { if (!isTensor(input) && input->node()->kind() != torch::jit::prim::Constant && global_fallback_nodes.insert({input->node(), FallbackNodeType::kNON_TENSOR}).second) { @@ -120,8 +118,10 @@ void find_all_fallback_nodes( } } // for every node that consumes this fallback node's NonTensor output, they should fallback too + // Since collection feature is supported, we can have List/Tuple input for TRT segment, so we only + // fallback the nodes that take inputs which are not Tensor/List/Tuple for (auto 
output : cur_node->outputs()) { - if (!isTensor(output)) { + if (!isTensor(output) && !isListOrTuple(output)) { for (auto use : output->uses()) { auto node = use.user; if (node->kind() != torch::jit::prim::Constant && global_fallback_nodes.insert({node, FallbackNodeType::kNON_TENSOR}).second) { @@ -176,7 +176,7 @@ void registerSegmentsOutputs(PartitionedGraph& segmented_blocks, torch::jit::Blo if (std::find(seg_block.raw_inputs().begin(), seg_block.raw_inputs().end(), mini_graph_input) == seg_block.raw_inputs().end() && seg_block.contain_raw_value(mini_graph_input)) { - if (!isTensorOrTensorList(mini_graph_input) && seg_block.target() == SegmentedBlock::kTensorRT) + if (!isTensor(mini_graph_input) && seg_block.target() == SegmentedBlock::kTensorRT) continue; seg_block.registerOutput(mini_graph_input); } @@ -242,36 +242,6 @@ bool check_node_fallback(torch::jit::Node* n, const std::unordered_mapoutputs()) { - if(out->type()->kind() == torch::jit::TypeKind::TupleType || out->type()->kind() == torch::jit::TypeKind::ListType) { - return true; - } - } - return false; -} - -bool should_run_in_trt(torch::jit::Node* n, const std::unordered_set& torch_ops) { - // If the op is not supported by the conversion phase it should run in PyTorch - if (!conversion::OpSupported(n)) { - LOG_GRAPH("Node not supported by conversion: " << util::node_info(n)); - return false; - } - - // If the user specifies the op to run in Torch it should run in PyTorch - if (torch_ops.find(n->kind().toQualString()) != torch_ops.end()) { - LOG_GRAPH("Node explicitly set to run in torch: " << util::node_info(n)); - return false; - } - - // If the user specifies the module containing this op to run in torch it should run in PyTorch - const auto to_compile_sym = c10::Symbol::attr("to_compile"); - if (n->hasAttribute(to_compile_sym) && n->i(to_compile_sym) == (int64_t) false) { - LOG_GRAPH("Node is within a module set to run in torch: " << util::node_info(n)); return false; } @@ -390,19 +360,18 @@ PartitionedGraph segment_graph( find_min_block_size_fallback_nodes(block, global_fallback_nodes, min_block_size); auto nodes = block->nodes(); - auto reverse_nodes = nodes.reverse(); // merge from output side to input side PartitionedGraph segmented_blocks; // segment the nodes std::vector in_prog_trt_blk_nodes, in_prog_pyt_blk_nodes; - for (const auto n : reverse_nodes) { + for (const auto n : nodes) { // Skip constant nodes as they are resources for both kinds of modules if (n->kind() == torch::jit::prim::Constant) { continue; } // the outputs of trt subgraph shouldn't be collections - if (should_run_in_trt(n, forced_fallback_ops) && !(in_prog_trt_blk_nodes.size() == 0 && is_collection(n))) { - in_prog_trt_blk_nodes.insert(in_prog_trt_blk_nodes.begin(), n); + if (check_node_fallback(n, global_fallback_nodes)) { + in_prog_trt_blk_nodes.push_back(n); // If there is an active PyTorch block and we have passed the threshold for a valid TRT // block then segment and reset the active PyTorch block @@ -418,7 +387,7 @@ PartitionedGraph segment_graph( LOG_DEBUG( "In progress TRT block does not meet minimum block size requirements, therefore folding into in progress PyTorch block"); in_prog_pyt_blk_nodes.insert( - in_prog_pyt_blk_nodes.begin(), in_prog_trt_blk_nodes.begin(), in_prog_trt_blk_nodes.end()); + in_prog_pyt_blk_nodes.end(), in_prog_trt_blk_nodes.begin(), in_prog_trt_blk_nodes.end()); } in_prog_trt_blk_nodes.clear(); // if there is a prim::If then this if node will be encapsulated in a SegmentedBlock @@ -437,14 +406,14 @@ PartitionedGraph 
segment_graph( finalize_block(segmented_blocks, SegmentedBlock::kTorch, in_prog_pyt_blk_nodes); } if (checkLoopEvaluatable(n)) { - in_prog_trt_blk_nodes.insert(in_prog_trt_blk_nodes.begin(), n); + in_prog_trt_blk_nodes.push_back(n); } else { auto loop_node = std::vector{n}; finalize_block(segmented_blocks, SegmentedBlock::kTorch, loop_node); } continue; } - in_prog_pyt_blk_nodes.insert(in_prog_pyt_blk_nodes.begin(), n); + in_prog_pyt_blk_nodes.push_back(n); } } @@ -459,7 +428,6 @@ PartitionedGraph segment_graph( in_prog_pyt_blk_nodes.end(), in_prog_trt_blk_nodes.begin(), in_prog_trt_blk_nodes.end()); finalize_block(segmented_blocks, SegmentedBlock::kTorch, in_prog_pyt_blk_nodes); } - std::reverse(segmented_blocks.begin(), segmented_blocks.end()); return segmented_blocks; } From b7178ffd055256de210d3d7ab08c23ed15dc90bf Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Wed, 27 Jul 2022 12:04:30 -0700 Subject: [PATCH 06/16] feat: Better input signature logging Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- py/torch_tensorrt/csrc/tensorrt_classes.cpp | 112 ++++++++++++++------ tests/modules/custom_models.py | 1 + 2 files changed, 79 insertions(+), 34 deletions(-) diff --git a/py/torch_tensorrt/csrc/tensorrt_classes.cpp b/py/torch_tensorrt/csrc/tensorrt_classes.cpp index 9eb58b3e73..ca11cf4bc1 100644 --- a/py/torch_tensorrt/csrc/tensorrt_classes.cpp +++ b/py/torch_tensorrt/csrc/tensorrt_classes.cpp @@ -104,10 +104,54 @@ std::string Input::to_str() { return ss.str(); } +std::string sig_to_str(torch::jit::IValue input_sig) { + if (input_sig.isTuple()) { + auto input_tuple = input_sig.toTuple(); + std::vector children; + for (auto item: input_tuple->elements()) { + auto child = sig_to_str(item); + children.push_back(child); + } + std::stringstream ss; + ss << "("; + for (auto i : children) { + ss << i << ", "; + } + ss << ")"; + return ss.str(); + } else if(input_sig.isList()) { + auto input_list = input_sig.toList().vec(); + std::vector children; + for (auto item: input_list) { + auto child = sig_to_str(item); + children.push_back(child); + } + std::stringstream ss; + ss << "["; + for (auto i : children) { + ss << i << ", "; + } + ss << "]"; + return ss.str(); + } else if(input_sig.isCustomClass()) { + auto cur_input = input_sig.toCustomClass(); + return cur_input->to_str(); + } else if(input_sig.isPyObject()) { + auto py_object_holder = input_sig.toPyObjectHolder(); + auto infer_type = py_object_holder->tryToInferType(); + auto type = infer_type.type(); + torch::jit::IValue ival = py_object_holder->toIValue(type); + torch::jit::IValue converted_item; + return sig_to_str(ival); + } else { + LOG_ERROR("Unknown input spec type"); + return ""; + } +} + std::string InputSignature::to_str() { std::stringstream ss; - ss << signature_ivalue; - return ss.str(); + return sig_to_str(signature_ivalue); } std::string to_str(DeviceType value) { @@ -191,40 +235,40 @@ std::string TorchFallback::to_str() { } void to_internal_input_signature(torch::jit::IValue input_ivalue, torch::jit::IValue& converted_ivalue) { - if (input_ivalue.isTuple()) { - auto input_tuple = input_ivalue.toTuple(); - std::vector converted_elements; - for (auto item: input_tuple->elements()) { - torch::jit::IValue converted_item; - to_internal_input_signature(item, converted_item); - converted_elements.push_back(converted_item); - auto tuple_ptr = c10::ivalue::Tuple::create(converted_elements); - converted_ivalue = torch::jit::IValue(tuple_ptr); - } - } else if(input_ivalue.isList()) { - auto input_list = 
input_ivalue.toList().vec(); - c10::TypePtr type = input_list[0].type(); - auto converted_elements = c10::impl::GenericList(type); - for (auto item: input_list) { - torch::jit::IValue converted_item; - to_internal_input_signature(item, converted_item); - converted_elements.push_back(converted_item); - } - converted_ivalue = torch::jit::IValue(converted_elements); - } else if(input_ivalue.isCustomClass()) { - core::ir::Input cur_input = (*(input_ivalue.toCustomClass())).toInternalInput(); - converted_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(cur_input))); - } else if(input_ivalue.isPyObject()) { - auto py_object_holder = input_ivalue.toPyObjectHolder(); - auto infer_type = py_object_holder->tryToInferType(); - auto type = infer_type.type(); - torch::jit::IValue ival = py_object_holder->toIValue(type); + if (input_ivalue.isTuple()) { + auto input_tuple = input_ivalue.toTuple(); + std::vector converted_elements; + for (auto item: input_tuple->elements()) { torch::jit::IValue converted_item; - to_internal_input_signature(ival, converted_item); - converted_ivalue = torch::jit::IValue(converted_item); - } else { - LOG_ERROR("Unknown input spec type"); + to_internal_input_signature(item, converted_item); + converted_elements.push_back(converted_item); + auto tuple_ptr = c10::ivalue::Tuple::create(converted_elements); + converted_ivalue = torch::jit::IValue(tuple_ptr); } + } else if(input_ivalue.isList()) { + auto input_list = input_ivalue.toList().vec(); + c10::TypePtr type = input_list[0].type(); + auto converted_elements = c10::impl::GenericList(type); + for (auto item: input_list) { + torch::jit::IValue converted_item; + to_internal_input_signature(item, converted_item); + converted_elements.push_back(converted_item); + } + converted_ivalue = torch::jit::IValue(converted_elements); + } else if(input_ivalue.isCustomClass()) { + core::ir::Input cur_input = (*(input_ivalue.toCustomClass())).toInternalInput(); + converted_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(cur_input))); + } else if(input_ivalue.isPyObject()) { + auto py_object_holder = input_ivalue.toPyObjectHolder(); + auto infer_type = py_object_holder->tryToInferType(); + auto type = infer_type.type(); + torch::jit::IValue ival = py_object_holder->toIValue(type); + torch::jit::IValue converted_item; + to_internal_input_signature(ival, converted_item); + converted_ivalue = torch::jit::IValue(converted_item); + } else { + LOG_ERROR("Unknown input spec type"); + } } core::CompileSpec init_compile_spec(CompileSpec external) { diff --git a/tests/modules/custom_models.py b/tests/modules/custom_models.py index 443dcdd11f..a92e01e7a4 100644 --- a/tests/modules/custom_models.py +++ b/tests/modules/custom_models.py @@ -133,6 +133,7 @@ def __init__(self): def forward(self, z: Tuple[torch.Tensor, torch.Tensor]): r1 = z[0] + z[1] r2 = z[0] - z[1] + r1 = r1 * 10 r = (r1, r2) return r From 418d1e5646a5e8749c2b9b9849aa1ba94b9835ce Mon Sep 17 00:00:00 2001 From: Bo Wang Date: Wed, 27 Jul 2022 18:37:38 -0700 Subject: [PATCH 07/16] refactor: still fallback when a trt segment has tuple/list input/output Signed-off-by: Bo Wang --- core/partitioning/partitioning.cpp | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/core/partitioning/partitioning.cpp b/core/partitioning/partitioning.cpp index 1a7a4777de..85626772f0 100644 --- a/core/partitioning/partitioning.cpp +++ b/core/partitioning/partitioning.cpp @@ -21,10 +21,6 @@ inline bool isTensor(torch::jit::Value* val) { return 
val->type()->isSubtypeOf(torch::jit::TensorType::get()); } -inline bool isListOrTuple(torch::jit::Value* val) { - return val->type()->kind() == torch::jit::TypeKind::TupleType || val->type()->kind() == torch::jit::TypeKind::ListType; -} - bool containNonTensorOutputs(torch::jit::Node* n) { for (auto output : n->outputs()) { if (!isTensor(output)) { @@ -109,8 +105,6 @@ void find_all_fallback_nodes( auto cur_node = q.front(); q.pop(); // for every node that produces this fallback node's NonTensor input, they should fallback too - // Even collection feature is supported, since TRT List/Tuple output is not supported yet, the nodes - // that produce List/Tuple still cannot be in TRT segment for (auto input : cur_node->inputs()) { if (!isTensor(input) && input->node()->kind() != torch::jit::prim::Constant && global_fallback_nodes.insert({input->node(), FallbackNodeType::kNON_TENSOR}).second) { @@ -118,13 +112,12 @@ void find_all_fallback_nodes( } } // for every node that consumes this fallback node's NonTensor output, they should fallback too - // Since collection feature is supported, we can have List/Tuple input for TRT segment, so we only - // fallback the nodes that take inputs which are not Tensor/List/Tuple for (auto output : cur_node->outputs()) { - if (!isTensor(output) && !isListOrTuple(output)) { + if (!isTensor(output)) { for (auto use : output->uses()) { auto node = use.user; - if (node->kind() != torch::jit::prim::Constant && global_fallback_nodes.insert({node, FallbackNodeType::kNON_TENSOR}).second) { + if (node->kind() != torch::jit::prim::Constant && + global_fallback_nodes.insert({node, FallbackNodeType::kNON_TENSOR}).second) { q.push(node); } } From c9d4788a8fb046a93dec6e6732d6d6876a83276a Mon Sep 17 00:00:00 2001 From: Bo Wang Date: Wed, 27 Jul 2022 18:37:38 -0700 Subject: [PATCH 08/16] refactor: still fallback when a trt segment has tuple/list input/output Signed-off-by: Bo Wang --- core/partitioning/partitioning.cpp | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/core/partitioning/partitioning.cpp b/core/partitioning/partitioning.cpp index 1a7a4777de..85626772f0 100644 --- a/core/partitioning/partitioning.cpp +++ b/core/partitioning/partitioning.cpp @@ -21,10 +21,6 @@ inline bool isTensor(torch::jit::Value* val) { return val->type()->isSubtypeOf(torch::jit::TensorType::get()); } -inline bool isListOrTuple(torch::jit::Value* val) { - return val->type()->kind() == torch::jit::TypeKind::TupleType || val->type()->kind() == torch::jit::TypeKind::ListType; -} - bool containNonTensorOutputs(torch::jit::Node* n) { for (auto output : n->outputs()) { if (!isTensor(output)) { @@ -109,8 +105,6 @@ void find_all_fallback_nodes( auto cur_node = q.front(); q.pop(); // for every node that produces this fallback node's NonTensor input, they should fallback too - // Even collection feature is supported, since TRT List/Tuple output is not supported yet, the nodes - // that produce List/Tuple still cannot be in TRT segment for (auto input : cur_node->inputs()) { if (!isTensor(input) && input->node()->kind() != torch::jit::prim::Constant && global_fallback_nodes.insert({input->node(), FallbackNodeType::kNON_TENSOR}).second) { @@ -118,13 +112,12 @@ void find_all_fallback_nodes( } } // for every node that consumes this fallback node's NonTensor output, they should fallback too - // Since collection feature is supported, we can have List/Tuple input for TRT segment, so we only - // fallback the nodes that take inputs which are not Tensor/List/Tuple for (auto output : 
cur_node->outputs()) { - if (!isTensor(output) && !isListOrTuple(output)) { + if (!isTensor(output)) { for (auto use : output->uses()) { auto node = use.user; - if (node->kind() != torch::jit::prim::Constant && global_fallback_nodes.insert({node, FallbackNodeType::kNON_TENSOR}).second) { + if (node->kind() != torch::jit::prim::Constant && + global_fallback_nodes.insert({node, FallbackNodeType::kNON_TENSOR}).second) { q.push(node); } } From 5cff25728e3a2583e2390209a967a930118e3f45 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Thu, 28 Jul 2022 15:42:01 -0700 Subject: [PATCH 09/16] chore: Apply liniting Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/compiler.cpp | 121 ++++---- core/conversion/conversion.cpp | 15 +- .../conversionctx/ConversionCtx.cpp | 8 +- core/conversion/converters/converter_util.cpp | 21 +- core/conversion/converters/converter_util.h | 2 +- core/conversion/converters/impl/select.cpp | 267 +++++++++--------- core/ir/GraphInputs.cpp | 114 ++++---- core/ir/StaticParams.cpp | 3 +- core/ir/ir.cpp | 24 +- core/ir/ir.h | 10 +- core/partitioning/shape_analysis.cpp | 33 +-- core/partitioning/shape_analysis.h | 1 - cpp/bin/torchtrtc/main.cpp | 3 +- cpp/include/torch_tensorrt/torch_tensorrt.h | 4 +- cpp/src/compile_spec.cpp | 52 ++-- cpp/src/torch_tensorrt.cpp | 3 +- .../csrc/register_tensorrt_classes.cpp | 6 +- py/torch_tensorrt/csrc/tensorrt_classes.cpp | 38 +-- py/torch_tensorrt/csrc/tensorrt_classes.h | 2 +- py/torch_tensorrt/csrc/torch_tensorrt_py.cpp | 5 +- .../core/conversion/converters/test_cast.cpp | 2 - tests/cpp/test_collections.cpp | 44 +-- tests/cpp/test_example_tensors.cpp | 1 - tools/linter/utils.py | 4 +- 24 files changed, 389 insertions(+), 394 deletions(-) diff --git a/core/compiler.cpp b/core/compiler.cpp index e44ece5c27..caee900879 100644 --- a/core/compiler.cpp +++ b/core/compiler.cpp @@ -308,70 +308,78 @@ void MapInputsAndDetermineDTypes( std::shared_ptr& g, ir::StaticParams& static_params, ir::CollectionTypeMap& first_use_type_map) { - cfg.convert_info.collection_input_spec_map = std::move(ir::associate_specs_with_collection_inputs(g, cfg.graph_inputs, static_params)); + cfg.convert_info.collection_input_spec_map = + std::move(ir::associate_specs_with_collection_inputs(g, cfg.graph_inputs, static_params)); - auto collection_inputs = ir::get_collection_inputs(g, static_params); - LOG_DEBUG("In MapInputsAndDetermineDTypes, the g->inputs() size is " << g->inputs().size() << ", CollectionInputSpecMap size is" << collection_inputs.size()); + auto collection_inputs = ir::get_collection_inputs(g, static_params); + LOG_DEBUG( + "In MapInputsAndDetermineDTypes, the g->inputs() size is " + << g->inputs().size() << ", CollectionInputSpecMap size is" << collection_inputs.size()); - for (auto in : collection_inputs) { - std::vector& spec = cfg.convert_info.collection_input_spec_map.find(in)->second; - std::vector> est_type_opt; + for (auto in : collection_inputs) { + std::vector& spec = cfg.convert_info.collection_input_spec_map.find(in)->second; + std::vector> est_type_opt; - auto est_it = first_use_type_map.find(in); - if (est_it != first_use_type_map.end()) { - est_type_opt = first_use_type_map.find(in)->second; - } - // traverse elements in est_type_out and spec - for (size_t i = 0; i < est_type_opt.size(); i++) { - if (est_type_opt[i] && !spec[i].dtype_is_user_defined) { - // If we can calculate the type from the graph and the type was not defined by the user then use the calculated - // type - LOG_INFO( - "Since input type is not explicitly 
defined, infering using first tensor calculation\n Inferred input " - << in->debugName() << " has type " << est_type_opt[i].value()); - spec[i].dtype = util::ScalarTypeToTRTDataType(est_type_opt[i].value()); - } else if (!est_type_opt[i] && !spec[i].dtype_is_user_defined) { - // If we cannot calculate the type and the user did not define the type, then default to FP32 - LOG_WARNING( - "Cannot infer input type from calcuations in graph for input " - << in->debugName() << ". Assuming it is Float32. If not, specify input type explicity"); - spec[i].dtype = nvinfer1::DataType::kFLOAT; - } else if (spec[i].dtype_is_user_defined && cfg.partition_info.enabled) { - if (!est_type_opt[i]) { - LOG_INFO("Cannot infer input tensor dtype in graph, compiler is going to use the user setting"); + auto est_it = first_use_type_map.find(in); + if (est_it != first_use_type_map.end()) { + est_type_opt = first_use_type_map.find(in)->second; + } + // traverse elements in est_type_out and spec + for (size_t i = 0; i < est_type_opt.size(); i++) { + if (est_type_opt[i] && !spec[i].dtype_is_user_defined) { + // If we can calculate the type from the graph and the type was not defined by the user then use the calculated + // type + LOG_INFO( + "Since input type is not explicitly defined, infering using first tensor calculation\n Inferred input " + << in->debugName() << " has type " << est_type_opt[i].value()); + spec[i].dtype = util::ScalarTypeToTRTDataType(est_type_opt[i].value()); + } else if (!est_type_opt[i] && !spec[i].dtype_is_user_defined) { + // If we cannot calculate the type and the user did not define the type, then default to FP32 + LOG_WARNING( + "Cannot infer input type from calcuations in graph for input " + << in->debugName() << ". Assuming it is Float32. If not, specify input type explicity"); + spec[i].dtype = nvinfer1::DataType::kFLOAT; + } else if (spec[i].dtype_is_user_defined && cfg.partition_info.enabled) { + if (!est_type_opt[i]) { + LOG_INFO("Cannot infer input tensor dtype in graph, compiler is going to use the user setting"); + std::stringstream ss; + ss << "For input " << in->debugName() << ", found user specified input dtype as "; + ss << cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype; + ss << ". The compiler is going to use the user setting " + << cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype; + auto warn_str = ss.str(); + LOG_WARNING(warn_str); + // Overwrite type map with user settings + first_use_type_map[in][i] = { + util::TRTDataTypeToScalarType(cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype)}; + + } else { + if (util::TRTDataTypeToScalarType(cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype) != + est_type_opt[i].value()) { std::stringstream ss; ss << "For input " << in->debugName() << ", found user specified input dtype as "; ss << cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype; - ss << ". 
The compiler is going to use the user setting " << cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype; + ss << ", however when inspecting the graph, the input type expected was inferred to be "; + ss << est_type_opt[i].value() << std::endl; + ss << "The compiler is going to use the user setting " + << cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype; + ss << "\nThis conflict may cause an error at runtime due to partial compilation being enabled and therefore\n"; + ss << "compatibility with PyTorch's data type convention is required.\n"; + ss << "If you do indeed see errors at runtime either:\n"; + ss << "- Remove the dtype spec for " << in->debugName() << std::endl; + ss << "- Disable partial compilation by setting require_full_compilation to True"; auto warn_str = ss.str(); LOG_WARNING(warn_str); // Overwrite type map with user settings - first_use_type_map[in][i] = {util::TRTDataTypeToScalarType(cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype)}; - - } else { - if (util::TRTDataTypeToScalarType(cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype) != est_type_opt[i].value()) { - std::stringstream ss; - ss << "For input " << in->debugName() << ", found user specified input dtype as "; - ss << cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype; - ss << ", however when inspecting the graph, the input type expected was inferred to be "; - ss << est_type_opt[i].value() << std::endl; - ss << "The compiler is going to use the user setting " << cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype; - ss << "\nThis conflict may cause an error at runtime due to partial compilation being enabled and therefore\n"; - ss << "compatibility with PyTorch's data type convention is required.\n"; - ss << "If you do indeed see errors at runtime either:\n"; - ss << "- Remove the dtype spec for " << in->debugName() << std::endl; - ss << "- Disable partial compilation by setting require_full_compilation to True"; - auto warn_str = ss.str(); - LOG_WARNING(warn_str); - // Overwrite type map with user settings - first_use_type_map[in][i] = {util::TRTDataTypeToScalarType(cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype)}; - } + first_use_type_map[in][i] = { + util::TRTDataTypeToScalarType(cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype)}; } - } else { - // The user defined the type so no changes are necessary } + } else { + // The user defined the type so no changes are necessary } } + } // } } @@ -425,12 +433,13 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) if (cfg.partition_info.enabled && (!(cfg.lower_info.forced_fallback_modules.size() == 0 && - cfg.partition_info.forced_fallback_operators.size() == 0 && isBlockConvertible) - || outputIsCollection)) { - + cfg.partition_info.forced_fallback_operators.size() == 0 && isBlockConvertible) || + outputIsCollection)) { std::unordered_map fallback_nodes; - auto collection_input_ivalues_map = partitioning::generateRandomInputs(cfg.convert_info.collection_input_spec_map, first_use_types); - auto graph_and_mapping = ConstructFallbackGraph(new_mod, g->block(), collection_input_ivalues_map, cfg, static_params, fallback_nodes); + auto collection_input_ivalues_map = + partitioning::generateRandomInputs(cfg.convert_info.collection_input_spec_map, first_use_types); + auto graph_and_mapping = ConstructFallbackGraph( + new_mod, g->block(), collection_input_ivalues_map, cfg, static_params, 
fallback_nodes); new_g = graph_and_mapping.first; // renaming the input name of graph after fallback to ensure pytorch deserialize it correctly for (size_t i = 0; i < new_g->inputs().size(); ++i) { diff --git a/core/conversion/conversion.cpp b/core/conversion/conversion.cpp index 914f1ddb9d..5f4b20e1b3 100644 --- a/core/conversion/conversion.cpp +++ b/core/conversion/conversion.cpp @@ -135,12 +135,10 @@ void AddLayer(ConversionCtx* ctx, const torch::jit::Node* n) { << "please report this error to https://www.github.com/NVIDIA/Torch-TensorRT/issues"); } -void AddInputs( - ConversionCtx* ctx, - c10::ArrayRef inputs, - ConversionInfo& conversion_info) { +void AddInputs(ConversionCtx* ctx, c10::ArrayRef inputs, ConversionInfo& conversion_info) { std::unordered_map& input_specs = conversion_info.inputs; - std::unordered_map> collection_input_spec = conversion_info.collection_input_spec_map; + std::unordered_map> collection_input_spec = + conversion_info.collection_input_spec_map; std::vector input_tensors; for (auto in : inputs) { @@ -173,7 +171,7 @@ void AddInputs( "Cannot find an input spec associated with input: " << in->debugName()); ir::Input spec; if (input_specs.find(in) != input_specs.end()) { - spec = input_specs.find(in)->second; + spec = input_specs.find(in)->second; } else { spec = collection_input_spec.find(in)->second[0]; // assume input is tensor } @@ -559,8 +557,9 @@ std::set ConvertableOpsInBlock(const torch::jit::Block* b) { } bool OutputIsCollection(const torch::jit::Block* b) { - for (auto out: b->outputs()) { - if(out->type()->kind() == torch::jit::TypeKind::TupleType || out->type()->kind() == torch::jit::TypeKind::ListType) { + for (auto out : b->outputs()) { + if (out->type()->kind() == torch::jit::TypeKind::TupleType || + out->type()->kind() == torch::jit::TypeKind::ListType) { return true; } } diff --git a/core/conversion/conversionctx/ConversionCtx.cpp b/core/conversion/conversionctx/ConversionCtx.cpp index a24a15904c..71159eb2b5 100644 --- a/core/conversion/conversionctx/ConversionCtx.cpp +++ b/core/conversion/conversionctx/ConversionCtx.cpp @@ -107,7 +107,7 @@ ConversionCtx::ConversionCtx(BuilderSettings build_settings) } cfg->setAvgTimingIterations(settings.num_avg_timing_iters); - if (settings.workspace_size != 0){ + if (settings.workspace_size != 0) { cfg->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, settings.workspace_size); } @@ -124,13 +124,13 @@ ConversionCtx::ConversionCtx(BuilderSettings build_settings) settings.enabled_precisions.find(nvinfer1::DataType::kFLOAT) == settings.enabled_precisions.end(), "DLA supports only fp16 or int8 precision"); cfg->setDLACore(settings.device.dla_core); - if (settings.dla_sram_size != 1048576){ + if (settings.dla_sram_size != 1048576) { cfg->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_MANAGED_SRAM, settings.dla_sram_size); } - if (settings.dla_local_dram_size != 1073741824){ + if (settings.dla_local_dram_size != 1073741824) { cfg->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_LOCAL_DRAM, settings.dla_local_dram_size); } - if (settings.dla_global_dram_size != 536870912){ + if (settings.dla_global_dram_size != 536870912) { cfg->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_GLOBAL_DRAM, settings.dla_global_dram_size); } } diff --git a/core/conversion/converters/converter_util.cpp b/core/conversion/converters/converter_util.cpp index a6a2bbd555..745261589e 100644 --- a/core/conversion/converters/converter_util.cpp +++ b/core/conversion/converters/converter_util.cpp @@ -207,13 +207,13 @@ 
nvinfer1::ITensor* clamp( nvinfer1::ITensor* lower_bound, nvinfer1::ITensor* upper_bound, std::string const& name) { - auto max_layer = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kMAX, x, lower_bound, "max layer for " + name); TORCHTRT_CHECK(max_layer, "Unable to create max layer for clamp"); LOG_DEBUG(ctx->logger, "Create " << max_layer->getName() << " for clamp"); auto max_itensor = max_layer->getOutput(0); - auto min_layer = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kMIN, max_itensor, upper_bound, "min layer for " + name); + auto min_layer = + add_elementwise(ctx, nvinfer1::ElementWiseOperation::kMIN, max_itensor, upper_bound, "min layer for " + name); TORCHTRT_CHECK(min_layer, "Unable to create min layer for clamp"); LOG_DEBUG(ctx->logger, "Create " << min_layer->getName() << " for clamp"); auto min_itensor = min_layer->getOutput(0); @@ -227,13 +227,13 @@ nvinfer1::ITensor* clamp_to_input_dim( nvinfer1::ITensor* input_dim, int nbdims, std::string const& name) { - auto zero = torch::zeros({nbdims}).to(torch::kI32); auto zero_itensor = tensor_to_const(ctx, zero); auto one = torch::ones({nbdims}).to(torch::kI32); auto one_itensor = tensor_to_const(ctx, one); - auto upper_bound_layer = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kSUB, input_dim, one_itensor, "sub layer for " + name); + auto upper_bound_layer = + add_elementwise(ctx, nvinfer1::ElementWiseOperation::kSUB, input_dim, one_itensor, "sub layer for " + name); TORCHTRT_CHECK(upper_bound_layer, "Unable to create sub layer for clamp to inputDim"); LOG_DEBUG(ctx->logger, "Create " << upper_bound_layer->getName() << " for clamp to inputDim"); auto upper_bound = upper_bound_layer->getOutput(0); @@ -243,7 +243,8 @@ nvinfer1::ITensor* clamp_to_input_dim( LOG_DEBUG(ctx->logger, "Create " << max_layer->getName() << " for clamp to inputDim"); auto max_itensor = max_layer->getOutput(0); - auto min_layer = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kMIN, max_itensor, upper_bound, "min layer for " + name); + auto min_layer = + add_elementwise(ctx, nvinfer1::ElementWiseOperation::kMIN, max_itensor, upper_bound, "min layer for " + name); TORCHTRT_CHECK(min_layer, "Unable to create min_layer for clamp to inputDim"); LOG_DEBUG(ctx->logger, "Create " << min_layer->getName() << " for clamp to inputDim"); auto min_itensor = min_layer->getOutput(0); @@ -257,7 +258,6 @@ nvinfer1::ITensor* normalize_indices( nvinfer1::ITensor* indices, int nbdims, std::string const& name) { - auto zero = torch::zeros({nbdims}).to(torch::kI32); auto neg = -torch::ones({nbdims}).to(torch::kI32); auto zero_itensor = tensor_to_const(ctx, zero); @@ -307,17 +307,20 @@ nvinfer1::ITensor* get_slice_size( at::Tensor one_tensor = torch::ones({nbdims}).to(torch::kI32); auto one_itensor = tensor_to_const(ctx, one_tensor); - auto sub_layer = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kSUB, end, start, "get_slice_size sub layer for " + name); + auto sub_layer = + add_elementwise(ctx, nvinfer1::ElementWiseOperation::kSUB, end, start, "get_slice_size sub layer for " + name); TORCHTRT_CHECK(sub_layer, "Unable to create sub layer in calculate_output_size"); LOG_DEBUG(ctx->logger, "Create " << sub_layer->getName() << " for calculate_output_size"); auto sub_itensor = sub_layer->getOutput(0); - auto div_layer = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kDIV, sub_itensor, stride, "get_slice_size div layer for " + name); + auto div_layer = add_elementwise( + ctx, nvinfer1::ElementWiseOperation::kDIV, sub_itensor, stride, 
"get_slice_size div layer for " + name); TORCHTRT_CHECK(div_layer, "Unable to create div layer in calculate_output_size"); LOG_DEBUG(ctx->logger, "Create " << div_layer->getName() << " for calculate_output_size"); auto div_itensor = div_layer->getOutput(0); - auto add_layer = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kSUM, div_itensor, one_itensor, "get_slice_size sum layer for " + name); + auto add_layer = add_elementwise( + ctx, nvinfer1::ElementWiseOperation::kSUM, div_itensor, one_itensor, "get_slice_size sum layer for " + name); TORCHTRT_CHECK(add_layer, "Unable to create add layer in calculate_output_size"); LOG_DEBUG(ctx->logger, "Create " << add_layer->getName() << " for calculate_output_size"); auto size_itensor = add_layer->getOutput(0); diff --git a/core/conversion/converters/converter_util.h b/core/conversion/converters/converter_util.h index cdf2ee5a8d..b155499858 100644 --- a/core/conversion/converters/converter_util.h +++ b/core/conversion/converters/converter_util.h @@ -1,8 +1,8 @@ #pragma once +#include #include #include -#include #include "core/conversion/conversionctx/ConversionCtx.h" #include "core/conversion/converters/Weights.h" diff --git a/core/conversion/converters/impl/select.cpp b/core/conversion/converters/impl/select.cpp index 3599ab9939..d33f09ae8a 100644 --- a/core/conversion/converters/impl/select.cpp +++ b/core/conversion/converters/impl/select.cpp @@ -103,121 +103,118 @@ nvinfer1::ITensor* roll( auto select_registrations TORCHTRT_UNUSED = RegisterNodeConversionPatterns() - .pattern( - {"aten::select.int(Tensor(a) self, int dim, int index) -> (Tensor(a))", - [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool { - auto in = args[0].ITensorOrFreeze(ctx); - auto maxDim = static_cast(in->getDimensions().nbDims); - auto dim = args[1].unwrapToInt(); - // Handle negative axis by refering to nbDims of input Tensor - dim = dim < 0 ? dim + maxDim : dim; - auto ind = (int32_t)args[2].unwrapToInt(); - // Along the specified dimension, handle negative index by subtracting along length of dimension. - ind = ind < 0 ? ind + in->getDimensions().d[dim] : ind; - LOG_DEBUG("Gather input dimensions: " << in->getDimensions()); - LOG_DEBUG("Dimension to select: " << dim); - LOG_DEBUG("Index: " << ind); - - // index to access needs to be an at::Tensor - at::Tensor indices = torch::tensor({ind}).to(torch::kI32); - auto const_out = tensor_to_const(ctx, indices); - - // IGatherLayer takes in input tensor, the indices, and the axis - // of input tensor to take indices from - auto gather_layer = ctx->net->addGather(*in, *const_out, dim); - TORCHTRT_CHECK(gather_layer, "Unable to create gather layer from node: " << *n); - auto out = gather_layer->getOutput(0); + .pattern({"aten::select.int(Tensor(a) self, int dim, int index) -> (Tensor(a))", + [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool { + auto in = args[0].ITensorOrFreeze(ctx); + auto maxDim = static_cast(in->getDimensions().nbDims); + auto dim = args[1].unwrapToInt(); + // Handle negative axis by refering to nbDims of input Tensor + dim = dim < 0 ? dim + maxDim : dim; + auto ind = (int32_t)args[2].unwrapToInt(); + // Along the specified dimension, handle negative index by subtracting along length of dimension. + ind = ind < 0 ? 
ind + in->getDimensions().d[dim] : ind; + LOG_DEBUG("Gather input dimensions: " << in->getDimensions()); + LOG_DEBUG("Dimension to select: " << dim); + LOG_DEBUG("Index: " << ind); + + // index to access needs to be an at::Tensor + at::Tensor indices = torch::tensor({ind}).to(torch::kI32); + auto const_out = tensor_to_const(ctx, indices); + + // IGatherLayer takes in input tensor, the indices, and the axis + // of input tensor to take indices from + auto gather_layer = ctx->net->addGather(*in, *const_out, dim); + TORCHTRT_CHECK(gather_layer, "Unable to create gather layer from node: " << *n); + auto out = gather_layer->getOutput(0); + + LOG_DEBUG("Gather tensor shape: " << out->getDimensions()); + + if (out->getDimensions().nbDims != 1) { + // IShuffleLayer removes redundant dimensions + auto shuffle_layer = ctx->net->addShuffle(*out); + TORCHTRT_CHECK(shuffle_layer, "Unable to create shuffle layer from node: " << *n); + shuffle_layer->setReshapeDimensions(util::squeezeDims(out->getDimensions(), dim)); + shuffle_layer->setName(util::node_info(n).c_str()); + out = shuffle_layer->getOutput(0); + } + + out = ctx->AssociateValueAndTensor(n->outputs()[0], out); + + LOG_DEBUG("Output tensor shape: " << out->getDimensions()); - LOG_DEBUG("Gather tensor shape: " << out->getDimensions()); + return true; + }}) + .pattern({"aten::narrow(Tensor(a) self, int dim, int start, int length) -> Tensor(a)", + [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool { + auto in = args[0].ITensor(); + auto axis = args[1].unwrapToInt(); + auto start = (int32_t)args[2].unwrapToInt(); + auto length = (int32_t)args[3].unwrapToInt(); - if (out->getDimensions().nbDims != 1) { - // IShuffleLayer removes redundant dimensions - auto shuffle_layer = ctx->net->addShuffle(*out); - TORCHTRT_CHECK(shuffle_layer, "Unable to create shuffle layer from node: " << *n); - shuffle_layer->setReshapeDimensions(util::squeezeDims(out->getDimensions(), dim)); - shuffle_layer->setName(util::node_info(n).c_str()); - out = shuffle_layer->getOutput(0); - } + // index to access needs to be an at::Tensor + at::Tensor indices = torch::arange(start, start + length, 1).to(torch::kI32); + auto weights = Weights(ctx, indices); - out = ctx->AssociateValueAndTensor(n->outputs()[0], out); + // IConstantLayer to convert indices from Weights to ITensor + auto const_layer = ctx->net->addConstant(weights.shape, weights.data); + TORCHTRT_CHECK(const_layer, "Unable to create constant layer from node: " << *n); + auto const_out = const_layer->getOutput(0); - LOG_DEBUG("Output tensor shape: " << out->getDimensions()); + // IGatherLayer takes in input tensor, the indices, and the axis + // of input tensor to take indices from + auto gather_layer = ctx->net->addGather(*in, *const_out, axis); + TORCHTRT_CHECK(gather_layer, "Unable to create gather layer from node: " << *n); + auto gather_out = gather_layer->getOutput(0); - return true; - }}) - .pattern( - {"aten::narrow(Tensor(a) self, int dim, int start, int length) -> Tensor(a)", - [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool { - auto in = args[0].ITensor(); - auto axis = args[1].unwrapToInt(); - auto start = (int32_t)args[2].unwrapToInt(); - auto length = (int32_t)args[3].unwrapToInt(); - - // index to access needs to be an at::Tensor - at::Tensor indices = torch::arange(start, start + length, 1).to(torch::kI32); - auto weights = Weights(ctx, indices); - - // IConstantLayer to convert indices from Weights to ITensor - auto const_layer = 
ctx->net->addConstant(weights.shape, weights.data); - TORCHTRT_CHECK(const_layer, "Unable to create constant layer from node: " << *n); - auto const_out = const_layer->getOutput(0); - - // IGatherLayer takes in input tensor, the indices, and the axis - // of input tensor to take indices from - auto gather_layer = ctx->net->addGather(*in, *const_out, axis); - TORCHTRT_CHECK(gather_layer, "Unable to create gather layer from node: " << *n); - auto gather_out = gather_layer->getOutput(0); - - // IShuffleLayer removes redundant dimensions - auto shuffle_layer = ctx->net->addShuffle(*gather_out); - TORCHTRT_CHECK(shuffle_layer, "Unable to create shuffle layer from node: " << *n); - shuffle_layer->setReshapeDimensions(util::unpadDims(gather_out->getDimensions())); - shuffle_layer->setName(util::node_info(n).c_str()); - auto shuffle_out = shuffle_layer->getOutput(0); + // IShuffleLayer removes redundant dimensions + auto shuffle_layer = ctx->net->addShuffle(*gather_out); + TORCHTRT_CHECK(shuffle_layer, "Unable to create shuffle layer from node: " << *n); + shuffle_layer->setReshapeDimensions(util::unpadDims(gather_out->getDimensions())); + shuffle_layer->setName(util::node_info(n).c_str()); + auto shuffle_out = shuffle_layer->getOutput(0); - auto out = ctx->AssociateValueAndTensor(n->outputs()[0], shuffle_out); + auto out = ctx->AssociateValueAndTensor(n->outputs()[0], shuffle_out); - LOG_DEBUG("Output tensor shape: " << out->getDimensions()); + LOG_DEBUG("Output tensor shape: " << out->getDimensions()); - return true; - }}) - .pattern( - {"aten::narrow.Tensor(Tensor(a) self, int dim, Tensor start, int length) -> Tensor(a)", - [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool { - auto in = args[0].ITensor(); - auto axis = args[1].unwrapToInt(); - torch::Tensor start = args[2].IValue()->toTensor().to(torch::kI32); - int32_t startIdx = start.item().to(); - auto length = (int32_t)args[3].unwrapToInt(); - - // index to access needs to be an at::Tensor - at::Tensor indices = torch::arange(startIdx, startIdx + length, 1).to(torch::kI32); - auto weights = Weights(ctx, indices); - - // IConstantLayer to convert indices from Weights to ITensor - auto const_layer = ctx->net->addConstant(weights.shape, weights.data); - TORCHTRT_CHECK(const_layer, "Unable to create constant layer from node: " << *n); - auto const_out = const_layer->getOutput(0); - - // IGatherLayer takes in input tensor, the indices, and the axis - // of input tensor to take indices from - auto gather_layer = ctx->net->addGather(*in, *const_out, axis); - TORCHTRT_CHECK(gather_layer, "Unable to create gather layer from node: " << *n); - auto gather_out = gather_layer->getOutput(0); - - // IShuffleLayer removes redundant dimensions - auto shuffle_layer = ctx->net->addShuffle(*gather_out); - TORCHTRT_CHECK(shuffle_layer, "Unable to create shuffle layer from node: " << *n); - shuffle_layer->setReshapeDimensions(util::unpadDims(gather_out->getDimensions())); - shuffle_layer->setName(util::node_info(n).c_str()); - auto shuffle_out = shuffle_layer->getOutput(0); - - auto out = ctx->AssociateValueAndTensor(n->outputs()[0], shuffle_out); - - LOG_DEBUG("Output tensor shape: " << out->getDimensions()); + return true; + }}) + .pattern({"aten::narrow.Tensor(Tensor(a) self, int dim, Tensor start, int length) -> Tensor(a)", + [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool { + auto in = args[0].ITensor(); + auto axis = args[1].unwrapToInt(); + torch::Tensor start = args[2].IValue()->toTensor().to(torch::kI32); + 
int32_t startIdx = start.item().to(); + auto length = (int32_t)args[3].unwrapToInt(); + + // index to access needs to be an at::Tensor + at::Tensor indices = torch::arange(startIdx, startIdx + length, 1).to(torch::kI32); + auto weights = Weights(ctx, indices); + + // IConstantLayer to convert indices from Weights to ITensor + auto const_layer = ctx->net->addConstant(weights.shape, weights.data); + TORCHTRT_CHECK(const_layer, "Unable to create constant layer from node: " << *n); + auto const_out = const_layer->getOutput(0); + + // IGatherLayer takes in input tensor, the indices, and the axis + // of input tensor to take indices from + auto gather_layer = ctx->net->addGather(*in, *const_out, axis); + TORCHTRT_CHECK(gather_layer, "Unable to create gather layer from node: " << *n); + auto gather_out = gather_layer->getOutput(0); + + // IShuffleLayer removes redundant dimensions + auto shuffle_layer = ctx->net->addShuffle(*gather_out); + TORCHTRT_CHECK(shuffle_layer, "Unable to create shuffle layer from node: " << *n); + shuffle_layer->setReshapeDimensions(util::unpadDims(gather_out->getDimensions())); + shuffle_layer->setName(util::node_info(n).c_str()); + auto shuffle_out = shuffle_layer->getOutput(0); + + auto out = ctx->AssociateValueAndTensor(n->outputs()[0], shuffle_out); + + LOG_DEBUG("Output tensor shape: " << out->getDimensions()); - return true; - }}) + return true; + }}) .pattern( {"aten::embedding(Tensor weight, Tensor indices, int padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> (Tensor)", [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool { @@ -239,30 +236,29 @@ auto select_registrations TORCHTRT_UNUSED = return true; }}) - .pattern( - {"aten::roll(Tensor self, int[1] shifts, int[1] dims=[]) -> (Tensor)", - [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool { - auto in = args[0].ITensor(); - auto shifts = args[1].unwrapToIntList().vec(); - auto dims = args[2].unwrapToIntList().vec(); - - TORCHTRT_CHECK(dims.size() == shifts.size(), "dims.size() should be equal to shifts.size()"); - if (ctx->input_is_dynamic) { - TORCHTRT_THROW_ERROR("aten::roll is currently not support in dynamic input shape compilation"); - } else { - auto in_shape = util::toVec(in->getDimensions()); - for (size_t i = 0; i < dims.size(); i++) { - auto dim = dims[i] < 0 ? (in_shape.size() + dims[i]) : dims[i]; - TORCHTRT_CHECK(dim < in_shape.size(), "Dimension out of range"); - in = roll(ctx, in, shifts[i], dim, in_shape); - } - auto out = ctx->AssociateValueAndTensor(n->outputs()[0], in); - - LOG_DEBUG("Output tensor shape: " << out->getDimensions()); - - return true; - } - }}) + .pattern({"aten::roll(Tensor self, int[1] shifts, int[1] dims=[]) -> (Tensor)", + [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool { + auto in = args[0].ITensor(); + auto shifts = args[1].unwrapToIntList().vec(); + auto dims = args[2].unwrapToIntList().vec(); + + TORCHTRT_CHECK(dims.size() == shifts.size(), "dims.size() should be equal to shifts.size()"); + if (ctx->input_is_dynamic) { + TORCHTRT_THROW_ERROR("aten::roll is currently not support in dynamic input shape compilation"); + } else { + auto in_shape = util::toVec(in->getDimensions()); + for (size_t i = 0; i < dims.size(); i++) { + auto dim = dims[i] < 0 ? 
(in_shape.size() + dims[i]) : dims[i]; + TORCHTRT_CHECK(dim < in_shape.size(), "Dimension out of range"); + in = roll(ctx, in, shifts[i], dim, in_shape); + } + auto out = ctx->AssociateValueAndTensor(n->outputs()[0], in); + + LOG_DEBUG("Output tensor shape: " << out->getDimensions()); + + return true; + } + }}) .pattern( {"aten::index.Tensor(Tensor self, Tensor?[] indices) -> (Tensor)", [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool { @@ -319,7 +315,8 @@ auto select_registrations TORCHTRT_UNUSED = int startIdx = 0; auto startIdxIVal = args[2].IValue(); if (!startIdxIVal->isNone()) { - startIdx = startIdxIVal->toInt() > std::numeric_limits::max() ? maxDim : startIdxIVal->toInt(); + startIdx = + startIdxIVal->toInt() > std::numeric_limits::max() ? maxDim : startIdxIVal->toInt(); startIdx = maxDim == -1 ? startIdx : std::min(startIdx, maxDim); } // Handle case when given tensor index is negative @@ -331,7 +328,8 @@ auto select_registrations TORCHTRT_UNUSED = int endIdx = maxDim; // -1 for dynamic shape auto endIdxIVal = args[3].IValue(); if (!endIdxIVal->isNone()) { - int truncate_value = endIdxIVal->toInt() > std::numeric_limits::max() ? maxDim : endIdxIVal->toInt(); + int truncate_value = + endIdxIVal->toInt() > std::numeric_limits::max() ? maxDim : endIdxIVal->toInt(); endIdx = maxDim == -1 ? truncate_value : std::min(truncate_value, maxDim); } if (maxDim > 0) { @@ -385,7 +383,8 @@ auto select_registrations TORCHTRT_UNUSED = // update start and end nvinfer1::ITensor* out_start; nvinfer1::ITensor* out_end; - auto start_end = normalize_start_and_end(ctx, ishape_tensor, start_itensor, end_itensor, nbdims, node_name); + auto start_end = + normalize_start_and_end(ctx, ishape_tensor, start_itensor, end_itensor, nbdims, node_name); out_start = start_end[0]; out_end = start_end[1]; @@ -397,7 +396,7 @@ auto select_registrations TORCHTRT_UNUSED = slice_layer->setInput(2, *size_itensor); // size, must be set if input is dynamic } auto slice_out = slice_layer->getOutput(0); - + auto out = ctx->AssociateValueAndTensor(n->outputs()[0], slice_out); LOG_DEBUG("Slice layer output shape: " << out->getDimensions()); diff --git a/core/ir/GraphInputs.cpp b/core/ir/GraphInputs.cpp index 007a7279e7..a1b1196be9 100644 --- a/core/ir/GraphInputs.cpp +++ b/core/ir/GraphInputs.cpp @@ -5,70 +5,74 @@ namespace torch_tensorrt { namespace core { namespace ir { -void flatten_dfs(std::vector& flattened_inputs, std::vector>& collection_inputs, - torch::jit::IValue input_ivalue, int level, int index) { - if (input_ivalue.isTuple()) { - auto input_tuple = input_ivalue.toTuple(); - int idx = 0; - if (level == 0) { - collection_inputs.resize(input_tuple->elements().size()); - } - for (auto item: input_tuple->elements()) { - torch::jit::IValue converted_item; - int cur_idx = level < 1 ? idx: index; - flatten_dfs(flattened_inputs, collection_inputs, item, level+1, cur_idx); - idx++; - } - } else if(input_ivalue.isList()) { - auto input_list = input_ivalue.toList().vec(); - if (level == 0) { - collection_inputs.resize(input_list.size()); - } - c10::TypePtr type = input_list[0].type(); - auto converted_elements = c10::impl::GenericList(type); - int idx = 0; - for (auto item: input_list) { - int cur_idx = level < 1 ? 
idx: index; - flatten_dfs(flattened_inputs, collection_inputs, item, level+1, cur_idx); - idx++; - } - } else if(input_ivalue.isCustomClass()) { - torch_tensorrt::core::ir::Input cur_input = *(input_ivalue.toCustomClass()); - flattened_inputs.push_back(cur_input); - if (level == 0) { // a single value like A - collection_inputs.resize(1); - collection_inputs[0].push_back(cur_input); - } else if (level == 1) { // like A in [A, A] or [(B, B), A] - collection_inputs[index].push_back(cur_input); - } else if (level == 2) { // like A in [(A, A), C] - collection_inputs[index].push_back(cur_input); - } else {// only support 2 level - LOG_ERROR("Input nesting depth exceeds currently supported depth (3), use 1 level: [A, B], or 2 level: [A, (B, C)]"); - } +void flatten_dfs( + std::vector& flattened_inputs, + std::vector>& collection_inputs, + torch::jit::IValue input_ivalue, + int level, + int index) { + if (input_ivalue.isTuple()) { + auto input_tuple = input_ivalue.toTuple(); + int idx = 0; + if (level == 0) { + collection_inputs.resize(input_tuple->elements().size()); } + for (auto item : input_tuple->elements()) { + torch::jit::IValue converted_item; + int cur_idx = level < 1 ? idx : index; + flatten_dfs(flattened_inputs, collection_inputs, item, level + 1, cur_idx); + idx++; + } + } else if (input_ivalue.isList()) { + auto input_list = input_ivalue.toList().vec(); + if (level == 0) { + collection_inputs.resize(input_list.size()); + } + c10::TypePtr type = input_list[0].type(); + auto converted_elements = c10::impl::GenericList(type); + int idx = 0; + for (auto item : input_list) { + int cur_idx = level < 1 ? idx : index; + flatten_dfs(flattened_inputs, collection_inputs, item, level + 1, cur_idx); + idx++; + } + } else if (input_ivalue.isCustomClass()) { + torch_tensorrt::core::ir::Input cur_input = *(input_ivalue.toCustomClass()); + flattened_inputs.push_back(cur_input); + if (level == 0) { // a single value like A + collection_inputs.resize(1); + collection_inputs[0].push_back(cur_input); + } else if (level == 1) { // like A in [A, A] or [(B, B), A] + collection_inputs[index].push_back(cur_input); + } else if (level == 2) { // like A in [(A, A), C] + collection_inputs[index].push_back(cur_input); + } else { // only support 2 level + LOG_ERROR( + "Input nesting depth exceeds currently supported depth (3), use 1 level: [A, B], or 2 level: [A, (B, C)]"); + } + } } - GraphInputs::GraphInputs(std::vector inputs_) { - LOG_DEBUG("Construct GraphInput with ir::Input"); - inputs = inputs_; - collection_inputs.resize(inputs_.size()); - for (size_t i = 0; i < inputs_.size(); i++) { - collection_inputs[i].push_back(inputs_[i]); - } + LOG_DEBUG("Construct GraphInput with ir::Input"); + inputs = inputs_; + collection_inputs.resize(inputs_.size()); + for (size_t i = 0; i < inputs_.size(); i++) { + collection_inputs[i].push_back(inputs_[i]); + } } GraphInputs::GraphInputs(torch::jit::IValue& input_signature_) { - LOG_DEBUG("Construct GraphInput with IValue"); + LOG_DEBUG("Construct GraphInput with IValue"); - std::vector flattened_inputs; - std::vector> collection_inputs_; + std::vector flattened_inputs; + std::vector> collection_inputs_; - flatten_dfs(flattened_inputs, collection_inputs_, input_signature_, 0, 0); - inputs = flattened_inputs; - input_signature = input_signature_; - collection_inputs = collection_inputs_; - LOG_DEBUG("Collection Input Size: " << collection_inputs_.size()); + flatten_dfs(flattened_inputs, collection_inputs_, input_signature_, 0, 0); + inputs = flattened_inputs; + 
input_signature = input_signature_; + collection_inputs = collection_inputs_; + LOG_DEBUG("Collection Input Size: " << collection_inputs_.size()); } } // namespace ir diff --git a/core/ir/StaticParams.cpp b/core/ir/StaticParams.cpp index 0073ad2888..8502c80acf 100644 --- a/core/ir/StaticParams.cpp +++ b/core/ir/StaticParams.cpp @@ -12,8 +12,7 @@ StaticParams get_static_params(c10::ArrayRef inputs, std::ve auto param_it = params.begin(); for (auto in : inputs) { // handle TensorType, TupleType and ListType - if (in->type() != c10::TensorType::get() && - in->type()->kind() != torch::jit::TypeKind::TupleType && + if (in->type() != c10::TensorType::get() && in->type()->kind() != torch::jit::TypeKind::TupleType && in->type()->kind() != torch::jit::TypeKind::ListType && param_it != params.end()) { static_params[in] = *param_it; ++param_it; diff --git a/core/ir/ir.cpp b/core/ir/ir.cpp index cc82fe09b4..d9b021ed8b 100644 --- a/core/ir/ir.cpp +++ b/core/ir/ir.cpp @@ -35,7 +35,9 @@ InputSpecMap pair_input_vals_with_specs(std::vector va return a; } -CollectionInputSpecMap pair_input_vals_with_specs_collection(std::vector vals, std::vector>& specs) { +CollectionInputSpecMap pair_input_vals_with_specs_collection( + std::vector vals, + std::vector>& specs) { TORCHTRT_CHECK( vals.size() == specs.size(), "Expected dimension specifications for all input tensors" @@ -64,7 +66,7 @@ std::vector get_tensor_inputs( // input.1:Tensor -> used if (in->type()->isSubtypeOf(c10::TensorType::get()) && static_params.find(in) == static_params.end()) { input_tensors.push_back(in); - } + } } return input_tensors; } @@ -80,7 +82,8 @@ std::vector get_collection_inputs( if (in->type()->isSubtypeOf(c10::TensorType::get()) && static_params.find(in) == static_params.end()) { input_tensors.push_back(in); } else if (in->type()->kind() == torch::jit::TypeKind::TupleType && static_params.find(in) == static_params.end()) { - // } else if (in->type()->isSubtypeOf(c10::TupleType::create()) && static_params.find(in) == static_params.end()) { + // } else if (in->type()->isSubtypeOf(c10::TupleType::create()) && static_params.find(in) == static_params.end()) + // { input_tensors.push_back(in); // push original tuple at::ArrayRef unpack_tuple = torch::jit::createTupleUnpack(in); LOG_DEBUG("get_collection_inputs, tuple size " << unpack_tuple.size()); @@ -190,15 +193,15 @@ TypeMap get_block_first_calc_dtypes_opt(torch::jit::Block* b) { if (i->type() == c10::TensorType::get()) { torch::jit::Value* in = i; types.insert({in, get_value_first_calc_dtype_opt(b, i)}); - } else if(i->type()->cast()) { + } else if (i->type()->cast()) { // make sure very time get the same ptr at::ArrayRef unpack_tuple = torch::jit::createTupleUnpack(i); LOG_DEBUG("Tuple size " << unpack_tuple.size()); - for (auto item: unpack_tuple) { + for (auto item : unpack_tuple) { torch::jit::Value* in = item; types.insert({in, get_value_first_calc_dtype_opt(b, i)}); } - } else if(i->type()->isSubtypeOf(c10::ListType::ofTensors())) { + } else if (i->type()->isSubtypeOf(c10::ListType::ofTensors())) { LOG_INFO("Unsupported type of c10::ListType::ofTensors()"); } } @@ -212,7 +215,7 @@ CollectionTypeMap get_block_first_calc_dtypes_opt_collection(torch::jit::Block* torch::jit::Value* in = i; types.insert({in, {get_value_first_calc_dtype_opt(b, i)}}); - } else if(i->type()->kind() == torch::jit::TypeKind::TupleType) { + } else if (i->type()->kind() == torch::jit::TypeKind::TupleType) { // TODO: to evaluate the data type of tuple element // make sure very time get the same ptr // 
c10::optional tp = get_value_first_calc_dtype_opt(b, i); @@ -220,9 +223,9 @@ CollectionTypeMap get_block_first_calc_dtypes_opt_collection(torch::jit::Block* // TODO: calculate the tuple element type, currently we use {} as default datatype // std::vector> dytpes(unpack_tuple.size(), tp); std::vector> dytpes(unpack_tuple.size()); - types.insert({i, dytpes}); // insert an empty + types.insert({i, dytpes}); // insert an empty - } else if(i->type()->kind() == torch::jit::TypeKind::ListType) { + } else if (i->type()->kind() == torch::jit::TypeKind::ListType) { // TODO: to decide the size of list and type of list element LOG_DEBUG("get_block_first_calc_dtypes_opt ListType: use size " << i->uses().size()); c10::optional tp = get_value_first_calc_dtype_opt(b, i); @@ -234,8 +237,7 @@ CollectionTypeMap get_block_first_calc_dtypes_opt_collection(torch::jit::Block* return types; } -static auto core_input_container = - torch::class_("_torch_tensorrt_core_ir", "Input").def(torch::init<>()); +static auto core_input_container = torch::class_("_torch_tensorrt_core_ir", "Input").def(torch::init<>()); } // namespace ir } // namespace core diff --git a/core/ir/ir.h b/core/ir/ir.h index 966c747176..a5225daa25 100644 --- a/core/ir/ir.h +++ b/core/ir/ir.h @@ -12,7 +12,7 @@ namespace core { namespace ir { struct Input : torch::CustomClassHolder { - Input() {}; + Input(){}; Input( std::vector shape, nvinfer1::DataType dtype = nvinfer1::DataType::kFLOAT, @@ -42,8 +42,8 @@ struct Input : torch::CustomClassHolder { struct GraphInputs { GraphInputs(std::vector inputs); GraphInputs(torch::jit::IValue& input_signature); - torch::jit::IValue input_signature; // nested Input, full input spec - std::vector inputs; // flattend Input + torch::jit::IValue input_signature; // nested Input, full input spec + std::vector inputs; // flattend Input std::vector> collection_inputs; // only support two layer nesting, e.g. 
((a, b), [c, d], e) }; @@ -67,7 +67,9 @@ CollectionInputSpecMap associate_specs_with_collection_inputs( ir::GraphInputs graph_inputs, StaticParams& static_params); InputSpecMap pair_input_vals_with_specs(std::vector vals, std::vector specs); -CollectionInputSpecMap pair_input_vals_with_specs_collection(std::vector vals, std::vector>& specs); +CollectionInputSpecMap pair_input_vals_with_specs_collection( + std::vector vals, + std::vector>& specs); std::vector get_tensor_inputs( std::shared_ptr& g, StaticParams& static_params); diff --git a/core/partitioning/shape_analysis.cpp b/core/partitioning/shape_analysis.cpp index 1221318647..8767048030 100644 --- a/core/partitioning/shape_analysis.cpp +++ b/core/partitioning/shape_analysis.cpp @@ -9,31 +9,28 @@ namespace core { namespace partitioning { at::Tensor generateSingleInput(ir::Input& input, c10::optional& type_opt) { - auto cur_shape = input.input_shape; - std::vector shape; - shape.insert(shape.begin(), std::begin(cur_shape.d), std::begin(cur_shape.d) + cur_shape.nbDims); - // auto type_opt = types[input.first][i]; - auto type = at::kFloat; - if (type_opt) { - type = type_opt.value(); - } else { - LOG_WARNING("Input type for doing shape analysis could not be determined, defaulting to F32"); - } - auto in = at::randint(5, shape, {at::kCUDA}).to(type); - // ivalue_map[input.first] = in.clone(); - return in; + auto cur_shape = input.input_shape; + std::vector shape; + shape.insert(shape.begin(), std::begin(cur_shape.d), std::begin(cur_shape.d) + cur_shape.nbDims); + // auto type_opt = types[input.first][i]; + auto type = at::kFloat; + if (type_opt) { + type = type_opt.value(); + } else { + LOG_WARNING("Input type for doing shape analysis could not be determined, defaulting to F32"); + } + auto in = at::randint(5, shape, {at::kCUDA}).to(type); + // ivalue_map[input.first] = in.clone(); + return in; } std::unordered_map generateRandomInputs( std::unordered_map>& inputs, std::unordered_map>>& types) { - // generate random inputs for running pytorch segments std::unordered_map ivalue_map; - for (auto& input : inputs) { - if (input.first->type()->kind() == torch::jit::TypeKind::ListType) { // create list std::vector list; @@ -56,7 +53,6 @@ std::unordered_map generateRandomI } else { auto in = generateSingleInput(input.second[0], types[input.first][0]); ivalue_map[input.first] = in.clone(); - } } return ivalue_map; @@ -109,7 +105,8 @@ void getSegmentsOutputByRunning( jit_inputs_ivalues.push_back(ivalues_maps[input].toBool()); } else if (input->type()->kind() == torch::jit::TypeKind::ListType) { // create list - jit_inputs_ivalues.push_back(ivalues_maps[input].toList());; + jit_inputs_ivalues.push_back(ivalues_maps[input].toList()); + ; } else if (input->type()->kind() == torch::jit::TypeKind::TupleType) { // create tuple jit_inputs_ivalues.push_back(ivalues_maps[input].toTuple()); diff --git a/core/partitioning/shape_analysis.h b/core/partitioning/shape_analysis.h index 2654699a1d..e9c51fc62d 100644 --- a/core/partitioning/shape_analysis.h +++ b/core/partitioning/shape_analysis.h @@ -6,7 +6,6 @@ namespace torch_tensorrt { namespace core { namespace partitioning { - std::unordered_map generateRandomInputs( std::unordered_map>& input_ranges, std::unordered_map>>& input_types); diff --git a/cpp/bin/torchtrtc/main.cpp b/cpp/bin/torchtrtc/main.cpp index 6c207d78da..51ec2c51c6 100644 --- a/cpp/bin/torchtrtc/main.cpp +++ b/cpp/bin/torchtrtc/main.cpp @@ -117,8 +117,7 @@ int main(int argc, char** argv) { parser, "num_iters", "Number of averaging timing 
iterations used to select kernels", {"num-avg-timing-iters"}); args::ValueFlag workspace_size( parser, "workspace_size", "Maximum size of workspace given to TensorRT", {"workspace-size"}); - args::ValueFlag dla_sram_size( - parser, "dla_sram_size", "DLA managed SRAM size", {"dla-sram-size"}); + args::ValueFlag dla_sram_size(parser, "dla_sram_size", "DLA managed SRAM size", {"dla-sram-size"}); args::ValueFlag dla_local_dram_size( parser, "dla_local_dram_size", "DLA Local DRAM size", {"dla-local-dram-size"}); args::ValueFlag dla_global_dram_size( diff --git a/cpp/include/torch_tensorrt/torch_tensorrt.h b/cpp/include/torch_tensorrt/torch_tensorrt.h index 11dc5d74c6..6a7035ec2e 100644 --- a/cpp/include/torch_tensorrt/torch_tensorrt.h +++ b/cpp/include/torch_tensorrt/torch_tensorrt.h @@ -364,7 +364,7 @@ class TORCHTRT_API TensorFormat { * signifying a static input shape or a set of three input shapes representing * the min, optiminal and max input shapes allowed for the engine. */ -struct TORCHTRT_API Input : torch::CustomClassHolder{ +struct TORCHTRT_API Input : torch::CustomClassHolder { /// Minimum acceptable input size into the engine std::vector min_shape; /// Optimal input size into the engine (size optimized for given kernels accept any size in min max range) @@ -520,7 +520,7 @@ struct TORCHTRT_API Input : torch::CustomClassHolder{ * This struct can either hold a complex inputs of shape or a flattened one, */ struct TORCHTRT_API GraphInputs { - torch::jit::IValue input_signature; // nested Input, full input spec + torch::jit::IValue input_signature; // nested Input, full input spec std::vector inputs; // flatten input spec }; diff --git a/cpp/src/compile_spec.cpp b/cpp/src/compile_spec.cpp index 1fb4c56a98..432b070e91 100644 --- a/cpp/src/compile_spec.cpp +++ b/cpp/src/compile_spec.cpp @@ -29,40 +29,38 @@ CompileSpec::CompileSpec(std::vector> fixed_sizes) { } CompileSpec::CompileSpec(std::vector inputs) { - graph_inputs.inputs = std::move(inputs); + graph_inputs.inputs = std::move(inputs); } CompileSpec::CompileSpec(torch::jit::IValue input_signature) { - graph_inputs.input_signature = input_signature; + graph_inputs.input_signature = input_signature; } - - void to_internal_input_signature(torch::jit::IValue input_ivalue, torch::jit::IValue& converted_ivalue) { - if (input_ivalue.isTuple()) { - auto input_tuple = input_ivalue.toTuple(); - std::vector converted_elements; - for (auto item: input_tuple->elements()) { - torch::jit::IValue converted_item; - to_internal_input_signature(item, converted_item); - converted_elements.push_back(converted_item); - auto tuple_ptr = c10::ivalue::Tuple::create(converted_elements); - converted_ivalue = torch::jit::IValue(tuple_ptr); - } - } else if(input_ivalue.isList()) { - auto input_list = input_ivalue.toList().vec(); - c10::TypePtr type = input_list[0].type(); - auto converted_elements = c10::impl::GenericList(type); - for (auto item: input_list) { - torch::jit::IValue converted_item; - to_internal_input_signature(item, converted_item); - converted_elements.push_back(converted_item); - } - converted_ivalue = torch::jit::IValue(converted_elements); - } else if(input_ivalue.isCustomClass()) { - torchtrt::core::ir::Input cur_input = to_internal_input(*(input_ivalue.toCustomClass())); - converted_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(cur_input))); + if (input_ivalue.isTuple()) { + auto input_tuple = input_ivalue.toTuple(); + std::vector converted_elements; + for (auto item : input_tuple->elements()) { + torch::jit::IValue 
converted_item; + to_internal_input_signature(item, converted_item); + converted_elements.push_back(converted_item); + auto tuple_ptr = c10::ivalue::Tuple::create(converted_elements); + converted_ivalue = torch::jit::IValue(tuple_ptr); } + } else if (input_ivalue.isList()) { + auto input_list = input_ivalue.toList().vec(); + c10::TypePtr type = input_list[0].type(); + auto converted_elements = c10::impl::GenericList(type); + for (auto item : input_list) { + torch::jit::IValue converted_item; + to_internal_input_signature(item, converted_item); + converted_elements.push_back(converted_item); + } + converted_ivalue = torch::jit::IValue(converted_elements); + } else if (input_ivalue.isCustomClass()) { + torchtrt::core::ir::Input cur_input = to_internal_input(*(input_ivalue.toCustomClass())); + converted_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(cur_input))); + } } torchtrt::core::CompileSpec init_compile_spec(CompileSpec external) { diff --git a/cpp/src/torch_tensorrt.cpp b/cpp/src/torch_tensorrt.cpp index 93813190ab..22855aeb03 100644 --- a/cpp/src/torch_tensorrt.cpp +++ b/cpp/src/torch_tensorrt.cpp @@ -53,6 +53,5 @@ void set_device(const int gpu_id) { torch_tensorrt::core::set_device(gpu_id); } -static auto tensorrt_input_container = - torch::class_("_torch_tensorrt", "Input").def(torch::init<>()); +static auto tensorrt_input_container = torch::class_("_torch_tensorrt", "Input").def(torch::init<>()); } // namespace torch_tensorrt diff --git a/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp b/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp index 0eb6fba2de..274b40d479 100644 --- a/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp +++ b/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp @@ -28,7 +28,8 @@ void RegisterTRTCompileSpec() { .def(torch::init<>()) .def("__str__", &torch_tensorrt::pyapi::InputSignature::to_str); - ADD_FIELD_GET_SET_REGISTRATION(TRTInputSignatureTSRegistration, torch_tensorrt::pyapi::InputSignature, signature_ivalue); + ADD_FIELD_GET_SET_REGISTRATION( + TRTInputSignatureTSRegistration, torch_tensorrt::pyapi::InputSignature, signature_ivalue); static auto TORCHTRT_UNUSED TRTDeviceTSRegistration = torch::class_("tensorrt", "_Device") @@ -73,7 +74,8 @@ void RegisterTRTCompileSpec() { ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, workspace_size); ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, dla_sram_size); ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, dla_local_dram_size); - ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, dla_global_dram_size); + ADD_FIELD_GET_SET_REGISTRATION( + TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, dla_global_dram_size); ADD_FIELD_GET_SET_REGISTRATION( TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, truncate_long_and_double); } diff --git a/py/torch_tensorrt/csrc/tensorrt_classes.cpp b/py/torch_tensorrt/csrc/tensorrt_classes.cpp index ca11cf4bc1..96fef793fd 100644 --- a/py/torch_tensorrt/csrc/tensorrt_classes.cpp +++ b/py/torch_tensorrt/csrc/tensorrt_classes.cpp @@ -108,35 +108,35 @@ std::string sig_to_str(torch::jit::IValue input_sig) { if (input_sig.isTuple()) { auto input_tuple = input_sig.toTuple(); std::vector children; - for (auto item: input_tuple->elements()) { + for (auto item : input_tuple->elements()) { auto child = sig_to_str(item); children.push_back(child); } 
std::stringstream ss; ss << "("; for (auto i : children) { - ss << i << ", "; + ss << i << ", "; } ss << ")"; return ss.str(); - } else if(input_sig.isList()) { + } else if (input_sig.isList()) { auto input_list = input_sig.toList().vec(); std::vector children; - for (auto item: input_list) { + for (auto item : input_list) { auto child = sig_to_str(item); children.push_back(child); } std::stringstream ss; ss << "["; for (auto i : children) { - ss << i << ", "; + ss << i << ", "; } ss << "]"; return ss.str(); - } else if(input_sig.isCustomClass()) { + } else if (input_sig.isCustomClass()) { auto cur_input = input_sig.toCustomClass(); return cur_input->to_str(); - } else if(input_sig.isPyObject()) { + } else if (input_sig.isPyObject()) { auto py_object_holder = input_sig.toPyObjectHolder(); auto infer_type = py_object_holder->tryToInferType(); auto type = infer_type.type(); @@ -238,27 +238,27 @@ void to_internal_input_signature(torch::jit::IValue input_ivalue, torch::jit::IV if (input_ivalue.isTuple()) { auto input_tuple = input_ivalue.toTuple(); std::vector converted_elements; - for (auto item: input_tuple->elements()) { + for (auto item : input_tuple->elements()) { torch::jit::IValue converted_item; to_internal_input_signature(item, converted_item); converted_elements.push_back(converted_item); auto tuple_ptr = c10::ivalue::Tuple::create(converted_elements); converted_ivalue = torch::jit::IValue(tuple_ptr); } - } else if(input_ivalue.isList()) { + } else if (input_ivalue.isList()) { auto input_list = input_ivalue.toList().vec(); c10::TypePtr type = input_list[0].type(); auto converted_elements = c10::impl::GenericList(type); - for (auto item: input_list) { + for (auto item : input_list) { torch::jit::IValue converted_item; to_internal_input_signature(item, converted_item); converted_elements.push_back(converted_item); } converted_ivalue = torch::jit::IValue(converted_elements); - } else if(input_ivalue.isCustomClass()) { + } else if (input_ivalue.isCustomClass()) { core::ir::Input cur_input = (*(input_ivalue.toCustomClass())).toInternalInput(); converted_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(cur_input))); - } else if(input_ivalue.isPyObject()) { + } else if (input_ivalue.isPyObject()) { auto py_object_holder = input_ivalue.toPyObjectHolder(); auto infer_type = py_object_holder->tryToInferType(); auto type = infer_type.type(); @@ -325,11 +325,17 @@ core::CompileSpec CompileSpec::toInternalCompileSpec() { info.convert_info.engine_settings.num_avg_timing_iters = num_avg_timing_iters; TORCHTRT_CHECK(workspace_size >= 0, "workspace_size must be 0 or greater"); info.convert_info.engine_settings.workspace_size = workspace_size; - TORCHTRT_CHECK(dla_sram_size >= 4096, "DLA managed SRAM size must be at least 4 KiB and must be a power of 2. This defaults to 1 MiB"); + TORCHTRT_CHECK( + dla_sram_size >= 4096, + "DLA managed SRAM size must be at least 4 KiB and must be a power of 2. This defaults to 1 MiB"); info.convert_info.engine_settings.dla_sram_size = dla_sram_size; - TORCHTRT_CHECK(dla_local_dram_size >= 4096, "DLA Local DRAM size must be at least 4 KiB and must be a power of 2. This defaults to 1 GiB"); + TORCHTRT_CHECK( + dla_local_dram_size >= 4096, + "DLA Local DRAM size must be at least 4 KiB and must be a power of 2. This defaults to 1 GiB"); info.convert_info.engine_settings.dla_local_dram_size = dla_local_dram_size; - TORCHTRT_CHECK(dla_global_dram_size >= 4096, "DLA Global DRAM size must be at least 4 KiB and must be a power of 2. 
This defaults to 512 MiB"); + TORCHTRT_CHECK( + dla_global_dram_size >= 4096, + "DLA Global DRAM size must be at least 4 KiB and must be a power of 2. This defaults to 512 MiB"); info.convert_info.engine_settings.dla_global_dram_size = dla_global_dram_size; return info; } @@ -348,7 +354,7 @@ std::string CompileSpec::stringify() { } ss << " \"Enabled Precision\": ["; for (auto p : enabled_precisions) { - ss << to_str(p) << ", " ; + ss << to_str(p) << ", "; } ss << "]" << std::endl; ss << " \"TF32 Disabled\": " << disable_tf32 << std::endl; diff --git a/py/torch_tensorrt/csrc/tensorrt_classes.h b/py/torch_tensorrt/csrc/tensorrt_classes.h index d3b22740c2..be2fab3b8e 100644 --- a/py/torch_tensorrt/csrc/tensorrt_classes.h +++ b/py/torch_tensorrt/csrc/tensorrt_classes.h @@ -58,7 +58,7 @@ struct Input : torch::CustomClassHolder { }; struct InputSignature : torch::CustomClassHolder { - torch::jit::IValue signature_ivalue; // nested Input, full input spec + torch::jit::IValue signature_ivalue; // nested Input, full input spec ADD_FIELD_GET_SET(signature_ivalue, torch::jit::IValue); std::string to_str(); }; diff --git a/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp b/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp index 6247789a93..6b1ffd4ccf 100644 --- a/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp +++ b/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp @@ -1,8 +1,8 @@ #include "pybind11/pybind11.h" #include "pybind11/stl.h" -#include "Python.h" #include "ATen/core/jit_type.h" +#include "Python.h" #include "core/compiler.h" #include "core/conversion/conversion.h" #include "tensorrt_classes.h" @@ -182,7 +182,8 @@ PYBIND11_MODULE(_C, m) { py::class_(m, "InputSignature") .def(pybind11::init([](py::object py_obj) { InputSignature input_signature; - input_signature.signature_ivalue = torch::jit::toIValue(std::move(py_obj), c10::PyObjectType::get(), c10::nullopt); + input_signature.signature_ivalue = + torch::jit::toIValue(std::move(py_obj), c10::PyObjectType::get(), c10::nullopt); return input_signature; })) .def("__str__", &InputSignature::to_str) diff --git a/tests/core/conversion/converters/test_cast.cpp b/tests/core/conversion/converters/test_cast.cpp index 092cdb32a6..d26c7a0277 100644 --- a/tests/core/conversion/converters/test_cast.cpp +++ b/tests/core/conversion/converters/test_cast.cpp @@ -135,7 +135,6 @@ TEST(Converters, ATenBoolToINT32TensorConvertsCorrectly) { ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(jit_results[0], trt, 2e-6)); } - TEST(Converters, ATenToSingleConvertsCorrectly) { const auto graph = R"IR( graph(%y.1 : Tensor): @@ -164,7 +163,6 @@ TEST(Converters, ATenToSingleConvertsCorrectly) { ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(jit_results[0], trt, 2e-6)); } - TEST(Converters, ATenTypeAsConvertsCorrectly) { const auto graph = R"IR( graph(%0 : Tensor, diff --git a/tests/cpp/test_collections.cpp b/tests/cpp/test_collections.cpp index df2280b947..829e82abc9 100644 --- a/tests/cpp/test_collections.cpp +++ b/tests/cpp/test_collections.cpp @@ -5,9 +5,7 @@ #include "torch/script.h" #include "torch_tensorrt/torch_tensorrt.h" - TEST(CppAPITests, TestCollectionStandardTensorInput) { - std::string path = "tests/modules/standard_tensor_input_scripted.jit.pt"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); std::vector inputs; @@ -24,7 +22,6 @@ TEST(CppAPITests, TestCollectionStandardTensorInput) { mod.eval(); mod.to(torch::kCUDA); - std::vector inputs_; for (auto in : inputs) { @@ -52,7 +49,6 @@ TEST(CppAPITests, 
TestCollectionStandardTensorInput) { } TEST(CppAPITests, TestCollectionTupleInput) { - std::string path = "tests/modules/tuple_input_scripted.jit.pt"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); @@ -78,14 +74,12 @@ TEST(CppAPITests, TestCollectionTupleInput) { auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); - std::tuple input_shape_tuple(input_shape_ivalue, input_shape_ivalue); torch::jit::IValue complex_input_shape(input_shape_tuple); std::tuple input_tuple2(complex_input_shape); torch::jit::IValue complex_input_shape2(input_tuple2); - auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); compile_settings.require_full_compilation = false; compile_settings.min_block_size = 3; @@ -100,9 +94,7 @@ TEST(CppAPITests, TestCollectionTupleInput) { ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTensor(), trt_out.toTensor(), 1e-5)); } - TEST(CppAPITests, TestCollectionListInput) { - std::string path = "tests/modules/list_input_scripted.jit.pt"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); std::vector inputs; @@ -118,7 +110,6 @@ TEST(CppAPITests, TestCollectionListInput) { mod.eval(); mod.to(torch::kCUDA); - std::vector inputs_; for (auto in : inputs) { @@ -134,7 +125,6 @@ TEST(CppAPITests, TestCollectionListInput) { complex_inputs.push_back(input_list_ivalue); - auto out = mod.forward(complex_inputs); LOG_DEBUG("Finish torchscirpt forward"); @@ -146,7 +136,6 @@ TEST(CppAPITests, TestCollectionListInput) { list.push_back(input_shape_ivalue); list.push_back(input_shape_ivalue); - torch::jit::IValue complex_input_shape(list); std::tuple input_tuple2(complex_input_shape); torch::jit::IValue complex_input_shape2(input_tuple2); @@ -166,9 +155,7 @@ TEST(CppAPITests, TestCollectionListInput) { ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTensor(), trt_out.toTensor(), 1e-5)); } - TEST(CppAPITests, TestCollectionTupleInputOutput) { - std::string path = "tests/modules/tuple_input_output_scripted.jit.pt"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); @@ -183,7 +170,6 @@ TEST(CppAPITests, TestCollectionTupleInputOutput) { mod.eval(); mod.to(torch::kCUDA); - std::vector complex_inputs, complex_inputs_list; std::tuple input_tuple(in0, in0); @@ -196,7 +182,6 @@ TEST(CppAPITests, TestCollectionTupleInputOutput) { auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); - std::tuple input_shape_tuple(input_shape_ivalue, input_shape_ivalue); torch::jit::IValue complex_input_shape(input_shape_tuple); @@ -217,13 +202,13 @@ TEST(CppAPITests, TestCollectionTupleInputOutput) { LOG_DEBUG("Finish compile"); auto trt_out = trt_mod.forward(complex_inputs); - ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTuple()->elements()[0].toTensor(), trt_out.toTuple()->elements()[0].toTensor(), 1e-5)); - ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTuple()->elements()[1].toTensor(), trt_out.toTuple()->elements()[1].toTensor(), 1e-5)); + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual( + out.toTuple()->elements()[0].toTensor(), trt_out.toTuple()->elements()[0].toTensor(), 1e-5)); + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual( + out.toTuple()->elements()[1].toTensor(), trt_out.toTuple()->elements()[1].toTensor(), 1e-5)); } - TEST(CppAPITests, TestCollectionListInputOutput) { - std::string path = "tests/modules/list_input_output_scripted.jit.pt"; torch::Tensor in0 
= torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); std::vector inputs; @@ -239,7 +224,6 @@ TEST(CppAPITests, TestCollectionListInputOutput) { mod.eval(); mod.to(torch::kCUDA); - std::vector inputs_; for (auto in : inputs) { @@ -255,7 +239,6 @@ TEST(CppAPITests, TestCollectionListInputOutput) { complex_inputs.push_back(input_list_ivalue); - auto out = mod.forward(complex_inputs); LOG_DEBUG("Finish torchscirpt forward"); @@ -263,13 +246,11 @@ TEST(CppAPITests, TestCollectionListInputOutput) { auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); - c10::TypePtr elementType = input_shape_ivalue.type(); auto list = c10::impl::GenericList(elementType); list.push_back(input_shape_ivalue); list.push_back(input_shape_ivalue); - torch::jit::IValue complex_input_shape(list); std::tuple input_tuple2(complex_input_shape); torch::jit::IValue complex_input_shape2(input_tuple2); @@ -288,13 +269,13 @@ TEST(CppAPITests, TestCollectionListInputOutput) { LOG_DEBUG("Finish compile"); auto trt_out = trt_mod.forward(complex_inputs); - ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toList().vec()[0].toTensor(), trt_out.toList().vec()[0].toTensor(), 1e-5)); - ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toList().vec()[1].toTensor(), trt_out.toList().vec()[1].toTensor(), 1e-5)); + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual( + out.toList().vec()[0].toTensor(), trt_out.toList().vec()[0].toTensor(), 1e-5)); + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual( + out.toList().vec()[1].toTensor(), trt_out.toList().vec()[1].toTensor(), 1e-5)); } - TEST(CppAPITests, TestCollectionComplexModel) { - std::string path = "tests/modules/list_input_tuple_output_scripted.jit.pt"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); std::vector inputs; @@ -310,7 +291,6 @@ TEST(CppAPITests, TestCollectionComplexModel) { mod.eval(); mod.to(torch::kCUDA); - std::vector inputs_; for (auto in : inputs) { @@ -326,7 +306,6 @@ TEST(CppAPITests, TestCollectionComplexModel) { complex_inputs.push_back(input_list_ivalue); - auto out = mod.forward(complex_inputs); LOG_DEBUG("Finish torchscirpt forward"); @@ -339,7 +318,6 @@ TEST(CppAPITests, TestCollectionComplexModel) { list.push_back(input_shape_ivalue); list.push_back(input_shape_ivalue); - torch::jit::IValue complex_input_shape(list); std::tuple input_tuple2(complex_input_shape); torch::jit::IValue complex_input_shape2(input_tuple2); @@ -358,6 +336,8 @@ TEST(CppAPITests, TestCollectionComplexModel) { LOG_DEBUG("Finish compile"); auto trt_out = trt_mod.forward(complex_inputs); - ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTuple()->elements()[0].toTensor(), trt_out.toTuple()->elements()[0].toTensor(), 1e-5)); - ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTuple()->elements()[1].toTensor(), trt_out.toTuple()->elements()[1].toTensor(), 1e-5)); + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual( + out.toTuple()->elements()[0].toTensor(), trt_out.toTuple()->elements()[0].toTensor(), 1e-5)); + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual( + out.toTuple()->elements()[1].toTensor(), trt_out.toTuple()->elements()[1].toTensor(), 1e-5)); } \ No newline at end of file diff --git a/tests/cpp/test_example_tensors.cpp b/tests/cpp/test_example_tensors.cpp index 3ec8831f9d..256e6f1b59 100644 --- a/tests/cpp/test_example_tensors.cpp +++ b/tests/cpp/test_example_tensors.cpp @@ -9,7 +9,6 @@ TEST_P(CppAPITests, InputsFromTensors) { 
trt_inputs_ivalues.push_back(in.clone()); } - auto inputs = std::vector{trt_inputs_ivalues[0].toTensor()}; auto spec = torch_tensorrt::ts::CompileSpec(inputs); diff --git a/tools/linter/utils.py b/tools/linter/utils.py index 1754702f6b..8d4d75cd70 100644 --- a/tools/linter/utils.py +++ b/tools/linter/utils.py @@ -6,7 +6,7 @@ BLACKLISTED_BAZEL_TARGETS = [ "//experiments", "//tools", "//docker", "//third_party", "//bazel-bin", "//bazel-genfiles", "//bazel-out", "//bazel-TRTorch", "//bazel-Torch-TensorRT", "//bazel-torch-tensorrt", "//bazel-workspace", - "//bazel-testlogs", "//py/build", + "//bazel-tensorrt", "bazel-TensorRT", "//bazel-testlogs", "//py/build", "//py/dist", "//py/trtorch.egg-info", "//py/wheelhouse", "//examples", "//docsrc", "//docs" ] @@ -35,4 +35,4 @@ def glob_files(project, file_types): files = [] for t in file_types: files += glob.glob(project + "/**/*" + t, recursive=True) - return files \ No newline at end of file + return files From f866dba29afa5848ac67d885eaa1e083e2e00177 Mon Sep 17 00:00:00 2001 From: Bo Wang Date: Mon, 1 Aug 2022 22:16:17 -0700 Subject: [PATCH 10/16] fix: fix the bug that ListConstruct is in TRT subgraph when it's entire graph's output Signed-off-by: Bo Wang --- core/partitioning/partitioning.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/core/partitioning/partitioning.cpp b/core/partitioning/partitioning.cpp index 85626772f0..565f58c677 100644 --- a/core/partitioning/partitioning.cpp +++ b/core/partitioning/partitioning.cpp @@ -90,6 +90,16 @@ std::vector getDependencyNodes( return stk; } +void find_nontensor_output_nodes( + torch::jit::Block* block, + std::unordered_map& global_fallback_nodes) { + for (auto i : block->outputs()) { + if (!isTensor(i)) { + global_fallback_nodes.insert({i->node(), FallbackNodeType::kNON_TENSOR}); + } + } +} + void find_all_fallback_nodes( std::unordered_map& initial_fallback_nodes, std::unordered_map& global_fallback_nodes) { @@ -430,6 +440,9 @@ PartitionedGraph Partition( const PartitionInfo& partition_info, std::unordered_map& global_fallback_nodes) { LOG_DEBUG(partition_info); + // if there is nonTensor output for the entire graph, fallback the node that produces this nonTensor output + find_nontensor_output_nodes(block, global_fallback_nodes); + // segment lowering global graph into blocks LOG_DEBUG("Parititioning source module into PyTorch and TensorRT sub blocks"); PartitionedGraph segmented_blocks = segment_graph(block, partition_info, global_fallback_nodes); From 6d0b1d3404ecdac09c45e3455078709b445769b4 Mon Sep 17 00:00:00 2001 From: Bo Wang Date: Tue, 2 Aug 2022 22:39:15 -0700 Subject: [PATCH 11/16] fix: fix the error that collection input segmented into trt subgraph Signed-off-by: Bo Wang --- core/partitioning/partitioning.cpp | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/core/partitioning/partitioning.cpp b/core/partitioning/partitioning.cpp index 565f58c677..28bfd0712c 100644 --- a/core/partitioning/partitioning.cpp +++ b/core/partitioning/partitioning.cpp @@ -90,14 +90,26 @@ std::vector getDependencyNodes( return stk; } -void find_nontensor_output_nodes( +// check if the input and output of the graph is Tensor after collection is enabled. 
If it is, then fallback related +// nodes +void fallback_graph_nontensor_in_out( torch::jit::Block* block, std::unordered_map& global_fallback_nodes) { + // fallback nodes that produce entire graph's nonTensor output for (auto i : block->outputs()) { if (!isTensor(i)) { global_fallback_nodes.insert({i->node(), FallbackNodeType::kNON_TENSOR}); } } + + // fallback nodes that consume entire graph's nonTensor input + for (auto i : block->inputs()) { + if (!isTensor(i)) { + for (auto use : i->uses()) { + global_fallback_nodes.insert({use.user, FallbackNodeType::kNON_TENSOR}); + } + } + } } void find_all_fallback_nodes( @@ -202,6 +214,7 @@ void registerSegmentsOutputs(PartitionedGraph& segmented_blocks, torch::jit::Blo } } } + std::for_each(segmented_blocks.begin(), segmented_blocks.end(), [](SegmentedBlock& seg_block) { torch::jit::EliminateDeadCode(seg_block.g()); }); @@ -440,8 +453,9 @@ PartitionedGraph Partition( const PartitionInfo& partition_info, std::unordered_map& global_fallback_nodes) { LOG_DEBUG(partition_info); - // if there is nonTensor output for the entire graph, fallback the node that produces this nonTensor output - find_nontensor_output_nodes(block, global_fallback_nodes); + // if there is nonTensor input/output for the entire graph, fallback the node that consumes/produces this nonTensor + // output + fallback_graph_nontensor_in_out(block, global_fallback_nodes); // segment lowering global graph into blocks LOG_DEBUG("Parititioning source module into PyTorch and TensorRT sub blocks"); From 8b891fb18bee24d51cfabe1b7c36c693c7fb4362 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Wed, 3 Aug 2022 09:56:11 -0700 Subject: [PATCH 12/16] feat(//core/conversion/converters/evaluators): New evaluators for collections Implements evaluators for: - prim::TupleUnpack - prim::TupleConstruct - prim::TupleIndex Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/conversion/evaluators/aten.cpp | 8 -- core/conversion/evaluators/eval_util.cpp | 9 ++ core/conversion/evaluators/eval_util.h | 2 + core/conversion/evaluators/prim.cpp | 52 ++++++++- core/ir/GraphInputs.cpp | 3 - core/ir/ir.cpp | 12 +- cpp/src/compile_spec.cpp | 1 + py/torch_tensorrt/ts/_compile_spec.py | 14 +++ py/torch_tensorrt/ts/_compiler.py | 27 +++++ .../evaluators/test_prim_evaluators.cpp | 107 ++++++++++++++++++ tests/cpp/test_collections.cpp | 39 +++---- tests/util/evaluate_graph.cpp | 2 +- 12 files changed, 232 insertions(+), 44 deletions(-) diff --git a/core/conversion/evaluators/aten.cpp b/core/conversion/evaluators/aten.cpp index 4632744790..49f529c003 100644 --- a/core/conversion/evaluators/aten.cpp +++ b/core/conversion/evaluators/aten.cpp @@ -19,14 +19,6 @@ namespace conversion { namespace evaluators { namespace { -int64_t normalizeIndex(int64_t idx, int64_t list_size) { - if (idx < 0) { - // Handle negative indexing - idx = list_size + idx; - } - return idx; -} - DEFINE_GENERIC_TWO_INPUT_EVALUATOR( eq, "aten::eq", diff --git a/core/conversion/evaluators/eval_util.cpp b/core/conversion/evaluators/eval_util.cpp index 79b377cd37..742a4f4938 100644 --- a/core/conversion/evaluators/eval_util.cpp +++ b/core/conversion/evaluators/eval_util.cpp @@ -12,6 +12,15 @@ namespace core { namespace conversion { namespace evaluators { +int64_t normalizeIndex(int64_t idx, int64_t list_size) { + if (idx < 0) { + // Handle negative indexing + idx = list_size + idx; + } + return idx; +} + + // TODO: Switch back to PyTorch canonical implimentation c10::optional toIValue(const torch::jit::Value* v) { if (v->node()->kind() != 
torch::jit::prim::Constant || v->type()->cast()) { diff --git a/core/conversion/evaluators/eval_util.h b/core/conversion/evaluators/eval_util.h index 5e233b4e2d..a9c21339bb 100644 --- a/core/conversion/evaluators/eval_util.h +++ b/core/conversion/evaluators/eval_util.h @@ -13,6 +13,8 @@ at::Tensor createTensorFromList( const torch::jit::IValue& dtype, const torch::jit::IValue& device); +int64_t normalizeIndex(int64_t idx, int64_t list_size); + at::Tensor scalar_to_tensor(const at::Scalar& s, const at::Device device = at::kCPU); } // namespace evaluators diff --git a/core/conversion/evaluators/prim.cpp b/core/conversion/evaluators/prim.cpp index 7d5373a5f9..338c427ccd 100644 --- a/core/conversion/evaluators/prim.cpp +++ b/core/conversion/evaluators/prim.cpp @@ -259,6 +259,56 @@ auto prim_registrations = } }, EvalOptions().validSchemas({"prim::shape(Tensor a) -> (int[])"})}) + .evaluator({torch::jit::prim::TupleConstruct, + [](const torch::jit::Node* n, kwargs& args) -> c10::optional { + auto num_inputs = n->inputs().size(); + c10::IValue tuple = c10::ivalue::Tuple::create(); + switch (num_inputs) { + case 0: + tuple = c10::ivalue::Tuple::create(); + break; + case 1: + tuple = c10::ivalue::Tuple::create(std::move((*args.at(n->input(0)).IValue()))); + break; + case 2: { + tuple = c10::ivalue::Tuple::create( + std::move(*(args.at(n->input(0)).IValue())), + std::move(*(args.at(n->input(1)).IValue()))); + break; + } + case 3: { + tuple = c10::ivalue::Tuple::create( + std::move(*(args.at(n->input(0)).IValue())), + std::move(*(args.at(n->input(1)).IValue())), + std::move(*(args.at(n->input(2)).IValue()))); + break; + } + default: { + std::vector elems; + for (size_t i = 0; i < num_inputs; i++) { + elems.push_back(*(args.at(n->input(i)).IValue())); + } + tuple = c10::ivalue::Tuple::create(std::move(elems)); + break; + } + } + return c10::optional(std::move(tuple)); + }}) + .evaluator({torch::jit::prim::TupleIndex, + [](const torch::jit::Node* n, kwargs& args) -> c10::optional { + // Outputs is an IValue which has list of tensors which can be found in ctx->evaluated_value_map + auto tuple = args.at(n->input(0)).IValue()->toTuple(); + int64_t idx = args.at(n->input(1)).IValue()->toInt(); + int64_t norm_idx = normalizeIndex(idx, tuple->elements().size()); + return c10::optional(std::move(tuple->elements()[norm_idx])); + }, + EvalOptions().validSchemas({"prim::TupleIndex(Any tup, int i) -> (Any)"})}) + .evaluator({torch::jit::prim::TupleUnpack, + [](const torch::jit::Node* n, kwargs& args) -> c10::optional { + // Outputs is an IValue which has list of tensors which can be found in ctx->evaluated_value_map + auto output = args.at(n->input()).IValue()->toTuple(); + return c10::optional(std::move(output)); + }}) .evaluator({c10::Symbol::fromQualString("prim::unchecked_cast"), [](const torch::jit::Node* n, kwargs& args) -> c10::optional { return *(args.at(n->input(0)).IValue()); @@ -277,4 +327,4 @@ auto prim_registrations = } // namespace evaluators } // namespace conversion } // namespace core -} // namespace torch_tensorrt +} // namespace torch_tensorrt \ No newline at end of file diff --git a/core/ir/GraphInputs.cpp b/core/ir/GraphInputs.cpp index a1b1196be9..f3fa889385 100644 --- a/core/ir/GraphInputs.cpp +++ b/core/ir/GraphInputs.cpp @@ -54,7 +54,6 @@ void flatten_dfs( } GraphInputs::GraphInputs(std::vector inputs_) { - LOG_DEBUG("Construct GraphInput with ir::Input"); inputs = inputs_; collection_inputs.resize(inputs_.size()); for (size_t i = 0; i < inputs_.size(); i++) { @@ -63,8 +62,6 @@ 
GraphInputs::GraphInputs(std::vector inputs_) { } GraphInputs::GraphInputs(torch::jit::IValue& input_signature_) { - LOG_DEBUG("Construct GraphInput with IValue"); - std::vector flattened_inputs; std::vector> collection_inputs_; diff --git a/core/ir/ir.cpp b/core/ir/ir.cpp index d9b021ed8b..99bf4f97b1 100644 --- a/core/ir/ir.cpp +++ b/core/ir/ir.cpp @@ -29,7 +29,7 @@ InputSpecMap pair_input_vals_with_specs(std::vector va std::unordered_map a; for (size_t i = 0; i < vals.size(); i++) { - LOG_DEBUG("Pairing " << i << ": " << vals[i]->debugName() << " : " << specs[i]); + LOG_DEBUG("Pairing " << i << ": " << vals[i]->debugName() << ": " << specs[i]); a.insert({vals[i], specs[i]}); } return a; @@ -56,7 +56,7 @@ std::vector get_tensor_inputs( StaticParams& static_params) { std::vector input_tensors; auto inputs = g->inputs(); - LOG_DEBUG("Raw inputs size of get_tensor_inputs: " << inputs.size()); + LOG_DEBUG("Found " << inputs.size() << " inputs to graph"); for (auto in : inputs) { LOG_DEBUG("Handle input of debug name: " << in->debugName()); // Disregarding inputs that are not tensors or are static @@ -76,7 +76,7 @@ std::vector get_collection_inputs( StaticParams& static_params) { std::vector input_tensors; auto inputs = g->inputs(); - LOG_DEBUG("Raw inputs size of get_collection_inputs: " << inputs.size()); + LOG_DEBUG("Found " << inputs.size() << " inputs to graph"); for (auto in : inputs) { LOG_DEBUG("Handle input of debug name: " << in->debugName()); if (in->type()->isSubtypeOf(c10::TensorType::get()) && static_params.find(in) == static_params.end()) { @@ -86,9 +86,9 @@ std::vector get_collection_inputs( // { input_tensors.push_back(in); // push original tuple at::ArrayRef unpack_tuple = torch::jit::createTupleUnpack(in); - LOG_DEBUG("get_collection_inputs, tuple size " << unpack_tuple.size()); + LOG_DEBUG("Input tuple size " << unpack_tuple.size()); } else if (in->type()->kind() == torch::jit::TypeKind::ListType && static_params.find(in) == static_params.end()) { - LOG_DEBUG("get_collection_inputs, list use size " << in->uses().size()); + LOG_DEBUG("Input list use size " << in->uses().size()); input_tensors.push_back(in); // push original list } } @@ -227,7 +227,7 @@ CollectionTypeMap get_block_first_calc_dtypes_opt_collection(torch::jit::Block* } else if (i->type()->kind() == torch::jit::TypeKind::ListType) { // TODO: to decide the size of list and type of list element - LOG_DEBUG("get_block_first_calc_dtypes_opt ListType: use size " << i->uses().size()); + LOG_DEBUG("Number of list uses " << i->uses().size()); c10::optional tp = get_value_first_calc_dtype_opt(b, i); // std::vector> dytpes(i->uses().size()); std::vector> dytpes(i->uses().size(), tp); diff --git a/cpp/src/compile_spec.cpp b/cpp/src/compile_spec.cpp index 432b070e91..04573499e6 100644 --- a/cpp/src/compile_spec.cpp +++ b/cpp/src/compile_spec.cpp @@ -69,6 +69,7 @@ torchtrt::core::CompileSpec init_compile_spec(CompileSpec external) { return internal; } else { torch::jit::IValue converted_input_signature; + LOG_WARNING( "Input signature parsing is an experimental feature, behavior and APIs may change"); to_internal_input_signature(external.graph_inputs.input_signature, converted_input_signature); torchtrt::core::CompileSpec internal(converted_input_signature); return internal; diff --git a/py/torch_tensorrt/ts/_compile_spec.py b/py/torch_tensorrt/ts/_compile_spec.py index 0eb8a1cdce..628955c079 100644 --- a/py/torch_tensorrt/ts/_compile_spec.py +++ b/py/torch_tensorrt/ts/_compile_spec.py @@ -305,6 +305,20 @@ def 
TensorRTCompileSpec(inputs=[], torch.randn((1, 3, 224, 244)) # Use an example tensor and let torch_tensorrt infer settings ] + input_signature Union(List, Tuple, torch_tensorrt.Input, torch.Tensor): A formatted collection of input specifications for the module. Input Sizes can be specified as torch sizes, tuples or lists. dtypes can be specified using + torch datatypes or torch_tensorrt datatypes and you can use either torch devices or the torch_tensorrt device type enum to select device type. **This API should be considered beta-level stable and may change in the future** :: + + input_signature=([ + torch_tensorrt.Input((1, 3, 224, 224)), # Static NCHW input shape for input #1 + torch_tensorrt.Input( + min_shape=(1, 224, 224, 3), + opt_shape=(1, 512, 512, 3), + max_shape=(1, 1024, 1024, 3), + dtype=torch.int32 + format=torch.channel_last + ), # Dynamic input shape for input #2 + ], torch.randn((1, 3, 224, 244))) # Use an example tensor and let torch_tensorrt infer settings for input #3 + device (Union(torch_tensorrt.Device, torch.device, dict)): Target device for TensorRT engines to run on :: device=torch_tensorrt.Device("dla:1", allow_gpu_fallback=True) diff --git a/py/torch_tensorrt/ts/_compiler.py b/py/torch_tensorrt/ts/_compiler.py index 508cb8fdd0..9119c25e86 100644 --- a/py/torch_tensorrt/ts/_compiler.py +++ b/py/torch_tensorrt/ts/_compiler.py @@ -58,6 +58,19 @@ def compile(module: torch.jit.ScriptModule, torch.randn((1, 3, 224, 244)) # Use an example tensor and let torch_tensorrt infer settings ] + input_signature Union(List, Tuple, torch_tensorrt.Input, torch.Tensor): A formatted collection of input specifications for the module. Input Sizes can be specified as torch sizes, tuples or lists. dtypes can be specified using + torch datatypes or torch_tensorrt datatypes and you can use either torch devices or the torch_tensorrt device type enum to select device type. **This API should be considered beta-level stable and may change in the future** :: + + input_signature=([ + torch_tensorrt.Input((1, 3, 224, 224)), # Static NCHW input shape for input #1 + torch_tensorrt.Input( + min_shape=(1, 224, 224, 3), + opt_shape=(1, 512, 512, 3), + max_shape=(1, 1024, 1024, 3), + dtype=torch.int32 + format=torch.channel_last + ), # Dynamic input shape for input #2 + ], torch.randn((1, 3, 224, 244))) # Use an example tensor and let torch_tensorrt infer settings for input #3 device (Union(torch_tensorrt.Device, torch.device, dict)): Target device for TensorRT engines to run on :: device=torch_tensorrt.Device("dla:1", allow_gpu_fallback=True) @@ -163,6 +176,20 @@ def convert_method_to_trt_engine(module: torch.jit.ScriptModule, torch.randn((1, 3, 224, 244)) # Use an example tensor and let torch_tensorrt infer settings ] + input_signature Union(List, Tuple, torch_tensorrt.Input, torch.Tensor): A formatted collection of input specifications for the module. Input Sizes can be specified as torch sizes, tuples or lists. dtypes can be specified using + torch datatypes or torch_tensorrt datatypes and you can use either torch devices or the torch_tensorrt device type enum to select device type. 
**This API should be considered beta-level stable and may change in the future** :: + + input_signature=([ + torch_tensorrt.Input((1, 3, 224, 224)), # Static NCHW input shape for input #1 + torch_tensorrt.Input( + min_shape=(1, 224, 224, 3), + opt_shape=(1, 512, 512, 3), + max_shape=(1, 1024, 1024, 3), + dtype=torch.int32 + format=torch.channel_last + ), # Dynamic input shape for input #2 + ], torch.randn((1, 3, 224, 244))) # Use an example tensor and let torch_tensorrt infer settings for input #3 + device (Union(torch_tensorrt.Device, torch.device, dict)): Target device for TensorRT engines to run on :: device=torch_tensorrt.Device("dla:1", allow_gpu_fallback=True) diff --git a/tests/core/conversion/evaluators/test_prim_evaluators.cpp b/tests/core/conversion/evaluators/test_prim_evaluators.cpp index 0ff250f0e9..508d4eb1b0 100644 --- a/tests/core/conversion/evaluators/test_prim_evaluators.cpp +++ b/tests/core/conversion/evaluators/test_prim_evaluators.cpp @@ -51,5 +51,112 @@ TEST(Evaluators, NumToTensorEvaluatesCorrectly) { auto jit_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {}); auto trt_results = torch_tensorrt::tests::util::EvaluateGraph(g->block(), {}); + ASSERT_TRUE(jit_results[0] == trt_results[0]); +} + +TEST(Evaluators, PrimTupleConstruct1EvaluatesCorrectly) { + const auto graph = R"IR( + graph(): + %1 : int = prim::Constant[value=3]() + %tc : (int) = prim::TupleConstruct(%1) + return (%tc))IR"; + + auto g = std::make_shared(); + torch::jit::parseIR(graph, g.get()); + + auto jit_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {}); + auto trt_results = torch_tensorrt::tests::util::EvaluateGraph(g->block(), {}); + + ASSERT_TRUE(jit_results[0] == trt_results[0]); +} + +TEST(Evaluators, PrimTupleConstruct2EvaluatesCorrectly) { + const auto graph = R"IR( + graph(): + %1 : int = prim::Constant[value=3]() + %2 : int = prim::Constant[value=4]() + %tc : (int, int) = prim::TupleConstruct(%1, %2) + return (%tc))IR"; + + auto g = std::make_shared(); + torch::jit::parseIR(graph, g.get()); + + auto jit_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {}); + auto trt_results = torch_tensorrt::tests::util::EvaluateGraph(g->block(), {}); + + ASSERT_TRUE(jit_results[0] == trt_results[0]); +} + +TEST(Evaluators, PrimTupleConstruct3EvaluatesCorrectly) { + const auto graph = R"IR( + graph(): + %1 : int = prim::Constant[value=3]() + %2 : int = prim::Constant[value=4]() + %3 : int = prim::Constant[value=4]() + %tc : (int, int, int) = prim::TupleConstruct(%1, %2, %3) + return (%tc))IR"; + + auto g = std::make_shared(); + torch::jit::parseIR(graph, g.get()); + + auto jit_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {}); + auto trt_results = torch_tensorrt::tests::util::EvaluateGraph(g->block(), {}); + + ASSERT_TRUE(jit_results[0] == trt_results[0]); +} + +TEST(Evaluators, PrimTupleConstruct4EvaluatesCorrectly) { + const auto graph = R"IR( + graph(): + %1 : int = prim::Constant[value=3]() + %2 : int = prim::Constant[value=4]() + %3 : int = prim::Constant[value=3]() + %4 : int = prim::Constant[value=4]() + %tc : (int, int, int, int) = prim::TupleConstruct(%1, %2, %3, %4) + return (%tc))IR"; + + auto g = std::make_shared(); + torch::jit::parseIR(graph, g.get()); + + auto jit_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {}); + auto trt_results = torch_tensorrt::tests::util::EvaluateGraph(g->block(), {}); + + ASSERT_TRUE(jit_results[0] == trt_results[0]); +} + +TEST(Evaluators, PrimTupleUnpackEvaluatesCorrectly) { + const auto graph = R"IR( + 
graph(): + %1 : int = prim::Constant[value=3]() + %2 : int = prim::Constant[value=4]() + %tc : (int, int) = prim::TupleConstruct(%1, %2) + %tu.1 : int, %tu.2 : int = prim::TupleUnpack(%tc) + return (%tu.1, %tu.2))IR"; + + auto g = std::make_shared(); + torch::jit::parseIR(graph, g.get()); + + auto jit_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {}); + auto trt_results = torch_tensorrt::tests::util::EvaluateGraph(g->block(), {}); + + ASSERT_TRUE(jit_results[0] == trt_results[0]); +} + +TEST(Evaluators, PrimTupleIndexEvaluatesCorrectly) { + const auto graph = R"IR( + graph(): + %0 : int = prim::Constant[value=1]() + %1 : int = prim::Constant[value=3]() + %2 : int = prim::Constant[value=4]() + %tc : (int, int) = prim::TupleConstruct(%1, %2) + %ti : int = prim::TupleIndex(%tc, %0) + return (%ti))IR"; + + auto g = std::make_shared(); + torch::jit::parseIR(graph, g.get()); + + auto jit_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {}); + auto trt_results = torch_tensorrt::tests::util::EvaluateGraph(g->block(), {}); + ASSERT_TRUE(jit_results[0] == trt_results[0]); } \ No newline at end of file diff --git a/tests/cpp/test_collections.cpp b/tests/cpp/test_collections.cpp index 829e82abc9..e48874e8bb 100644 --- a/tests/cpp/test_collections.cpp +++ b/tests/cpp/test_collections.cpp @@ -29,20 +29,18 @@ TEST(CppAPITests, TestCollectionStandardTensorInput) { } auto out = mod.forward(inputs_); - LOG_DEBUG("Finish torchscirpt forward"); std::vector input_range; input_range.push_back({in0.sizes(), torch::kF16}); input_range.push_back({in0.sizes(), torch::kF16}); torch_tensorrt::ts::CompileSpec compile_settings(input_range); compile_settings.require_full_compilation = true; - compile_settings.min_block_size = 3; + compile_settings.min_block_size = 1; // // FP16 execution compile_settings.enabled_precisions = {torch::kHalf}; // // Compile module auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); - LOG_DEBUG("Finish compile"); auto trt_out = trt_mod.forward(inputs_); ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTensor(), trt_out.toTensor(), 1e-5)); @@ -68,7 +66,6 @@ TEST(CppAPITests, TestCollectionTupleInput) { complex_inputs.push_back(input_tuple); auto out = mod.forward(complex_inputs); - LOG_DEBUG("Finish torchscirpt forward"); auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kHalf); @@ -81,14 +78,13 @@ TEST(CppAPITests, TestCollectionTupleInput) { torch::jit::IValue complex_input_shape2(input_tuple2); auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); - compile_settings.require_full_compilation = false; - compile_settings.min_block_size = 3; + compile_settings.require_full_compilation = true; + compile_settings.min_block_size = 1; // // FP16 execution compile_settings.enabled_precisions = {torch::kHalf}; // // Compile module auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); - LOG_DEBUG("Finish compile"); auto trt_out = trt_mod.forward(complex_inputs); ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTensor(), trt_out.toTensor(), 1e-5)); @@ -126,7 +122,6 @@ TEST(CppAPITests, TestCollectionListInput) { complex_inputs.push_back(input_list_ivalue); auto out = mod.forward(complex_inputs); - LOG_DEBUG("Finish torchscirpt forward"); auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kHalf); auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); @@ -141,9 +136,9 @@ TEST(CppAPITests, 
TestCollectionListInput) { torch::jit::IValue complex_input_shape2(input_tuple2); auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); - compile_settings.require_full_compilation = false; - compile_settings.min_block_size = 3; - compile_settings.torch_executed_ops.push_back("aten::__getitem__"); + compile_settings.require_full_compilation = true; + compile_settings.min_block_size = 1; + //compile_settings.torch_executed_ops.push_back("aten::__getitem__"); // // FP16 execution compile_settings.enabled_precisions = {torch::kHalf}; @@ -176,7 +171,6 @@ TEST(CppAPITests, TestCollectionTupleInputOutput) { complex_inputs.push_back(input_tuple); auto out = mod.forward(complex_inputs); - LOG_DEBUG("Finish torchscirpt forward"); auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kHalf); @@ -190,8 +184,8 @@ TEST(CppAPITests, TestCollectionTupleInputOutput) { // torch::jit::IValue complex_input_shape(list); auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); - compile_settings.require_full_compilation = false; - compile_settings.min_block_size = 3; + compile_settings.require_full_compilation = true; + compile_settings.min_block_size = 1; // compile_settings.torch_executed_ops.push_back("prim::TupleConstruct"); @@ -199,7 +193,6 @@ TEST(CppAPITests, TestCollectionTupleInputOutput) { compile_settings.enabled_precisions = {torch::kHalf}; // // Compile module auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); - LOG_DEBUG("Finish compile"); auto trt_out = trt_mod.forward(complex_inputs); ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual( @@ -240,7 +233,6 @@ TEST(CppAPITests, TestCollectionListInputOutput) { complex_inputs.push_back(input_list_ivalue); auto out = mod.forward(complex_inputs); - LOG_DEBUG("Finish torchscirpt forward"); auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kHalf); @@ -256,17 +248,16 @@ TEST(CppAPITests, TestCollectionListInputOutput) { torch::jit::IValue complex_input_shape2(input_tuple2); auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); - compile_settings.require_full_compilation = false; - compile_settings.min_block_size = 3; + compile_settings.require_full_compilation = true; + compile_settings.min_block_size = 1; // Need to skip the conversion of __getitem__ and ListConstruct - compile_settings.torch_executed_ops.push_back("aten::__getitem__"); + //compile_settings.torch_executed_ops.push_back("aten::__getitem__"); // // FP16 execution compile_settings.enabled_precisions = {torch::kHalf}; // // Compile module auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); - LOG_DEBUG("Finish compile"); auto trt_out = trt_mod.forward(complex_inputs); ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual( @@ -307,7 +298,6 @@ TEST(CppAPITests, TestCollectionComplexModel) { complex_inputs.push_back(input_list_ivalue); auto out = mod.forward(complex_inputs); - LOG_DEBUG("Finish torchscirpt forward"); auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kHalf); @@ -323,17 +313,16 @@ TEST(CppAPITests, TestCollectionComplexModel) { torch::jit::IValue complex_input_shape2(input_tuple2); auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); - compile_settings.require_full_compilation = false; - compile_settings.min_block_size = 3; + compile_settings.require_full_compilation = true; + compile_settings.min_block_size = 1; // Need to skip the conversion of 
__getitem__ and ListConstruct - compile_settings.torch_executed_ops.push_back("aten::__getitem__"); + //compile_settings.torch_executed_ops.push_back("aten::__getitem__"); // // FP16 execution compile_settings.enabled_precisions = {torch::kHalf}; // // Compile module auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); - LOG_DEBUG("Finish compile"); auto trt_out = trt_mod.forward(complex_inputs); ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual( diff --git a/tests/util/evaluate_graph.cpp b/tests/util/evaluate_graph.cpp index 7e69b454ef..5a9f10f7b0 100644 --- a/tests/util/evaluate_graph.cpp +++ b/tests/util/evaluate_graph.cpp @@ -28,7 +28,7 @@ std::vector EvaluateGraph(const torch::jit::Block* b, std::v "Test graph contains non evaluatable nodes: " << *n); auto eval = core::conversion::EvaluateNode(ctx, n); if (eval) { - if (eval.value().isTuple()) { + if (eval.value().isTuple() && n->outputs().size() > 1) { auto eval_list = eval.value().toTuple(); for (size_t i = 0; i < eval_list->elements().size(); i++) { auto eval_output = eval_list.get()->elements()[i]; From f5199355f3cc30fd3cf169fcba76cc389c327ac6 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Wed, 3 Aug 2022 22:02:19 -0700 Subject: [PATCH 13/16] feat(collections): Enable grouped inputs via partial compilation HACK: This PR enables grouped input features by leveraging partial compilation and disabling tuple and list evaluators in the case where grouped inputs are used. The intention is that this WAR is removed in the next release Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- cpp/src/compile_spec.cpp | 21 ++++++++++++++++++++- py/torch_tensorrt/ts/_compile_spec.py | 24 +++++++++++++++++++++++- py/torch_tensorrt/ts/_compiler.py | 3 +-- tests/cpp/test_collections.cpp | 12 ------------ tests/py/api/test_collections.py | 15 +++++---------- 5 files changed, 49 insertions(+), 26 deletions(-) diff --git a/cpp/src/compile_spec.cpp b/cpp/src/compile_spec.cpp index 04573499e6..8c7bb8b403 100644 --- a/cpp/src/compile_spec.cpp +++ b/cpp/src/compile_spec.cpp @@ -63,7 +63,7 @@ void to_internal_input_signature(torch::jit::IValue input_ivalue, torch::jit::IV } } -torchtrt::core::CompileSpec init_compile_spec(CompileSpec external) { +torchtrt::core::CompileSpec init_compile_spec(CompileSpec& external) { if (external.graph_inputs.inputs.size() > 0) { torchtrt::core::CompileSpec internal(to_vec_internal_inputs(external.graph_inputs.inputs)); return internal; @@ -72,6 +72,25 @@ torchtrt::core::CompileSpec init_compile_spec(CompileSpec external) { LOG_WARNING( "Input signature parsing is an experimental feature, behavior and APIs may change"); to_internal_input_signature(external.graph_inputs.input_signature, converted_input_signature); torchtrt::core::CompileSpec internal(converted_input_signature); + + TORCHTRT_CHECK(!external.require_full_compilation, \ + "Grouped inputs currently requires partial compilation to be enabled, \ + this restriction will be relaxed in a future release"); + + LOG_DEBUG("Grouped inputs currently requires additional settings to enable the feature"); + LOG_DEBUG("Adding the following ops to torch_executed_ops:" \ + << std::endl << " - aten::__getitem__" \ + << std::endl << " - prim::ListConstruct" \ + << std::endl << " - prim::ListUnpack" \ + << std::endl << " - prim::TupleIndex" \ + << std::endl << " - prim::TupleConstruct" \ + << std::endl << " - prim::TupleUnpack"); + external.torch_executed_ops.push_back("aten::__getitem__"); + 
external.torch_executed_ops.push_back("prim::ListConstruct"); + external.torch_executed_ops.push_back("prim::ListUnpack"); + external.torch_executed_ops.push_back("prim::TupleIndex"); + external.torch_executed_ops.push_back("prim::TupleConstruct"); + external.torch_executed_ops.push_back("prim::TupleUnpack"); return internal; } } diff --git a/py/torch_tensorrt/ts/_compile_spec.py b/py/torch_tensorrt/ts/_compile_spec.py index 628955c079..4cd69f794f 100644 --- a/py/torch_tensorrt/ts/_compile_spec.py +++ b/py/torch_tensorrt/ts/_compile_spec.py @@ -8,6 +8,7 @@ from torch_tensorrt.logging import Level, log from typing import Tuple, List, Dict import warnings +from copy import deepcopy def _internal_input_to_torch_class_input(i: _C.Input) -> torch.classes.tensorrt._Input: @@ -188,7 +189,9 @@ def _parse_input_signature(input_signature: Any): else: raise KeyError("Input signature contains an unsupported type {}".format(type(input_signature))) -def _parse_compile_spec(compile_spec: Dict[str, Any]) -> _ts_C.CompileSpec: +def _parse_compile_spec(compile_spec_: Dict[str, Any]) -> _ts_C.CompileSpec: + # TODO: Remove deep copy once collections does not need partial compilation + compile_spec = deepcopy(compile_spec_) info = _ts_C.CompileSpec() if len(compile_spec["inputs"]) > 0: @@ -204,6 +207,25 @@ def _parse_compile_spec(compile_spec: Dict[str, Any]) -> _ts_C.CompileSpec: signature = _parse_input_signature(compile_spec["input_signature"]) info.input_signature = _C.InputSignature(signature) # py_object + if not compile_spec["torch_fallback"]["enabled"]: + raise ValueError("Grouped inputs currently requires partial compilation to be enabled, this restriction will be relaxed in a future release") + + log(Level.Debug, "Grouped inputs currently requires additional settings to enable the feature") + log(Level.Debug, """Adding the following ops to torch_executed_ops: + - aten::__getitem__ + - prim::ListConstruct + - prim::ListUnpack + - prim::TupleIndex + - prim::TupleConstruct + - prim::TupleUnpack +""") + compile_spec["torch_fallback"]["forced_fallback_ops"].append("aten::__getitem__") + compile_spec["torch_fallback"]["forced_fallback_ops"].append("prim::ListConstruct") + compile_spec["torch_fallback"]["forced_fallback_ops"].append("prim::ListUnpack") + compile_spec["torch_fallback"]["forced_fallback_ops"].append("prim::TupleIndex") + compile_spec["torch_fallback"]["forced_fallback_ops"].append("prim::TupleConstruct") + compile_spec["torch_fallback"]["forced_fallback_ops"].append("prim::TupleUnpack") + else: raise KeyError( "Module input definitions are requried to compile module. Provide a list of torch_tensorrt.Input keyed to \"inputs\" in the compile spec" diff --git a/py/torch_tensorrt/ts/_compiler.py b/py/torch_tensorrt/ts/_compiler.py index 9119c25e86..cc5f4b24d1 100644 --- a/py/torch_tensorrt/ts/_compiler.py +++ b/py/torch_tensorrt/ts/_compiler.py @@ -103,8 +103,7 @@ def compile(module: torch.jit.ScriptModule, if require_full_compilation and (len(torch_executed_modules) > 0 or len(torch_executed_ops) > 0): raise ValueError( - "require_full_compilation is enabled however the list of modules and ops to run in torch is not empty. Found: torch_executed_ops: " - + torch_executed_ops + ", torch_executed_modules: " + torch_executed_modules) + f"require_full_compilation is enabled however the list of modules and ops to run in torch is not empty. 
Found: torch_executed_ops: {torch_executed_ops}, torch_executed_modules: {torch_executed_modules}") spec = { "inputs": inputs, diff --git a/tests/cpp/test_collections.cpp b/tests/cpp/test_collections.cpp index e48874e8bb..31495a47a7 100644 --- a/tests/cpp/test_collections.cpp +++ b/tests/cpp/test_collections.cpp @@ -34,7 +34,6 @@ TEST(CppAPITests, TestCollectionStandardTensorInput) { input_range.push_back({in0.sizes(), torch::kF16}); input_range.push_back({in0.sizes(), torch::kF16}); torch_tensorrt::ts::CompileSpec compile_settings(input_range); - compile_settings.require_full_compilation = true; compile_settings.min_block_size = 1; // // FP16 execution @@ -78,7 +77,6 @@ TEST(CppAPITests, TestCollectionTupleInput) { torch::jit::IValue complex_input_shape2(input_tuple2); auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); - compile_settings.require_full_compilation = true; compile_settings.min_block_size = 1; // // FP16 execution @@ -136,7 +134,6 @@ TEST(CppAPITests, TestCollectionListInput) { torch::jit::IValue complex_input_shape2(input_tuple2); auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); - compile_settings.require_full_compilation = true; compile_settings.min_block_size = 1; //compile_settings.torch_executed_ops.push_back("aten::__getitem__"); @@ -184,7 +181,6 @@ TEST(CppAPITests, TestCollectionTupleInputOutput) { // torch::jit::IValue complex_input_shape(list); auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); - compile_settings.require_full_compilation = true; compile_settings.min_block_size = 1; // compile_settings.torch_executed_ops.push_back("prim::TupleConstruct"); @@ -248,12 +244,8 @@ TEST(CppAPITests, TestCollectionListInputOutput) { torch::jit::IValue complex_input_shape2(input_tuple2); auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); - compile_settings.require_full_compilation = true; compile_settings.min_block_size = 1; - // Need to skip the conversion of __getitem__ and ListConstruct - //compile_settings.torch_executed_ops.push_back("aten::__getitem__"); - // // FP16 execution compile_settings.enabled_precisions = {torch::kHalf}; // // Compile module @@ -313,12 +305,8 @@ TEST(CppAPITests, TestCollectionComplexModel) { torch::jit::IValue complex_input_shape2(input_tuple2); auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); - compile_settings.require_full_compilation = true; compile_settings.min_block_size = 1; - // Need to skip the conversion of __getitem__ and ListConstruct - //compile_settings.torch_executed_ops.push_back("aten::__getitem__"); - // // FP16 execution compile_settings.enabled_precisions = {torch::kHalf}; // // Compile module diff --git a/tests/py/api/test_collections.py b/tests/py/api/test_collections.py index 603d44aebb..154145e681 100644 --- a/tests/py/api/test_collections.py +++ b/tests/py/api/test_collections.py @@ -48,8 +48,7 @@ def test_compile(self): "input_signature": ((torchtrt.Input(self.input.shape), torchtrt.Input(self.input.shape)),), "device": torchtrt.Device("gpu:0"), "enabled_precisions": {torch.float}, - "require_full_compilation": False, - "min_block_size": 3 + "min_block_size": 1 } trt_mod = torchtrt.ts.compile(self.model, **compile_spec) @@ -69,8 +68,7 @@ def test_compile(self): "input_signature": ([torchtrt.Input(self.input.shape), torchtrt.Input(self.input.shape)],), "device": torchtrt.Device("gpu:0"), "enabled_precisions": {torch.float}, - "require_full_compilation": False, - 
"min_block_size": 3 + "min_block_size": 1 } trt_mod = torchtrt.ts.compile(self.model, **compile_spec) @@ -89,8 +87,7 @@ def test_compile(self): "input_signature": ((torchtrt.Input(self.input.shape), torchtrt.Input(self.input.shape)),), "device": torchtrt.Device("gpu:0"), "enabled_precisions": {torch.float}, - "require_full_compilation": False, - "min_block_size": 3 + "min_block_size": 1 } trt_mod = torchtrt.ts.compile(self.model, **compile_spec) @@ -111,8 +108,7 @@ def test_compile(self): "input_signature": ([torchtrt.Input(self.input.shape), torchtrt.Input(self.input.shape)],), "device": torchtrt.Device("gpu:0"), "enabled_precisions": {torch.float}, - "require_full_compilation": False, - "min_block_size": 3 + "min_block_size": 1 } trt_mod = torchtrt.ts.compile(self.model, **compile_spec) @@ -134,8 +130,7 @@ def test_compile(self): "input_signature": ([torchtrt.Input(self.input.shape), torchtrt.Input(self.input.shape)],), "device": torchtrt.Device("gpu:0"), "enabled_precisions": {torch.float}, - "require_full_compilation": False, - "min_block_size": 3 + "min_block_size": 1 } trt_mod = torchtrt.ts.compile(self.model, **compile_spec) From bce8464b2ce90c1744e45587dd4be17d38fe8219 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Fri, 5 Aug 2022 22:03:06 -0700 Subject: [PATCH 14/16] feat(element_wise): Auto cast to higher precision for mismatched types Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/conversion/converters/converter_util.cpp | 7 ++++ .../converters/impl/element_wise.cpp | 2 + .../converters/test_element_wise.cpp | 37 ++++++++++++++++++- tests/util/run_graph_engine.cpp | 5 ++- 4 files changed, 47 insertions(+), 4 deletions(-) diff --git a/core/conversion/converters/converter_util.cpp b/core/conversion/converters/converter_util.cpp index 745261589e..94ac827ef4 100644 --- a/core/conversion/converters/converter_util.cpp +++ b/core/conversion/converters/converter_util.cpp @@ -65,6 +65,13 @@ nvinfer1::ILayer* add_elementwise( nvinfer1::ITensor* self, nvinfer1::ITensor* other, const std::string& name) { + if (self->getType() == nvinfer1::DataType::kFLOAT && other->getType() == nvinfer1::DataType::kINT32) { + LOG_DEBUG("Type mismatch, casting other to " << self->getType()); + other = castITensor(ctx, other, self->getType()); + } else if (self->getType() == nvinfer1::DataType::kINT32 && other->getType() == nvinfer1::DataType::kFLOAT) { + LOG_DEBUG("Type mismatch, casting self to " << other->getType()); + self = castITensor(ctx, self, other->getType()); + } // ensure self to have larger number of dimension bool swapSelfOther = false; if (self->getDimensions().nbDims < other->getDimensions().nbDims) { diff --git a/core/conversion/converters/impl/element_wise.cpp b/core/conversion/converters/impl/element_wise.cpp index 2f0c3a9d13..da9d58ef43 100644 --- a/core/conversion/converters/impl/element_wise.cpp +++ b/core/conversion/converters/impl/element_wise.cpp @@ -412,6 +412,7 @@ auto element_wise_registrations TORCHTRT_UNUSED = // Should implement self * other auto self = args[0].ITensorOrFreeze(ctx); auto other = args[1].ITensorOrFreeze(ctx); + auto mul = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kPROD, self, other, util::node_info(n)); TORCHTRT_CHECK(mul, "Unable to create mul layer from node: " << *n); @@ -426,6 +427,7 @@ auto element_wise_registrations TORCHTRT_UNUSED = // TODO: Remove with functionalization auto self = args[0].ITensorOrFreeze(ctx); auto other = scalar_to_tensor(ctx, args[1].unwrapToScalar()); + auto mul = add_elementwise(ctx, 
nvinfer1::ElementWiseOperation::kPROD, self, other, util::node_info(n)); TORCHTRT_CHECK(mul, "Unable to create mul layer from node: " << *n); diff --git a/tests/core/conversion/converters/test_element_wise.cpp b/tests/core/conversion/converters/test_element_wise.cpp index 994fb25811..939c9b7394 100644 --- a/tests/core/conversion/converters/test_element_wise.cpp +++ b/tests/core/conversion/converters/test_element_wise.cpp @@ -12,7 +12,9 @@ void pointwise_test_helper( std::vector shape1 = {5}, std::vector shape2 = {5}, bool negative_input = false, - bool int_tensors = false) { + bool int_tensors = false, + bool float_int_tensors = false, + bool int_float_tensors = false) { auto g = std::make_shared(); torch::jit::parseIR(graph_ir, g.get()); @@ -27,11 +29,24 @@ void pointwise_test_helper( if (!singleInput) { torch_inputs.push_back(at::randint(1, 5, shape2, {at::kCUDA})); } + + TORCHTRT_CHECK(!((int_tensors && (float_int_tensors || int_float_tensors)) || (float_int_tensors && int_float_tensors)), + "Invalid test configuration, only one of int_tensors, float_int_tensors, int_float_tensors can be true"); + if(int_tensors){ for(size_t i = 0UL; i < torch_inputs.size(); ++i){ torch_inputs[i] = torch_inputs[i].to(at::kInt); } + } else if(float_int_tensors) { + TORCHTRT_CHECK(!singleInput, "float_int_tensors tests require two inputs"); + torch_inputs[0] = torch_inputs[0].to(at::kFloat); + torch_inputs[1] = torch_inputs[1].to(at::kInt); + } else if (int_float_tensors) { + TORCHTRT_CHECK(!singleInput, "int_float_tensors tests require two inputs"); + torch_inputs[0] = torch_inputs[0].to(at::kInt); + torch_inputs[1] = torch_inputs[1].to(at::kFloat); } + auto params = torch_tensorrt::core::ir::get_static_params(g->inputs(), {}); auto jit_results = torch_tensorrt::tests::util::RunGraph(g, params, torch_inputs); @@ -62,6 +77,8 @@ TEST(Converters, ATenAddConvertsCorrectly) { pointwise_test_helper(graph, false, false, {4}, {3, 4}); pointwise_test_helper(graph, false, true, {3, 4, 3}, {4, 3}); pointwise_test_helper(graph, false, true, {4, 3}, {3, 4, 3}); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, true); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, false, true); } TEST(Converters, ATenAddWithAlphaConvertsCorrectly) { @@ -75,9 +92,11 @@ TEST(Converters, ATenAddWithAlphaConvertsCorrectly) { pointwise_test_helper(graph, false, false, {4}, {3, 4}); pointwise_test_helper(graph, false, true, {3, 4, 3}, {4, 3}); pointwise_test_helper(graph, false, true, {4, 3}, {3, 4, 3}); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, true); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, false, true); } -TEST(Converters, ATenAddImplicitWithAlphaConvertsCorrectly) { +TEST(Converters, ATenAddInplaceWithAlphaConvertsCorrectly) { const auto graph = R"IR( graph(%0 : Tensor, %1 : Tensor): %2 : float = prim::Constant[value=7.6]() @@ -109,6 +128,8 @@ TEST(Converters, ATenSubConvertsCorrectly) { pointwise_test_helper(graph, false, false, {4}, {3, 4}); pointwise_test_helper(graph, false, true, {3, 4, 3}, {4, 3}); pointwise_test_helper(graph, false, true, {4, 3}, {3, 4, 3}); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, true); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, false, true); } TEST(Converters, ATenMulConvertsCorrectly) { @@ -121,6 +142,8 @@ TEST(Converters, ATenMulConvertsCorrectly) { pointwise_test_helper(graph, false, false, {4}, {3, 4}); pointwise_test_helper(graph, false, true, {3, 4, 3}, {4, 3}); 
pointwise_test_helper(graph, false, true, {4, 3}, {3, 4, 3}); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, true); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, false, true); } TEST(Converters, ATenMulWithScalarConvertsCorrectly) { @@ -151,6 +174,8 @@ TEST(Converters, ATenDivConvertsCorrectly) { pointwise_test_helper(graph, false, false, {4}, {3, 4}); pointwise_test_helper(graph, false, true, {3, 4, 3}, {4, 3}); pointwise_test_helper(graph, false, true, {4, 3}, {3, 4, 3}); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, true); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, false, true); } TEST(Converters, ATenDivWithScalarConvertsCorrectly) { @@ -173,6 +198,8 @@ TEST(Converters, ATenDivRoundingFloorConvertsCorrectly) { pointwise_test_helper(graph, false, false, {4}, {3, 4}, true); pointwise_test_helper(graph, false, true, {3, 4, 3}, {4, 3}, true); pointwise_test_helper(graph, false, true, {4, 3}, {3, 4, 3}, true); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, true); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, false, true); } TEST(Converters, ATenDivRoundingTruncConvertsCorrectly) { @@ -186,6 +213,8 @@ TEST(Converters, ATenDivRoundingTruncConvertsCorrectly) { pointwise_test_helper(graph, false, false, {4}, {3, 4}, true); pointwise_test_helper(graph, false, true, {3, 4, 3}, {4, 3}, true); pointwise_test_helper(graph, false, true, {4, 3}, {3, 4, 3}, true); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, true); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, false, true); } TEST(Converters, ATenDivRoundingNoneConvertsCorrectly) { @@ -211,6 +240,8 @@ TEST(Converters, ATenPowTensorConvertsCorrectly) { pointwise_test_helper(graph, false, false, {4}, {3, 4}); pointwise_test_helper(graph, false, true, {3, 4, 3}, {4, 3}); pointwise_test_helper(graph, false, true, {4, 3}, {3, 4, 3}); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, true); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, false, true); } TEST(Converters, ATenPowScalarConvertsCorrectly) { @@ -251,6 +282,8 @@ TEST(Converters, ATenFloorDivideConvertsCorrectly) { pointwise_test_helper(graph, false, false, {4}, {3, 4}); pointwise_test_helper(graph, false, true, {3, 4, 3}, {4, 3}); pointwise_test_helper(graph, false, true, {4, 3}, {3, 4, 3}); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, true); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, false, true); } TEST(Converters, ATenFloorDivideWithScalarConvertsCorrectly) { diff --git a/tests/util/run_graph_engine.cpp b/tests/util/run_graph_engine.cpp index b0bb920768..1d77550d1d 100644 --- a/tests/util/run_graph_engine.cpp +++ b/tests/util/run_graph_engine.cpp @@ -30,6 +30,7 @@ std::vector toInputsDynamic(std::vector ten, bool d for (auto i : ten) { auto opt = core::util::toVec(i.sizes()); + auto dtype = core::util::ScalarTypeToTRTDataType(i.scalar_type()); if (dynamic_batch) { std::vector min_range(opt); @@ -38,7 +39,7 @@ std::vector toInputsDynamic(std::vector ten, bool d min_range[0] = ceil(opt[0] / 2.0); max_range[0] = 2 * opt[0]; - a.push_back(core::ir::Input(min_range, opt, max_range)); + a.push_back(core::ir::Input(min_range, opt, max_range, dtype)); } else { std::vector min_range(opt); std::vector max_range(opt); @@ -46,7 +47,7 @@ std::vector toInputsDynamic(std::vector ten, bool d min_range[1] = ceil(opt[1] / 2.0); max_range[1] = 2 * 
opt[1]; - a.push_back(core::ir::Input(min_range, opt, max_range)); + a.push_back(core::ir::Input(min_range, opt, max_range, dtype)); } } From 891440da148b3cee64e0828e3e3a7f6cfe2cb0db Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Sat, 6 Aug 2022 13:50:24 -0700 Subject: [PATCH 15/16] refactor: Disable input_signature in torchscript backend due to lack of generic interface Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- .../csrc/register_tensorrt_classes.cpp | 5 +++++ py/torch_tensorrt/ts/_compile_spec.py | 21 ++++--------------- 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp b/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp index 274b40d479..9db567ca86 100644 --- a/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp +++ b/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp @@ -26,6 +26,11 @@ void RegisterTRTCompileSpec() { static auto TORCHTRT_UNUSED TRTInputSignatureTSRegistration = torch::class_("tensorrt", "_InputSignature") .def(torch::init<>()) + .def("_set_signature_ivalue_torchbind", + [](const c10::intrusive_ptr& self, + torch::jit::IValue ival) { + self->signature_ivalue = ival; + }) .def("__str__", &torch_tensorrt::pyapi::InputSignature::to_str); ADD_FIELD_GET_SET_REGISTRATION( diff --git a/py/torch_tensorrt/ts/_compile_spec.py b/py/torch_tensorrt/ts/_compile_spec.py index 4cd69f794f..ac3465b1c4 100644 --- a/py/torch_tensorrt/ts/_compile_spec.py +++ b/py/torch_tensorrt/ts/_compile_spec.py @@ -327,20 +327,6 @@ def TensorRTCompileSpec(inputs=[], torch.randn((1, 3, 224, 244)) # Use an example tensor and let torch_tensorrt infer settings ] - input_signature Union(List, Tuple, torch_tensorrt.Input, torch.Tensor): A formatted collection of input specifications for the module. Input Sizes can be specified as torch sizes, tuples or lists. dtypes can be specified using - torch datatypes or torch_tensorrt datatypes and you can use either torch devices or the torch_tensorrt device type enum to select device type. 
**This API should be considered beta-level stable and may change in the future** :: - - input_signature=([ - torch_tensorrt.Input((1, 3, 224, 224)), # Static NCHW input shape for input #1 - torch_tensorrt.Input( - min_shape=(1, 224, 224, 3), - opt_shape=(1, 512, 512, 3), - max_shape=(1, 1024, 1024, 3), - dtype=torch.int32 - format=torch.channel_last - ), # Dynamic input shape for input #2 - ], torch.randn((1, 3, 224, 244))) # Use an example tensor and let torch_tensorrt infer settings for input #3 - device (Union(torch_tensorrt.Device, torch.device, dict)): Target device for TensorRT engines to run on :: device=torch_tensorrt.Device("dla:1", allow_gpu_fallback=True) @@ -362,7 +348,7 @@ def TensorRTCompileSpec(inputs=[], compile_spec = { "inputs": inputs, - "input_signature": input_signature, + #"input_signature": input_signature, "device": device, "disable_tf32": disable_tf32, # Force FP32 layers to use traditional as FP32 format vs the default behavior of rounding the inputs to 10-bit mantissas before multiplying, but accumulates the sum using 23-bit mantissas @@ -384,12 +370,13 @@ def TensorRTCompileSpec(inputs=[], backend_spec = torch.classes.tensorrt.CompileSpec() + if input_signature is not None: + raise ValueError("Input signature parsing is not currently supported in the TorchScript backend integration") + for i in parsed_spec.inputs: clone = _internal_input_to_torch_class_input(i) backend_spec._append_input(clone) - backend_spec._set_input_signature(parsed_spec.input_signature) - d = torch.classes.tensorrt._Device() d._set_device_type(int(parsed_spec.device.device_type)) d._set_gpu_id(parsed_spec.device.gpu_id) From 223dfd11ae3cfe0e58b229af23d28facf66e5da5 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Mon, 8 Aug 2022 16:52:01 -0700 Subject: [PATCH 16/16] chore: remove commented out code Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/compiler.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/core/compiler.cpp b/core/compiler.cpp index caee900879..7b58dbb2c1 100644 --- a/core/compiler.cpp +++ b/core/compiler.cpp @@ -394,7 +394,6 @@ std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std:: auto params = graph_and_parameters.second; auto static_params = ir::get_static_params(g->inputs(), params); // Infer the type of an input from the weights of the calculation - // auto first_use_types = ir::get_block_first_calc_dtypes_opt(g->block()); auto first_use_types = ir::get_block_first_calc_dtypes_opt_collection(g->block()); MapInputsAndDetermineDTypes(cfg, g, static_params, first_use_types);
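
A note on usage: the sketch below shows how the grouped-input path introduced in this series is exercised from Python. It assumes a hypothetical scripted module whose forward takes a tuple of two tensors (the real test models live in tests/modules/custom_models.py and are not reproduced here). Because of the workaround above, grouped inputs currently ride on partial compilation, so require_full_compilation is left at its default and min_block_size is lowered to 1, mirroring tests/py/api/test_collections.py.

    import torch
    import torch.nn as nn
    import torch_tensorrt as torchtrt
    from typing import Tuple

    class TupleInput(nn.Module):
        # hypothetical stand-in for the collection test models
        def forward(self, z: Tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
            x, y = z
            return x + y

    model = torch.jit.script(TupleInput()).eval().cuda()
    x = torch.randn(1, 3, 224, 224, device="cuda")

    compile_spec = {
        # the nested Input spec mirrors the structure of the module's arguments
        "input_signature": ((torchtrt.Input(x.shape), torchtrt.Input(x.shape)),),
        "device": torchtrt.Device("gpu:0"),
        "enabled_precisions": {torch.float},
        "min_block_size": 1,  # collection ops fall back to Torch, so keep TRT blocks small
    }

    trt_model = torchtrt.ts.compile(model, **compile_spec)
    print(torch.allclose(model((x, x)), trt_model((x, x)), atol=1e-5))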
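
On the element-wise change in converter_util.cpp: when one operand is FP32 and the other INT32, add_elementwise now casts the integer operand up to float before building the layer. This matches PyTorch's own promotion rule, which the new mixed-dtype cases in test_element_wise.cpp rely on:

    import torch

    a = torch.randn(5)                                # torch.float32
    b = torch.randint(1, 5, (5,), dtype=torch.int32)  # torch.int32
    print((a * b).dtype)                              # torch.float32 -- the result dtype the converter now matches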
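
The prim::TupleIndex evaluator reuses the normalizeIndex helper that moved into eval_util; its behavior is just Python-style negative indexing, sketched here for reference:

    def normalize_index(idx: int, size: int) -> int:
        # negative indices count from the end of the tuple
        return idx + size if idx < 0 else idx

    assert normalize_index(-1, 3) == 2
    assert normalize_index(1, 3) == 1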