Commit 17340fb

Merge branch 'master' into trt8.4
2 parents c009a1f + d6a2b88 commit 17340fb

30 files changed (+1013, -92 lines)

core/compiler.cpp

Lines changed: 2 additions & 1 deletion
@@ -328,7 +328,8 @@ void MapInputsAndDetermineDTypes(
         spec.dtype = nvinfer1::DataType::kFLOAT;
       } else if (spec.dtype_is_user_defined && cfg.partition_info.enabled) {
         if (!est_type_opt) {
-          LOG_INFO("Cannot infer input tensor dtype in graph, unable to verify user input dtype settings");
+          LOG_INFO("Cannot infer input tensor dtype in graph. Using user provided input dtype settings");
+          first_use_type_map[in] = {util::TRTDataTypeToScalarType(cfg.convert_info.inputs.find(in)->second.dtype)};
        } else {
          if (util::TRTDataTypeToScalarType(cfg.convert_info.inputs.find(in)->second.dtype) != est_type_opt.value()) {
            std::stringstream ss;

core/conversion/conversion.cpp

Lines changed: 32 additions & 17 deletions
@@ -1,15 +1,15 @@
 #include "core/conversion/conversion.h"
+#include <ATen/core/operator_name.h>
 #include <torch/torch.h>
 #include <sstream>
+#include "c10/util/intrusive_ptr.h"
 #include "core/conversion/conversionctx/ConversionCtx.h"
+#include "core/conversion/converters/converter_util.h"
 #include "core/conversion/converters/converters.h"
 #include "core/conversion/evaluators/evaluators.h"
+#include "core/conversion/tensorcontainer/TensorContainer.h"
 #include "core/conversion/var/Var.h"
 #include "core/util/prelude.h"
-
-#include "c10/util/intrusive_ptr.h"
-#include "core/conversion/converters/converter_util.h"
-#include "core/conversion/tensorcontainer/TensorContainer.h"
 #include "core/util/trt_util.h"

 namespace torch_tensorrt {
@@ -427,10 +427,18 @@ void ConvertBlockToNetDef(
             << " and node outputs size: " << n->outputs().size() << " must match.");
         for (size_t i = 0; i < eval_list->elements().size(); i++) {
           auto eval_output = eval_list.get()->elements()[i];
-          LOG_DEBUG(
-              ctx->logger,
-              "Found the evaluated value(s) to be " << eval_output << " for node: " << util::node_info(n));
-          ctx->AssociateValueAndIValue(n->output(i), eval_output);
+          if (eval_output.isCustomClass()) {
+            auto container = eval_output.toCustomClass<TensorContainer>();
+            auto tensor = container->tensor();
+            LOG_DEBUG(
+                ctx->logger, "Found the evaluated value(s) to be an ITensor of shape: " << tensor->getDimensions());
+            ctx->AssociateValueAndTensor(n->output(i), tensor);
+          } else {
+            LOG_DEBUG(
+                ctx->logger,
+                "Found the evaluated value(s) to be " << eval_output << " for node: " << util::node_info(n));
+            ctx->AssociateValueAndIValue(n->output(i), eval_output);
+          }
         }
       } else {
         TORCHTRT_THROW_ERROR("Unsupported return type for evaluated node");
@@ -488,15 +496,23 @@ std::string ConvertBlockToEngine(
 std::unordered_map<c10::OperatorName, std::string> GetUnsupportedOpsInBlock(const torch::jit::Block* b) {
   std::unordered_map<c10::OperatorName, std::string> unsupported_ops;
   for (const auto n : b->nodes()) {
-    if (n->kind() != torch::jit::prim::Loop && n->kind() != torch::jit::prim::If && !OpSupported(n)) {
-      auto schema = n->maybeSchema();
-      TORCHTRT_CHECK(
-          schema,
-          "Unable to get schema for Node " << util::node_info(n) << " (conversion.VerifyCoverterSupportForBlock)");
-      std::stringstream ss;
-      ss << *schema;
-      unsupported_ops[schema->operator_name()] = ss.str();
+    auto schema = n->maybeSchema();
+    // Some ops like torch::jit::prim::Loop, torch::jit::prim::If, torch::jit::prim::DictConstruct don't have a schema
+    // but they are supported. torch::jit::prim::DictConstruct is supported via fallback only
+    if (!OpSupported(n)) {
+      if (schema) {
+        std::stringstream ss;
+        ss << *schema;
+        unsupported_ops[schema->operator_name()] = ss.str();
+      } else {
+        std::stringstream ss;
+        ss << util::node_info(n);
+        // operator.overload is a filler name just to call the constructor.
+        c10::OperatorName op(ss.str(), "operator.overload");
+        unsupported_ops[op] = ss.str();
+      }
     }
+
     for (const auto sub_b : n->blocks()) {
       auto sub_b_unsupported_ops = GetUnsupportedOpsInBlock(sub_b);
       unsupported_ops.insert(sub_b_unsupported_ops.begin(), sub_b_unsupported_ops.end());
@@ -531,7 +547,6 @@ std::set<std::string> ConvertableOpsInBlock(const torch::jit::Block* b) {
 
 bool VerifyConverterSupportForBlock(const torch::jit::Block* b, bool suppress_errors) {
   auto unsupported_ops = GetUnsupportedOpsInBlock(b);
-
   if (unsupported_ops.size() != 0) {
     std::stringstream unsupported_msg;
     unsupported_msg

core/conversion/converters/impl/activation.cpp

Lines changed: 21 additions & 0 deletions
@@ -87,6 +87,27 @@ auto acthardtanh TORCHTRT_UNUSED =
 
            bool to_reshape = false;
            auto original_shape = in->getDimensions();
+
+           // The output of ParametricReLU has a shape of all zeros when the slopes nDims does not
+           // equal the input nDims, so make sure the slopes nDims equals the input nDims.
+           if (slopes.ndimension() == 1 and original_shape.nbDims != slopes.ndimension()) {
+             std::vector<int64_t> slopes_new_shape(original_shape.nbDims, 1);
+             auto first_inputs_allowed_formats = ctx->net->getInput(0)->getAllowedFormats();
+             for (size_t inputs_index = 1; inputs_index < ctx->num_inputs; inputs_index++) {
+               auto inputs_allowed_formats = ctx->net->getInput(inputs_index)->getAllowedFormats();
+               TORCHTRT_CHECK(
+                   first_inputs_allowed_formats == inputs_allowed_formats,
+                   "Unable to create batch prelu layer from node, since the formats (e.g. NHWC or NCHW) of the inputs differ: "
+                       << *n);
+             }
+             if (1U << static_cast<int>(nvinfer1::TensorFormat::kLINEAR) == first_inputs_allowed_formats) {
+               slopes_new_shape[1] = slopes.sizes().vec()[0];
+             } else {
+               slopes_new_shape[original_shape.nbDims - 1] = slopes.sizes().vec()[0];
+             }
+             slopes = slopes.reshape(slopes_new_shape);
+           }
+
            if (slopes.numel() != 1 &&
                !util::broadcastable(
                    in->getDimensions(),
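
To illustrate the reshape above: for a 1-D slopes tensor of length C and a rank-4 NCHW input, the converter expands the slopes to a broadcast-compatible shape with C on the channel axis. The helper below is a hypothetical, self-contained sketch of that shape computation only; broadcast_slope_shape is not part of the codebase.

#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical helper mirroring the converter logic: place the C slope values on the
// channel axis (dim 1 for NCHW-style linear format, the last dim otherwise) and pad
// every other dim with 1 so the tensor broadcasts against the input.
std::vector<int64_t> broadcast_slope_shape(int64_t channels, int32_t nbDims, bool channels_first) {
  std::vector<int64_t> shape(nbDims, 1);
  shape[channels_first ? 1 : nbDims - 1] = channels;
  return shape;
}

int main() {
  auto s = broadcast_slope_shape(64, 4, /*channels_first=*/true);
  assert((s == std::vector<int64_t>{1, 64, 1, 1})); // matches slopes.reshape(slopes_new_shape) above
  return 0;
}
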

core/ir/Input.cpp

Lines changed: 9 additions & 2 deletions
@@ -40,6 +40,13 @@ bool valid_dtype_format_combo(nvinfer1::DataType dtype, nvinfer1::TensorFormat f
         default:
           return false;
       }
+    case nvinfer1::DataType::kBOOL: // Supports Linear (NCHW)
+      switch (format) {
+        case nvinfer1::TensorFormat::kLINEAR:
+          return true;
+        default:
+          return false;
+      }
     default:
       return false;
   }
@@ -48,7 +55,7 @@ bool valid_dtype_format_combo(nvinfer1::DataType dtype, nvinfer1::TensorFormat f
 bool valid_input_dtype(nvinfer1::DataType dtype) {
   switch (dtype) {
     case nvinfer1::DataType::kBOOL:
-      return false;
+      return true;
     case nvinfer1::DataType::kFLOAT:
       return true;
     case nvinfer1::DataType::kHALF:
@@ -153,4 +160,4 @@ std::ostream& operator<<(std::ostream& os, const Input& input) {
 
 } // namespace ir
 } // namespace core
-} // namespace torch_tensorrt
+} // namespace torch_tensorrt
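
The net effect of the core/ir/Input.cpp change is that boolean inputs are now accepted, but only in linear (NCHW) format. A minimal sketch exercising the updated checks is below; it assumes valid_input_dtype and valid_dtype_format_combo are visible through core/ir/ir.h, which may differ from the actual headers.

#include <cassert>
#include "NvInfer.h"
#include "core/ir/ir.h" // assumption: declares valid_input_dtype / valid_dtype_format_combo

int main() {
  using namespace torch_tensorrt::core::ir;
  // kBOOL is now a valid input dtype...
  assert(valid_input_dtype(nvinfer1::DataType::kBOOL));
  // ...but only when paired with the linear (NCHW) tensor format.
  assert(valid_dtype_format_combo(nvinfer1::DataType::kBOOL, nvinfer1::TensorFormat::kLINEAR));
  assert(!valid_dtype_format_combo(nvinfer1::DataType::kBOOL, nvinfer1::TensorFormat::kCHW4));
  return 0;
}
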

core/lowering/lowering.cpp

Lines changed: 3 additions & 0 deletions
@@ -1,6 +1,7 @@
 #include "torch/csrc/jit/passes/common_subexpression_elimination.h"
 #include "torch/csrc/jit/passes/create_functional_graphs.h"
 #include "torch/csrc/jit/passes/dead_code_elimination.h"
+#include "torch/csrc/jit/passes/erase_number_types.h"
 #include "torch/csrc/jit/passes/freeze_module.h"
 #include "torch/csrc/jit/passes/fuse_linear.h"
 #include "torch/csrc/jit/passes/guard_elimination.h"
@@ -64,6 +65,8 @@ void LowerGraph(std::shared_ptr<torch::jit::Graph>& g, LowerInfo lower_info) {
   passes::RemoveNOPs(g);
   passes::AliasOperators(g);
   passes::SiluToSigmoidMultipication(g);
+  passes::RemoveSingleUse0DTensors(g);
+  passes::RemoveUnnecessaryCasts(g);
   LOG_GRAPH(*g);
 }

core/lowering/passes/BUILD

Lines changed: 1 addition & 0 deletions
@@ -24,6 +24,7 @@ cc_library(
         "view_to_reshape.cpp",
         "remove_dropout.cpp",
         "remove_nops.cpp",
+        "remove_unnecessary_casts.cpp",
         "silu_to_sigmoid_multiplication.cpp",
         "unpack_addmm.cpp",
         "unpack_batch_norm.cpp",

core/lowering/passes/passes.h

Lines changed: 2 additions & 0 deletions
@@ -28,6 +28,8 @@ void RemoveContiguous(std::shared_ptr<torch::jit::Graph>& graph);
 void ViewToReshape(std::shared_ptr<torch::jit::Graph>& graph);
 void RemoveDropout(std::shared_ptr<torch::jit::Graph>& graph);
 void RemoveNOPs(std::shared_ptr<torch::jit::Graph> graph);
+void RemoveSingleUse0DTensors(std::shared_ptr<torch::jit::Graph>& g);
+void RemoveUnnecessaryCasts(std::shared_ptr<torch::jit::Graph>& graph);
 void UnpackAddMM(std::shared_ptr<torch::jit::Graph>& graph);
 void UnpackBatchNorm(std::shared_ptr<torch::jit::Graph>& graph);
 void UnpackLogSoftmax(std::shared_ptr<torch::jit::Graph>& graph);

core/lowering/passes/reduce_gelu.cpp

Lines changed: 33 additions & 1 deletion
@@ -8,10 +8,17 @@ namespace passes {
 
 void ReduceGelu(std::shared_ptr<torch::jit::Graph>& graph) {
   std::string gelu_pattern = R"IR(
-    graph(%x):
+    graph(%x : Tensor):
         %out : Tensor = aten::gelu(%x)
         return (%out))IR";
 
+  // This gelu_approximate_pattern schema exists in the 21.11, 21.12, and 22.01 PyTorch containers. These container
+  // versions use an unmerged PR in pytorch: https://github.com/pytorch/pytorch/pull/61439. We reduce this to regular Gelu.
+  std::string gelu_approximate_pattern = R"IR(
+    graph(%x : Tensor, %approx):
+        %out : Tensor = aten::gelu(%x, %approx)
+        return (%out))IR";
+
   std::string gelu_reduce_pattern = R"IR(
     graph(%x.1 : Tensor):
         %6 : float = prim::Constant[value=0.044714999999999998]()
@@ -30,11 +37,36 @@ void ReduceGelu(std::shared_ptr<torch::jit::Graph>& graph) {
         %15 : Tensor = aten::mul(%7, %14)
         return (%15))IR";
 
+  // This is the same as gelu_reduce_pattern except for an additional input %approx.
+  // SubgraphRewriter only works as expected if the number of inputs to gelu_approximate_pattern
+  // and gelu_reduce_multi_input_pattern is the same.
+  std::string gelu_reduce_multi_input_pattern = R"IR(
+    graph(%x.1 : Tensor, %approx):
+        %6 : float = prim::Constant[value=0.044714999999999998]()
+        %5 : float = prim::Constant[value=0.79788456080000003]()
+        %4 : float = prim::Constant[value=1.]()
+        %3 : float = prim::Constant[value=0.5]()
+        %2 : int = prim::Constant[value=1]()
+        %7 : Tensor = aten::mul(%x.1, %3)
+        %8 : Tensor = aten::mul(%x.1, %5)
+        %9 : Tensor = aten::mul(%x.1, %6)
+        %10 : Tensor = aten::mul(%9, %x.1)
+        %11 : Tensor = aten::add(%10, %4, %2)
+        %12 : Tensor = aten::mul(%8, %11)
+        %13 : Tensor = aten::tanh(%12)
+        %14 : Tensor = aten::add(%13, %4, %2)
+        %15 : Tensor = aten::mul(%7, %14)
+        return (%15))IR";
+
   // replace aten::gelu with pointwise operations
   torch::jit::SubgraphRewriter map_gelu_to_pointwise_ops;
   map_gelu_to_pointwise_ops.RegisterRewritePattern(gelu_pattern, gelu_reduce_pattern);
   map_gelu_to_pointwise_ops.runOnGraph(graph);
 
+  torch::jit::SubgraphRewriter map_gelu_approximate_to_pointwise_ops;
+  map_gelu_approximate_to_pointwise_ops.RegisterRewritePattern(gelu_approximate_pattern, gelu_reduce_multi_input_pattern);
+  map_gelu_approximate_to_pointwise_ops.runOnGraph(graph);
+
   LOG_GRAPH("Post lowering of [aten::gelu] -> " << *graph);
 }
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#include <stack>
2+
#include <unordered_set>
3+
4+
#include "torch/csrc/jit/passes/subgraph_rewrite.h"
5+
6+
#include "core/lowering/passes/passes.h"
7+
#include "core/util/prelude.h"
8+
9+
namespace torch_tensorrt {
10+
namespace core {
11+
namespace lowering {
12+
namespace passes {
13+
14+
void RemoveSetAttrs(const torch::jit::Module& mod, std::string method_name) {
15+
auto g = mod.get_method(method_name).graph();
16+
17+
std::string set_attr_pattern = R"IR(
18+
graph(%self, %0):
19+
None = prim::SetAttr[name="_has_warned"](%self, %0)
20+
return ())IR";
21+
std::string no_set_attr_pattern = R"IR(
22+
graph(%self, %0):
23+
return ())IR";
24+
25+
// remove contiguous
26+
torch::jit::SubgraphRewriter remove_set_attr;
27+
remove_set_attr.RegisterRewritePattern(set_attr_pattern, no_set_attr_pattern);
28+
remove_set_attr.runOnGraph(g);
29+
LOG_GRAPH("Post remove contiguous: " << *g);
30+
}
31+
32+
} // namespace passes
33+
} // namespace lowering
34+
} // namespace core
35+
} // namespace torch_tensorrt
