
Commit 54f08f9

Merge remote-tracking branch 'origin/master' into trt_8

2 parents: 169c5bc + bdaacf1

21 files changed: +357 -59 lines

core/compiler.cpp

Lines changed: 4 additions & 4 deletions

@@ -182,8 +182,8 @@ torch::jit::script::Module CompileGraphWithFallback(const torch::jit::script::Mo
   torch::jit::script::Module new_mod(mod._ivalue()->name() + "_trt");
   std::vector<std::shared_ptr<torch::jit::Graph>> graphs;
   for (const torch::jit::script::Method& method : mod.get_methods()) {
-    // Don't convert hidden methods
-    if (method.name().rfind("_", 0)) {
+    // Compile only the forward method; it contains the entire graph.
+    if (method.name().compare("forward") == 0) {
       auto new_g = std::make_shared<torch::jit::Graph>();
       auto graph_and_parameters = lowering::Lower(mod, method.name());

@@ -256,8 +256,8 @@ torch::jit::script::Module CompileGraph(const torch::jit::script::Module& mod, C
   torch::jit::script::Module new_mod(mod._ivalue()->name() + "_trt");
   std::vector<std::shared_ptr<torch::jit::Graph>> graphs;
   for (const torch::jit::script::Method& method : mod.get_methods()) {
-    // Don't convert hidden methods
-    if (method.name().rfind("_", 0)) {
+    // Compile only the forward method; it contains the entire graph.
+    if (method.name().compare("forward") == 0) {
       auto engine = ConvertGraphToTRTEngine(mod, method.name(), cfg);
       auto new_g = std::make_shared<torch::jit::Graph>();
       AddEngineToGraph(new_mod, new_g, engine);
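
For context on the guard that changed here: std::string::rfind(prefix, 0) is the usual starts-with idiom, so the old condition compiled every method whose name did not begin with "_", while the new one compiles only forward. A minimal sketch of the two predicates (the method names are illustrative):

#include <cassert>
#include <string>

int main() {
  // rfind(s, 0) returns 0 when the string starts with s, and npos otherwise.
  std::string hidden = "_helper";
  std::string visible = "forward";
  assert(hidden.rfind("_", 0) == 0);                  // falsy  -> was skipped
  assert(visible.rfind("_", 0) == std::string::npos); // truthy -> was compiled
  // The new guard is narrower: only "forward" itself is compiled.
  assert(visible.compare("forward") == 0);
  return 0;
}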

core/conversion/converters/impl/layer_norm.cpp

Lines changed: 24 additions & 5 deletions

@@ -117,12 +117,31 @@ auto layer_norm_registrations TRTORCH_UNUSED = RegisterNodeConversionPatterns().
     }

     auto power = Weights(ctx, at::ones(expand_size));
-    auto scale_nd = ctx->net->addScaleNd(
-        *div_out, nvinfer1::ScaleMode::kELEMENTWISE, beta_weights.data, gamma_weights.data, power.data, 1);
-    scale_nd->setName((util::node_info(n) + "_scale_nd").c_str());
-    auto scale_nd_out = scale_nd->getOutput(0);

-    ctx->AssociateValueAndTensor(n->outputs()[0], scale_nd_out);
+    auto gamma_tensor = ctx->net->addConstant(gamma_weights.shape, gamma_weights.data)->getOutput(0);
+    auto scale_l = add_elementwise(
+        ctx, nvinfer1::ElementWiseOperation::kPROD, div_out, gamma_tensor, (util::node_info(n) + "_scale").c_str());
+
+    auto beta_tensor = ctx->net->addConstant(beta_weights.shape, beta_weights.data)->getOutput(0);
+    auto shift_l = add_elementwise(
+        ctx,
+        nvinfer1::ElementWiseOperation::kSUM,
+        scale_l->getOutput(0),
+        beta_tensor,
+        (util::node_info(n) + "_shift").c_str());
+
+    auto power_tensor = ctx->net->addConstant(power.shape, power.data)->getOutput(0);
+    auto power_l = add_elementwise(
+        ctx,
+        nvinfer1::ElementWiseOperation::kPOW,
+        shift_l->getOutput(0),
+        power_tensor,
+        (util::node_info(n) + "_power").c_str());
+
+    power_l->setName((util::node_info(n) + "_scale_nd").c_str());
+    auto power_l_out = power_l->getOutput(0);
+
+    ctx->AssociateValueAndTensor(n->outputs()[0], power_l_out);
     return true;
   }});
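
For context: TensorRT's IScaleLayer in kELEMENTWISE mode computes (input * scale + shift) ^ power, so the removed addScaleNd call produced (div_out * gamma + beta) ^ 1. The three explicit elementwise layers (kPROD, kSUM, kPOW against addConstant tensors) reproduce the same arithmetic. A minimal plain-C++ sketch of the equivalence, with illustrative values and no TensorRT dependency:

#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

int main() {
  std::vector<double> in{0.5, -1.0, 2.0};   // stands in for div_out
  std::vector<double> gamma{1.5, 0.5, 2.0}; // scale weights
  std::vector<double> beta{0.1, 0.2, 0.3};  // shift weights
  const double power = 1.0;                 // at::ones(expand_size)
  for (std::size_t i = 0; i < in.size(); ++i) {
    // What addScaleNd(kELEMENTWISE, shift=beta, scale=gamma, power=ones) computed:
    double scale_nd = std::pow(in[i] * gamma[i] + beta[i], power);
    // What the replacement layer chain computes:
    double prod = in[i] * gamma[i];     // kPROD with the gamma constant
    double sum = prod + beta[i];        // kSUM with the beta constant
    double powd = std::pow(sum, power); // kPOW with the ones constant
    assert(std::abs(scale_nd - powd) < 1e-12);
  }
  return 0;
}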

core/conversion/converters/impl/matrix_multiply.cpp

Lines changed: 9 additions & 4 deletions

@@ -1,3 +1,4 @@
+#include "core/conversion/converters/converter_util.h"
 #include "core/conversion/converters/converters.h"
 #include "core/util/prelude.h"

@@ -13,10 +14,14 @@ auto mm_registrations TRTORCH_UNUSED =
         .pattern({"aten::matmul(Tensor self, Tensor other) -> (Tensor)",
                   [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
                     auto self = args[0].ITensorOrFreeze(ctx);
-                    LOG_DEBUG("self tensor shape: " << self->getDimensions());
-
                     auto other = args[1].ITensorOrFreeze(ctx);
-                    LOG_DEBUG("other tensor shape: " << other->getDimensions());
+                    // Ensure self and other have the same nbDims by expanding the
+                    // leading dimensions (from axis 0) where necessary.
+                    if (self->getDimensions().nbDims < other->getDimensions().nbDims) {
+                      self = addPadding(ctx, n, self, other->getDimensions().nbDims, false, false);
+                    } else {
+                      other = addPadding(ctx, n, other, self->getDimensions().nbDims, false, false);
+                    }

                     auto mm_layer = ctx->net->addMatrixMultiply(
                         *self, nvinfer1::MatrixOperation::kNONE, *other, nvinfer1::MatrixOperation::kNONE);

@@ -73,4 +78,4 @@ auto mm_registrations TRTORCH_UNUSED =
 } // namespace converters
 } // namespace conversion
 } // namespace core
-} // namespace trtorch
+} // namespace trtorch
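
aten::matmul broadcasts the batch dimensions of its operands, which requires both operands to have the same rank; the converter therefore pads the lower-rank tensor with leading size-1 dimensions (via the TRTorch addPadding util) before building the matrix-multiply layer. A plain-C++ sketch of just the shape logic; prependOnes below is an illustrative stand-in, not the real util:

#include <cassert>
#include <cstddef>
#include <vector>

// Prepend 1s until dims has target_rank entries, mirroring how the
// lower-rank matmul operand is expanded from axis 0.
std::vector<int> prependOnes(std::vector<int> dims, std::size_t target_rank) {
  while (dims.size() < target_rank) {
    dims.insert(dims.begin(), 1);
  }
  return dims;
}

int main() {
  std::vector<int> self{3, 4};     // rank 2
  std::vector<int> other{8, 4, 5}; // rank 3
  auto padded = prependOnes(self, other.size());
  assert((padded == std::vector<int>{1, 3, 4})); // now broadcast-compatible
  return 0;
}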

core/lowering/lowering.cpp

Lines changed: 1 addition & 0 deletions

@@ -25,6 +25,7 @@ void LowerBlock(torch::jit::Block* b) {
 }

 void LowerGraph(std::shared_ptr<torch::jit::Graph>& g) {
+  passes::UnpackHardSwish(g);
   torch::jit::EliminateRedundantGuards(g);
   torch::jit::RemoveListMutation(g);
   torch::jit::RemoveTensorMutation(g);

core/lowering/passes/BUILD

Lines changed: 1 addition & 0 deletions

@@ -24,6 +24,7 @@ cc_library(
         "unpack_addmm.cpp",
         "unpack_batch_norm.cpp",
         "unpack_log_softmax.cpp",
+        "unpack_hardswish.cpp"
     ],
     hdrs = [
         "passes.h",

core/lowering/passes/linear_to_addmm.cpp

Lines changed: 41 additions & 16 deletions

@@ -1,23 +1,55 @@
-#include "torch/csrc/jit/passes/subgraph_rewrite.h"
+
+#include <torch/csrc/jit/runtime/operator.h>
+#include "torch/csrc/jit/ir/alias_analysis.h"
+#include "torch/csrc/jit/jit_log.h"
+#include "torch/csrc/jit/passes/constant_propagation.h"
+#include "torch/csrc/jit/passes/dead_code_elimination.h"
+#include "torch/csrc/jit/passes/guard_elimination.h"
+#include "torch/csrc/jit/passes/peephole.h"
+#include "torch/csrc/jit/runtime/graph_executor.h"

 #include "core/util/prelude.h"
+#include "torch/csrc/jit/passes/subgraph_rewrite.h"

 namespace trtorch {
 namespace core {
 namespace lowering {
 namespace passes {

+void replaceLinearWithBiasNonePattern(std::shared_ptr<torch::jit::Graph> graph) {
+  // Define the decomposition function for aten::linear for the case where the bias (mat2) is None.
+  static torch::jit::CompilationUnit decompose_funcs(R"SCRIPT(
+    def linear(self: Tensor, mat1: Tensor, mat2: Tensor):
+        return torch.matmul(self, mat1.t())
+  )SCRIPT");
+
+  // Iterate through the nodes and search for aten::linear nodes whose bias is not a Tensor
+  // (this includes the bias=None case).
+  auto block = graph->block();
+  for (auto it = block->nodes().begin(); it != block->nodes().end(); it++) {
+    auto n = *it;
+    if (n->kind().toQualString() == std::string("aten::linear")) {
+      auto input_values = n->inputs();
+      // input_values[2] is the bias. If it is not a Tensor, replace the node with the decomposed linear graph.
+      if (input_values[2]->type()->isSubtypeOf(c10::TensorType::get())) {
+        continue;
+      } else {
+        torch::jit::WithInsertPoint guard(*it);
+        std::shared_ptr<torch::jit::Graph> d_graph = decompose_funcs.get_function("linear").graph();
+        torch::jit::Value* new_output = insertGraph(*it->owningGraph(), *d_graph, it->inputs()).at(0);
+        new_output->setType(it->output()->type());
+        it->output()->replaceAllUsesWith(new_output);
+        it.destroyCurrent();
+      }
+    }
+  }
+}
+
 void LinearToAddMM(std::shared_ptr<torch::jit::Graph>& graph) {
   // TensorRT implicitly adds a flatten layer in front of FC layers if necessary
   std::string flatten_linear_pattern = R"IR(
     graph(%input, %weight, %bias):
       %res = aten::linear(%input, %weight, %bias)
       return (%res))IR";
-  std::string flatten_linear_bias_none_pattern = R"IR(
-    graph(%input, %weight):
-      %bias: Tensor? = prim::Constant()
-      %res = aten::linear(%input, %weight, %bias)
-      return (%res))IR";

   std::string fused_linear = R"IR(
     graph(%input, %weight_t, %bias):

@@ -27,20 +59,13 @@ void LinearToAddMM(std::shared_ptr<torch::jit::Graph>& graph) {
       %b_f: Tensor = trt::const(%bias)
       %out: Tensor = aten::add(%b_f, %mm, %1)
       return (%out))IR";
-  std::string fused_linear_bias_none = R"IR(
-    graph(%input, %weight_t):
-      %weight = aten::t(%weight_t)
-      %mm: Tensor = aten::matmul(%input, %weight)
-      return (%mm))IR";
+
+  // First find and replace aten::linear nodes whose bias is not a Tensor.
+  replaceLinearWithBiasNonePattern(graph);

   torch::jit::SubgraphRewriter flatten_linear_to_linear;
   flatten_linear_to_linear.RegisterRewritePattern(flatten_linear_pattern, fused_linear);
   flatten_linear_to_linear.runOnGraph(graph);
-
-  torch::jit::SubgraphRewriter flatten_linear_bias_none_to_linear;
-  flatten_linear_bias_none_to_linear.RegisterRewritePattern(flatten_linear_bias_none_pattern, fused_linear_bias_none);
-  flatten_linear_bias_none_to_linear.runOnGraph(graph);
-  LOG_GRAPH("Post linear to addmm: " << *graph);
 }

 } // namespace passes
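
The bias=None case is now handled by an explicit walk over the graph nodes instead of a SubgraphRewriter pattern, decomposing aten::linear(x, W, None) into torch.matmul(x, W.t()). A small plain-C++ numeric check of that identity for a single row x and weight matrix W (values are illustrative):

#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  std::vector<double> x{1.0, 2.0, 3.0};                     // 1 x 3 input
  std::vector<std::vector<double>> W{{1, 0, 1}, {2, 1, 0}}; // 2 x 3 weight
  std::vector<double> out(W.size(), 0.0);                   // 1 x 2 result
  for (std::size_t nrow = 0; nrow < W.size(); ++nrow) {
    for (std::size_t k = 0; k < x.size(); ++k) {
      out[nrow] += x[k] * W[nrow][k]; // x @ W^T, with no bias term
    }
  }
  assert(out[0] == 4.0 && out[1] == 4.0);
  return 0;
}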

core/lowering/passes/passes.h

Lines changed: 1 addition & 0 deletions

@@ -21,6 +21,7 @@ void UnpackBatchNorm(std::shared_ptr<torch::jit::Graph>& graph);
 void UnpackLogSoftmax(std::shared_ptr<torch::jit::Graph>& graph);
 void AliasOperators(std::shared_ptr<torch::jit::Graph>& graph);
 void SiluToSigmoidMultipication(std::shared_ptr<torch::jit::Graph>& graph);
+void UnpackHardSwish(std::shared_ptr<torch::jit::Graph>& graph);

 } // namespace passes
 } // namespace lowering
core/lowering/passes/unpack_hardswish.cpp

Lines changed: 44 additions & 0 deletions

@@ -0,0 +1,44 @@
+#include "torch/csrc/jit/passes/subgraph_rewrite.h"
+
+#include "core/util/prelude.h"
+
+namespace trtorch {
+namespace core {
+namespace lowering {
+namespace passes {
+
+void UnpackHardSwish(std::shared_ptr<torch::jit::Graph>& graph) {
+  std::string hardswish_pattern = R"IR(
+    graph(%input):
+      %result = aten::hardswish(%input)
+      return (%result))IR";
+
+  std::string hardswish_pattern_inplace = R"IR(
+    graph(%input):
+      %result = aten::hardswish_(%input)
+      return (%result))IR";
+
+  std::string new_pattern = R"IR(
+    graph(%input):
+      %1 : Scalar = prim::Constant[value=3.]()
+      %2 : Scalar = prim::Constant[value=1.]()
+      %3 = aten::add(%input, %1, %2)
+      %4 : Scalar = prim::Constant[value=0.]()
+      %5 : Scalar = prim::Constant[value=6.]()
+      %6 = aten::hardtanh(%3, %4, %5)
+      %7 = aten::div(%6, %5)
+      %8 = aten::mul(%input, %7)
+      return (%8))IR";
+
+  torch::jit::SubgraphRewriter rewriter;
+  rewriter.RegisterRewritePattern(hardswish_pattern, new_pattern);
+  rewriter.RegisterRewritePattern(hardswish_pattern_inplace, new_pattern);
+  rewriter.runOnGraph(graph);
+
+  LOG_GRAPH("Post unpack hardswish: " << *graph);
+}
+
+} // namespace passes
+} // namespace lowering
+} // namespace core
+} // namespace trtorch
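
The rewrite relies on the identity hardswish(x) = x * hardtanh(x + 3, 0, 6) / 6, matching the scalar constants in the IR above. A minimal plain-C++ spot check against the piecewise definition (helper names are illustrative):

#include <algorithm>
#include <cassert>
#include <cmath>

double hardtanh(double v, double lo, double hi) {
  return std::min(std::max(v, lo), hi);
}

double unpacked_hardswish(double x) {
  double t = x + 1.0 * 3.0;               // aten::add(%input, %1=3., %2=1.)
  double clamped = hardtanh(t, 0.0, 6.0); // aten::hardtanh(%3, %4=0., %5=6.)
  return x * (clamped / 6.0);             // aten::div(%6, %5); aten::mul(%input, %7)
}

int main() {
  assert(unpacked_hardswish(-4.0) == 0.0); // x <= -3 -> 0
  assert(unpacked_hardswish(5.0) == 5.0);  // x >= 3  -> x
  assert(std::abs(unpacked_hardswish(1.0) - 4.0 / 6.0) < 1e-12); // x*(x+3)/6
  return 0;
}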

core/partitioning/partitioning.cpp

Lines changed: 54 additions & 23 deletions

@@ -10,9 +10,9 @@ namespace core {
 namespace partitioning {

 struct usage_info {
-  int produce_id = -1;
-  std::vector<int> torch_use_id;
-  std::vector<int> tensorrt_use_id;
+  size_t produce_id; // id of the segmented block that produces this torch::jit::Value
+  std::vector<size_t> torch_use_id; // ids of the PyTorch segmented blocks that use this value
+  std::vector<size_t> tensorrt_use_id; // ids of the TensorRT segmented blocks that use this value
 };

 inline bool isTensorOrTensorList(torch::jit::Value* val) {

@@ -70,44 +70,54 @@ std::vector<torch::jit::Node*> getDependencyNodes(std::vector<torch::jit::Value*
   return stk;
 }

-std::vector<SegmentedBlock> injectNodesForNonTensorInputs(SegmentedBlock& seg_block) {
+std::vector<SegmentedBlock> segmentBlocksWithNonTensorInputs(SegmentedBlock& seg_block) {
   // reconstruct the segmented_block if this block requires nonTensor input
   std::vector<torch::jit::Value*> nontensor_inputs;
+  // Gather all non-tensor inputs for this seg_block
   for (auto input : seg_block.raw_inputs()) {
     if (!isTensorOrTensorList(input)) {
       nontensor_inputs.push_back(input);
     }
   }
-  std::vector<torch::jit::Node*> dependency_nodes = getDependencyNodes(nontensor_inputs);

+  std::vector<torch::jit::Node*> dependency_nodes = getDependencyNodes(nontensor_inputs);
   std::vector<SegmentedBlock> new_seg_blocks;
-  // if current block is kTorch or current block is TensorRT and all dependent nodes are also supported, construct only
-  // one new block
+  // If the current block is kTorch, or it is kTensorRT and all dependency nodes are also supported, merge the
+  // dependency nodes at the beginning of the current segmented_block and return this merged segmented_block.
   if (seg_block.target() == SegmentedBlock::kTorch || isAllNodesSupported(dependency_nodes)) {
     dependency_nodes.insert(dependency_nodes.end(), seg_block.raw_nodes().begin(), seg_block.raw_nodes().end());
     new_seg_blocks.emplace_back(seg_block.target(), dependency_nodes);
   } else {
     // if the current block is kTensorRT but the dependency nodes contain unsupported nodes, we have to segment again
     std::unordered_set<torch::jit::Value*> nontensor_inputs_set(nontensor_inputs.begin(), nontensor_inputs.end());
-    std::vector<torch::jit::Node*> tensorrt_nodes, pytorch_nodes;
+    std::vector<torch::jit::Node*> tensorrt_nodes, pytorch_nodes(dependency_nodes.begin(), dependency_nodes.end());
+
     bool prev_non_tensor_outputs = false;
     for (auto n : seg_block.raw_nodes()) {
-      // it's a kTorch block if it uses the nonTensor input and the nonTensor input is produced in kTorch block
+      // Check whether the node has non-tensor inputs or consumes the non-tensor outputs of the previous node.
+      // In those cases the node goes into a new PyTorch SegmentedBlock; otherwise it joins a new TensorRT
+      // SegmentedBlock.
      if (containTargetInputs(n, nontensor_inputs_set) || prev_non_tensor_outputs) {
+        // If tensorrt_nodes is not empty, the preceding nodes were all TensorRT nodes. Construct a
+        // TensorRT segmented_block and clear tensorrt_nodes for the next TensorRT segment.
        if (!tensorrt_nodes.empty()) {
          new_seg_blocks.emplace_back(SegmentedBlock::kTensorRT, tensorrt_nodes);
          tensorrt_nodes.clear();
        }
        pytorch_nodes.push_back(n);
        prev_non_tensor_outputs = containNonTensorOutputs(n);
      } else {
+        // If pytorch_nodes is not empty, the preceding nodes were all PyTorch nodes. Construct a
+        // PyTorch segmented_block and clear pytorch_nodes for the next PyTorch segment.
        if (!pytorch_nodes.empty()) {
          new_seg_blocks.emplace_back(SegmentedBlock::kTorch, pytorch_nodes);
          pytorch_nodes.clear();
        }
        tensorrt_nodes.push_back(n);
      }
    }
+
+    // Form the last segmented_block from whatever is left over in tensorrt_nodes or pytorch_nodes.
    if (!tensorrt_nodes.empty()) {
      new_seg_blocks.emplace_back(SegmentedBlock::kTensorRT, tensorrt_nodes);
    } else {

@@ -118,7 +128,20 @@ std::vector<SegmentedBlock> injectNodesForNonTensorInputs(SegmentedBlock& seg_bl
 }

 void resolveNonTensorInputs(PartitionedGraph& segmented_blocks, std::shared_ptr<torch::jit::Graph> g) {
-  // for NonTensor inputs in TensorRT segments, count the usages on Torch segments and TensorRT segments
+  // Create a list so we can insert SegmentedBlocks without invalidating the iterators.
+  std::list<SegmentedBlock> segmented_blocks_list(segmented_blocks.begin(), segmented_blocks.end());
+  std::unordered_map<size_t, std::list<SegmentedBlock>::iterator> idx_to_iter;
+  auto iter = segmented_blocks_list.begin();
+  for (size_t i = 0; i < segmented_blocks.size(); ++i, ++iter) {
+    idx_to_iter[i] = iter;
+  }
+
+  // usage_counts maps each non-tensor input to the indices of the segmented blocks that use it. Iterate through
+  // the segmented blocks from bottom to top; when a non-tensor input is found in block "i", record it in
+  // usage_counts. Then, for each recorded non-tensor input, check whether an earlier segmented block (index i going
+  // from n-1 down to 0) generates/contains it; if so, set that index as produce_id, since that block produces the
+  // non-tensor input.
   std::unordered_map<torch::jit::Value*, usage_info> usage_counts;
   for (int i = segmented_blocks.size() - 1; i >= 0; --i) {
     for (auto input : segmented_blocks[i].raw_inputs()) {

@@ -127,36 +150,44 @@ void resolveNonTensorInputs(PartitionedGraph& segmented_blocks, std::shared_ptr<
           : usage_counts[input].tensorrt_use_id.push_back(i);
      }
    }
+
    for (auto& use : usage_counts) {
+      // Set produce_id to the index of the segmented block that contains/produces this non-tensor torch::jit::Value.
      if (segmented_blocks[i].contain_raw_value(use.first)) {
        use.second.produce_id = i;
      }
    }
  }
+
  std::unordered_set<int> updated_segments;
  for (auto& use : usage_counts) {
    auto use_info = use.second;
    // if the segment that produces this nonTensor value is kTensorRT but it is consumed in kTorch, inject nodes into
-    // the first kTorch segments
+    // the first kTorch segment.
    if (segmented_blocks[use_info.produce_id].target() == SegmentedBlock::kTensorRT && !use_info.torch_use_id.empty()) {
-      int first_torch_id = use_info.torch_use_id.front();
+      auto first_torch_id = use_info.torch_use_id.front();
      if (!updated_segments.count(first_torch_id)) {
-        auto new_torch_block = injectNodesForNonTensorInputs(segmented_blocks[first_torch_id]).front();
-        segmented_blocks[first_torch_id] = new_torch_block;
+        // Segmented blocks with non-tensor inputs have to be re-segmented, since
+        // TRTorch doesn't support non-tensor inputs to a module.
+        auto new_torch_block = segmentBlocksWithNonTensorInputs(segmented_blocks[first_torch_id]).front();
+        *idx_to_iter[first_torch_id] = new_torch_block;
        updated_segments.insert(first_torch_id);
      }
-    } else {
-      // KTensorRT segments always need to inject nodes for the nonTensor inputs
-      for (int i : use_info.tensorrt_use_id) {
-        if (!updated_segments.count(i)) {
-          auto to_inject_blocks = injectNodesForNonTensorInputs(segmented_blocks[i]);
-          segmented_blocks.erase(segmented_blocks.begin() + i);
-          segmented_blocks.insert(segmented_blocks.begin() + i, to_inject_blocks.begin(), to_inject_blocks.end());
-          updated_segments.insert(i);
-        }
+    }
+    // kTensorRT segments always need to inject nodes for their nonTensor inputs.
+    for (auto i : use_info.tensorrt_use_id) {
+      if (!updated_segments.count(i)) {
+        // Segmented blocks with non-tensor inputs have to be re-segmented, since
+        // TRTorch doesn't support non-tensor inputs to a module.
+        auto to_inject_blocks = segmentBlocksWithNonTensorInputs(segmented_blocks[i]);
+        auto next_iter = segmented_blocks_list.erase(idx_to_iter[i]);
+        segmented_blocks_list.insert(next_iter, to_inject_blocks.begin(), to_inject_blocks.end());
+        updated_segments.insert(i);
      }
    }
  }
+  segmented_blocks.clear();
+  segmented_blocks.insert(segmented_blocks.begin(), segmented_blocks_list.begin(), segmented_blocks_list.end());
  return;
 }
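
The move from mutating the std::vector in place to a std::list plus the idx_to_iter map is the key fix here: std::vector::erase/insert invalidates iterators and shifts indices at and after the modification point, whereas std::list::erase/insert leaves iterators to all other elements valid, so earlier blocks can still be addressed while re-segmented blocks are spliced in. A minimal sketch of that property (block ids are illustrative):

#include <cassert>
#include <iterator>
#include <list>
#include <vector>

int main() {
  std::list<int> blocks{0, 1, 2, 3};
  auto it0 = blocks.begin();               // iterator to block 0
  auto it2 = std::next(blocks.begin(), 2); // iterator to block 2

  // Replace block 2 with two re-segmented blocks, mirroring
  // segmented_blocks_list.erase(...) followed by insert(...).
  auto next_iter = blocks.erase(it2);
  blocks.insert(next_iter, {20, 21});

  assert(*it0 == 0); // it0 survives the erase/insert untouched
  assert((std::vector<int>(blocks.begin(), blocks.end()) ==
          std::vector<int>{0, 1, 20, 21, 3}));
  return 0;
}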
