
Commit cbe04cb

Merge branch 'master' into dyn_shapes

2 parents: 6d0b0f6 + e3b9929

128 files changed: +786 / -146 lines changed


core/conversion/conversionctx/BUILD

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ cc_library(
     deps = [
         "@tensorrt//:nvinfer",
         "//core/util:prelude",
+        "//core/ir",
     ] + select({
         ":use_pre_cxx11_abi": ["@libtorch_pre_cxx11_abi//:libtorch"],
         "//conditions:default": ["@libtorch//:libtorch"],

core/conversion/conversionctx/ConversionCtx.h

Lines changed: 2 additions & 9 deletions
@@ -9,28 +9,21 @@
 #include "torch/csrc/jit/ir/ir.h"

 #include <cuda_runtime.h>
+#include "core/ir/ir.h"
 #include "core/util/prelude.h"

 namespace torch_tensorrt {
 namespace core {
 namespace conversion {

-struct Device {
-  nvinfer1::DeviceType device_type;
-  int64_t gpu_id;
-  int64_t dla_core;
-  bool allow_gpu_fallback;
-  Device() : device_type(nvinfer1::DeviceType::kGPU), gpu_id(0), dla_core(0), allow_gpu_fallback(false) {}
-};
-
 struct BuilderSettings {
   std::set<nvinfer1::DataType> enabled_precisions = {};
   bool sparse_weights = false;
   bool disable_tf32 = false;
   bool refit = false;
   bool debug = false;
   bool truncate_long_and_double = false;
-  Device device;
+  ir::Device device;
   nvinfer1::EngineCapability capability = TRT_ENGINE_CAPABILITY_STANDARD;
   nvinfer1::IInt8Calibrator* calibrator = nullptr;
   uint64_t num_avg_timing_iters = 1;

core/conversion/converters/impl/element_wise.cpp

Lines changed: 13 additions & 22 deletions
@@ -166,11 +166,11 @@ auto element_wise_registrations TORCHTRT_UNUSED =
        [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
          // Should implement self - alpha * other
          auto self = args[0].ITensorOrFreeze(ctx);
-         auto scalar = args[2].unwrapToScalar().to<float>();
          auto other = args[1].ITensorOrFreeze(ctx);
+         auto scalar = args[2].unwrapToScalar();

-         if (1 != scalar) {
-           auto alphaTensor = tensor_to_const(ctx, torch::tensor({scalar}));
+         if (1 != scalar.to<float>()) {
+           auto alphaTensor = scalar_to_tensor(ctx, scalar);
            auto scaleLayer = add_elementwise(
                ctx,
                nvinfer1::ElementWiseOperation::kPROD,
@@ -214,11 +214,11 @@ auto element_wise_registrations TORCHTRT_UNUSED =
        [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
          // Should implement self - alpha * other
          auto self = args[0].ITensorOrFreeze(ctx);
-         auto scalar = args[2].unwrapToScalar().to<float>();
          auto other = args[1].ITensorOrFreeze(ctx);
+         auto scalar = args[2].unwrapToScalar();

-         if (1 != scalar) {
-           auto alphaTensor = tensor_to_const(ctx, torch::tensor({scalar}));
+         if (1 != scalar.to<float>()) {
+           auto alphaTensor = scalar_to_tensor(ctx, scalar);
            auto scaleLayer = add_elementwise(
                ctx,
                nvinfer1::ElementWiseOperation::kPROD,
@@ -351,8 +351,7 @@ auto element_wise_registrations TORCHTRT_UNUSED =
       {"aten::div.Scalar(Tensor self, Scalar other) -> (Tensor)",
        [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
          auto self = args[0].ITensorOrFreeze(ctx);
-         auto otherScalar = args[1].unwrapToScalar().to<float>();
-         auto other = tensor_to_const(ctx, torch::tensor({otherScalar}));
+         auto other = scalar_to_tensor(ctx, args[1].unwrapToScalar());
          auto div = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kDIV, self, other, util::node_info(n));
          TORCHTRT_CHECK(div, "Unable to create div layer from node: " << *n);

@@ -381,8 +380,7 @@ auto element_wise_registrations TORCHTRT_UNUSED =
       {"aten::div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)",
        [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
          auto self = args[0].ITensorOrFreeze(ctx);
-         auto otherScalar = args[1].unwrapToScalar().to<float>();
-         auto other = tensor_to_const(ctx, torch::tensor({otherScalar}));
+         auto other = scalar_to_tensor(ctx, args[1].unwrapToScalar());
          auto div = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kDIV, self, other, util::node_info(n));
          TORCHTRT_CHECK(div, "Unable to create div layer from node: " << *n);

@@ -481,18 +479,12 @@ auto element_wise_registrations TORCHTRT_UNUSED =
       {"aten::ne.Scalar(Tensor self, Scalar other) -> (Tensor)",
        [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
          auto self = args[0].ITensorOrFreeze(ctx);
-         auto scalar = args[1].unwrapToScalar();
-         nvinfer1::ITensor* scalar_tensor;
-         if (self->getType() == nvinfer1::DataType::kFLOAT || self->getType() == nvinfer1::DataType::kHALF) {
-           scalar_tensor = tensor_to_const(ctx, torch::tensor({scalar.to<float>()}));
-         } else {
-           scalar_tensor = tensor_to_const(ctx, torch::tensor({scalar.to<int>()}));
-         }
+         auto other = scalar_to_tensor(ctx, args[1].unwrapToScalar());
          auto equal = add_elementwise(
              ctx,
              nvinfer1::ElementWiseOperation::kEQUAL,
              self,
-             scalar_tensor,
+             other,
              util::node_info(n) + std::string("is_equal"));
          TORCHTRT_CHECK(equal, "Unable to create elementwise equal layer from node: " << *n);
          // XOR with ones negates and produces not_equal result
@@ -534,8 +526,7 @@ auto element_wise_registrations TORCHTRT_UNUSED =
       {"aten::pow.Tensor_Scalar(Tensor self, Scalar exponent) -> (Tensor)",
        [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
          auto self = args[0].ITensorOrFreeze(ctx);
-         auto exponentScalar = args[1].unwrapToScalar().to<float>();
-         auto exponent = tensor_to_const(ctx, torch::tensor({exponentScalar}));
+         auto exponent = scalar_to_tensor(ctx, args[1].unwrapToScalar());
          auto pow =
              add_elementwise(ctx, nvinfer1::ElementWiseOperation::kPOW, self, exponent, util::node_info(n));
          TORCHTRT_CHECK(pow, "Unable to create Power layer from node: " << *n);
@@ -681,9 +672,9 @@ auto element_wise_registrations TORCHTRT_UNUSED =
       {"aten::eq.Scalar(Tensor self, Scalar other) -> (Tensor)",
        [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
          auto self = args[0].ITensorOrFreeze(ctx);
-         auto otherScalar = args[1].unwrapToScalar().to<float>();
-         auto other = tensor_to_const(ctx, torch::tensor({otherScalar}));
+         auto other = scalar_to_tensor(ctx, args[1].unwrapToScalar());
          if (self->getType() == nvinfer1::DataType::kBOOL) {
+           auto otherScalar = args[1].unwrapToScalar().to<float>();
            if (otherScalar == 0 || otherScalar == 1) {
              LOG_DEBUG("Since input tensor is type bool, casting input tensor and scalar to int32");
              other = castITensor(ctx, other, nvinfer1::DataType::kINT32);
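Note: the converters above now route scalar constants through a shared scalar_to_tensor helper instead of repeating tensor_to_const(ctx, torch::tensor({scalar})). The helper's definition is not visible in the files shown here; a minimal sketch of what it could look like, assuming it simply dispatches on the scalar's own type, is:

// Hypothetical sketch only -- the real helper lives in the converter
// utilities and is not part of the hunks shown above. Integral scalars
// become int constants; everything else becomes a float constant.
nvinfer1::ITensor* scalar_to_tensor(ConversionCtx* ctx, const at::Scalar& s) {
  if (s.isIntegral(/*includeBool=*/false)) {
    return tensor_to_const(ctx, torch::tensor({s.to<int>()}));
  }
  return tensor_to_const(ctx, torch::tensor({s.to<float>()}));
}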

core/ir/ir.h

Lines changed: 8 additions & 0 deletions
@@ -17,6 +17,14 @@ enum class ShapeMode {
   kMAX,
 };

+struct Device {
+  nvinfer1::DeviceType device_type;
+  int64_t gpu_id;
+  int64_t dla_core;
+  bool allow_gpu_fallback;
+  Device() : device_type(nvinfer1::DeviceType::kGPU), gpu_id(0), dla_core(0), allow_gpu_fallback(false) {}
+};
+
 struct Input : torch::CustomClassHolder {
   Input(){};
   Input(

core/lowering/BUILD

Lines changed: 1 addition & 0 deletions
@@ -24,6 +24,7 @@ cc_library(
     deps = [
         "//core/lowering/passes",
         "//core/util:prelude",
+        "//core/ir",
     ] + select({
         ":use_pre_cxx11_abi": ["@libtorch_pre_cxx11_abi//:libtorch"],
         "//conditions:default": ["@libtorch//:libtorch"],

core/lowering/CMakeLists.txt

Lines changed: 4 additions & 1 deletion
@@ -15,6 +15,8 @@ set(HEADER_FILES
 target_sources(${lib_name}
     PRIVATE
         ${CXX_SRCS}
+    PUBLIC
+        $<TARGET_OBJECTS:core_ir>
         $<TARGET_OBJECTS:core_util>
 )

@@ -25,8 +27,9 @@ target_include_directories(${lib_name}

 target_link_libraries(${lib_name}
     PUBLIC
+        TensorRT::nvinfer
         torch
-    PRIVATE
+        core_ir
         core_util
 )

core/lowering/lowering.cpp

Lines changed: 7 additions & 2 deletions
@@ -26,7 +26,7 @@ void LowerBlock(torch::jit::Block* b) {
   DropUnusedNodes(b);
 }

-void LowerGraph(std::shared_ptr<torch::jit::Graph>& g, LowerInfo lower_info) {
+void LowerGraph(std::shared_ptr<torch::jit::Graph>& g, std::vector<torch::jit::IValue>& params, LowerInfo lower_info) {
   torch::jit::EliminateRedundantGuards(g);
   torch::jit::RemoveListMutation(g);
   torch::jit::RemoveTensorMutation(g);
@@ -70,6 +70,11 @@ void LowerGraph(std::shared_ptr<torch::jit::Graph>& g, LowerInfo lower_info) {
   passes::SiluToSigmoidMultipication(g);
   passes::RemoveSingleUse0DTensors(g);
   passes::RemoveUnnecessaryCasts(g);
+  passes::UnpackAndCastMaskedFill(g, lower_info.getGPUDeviceString());
+  passes::UnpackAndCastNumToTensor(g, lower_info.getGPUDeviceString());
+  passes::UnpackAndCastFull(g, lower_info.getGPUDeviceString());
+  passes::ReplaceScalarImplicit(g);
+  passes::RewriteInputsWithParams(g, params);
   LOG_GRAPH(*g);
 }

@@ -103,7 +108,7 @@ std::pair<std::shared_ptr<torch::jit::Graph>, std::vector<torch::jit::IValue>> L
   // In quantization aware trained (QAT) models, weights are passed through quantize and
   // dequantize nodes which should not be folded. So unfreeze_module is set to True for QAT models.
   LOG_GRAPH("Torch-TensorRT.TorchScript Graph Lowering");
-  lowering::LowerGraph(graph_and_ivalues.first, lower_info);
+  lowering::LowerGraph(graph_and_ivalues.first, graph_and_ivalues.second, lower_info);

   // Is this necessary?
   // lowering::LowerBlock(g->block());

core/lowering/lowering.h

Lines changed: 6 additions & 0 deletions
@@ -1,5 +1,6 @@
 #pragma once
 #include <memory>
+#include "core/ir/ir.h"
 #include "torch/csrc/jit/ir/ir.h"

 namespace torch_tensorrt {
@@ -15,8 +16,13 @@ struct LowerInfo {
   // Since these QDQ nodes will be identical as they share same input, one of them is eliminated due to CSE lowering
   // pass. Disable this in order to not disturb TensorRT's QAT optimizations.
   bool disable_cse = false;
+  ir::Device target_device;
   std::vector<std::string> forced_fallback_modules;
   friend std::ostream& operator<<(std::ostream& os, const LowerInfo& l);
+
+  std::string getGPUDeviceString() {
+    return "cuda:" + std::to_string(target_device.gpu_id);
+  };
 };

 void LowerBlock(torch::jit::Block* b);
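For illustration only (not part of this diff): with the default ir::Device, getGPUDeviceString() produces the device string that the new device-casting passes splice into their replacement IR patterns. Assuming g is the std::shared_ptr<torch::jit::Graph> being lowered:

LowerInfo lower_info;                        // target_device defaults to gpu_id = 0
auto dev = lower_info.getGPUDeviceString();  // yields "cuda:0"
passes::UnpackAndCastNumToTensor(g, dev);    // "cuda:0" becomes the Device constant in the rewrite (see device_casting.cpp below)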

core/lowering/passes/BUILD

Lines changed: 2 additions & 0 deletions
@@ -14,6 +14,7 @@ cc_library(
     name = "passes",
     srcs = [
         "convNd_to_convolution.cpp",
+        "device_casting.cpp",
         "exception_elimination.cpp",
         "fuse_addmm_branches.cpp",
         "linear_to_addmm.cpp",
@@ -27,6 +28,7 @@ cc_library(
         "remove_dropout.cpp",
         "remove_nops.cpp",
         "remove_unnecessary_casts.cpp",
+        "rewrite_inputs_with_params.cpp",
         "silu_to_sigmoid_multiplication.cpp",
         "unpack_addmm.cpp",
         "unpack_batch_norm.cpp",

core/lowering/passes/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -1,5 +1,6 @@
 target_sources(${lib_name}
     PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/convNd_to_convolution.cpp"
+            "${CMAKE_CURRENT_SOURCE_DIR}/device_casting.cpp"
             "${CMAKE_CURRENT_SOURCE_DIR}/exception_elimination.cpp"
             "${CMAKE_CURRENT_SOURCE_DIR}/fuse_addmm_branches.cpp"
             "${CMAKE_CURRENT_SOURCE_DIR}/linear_to_addmm.cpp"
@@ -24,6 +25,7 @@ target_sources(${lib_name}
             "${CMAKE_CURRENT_SOURCE_DIR}/unpack_std.cpp"
             "${CMAKE_CURRENT_SOURCE_DIR}/unpack_var.cpp"
             "${CMAKE_CURRENT_SOURCE_DIR}/view_to_reshape.cpp"
+            "${CMAKE_CURRENT_SOURCE_DIR}/rewrite_inputs_with_params.cpp"
 )

 set(HEADER_FILES
core/lowering/passes/device_casting.cpp

Lines changed: 121 additions & 0 deletions
@@ -0,0 +1,121 @@
+#include "torch/csrc/jit/ir/constants.h"
+#include "torch/csrc/jit/passes/subgraph_rewrite.h"
+
+#include "core/util/prelude.h"
+
+namespace torch_tensorrt {
+namespace core {
+namespace lowering {
+namespace passes {
+
+void UnpackAndCastMaskedFill(std::shared_ptr<torch::jit::Graph>& graph, std::string target_device_name) {
+  std::string masked_fill_pattern = R"IR(
+    graph(%self, %mask, %value):
+      %out: Tensor = aten::masked_fill_(%self, %mask, %value)
+      return (%out))IR";
+
+  // Calls to masked_fill_ often utilize CPU tensors, and as such
+  // should be moved to gpu to avoid device mismatch errors
+
+  // Separate string into portions to insert device name
+  std::string clean_pattern_part_1 = R"IR(
+    graph(%self, %mask, %value):
+      %device: Device = prim::Constant[value=")IR";
+
+  std::string clean_pattern_part_2 = R"IR("]()
+      %dtype: NoneType = prim::Constant()
+      %false: bool = prim::Constant[value=0]()
+      %mask_cuda: Tensor = aten::to(%mask, %device, %dtype, %false, %false)
+      %self_cuda: Tensor = aten::to(%self, %device, %dtype, %false, %false)
+      %out: Tensor = aten::masked_fill(%self_cuda, %mask_cuda, %value)
+      return (%out))IR";
+
+  auto unpacked_pattern = clean_pattern_part_1 + target_device_name + clean_pattern_part_2;
+
+  torch::jit::SubgraphRewriter masked_fill_rewriter;
+  masked_fill_rewriter.RegisterRewritePattern(masked_fill_pattern, unpacked_pattern);
+  masked_fill_rewriter.runOnGraph(graph);
+  LOG_GRAPH("After unpack and cast masked_fill_: " << *graph);
+}
+
+void UnpackAndCastNumToTensor(std::shared_ptr<torch::jit::Graph>& graph, std::string target_device_name) {
+  std::string num_to_tensor_cast_pattern = R"IR(
+    graph(%1: Scalar):
+      %2: Tensor = prim::NumToTensor(%1)
+      return (%2))IR";
+
+  // 0D Tensors are initialized on cpu, and need to be moved to gpu
+  // to avoid device mismatch issues
+
+  // Separate string into portions to insert device name
+  std::string clean_pattern_part_1 = R"IR(
+    graph(%1: Scalar):
+      %2: Tensor = prim::NumToTensor(%1)
+      %device: Device = prim::Constant[value=")IR";
+
+  std::string clean_pattern_part_2 = R"IR("]()
+      %dtype: NoneType = prim::Constant()
+      %false: bool = prim::Constant[value=0]()
+      %3: Tensor = aten::to(%2, %device, %dtype, %false, %false)
+      return (%3))IR";
+
+  auto num_to_tensor_clean_pattern = clean_pattern_part_1 + target_device_name + clean_pattern_part_2;
+
+  torch::jit::SubgraphRewriter num_to_tensor_cast_rewriter;
+  num_to_tensor_cast_rewriter.RegisterRewritePattern(num_to_tensor_cast_pattern, num_to_tensor_clean_pattern);
+  num_to_tensor_cast_rewriter.runOnGraph(graph);
+
+  LOG_GRAPH("After unpack and cast NumToTensor: " << *graph);
+}
+
+void UnpackAndCastFull(std::shared_ptr<torch::jit::Graph>& graph, std::string target_device_name) {
+  std::string full_cast_pattern = R"IR(
+    graph(%1, %2, %3, %4, %5, %6):
+      %out: Tensor = aten::full(%1, %2, %3, %4, %5, %6)
+      return (%out))IR";
+
+  // Tensors created via aten::full are initialized on cpu, and need to be casted to gpu
+  // to avoid device mismatch issues
+
+  // Separate string into portions to insert device name
+  std::string clean_pattern_part_1 = R"IR(
+    graph(%1, %2, %3, %4, %5, %6):
+      %device: Device = prim::Constant[value=")IR";
+
+  std::string clean_pattern_part_2 = R"IR("]()
+      %out: Tensor = aten::full(%1, %2, %3, %4, %device, %6)
+      return (%out))IR";
+
+  auto full_clean_pattern = clean_pattern_part_1 + target_device_name + clean_pattern_part_2;
+
+  torch::jit::SubgraphRewriter full_cast_rewriter;
+  full_cast_rewriter.RegisterRewritePattern(full_cast_pattern, full_clean_pattern);
+  full_cast_rewriter.runOnGraph(graph);
+
+  LOG_GRAPH("After unpack and cast full: " << *graph);
+}
+
+void ReplaceScalarImplicit(std::shared_ptr<torch::jit::Graph>& graph) {
+  std::string scalar_implicit_cast_pattern = R"IR(
+    graph(%1: Tensor):
+      %2: Scalar = aten::ScalarImplicit(%1)
+      return (%2))IR";
+
+  // ScalarImplicit can only unpack 0D tensors, whereas Tensors operated on by
+  // TensorRT are padded to 1 dimension. aten::item() resolves this conflict
+  std::string scalar_implicit_clean_pattern = R"IR(
+    graph(%1: Tensor):
+      %2: Scalar = aten::item(%1)
+      return (%2))IR";
+
+  torch::jit::SubgraphRewriter scalar_implicit_cast_rewriter;
+  scalar_implicit_cast_rewriter.RegisterRewritePattern(scalar_implicit_cast_pattern, scalar_implicit_clean_pattern);
+  scalar_implicit_cast_rewriter.runOnGraph(graph);
+
+  LOG_GRAPH("After unpack and cast full: " << *graph);
+}
+
+} // namespace passes
+} // namespace lowering
+} // namespace core
+} // namespace torch_tensorrt
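To make the string splicing above concrete: for a target device string of "cuda:0" (what LowerInfo::getGPUDeviceString() returns when gpu_id is 0), the concatenated replacement pattern used by UnpackAndCastNumToTensor resolves to the following, shown here purely as an illustration:

// Result of clean_pattern_part_1 + "cuda:0" + clean_pattern_part_2 above.
std::string num_to_tensor_clean_pattern = R"IR(
    graph(%1: Scalar):
      %2: Tensor = prim::NumToTensor(%1)
      %device: Device = prim::Constant[value="cuda:0"]()
      %dtype: NoneType = prim::Constant()
      %false: bool = prim::Constant[value=0]()
      %3: Tensor = aten::to(%2, %device, %dtype, %false, %false)
      return (%3))IR";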

core/lowering/passes/passes.h

Lines changed: 5 additions & 0 deletions
@@ -39,7 +39,12 @@ void UnpackVar(std::shared_ptr<torch::jit::Graph>& graph);
 void AliasOperators(std::shared_ptr<torch::jit::Graph>& graph);
 void SiluToSigmoidMultipication(std::shared_ptr<torch::jit::Graph>& graph);
 void UnpackHardSwish(std::shared_ptr<torch::jit::Graph>& graph);
+void RewriteInputsWithParams(std::shared_ptr<torch::jit::Graph>& g, std::vector<torch::jit::IValue>& params);
 void UnpackHardSigmoid(std::shared_ptr<torch::jit::Graph>& graph);
+void UnpackAndCastMaskedFill(std::shared_ptr<torch::jit::Graph>& graph, std::string target_device_name);
+void UnpackAndCastNumToTensor(std::shared_ptr<torch::jit::Graph>& graph, std::string target_device_name);
+void UnpackAndCastFull(std::shared_ptr<torch::jit::Graph>& graph, std::string target_device_name);
+void ReplaceScalarImplicit(std::shared_ptr<torch::jit::Graph>& graph);

 } // namespace passes
 } // namespace lowering
