From 7393fa878c1904fb92f01b0da7255e5ef9053dce Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Sat, 23 Jul 2022 16:34:56 -0700 Subject: [PATCH 01/16] feat: support for grouped inputs Signed-off-by: Naren Dasan --- core/compiler.cpp | 120 +++--- core/compiler.h | 6 +- core/conversion/conversion.cpp | 26 +- core/conversion/conversion.h | 3 + core/conversion/evaluators/aten.cpp | 15 - core/ir/BUILD | 3 +- core/ir/GraphInputs.cpp | 76 ++++ core/ir/StaticParams.cpp | 5 +- core/ir/ir.cpp | 93 ++++- core/ir/ir.h | 32 +- core/lowering/lowering.cpp | 1 - core/partitioning/partitioning.cpp | 48 ++- core/partitioning/shape_analysis.cpp | 62 ++- core/partitioning/shape_analysis.h | 5 +- cpp/include/torch_tensorrt/torch_tensorrt.h | 32 +- cpp/src/compile_spec.cpp | 55 ++- cpp/src/torch_tensorrt.cpp | 3 + .../csrc/register_tensorrt_classes.cpp | 8 + py/torch_tensorrt/csrc/tensorrt_classes.cpp | 84 +++- py/torch_tensorrt/csrc/tensorrt_classes.h | 11 + py/torch_tensorrt/csrc/torch_tensorrt_py.cpp | 11 + py/torch_tensorrt/ts/_compile_spec.py | 64 ++- py/torch_tensorrt/ts/_compiler.py | 2 + .../test_resolve_nontensor_inputs.cpp | 16 +- .../core/partitioning/test_shape_analysis.cpp | 16 +- tests/cpp/BUILD | 20 +- tests/cpp/test_collection.cpp | 363 ++++++++++++++++++ tests/modules/custom_models.py | 61 +++ tests/modules/hub.py | 28 +- tests/modules/requirements.txt | 1 + tests/py/api/test_collections.py | 147 +++++++ tests/py/model_test_case.py | 2 + tests/py/requirements.txt | 3 +- 33 files changed, 1257 insertions(+), 165 deletions(-) create mode 100644 core/ir/GraphInputs.cpp create mode 100644 tests/cpp/test_collection.cpp create mode 100644 tests/py/api/test_collections.py diff --git a/core/compiler.cpp b/core/compiler.cpp index fc1cc66aee..214443a9c6 100644 --- a/core/compiler.cpp +++ b/core/compiler.cpp @@ -256,6 +256,7 @@ GraphAndMapping ConstructFallbackGraph( // update the input ranges for each segments convert_cfg.inputs = ir::associate_specs_with_inputs(seg_block.g(), inputs, static_params); + // TODO mapping Inputs Ivalue to flatten one here auto engine = conversion::ConvertBlockToEngine(seg_block.block(), convert_cfg, static_params); auto temp_g = std::make_shared(); auto device_spec = convert_cfg.engine_settings.device; @@ -306,57 +307,72 @@ void MapInputsAndDetermineDTypes( CompileSpec& cfg, std::shared_ptr& g, ir::StaticParams& static_params, - ir::TypeMap& first_use_type_map) { - // Associate input specs with inputs - cfg.convert_info.inputs = std::move(ir::associate_specs_with_inputs(g, cfg.inputs, static_params)); - - for (auto& in : g->inputs()) { - if (static_params.find(in) == static_params.end()) { - ir::Input& spec = cfg.convert_info.inputs.find(in)->second; - auto est_type_opt = first_use_type_map.find(in)->second; - if (est_type_opt && !spec.dtype_is_user_defined) { - // If we can calculate the type from the graph and the type was not defined by the user then use the calculated - // type - LOG_INFO( - "Since input type is not explicitly defined, infering using first tensor calculation\n Found input " - << in->debugName() << " has type " << est_type_opt.value() - << ". If this is incorrect explicitly set dtype for input and file a bug"); - spec.dtype = util::ScalarTypeToTRTDataType(est_type_opt.value()); - } else if (!est_type_opt && !spec.dtype_is_user_defined) { - // If we cannot calculate the type and the user did not define the type, then default to FP32 - LOG_WARNING( - "Cannot infer input type from calcuations in graph for input " - << in->debugName() << ". 
Assuming it is Float32. If not, specify input type explicity");
-        spec.dtype = nvinfer1::DataType::kFLOAT;
-      } else if (spec.dtype_is_user_defined && cfg.partition_info.enabled) {
-        if (!est_type_opt) {
-          LOG_INFO("Cannot infer input tensor dtype in graph. Using user provided input dtype settings");
-          first_use_type_map[in] = {util::TRTDataTypeToScalarType(cfg.convert_info.inputs.find(in)->second.dtype)};
-        } else {
-          if (util::TRTDataTypeToScalarType(cfg.convert_info.inputs.find(in)->second.dtype) != est_type_opt.value()) {
+    ir::CollectionTypeMap& first_use_type_map) {
+  cfg.convert_info.collection_input_spec_map = std::move(ir::associate_specs_with_collection_inputs(g, cfg.graph_inputs, static_params));
+
+  auto collection_inputs = ir::get_collection_inputs(g, static_params);
+  LOG_DEBUG("In MapInputsAndDetermineDTypes, the g->inputs() size is " << g->inputs().size() << ", CollectionInputSpecMap size is " << collection_inputs.size());
+
+  for (auto in : collection_inputs) {
+    std::vector<ir::Input>& spec = cfg.convert_info.collection_input_spec_map.find(in)->second;
+    std::vector<c10::optional<at::ScalarType>> est_type_opt;
+
+    auto est_it = first_use_type_map.find(in);
+    if (est_it != first_use_type_map.end()) {
+      est_type_opt = first_use_type_map.find(in)->second;
+    }
+    // traverse elements in est_type_opt and spec
+    for (int i = 0; i < est_type_opt.size(); i++) {
+      if (est_type_opt[i] && !spec[i].dtype_is_user_defined) {
+        // If we can calculate the type from the graph and the type was not defined by the user then use the calculated
+        // type
+        LOG_INFO(
+            "Since input type is not explicitly defined, inferring using first tensor calculation\n Inferred input "
+            << in->debugName() << " has type " << est_type_opt[i].value());
+        spec[i].dtype = util::ScalarTypeToTRTDataType(est_type_opt[i].value());
+      } else if (!est_type_opt[i] && !spec[i].dtype_is_user_defined) {
+        // If we cannot calculate the type and the user did not define the type, then default to FP32
+        LOG_WARNING(
+            "Cannot infer input type from calculations in graph for input "
+                << in->debugName() << ". Assuming it is Float32. If not, specify input type explicitly");
+        spec[i].dtype = nvinfer1::DataType::kFLOAT;
+      } else if (spec[i].dtype_is_user_defined && cfg.partition_info.enabled) {
+        if (!est_type_opt[i]) {
+          LOG_INFO("Cannot infer input tensor dtype in graph, compiler is going to use the user setting");
           std::stringstream ss;
           ss << "For input " << in->debugName() << ", found user specified input dtype as ";
-          ss << cfg.convert_info.inputs.find(in)->second.dtype;
-          ss << ", however when inspecting the graph, the input type expected was inferred to be ";
-          ss << est_type_opt.value() << std::endl;
-          ss << "The compiler is going to use the user setting " << cfg.convert_info.inputs.find(in)->second.dtype;
-          ss << "\nThis conflict may cause an error at runtime due to partial compilation being enabled and therefore\n";
-          ss << "compatibility with PyTorch's data type convention is required.\n";
-          ss << "If you do indeed see errors at runtime either:\n";
-          ss << "- Remove the dtype spec for " << in->debugName() << std::endl;
-          ss << "- Disable partial compilation by setting require_full_compilation to True";
+          ss << cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype;
+          ss << ". 
The compiler is going to use the user setting " << cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype; auto warn_str = ss.str(); LOG_WARNING(warn_str); + // Overwrite type map with user settings + first_use_type_map[in][i] = {util::TRTDataTypeToScalarType(cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype)}; + + } else { + if (util::TRTDataTypeToScalarType(cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype) != est_type_opt[i].value()) { + std::stringstream ss; + ss << "For input " << in->debugName() << ", found user specified input dtype as "; + ss << cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype; + ss << ", however when inspecting the graph, the input type expected was inferred to be "; + ss << est_type_opt[i].value() << std::endl; + ss << "The compiler is going to use the user setting " << cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype; + ss << "\nThis conflict may cause an error at runtime due to partial compilation being enabled and therefore\n"; + ss << "compatibility with PyTorch's data type convention is required.\n"; + ss << "If you do indeed see errors at runtime either:\n"; + ss << "- Remove the dtype spec for " << in->debugName() << std::endl; + ss << "- Disable partial compilation by setting require_full_compilation to True"; + auto warn_str = ss.str(); + LOG_WARNING(warn_str); + // Overwrite type map with user settings + first_use_type_map[in][i] = {util::TRTDataTypeToScalarType(cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype)}; + } } - // Overwrite type map with user settings - // We use this map for partitiioning since we need c10::ScalarTypes not nvinfer::DataTypes - first_use_type_map[in] = {util::TRTDataTypeToScalarType(cfg.convert_info.inputs.find(in)->second.dtype)}; + } else { + // The user defined the type so no changes are necessary } - } else { - // The user defined the type so no changes are necessary } } - } + // } } std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::string method_name, CompileSpec cfg) { @@ -370,7 +386,8 @@ std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std:: auto params = graph_and_parameters.second; auto static_params = ir::get_static_params(g->inputs(), params); // Infer the type of an input from the weights of the calculation - auto first_use_types = ir::get_block_first_calc_dtypes_opt(g->block()); + // auto first_use_types = ir::get_block_first_calc_dtypes_opt(g->block()); + auto first_use_types = ir::get_block_first_calc_dtypes_opt_collection(g->block()); MapInputsAndDetermineDTypes(cfg, g, static_params, first_use_types); @@ -395,10 +412,11 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) auto params = graph_and_parameters.second; auto static_params = ir::get_static_params(g->inputs(), params); // Infer the type of an input from the weights of the calculation - auto first_use_types = ir::get_block_first_calc_dtypes_opt(g->block()); + auto first_use_types = ir::get_block_first_calc_dtypes_opt_collection(g->block()); MapInputsAndDetermineDTypes(cfg, g, static_params, first_use_types); auto isBlockConvertible = conversion::VerifyConverterSupportForBlock(g->block(), true); + auto outputIsCollection = conversion::OutputIsCollection(g->block()); if (cfg.partition_info.enabled && (cfg.lower_info.forced_fallback_modules.size() == 0 && cfg.partition_info.forced_fallback_operators.size() == 0 && isBlockConvertible)) { @@ -406,12 +424,13 @@ 
torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) } if (cfg.partition_info.enabled && - !(cfg.lower_info.forced_fallback_modules.size() == 0 && - cfg.partition_info.forced_fallback_operators.size() == 0 && isBlockConvertible)) { - auto input_ivalues_map = partitioning::generateRandomInputs(cfg.convert_info.inputs, first_use_types); + (!(cfg.lower_info.forced_fallback_modules.size() == 0 && + cfg.partition_info.forced_fallback_operators.size() == 0 && isBlockConvertible) + || outputIsCollection)) { + std::unordered_map fallback_nodes; - auto graph_and_mapping = - ConstructFallbackGraph(new_mod, g->block(), input_ivalues_map, cfg, static_params, fallback_nodes); + auto collection_input_ivalues_map = partitioning::generateRandomInputs(cfg.convert_info.collection_input_spec_map, first_use_types); + auto graph_and_mapping = ConstructFallbackGraph(new_mod, g->block(), collection_input_ivalues_map, cfg, static_params, fallback_nodes); new_g = graph_and_mapping.first; // renaming the input name of graph after fallback to ensure pytorch deserialize it correctly for (size_t i = 0; i < new_g->inputs().size(); ++i) { @@ -429,6 +448,7 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) TORCHTRT_CHECK( conversion::VerifyConverterSupportForBlock(g->block()), "Not all operations in graph are supported by the compiler"); + // TODO find the right auto engine = conversion::ConvertBlockToEngine(g->block(), cfg.convert_info, static_params); AddEngineToGraph(new_mod, new_g, engine, cuda_device); } diff --git a/core/compiler.h b/core/compiler.h index c1bb85aa3b..c8dc85020b 100644 --- a/core/compiler.h +++ b/core/compiler.h @@ -8,13 +8,15 @@ #include "core/partitioning/partitioning.h" #include "core/runtime/runtime.h" #include "torch/csrc/jit/api/module.h" +#include "torch/csrc/jit/ir/ir.h" namespace torch_tensorrt { namespace core { struct CompileSpec { - CompileSpec(std::vector inputs) : inputs(inputs) {} - std::vector inputs; + CompileSpec(std::vector inputs) : graph_inputs(inputs) {} + CompileSpec(torch::jit::IValue& input_signature) : graph_inputs(input_signature) {} + ir::GraphInputs graph_inputs; conversion::ConversionInfo convert_info; lowering::LowerInfo lower_info; partitioning::PartitionInfo partition_info; diff --git a/core/conversion/conversion.cpp b/core/conversion/conversion.cpp index 3211e7dd98..914f1ddb9d 100644 --- a/core/conversion/conversion.cpp +++ b/core/conversion/conversion.cpp @@ -138,7 +138,10 @@ void AddLayer(ConversionCtx* ctx, const torch::jit::Node* n) { void AddInputs( ConversionCtx* ctx, c10::ArrayRef inputs, - std::unordered_map& input_specs) { + ConversionInfo& conversion_info) { + std::unordered_map& input_specs = conversion_info.inputs; + std::unordered_map> collection_input_spec = conversion_info.collection_input_spec_map; + std::vector input_tensors; for (auto in : inputs) { // Disregarding inputs that are not tensors @@ -166,9 +169,15 @@ void AddInputs( for (auto input : input_tensors) { const torch::jit::Value* in = input; TORCHTRT_CHECK( - input_specs.find(in) != input_specs.end(), + input_specs.find(in) != input_specs.end() || collection_input_spec.find(in) != collection_input_spec.end(), "Cannot find an input spec associated with input: " << in->debugName()); - ir::Input& spec = input_specs.find(in)->second; + ir::Input spec; + if (input_specs.find(in) != input_specs.end()) { + spec = input_specs.find(in)->second; + } else { + spec = collection_input_spec.find(in)->second[0]; // assume input is tensor + } + // 
ir::Input& spec = input_specs.find(in)->second; std::string name = std::string("input_") + std::to_string(ctx->num_inputs); LOG_INFO( @@ -408,7 +417,7 @@ void ConvertBlockToNetDef( auto inputs = b->inputs(); AddParamsToCtxValueMap(ctx, static_params); - AddInputs(ctx, inputs, build_info.inputs); + AddInputs(ctx, inputs, build_info); auto nodes = b->nodes(); @@ -549,6 +558,15 @@ std::set ConvertableOpsInBlock(const torch::jit::Block* b) { return convertable_ops; } +bool OutputIsCollection(const torch::jit::Block* b) { + for (auto out: b->outputs()) { + if(out->type()->kind() == torch::jit::TypeKind::TupleType || out->type()->kind() == torch::jit::TypeKind::ListType) { + return true; + } + } + return false; +} + bool VerifyConverterSupportForBlock(const torch::jit::Block* b, bool suppress_errors) { auto unsupported_ops = GetUnsupportedOpsInBlock(b); if (unsupported_ops.size() != 0) { diff --git a/core/conversion/conversion.h b/core/conversion/conversion.h index 58c06b42a3..a578c4288e 100644 --- a/core/conversion/conversion.h +++ b/core/conversion/conversion.h @@ -13,6 +13,7 @@ namespace conversion { struct ConversionInfo { ir::InputSpecMap inputs; + ir::CollectionInputSpecMap collection_input_spec_map; BuilderSettings engine_settings; }; @@ -25,6 +26,8 @@ std::string ConvertBlockToEngine( bool OpSupported(const torch::jit::Node* n); +bool OutputIsCollection(const torch::jit::Block* b); + bool VerifyConverterSupportForBlock(const torch::jit::Block* b, bool suppress_errors = false); c10::optional EvaluateNode( diff --git a/core/conversion/evaluators/aten.cpp b/core/conversion/evaluators/aten.cpp index 4632744790..7bb1f6d202 100644 --- a/core/conversion/evaluators/aten.cpp +++ b/core/conversion/evaluators/aten.cpp @@ -264,21 +264,6 @@ auto aten_registrations TORCHTRT_UNUSED = }, EvalOptions().validSchemas( {"aten::size(Tensor self) -> (int[])", "aten::size.int(Tensor self, int dim) -> (int)"})}) - .evaluator({c10::Symbol::fromQualString("aten::__getitem__"), - [](const torch::jit::Node* n, kwargs& args) -> c10::optional { - auto list = args.at(n->input(0)).IValue()->to>(); - auto idx = args.at(n->input(1)).unwrapToInt(); - - const int64_t list_size = list.size(); - const int64_t normalized_idx = normalizeIndex(idx, list_size); - TORCHTRT_CHECK( - normalized_idx >= 0 || normalized_idx < list_size, - "List index out of range (aten::__getitem__)"); - return list.get(normalized_idx); - }, - EvalOptions().validSchemas({ - "aten::__getitem__.t(t[](a) list, int idx) -> (t(*))", - })}) .evaluator({c10::Symbol::fromQualString("aten::append"), [](const torch::jit::Node* n, kwargs& args) -> c10::optional { auto list = args.at(n->input(0)).IValue()->to>(); diff --git a/core/ir/BUILD b/core/ir/BUILD index a613aaf489..2e9ef7e6a8 100644 --- a/core/ir/BUILD +++ b/core/ir/BUILD @@ -15,7 +15,8 @@ cc_library( srcs = [ "ir.cpp", "Input.cpp", - "StaticParams.cpp" + "StaticParams.cpp", + "GraphInputs.cpp" ], deps = [ "@tensorrt//:nvinfer", diff --git a/core/ir/GraphInputs.cpp b/core/ir/GraphInputs.cpp new file mode 100644 index 0000000000..792189137a --- /dev/null +++ b/core/ir/GraphInputs.cpp @@ -0,0 +1,76 @@ +#include "core/ir/ir.h" +#include "core/util/prelude.h" + +namespace torch_tensorrt { +namespace core { +namespace ir { + +void flatten_dfs(std::vector& flattened_inputs, std::vector>& collection_inputs, + torch::jit::IValue input_ivalue, int level, int index) { + if (input_ivalue.isTuple()) { + auto input_tuple = input_ivalue.toTuple(); + int idx = 0; + if (level == 0) { + 
collection_inputs.resize(input_tuple->elements().size()); + } + for (auto item: input_tuple->elements()) { + torch::jit::IValue converted_item; + int cur_idx = level < 1 ? idx: index; + flatten_dfs(flattened_inputs, collection_inputs, item, level+1, cur_idx); + idx++; + } + } else if(input_ivalue.isList()) { + auto input_list = input_ivalue.toList().vec(); + if (level == 0) { + collection_inputs.resize(input_list.size()); + } + c10::TypePtr type = input_list[0].type(); + auto converted_elements = c10::impl::GenericList(type); + int idx = 0; + for (auto item: input_list) { + int cur_idx = level < 1 ? idx: index; + flatten_dfs(flattened_inputs, collection_inputs, item, level+1, cur_idx); + idx++; + } + } else if(input_ivalue.isCustomClass()) { + torch_tensorrt::core::ir::Input cur_input = *(input_ivalue.toCustomClass()); + flattened_inputs.push_back(cur_input); + if (level == 0) { // a single value like A + collection_inputs.resize(1); + collection_inputs[0].push_back(cur_input); + } else if (level == 1) { // like A in [A, A] or [(B, B), A] + collection_inputs[index].push_back(cur_input); + } else if (level == 2) { // like A in [(A, A), C] + collection_inputs[index].push_back(cur_input); + } else {// only support 2 level + LOG_ERROR("Input nesting depth exceeds currently supported depth (3), use 1 level: [A, B], or 2 level: [A, (B, C)]"); + } + } +} + + +GraphInputs::GraphInputs(std::vector inputs_) { + LOG_DEBUG("Construct GraphInput with ir::Input"); + inputs = inputs_; + collection_inputs.resize(inputs_.size()); + for (int i = 0; i < inputs_.size(); i++) { + collection_inputs[i].push_back(inputs_[i]); + } +} + +GraphInputs::GraphInputs(torch::jit::IValue& input_signature_) { + LOG_DEBUG("Construct GraphInput with IValue"); + + std::vector flattened_inputs; + std::vector> collection_inputs_; + + flatten_dfs(flattened_inputs, collection_inputs_, input_signature_, 0, 0); + inputs = flattened_inputs; + input_signature = input_signature_; + collection_inputs = collection_inputs_; + LOG_DEBUG("Collection Input Size: " << collection_inputs_.size()); +} + +} // namespace ir +} // namespace core +} // namespace torch_tensorrt \ No newline at end of file diff --git a/core/ir/StaticParams.cpp b/core/ir/StaticParams.cpp index ac16c72d9f..0073ad2888 100644 --- a/core/ir/StaticParams.cpp +++ b/core/ir/StaticParams.cpp @@ -11,7 +11,10 @@ StaticParams get_static_params(c10::ArrayRef inputs, std::ve StaticParams static_params; auto param_it = params.begin(); for (auto in : inputs) { - if (in->type() != c10::TensorType::get() && param_it != params.end()) { + // handle TensorType, TupleType and ListType + if (in->type() != c10::TensorType::get() && + in->type()->kind() != torch::jit::TypeKind::TupleType && + in->type()->kind() != torch::jit::TypeKind::ListType && param_it != params.end()) { static_params[in] = *param_it; ++param_it; } diff --git a/core/ir/ir.cpp b/core/ir/ir.cpp index fcca3df33c..cc82fe09b4 100644 --- a/core/ir/ir.cpp +++ b/core/ir/ir.cpp @@ -13,6 +13,14 @@ InputSpecMap associate_specs_with_inputs( return pair_input_vals_with_specs(tensor_inputs, specs); } +CollectionInputSpecMap associate_specs_with_collection_inputs( + std::shared_ptr& g, + ir::GraphInputs graph_inputs, + StaticParams& static_params) { + auto tensor_inputs = get_collection_inputs(g, static_params); + return pair_input_vals_with_specs_collection(tensor_inputs, graph_inputs.collection_inputs); +} + InputSpecMap pair_input_vals_with_specs(std::vector vals, std::vector specs) { TORCHTRT_CHECK( vals.size() == specs.size(), @@ 
-27,12 +35,28 @@ InputSpecMap pair_input_vals_with_specs(std::vector va return a; } +CollectionInputSpecMap pair_input_vals_with_specs_collection(std::vector vals, std::vector>& specs) { + TORCHTRT_CHECK( + vals.size() == specs.size(), + "Expected dimension specifications for all input tensors" + << ", but found " << vals.size() << " input tensors and " << specs.size() << " dimension specs"); + + CollectionInputSpecMap a; + for (size_t i = 0; i < vals.size(); i++) { + LOG_DEBUG("Paring " << i << ": " << vals[i]->debugName() << " : " << specs[i]); + a.insert({vals[i], specs[i]}); + } + return a; +} + std::vector get_tensor_inputs( std::shared_ptr& g, StaticParams& static_params) { std::vector input_tensors; auto inputs = g->inputs(); + LOG_DEBUG("Raw inputs size of get_tensor_inputs: " << inputs.size()); for (auto in : inputs) { + LOG_DEBUG("Handle input of debug name: " << in->debugName()); // Disregarding inputs that are not tensors or are static // // Ex. @@ -40,6 +64,29 @@ std::vector get_tensor_inputs( // input.1:Tensor -> used if (in->type()->isSubtypeOf(c10::TensorType::get()) && static_params.find(in) == static_params.end()) { input_tensors.push_back(in); + } + } + return input_tensors; +} + +std::vector get_collection_inputs( + std::shared_ptr& g, + StaticParams& static_params) { + std::vector input_tensors; + auto inputs = g->inputs(); + LOG_DEBUG("Raw inputs size of get_collection_inputs: " << inputs.size()); + for (auto in : inputs) { + LOG_DEBUG("Handle input of debug name: " << in->debugName()); + if (in->type()->isSubtypeOf(c10::TensorType::get()) && static_params.find(in) == static_params.end()) { + input_tensors.push_back(in); + } else if (in->type()->kind() == torch::jit::TypeKind::TupleType && static_params.find(in) == static_params.end()) { + // } else if (in->type()->isSubtypeOf(c10::TupleType::create()) && static_params.find(in) == static_params.end()) { + input_tensors.push_back(in); // push original tuple + at::ArrayRef unpack_tuple = torch::jit::createTupleUnpack(in); + LOG_DEBUG("get_collection_inputs, tuple size " << unpack_tuple.size()); + } else if (in->type()->kind() == torch::jit::TypeKind::ListType && static_params.find(in) == static_params.end()) { + LOG_DEBUG("get_collection_inputs, list use size " << in->uses().size()); + input_tensors.push_back(in); // push original list } } return input_tensors; @@ -52,9 +99,6 @@ c10::optional get_value_first_calc_dtype_opt(torch::jit::Block* auto b_ins = b->inputs(); std::unordered_set b_in_set(b_ins.begin(), b_ins.end()); - TORCHTRT_ASSERT( - in->type() == c10::TensorType::get(), "Input is not a tensor, cannot check for dtype based on calculation"); - auto consumers = in->uses(); auto search_list = std::vector(consumers.begin(), consumers.end()); @@ -142,16 +186,57 @@ c10::optional get_value_first_calc_dtype_opt(torch::jit::Block* TypeMap get_block_first_calc_dtypes_opt(torch::jit::Block* b) { TypeMap types; - for (auto i : b->inputs()) { if (i->type() == c10::TensorType::get()) { torch::jit::Value* in = i; types.insert({in, get_value_first_calc_dtype_opt(b, i)}); + } else if(i->type()->cast()) { + // make sure very time get the same ptr + at::ArrayRef unpack_tuple = torch::jit::createTupleUnpack(i); + LOG_DEBUG("Tuple size " << unpack_tuple.size()); + for (auto item: unpack_tuple) { + torch::jit::Value* in = item; + types.insert({in, get_value_first_calc_dtype_opt(b, i)}); + } + } else if(i->type()->isSubtypeOf(c10::ListType::ofTensors())) { + LOG_INFO("Unsupported type of c10::ListType::ofTensors()"); } } return 
types;
 }
 
+CollectionTypeMap get_block_first_calc_dtypes_opt_collection(torch::jit::Block* b) {
+  CollectionTypeMap types;
+  for (auto i : b->inputs()) {
+    if (i->type() == c10::TensorType::get()) {
+      torch::jit::Value* in = i;
+      types.insert({in, {get_value_first_calc_dtype_opt(b, i)}});
+
+    } else if (i->type()->kind() == torch::jit::TypeKind::TupleType) {
+      // TODO: evaluate the data type of each tuple element
+      // make sure we get the same ptr every time
+      // c10::optional<at::ScalarType> tp = get_value_first_calc_dtype_opt(b, i);
+      at::ArrayRef<torch::jit::Value*> unpack_tuple = torch::jit::createTupleUnpack(i);
+      // TODO: calculate the tuple element type, currently we use {} as default datatype
+      // std::vector<c10::optional<at::ScalarType>> dtypes(unpack_tuple.size(), tp);
+      std::vector<c10::optional<at::ScalarType>> dtypes(unpack_tuple.size());
+      types.insert({i, dtypes}); // insert empty (unknown) dtypes for the tuple elements
+
+    } else if (i->type()->kind() == torch::jit::TypeKind::ListType) {
+      // TODO: determine the size of the list and the type of its elements
+      LOG_DEBUG("get_block_first_calc_dtypes_opt ListType: use size " << i->uses().size());
+      c10::optional<at::ScalarType> tp = get_value_first_calc_dtype_opt(b, i);
+      // std::vector<c10::optional<at::ScalarType>> dtypes(i->uses().size());
+      std::vector<c10::optional<at::ScalarType>> dtypes(i->uses().size(), tp);
+      types.insert({i, dtypes}); // insert the inferred dtype for each use of the list
+    }
+  }
+  return types;
+}
+
+static auto core_input_container =
+    torch::class_<Input>("_torch_tensorrt_core_ir", "Input").def(torch::init<>());
+
 } // namespace ir
 } // namespace core
 } // namespace torch_tensorrt
diff --git a/core/ir/ir.h b/core/ir/ir.h
index 2d9acccc69..966c747176 100644
--- a/core/ir/ir.h
+++ b/core/ir/ir.h
@@ -11,9 +11,8 @@ namespace torch_tensorrt {
 namespace core {
 namespace ir {
 
-struct Input {
-  // Input(std::vector<int64_t> shape);
-  // Input(std::vector<int64_t> min_shape, std::vector<int64_t> opt_shape, std::vector<int64_t> max_shape);
+struct Input : torch::CustomClassHolder {
+  Input() {};
   Input(
       std::vector<int64_t> shape,
       nvinfer1::DataType dtype = nvinfer1::DataType::kFLOAT,
@@ -36,27 +35,52 @@ struct Input {
   nvinfer1::Dims opt;
   nvinfer1::DataType dtype;
   nvinfer1::TensorFormat format;
+  int id;
 };
 
+// Add to spec
+struct GraphInputs {
+  GraphInputs(std::vector<ir::Input> inputs);
+  GraphInputs(torch::jit::IValue& input_signature);
+  torch::jit::IValue input_signature; // nested Input, full input spec
+  std::vector<ir::Input> inputs; // flattened Input
+  std::vector<std::vector<ir::Input>> collection_inputs; // only support two layer nesting, e.g. 
((a, b), [c, d], e) +}; + +typedef std::pair GraphIO; // Graph input output mapping + using StaticParams = std::map; StaticParams get_static_params(c10::ArrayRef inputs, std::vector params); using InputSpecMap = std::unordered_map; +using CollectionInputSpecMap = std::unordered_map>; +std::vector get_tensor_inputs( + std::shared_ptr& g, + StaticParams& static_params); InputSpecMap associate_specs_with_inputs( std::shared_ptr& g, std::vector specs, StaticParams& static_params); +CollectionInputSpecMap associate_specs_with_collection_inputs( + std::shared_ptr& g, + ir::GraphInputs graph_inputs, + StaticParams& static_params); InputSpecMap pair_input_vals_with_specs(std::vector vals, std::vector specs); +CollectionInputSpecMap pair_input_vals_with_specs_collection(std::vector vals, std::vector>& specs); std::vector get_tensor_inputs( std::shared_ptr& g, StaticParams& static_params); +std::vector get_collection_inputs( + std::shared_ptr& g, + StaticParams& static_params); using TypeMap = std::unordered_map>; +using CollectionTypeMap = std::unordered_map>>; c10::optional get_value_first_calc_dtype_opt(torch::jit::Block* b, torch::jit::Value* in); ir::TypeMap get_block_first_calc_dtypes_opt(torch::jit::Block* b); - +ir::CollectionTypeMap get_block_first_calc_dtypes_opt_collection(torch::jit::Block* b); } // namespace ir } // namespace core } // namespace torch_tensorrt diff --git a/core/lowering/lowering.cpp b/core/lowering/lowering.cpp index d3296c347c..8bbae296c3 100644 --- a/core/lowering/lowering.cpp +++ b/core/lowering/lowering.cpp @@ -33,7 +33,6 @@ void LowerGraph(std::shared_ptr& g, LowerInfo lower_info) { torch::jit::InlineFunctionalGraphs(g); torch::jit::PeepholeOptimize(g, false); torch::jit::FuseLinear(g); - torch::jit::LowerAllTuples(g); if (!lower_info.disable_cse) { torch::jit::EliminateCommonSubexpression(g); } diff --git a/core/partitioning/partitioning.cpp b/core/partitioning/partitioning.cpp index 8fcd29f7a8..f14d5438c6 100644 --- a/core/partitioning/partitioning.cpp +++ b/core/partitioning/partitioning.cpp @@ -242,6 +242,36 @@ bool check_node_fallback(torch::jit::Node* n, const std::unordered_mapoutputs()) { + if(out->type()->kind() == torch::jit::TypeKind::TupleType || out->type()->kind() == torch::jit::TypeKind::ListType) { + return true; + } + } + return false; +} + +bool should_run_in_trt(torch::jit::Node* n, const std::unordered_set& torch_ops) { + // If the op is not supported by the conversion phase it should run in PyTorch + if (!conversion::OpSupported(n)) { + LOG_GRAPH("Node not supported by conversion: " << util::node_info(n)); + return false; + } + + // If the user specifies the op to run in Torch it should run in PyTorch + if (torch_ops.find(n->kind().toQualString()) != torch_ops.end()) { + LOG_GRAPH("Node explicitly set to run in torch: " << util::node_info(n)); + return false; + } + + // If the user specifies the module containing this op to run in torch it should run in PyTorch + const auto to_compile_sym = c10::Symbol::attr("to_compile"); + if (n->hasAttribute(to_compile_sym) && n->i(to_compile_sym) == (int64_t) false) { + LOG_GRAPH("Node is within a module set to run in torch: " << util::node_info(n)); return false; } @@ -360,19 +390,25 @@ PartitionedGraph segment_graph( find_min_block_size_fallback_nodes(block, global_fallback_nodes, min_block_size); auto nodes = block->nodes(); - + auto reverse_nodes = nodes.reverse(); // merge from output side to input side PartitionedGraph segmented_blocks; // segment the nodes std::vector in_prog_trt_blk_nodes, 
in_prog_pyt_blk_nodes; - for (const auto n : nodes) { + for (const auto n : reverse_nodes) { // Skip constant nodes as they are resources for both kinds of modules if (n->kind() == torch::jit::prim::Constant) { continue; } +<<<<<<< HEAD if (check_node_fallback(n, global_fallback_nodes)) { in_prog_trt_blk_nodes.push_back(n); +======= + // the outputs of trt subgraph shouldn't be collections + if (should_run_in_trt(n, forced_fallback_ops) && !(in_prog_trt_blk_nodes.size() == 0 && is_collection(n))) { + in_prog_trt_blk_nodes.insert(in_prog_trt_blk_nodes.begin(), n); +>>>>>>> feat: support for grouped inputs // If there is an active PyTorch block and we have passed the threshold for a valid TRT // block then segment and reset the active PyTorch block @@ -388,7 +424,7 @@ PartitionedGraph segment_graph( LOG_DEBUG( "In progress TRT block does not meet minimum block size requirements, therefore folding into in progress PyTorch block"); in_prog_pyt_blk_nodes.insert( - in_prog_pyt_blk_nodes.end(), in_prog_trt_blk_nodes.begin(), in_prog_trt_blk_nodes.end()); + in_prog_pyt_blk_nodes.begin(), in_prog_trt_blk_nodes.begin(), in_prog_trt_blk_nodes.end()); } in_prog_trt_blk_nodes.clear(); // if there is a prim::If then this if node will be encapsulated in a SegmentedBlock @@ -407,14 +443,14 @@ PartitionedGraph segment_graph( finalize_block(segmented_blocks, SegmentedBlock::kTorch, in_prog_pyt_blk_nodes); } if (checkLoopEvaluatable(n)) { - in_prog_trt_blk_nodes.push_back(n); + in_prog_trt_blk_nodes.insert(in_prog_trt_blk_nodes.begin(), n); } else { auto loop_node = std::vector{n}; finalize_block(segmented_blocks, SegmentedBlock::kTorch, loop_node); } continue; } - in_prog_pyt_blk_nodes.push_back(n); + in_prog_pyt_blk_nodes.insert(in_prog_pyt_blk_nodes.begin(), n); } } @@ -429,7 +465,7 @@ PartitionedGraph segment_graph( in_prog_pyt_blk_nodes.end(), in_prog_trt_blk_nodes.begin(), in_prog_trt_blk_nodes.end()); finalize_block(segmented_blocks, SegmentedBlock::kTorch, in_prog_pyt_blk_nodes); } - + std::reverse(segmented_blocks.begin(), segmented_blocks.end()); return segmented_blocks; } diff --git a/core/partitioning/shape_analysis.cpp b/core/partitioning/shape_analysis.cpp index d24b1f980a..22c3ea104f 100644 --- a/core/partitioning/shape_analysis.cpp +++ b/core/partitioning/shape_analysis.cpp @@ -8,27 +8,56 @@ namespace torch_tensorrt { namespace core { namespace partitioning { +at::Tensor generateSingleInput(ir::Input& input, c10::optional& type_opt) { + auto cur_shape = input.input_shape; + std::vector shape; + shape.insert(shape.begin(), std::begin(cur_shape.d), std::begin(cur_shape.d) + cur_shape.nbDims); + // auto type_opt = types[input.first][i]; + auto type = at::kFloat; + if (type_opt) { + type = type_opt.value(); + } else { + LOG_WARNING("Input type for doing shape analysis could not be determined, defaulting to F32"); + } + auto in = at::randint(5, shape, {at::kCUDA}).to(type); + // ivalue_map[input.first] = in.clone(); + return in; +} + std::unordered_map generateRandomInputs( - std::unordered_map& inputs, - std::unordered_map>& types) { + std::unordered_map>& inputs, + std::unordered_map>>& types) { + // generate random inputs for running pytorch segments std::unordered_map ivalue_map; - uint64_t in_i = 0; + for (auto& input : inputs) { - auto cur_shape = input.second.input_shape; - std::vector shape; - shape.insert(shape.begin(), std::begin(cur_shape.d), std::begin(cur_shape.d) + cur_shape.nbDims); - auto type_opt = types[input.first]; - auto type = at::kFloat; - if (type_opt) { - type = 
type_opt.value(); + + if (input.first->type()->kind() == torch::jit::TypeKind::ListType) { + // create list + std::vector list; + c10::TypePtr elementType = c10::TensorType::get(); + auto generic_list = c10::impl::GenericList(elementType); + for (int i = 0; i < input.second.size(); i++) { + auto in = generateSingleInput(input.second[i], types[input.first][i]); + generic_list.push_back(in.clone()); + } + ivalue_map[input.first] = c10::IValue(generic_list); + } else if (input.first->type()->kind() == torch::jit::TypeKind::TupleType) { + // create tuple + std::vector list; + for (int i = 0; i < input.second.size(); i++) { + auto in = generateSingleInput(input.second[i], types[input.first][i]); + list.push_back(in.clone()); + } + auto tuple = c10::ivalue::Tuple::create(list); // create tuple ptr + ivalue_map[input.first] = c10::IValue(tuple); } else { - LOG_WARNING("Input type for doing shape analysis could not be determined, defaulting to F32"); + auto in = generateSingleInput(input.second[0], types[input.first][0]); + ivalue_map[input.first] = in.clone(); + } - auto in = at::randint(5, shape, {at::kCUDA}).to(type); - ivalue_map[input.first] = in.clone(); - in_i++; } return ivalue_map; } @@ -79,8 +108,10 @@ void getSegmentsOutputByRunning( } else if (input->type()->isSubtypeOf(torch::jit::BoolType::get())) { jit_inputs_ivalues.push_back(ivalues_maps[input].toBool()); } else if (input->type()->kind() == torch::jit::TypeKind::ListType) { - jit_inputs_ivalues.push_back(ivalues_maps[input].toList()); + // create list + jit_inputs_ivalues.push_back(ivalues_maps[input].toList());; } else if (input->type()->kind() == torch::jit::TypeKind::TupleType) { + // create tuple jit_inputs_ivalues.push_back(ivalues_maps[input].toTuple()); } else if (input->type()->kind() == torch::jit::TypeKind::NumberType) { jit_inputs_ivalues.push_back(ivalues_maps[input].toScalar()); @@ -145,6 +176,7 @@ void getSegmentsOutputByRunning( } input_types.push_back(cur_ivalue.toTensor().scalar_type()); } + // TODO: tuple and list inputs in subgraph } seg_block.register_inshapes(input_shapes); diff --git a/core/partitioning/shape_analysis.h b/core/partitioning/shape_analysis.h index 0626490222..2654699a1d 100644 --- a/core/partitioning/shape_analysis.h +++ b/core/partitioning/shape_analysis.h @@ -6,9 +6,10 @@ namespace torch_tensorrt { namespace core { namespace partitioning { + std::unordered_map generateRandomInputs( - std::unordered_map& input_ranges, - std::unordered_map>& input_types); + std::unordered_map>& input_ranges, + std::unordered_map>>& input_types); void runShapeAnalysis( std::vector& segmented_blocks, diff --git a/cpp/include/torch_tensorrt/torch_tensorrt.h b/cpp/include/torch_tensorrt/torch_tensorrt.h index 66706db791..70dea51bc7 100644 --- a/cpp/include/torch_tensorrt/torch_tensorrt.h +++ b/cpp/include/torch_tensorrt/torch_tensorrt.h @@ -14,6 +14,7 @@ #include #include #include +#include "torch/custom_class.h" // Just include the .h? #ifndef DOXYGEN_SHOULD_SKIP_THIS @@ -363,7 +364,7 @@ class TORCHTRT_API TensorFormat { * signifying a static input shape or a set of three input shapes representing * the min, optiminal and max input shapes allowed for the engine. 
*/ -struct TORCHTRT_API Input { +struct TORCHTRT_API Input : torch::CustomClassHolder{ /// Minimum acceptable input size into the engine std::vector min_shape; /// Optimal input size into the engine (size optimized for given kernels accept any size in min max range) @@ -378,6 +379,7 @@ struct TORCHTRT_API Input { /// Expected tensor format for the input TensorFormat format; + Input() {} /** * @brief Construct a new Input spec object for static input size from * vector, optional arguments allow the user to configure expected input shape @@ -512,6 +514,16 @@ struct TORCHTRT_API Input { bool input_is_dynamic; }; +/** + * @brief A struct to hold complex inputs + * + * This struct can either hold a complex inputs of shape or a flattened one, + */ +struct TORCHTRT_API GraphInputs { + torch::jit::IValue input_signature; // nested Input, full input spec + std::vector inputs; // flatten input spec +}; + /** * @brief Get the build information for the library including the dependency * versions @@ -579,18 +591,22 @@ struct TORCHTRT_API CompileSpec { * * @param inputs */ - CompileSpec(std::vector inputs) : inputs(std::move(inputs)) {} - - // Defaults should reflect TensorRT defaults for BuilderConfig + CompileSpec(std::vector inputs); /** - * @brief Specifications for inputs to the engine, can either be a single size or a range defined by min, opt and max - * sizes Users can also specify expected input type as well as tensor memory format + * @brief Construct a new Extra Info object from IValue. + * The IValue store a complex Input * - * Order in vector should match call order for the function + * @param input_signature */ - std::vector inputs; + CompileSpec(torch::jit::IValue input_signature); + // Defaults should reflect TensorRT defaults for BuilderConfig + /** + * @brief Specifications for inputs to the engine, can store a IValue which has stored complex Input + * or a flatened Input + */ + GraphInputs graph_inputs; /** * @brief The set of precisions TensorRT is allowed to use for kernels during compilation * diff --git a/cpp/src/compile_spec.cpp b/cpp/src/compile_spec.cpp index 2881887aea..1fb4c56a98 100644 --- a/cpp/src/compile_spec.cpp +++ b/cpp/src/compile_spec.cpp @@ -18,18 +18,67 @@ torchtrt::core::runtime::CudaDevice to_internal_cuda_device(Device device); namespace torchscript { CompileSpec::CompileSpec(std::vector> fixed_sizes) { for (auto in : fixed_sizes) { - inputs.push_back(Input(in)); + graph_inputs.inputs.push_back(Input(in)); } } CompileSpec::CompileSpec(std::vector> fixed_sizes) { for (auto in : fixed_sizes) { - inputs.push_back(Input(in)); + graph_inputs.inputs.push_back(Input(in)); + } +} + +CompileSpec::CompileSpec(std::vector inputs) { + graph_inputs.inputs = std::move(inputs); +} + +CompileSpec::CompileSpec(torch::jit::IValue input_signature) { + graph_inputs.input_signature = input_signature; +} + + + +void to_internal_input_signature(torch::jit::IValue input_ivalue, torch::jit::IValue& converted_ivalue) { + if (input_ivalue.isTuple()) { + auto input_tuple = input_ivalue.toTuple(); + std::vector converted_elements; + for (auto item: input_tuple->elements()) { + torch::jit::IValue converted_item; + to_internal_input_signature(item, converted_item); + converted_elements.push_back(converted_item); + auto tuple_ptr = c10::ivalue::Tuple::create(converted_elements); + converted_ivalue = torch::jit::IValue(tuple_ptr); + } + } else if(input_ivalue.isList()) { + auto input_list = input_ivalue.toList().vec(); + c10::TypePtr type = input_list[0].type(); + auto converted_elements = 
c10::impl::GenericList(type); + for (auto item: input_list) { + torch::jit::IValue converted_item; + to_internal_input_signature(item, converted_item); + converted_elements.push_back(converted_item); + } + converted_ivalue = torch::jit::IValue(converted_elements); + } else if(input_ivalue.isCustomClass()) { + torchtrt::core::ir::Input cur_input = to_internal_input(*(input_ivalue.toCustomClass())); + converted_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(cur_input))); + } +} + +torchtrt::core::CompileSpec init_compile_spec(CompileSpec external) { + if (external.graph_inputs.inputs.size() > 0) { + torchtrt::core::CompileSpec internal(to_vec_internal_inputs(external.graph_inputs.inputs)); + return internal; + } else { + torch::jit::IValue converted_input_signature; + to_internal_input_signature(external.graph_inputs.input_signature, converted_input_signature); + torchtrt::core::CompileSpec internal(converted_input_signature); + return internal; } } torchtrt::core::CompileSpec to_internal_compile_spec(CompileSpec external) { - torchtrt::core::CompileSpec internal(to_vec_internal_inputs(external.inputs)); + torchtrt::core::CompileSpec internal = init_compile_spec(external); for (auto p : external.enabled_precisions) { internal.convert_info.engine_settings.enabled_precisions.insert(toTRTDataType(p)); diff --git a/cpp/src/torch_tensorrt.cpp b/cpp/src/torch_tensorrt.cpp index 42b44833de..93813190ab 100644 --- a/cpp/src/torch_tensorrt.cpp +++ b/cpp/src/torch_tensorrt.cpp @@ -52,4 +52,7 @@ void set_device(const int gpu_id) { // Want to export a much simpler (non CUDA header dependent) API torch_tensorrt::core::set_device(gpu_id); } + +static auto tensorrt_input_container = + torch::class_("_torch_tensorrt", "Input").def(torch::init<>()); } // namespace torch_tensorrt diff --git a/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp b/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp index 9165b21185..0eb6fba2de 100644 --- a/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp +++ b/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp @@ -23,6 +23,13 @@ void RegisterTRTCompileSpec() { ADD_FIELD_GET_SET_REGISTRATION(TRTInputRangeTSRegistration, torch_tensorrt::pyapi::Input, input_is_dynamic); ADD_FIELD_GET_SET_REGISTRATION(TRTInputRangeTSRegistration, torch_tensorrt::pyapi::Input, explicit_set_dtype); + static auto TORCHTRT_UNUSED TRTInputSignatureTSRegistration = + torch::class_("tensorrt", "_InputSignature") + .def(torch::init<>()) + .def("__str__", &torch_tensorrt::pyapi::InputSignature::to_str); + + ADD_FIELD_GET_SET_REGISTRATION(TRTInputSignatureTSRegistration, torch_tensorrt::pyapi::InputSignature, signature_ivalue); + static auto TORCHTRT_UNUSED TRTDeviceTSRegistration = torch::class_("tensorrt", "_Device") .def(torch::init<>()) @@ -49,6 +56,7 @@ void RegisterTRTCompileSpec() { torch::class_("tensorrt", "CompileSpec") .def(torch::init<>()) .def("_append_input", &torch_tensorrt::pyapi::CompileSpec::appendInput) + .def("_set_input_signature", &torch_tensorrt::pyapi::CompileSpec::setInputSignature) .def("_set_precisions", &torch_tensorrt::pyapi::CompileSpec::setPrecisions) .def("_set_device", &torch_tensorrt::pyapi::CompileSpec::setDeviceIntrusive) .def("_set_torch_fallback", &torch_tensorrt::pyapi::CompileSpec::setTorchFallbackIntrusive) diff --git a/py/torch_tensorrt/csrc/tensorrt_classes.cpp b/py/torch_tensorrt/csrc/tensorrt_classes.cpp index 5aeac3b6d6..9eb58b3e73 100644 --- a/py/torch_tensorrt/csrc/tensorrt_classes.cpp +++ b/py/torch_tensorrt/csrc/tensorrt_classes.cpp @@ 
-104,6 +104,12 @@ std::string Input::to_str() { return ss.str(); } +std::string InputSignature::to_str() { + std::stringstream ss; + ss << signature_ivalue; + return ss.str(); +} + std::string to_str(DeviceType value) { switch (value) { case DeviceType::kDLA: @@ -184,13 +190,63 @@ std::string TorchFallback::to_str() { return ss.str(); } -core::CompileSpec CompileSpec::toInternalCompileSpec() { - std::vector internal_inputs; - for (auto i : inputs) { - internal_inputs.push_back(i.toInternalInput()); +void to_internal_input_signature(torch::jit::IValue input_ivalue, torch::jit::IValue& converted_ivalue) { + if (input_ivalue.isTuple()) { + auto input_tuple = input_ivalue.toTuple(); + std::vector converted_elements; + for (auto item: input_tuple->elements()) { + torch::jit::IValue converted_item; + to_internal_input_signature(item, converted_item); + converted_elements.push_back(converted_item); + auto tuple_ptr = c10::ivalue::Tuple::create(converted_elements); + converted_ivalue = torch::jit::IValue(tuple_ptr); + } + } else if(input_ivalue.isList()) { + auto input_list = input_ivalue.toList().vec(); + c10::TypePtr type = input_list[0].type(); + auto converted_elements = c10::impl::GenericList(type); + for (auto item: input_list) { + torch::jit::IValue converted_item; + to_internal_input_signature(item, converted_item); + converted_elements.push_back(converted_item); + } + converted_ivalue = torch::jit::IValue(converted_elements); + } else if(input_ivalue.isCustomClass()) { + core::ir::Input cur_input = (*(input_ivalue.toCustomClass())).toInternalInput(); + converted_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(cur_input))); + } else if(input_ivalue.isPyObject()) { + auto py_object_holder = input_ivalue.toPyObjectHolder(); + auto infer_type = py_object_holder->tryToInferType(); + auto type = infer_type.type(); + torch::jit::IValue ival = py_object_holder->toIValue(type); + torch::jit::IValue converted_item; + to_internal_input_signature(ival, converted_item); + converted_ivalue = torch::jit::IValue(converted_item); + } else { + LOG_ERROR("Unknown input spec type"); + } +} + +core::CompileSpec init_compile_spec(CompileSpec external) { + if (external.inputs.size() > 0) { + LOG_DEBUG("init_compile_spec with input vector"); + std::vector internal_inputs; + for (auto i : external.inputs) { + internal_inputs.push_back(i.toInternalInput()); + } + core::CompileSpec internal(internal_inputs); + return internal; + } else { + LOG_DEBUG("init_compile_spec with input signature"); + torch::jit::IValue converted_input_signature; + to_internal_input_signature(external.input_signature.signature_ivalue, converted_input_signature); + core::CompileSpec internal(converted_input_signature); + return internal; } +} - auto info = core::CompileSpec(internal_inputs); +core::CompileSpec CompileSpec::toInternalCompileSpec() { + core::CompileSpec info = init_compile_spec(*this); for (auto p : enabled_precisions) { info.convert_info.engine_settings.enabled_precisions.insert(toTRTDataType(p)); @@ -237,16 +293,20 @@ core::CompileSpec CompileSpec::toInternalCompileSpec() { std::string CompileSpec::stringify() { std::stringstream ss; ss << "TensorRT Compile Spec: {" << std::endl; - ss << " \"Inputs\": [" << std::endl; - for (auto i : inputs) { - ss << i.to_str(); + if (inputs.size() > 0) { + ss << " \"Inputs\": [" << std::endl; + for (auto i : inputs) { + ss << i.to_str(); + } + ss << " ]" << std::endl; + } else { + ss << " \"Input Signature\": " << input_signature.to_str() << std::endl; } - ss << " ]" << 
std::endl; - ss << " \"Enabled Precision\": [" << std::endl; + ss << " \"Enabled Precision\": ["; for (auto p : enabled_precisions) { - ss << to_str(p); + ss << to_str(p) << ", " ; } - ss << " ]" << std::endl; + ss << "]" << std::endl; ss << " \"TF32 Disabled\": " << disable_tf32 << std::endl; ss << " \"Sparsity\": " << sparse_weights << std::endl; ss << " \"Refit\": " << refit << std::endl; diff --git a/py/torch_tensorrt/csrc/tensorrt_classes.h b/py/torch_tensorrt/csrc/tensorrt_classes.h index b615022bd0..d3b22740c2 100644 --- a/py/torch_tensorrt/csrc/tensorrt_classes.h +++ b/py/torch_tensorrt/csrc/tensorrt_classes.h @@ -57,6 +57,12 @@ struct Input : torch::CustomClassHolder { std::string to_str(); }; +struct InputSignature : torch::CustomClassHolder { + torch::jit::IValue signature_ivalue; // nested Input, full input spec + ADD_FIELD_GET_SET(signature_ivalue, torch::jit::IValue); + std::string to_str(); +}; + enum DeviceType : int8_t { kGPU, kDLA, @@ -119,6 +125,10 @@ struct CompileSpec : torch::CustomClassHolder { inputs.push_back(*ir); } + void setInputSignature(const c10::intrusive_ptr& is) { + input_signature = *is; + } + void setPrecisions(const std::vector& precisions_raw) { for (auto p : precisions_raw) { TORCHTRT_CHECK(p >= 0 && p <= static_cast(DataType::kBool), "Invalid enum value for field"); @@ -158,6 +168,7 @@ struct CompileSpec : torch::CustomClassHolder { ADD_FIELD_GET_SET(ptq_calibrator, nvinfer1::IInt8Calibrator*); std::vector inputs; + InputSignature input_signature; nvinfer1::IInt8Calibrator* ptq_calibrator = nullptr; std::set enabled_precisions = {}; bool sparse_weights = false; diff --git a/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp b/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp index 74a8b72711..6247789a93 100644 --- a/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp +++ b/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp @@ -2,6 +2,7 @@ #include "pybind11/stl.h" #include "Python.h" +#include "ATen/core/jit_type.h" #include "core/compiler.h" #include "core/conversion/conversion.h" #include "tensorrt_classes.h" @@ -178,6 +179,15 @@ PYBIND11_MODULE(_C, m) { .def_readwrite("dtype", &Input::dtype) .def_readwrite("format", &Input::format); + py::class_(m, "InputSignature") + .def(pybind11::init([](py::object py_obj) { + InputSignature input_signature; + input_signature.signature_ivalue = torch::jit::toIValue(std::move(py_obj), c10::PyObjectType::get(), c10::nullopt); + return input_signature; + })) + .def("__str__", &InputSignature::to_str) + .def_readwrite("_signature_ivalue", &InputSignature::signature_ivalue); + py::enum_(m, "dtype", "Enum to specifiy operating precision for engine execution") .value("float", DataType::kFloat, "32 bit floating point number") .value("float32", DataType::kFloat, "32 bit floating point number") @@ -292,6 +302,7 @@ PYBIND11_MODULE(_C, m) { .def("__str__", &torch_tensorrt::pyapi::CompileSpec::stringify) .def("_get_calibrator_handle", &CompileSpec::getPTQCalibratorHandle, "[Internal] gets a handle from a calibrator") .def_readwrite("inputs", &CompileSpec::inputs) + .def_readwrite("input_signature", &CompileSpec::input_signature) .def_readwrite("enabled_precisions", &CompileSpec::enabled_precisions) .def_readwrite("ptq_calibrator", &CompileSpec::ptq_calibrator) .def_readwrite("refit", &CompileSpec::refit) diff --git a/py/torch_tensorrt/ts/_compile_spec.py b/py/torch_tensorrt/ts/_compile_spec.py index 4c7b8b5b5d..0eb8a1cdce 100644 --- a/py/torch_tensorrt/ts/_compile_spec.py +++ b/py/torch_tensorrt/ts/_compile_spec.py @@ -5,10 +5,22 @@ from 
torch_tensorrt import _enums from torch_tensorrt._Input import Input from torch_tensorrt._Device import Device - +from torch_tensorrt.logging import Level, log +from typing import Tuple, List, Dict import warnings +def _internal_input_to_torch_class_input(i: _C.Input) -> torch.classes.tensorrt._Input: + clone = torch.classes.tensorrt._Input() + clone._set_min(i.min) + clone._set_opt(i.opt) + clone._set_max(i.max) + clone._set_dtype(i.dtype) + clone._set_format(i.format) + clone._set_input_is_dynamic(i.input_is_dynamic) + clone._set_explicit_set_dtype(i._explicit_set_dtype) + return clone + def _supported_input_size_type(input_size: Any) -> bool: if isinstance(input_size, torch.Size): return True @@ -156,15 +168,30 @@ def _parse_torch_fallback(fallback_info: Dict[str, Any]) -> _ts_C.TorchFallback: return info +def _parse_input_signature(input_signature: Any): + if isinstance(input_signature, tuple): + input_list = [] + for item in input_signature: + input = _parse_input_signature(item) + input_list.append(input) + return tuple(input_list) + elif isinstance(input_signature, list): + input_list = [] + for item in input_signature: + input = _parse_input_signature(item) + input_list.append(input) + return input_list + elif isinstance(input_signature, Input) or isinstance(input_signature, torch.Tensor): + i = Input._from_tensor(input_signature) if isinstance(input_signature, torch.Tensor) else input_signature + clone = _internal_input_to_torch_class_input(i._to_internal()) + return clone + else: + raise KeyError("Input signature contains an unsupported type {}".format(type(input_signature))) def _parse_compile_spec(compile_spec: Dict[str, Any]) -> _ts_C.CompileSpec: info = _ts_C.CompileSpec() - if "inputs" not in compile_spec: - raise KeyError( - "Module input definitions are requried to compile module. Provide a list of torch_tensorrt.Input keyed to \"inputs\" in the compile spec" - ) - if "inputs" in compile_spec: + if len(compile_spec["inputs"]) > 0: if not all([isinstance(i, torch.Tensor) or isinstance(i, Input) for i in compile_spec["inputs"]]): raise KeyError("Input specs should be either torch_tensorrt.Input or torch.Tensor, found types: {}".format( [type(i) for i in compile_spec["inputs"]])) @@ -172,7 +199,15 @@ def _parse_compile_spec(compile_spec: Dict[str, Any]) -> _ts_C.CompileSpec: inputs = [Input._from_tensor(i) if isinstance(i, torch.Tensor) else i for i in compile_spec["inputs"]] info.inputs = [i._to_internal() for i in inputs] - assert (len(info.inputs) > 0), "Require at least one input definition to compile model" + elif compile_spec["input_signature"] is not None: + log(Level.Warning, "Input signature parsing is an experimental feature, behavior and APIs may change") + signature = _parse_input_signature(compile_spec["input_signature"]) + info.input_signature = _C.InputSignature(signature) # py_object + + else: + raise KeyError( + "Module input definitions are requried to compile module. 
Provide a list of torch_tensorrt.Input keyed to \"inputs\" in the compile spec" + ) if "enabled_precisions" in compile_spec: info.enabled_precisions = _parse_enabled_precisions(compile_spec["enabled_precisions"]) @@ -230,10 +265,13 @@ def _parse_compile_spec(compile_spec: Dict[str, Any]) -> _ts_C.CompileSpec: if "torch_fallback" in compile_spec: info.torch_fallback = _parse_torch_fallback(compile_spec["torch_fallback"]) + log(Level.Debug, str(info)) + return info def TensorRTCompileSpec(inputs=[], + input_signature=None, device=Device._current_device(), disable_tf32=False, sparse_weights=False, @@ -288,6 +326,7 @@ def TensorRTCompileSpec(inputs=[], compile_spec = { "inputs": inputs, + "input_signature": input_signature, "device": device, "disable_tf32": disable_tf32, # Force FP32 layers to use traditional as FP32 format vs the default behavior of rounding the inputs to 10-bit mantissas before multiplying, but accumulates the sum using 23-bit mantissas @@ -310,16 +349,11 @@ def TensorRTCompileSpec(inputs=[], backend_spec = torch.classes.tensorrt.CompileSpec() for i in parsed_spec.inputs: - clone = torch.classes.tensorrt._Input() - clone._set_min(i.min) - clone._set_opt(i.opt) - clone._set_max(i.max) - clone._set_dtype(i.dtype) - clone._set_format(i.format) - clone._set_input_is_dynamic(i.input_is_dynamic) - clone._set_explicit_set_dtype(i._explicit_set_dtype) + clone = _internal_input_to_torch_class_input(i) backend_spec._append_input(clone) + backend_spec._set_input_signature(parsed_spec.input_signature) + d = torch.classes.tensorrt._Device() d._set_device_type(int(parsed_spec.device.device_type)) d._set_gpu_id(parsed_spec.device.gpu_id) diff --git a/py/torch_tensorrt/ts/_compiler.py b/py/torch_tensorrt/ts/_compiler.py index 83704a4b6c..508cb8fdd0 100644 --- a/py/torch_tensorrt/ts/_compiler.py +++ b/py/torch_tensorrt/ts/_compiler.py @@ -11,6 +11,7 @@ def compile(module: torch.jit.ScriptModule, inputs=[], + input_signature=None, device=Device._current_device(), disable_tf32=False, sparse_weights=False, @@ -94,6 +95,7 @@ def compile(module: torch.jit.ScriptModule, spec = { "inputs": inputs, + "input_signature": input_signature, "device": device, "disable_tf32": disable_tf32, # Force FP32 layers to use traditional as FP32 format "sparse_weights": sparse_weights, #Enable sparsity for convolution and fully connected layers. 
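A minimal usage sketch of the experimental input_signature argument added to the Python frontend above, assuming a scripted module whose forward() takes a tuple of two tensors. TupleInputModule is a placeholder name, and the nesting convention (an outer tuple over forward()'s positional arguments) follows the pattern used in the new collection tests further down; treat the exact structure as illustrative rather than definitive.

import torch
import torch_tensorrt

# Placeholder module: forward(self, z: Tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor
mod = torch.jit.script(TupleInputModule().eval().cuda())

trt_mod = torch_tensorrt.ts.compile(
    mod,
    # Describe grouped inputs with a nested spec instead of a flat "inputs" list;
    # at most two levels of nesting of torch_tensorrt.Input specs are supported.
    input_signature=((torch_tensorrt.Input(shape=(1, 3, 512, 512), dtype=torch.half),
                      torch_tensorrt.Input(shape=(1, 3, 512, 512), dtype=torch.half)),),
    enabled_precisions={torch.half},
    require_full_compilation=False,
    min_block_size=3,
)

x = torch.randn(1, 3, 512, 512, device="cuda").half()
out = trt_mod((x, x))
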
diff --git a/tests/core/partitioning/test_resolve_nontensor_inputs.cpp b/tests/core/partitioning/test_resolve_nontensor_inputs.cpp index fea202fc65..2d0255f130 100644 --- a/tests/core/partitioning/test_resolve_nontensor_inputs.cpp +++ b/tests/core/partitioning/test_resolve_nontensor_inputs.cpp @@ -116,11 +116,11 @@ TEST(Partitioning, ResolveNonTensorInputsCorrectly) { inputs.push_back(torch_tensorrt::core::ir::Input({16, 3, 3, 3})); inputs.push_back(torch_tensorrt::core::ir::Input({16})); - std::unordered_map inputs_map; - std::unordered_map> input_types; + std::unordered_map> inputs_map; + std::unordered_map>> input_types; for (size_t i = 0; i < g->inputs().size(); ++i) { - inputs_map.insert({g->inputs()[i], inputs[i]}); - input_types.insert({g->inputs()[i], {at::kFloat}}); + inputs_map.insert({g->inputs()[i], {inputs[i]}}); + input_types.insert({g->inputs()[i], {{at::kFloat}}}); } auto input_ivalues_map = torch_tensorrt::core::partitioning::generateRandomInputs(inputs_map, input_types); std::unordered_map fallback_nodes; @@ -175,11 +175,11 @@ TEST(Partitioning, ResolveTensorListInputsInTrtCorrectly) { inputs.push_back(torch_tensorrt::core::ir::Input({16, 6, 3, 3})); inputs.push_back(torch_tensorrt::core::ir::Input({16})); - std::unordered_map inputs_map; - std::unordered_map> input_types; + std::unordered_map> inputs_map; + std::unordered_map>> input_types; for (size_t i = 0; i < g->inputs().size(); ++i) { - inputs_map.insert({g->inputs()[i], inputs[i]}); - input_types.insert({g->inputs()[i], {at::kFloat}}); + inputs_map.insert({g->inputs()[i], {inputs[i]}}); + input_types.insert({g->inputs()[i], {{at::kFloat}}}); } auto input_ivalues_map = torch_tensorrt::core::partitioning::generateRandomInputs(inputs_map, input_types); std::unordered_map fallback_nodes; diff --git a/tests/core/partitioning/test_shape_analysis.cpp b/tests/core/partitioning/test_shape_analysis.cpp index 7bcabc0d51..151a6e75ad 100644 --- a/tests/core/partitioning/test_shape_analysis.cpp +++ b/tests/core/partitioning/test_shape_analysis.cpp @@ -59,11 +59,11 @@ TEST(Partitioning, InferSequentialModelSegmentedBlockShapeCorrectly) { inputs.push_back(torch_tensorrt::core::ir::Input({8, 16, 3, 3})); inputs.push_back(torch_tensorrt::core::ir::Input({8})); - std::unordered_map inputs_map; - std::unordered_map> input_types; + std::unordered_map> inputs_map; + std::unordered_map>> input_types; for (size_t i = 0; i < g->inputs().size(); ++i) { - inputs_map.insert({g->inputs()[i], inputs[i]}); - input_types.insert({g->inputs()[i], {at::kFloat}}); + inputs_map.insert({g->inputs()[i], {inputs[i]}}); + input_types.insert({g->inputs()[i], {{at::kFloat}}}); } auto input_ivalues_map = torch_tensorrt::core::partitioning::generateRandomInputs(inputs_map, input_types); std::unordered_map fallback_nodes; @@ -110,11 +110,11 @@ TEST(Partitioning, InferBranchModelSegmentedBlockShapeCorrectly) { inputs.push_back(torch_tensorrt::core::ir::Input({16, 32, 3, 3})); inputs.push_back(torch_tensorrt::core::ir::Input({16})); - std::unordered_map inputs_map; - std::unordered_map> input_types; + std::unordered_map> inputs_map; + std::unordered_map>> input_types; for (size_t i = 0; i < g->inputs().size(); ++i) { - inputs_map.insert({g->inputs()[i], inputs[i]}); - input_types.insert({g->inputs()[i], {at::kFloat}}); + inputs_map.insert({g->inputs()[i], {inputs[i]}}); + input_types.insert({g->inputs()[i], {{at::kFloat}}}); } auto input_ivalues_map = torch_tensorrt::core::partitioning::generateRandomInputs(inputs_map, input_types); std::unordered_map 
fallback_nodes; diff --git a/tests/cpp/BUILD b/tests/cpp/BUILD index 3d69afba95..2d545dc8f1 100644 --- a/tests/cpp/BUILD +++ b/tests/cpp/BUILD @@ -18,7 +18,8 @@ test_suite( ":test_multiple_registered_engines", ":test_serialization", ":test_module_fallback", - ":test_example_tensors" + ":test_example_tensors", + ":test_collection" ], ) @@ -32,7 +33,8 @@ test_suite( ":test_multiple_registered_engines", ":test_serialization", ":test_module_fallback", - ":test_example_tensors" + ":test_example_tensors", + ":test_collection" ], ) @@ -122,6 +124,20 @@ cc_test( }) ) +cc_test( + name = "test_collection", + srcs = ["test_collection.cpp"], + data = [ + "//tests/modules:jit_models", + ], + deps = [ + "//tests/util", + "@googletest//:gtest_main", + ] + select({ + ":use_pre_cxx11_abi": ["@libtorch_pre_cxx11_abi//:libtorch"], + "//conditions:default": ["@libtorch//:libtorch"], + }) +) cc_test( name = "test_compiled_modules", srcs = ["test_compiled_modules.cpp"], diff --git a/tests/cpp/test_collection.cpp b/tests/cpp/test_collection.cpp new file mode 100644 index 0000000000..c269ebac17 --- /dev/null +++ b/tests/cpp/test_collection.cpp @@ -0,0 +1,363 @@ +#include +#include +#include "gtest/gtest.h" +#include "tests/util/util.h" +#include "torch/script.h" +#include "torch_tensorrt/torch_tensorrt.h" + + +TEST(CppAPITests, TestCollectionStandardTensorInput) { + + std::string path = "tests/modules/standard_tensor_input.jit.pt"; + torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); + std::vector inputs; + inputs.push_back(in0); + inputs.push_back(in0); + + torch::jit::Module mod; + try { + // Deserialize the ScriptModule from a file using torch::jit::load(). + mod = torch::jit::load(path); + } catch (const c10::Error& e) { + std::cerr << "error loading the model\n"; + } + mod.eval(); + mod.to(torch::kCUDA); + + + std::vector inputs_; + + for (auto in : inputs) { + inputs_.push_back(torch::jit::IValue(in.clone())); + } + + auto out = mod.forward(inputs_); + LOG_DEBUG("Finish torchscirpt forward"); + + std::vector input_range; + input_range.push_back({in0.sizes(), torch::kF16}); + input_range.push_back({in0.sizes(), torch::kF16}); + torch_tensorrt::ts::CompileSpec compile_settings(input_range); + compile_settings.require_full_compilation = true; + compile_settings.min_block_size = 3; + + // // FP16 execution + compile_settings.enabled_precisions = {torch::kHalf}; + // // Compile module + auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); + LOG_DEBUG("Finish compile"); + auto trt_out = trt_mod.forward(inputs_); + + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTensor(), trt_out.toTensor(), 1e-5)); +} + +TEST(CppAPITests, TestCollectionTupleInput) { + + std::string path = "tests/modules/tuple_input.jit.pt"; + torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); + + torch::jit::Module mod; + try { + // Deserialize the ScriptModule from a file using torch::jit::load(). 
+ mod = torch::jit::load(path); + } catch (const c10::Error& e) { + std::cerr << "error loading the model\n"; + } + mod.eval(); + mod.to(torch::kCUDA); + + std::vector complex_inputs, complex_inputs_list; + std::tuple input_tuple(in0, in0); + + complex_inputs.push_back(input_tuple); + + auto out = mod.forward(complex_inputs); + LOG_DEBUG("Finish torchscirpt forward"); + + auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kHalf); + + auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); + + + std::tuple input_shape_tuple(input_shape_ivalue, input_shape_ivalue); + + torch::jit::IValue complex_input_shape(input_shape_tuple); + std::tuple input_tuple2(complex_input_shape); + torch::jit::IValue complex_input_shape2(input_tuple2); + + + auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); + compile_settings.require_full_compilation = false; + compile_settings.min_block_size = 3; + + // // FP16 execution + compile_settings.enabled_precisions = {torch::kHalf}; + // // Compile module + auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); + LOG_DEBUG("Finish compile"); + auto trt_out = trt_mod.forward(complex_inputs); + + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTensor(), trt_out.toTensor(), 1e-5)); +} + + +TEST(CppAPITests, TestCollectionListInput) { + + std::string path = "tests/modules/list_input.jit.pt"; + torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); + std::vector inputs; + inputs.push_back(in0); + + torch::jit::Module mod; + try { + // Deserialize the ScriptModule from a file using torch::jit::load(). + mod = torch::jit::load(path); + } catch (const c10::Error& e) { + std::cerr << "error loading the model\n"; + } + mod.eval(); + mod.to(torch::kCUDA); + + + std::vector inputs_; + + for (auto in : inputs) { + inputs_.push_back(torch::jit::IValue(in.clone())); + } + + std::vector complex_inputs; + auto input_list = c10::impl::GenericList(c10::TensorType::get()); + input_list.push_back(inputs_[0]); + input_list.push_back(inputs_[0]); + + torch::jit::IValue input_list_ivalue = torch::jit::IValue(input_list); + + complex_inputs.push_back(input_list_ivalue); + + + auto out = mod.forward(complex_inputs); + LOG_DEBUG("Finish torchscirpt forward"); + + auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kHalf); + auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); + + c10::TypePtr elementType = input_shape_ivalue.type(); + auto list = c10::impl::GenericList(elementType); + list.push_back(input_shape_ivalue); + list.push_back(input_shape_ivalue); + + + torch::jit::IValue complex_input_shape(list); + std::tuple input_tuple2(complex_input_shape); + torch::jit::IValue complex_input_shape2(input_tuple2); + + auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); + compile_settings.require_full_compilation = false; + compile_settings.min_block_size = 3; + compile_settings.torch_executed_ops.push_back("aten::__getitem__"); + + // // FP16 execution + compile_settings.enabled_precisions = {torch::kHalf}; + // // Compile module + auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); + LOG_DEBUG("Finish compile"); + auto trt_out = trt_mod.forward(complex_inputs); + + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTensor(), trt_out.toTensor(), 1e-5)); +} + + +TEST(CppAPITests, TestCollectionTupleInputOutput) { + + 
std::string path = "tests/modules/tuple_input_output.jit.pt"; + + torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); + + torch::jit::Module mod; + try { + // Deserialize the ScriptModule from a file using torch::jit::load(). + mod = torch::jit::load(path); + } catch (const c10::Error& e) { + std::cerr << "error loading the model\n"; + } + mod.eval(); + mod.to(torch::kCUDA); + + + std::vector complex_inputs, complex_inputs_list; + std::tuple input_tuple(in0, in0); + + complex_inputs.push_back(input_tuple); + + auto out = mod.forward(complex_inputs); + LOG_DEBUG("Finish torchscirpt forward"); + + auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kHalf); + + auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); + + + std::tuple input_shape_tuple(input_shape_ivalue, input_shape_ivalue); + + torch::jit::IValue complex_input_shape(input_shape_tuple); + std::tuple input_tuple2(complex_input_shape); + torch::jit::IValue complex_input_shape2(input_tuple2); + // torch::jit::IValue complex_input_shape(list); + + auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); + compile_settings.require_full_compilation = false; + compile_settings.min_block_size = 3; + + // compile_settings.torch_executed_ops.push_back("prim::TupleConstruct"); + + // // FP16 execution + compile_settings.enabled_precisions = {torch::kHalf}; + // // Compile module + auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); + LOG_DEBUG("Finish compile"); + auto trt_out = trt_mod.forward(complex_inputs); + + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTuple()->elements()[0].toTensor(), trt_out.toTuple()->elements()[0].toTensor(), 1e-5)); + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTuple()->elements()[1].toTensor(), trt_out.toTuple()->elements()[1].toTensor(), 1e-5)); +} + + +TEST(CppAPITests, TestCollectionListInputOutput) { + + std::string path = "tests/modules/list_input_output.jit.pt"; + torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); + std::vector inputs; + inputs.push_back(in0); + + torch::jit::Module mod; + try { + // Deserialize the ScriptModule from a file using torch::jit::load(). 
+ mod = torch::jit::load(path); + } catch (const c10::Error& e) { + std::cerr << "error loading the model\n"; + } + mod.eval(); + mod.to(torch::kCUDA); + + + std::vector inputs_; + + for (auto in : inputs) { + inputs_.push_back(torch::jit::IValue(in.clone())); + } + + std::vector complex_inputs; + auto input_list = c10::impl::GenericList(c10::TensorType::get()); + input_list.push_back(inputs_[0]); + input_list.push_back(inputs_[0]); + + torch::jit::IValue input_list_ivalue = torch::jit::IValue(input_list); + + complex_inputs.push_back(input_list_ivalue); + + + auto out = mod.forward(complex_inputs); + LOG_DEBUG("Finish torchscirpt forward"); + + auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kHalf); + + auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); + + + c10::TypePtr elementType = input_shape_ivalue.type(); + auto list = c10::impl::GenericList(elementType); + list.push_back(input_shape_ivalue); + list.push_back(input_shape_ivalue); + + + torch::jit::IValue complex_input_shape(list); + std::tuple input_tuple2(complex_input_shape); + torch::jit::IValue complex_input_shape2(input_tuple2); + + auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); + compile_settings.require_full_compilation = false; + compile_settings.min_block_size = 3; + + // Need to skip the conversion of __getitem__ and ListConstruct + compile_settings.torch_executed_ops.push_back("aten::__getitem__"); + + // // FP16 execution + compile_settings.enabled_precisions = {torch::kHalf}; + // // Compile module + auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); + LOG_DEBUG("Finish compile"); + auto trt_out = trt_mod.forward(complex_inputs); + + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toList().vec()[0].toTensor(), trt_out.toList().vec()[0].toTensor(), 1e-5)); + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toList().vec()[1].toTensor(), trt_out.toList().vec()[1].toTensor(), 1e-5)); +} + + +TEST(CppAPITests, TestCollectionComplexModel) { + + std::string path = "tests/modules/complex_model.jit.pt"; + torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); + std::vector inputs; + inputs.push_back(in0); + + torch::jit::Module mod; + try { + // Deserialize the ScriptModule from a file using torch::jit::load(). 
+ mod = torch::jit::load(path); + } catch (const c10::Error& e) { + std::cerr << "error loading the model\n"; + } + mod.eval(); + mod.to(torch::kCUDA); + + + std::vector inputs_; + + for (auto in : inputs) { + inputs_.push_back(torch::jit::IValue(in.clone())); + } + + std::vector complex_inputs; + auto input_list = c10::impl::GenericList(c10::TensorType::get()); + input_list.push_back(inputs_[0]); + input_list.push_back(inputs_[0]); + + torch::jit::IValue input_list_ivalue = torch::jit::IValue(input_list); + + complex_inputs.push_back(input_list_ivalue); + + + auto out = mod.forward(complex_inputs); + LOG_DEBUG("Finish torchscirpt forward"); + + auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kHalf); + + auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); + + c10::TypePtr elementType = input_shape_ivalue.type(); + auto list = c10::impl::GenericList(elementType); + list.push_back(input_shape_ivalue); + list.push_back(input_shape_ivalue); + + + torch::jit::IValue complex_input_shape(list); + std::tuple input_tuple2(complex_input_shape); + torch::jit::IValue complex_input_shape2(input_tuple2); + + auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); + compile_settings.require_full_compilation = false; + compile_settings.min_block_size = 3; + + // Need to skip the conversion of __getitem__ and ListConstruct + compile_settings.torch_executed_ops.push_back("aten::__getitem__"); + + // // FP16 execution + compile_settings.enabled_precisions = {torch::kHalf}; + // // Compile module + auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); + LOG_DEBUG("Finish compile"); + auto trt_out = trt_mod.forward(complex_inputs); + + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTuple()->elements()[0].toTensor(), trt_out.toTuple()->elements()[0].toTensor(), 1e-5)); + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTuple()->elements()[1].toTensor(), trt_out.toTuple()->elements()[1].toTensor(), 1e-5)); +} \ No newline at end of file diff --git a/tests/modules/custom_models.py b/tests/modules/custom_models.py index 20d501045f..443dcdd11f 100644 --- a/tests/modules/custom_models.py +++ b/tests/modules/custom_models.py @@ -2,6 +2,7 @@ import torch.nn as nn from transformers import BertModel, BertTokenizer, BertConfig import torch.nn.functional as F +from typing import Tuple, List, Dict # Sample Pool Model (for testing plugin serialization) @@ -100,6 +101,66 @@ def forward(self, x, y): z = torch.cat(mod_list) return z +# Collection input/output models +class StandardTensorInput(nn.Module): + def __init__(self): + super(StandardTensorInput, self).__init__() + + def forward(self, x, y): + r = x + y + return r + +class TupleInput(nn.Module): + def __init__(self): + super(TupleInput, self).__init__() + + def forward(self, z: Tuple[torch.Tensor, torch.Tensor]): + r = z[0] + z[1] + return r + +class ListInput(nn.Module): + def __init__(self): + super(ListInput, self).__init__() + + def forward(self, z: List[torch.Tensor]): + r = z[0] + z[1] + return r + +class TupleInputOutput(nn.Module): + def __init__(self): + super(TupleInputOutput, self).__init__() + + def forward(self, z: Tuple[torch.Tensor, torch.Tensor]): + r1 = z[0] + z[1] + r2 = z[0] - z[1] + r = (r1, r2) + return r + +class ListInputOutput(nn.Module): + def __init__(self): + super(ListInputOutput, self).__init__() + + def forward(self, z: List[torch.Tensor]): + r1 = z[0] + z[1] + r2 = z[0] - z[1] + r = [r1, r2] + return r 
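The typing annotations on forward are what let torch.jit.script preserve the tuple and list structure these models exercise. A minimal sketch of how one of them becomes the serialized TorchScript module the new tests load (tests/modules/hub.py performs the actual export; the output file name here is illustrative):

    import torch
    import custom_models as cm  # the file being modified above

    scripted = torch.jit.script(cm.ListInputOutput().eval())
    scripted.save("list_input_output_scripted.jit.pt")
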
+ +class ListInputTupleOutput(nn.Module): + def __init__(self): + super(ListInputTupleOutput, self).__init__() + self.list_model = ListInputOutput() + self.tuple_model = TupleInputOutput() + + def forward(self, z: List[torch.Tensor]): + r1 = z[0] + z[1] + r2 = z[0] - z[1] + r3 = (r1, r2) + r4 = [r2, r1] + tuple_out = self.tuple_model(r3) + list_out = self.list_model(r4) + r = (tuple_out[1], list_out[0]) + return r def BertModule(): model_name = "bert-base-uncased" diff --git a/tests/modules/hub.py b/tests/modules/hub.py index 48e6b519cb..7d3e03e395 100644 --- a/tests/modules/hub.py +++ b/tests/modules/hub.py @@ -104,6 +104,30 @@ "model": cm.FallbackInplaceOPIf(), "path": "script" }, + "standard_tensor_input": { + "model": cm.StandardTensorInput(), + "path": "script" + }, + "tuple_input": { + "model": cm.TupleInput(), + "path": "script" + }, + "list_input": { + "model": cm.ListInput(), + "path": "script" + }, + "tuple_input_output": { + "model": cm.TupleInputOutput(), + "path": "script" + }, + "list_input_output": { + "model": cm.ListInputOutput(), + "path": "script" + }, + "list_input_tuple_output": { + "model": cm.ListInputTupleOutput(), + "path": "script" + }, "bert_base_uncased": { "model": cm.BertModule(), "path": "trace" @@ -193,5 +217,5 @@ def main(): f.write(record) f.truncate() - -main() +if __name__ == "__main__": + main() diff --git a/tests/modules/requirements.txt b/tests/modules/requirements.txt index d4b5105850..00acec5861 100644 --- a/tests/modules/requirements.txt +++ b/tests/modules/requirements.txt @@ -1,2 +1,3 @@ +torchvision timm==v0.4.12 transformers==4.17.0 diff --git a/tests/py/api/test_collections.py b/tests/py/api/test_collections.py new file mode 100644 index 0000000000..09cb6c4e70 --- /dev/null +++ b/tests/py/api/test_collections.py @@ -0,0 +1,147 @@ +import unittest +import torch_tensorrt as torchtrt +import torch +import torchvision.models as models + +def find_repo_root(max_depth=10): + dir_path = os.path.dirname(os.path.realpath(__file__)) + for i in range(max_depth): + files = os.listdir(dir_path) + if "WORKSPACE" in files: + return dir_path + else: + dir_path = os.path.dirname(dir_path) + + raise RuntimeError("Could not find repo root") + +MODULE_DIR = find_repo_root() + "/tests/modules" + +class TestStandardTensorInput(unittest.TestCase): + + + def test_compile(self): + + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + self.model = torch.jit.load(MODULE_DIR + "/standard_tensor_input.jit.pt").eval().to("cuda") + + compile_spec = { + "inputs": [torchtrt.Input(self.input.shape), + torchtrt.Input(self.input.shape)], + "device": torchtrt.Device("gpu:0"), + "enabled_precisions": {torch.float} + } + + trt_mod = torchtrt.ts.compile(self.model, **compile_spec) + same = (trt_mod(self.input, self.input) - self.model(self.input, self.input)).abs().max() + self.assertTrue(same < 2e-2) + +class TestTupleInput(unittest.TestCase): + + + def test_compile(self): + + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + self.model = torch.jit.load(MODULE_DIR + "/tuple_input.jit.pt").eval().to("cuda") + + compile_spec = { + "input_signature": ((torchtrt.Input(self.input.shape), torchtrt.Input(self.input.shape)),), + "device": torchtrt.Device("gpu:0"), + "enabled_precisions": {torch.float}, + "require_full_compilation": False, + "min_block_size": 3 + } + + trt_mod = torchtrt.ts.compile(self.model, **compile_spec) + same = (trt_mod((self.input, self.input)) - self.model((self.input, self.input))).abs().max() + self.assertTrue(same < 2e-2) + +class 
TestListInput(unittest.TestCase): + + + def test_compile(self): + + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + self.model = torch.jit.load(MODULE_DIR + "/list_input.jit.pt").eval().to("cuda") + + + compile_spec = { + "input_signature": ([torchtrt.Input(self.input.shape), torchtrt.Input(self.input.shape)],), + "device": torchtrt.Device("gpu:0"), + "enabled_precisions": {torch.float}, + "require_full_compilation": False, + "min_block_size": 3 + } + + trt_mod = torchtrt.ts.compile(self.model, **compile_spec) + same = (trt_mod([self.input, self.input]) - self.model([self.input, self.input])).abs().max() + self.assertTrue(same < 2e-2) + +class TestTupleInputOutput(unittest.TestCase): + + def test_compile(self): + + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + self.model = torch.jit.load(MODULE_DIR + "/tuple_input_output.jit.pt").eval().to("cuda") + + + compile_spec = { + "input_signature": ((torchtrt.Input(self.input.shape), torchtrt.Input(self.input.shape)),), + "device": torchtrt.Device("gpu:0"), + "enabled_precisions": {torch.float}, + "require_full_compilation": False, + "min_block_size": 3 + } + + trt_mod = torchtrt.ts.compile(self.model, **compile_spec) + trt_out = trt_mod((self.input, self.input)) + pyt_out = self.model((self.input, self.input)) + results = [(t - p).abs().max() < 2e-2 for (t, p) in zip(trt_out, pyt_out)] + self.assertTrue(all(results)) + +class TestListInputOutput(unittest.TestCase): + + def test_compile(self): + + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + self.model = torch.jit.load(MODULE_DIR + "/list_input_output.jit.pt").eval().to("cuda") + + + compile_spec = { + "input_signature": ([torchtrt.Input(self.input.shape), torchtrt.Input(self.input.shape)],), + "device": torchtrt.Device("gpu:0"), + "enabled_precisions": {torch.float}, + "require_full_compilation": False, + "min_block_size": 3 + } + + trt_mod = torchtrt.ts.compile(self.model, **compile_spec) + trt_out = trt_mod((self.input, self.input)) + pyt_out = self.model((self.input, self.input)) + results = [(t - p).abs().max() < 2e-2 for (t, p) in zip(trt_out, pyt_out)] + self.assertTrue(all(results)) + + +class TestListInputTupleOutput(unittest.TestCase): + + def test_compile(self): + + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + self.model = torch.jit.load(MODULE_DIR + "/list_input_tuple_output.jit.pt").eval().to("cuda") + + + compile_spec = { + "input_signature": ([torchtrt.Input(self.input.shape), torchtrt.Input(self.input.shape)],), + "device": torchtrt.Device("gpu:0"), + "enabled_precisions": {torch.float}, + "require_full_compilation": False, + "min_block_size": 3 + } + + trt_mod = torchtrt.ts.compile(self.model, **compile_spec) + trt_out = trt_mod((self.input, self.input)) + pyt_out = self.model((self.input, self.input)) + results = [(t - p).abs().max() < 2e-2 for (t, p) in zip(trt_out, pyt_out)] + self.assertTrue(all(results)) + +if __name__ == "__main__": + unittest.main() diff --git a/tests/py/model_test_case.py b/tests/py/model_test_case.py index e529f05013..1c772c1faf 100644 --- a/tests/py/model_test_case.py +++ b/tests/py/model_test_case.py @@ -1,7 +1,9 @@ import unittest import torch import torchvision.models as models +import os +REPO_ROOT = os.path.abspath(os.getcwd()) + "/../../" class ModelTestCase(unittest.TestCase): diff --git a/tests/py/requirements.txt b/tests/py/requirements.txt index 0ea1c76a29..e35531e566 100644 --- a/tests/py/requirements.txt +++ b/tests/py/requirements.txt @@ -1,2 +1 @@ -torchvision==0.13.0+cu113 --f 
https://download.pytorch.org/whl/torch_stable.html +torchvision From b26d768605619bd30e9b4c9eb0b88d6566b39a75 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Sat, 23 Jul 2022 18:16:15 -0700 Subject: [PATCH 02/16] tests: fix test model paths Signed-off-by: Naren Dasan --- tests/py/api/test_collections.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/py/api/test_collections.py b/tests/py/api/test_collections.py index 09cb6c4e70..603d44aebb 100644 --- a/tests/py/api/test_collections.py +++ b/tests/py/api/test_collections.py @@ -2,6 +2,7 @@ import torch_tensorrt as torchtrt import torch import torchvision.models as models +import os def find_repo_root(max_depth=10): dir_path = os.path.dirname(os.path.realpath(__file__)) @@ -22,7 +23,7 @@ class TestStandardTensorInput(unittest.TestCase): def test_compile(self): self.input = torch.randn((1, 3, 224, 224)).to("cuda") - self.model = torch.jit.load(MODULE_DIR + "/standard_tensor_input.jit.pt").eval().to("cuda") + self.model = torch.jit.load(MODULE_DIR + "/standard_tensor_input_scripted.jit.pt").eval().to("cuda") compile_spec = { "inputs": [torchtrt.Input(self.input.shape), @@ -41,7 +42,7 @@ class TestTupleInput(unittest.TestCase): def test_compile(self): self.input = torch.randn((1, 3, 224, 224)).to("cuda") - self.model = torch.jit.load(MODULE_DIR + "/tuple_input.jit.pt").eval().to("cuda") + self.model = torch.jit.load(MODULE_DIR + "/tuple_input_scripted.jit.pt").eval().to("cuda") compile_spec = { "input_signature": ((torchtrt.Input(self.input.shape), torchtrt.Input(self.input.shape)),), @@ -61,7 +62,7 @@ class TestListInput(unittest.TestCase): def test_compile(self): self.input = torch.randn((1, 3, 224, 224)).to("cuda") - self.model = torch.jit.load(MODULE_DIR + "/list_input.jit.pt").eval().to("cuda") + self.model = torch.jit.load(MODULE_DIR + "/list_input_scripted.jit.pt").eval().to("cuda") compile_spec = { @@ -81,7 +82,7 @@ class TestTupleInputOutput(unittest.TestCase): def test_compile(self): self.input = torch.randn((1, 3, 224, 224)).to("cuda") - self.model = torch.jit.load(MODULE_DIR + "/tuple_input_output.jit.pt").eval().to("cuda") + self.model = torch.jit.load(MODULE_DIR + "/tuple_input_output_scripted.jit.pt").eval().to("cuda") compile_spec = { @@ -103,7 +104,7 @@ class TestListInputOutput(unittest.TestCase): def test_compile(self): self.input = torch.randn((1, 3, 224, 224)).to("cuda") - self.model = torch.jit.load(MODULE_DIR + "/list_input_output.jit.pt").eval().to("cuda") + self.model = torch.jit.load(MODULE_DIR + "/list_input_output_scripted.jit.pt").eval().to("cuda") compile_spec = { @@ -126,7 +127,7 @@ class TestListInputTupleOutput(unittest.TestCase): def test_compile(self): self.input = torch.randn((1, 3, 224, 224)).to("cuda") - self.model = torch.jit.load(MODULE_DIR + "/list_input_tuple_output.jit.pt").eval().to("cuda") + self.model = torch.jit.load(MODULE_DIR + "/list_input_tuple_output_scripted.jit.pt").eval().to("cuda") compile_spec = { From b2a518383cc043e33bdcb650d35f97fddfff670b Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Sat, 23 Jul 2022 21:35:59 -0700 Subject: [PATCH 03/16] tests: Fix tests Signed-off-by: Naren Dasan --- .circleci/config.yml | 4 ++-- cpp/include/torch_tensorrt/torch_tensorrt.h | 7 +++---- tests/cpp/BUILD | 8 ++++---- .../{test_collection.cpp => test_collections.cpp} | 12 ++++++------ tests/cpp/test_example_tensors.cpp | 4 +++- tests/modules/hub.py | 8 ++++---- 6 files changed, 22 insertions(+), 21 deletions(-) rename tests/cpp/{test_collection.cpp => 
test_collections.cpp} (96%) diff --git a/.circleci/config.yml b/.circleci/config.yml index 77a1fd036f..3eda95d4f0 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -674,7 +674,7 @@ workflows: requires: - build-x86_64-pyt-release - - test-py-ts-x86_64: + - test-py-fx-x86_64: name: test-py-fx-x86_64-pyt-release channel: "release" torch-build: << pipeline.parameters.torch-release-build >> @@ -752,7 +752,7 @@ workflows: requires: - build-x86_64-pyt-release - - test-py-ts-x86_64: + - test-py-fx-x86_64: name: test-py-fx-x86_64-pyt-release channel: "release" torch-build: << pipeline.parameters.torch-release-build >> diff --git a/cpp/include/torch_tensorrt/torch_tensorrt.h b/cpp/include/torch_tensorrt/torch_tensorrt.h index 70dea51bc7..11dc5d74c6 100644 --- a/cpp/include/torch_tensorrt/torch_tensorrt.h +++ b/cpp/include/torch_tensorrt/torch_tensorrt.h @@ -569,7 +569,7 @@ struct TORCHTRT_API CompileSpec { CompileSpec(std::vector> fixed_sizes); /** - * @brief Construct a new Extra Info object + * @brief Construct a new Compile Spec object * Convienence constructor to set fixed input size from c10::ArrayRef's (the * output of tensor.sizes()) describing size of input tensors. Each entry in * the vector represents a input and should be provided in call order. @@ -583,7 +583,7 @@ struct TORCHTRT_API CompileSpec { CompileSpec(std::vector> fixed_sizes); /** - * @brief Construct a new Extra Info object from input ranges. + * @brief Construct a new Compile Spec object from input ranges. * Each entry in the vector represents a input and should be provided in call * order. * @@ -594,8 +594,7 @@ struct TORCHTRT_API CompileSpec { CompileSpec(std::vector inputs); /** - * @brief Construct a new Extra Info object from IValue. - * The IValue store a complex Input + * @brief Construct a new Compile Spec object from IValue which represents the nesting of input tensors for a module. 
* * @param input_signature */ diff --git a/tests/cpp/BUILD b/tests/cpp/BUILD index 2d545dc8f1..8e479e2e0a 100644 --- a/tests/cpp/BUILD +++ b/tests/cpp/BUILD @@ -19,7 +19,7 @@ test_suite( ":test_serialization", ":test_module_fallback", ":test_example_tensors", - ":test_collection" + ":test_collections" ], ) @@ -34,7 +34,7 @@ test_suite( ":test_serialization", ":test_module_fallback", ":test_example_tensors", - ":test_collection" + ":test_collections" ], ) @@ -125,8 +125,8 @@ cc_test( ) cc_test( - name = "test_collection", - srcs = ["test_collection.cpp"], + name = "test_collections", + srcs = ["test_collections.cpp"], data = [ "//tests/modules:jit_models", ], diff --git a/tests/cpp/test_collection.cpp b/tests/cpp/test_collections.cpp similarity index 96% rename from tests/cpp/test_collection.cpp rename to tests/cpp/test_collections.cpp index c269ebac17..df2280b947 100644 --- a/tests/cpp/test_collection.cpp +++ b/tests/cpp/test_collections.cpp @@ -8,7 +8,7 @@ TEST(CppAPITests, TestCollectionStandardTensorInput) { - std::string path = "tests/modules/standard_tensor_input.jit.pt"; + std::string path = "tests/modules/standard_tensor_input_scripted.jit.pt"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); std::vector inputs; inputs.push_back(in0); @@ -53,7 +53,7 @@ TEST(CppAPITests, TestCollectionStandardTensorInput) { TEST(CppAPITests, TestCollectionTupleInput) { - std::string path = "tests/modules/tuple_input.jit.pt"; + std::string path = "tests/modules/tuple_input_scripted.jit.pt"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); torch::jit::Module mod; @@ -103,7 +103,7 @@ TEST(CppAPITests, TestCollectionTupleInput) { TEST(CppAPITests, TestCollectionListInput) { - std::string path = "tests/modules/list_input.jit.pt"; + std::string path = "tests/modules/list_input_scripted.jit.pt"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); std::vector inputs; inputs.push_back(in0); @@ -169,7 +169,7 @@ TEST(CppAPITests, TestCollectionListInput) { TEST(CppAPITests, TestCollectionTupleInputOutput) { - std::string path = "tests/modules/tuple_input_output.jit.pt"; + std::string path = "tests/modules/tuple_input_output_scripted.jit.pt"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); @@ -224,7 +224,7 @@ TEST(CppAPITests, TestCollectionTupleInputOutput) { TEST(CppAPITests, TestCollectionListInputOutput) { - std::string path = "tests/modules/list_input_output.jit.pt"; + std::string path = "tests/modules/list_input_output_scripted.jit.pt"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); std::vector inputs; inputs.push_back(in0); @@ -295,7 +295,7 @@ TEST(CppAPITests, TestCollectionListInputOutput) { TEST(CppAPITests, TestCollectionComplexModel) { - std::string path = "tests/modules/complex_model.jit.pt"; + std::string path = "tests/modules/list_input_tuple_output_scripted.jit.pt"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); std::vector inputs; inputs.push_back(in0); diff --git a/tests/cpp/test_example_tensors.cpp b/tests/cpp/test_example_tensors.cpp index 6561cd16a0..3ec8831f9d 100644 --- a/tests/cpp/test_example_tensors.cpp +++ b/tests/cpp/test_example_tensors.cpp @@ -9,7 +9,9 @@ TEST_P(CppAPITests, InputsFromTensors) { trt_inputs_ivalues.push_back(in.clone()); } - auto spec = torch_tensorrt::ts::CompileSpec({trt_inputs_ivalues[0].toTensor()}); + + auto inputs = 
std::vector{trt_inputs_ivalues[0].toTensor()}; + auto spec = torch_tensorrt::ts::CompileSpec(inputs); auto trt_mod = torch_tensorrt::ts::compile(mod, spec); torch::jit::IValue trt_results_ivalues = torch_tensorrt::tests::util::RunModuleForward(trt_mod, trt_inputs_ivalues); diff --git a/tests/modules/hub.py b/tests/modules/hub.py index 7d3e03e395..3ad92ff79a 100644 --- a/tests/modules/hub.py +++ b/tests/modules/hub.py @@ -128,10 +128,10 @@ "model": cm.ListInputTupleOutput(), "path": "script" }, - "bert_base_uncased": { - "model": cm.BertModule(), - "path": "trace" - } + #"bert_base_uncased": { + # "model": cm.BertModule(), + # "path": "trace" + #} } From 8385253db173d2898d7dd0b934c798860e1cbd8a Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Tue, 26 Jul 2022 11:45:00 -0700 Subject: [PATCH 04/16] chore: Update generateRandomTensors uses Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/compiler.cpp | 2 +- core/conversion/evaluators/aten.cpp | 15 +++++++++++++++ core/ir/GraphInputs.cpp | 4 ++-- core/partitioning/partitioning.cpp | 6 ------ core/partitioning/shape_analysis.cpp | 6 +++--- .../test_resolve_nontensor_inputs.cpp | 10 +++++----- tests/modules/hub.py | 8 ++++---- 7 files changed, 30 insertions(+), 21 deletions(-) diff --git a/core/compiler.cpp b/core/compiler.cpp index 214443a9c6..e44ece5c27 100644 --- a/core/compiler.cpp +++ b/core/compiler.cpp @@ -322,7 +322,7 @@ void MapInputsAndDetermineDTypes( est_type_opt = first_use_type_map.find(in)->second; } // traverse elements in est_type_out and spec - for (int i = 0; i < est_type_opt.size(); i++) { + for (size_t i = 0; i < est_type_opt.size(); i++) { if (est_type_opt[i] && !spec[i].dtype_is_user_defined) { // If we can calculate the type from the graph and the type was not defined by the user then use the calculated // type diff --git a/core/conversion/evaluators/aten.cpp b/core/conversion/evaluators/aten.cpp index 7bb1f6d202..4632744790 100644 --- a/core/conversion/evaluators/aten.cpp +++ b/core/conversion/evaluators/aten.cpp @@ -264,6 +264,21 @@ auto aten_registrations TORCHTRT_UNUSED = }, EvalOptions().validSchemas( {"aten::size(Tensor self) -> (int[])", "aten::size.int(Tensor self, int dim) -> (int)"})}) + .evaluator({c10::Symbol::fromQualString("aten::__getitem__"), + [](const torch::jit::Node* n, kwargs& args) -> c10::optional { + auto list = args.at(n->input(0)).IValue()->to>(); + auto idx = args.at(n->input(1)).unwrapToInt(); + + const int64_t list_size = list.size(); + const int64_t normalized_idx = normalizeIndex(idx, list_size); + TORCHTRT_CHECK( + normalized_idx >= 0 || normalized_idx < list_size, + "List index out of range (aten::__getitem__)"); + return list.get(normalized_idx); + }, + EvalOptions().validSchemas({ + "aten::__getitem__.t(t[](a) list, int idx) -> (t(*))", + })}) .evaluator({c10::Symbol::fromQualString("aten::append"), [](const torch::jit::Node* n, kwargs& args) -> c10::optional { auto list = args.at(n->input(0)).IValue()->to>(); diff --git a/core/ir/GraphInputs.cpp b/core/ir/GraphInputs.cpp index 792189137a..007a7279e7 100644 --- a/core/ir/GraphInputs.cpp +++ b/core/ir/GraphInputs.cpp @@ -5,7 +5,7 @@ namespace torch_tensorrt { namespace core { namespace ir { -void flatten_dfs(std::vector& flattened_inputs, std::vector>& collection_inputs, +void flatten_dfs(std::vector& flattened_inputs, std::vector>& collection_inputs, torch::jit::IValue input_ivalue, int level, int index) { if (input_ivalue.isTuple()) { auto input_tuple = input_ivalue.toTuple(); @@ -53,7 +53,7 @@ 
GraphInputs::GraphInputs(std::vector inputs_) { LOG_DEBUG("Construct GraphInput with ir::Input"); inputs = inputs_; collection_inputs.resize(inputs_.size()); - for (int i = 0; i < inputs_.size(); i++) { + for (size_t i = 0; i < inputs_.size(); i++) { collection_inputs[i].push_back(inputs_[i]); } } diff --git a/core/partitioning/partitioning.cpp b/core/partitioning/partitioning.cpp index f14d5438c6..dc7ef1f7ac 100644 --- a/core/partitioning/partitioning.cpp +++ b/core/partitioning/partitioning.cpp @@ -400,15 +400,9 @@ PartitionedGraph segment_graph( if (n->kind() == torch::jit::prim::Constant) { continue; } -<<<<<<< HEAD - - if (check_node_fallback(n, global_fallback_nodes)) { - in_prog_trt_blk_nodes.push_back(n); -======= // the outputs of trt subgraph shouldn't be collections if (should_run_in_trt(n, forced_fallback_ops) && !(in_prog_trt_blk_nodes.size() == 0 && is_collection(n))) { in_prog_trt_blk_nodes.insert(in_prog_trt_blk_nodes.begin(), n); ->>>>>>> feat: support for grouped inputs // If there is an active PyTorch block and we have passed the threshold for a valid TRT // block then segment and reset the active PyTorch block diff --git a/core/partitioning/shape_analysis.cpp b/core/partitioning/shape_analysis.cpp index 22c3ea104f..1221318647 100644 --- a/core/partitioning/shape_analysis.cpp +++ b/core/partitioning/shape_analysis.cpp @@ -39,7 +39,7 @@ std::unordered_map generateRandomI std::vector list; c10::TypePtr elementType = c10::TensorType::get(); auto generic_list = c10::impl::GenericList(elementType); - for (int i = 0; i < input.second.size(); i++) { + for (size_t i = 0; i < input.second.size(); i++) { auto in = generateSingleInput(input.second[i], types[input.first][i]); generic_list.push_back(in.clone()); } @@ -47,7 +47,7 @@ std::unordered_map generateRandomI } else if (input.first->type()->kind() == torch::jit::TypeKind::TupleType) { // create tuple std::vector list; - for (int i = 0; i < input.second.size(); i++) { + for (size_t i = 0; i < input.second.size(); i++) { auto in = generateSingleInput(input.second[i], types[input.first][i]); list.push_back(in.clone()); } @@ -56,7 +56,7 @@ std::unordered_map generateRandomI } else { auto in = generateSingleInput(input.second[0], types[input.first][0]); ivalue_map[input.first] = in.clone(); - + } } return ivalue_map; diff --git a/tests/core/partitioning/test_resolve_nontensor_inputs.cpp b/tests/core/partitioning/test_resolve_nontensor_inputs.cpp index 2d0255f130..1f3ee3b051 100644 --- a/tests/core/partitioning/test_resolve_nontensor_inputs.cpp +++ b/tests/core/partitioning/test_resolve_nontensor_inputs.cpp @@ -116,7 +116,7 @@ TEST(Partitioning, ResolveNonTensorInputsCorrectly) { inputs.push_back(torch_tensorrt::core::ir::Input({16, 3, 3, 3})); inputs.push_back(torch_tensorrt::core::ir::Input({16})); - std::unordered_map> inputs_map; + torch_tensorrt::core::ir::CollectionInputSpecMap inputs_map; std::unordered_map>> input_types; for (size_t i = 0; i < g->inputs().size(); ++i) { inputs_map.insert({g->inputs()[i], {inputs[i]}}); @@ -367,11 +367,11 @@ TEST(Partitioning, ResolveOnlyNeccessaryNonTensorInputs) { inputs.push_back(torch_tensorrt::core::ir::Input({4, 4})); inputs.push_back(torch_tensorrt::core::ir::Input({4, 4})); - std::unordered_map inputs_map; - std::unordered_map> input_types; + torch_tensorrt::core::ir::CollectionInputSpecMap inputs_map; + std::unordered_map>> input_types; for (size_t i = 0; i < g->inputs().size(); ++i) { - inputs_map.insert({g->inputs()[i], inputs[i]}); - input_types.insert({g->inputs()[i], 
{at::kFloat}}); + inputs_map.insert({g->inputs()[i], {inputs[i]}}); + input_types.insert({g->inputs()[i], {{at::kFloat}}}); } auto input_ivalues_map = torch_tensorrt::core::partitioning::generateRandomInputs(inputs_map, input_types); std::unordered_map fallback_nodes; diff --git a/tests/modules/hub.py b/tests/modules/hub.py index 3ad92ff79a..7d3e03e395 100644 --- a/tests/modules/hub.py +++ b/tests/modules/hub.py @@ -128,10 +128,10 @@ "model": cm.ListInputTupleOutput(), "path": "script" }, - #"bert_base_uncased": { - # "model": cm.BertModule(), - # "path": "trace" - #} + "bert_base_uncased": { + "model": cm.BertModule(), + "path": "trace" + } } From d479c9854a2976b6620a7c7e1e020bf89f333702 Mon Sep 17 00:00:00 2001 From: Bo Wang Date: Tue, 26 Jul 2022 17:02:02 -0700 Subject: [PATCH 05/16] fix: fix the fallback related issue after merging collection Signed-off-by: Bo Wang --- core/partitioning/partitioning.cpp | 70 ++++++++---------------------- 1 file changed, 19 insertions(+), 51 deletions(-) diff --git a/core/partitioning/partitioning.cpp b/core/partitioning/partitioning.cpp index dc7ef1f7ac..1a7a4777de 100644 --- a/core/partitioning/partitioning.cpp +++ b/core/partitioning/partitioning.cpp @@ -17,22 +17,17 @@ struct usage_info { std::vector tensorrt_use_id; // ids of segmented blocks which are of type TensorRT }; -inline bool isTensorOrTensorList(torch::jit::Value* val) { - return val->type()->isSubtypeOf(torch::jit::TensorType::get()) || - val->type()->isSubtypeOf(torch::jit::ListType::ofTensors()); -} - -inline bool isTensorList(torch::jit::Value* val) { - return val->type()->isSubtypeOf(torch::jit::ListType::ofTensors()); -} - inline bool isTensor(torch::jit::Value* val) { return val->type()->isSubtypeOf(torch::jit::TensorType::get()); } +inline bool isListOrTuple(torch::jit::Value* val) { + return val->type()->kind() == torch::jit::TypeKind::TupleType || val->type()->kind() == torch::jit::TypeKind::ListType; +} + bool containNonTensorOutputs(torch::jit::Node* n) { for (auto output : n->outputs()) { - if (!isTensorOrTensorList(output)) { + if (!isTensor(output)) { return true; } } @@ -68,6 +63,7 @@ std::vector findModifyingNodes( return modifying_nodes; } +// this function is only used when a TRT segment produces nonTensor values which are used by later TRT segment std::vector getDependencyNodes( const std::vector& vals, const SegmentedBlock& seg_block) { @@ -88,7 +84,7 @@ std::vector getDependencyNodes( stk.insert(stk.end(), modifying_nodes.rbegin(), modifying_nodes.rend()); stk.push_back(node); for (auto input : node->inputs()) { - if (!isTensorOrTensorList(input)) { + if (!isTensor(input)) { q.push(input); } } @@ -113,6 +109,8 @@ void find_all_fallback_nodes( auto cur_node = q.front(); q.pop(); // for every node that produces this fallback node's NonTensor input, they should fallback too + // Even collection feature is supported, since TRT List/Tuple output is not supported yet, the nodes + // that produce List/Tuple still cannot be in TRT segment for (auto input : cur_node->inputs()) { if (!isTensor(input) && input->node()->kind() != torch::jit::prim::Constant && global_fallback_nodes.insert({input->node(), FallbackNodeType::kNON_TENSOR}).second) { @@ -120,8 +118,10 @@ void find_all_fallback_nodes( } } // for every node that consumes this fallback node's NonTensor output, they should fallback too + // Since collection feature is supported, we can have List/Tuple input for TRT segment, so we only + // fallback the nodes that take inputs which are not Tensor/List/Tuple for (auto 
output : cur_node->outputs()) { - if (!isTensor(output)) { + if (!isTensor(output) && !isListOrTuple(output)) { for (auto use : output->uses()) { auto node = use.user; if (node->kind() != torch::jit::prim::Constant && global_fallback_nodes.insert({node, FallbackNodeType::kNON_TENSOR}).second) { @@ -176,7 +176,7 @@ void registerSegmentsOutputs(PartitionedGraph& segmented_blocks, torch::jit::Blo if (std::find(seg_block.raw_inputs().begin(), seg_block.raw_inputs().end(), mini_graph_input) == seg_block.raw_inputs().end() && seg_block.contain_raw_value(mini_graph_input)) { - if (!isTensorOrTensorList(mini_graph_input) && seg_block.target() == SegmentedBlock::kTensorRT) + if (!isTensor(mini_graph_input) && seg_block.target() == SegmentedBlock::kTensorRT) continue; seg_block.registerOutput(mini_graph_input); } @@ -242,36 +242,6 @@ bool check_node_fallback(torch::jit::Node* n, const std::unordered_mapoutputs()) { - if(out->type()->kind() == torch::jit::TypeKind::TupleType || out->type()->kind() == torch::jit::TypeKind::ListType) { - return true; - } - } - return false; -} - -bool should_run_in_trt(torch::jit::Node* n, const std::unordered_set& torch_ops) { - // If the op is not supported by the conversion phase it should run in PyTorch - if (!conversion::OpSupported(n)) { - LOG_GRAPH("Node not supported by conversion: " << util::node_info(n)); - return false; - } - - // If the user specifies the op to run in Torch it should run in PyTorch - if (torch_ops.find(n->kind().toQualString()) != torch_ops.end()) { - LOG_GRAPH("Node explicitly set to run in torch: " << util::node_info(n)); - return false; - } - - // If the user specifies the module containing this op to run in torch it should run in PyTorch - const auto to_compile_sym = c10::Symbol::attr("to_compile"); - if (n->hasAttribute(to_compile_sym) && n->i(to_compile_sym) == (int64_t) false) { - LOG_GRAPH("Node is within a module set to run in torch: " << util::node_info(n)); return false; } @@ -390,19 +360,18 @@ PartitionedGraph segment_graph( find_min_block_size_fallback_nodes(block, global_fallback_nodes, min_block_size); auto nodes = block->nodes(); - auto reverse_nodes = nodes.reverse(); // merge from output side to input side PartitionedGraph segmented_blocks; // segment the nodes std::vector in_prog_trt_blk_nodes, in_prog_pyt_blk_nodes; - for (const auto n : reverse_nodes) { + for (const auto n : nodes) { // Skip constant nodes as they are resources for both kinds of modules if (n->kind() == torch::jit::prim::Constant) { continue; } // the outputs of trt subgraph shouldn't be collections - if (should_run_in_trt(n, forced_fallback_ops) && !(in_prog_trt_blk_nodes.size() == 0 && is_collection(n))) { - in_prog_trt_blk_nodes.insert(in_prog_trt_blk_nodes.begin(), n); + if (check_node_fallback(n, global_fallback_nodes)) { + in_prog_trt_blk_nodes.push_back(n); // If there is an active PyTorch block and we have passed the threshold for a valid TRT // block then segment and reset the active PyTorch block @@ -418,7 +387,7 @@ PartitionedGraph segment_graph( LOG_DEBUG( "In progress TRT block does not meet minimum block size requirements, therefore folding into in progress PyTorch block"); in_prog_pyt_blk_nodes.insert( - in_prog_pyt_blk_nodes.begin(), in_prog_trt_blk_nodes.begin(), in_prog_trt_blk_nodes.end()); + in_prog_pyt_blk_nodes.end(), in_prog_trt_blk_nodes.begin(), in_prog_trt_blk_nodes.end()); } in_prog_trt_blk_nodes.clear(); // if there is a prim::If then this if node will be encapsulated in a SegmentedBlock @@ -437,14 +406,14 @@ PartitionedGraph 
segment_graph( finalize_block(segmented_blocks, SegmentedBlock::kTorch, in_prog_pyt_blk_nodes); } if (checkLoopEvaluatable(n)) { - in_prog_trt_blk_nodes.insert(in_prog_trt_blk_nodes.begin(), n); + in_prog_trt_blk_nodes.push_back(n); } else { auto loop_node = std::vector{n}; finalize_block(segmented_blocks, SegmentedBlock::kTorch, loop_node); } continue; } - in_prog_pyt_blk_nodes.insert(in_prog_pyt_blk_nodes.begin(), n); + in_prog_pyt_blk_nodes.push_back(n); } } @@ -459,7 +428,6 @@ PartitionedGraph segment_graph( in_prog_pyt_blk_nodes.end(), in_prog_trt_blk_nodes.begin(), in_prog_trt_blk_nodes.end()); finalize_block(segmented_blocks, SegmentedBlock::kTorch, in_prog_pyt_blk_nodes); } - std::reverse(segmented_blocks.begin(), segmented_blocks.end()); return segmented_blocks; } From b7178ffd055256de210d3d7ab08c23ed15dc90bf Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Wed, 27 Jul 2022 12:04:30 -0700 Subject: [PATCH 06/16] feat: Better input signature logging Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- py/torch_tensorrt/csrc/tensorrt_classes.cpp | 112 ++++++++++++++------ tests/modules/custom_models.py | 1 + 2 files changed, 79 insertions(+), 34 deletions(-) diff --git a/py/torch_tensorrt/csrc/tensorrt_classes.cpp b/py/torch_tensorrt/csrc/tensorrt_classes.cpp index 9eb58b3e73..ca11cf4bc1 100644 --- a/py/torch_tensorrt/csrc/tensorrt_classes.cpp +++ b/py/torch_tensorrt/csrc/tensorrt_classes.cpp @@ -104,10 +104,54 @@ std::string Input::to_str() { return ss.str(); } +std::string sig_to_str(torch::jit::IValue input_sig) { + if (input_sig.isTuple()) { + auto input_tuple = input_sig.toTuple(); + std::vector children; + for (auto item: input_tuple->elements()) { + auto child = sig_to_str(item); + children.push_back(child); + } + std::stringstream ss; + ss << "("; + for (auto i : children) { + ss << i << ", "; + } + ss << ")"; + return ss.str(); + } else if(input_sig.isList()) { + auto input_list = input_sig.toList().vec(); + std::vector children; + for (auto item: input_list) { + auto child = sig_to_str(item); + children.push_back(child); + } + std::stringstream ss; + ss << "["; + for (auto i : children) { + ss << i << ", "; + } + ss << "]"; + return ss.str(); + } else if(input_sig.isCustomClass()) { + auto cur_input = input_sig.toCustomClass(); + return cur_input->to_str(); + } else if(input_sig.isPyObject()) { + auto py_object_holder = input_sig.toPyObjectHolder(); + auto infer_type = py_object_holder->tryToInferType(); + auto type = infer_type.type(); + torch::jit::IValue ival = py_object_holder->toIValue(type); + torch::jit::IValue converted_item; + return sig_to_str(ival); + } else { + LOG_ERROR("Unknown input spec type"); + return ""; + } +} + std::string InputSignature::to_str() { std::stringstream ss; - ss << signature_ivalue; - return ss.str(); + return sig_to_str(signature_ivalue); } std::string to_str(DeviceType value) { @@ -191,40 +235,40 @@ std::string TorchFallback::to_str() { } void to_internal_input_signature(torch::jit::IValue input_ivalue, torch::jit::IValue& converted_ivalue) { - if (input_ivalue.isTuple()) { - auto input_tuple = input_ivalue.toTuple(); - std::vector converted_elements; - for (auto item: input_tuple->elements()) { - torch::jit::IValue converted_item; - to_internal_input_signature(item, converted_item); - converted_elements.push_back(converted_item); - auto tuple_ptr = c10::ivalue::Tuple::create(converted_elements); - converted_ivalue = torch::jit::IValue(tuple_ptr); - } - } else if(input_ivalue.isList()) { - auto input_list = 
input_ivalue.toList().vec(); - c10::TypePtr type = input_list[0].type(); - auto converted_elements = c10::impl::GenericList(type); - for (auto item: input_list) { - torch::jit::IValue converted_item; - to_internal_input_signature(item, converted_item); - converted_elements.push_back(converted_item); - } - converted_ivalue = torch::jit::IValue(converted_elements); - } else if(input_ivalue.isCustomClass()) { - core::ir::Input cur_input = (*(input_ivalue.toCustomClass())).toInternalInput(); - converted_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(cur_input))); - } else if(input_ivalue.isPyObject()) { - auto py_object_holder = input_ivalue.toPyObjectHolder(); - auto infer_type = py_object_holder->tryToInferType(); - auto type = infer_type.type(); - torch::jit::IValue ival = py_object_holder->toIValue(type); + if (input_ivalue.isTuple()) { + auto input_tuple = input_ivalue.toTuple(); + std::vector converted_elements; + for (auto item: input_tuple->elements()) { torch::jit::IValue converted_item; - to_internal_input_signature(ival, converted_item); - converted_ivalue = torch::jit::IValue(converted_item); - } else { - LOG_ERROR("Unknown input spec type"); + to_internal_input_signature(item, converted_item); + converted_elements.push_back(converted_item); + auto tuple_ptr = c10::ivalue::Tuple::create(converted_elements); + converted_ivalue = torch::jit::IValue(tuple_ptr); } + } else if(input_ivalue.isList()) { + auto input_list = input_ivalue.toList().vec(); + c10::TypePtr type = input_list[0].type(); + auto converted_elements = c10::impl::GenericList(type); + for (auto item: input_list) { + torch::jit::IValue converted_item; + to_internal_input_signature(item, converted_item); + converted_elements.push_back(converted_item); + } + converted_ivalue = torch::jit::IValue(converted_elements); + } else if(input_ivalue.isCustomClass()) { + core::ir::Input cur_input = (*(input_ivalue.toCustomClass())).toInternalInput(); + converted_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(cur_input))); + } else if(input_ivalue.isPyObject()) { + auto py_object_holder = input_ivalue.toPyObjectHolder(); + auto infer_type = py_object_holder->tryToInferType(); + auto type = infer_type.type(); + torch::jit::IValue ival = py_object_holder->toIValue(type); + torch::jit::IValue converted_item; + to_internal_input_signature(ival, converted_item); + converted_ivalue = torch::jit::IValue(converted_item); + } else { + LOG_ERROR("Unknown input spec type"); + } } core::CompileSpec init_compile_spec(CompileSpec external) { diff --git a/tests/modules/custom_models.py b/tests/modules/custom_models.py index 443dcdd11f..a92e01e7a4 100644 --- a/tests/modules/custom_models.py +++ b/tests/modules/custom_models.py @@ -133,6 +133,7 @@ def __init__(self): def forward(self, z: Tuple[torch.Tensor, torch.Tensor]): r1 = z[0] + z[1] r2 = z[0] - z[1] + r1 = r1 * 10 r = (r1, r2) return r From 418d1e5646a5e8749c2b9b9849aa1ba94b9835ce Mon Sep 17 00:00:00 2001 From: Bo Wang Date: Wed, 27 Jul 2022 18:37:38 -0700 Subject: [PATCH 07/16] refactor: still fallback when a trt segment has tuple/list input/output Signed-off-by: Bo Wang --- core/partitioning/partitioning.cpp | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/core/partitioning/partitioning.cpp b/core/partitioning/partitioning.cpp index 1a7a4777de..85626772f0 100644 --- a/core/partitioning/partitioning.cpp +++ b/core/partitioning/partitioning.cpp @@ -21,10 +21,6 @@ inline bool isTensor(torch::jit::Value* val) { return 
val->type()->isSubtypeOf(torch::jit::TensorType::get()); } -inline bool isListOrTuple(torch::jit::Value* val) { - return val->type()->kind() == torch::jit::TypeKind::TupleType || val->type()->kind() == torch::jit::TypeKind::ListType; -} - bool containNonTensorOutputs(torch::jit::Node* n) { for (auto output : n->outputs()) { if (!isTensor(output)) { @@ -109,8 +105,6 @@ void find_all_fallback_nodes( auto cur_node = q.front(); q.pop(); // for every node that produces this fallback node's NonTensor input, they should fallback too - // Even collection feature is supported, since TRT List/Tuple output is not supported yet, the nodes - // that produce List/Tuple still cannot be in TRT segment for (auto input : cur_node->inputs()) { if (!isTensor(input) && input->node()->kind() != torch::jit::prim::Constant && global_fallback_nodes.insert({input->node(), FallbackNodeType::kNON_TENSOR}).second) { @@ -118,13 +112,12 @@ void find_all_fallback_nodes( } } // for every node that consumes this fallback node's NonTensor output, they should fallback too - // Since collection feature is supported, we can have List/Tuple input for TRT segment, so we only - // fallback the nodes that take inputs which are not Tensor/List/Tuple for (auto output : cur_node->outputs()) { - if (!isTensor(output) && !isListOrTuple(output)) { + if (!isTensor(output)) { for (auto use : output->uses()) { auto node = use.user; - if (node->kind() != torch::jit::prim::Constant && global_fallback_nodes.insert({node, FallbackNodeType::kNON_TENSOR}).second) { + if (node->kind() != torch::jit::prim::Constant && + global_fallback_nodes.insert({node, FallbackNodeType::kNON_TENSOR}).second) { q.push(node); } } From c9d4788a8fb046a93dec6e6732d6d6876a83276a Mon Sep 17 00:00:00 2001 From: Bo Wang Date: Wed, 27 Jul 2022 18:37:38 -0700 Subject: [PATCH 08/16] refactor: still fallback when a trt segment has tuple/list input/output Signed-off-by: Bo Wang --- core/partitioning/partitioning.cpp | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/core/partitioning/partitioning.cpp b/core/partitioning/partitioning.cpp index 1a7a4777de..85626772f0 100644 --- a/core/partitioning/partitioning.cpp +++ b/core/partitioning/partitioning.cpp @@ -21,10 +21,6 @@ inline bool isTensor(torch::jit::Value* val) { return val->type()->isSubtypeOf(torch::jit::TensorType::get()); } -inline bool isListOrTuple(torch::jit::Value* val) { - return val->type()->kind() == torch::jit::TypeKind::TupleType || val->type()->kind() == torch::jit::TypeKind::ListType; -} - bool containNonTensorOutputs(torch::jit::Node* n) { for (auto output : n->outputs()) { if (!isTensor(output)) { @@ -109,8 +105,6 @@ void find_all_fallback_nodes( auto cur_node = q.front(); q.pop(); // for every node that produces this fallback node's NonTensor input, they should fallback too - // Even collection feature is supported, since TRT List/Tuple output is not supported yet, the nodes - // that produce List/Tuple still cannot be in TRT segment for (auto input : cur_node->inputs()) { if (!isTensor(input) && input->node()->kind() != torch::jit::prim::Constant && global_fallback_nodes.insert({input->node(), FallbackNodeType::kNON_TENSOR}).second) { @@ -118,13 +112,12 @@ void find_all_fallback_nodes( } } // for every node that consumes this fallback node's NonTensor output, they should fallback too - // Since collection feature is supported, we can have List/Tuple input for TRT segment, so we only - // fallback the nodes that take inputs which are not Tensor/List/Tuple for (auto output : 
cur_node->outputs()) { - if (!isTensor(output) && !isListOrTuple(output)) { + if (!isTensor(output)) { for (auto use : output->uses()) { auto node = use.user; - if (node->kind() != torch::jit::prim::Constant && global_fallback_nodes.insert({node, FallbackNodeType::kNON_TENSOR}).second) { + if (node->kind() != torch::jit::prim::Constant && + global_fallback_nodes.insert({node, FallbackNodeType::kNON_TENSOR}).second) { q.push(node); } } From 5cff25728e3a2583e2390209a967a930118e3f45 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Thu, 28 Jul 2022 15:42:01 -0700 Subject: [PATCH 09/16] chore: Apply liniting Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/compiler.cpp | 121 ++++---- core/conversion/conversion.cpp | 15 +- .../conversionctx/ConversionCtx.cpp | 8 +- core/conversion/converters/converter_util.cpp | 21 +- core/conversion/converters/converter_util.h | 2 +- core/conversion/converters/impl/select.cpp | 267 +++++++++--------- core/ir/GraphInputs.cpp | 114 ++++---- core/ir/StaticParams.cpp | 3 +- core/ir/ir.cpp | 24 +- core/ir/ir.h | 10 +- core/partitioning/shape_analysis.cpp | 33 +-- core/partitioning/shape_analysis.h | 1 - cpp/bin/torchtrtc/main.cpp | 3 +- cpp/include/torch_tensorrt/torch_tensorrt.h | 4 +- cpp/src/compile_spec.cpp | 52 ++-- cpp/src/torch_tensorrt.cpp | 3 +- .../csrc/register_tensorrt_classes.cpp | 6 +- py/torch_tensorrt/csrc/tensorrt_classes.cpp | 38 +-- py/torch_tensorrt/csrc/tensorrt_classes.h | 2 +- py/torch_tensorrt/csrc/torch_tensorrt_py.cpp | 5 +- .../core/conversion/converters/test_cast.cpp | 2 - tests/cpp/test_collections.cpp | 44 +-- tests/cpp/test_example_tensors.cpp | 1 - tools/linter/utils.py | 4 +- 24 files changed, 389 insertions(+), 394 deletions(-) diff --git a/core/compiler.cpp b/core/compiler.cpp index e44ece5c27..caee900879 100644 --- a/core/compiler.cpp +++ b/core/compiler.cpp @@ -308,70 +308,78 @@ void MapInputsAndDetermineDTypes( std::shared_ptr& g, ir::StaticParams& static_params, ir::CollectionTypeMap& first_use_type_map) { - cfg.convert_info.collection_input_spec_map = std::move(ir::associate_specs_with_collection_inputs(g, cfg.graph_inputs, static_params)); + cfg.convert_info.collection_input_spec_map = + std::move(ir::associate_specs_with_collection_inputs(g, cfg.graph_inputs, static_params)); - auto collection_inputs = ir::get_collection_inputs(g, static_params); - LOG_DEBUG("In MapInputsAndDetermineDTypes, the g->inputs() size is " << g->inputs().size() << ", CollectionInputSpecMap size is" << collection_inputs.size()); + auto collection_inputs = ir::get_collection_inputs(g, static_params); + LOG_DEBUG( + "In MapInputsAndDetermineDTypes, the g->inputs() size is " + << g->inputs().size() << ", CollectionInputSpecMap size is" << collection_inputs.size()); - for (auto in : collection_inputs) { - std::vector& spec = cfg.convert_info.collection_input_spec_map.find(in)->second; - std::vector> est_type_opt; + for (auto in : collection_inputs) { + std::vector& spec = cfg.convert_info.collection_input_spec_map.find(in)->second; + std::vector> est_type_opt; - auto est_it = first_use_type_map.find(in); - if (est_it != first_use_type_map.end()) { - est_type_opt = first_use_type_map.find(in)->second; - } - // traverse elements in est_type_out and spec - for (size_t i = 0; i < est_type_opt.size(); i++) { - if (est_type_opt[i] && !spec[i].dtype_is_user_defined) { - // If we can calculate the type from the graph and the type was not defined by the user then use the calculated - // type - LOG_INFO( - "Since input type is not explicitly 
defined, infering using first tensor calculation\n Inferred input " - << in->debugName() << " has type " << est_type_opt[i].value()); - spec[i].dtype = util::ScalarTypeToTRTDataType(est_type_opt[i].value()); - } else if (!est_type_opt[i] && !spec[i].dtype_is_user_defined) { - // If we cannot calculate the type and the user did not define the type, then default to FP32 - LOG_WARNING( - "Cannot infer input type from calcuations in graph for input " - << in->debugName() << ". Assuming it is Float32. If not, specify input type explicity"); - spec[i].dtype = nvinfer1::DataType::kFLOAT; - } else if (spec[i].dtype_is_user_defined && cfg.partition_info.enabled) { - if (!est_type_opt[i]) { - LOG_INFO("Cannot infer input tensor dtype in graph, compiler is going to use the user setting"); + auto est_it = first_use_type_map.find(in); + if (est_it != first_use_type_map.end()) { + est_type_opt = first_use_type_map.find(in)->second; + } + // traverse elements in est_type_out and spec + for (size_t i = 0; i < est_type_opt.size(); i++) { + if (est_type_opt[i] && !spec[i].dtype_is_user_defined) { + // If we can calculate the type from the graph and the type was not defined by the user then use the calculated + // type + LOG_INFO( + "Since input type is not explicitly defined, infering using first tensor calculation\n Inferred input " + << in->debugName() << " has type " << est_type_opt[i].value()); + spec[i].dtype = util::ScalarTypeToTRTDataType(est_type_opt[i].value()); + } else if (!est_type_opt[i] && !spec[i].dtype_is_user_defined) { + // If we cannot calculate the type and the user did not define the type, then default to FP32 + LOG_WARNING( + "Cannot infer input type from calcuations in graph for input " + << in->debugName() << ". Assuming it is Float32. If not, specify input type explicity"); + spec[i].dtype = nvinfer1::DataType::kFLOAT; + } else if (spec[i].dtype_is_user_defined && cfg.partition_info.enabled) { + if (!est_type_opt[i]) { + LOG_INFO("Cannot infer input tensor dtype in graph, compiler is going to use the user setting"); + std::stringstream ss; + ss << "For input " << in->debugName() << ", found user specified input dtype as "; + ss << cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype; + ss << ". The compiler is going to use the user setting " + << cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype; + auto warn_str = ss.str(); + LOG_WARNING(warn_str); + // Overwrite type map with user settings + first_use_type_map[in][i] = { + util::TRTDataTypeToScalarType(cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype)}; + + } else { + if (util::TRTDataTypeToScalarType(cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype) != + est_type_opt[i].value()) { std::stringstream ss; ss << "For input " << in->debugName() << ", found user specified input dtype as "; ss << cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype; - ss << ". 
The compiler is going to use the user setting " << cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype; + ss << ", however when inspecting the graph, the input type expected was inferred to be "; + ss << est_type_opt[i].value() << std::endl; + ss << "The compiler is going to use the user setting " + << cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype; + ss << "\nThis conflict may cause an error at runtime due to partial compilation being enabled and therefore\n"; + ss << "compatibility with PyTorch's data type convention is required.\n"; + ss << "If you do indeed see errors at runtime either:\n"; + ss << "- Remove the dtype spec for " << in->debugName() << std::endl; + ss << "- Disable partial compilation by setting require_full_compilation to True"; auto warn_str = ss.str(); LOG_WARNING(warn_str); // Overwrite type map with user settings - first_use_type_map[in][i] = {util::TRTDataTypeToScalarType(cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype)}; - - } else { - if (util::TRTDataTypeToScalarType(cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype) != est_type_opt[i].value()) { - std::stringstream ss; - ss << "For input " << in->debugName() << ", found user specified input dtype as "; - ss << cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype; - ss << ", however when inspecting the graph, the input type expected was inferred to be "; - ss << est_type_opt[i].value() << std::endl; - ss << "The compiler is going to use the user setting " << cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype; - ss << "\nThis conflict may cause an error at runtime due to partial compilation being enabled and therefore\n"; - ss << "compatibility with PyTorch's data type convention is required.\n"; - ss << "If you do indeed see errors at runtime either:\n"; - ss << "- Remove the dtype spec for " << in->debugName() << std::endl; - ss << "- Disable partial compilation by setting require_full_compilation to True"; - auto warn_str = ss.str(); - LOG_WARNING(warn_str); - // Overwrite type map with user settings - first_use_type_map[in][i] = {util::TRTDataTypeToScalarType(cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype)}; - } + first_use_type_map[in][i] = { + util::TRTDataTypeToScalarType(cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype)}; } - } else { - // The user defined the type so no changes are necessary } + } else { + // The user defined the type so no changes are necessary } } + } // } } @@ -425,12 +433,13 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) if (cfg.partition_info.enabled && (!(cfg.lower_info.forced_fallback_modules.size() == 0 && - cfg.partition_info.forced_fallback_operators.size() == 0 && isBlockConvertible) - || outputIsCollection)) { - + cfg.partition_info.forced_fallback_operators.size() == 0 && isBlockConvertible) || + outputIsCollection)) { std::unordered_map fallback_nodes; - auto collection_input_ivalues_map = partitioning::generateRandomInputs(cfg.convert_info.collection_input_spec_map, first_use_types); - auto graph_and_mapping = ConstructFallbackGraph(new_mod, g->block(), collection_input_ivalues_map, cfg, static_params, fallback_nodes); + auto collection_input_ivalues_map = + partitioning::generateRandomInputs(cfg.convert_info.collection_input_spec_map, first_use_types); + auto graph_and_mapping = ConstructFallbackGraph( + new_mod, g->block(), collection_input_ivalues_map, cfg, static_params, 
fallback_nodes); new_g = graph_and_mapping.first; // renaming the input name of graph after fallback to ensure pytorch deserialize it correctly for (size_t i = 0; i < new_g->inputs().size(); ++i) { diff --git a/core/conversion/conversion.cpp b/core/conversion/conversion.cpp index 914f1ddb9d..5f4b20e1b3 100644 --- a/core/conversion/conversion.cpp +++ b/core/conversion/conversion.cpp @@ -135,12 +135,10 @@ void AddLayer(ConversionCtx* ctx, const torch::jit::Node* n) { << "please report this error to https://www.github.com/NVIDIA/Torch-TensorRT/issues"); } -void AddInputs( - ConversionCtx* ctx, - c10::ArrayRef inputs, - ConversionInfo& conversion_info) { +void AddInputs(ConversionCtx* ctx, c10::ArrayRef inputs, ConversionInfo& conversion_info) { std::unordered_map& input_specs = conversion_info.inputs; - std::unordered_map> collection_input_spec = conversion_info.collection_input_spec_map; + std::unordered_map> collection_input_spec = + conversion_info.collection_input_spec_map; std::vector input_tensors; for (auto in : inputs) { @@ -173,7 +171,7 @@ void AddInputs( "Cannot find an input spec associated with input: " << in->debugName()); ir::Input spec; if (input_specs.find(in) != input_specs.end()) { - spec = input_specs.find(in)->second; + spec = input_specs.find(in)->second; } else { spec = collection_input_spec.find(in)->second[0]; // assume input is tensor } @@ -559,8 +557,9 @@ std::set ConvertableOpsInBlock(const torch::jit::Block* b) { } bool OutputIsCollection(const torch::jit::Block* b) { - for (auto out: b->outputs()) { - if(out->type()->kind() == torch::jit::TypeKind::TupleType || out->type()->kind() == torch::jit::TypeKind::ListType) { + for (auto out : b->outputs()) { + if (out->type()->kind() == torch::jit::TypeKind::TupleType || + out->type()->kind() == torch::jit::TypeKind::ListType) { return true; } } diff --git a/core/conversion/conversionctx/ConversionCtx.cpp b/core/conversion/conversionctx/ConversionCtx.cpp index a24a15904c..71159eb2b5 100644 --- a/core/conversion/conversionctx/ConversionCtx.cpp +++ b/core/conversion/conversionctx/ConversionCtx.cpp @@ -107,7 +107,7 @@ ConversionCtx::ConversionCtx(BuilderSettings build_settings) } cfg->setAvgTimingIterations(settings.num_avg_timing_iters); - if (settings.workspace_size != 0){ + if (settings.workspace_size != 0) { cfg->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, settings.workspace_size); } @@ -124,13 +124,13 @@ ConversionCtx::ConversionCtx(BuilderSettings build_settings) settings.enabled_precisions.find(nvinfer1::DataType::kFLOAT) == settings.enabled_precisions.end(), "DLA supports only fp16 or int8 precision"); cfg->setDLACore(settings.device.dla_core); - if (settings.dla_sram_size != 1048576){ + if (settings.dla_sram_size != 1048576) { cfg->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_MANAGED_SRAM, settings.dla_sram_size); } - if (settings.dla_local_dram_size != 1073741824){ + if (settings.dla_local_dram_size != 1073741824) { cfg->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_LOCAL_DRAM, settings.dla_local_dram_size); } - if (settings.dla_global_dram_size != 536870912){ + if (settings.dla_global_dram_size != 536870912) { cfg->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_GLOBAL_DRAM, settings.dla_global_dram_size); } } diff --git a/core/conversion/converters/converter_util.cpp b/core/conversion/converters/converter_util.cpp index a6a2bbd555..745261589e 100644 --- a/core/conversion/converters/converter_util.cpp +++ b/core/conversion/converters/converter_util.cpp @@ -207,13 +207,13 @@ 
nvinfer1::ITensor* clamp( nvinfer1::ITensor* lower_bound, nvinfer1::ITensor* upper_bound, std::string const& name) { - auto max_layer = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kMAX, x, lower_bound, "max layer for " + name); TORCHTRT_CHECK(max_layer, "Unable to create max layer for clamp"); LOG_DEBUG(ctx->logger, "Create " << max_layer->getName() << " for clamp"); auto max_itensor = max_layer->getOutput(0); - auto min_layer = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kMIN, max_itensor, upper_bound, "min layer for " + name); + auto min_layer = + add_elementwise(ctx, nvinfer1::ElementWiseOperation::kMIN, max_itensor, upper_bound, "min layer for " + name); TORCHTRT_CHECK(min_layer, "Unable to create min layer for clamp"); LOG_DEBUG(ctx->logger, "Create " << min_layer->getName() << " for clamp"); auto min_itensor = min_layer->getOutput(0); @@ -227,13 +227,13 @@ nvinfer1::ITensor* clamp_to_input_dim( nvinfer1::ITensor* input_dim, int nbdims, std::string const& name) { - auto zero = torch::zeros({nbdims}).to(torch::kI32); auto zero_itensor = tensor_to_const(ctx, zero); auto one = torch::ones({nbdims}).to(torch::kI32); auto one_itensor = tensor_to_const(ctx, one); - auto upper_bound_layer = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kSUB, input_dim, one_itensor, "sub layer for " + name); + auto upper_bound_layer = + add_elementwise(ctx, nvinfer1::ElementWiseOperation::kSUB, input_dim, one_itensor, "sub layer for " + name); TORCHTRT_CHECK(upper_bound_layer, "Unable to create sub layer for clamp to inputDim"); LOG_DEBUG(ctx->logger, "Create " << upper_bound_layer->getName() << " for clamp to inputDim"); auto upper_bound = upper_bound_layer->getOutput(0); @@ -243,7 +243,8 @@ nvinfer1::ITensor* clamp_to_input_dim( LOG_DEBUG(ctx->logger, "Create " << max_layer->getName() << " for clamp to inputDim"); auto max_itensor = max_layer->getOutput(0); - auto min_layer = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kMIN, max_itensor, upper_bound, "min layer for " + name); + auto min_layer = + add_elementwise(ctx, nvinfer1::ElementWiseOperation::kMIN, max_itensor, upper_bound, "min layer for " + name); TORCHTRT_CHECK(min_layer, "Unable to create min_layer for clamp to inputDim"); LOG_DEBUG(ctx->logger, "Create " << min_layer->getName() << " for clamp to inputDim"); auto min_itensor = min_layer->getOutput(0); @@ -257,7 +258,6 @@ nvinfer1::ITensor* normalize_indices( nvinfer1::ITensor* indices, int nbdims, std::string const& name) { - auto zero = torch::zeros({nbdims}).to(torch::kI32); auto neg = -torch::ones({nbdims}).to(torch::kI32); auto zero_itensor = tensor_to_const(ctx, zero); @@ -307,17 +307,20 @@ nvinfer1::ITensor* get_slice_size( at::Tensor one_tensor = torch::ones({nbdims}).to(torch::kI32); auto one_itensor = tensor_to_const(ctx, one_tensor); - auto sub_layer = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kSUB, end, start, "get_slice_size sub layer for " + name); + auto sub_layer = + add_elementwise(ctx, nvinfer1::ElementWiseOperation::kSUB, end, start, "get_slice_size sub layer for " + name); TORCHTRT_CHECK(sub_layer, "Unable to create sub layer in calculate_output_size"); LOG_DEBUG(ctx->logger, "Create " << sub_layer->getName() << " for calculate_output_size"); auto sub_itensor = sub_layer->getOutput(0); - auto div_layer = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kDIV, sub_itensor, stride, "get_slice_size div layer for " + name); + auto div_layer = add_elementwise( + ctx, nvinfer1::ElementWiseOperation::kDIV, sub_itensor, stride, 
"get_slice_size div layer for " + name); TORCHTRT_CHECK(div_layer, "Unable to create div layer in calculate_output_size"); LOG_DEBUG(ctx->logger, "Create " << div_layer->getName() << " for calculate_output_size"); auto div_itensor = div_layer->getOutput(0); - auto add_layer = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kSUM, div_itensor, one_itensor, "get_slice_size sum layer for " + name); + auto add_layer = add_elementwise( + ctx, nvinfer1::ElementWiseOperation::kSUM, div_itensor, one_itensor, "get_slice_size sum layer for " + name); TORCHTRT_CHECK(add_layer, "Unable to create add layer in calculate_output_size"); LOG_DEBUG(ctx->logger, "Create " << add_layer->getName() << " for calculate_output_size"); auto size_itensor = add_layer->getOutput(0); diff --git a/core/conversion/converters/converter_util.h b/core/conversion/converters/converter_util.h index cdf2ee5a8d..b155499858 100644 --- a/core/conversion/converters/converter_util.h +++ b/core/conversion/converters/converter_util.h @@ -1,8 +1,8 @@ #pragma once +#include #include #include -#include #include "core/conversion/conversionctx/ConversionCtx.h" #include "core/conversion/converters/Weights.h" diff --git a/core/conversion/converters/impl/select.cpp b/core/conversion/converters/impl/select.cpp index 3599ab9939..d33f09ae8a 100644 --- a/core/conversion/converters/impl/select.cpp +++ b/core/conversion/converters/impl/select.cpp @@ -103,121 +103,118 @@ nvinfer1::ITensor* roll( auto select_registrations TORCHTRT_UNUSED = RegisterNodeConversionPatterns() - .pattern( - {"aten::select.int(Tensor(a) self, int dim, int index) -> (Tensor(a))", - [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool { - auto in = args[0].ITensorOrFreeze(ctx); - auto maxDim = static_cast(in->getDimensions().nbDims); - auto dim = args[1].unwrapToInt(); - // Handle negative axis by refering to nbDims of input Tensor - dim = dim < 0 ? dim + maxDim : dim; - auto ind = (int32_t)args[2].unwrapToInt(); - // Along the specified dimension, handle negative index by subtracting along length of dimension. - ind = ind < 0 ? ind + in->getDimensions().d[dim] : ind; - LOG_DEBUG("Gather input dimensions: " << in->getDimensions()); - LOG_DEBUG("Dimension to select: " << dim); - LOG_DEBUG("Index: " << ind); - - // index to access needs to be an at::Tensor - at::Tensor indices = torch::tensor({ind}).to(torch::kI32); - auto const_out = tensor_to_const(ctx, indices); - - // IGatherLayer takes in input tensor, the indices, and the axis - // of input tensor to take indices from - auto gather_layer = ctx->net->addGather(*in, *const_out, dim); - TORCHTRT_CHECK(gather_layer, "Unable to create gather layer from node: " << *n); - auto out = gather_layer->getOutput(0); + .pattern({"aten::select.int(Tensor(a) self, int dim, int index) -> (Tensor(a))", + [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool { + auto in = args[0].ITensorOrFreeze(ctx); + auto maxDim = static_cast(in->getDimensions().nbDims); + auto dim = args[1].unwrapToInt(); + // Handle negative axis by refering to nbDims of input Tensor + dim = dim < 0 ? dim + maxDim : dim; + auto ind = (int32_t)args[2].unwrapToInt(); + // Along the specified dimension, handle negative index by subtracting along length of dimension. + ind = ind < 0 ? 
ind + in->getDimensions().d[dim] : ind; + LOG_DEBUG("Gather input dimensions: " << in->getDimensions()); + LOG_DEBUG("Dimension to select: " << dim); + LOG_DEBUG("Index: " << ind); + + // index to access needs to be an at::Tensor + at::Tensor indices = torch::tensor({ind}).to(torch::kI32); + auto const_out = tensor_to_const(ctx, indices); + + // IGatherLayer takes in input tensor, the indices, and the axis + // of input tensor to take indices from + auto gather_layer = ctx->net->addGather(*in, *const_out, dim); + TORCHTRT_CHECK(gather_layer, "Unable to create gather layer from node: " << *n); + auto out = gather_layer->getOutput(0); + + LOG_DEBUG("Gather tensor shape: " << out->getDimensions()); + + if (out->getDimensions().nbDims != 1) { + // IShuffleLayer removes redundant dimensions + auto shuffle_layer = ctx->net->addShuffle(*out); + TORCHTRT_CHECK(shuffle_layer, "Unable to create shuffle layer from node: " << *n); + shuffle_layer->setReshapeDimensions(util::squeezeDims(out->getDimensions(), dim)); + shuffle_layer->setName(util::node_info(n).c_str()); + out = shuffle_layer->getOutput(0); + } + + out = ctx->AssociateValueAndTensor(n->outputs()[0], out); + + LOG_DEBUG("Output tensor shape: " << out->getDimensions()); - LOG_DEBUG("Gather tensor shape: " << out->getDimensions()); + return true; + }}) + .pattern({"aten::narrow(Tensor(a) self, int dim, int start, int length) -> Tensor(a)", + [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool { + auto in = args[0].ITensor(); + auto axis = args[1].unwrapToInt(); + auto start = (int32_t)args[2].unwrapToInt(); + auto length = (int32_t)args[3].unwrapToInt(); - if (out->getDimensions().nbDims != 1) { - // IShuffleLayer removes redundant dimensions - auto shuffle_layer = ctx->net->addShuffle(*out); - TORCHTRT_CHECK(shuffle_layer, "Unable to create shuffle layer from node: " << *n); - shuffle_layer->setReshapeDimensions(util::squeezeDims(out->getDimensions(), dim)); - shuffle_layer->setName(util::node_info(n).c_str()); - out = shuffle_layer->getOutput(0); - } + // index to access needs to be an at::Tensor + at::Tensor indices = torch::arange(start, start + length, 1).to(torch::kI32); + auto weights = Weights(ctx, indices); - out = ctx->AssociateValueAndTensor(n->outputs()[0], out); + // IConstantLayer to convert indices from Weights to ITensor + auto const_layer = ctx->net->addConstant(weights.shape, weights.data); + TORCHTRT_CHECK(const_layer, "Unable to create constant layer from node: " << *n); + auto const_out = const_layer->getOutput(0); - LOG_DEBUG("Output tensor shape: " << out->getDimensions()); + // IGatherLayer takes in input tensor, the indices, and the axis + // of input tensor to take indices from + auto gather_layer = ctx->net->addGather(*in, *const_out, axis); + TORCHTRT_CHECK(gather_layer, "Unable to create gather layer from node: " << *n); + auto gather_out = gather_layer->getOutput(0); - return true; - }}) - .pattern( - {"aten::narrow(Tensor(a) self, int dim, int start, int length) -> Tensor(a)", - [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool { - auto in = args[0].ITensor(); - auto axis = args[1].unwrapToInt(); - auto start = (int32_t)args[2].unwrapToInt(); - auto length = (int32_t)args[3].unwrapToInt(); - - // index to access needs to be an at::Tensor - at::Tensor indices = torch::arange(start, start + length, 1).to(torch::kI32); - auto weights = Weights(ctx, indices); - - // IConstantLayer to convert indices from Weights to ITensor - auto const_layer = 
ctx->net->addConstant(weights.shape, weights.data); - TORCHTRT_CHECK(const_layer, "Unable to create constant layer from node: " << *n); - auto const_out = const_layer->getOutput(0); - - // IGatherLayer takes in input tensor, the indices, and the axis - // of input tensor to take indices from - auto gather_layer = ctx->net->addGather(*in, *const_out, axis); - TORCHTRT_CHECK(gather_layer, "Unable to create gather layer from node: " << *n); - auto gather_out = gather_layer->getOutput(0); - - // IShuffleLayer removes redundant dimensions - auto shuffle_layer = ctx->net->addShuffle(*gather_out); - TORCHTRT_CHECK(shuffle_layer, "Unable to create shuffle layer from node: " << *n); - shuffle_layer->setReshapeDimensions(util::unpadDims(gather_out->getDimensions())); - shuffle_layer->setName(util::node_info(n).c_str()); - auto shuffle_out = shuffle_layer->getOutput(0); + // IShuffleLayer removes redundant dimensions + auto shuffle_layer = ctx->net->addShuffle(*gather_out); + TORCHTRT_CHECK(shuffle_layer, "Unable to create shuffle layer from node: " << *n); + shuffle_layer->setReshapeDimensions(util::unpadDims(gather_out->getDimensions())); + shuffle_layer->setName(util::node_info(n).c_str()); + auto shuffle_out = shuffle_layer->getOutput(0); - auto out = ctx->AssociateValueAndTensor(n->outputs()[0], shuffle_out); + auto out = ctx->AssociateValueAndTensor(n->outputs()[0], shuffle_out); - LOG_DEBUG("Output tensor shape: " << out->getDimensions()); + LOG_DEBUG("Output tensor shape: " << out->getDimensions()); - return true; - }}) - .pattern( - {"aten::narrow.Tensor(Tensor(a) self, int dim, Tensor start, int length) -> Tensor(a)", - [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool { - auto in = args[0].ITensor(); - auto axis = args[1].unwrapToInt(); - torch::Tensor start = args[2].IValue()->toTensor().to(torch::kI32); - int32_t startIdx = start.item().to(); - auto length = (int32_t)args[3].unwrapToInt(); - - // index to access needs to be an at::Tensor - at::Tensor indices = torch::arange(startIdx, startIdx + length, 1).to(torch::kI32); - auto weights = Weights(ctx, indices); - - // IConstantLayer to convert indices from Weights to ITensor - auto const_layer = ctx->net->addConstant(weights.shape, weights.data); - TORCHTRT_CHECK(const_layer, "Unable to create constant layer from node: " << *n); - auto const_out = const_layer->getOutput(0); - - // IGatherLayer takes in input tensor, the indices, and the axis - // of input tensor to take indices from - auto gather_layer = ctx->net->addGather(*in, *const_out, axis); - TORCHTRT_CHECK(gather_layer, "Unable to create gather layer from node: " << *n); - auto gather_out = gather_layer->getOutput(0); - - // IShuffleLayer removes redundant dimensions - auto shuffle_layer = ctx->net->addShuffle(*gather_out); - TORCHTRT_CHECK(shuffle_layer, "Unable to create shuffle layer from node: " << *n); - shuffle_layer->setReshapeDimensions(util::unpadDims(gather_out->getDimensions())); - shuffle_layer->setName(util::node_info(n).c_str()); - auto shuffle_out = shuffle_layer->getOutput(0); - - auto out = ctx->AssociateValueAndTensor(n->outputs()[0], shuffle_out); - - LOG_DEBUG("Output tensor shape: " << out->getDimensions()); + return true; + }}) + .pattern({"aten::narrow.Tensor(Tensor(a) self, int dim, Tensor start, int length) -> Tensor(a)", + [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool { + auto in = args[0].ITensor(); + auto axis = args[1].unwrapToInt(); + torch::Tensor start = args[2].IValue()->toTensor().to(torch::kI32); + 
int32_t startIdx = start.item().to(); + auto length = (int32_t)args[3].unwrapToInt(); + + // index to access needs to be an at::Tensor + at::Tensor indices = torch::arange(startIdx, startIdx + length, 1).to(torch::kI32); + auto weights = Weights(ctx, indices); + + // IConstantLayer to convert indices from Weights to ITensor + auto const_layer = ctx->net->addConstant(weights.shape, weights.data); + TORCHTRT_CHECK(const_layer, "Unable to create constant layer from node: " << *n); + auto const_out = const_layer->getOutput(0); + + // IGatherLayer takes in input tensor, the indices, and the axis + // of input tensor to take indices from + auto gather_layer = ctx->net->addGather(*in, *const_out, axis); + TORCHTRT_CHECK(gather_layer, "Unable to create gather layer from node: " << *n); + auto gather_out = gather_layer->getOutput(0); + + // IShuffleLayer removes redundant dimensions + auto shuffle_layer = ctx->net->addShuffle(*gather_out); + TORCHTRT_CHECK(shuffle_layer, "Unable to create shuffle layer from node: " << *n); + shuffle_layer->setReshapeDimensions(util::unpadDims(gather_out->getDimensions())); + shuffle_layer->setName(util::node_info(n).c_str()); + auto shuffle_out = shuffle_layer->getOutput(0); + + auto out = ctx->AssociateValueAndTensor(n->outputs()[0], shuffle_out); + + LOG_DEBUG("Output tensor shape: " << out->getDimensions()); - return true; - }}) + return true; + }}) .pattern( {"aten::embedding(Tensor weight, Tensor indices, int padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> (Tensor)", [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool { @@ -239,30 +236,29 @@ auto select_registrations TORCHTRT_UNUSED = return true; }}) - .pattern( - {"aten::roll(Tensor self, int[1] shifts, int[1] dims=[]) -> (Tensor)", - [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool { - auto in = args[0].ITensor(); - auto shifts = args[1].unwrapToIntList().vec(); - auto dims = args[2].unwrapToIntList().vec(); - - TORCHTRT_CHECK(dims.size() == shifts.size(), "dims.size() should be equal to shifts.size()"); - if (ctx->input_is_dynamic) { - TORCHTRT_THROW_ERROR("aten::roll is currently not support in dynamic input shape compilation"); - } else { - auto in_shape = util::toVec(in->getDimensions()); - for (size_t i = 0; i < dims.size(); i++) { - auto dim = dims[i] < 0 ? (in_shape.size() + dims[i]) : dims[i]; - TORCHTRT_CHECK(dim < in_shape.size(), "Dimension out of range"); - in = roll(ctx, in, shifts[i], dim, in_shape); - } - auto out = ctx->AssociateValueAndTensor(n->outputs()[0], in); - - LOG_DEBUG("Output tensor shape: " << out->getDimensions()); - - return true; - } - }}) + .pattern({"aten::roll(Tensor self, int[1] shifts, int[1] dims=[]) -> (Tensor)", + [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool { + auto in = args[0].ITensor(); + auto shifts = args[1].unwrapToIntList().vec(); + auto dims = args[2].unwrapToIntList().vec(); + + TORCHTRT_CHECK(dims.size() == shifts.size(), "dims.size() should be equal to shifts.size()"); + if (ctx->input_is_dynamic) { + TORCHTRT_THROW_ERROR("aten::roll is currently not support in dynamic input shape compilation"); + } else { + auto in_shape = util::toVec(in->getDimensions()); + for (size_t i = 0; i < dims.size(); i++) { + auto dim = dims[i] < 0 ? 
(in_shape.size() + dims[i]) : dims[i]; + TORCHTRT_CHECK(dim < in_shape.size(), "Dimension out of range"); + in = roll(ctx, in, shifts[i], dim, in_shape); + } + auto out = ctx->AssociateValueAndTensor(n->outputs()[0], in); + + LOG_DEBUG("Output tensor shape: " << out->getDimensions()); + + return true; + } + }}) .pattern( {"aten::index.Tensor(Tensor self, Tensor?[] indices) -> (Tensor)", [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool { @@ -319,7 +315,8 @@ auto select_registrations TORCHTRT_UNUSED = int startIdx = 0; auto startIdxIVal = args[2].IValue(); if (!startIdxIVal->isNone()) { - startIdx = startIdxIVal->toInt() > std::numeric_limits::max() ? maxDim : startIdxIVal->toInt(); + startIdx = + startIdxIVal->toInt() > std::numeric_limits::max() ? maxDim : startIdxIVal->toInt(); startIdx = maxDim == -1 ? startIdx : std::min(startIdx, maxDim); } // Handle case when given tensor index is negative @@ -331,7 +328,8 @@ auto select_registrations TORCHTRT_UNUSED = int endIdx = maxDim; // -1 for dynamic shape auto endIdxIVal = args[3].IValue(); if (!endIdxIVal->isNone()) { - int truncate_value = endIdxIVal->toInt() > std::numeric_limits::max() ? maxDim : endIdxIVal->toInt(); + int truncate_value = + endIdxIVal->toInt() > std::numeric_limits::max() ? maxDim : endIdxIVal->toInt(); endIdx = maxDim == -1 ? truncate_value : std::min(truncate_value, maxDim); } if (maxDim > 0) { @@ -385,7 +383,8 @@ auto select_registrations TORCHTRT_UNUSED = // update start and end nvinfer1::ITensor* out_start; nvinfer1::ITensor* out_end; - auto start_end = normalize_start_and_end(ctx, ishape_tensor, start_itensor, end_itensor, nbdims, node_name); + auto start_end = + normalize_start_and_end(ctx, ishape_tensor, start_itensor, end_itensor, nbdims, node_name); out_start = start_end[0]; out_end = start_end[1]; @@ -397,7 +396,7 @@ auto select_registrations TORCHTRT_UNUSED = slice_layer->setInput(2, *size_itensor); // size, must be set if input is dynamic } auto slice_out = slice_layer->getOutput(0); - + auto out = ctx->AssociateValueAndTensor(n->outputs()[0], slice_out); LOG_DEBUG("Slice layer output shape: " << out->getDimensions()); diff --git a/core/ir/GraphInputs.cpp b/core/ir/GraphInputs.cpp index 007a7279e7..a1b1196be9 100644 --- a/core/ir/GraphInputs.cpp +++ b/core/ir/GraphInputs.cpp @@ -5,70 +5,74 @@ namespace torch_tensorrt { namespace core { namespace ir { -void flatten_dfs(std::vector& flattened_inputs, std::vector>& collection_inputs, - torch::jit::IValue input_ivalue, int level, int index) { - if (input_ivalue.isTuple()) { - auto input_tuple = input_ivalue.toTuple(); - int idx = 0; - if (level == 0) { - collection_inputs.resize(input_tuple->elements().size()); - } - for (auto item: input_tuple->elements()) { - torch::jit::IValue converted_item; - int cur_idx = level < 1 ? idx: index; - flatten_dfs(flattened_inputs, collection_inputs, item, level+1, cur_idx); - idx++; - } - } else if(input_ivalue.isList()) { - auto input_list = input_ivalue.toList().vec(); - if (level == 0) { - collection_inputs.resize(input_list.size()); - } - c10::TypePtr type = input_list[0].type(); - auto converted_elements = c10::impl::GenericList(type); - int idx = 0; - for (auto item: input_list) { - int cur_idx = level < 1 ? 
idx: index; - flatten_dfs(flattened_inputs, collection_inputs, item, level+1, cur_idx); - idx++; - } - } else if(input_ivalue.isCustomClass()) { - torch_tensorrt::core::ir::Input cur_input = *(input_ivalue.toCustomClass()); - flattened_inputs.push_back(cur_input); - if (level == 0) { // a single value like A - collection_inputs.resize(1); - collection_inputs[0].push_back(cur_input); - } else if (level == 1) { // like A in [A, A] or [(B, B), A] - collection_inputs[index].push_back(cur_input); - } else if (level == 2) { // like A in [(A, A), C] - collection_inputs[index].push_back(cur_input); - } else {// only support 2 level - LOG_ERROR("Input nesting depth exceeds currently supported depth (3), use 1 level: [A, B], or 2 level: [A, (B, C)]"); - } +void flatten_dfs( + std::vector& flattened_inputs, + std::vector>& collection_inputs, + torch::jit::IValue input_ivalue, + int level, + int index) { + if (input_ivalue.isTuple()) { + auto input_tuple = input_ivalue.toTuple(); + int idx = 0; + if (level == 0) { + collection_inputs.resize(input_tuple->elements().size()); } + for (auto item : input_tuple->elements()) { + torch::jit::IValue converted_item; + int cur_idx = level < 1 ? idx : index; + flatten_dfs(flattened_inputs, collection_inputs, item, level + 1, cur_idx); + idx++; + } + } else if (input_ivalue.isList()) { + auto input_list = input_ivalue.toList().vec(); + if (level == 0) { + collection_inputs.resize(input_list.size()); + } + c10::TypePtr type = input_list[0].type(); + auto converted_elements = c10::impl::GenericList(type); + int idx = 0; + for (auto item : input_list) { + int cur_idx = level < 1 ? idx : index; + flatten_dfs(flattened_inputs, collection_inputs, item, level + 1, cur_idx); + idx++; + } + } else if (input_ivalue.isCustomClass()) { + torch_tensorrt::core::ir::Input cur_input = *(input_ivalue.toCustomClass()); + flattened_inputs.push_back(cur_input); + if (level == 0) { // a single value like A + collection_inputs.resize(1); + collection_inputs[0].push_back(cur_input); + } else if (level == 1) { // like A in [A, A] or [(B, B), A] + collection_inputs[index].push_back(cur_input); + } else if (level == 2) { // like A in [(A, A), C] + collection_inputs[index].push_back(cur_input); + } else { // only support 2 level + LOG_ERROR( + "Input nesting depth exceeds currently supported depth (3), use 1 level: [A, B], or 2 level: [A, (B, C)]"); + } + } } - GraphInputs::GraphInputs(std::vector inputs_) { - LOG_DEBUG("Construct GraphInput with ir::Input"); - inputs = inputs_; - collection_inputs.resize(inputs_.size()); - for (size_t i = 0; i < inputs_.size(); i++) { - collection_inputs[i].push_back(inputs_[i]); - } + LOG_DEBUG("Construct GraphInput with ir::Input"); + inputs = inputs_; + collection_inputs.resize(inputs_.size()); + for (size_t i = 0; i < inputs_.size(); i++) { + collection_inputs[i].push_back(inputs_[i]); + } } GraphInputs::GraphInputs(torch::jit::IValue& input_signature_) { - LOG_DEBUG("Construct GraphInput with IValue"); + LOG_DEBUG("Construct GraphInput with IValue"); - std::vector flattened_inputs; - std::vector> collection_inputs_; + std::vector flattened_inputs; + std::vector> collection_inputs_; - flatten_dfs(flattened_inputs, collection_inputs_, input_signature_, 0, 0); - inputs = flattened_inputs; - input_signature = input_signature_; - collection_inputs = collection_inputs_; - LOG_DEBUG("Collection Input Size: " << collection_inputs_.size()); + flatten_dfs(flattened_inputs, collection_inputs_, input_signature_, 0, 0); + inputs = flattened_inputs; + 
input_signature = input_signature_; + collection_inputs = collection_inputs_; + LOG_DEBUG("Collection Input Size: " << collection_inputs_.size()); } } // namespace ir diff --git a/core/ir/StaticParams.cpp b/core/ir/StaticParams.cpp index 0073ad2888..8502c80acf 100644 --- a/core/ir/StaticParams.cpp +++ b/core/ir/StaticParams.cpp @@ -12,8 +12,7 @@ StaticParams get_static_params(c10::ArrayRef inputs, std::ve auto param_it = params.begin(); for (auto in : inputs) { // handle TensorType, TupleType and ListType - if (in->type() != c10::TensorType::get() && - in->type()->kind() != torch::jit::TypeKind::TupleType && + if (in->type() != c10::TensorType::get() && in->type()->kind() != torch::jit::TypeKind::TupleType && in->type()->kind() != torch::jit::TypeKind::ListType && param_it != params.end()) { static_params[in] = *param_it; ++param_it; diff --git a/core/ir/ir.cpp b/core/ir/ir.cpp index cc82fe09b4..d9b021ed8b 100644 --- a/core/ir/ir.cpp +++ b/core/ir/ir.cpp @@ -35,7 +35,9 @@ InputSpecMap pair_input_vals_with_specs(std::vector va return a; } -CollectionInputSpecMap pair_input_vals_with_specs_collection(std::vector vals, std::vector>& specs) { +CollectionInputSpecMap pair_input_vals_with_specs_collection( + std::vector vals, + std::vector>& specs) { TORCHTRT_CHECK( vals.size() == specs.size(), "Expected dimension specifications for all input tensors" @@ -64,7 +66,7 @@ std::vector get_tensor_inputs( // input.1:Tensor -> used if (in->type()->isSubtypeOf(c10::TensorType::get()) && static_params.find(in) == static_params.end()) { input_tensors.push_back(in); - } + } } return input_tensors; } @@ -80,7 +82,8 @@ std::vector get_collection_inputs( if (in->type()->isSubtypeOf(c10::TensorType::get()) && static_params.find(in) == static_params.end()) { input_tensors.push_back(in); } else if (in->type()->kind() == torch::jit::TypeKind::TupleType && static_params.find(in) == static_params.end()) { - // } else if (in->type()->isSubtypeOf(c10::TupleType::create()) && static_params.find(in) == static_params.end()) { + // } else if (in->type()->isSubtypeOf(c10::TupleType::create()) && static_params.find(in) == static_params.end()) + // { input_tensors.push_back(in); // push original tuple at::ArrayRef unpack_tuple = torch::jit::createTupleUnpack(in); LOG_DEBUG("get_collection_inputs, tuple size " << unpack_tuple.size()); @@ -190,15 +193,15 @@ TypeMap get_block_first_calc_dtypes_opt(torch::jit::Block* b) { if (i->type() == c10::TensorType::get()) { torch::jit::Value* in = i; types.insert({in, get_value_first_calc_dtype_opt(b, i)}); - } else if(i->type()->cast()) { + } else if (i->type()->cast()) { // make sure very time get the same ptr at::ArrayRef unpack_tuple = torch::jit::createTupleUnpack(i); LOG_DEBUG("Tuple size " << unpack_tuple.size()); - for (auto item: unpack_tuple) { + for (auto item : unpack_tuple) { torch::jit::Value* in = item; types.insert({in, get_value_first_calc_dtype_opt(b, i)}); } - } else if(i->type()->isSubtypeOf(c10::ListType::ofTensors())) { + } else if (i->type()->isSubtypeOf(c10::ListType::ofTensors())) { LOG_INFO("Unsupported type of c10::ListType::ofTensors()"); } } @@ -212,7 +215,7 @@ CollectionTypeMap get_block_first_calc_dtypes_opt_collection(torch::jit::Block* torch::jit::Value* in = i; types.insert({in, {get_value_first_calc_dtype_opt(b, i)}}); - } else if(i->type()->kind() == torch::jit::TypeKind::TupleType) { + } else if (i->type()->kind() == torch::jit::TypeKind::TupleType) { // TODO: to evaluate the data type of tuple element // make sure very time get the same ptr // 
c10::optional tp = get_value_first_calc_dtype_opt(b, i); @@ -220,9 +223,9 @@ CollectionTypeMap get_block_first_calc_dtypes_opt_collection(torch::jit::Block* // TODO: calculate the tuple element type, currently we use {} as default datatype // std::vector> dytpes(unpack_tuple.size(), tp); std::vector> dytpes(unpack_tuple.size()); - types.insert({i, dytpes}); // insert an empty + types.insert({i, dytpes}); // insert an empty - } else if(i->type()->kind() == torch::jit::TypeKind::ListType) { + } else if (i->type()->kind() == torch::jit::TypeKind::ListType) { // TODO: to decide the size of list and type of list element LOG_DEBUG("get_block_first_calc_dtypes_opt ListType: use size " << i->uses().size()); c10::optional tp = get_value_first_calc_dtype_opt(b, i); @@ -234,8 +237,7 @@ CollectionTypeMap get_block_first_calc_dtypes_opt_collection(torch::jit::Block* return types; } -static auto core_input_container = - torch::class_("_torch_tensorrt_core_ir", "Input").def(torch::init<>()); +static auto core_input_container = torch::class_("_torch_tensorrt_core_ir", "Input").def(torch::init<>()); } // namespace ir } // namespace core diff --git a/core/ir/ir.h b/core/ir/ir.h index 966c747176..a5225daa25 100644 --- a/core/ir/ir.h +++ b/core/ir/ir.h @@ -12,7 +12,7 @@ namespace core { namespace ir { struct Input : torch::CustomClassHolder { - Input() {}; + Input(){}; Input( std::vector shape, nvinfer1::DataType dtype = nvinfer1::DataType::kFLOAT, @@ -42,8 +42,8 @@ struct Input : torch::CustomClassHolder { struct GraphInputs { GraphInputs(std::vector inputs); GraphInputs(torch::jit::IValue& input_signature); - torch::jit::IValue input_signature; // nested Input, full input spec - std::vector inputs; // flattend Input + torch::jit::IValue input_signature; // nested Input, full input spec + std::vector inputs; // flattend Input std::vector> collection_inputs; // only support two layer nesting, e.g. 
((a, b), [c, d], e) }; @@ -67,7 +67,9 @@ CollectionInputSpecMap associate_specs_with_collection_inputs( ir::GraphInputs graph_inputs, StaticParams& static_params); InputSpecMap pair_input_vals_with_specs(std::vector vals, std::vector specs); -CollectionInputSpecMap pair_input_vals_with_specs_collection(std::vector vals, std::vector>& specs); +CollectionInputSpecMap pair_input_vals_with_specs_collection( + std::vector vals, + std::vector>& specs); std::vector get_tensor_inputs( std::shared_ptr& g, StaticParams& static_params); diff --git a/core/partitioning/shape_analysis.cpp b/core/partitioning/shape_analysis.cpp index 1221318647..8767048030 100644 --- a/core/partitioning/shape_analysis.cpp +++ b/core/partitioning/shape_analysis.cpp @@ -9,31 +9,28 @@ namespace core { namespace partitioning { at::Tensor generateSingleInput(ir::Input& input, c10::optional& type_opt) { - auto cur_shape = input.input_shape; - std::vector shape; - shape.insert(shape.begin(), std::begin(cur_shape.d), std::begin(cur_shape.d) + cur_shape.nbDims); - // auto type_opt = types[input.first][i]; - auto type = at::kFloat; - if (type_opt) { - type = type_opt.value(); - } else { - LOG_WARNING("Input type for doing shape analysis could not be determined, defaulting to F32"); - } - auto in = at::randint(5, shape, {at::kCUDA}).to(type); - // ivalue_map[input.first] = in.clone(); - return in; + auto cur_shape = input.input_shape; + std::vector shape; + shape.insert(shape.begin(), std::begin(cur_shape.d), std::begin(cur_shape.d) + cur_shape.nbDims); + // auto type_opt = types[input.first][i]; + auto type = at::kFloat; + if (type_opt) { + type = type_opt.value(); + } else { + LOG_WARNING("Input type for doing shape analysis could not be determined, defaulting to F32"); + } + auto in = at::randint(5, shape, {at::kCUDA}).to(type); + // ivalue_map[input.first] = in.clone(); + return in; } std::unordered_map generateRandomInputs( std::unordered_map>& inputs, std::unordered_map>>& types) { - // generate random inputs for running pytorch segments std::unordered_map ivalue_map; - for (auto& input : inputs) { - if (input.first->type()->kind() == torch::jit::TypeKind::ListType) { // create list std::vector list; @@ -56,7 +53,6 @@ std::unordered_map generateRandomI } else { auto in = generateSingleInput(input.second[0], types[input.first][0]); ivalue_map[input.first] = in.clone(); - } } return ivalue_map; @@ -109,7 +105,8 @@ void getSegmentsOutputByRunning( jit_inputs_ivalues.push_back(ivalues_maps[input].toBool()); } else if (input->type()->kind() == torch::jit::TypeKind::ListType) { // create list - jit_inputs_ivalues.push_back(ivalues_maps[input].toList());; + jit_inputs_ivalues.push_back(ivalues_maps[input].toList()); + ; } else if (input->type()->kind() == torch::jit::TypeKind::TupleType) { // create tuple jit_inputs_ivalues.push_back(ivalues_maps[input].toTuple()); diff --git a/core/partitioning/shape_analysis.h b/core/partitioning/shape_analysis.h index 2654699a1d..e9c51fc62d 100644 --- a/core/partitioning/shape_analysis.h +++ b/core/partitioning/shape_analysis.h @@ -6,7 +6,6 @@ namespace torch_tensorrt { namespace core { namespace partitioning { - std::unordered_map generateRandomInputs( std::unordered_map>& input_ranges, std::unordered_map>>& input_types); diff --git a/cpp/bin/torchtrtc/main.cpp b/cpp/bin/torchtrtc/main.cpp index 6c207d78da..51ec2c51c6 100644 --- a/cpp/bin/torchtrtc/main.cpp +++ b/cpp/bin/torchtrtc/main.cpp @@ -117,8 +117,7 @@ int main(int argc, char** argv) { parser, "num_iters", "Number of averaging timing 
iterations used to select kernels", {"num-avg-timing-iters"}); args::ValueFlag workspace_size( parser, "workspace_size", "Maximum size of workspace given to TensorRT", {"workspace-size"}); - args::ValueFlag dla_sram_size( - parser, "dla_sram_size", "DLA managed SRAM size", {"dla-sram-size"}); + args::ValueFlag dla_sram_size(parser, "dla_sram_size", "DLA managed SRAM size", {"dla-sram-size"}); args::ValueFlag dla_local_dram_size( parser, "dla_local_dram_size", "DLA Local DRAM size", {"dla-local-dram-size"}); args::ValueFlag dla_global_dram_size( diff --git a/cpp/include/torch_tensorrt/torch_tensorrt.h b/cpp/include/torch_tensorrt/torch_tensorrt.h index 11dc5d74c6..6a7035ec2e 100644 --- a/cpp/include/torch_tensorrt/torch_tensorrt.h +++ b/cpp/include/torch_tensorrt/torch_tensorrt.h @@ -364,7 +364,7 @@ class TORCHTRT_API TensorFormat { * signifying a static input shape or a set of three input shapes representing * the min, optiminal and max input shapes allowed for the engine. */ -struct TORCHTRT_API Input : torch::CustomClassHolder{ +struct TORCHTRT_API Input : torch::CustomClassHolder { /// Minimum acceptable input size into the engine std::vector min_shape; /// Optimal input size into the engine (size optimized for given kernels accept any size in min max range) @@ -520,7 +520,7 @@ struct TORCHTRT_API Input : torch::CustomClassHolder{ * This struct can either hold a complex inputs of shape or a flattened one, */ struct TORCHTRT_API GraphInputs { - torch::jit::IValue input_signature; // nested Input, full input spec + torch::jit::IValue input_signature; // nested Input, full input spec std::vector inputs; // flatten input spec }; diff --git a/cpp/src/compile_spec.cpp b/cpp/src/compile_spec.cpp index 1fb4c56a98..432b070e91 100644 --- a/cpp/src/compile_spec.cpp +++ b/cpp/src/compile_spec.cpp @@ -29,40 +29,38 @@ CompileSpec::CompileSpec(std::vector> fixed_sizes) { } CompileSpec::CompileSpec(std::vector inputs) { - graph_inputs.inputs = std::move(inputs); + graph_inputs.inputs = std::move(inputs); } CompileSpec::CompileSpec(torch::jit::IValue input_signature) { - graph_inputs.input_signature = input_signature; + graph_inputs.input_signature = input_signature; } - - void to_internal_input_signature(torch::jit::IValue input_ivalue, torch::jit::IValue& converted_ivalue) { - if (input_ivalue.isTuple()) { - auto input_tuple = input_ivalue.toTuple(); - std::vector converted_elements; - for (auto item: input_tuple->elements()) { - torch::jit::IValue converted_item; - to_internal_input_signature(item, converted_item); - converted_elements.push_back(converted_item); - auto tuple_ptr = c10::ivalue::Tuple::create(converted_elements); - converted_ivalue = torch::jit::IValue(tuple_ptr); - } - } else if(input_ivalue.isList()) { - auto input_list = input_ivalue.toList().vec(); - c10::TypePtr type = input_list[0].type(); - auto converted_elements = c10::impl::GenericList(type); - for (auto item: input_list) { - torch::jit::IValue converted_item; - to_internal_input_signature(item, converted_item); - converted_elements.push_back(converted_item); - } - converted_ivalue = torch::jit::IValue(converted_elements); - } else if(input_ivalue.isCustomClass()) { - torchtrt::core::ir::Input cur_input = to_internal_input(*(input_ivalue.toCustomClass())); - converted_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(cur_input))); + if (input_ivalue.isTuple()) { + auto input_tuple = input_ivalue.toTuple(); + std::vector converted_elements; + for (auto item : input_tuple->elements()) { + torch::jit::IValue 
converted_item; + to_internal_input_signature(item, converted_item); + converted_elements.push_back(converted_item); + auto tuple_ptr = c10::ivalue::Tuple::create(converted_elements); + converted_ivalue = torch::jit::IValue(tuple_ptr); } + } else if (input_ivalue.isList()) { + auto input_list = input_ivalue.toList().vec(); + c10::TypePtr type = input_list[0].type(); + auto converted_elements = c10::impl::GenericList(type); + for (auto item : input_list) { + torch::jit::IValue converted_item; + to_internal_input_signature(item, converted_item); + converted_elements.push_back(converted_item); + } + converted_ivalue = torch::jit::IValue(converted_elements); + } else if (input_ivalue.isCustomClass()) { + torchtrt::core::ir::Input cur_input = to_internal_input(*(input_ivalue.toCustomClass())); + converted_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(cur_input))); + } } torchtrt::core::CompileSpec init_compile_spec(CompileSpec external) { diff --git a/cpp/src/torch_tensorrt.cpp b/cpp/src/torch_tensorrt.cpp index 93813190ab..22855aeb03 100644 --- a/cpp/src/torch_tensorrt.cpp +++ b/cpp/src/torch_tensorrt.cpp @@ -53,6 +53,5 @@ void set_device(const int gpu_id) { torch_tensorrt::core::set_device(gpu_id); } -static auto tensorrt_input_container = - torch::class_("_torch_tensorrt", "Input").def(torch::init<>()); +static auto tensorrt_input_container = torch::class_("_torch_tensorrt", "Input").def(torch::init<>()); } // namespace torch_tensorrt diff --git a/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp b/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp index 0eb6fba2de..274b40d479 100644 --- a/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp +++ b/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp @@ -28,7 +28,8 @@ void RegisterTRTCompileSpec() { .def(torch::init<>()) .def("__str__", &torch_tensorrt::pyapi::InputSignature::to_str); - ADD_FIELD_GET_SET_REGISTRATION(TRTInputSignatureTSRegistration, torch_tensorrt::pyapi::InputSignature, signature_ivalue); + ADD_FIELD_GET_SET_REGISTRATION( + TRTInputSignatureTSRegistration, torch_tensorrt::pyapi::InputSignature, signature_ivalue); static auto TORCHTRT_UNUSED TRTDeviceTSRegistration = torch::class_("tensorrt", "_Device") @@ -73,7 +74,8 @@ void RegisterTRTCompileSpec() { ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, workspace_size); ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, dla_sram_size); ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, dla_local_dram_size); - ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, dla_global_dram_size); + ADD_FIELD_GET_SET_REGISTRATION( + TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, dla_global_dram_size); ADD_FIELD_GET_SET_REGISTRATION( TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, truncate_long_and_double); } diff --git a/py/torch_tensorrt/csrc/tensorrt_classes.cpp b/py/torch_tensorrt/csrc/tensorrt_classes.cpp index ca11cf4bc1..96fef793fd 100644 --- a/py/torch_tensorrt/csrc/tensorrt_classes.cpp +++ b/py/torch_tensorrt/csrc/tensorrt_classes.cpp @@ -108,35 +108,35 @@ std::string sig_to_str(torch::jit::IValue input_sig) { if (input_sig.isTuple()) { auto input_tuple = input_sig.toTuple(); std::vector children; - for (auto item: input_tuple->elements()) { + for (auto item : input_tuple->elements()) { auto child = sig_to_str(item); children.push_back(child); } 
std::stringstream ss; ss << "("; for (auto i : children) { - ss << i << ", "; + ss << i << ", "; } ss << ")"; return ss.str(); - } else if(input_sig.isList()) { + } else if (input_sig.isList()) { auto input_list = input_sig.toList().vec(); std::vector children; - for (auto item: input_list) { + for (auto item : input_list) { auto child = sig_to_str(item); children.push_back(child); } std::stringstream ss; ss << "["; for (auto i : children) { - ss << i << ", "; + ss << i << ", "; } ss << "]"; return ss.str(); - } else if(input_sig.isCustomClass()) { + } else if (input_sig.isCustomClass()) { auto cur_input = input_sig.toCustomClass(); return cur_input->to_str(); - } else if(input_sig.isPyObject()) { + } else if (input_sig.isPyObject()) { auto py_object_holder = input_sig.toPyObjectHolder(); auto infer_type = py_object_holder->tryToInferType(); auto type = infer_type.type(); @@ -238,27 +238,27 @@ void to_internal_input_signature(torch::jit::IValue input_ivalue, torch::jit::IV if (input_ivalue.isTuple()) { auto input_tuple = input_ivalue.toTuple(); std::vector converted_elements; - for (auto item: input_tuple->elements()) { + for (auto item : input_tuple->elements()) { torch::jit::IValue converted_item; to_internal_input_signature(item, converted_item); converted_elements.push_back(converted_item); auto tuple_ptr = c10::ivalue::Tuple::create(converted_elements); converted_ivalue = torch::jit::IValue(tuple_ptr); } - } else if(input_ivalue.isList()) { + } else if (input_ivalue.isList()) { auto input_list = input_ivalue.toList().vec(); c10::TypePtr type = input_list[0].type(); auto converted_elements = c10::impl::GenericList(type); - for (auto item: input_list) { + for (auto item : input_list) { torch::jit::IValue converted_item; to_internal_input_signature(item, converted_item); converted_elements.push_back(converted_item); } converted_ivalue = torch::jit::IValue(converted_elements); - } else if(input_ivalue.isCustomClass()) { + } else if (input_ivalue.isCustomClass()) { core::ir::Input cur_input = (*(input_ivalue.toCustomClass())).toInternalInput(); converted_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(cur_input))); - } else if(input_ivalue.isPyObject()) { + } else if (input_ivalue.isPyObject()) { auto py_object_holder = input_ivalue.toPyObjectHolder(); auto infer_type = py_object_holder->tryToInferType(); auto type = infer_type.type(); @@ -325,11 +325,17 @@ core::CompileSpec CompileSpec::toInternalCompileSpec() { info.convert_info.engine_settings.num_avg_timing_iters = num_avg_timing_iters; TORCHTRT_CHECK(workspace_size >= 0, "workspace_size must be 0 or greater"); info.convert_info.engine_settings.workspace_size = workspace_size; - TORCHTRT_CHECK(dla_sram_size >= 4096, "DLA managed SRAM size must be at least 4 KiB and must be a power of 2. This defaults to 1 MiB"); + TORCHTRT_CHECK( + dla_sram_size >= 4096, + "DLA managed SRAM size must be at least 4 KiB and must be a power of 2. This defaults to 1 MiB"); info.convert_info.engine_settings.dla_sram_size = dla_sram_size; - TORCHTRT_CHECK(dla_local_dram_size >= 4096, "DLA Local DRAM size must be at least 4 KiB and must be a power of 2. This defaults to 1 GiB"); + TORCHTRT_CHECK( + dla_local_dram_size >= 4096, + "DLA Local DRAM size must be at least 4 KiB and must be a power of 2. This defaults to 1 GiB"); info.convert_info.engine_settings.dla_local_dram_size = dla_local_dram_size; - TORCHTRT_CHECK(dla_global_dram_size >= 4096, "DLA Global DRAM size must be at least 4 KiB and must be a power of 2. 
This defaults to 512 MiB"); + TORCHTRT_CHECK( + dla_global_dram_size >= 4096, + "DLA Global DRAM size must be at least 4 KiB and must be a power of 2. This defaults to 512 MiB"); info.convert_info.engine_settings.dla_global_dram_size = dla_global_dram_size; return info; } @@ -348,7 +354,7 @@ std::string CompileSpec::stringify() { } ss << " \"Enabled Precision\": ["; for (auto p : enabled_precisions) { - ss << to_str(p) << ", " ; + ss << to_str(p) << ", "; } ss << "]" << std::endl; ss << " \"TF32 Disabled\": " << disable_tf32 << std::endl; diff --git a/py/torch_tensorrt/csrc/tensorrt_classes.h b/py/torch_tensorrt/csrc/tensorrt_classes.h index d3b22740c2..be2fab3b8e 100644 --- a/py/torch_tensorrt/csrc/tensorrt_classes.h +++ b/py/torch_tensorrt/csrc/tensorrt_classes.h @@ -58,7 +58,7 @@ struct Input : torch::CustomClassHolder { }; struct InputSignature : torch::CustomClassHolder { - torch::jit::IValue signature_ivalue; // nested Input, full input spec + torch::jit::IValue signature_ivalue; // nested Input, full input spec ADD_FIELD_GET_SET(signature_ivalue, torch::jit::IValue); std::string to_str(); }; diff --git a/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp b/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp index 6247789a93..6b1ffd4ccf 100644 --- a/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp +++ b/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp @@ -1,8 +1,8 @@ #include "pybind11/pybind11.h" #include "pybind11/stl.h" -#include "Python.h" #include "ATen/core/jit_type.h" +#include "Python.h" #include "core/compiler.h" #include "core/conversion/conversion.h" #include "tensorrt_classes.h" @@ -182,7 +182,8 @@ PYBIND11_MODULE(_C, m) { py::class_(m, "InputSignature") .def(pybind11::init([](py::object py_obj) { InputSignature input_signature; - input_signature.signature_ivalue = torch::jit::toIValue(std::move(py_obj), c10::PyObjectType::get(), c10::nullopt); + input_signature.signature_ivalue = + torch::jit::toIValue(std::move(py_obj), c10::PyObjectType::get(), c10::nullopt); return input_signature; })) .def("__str__", &InputSignature::to_str) diff --git a/tests/core/conversion/converters/test_cast.cpp b/tests/core/conversion/converters/test_cast.cpp index 092cdb32a6..d26c7a0277 100644 --- a/tests/core/conversion/converters/test_cast.cpp +++ b/tests/core/conversion/converters/test_cast.cpp @@ -135,7 +135,6 @@ TEST(Converters, ATenBoolToINT32TensorConvertsCorrectly) { ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(jit_results[0], trt, 2e-6)); } - TEST(Converters, ATenToSingleConvertsCorrectly) { const auto graph = R"IR( graph(%y.1 : Tensor): @@ -164,7 +163,6 @@ TEST(Converters, ATenToSingleConvertsCorrectly) { ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(jit_results[0], trt, 2e-6)); } - TEST(Converters, ATenTypeAsConvertsCorrectly) { const auto graph = R"IR( graph(%0 : Tensor, diff --git a/tests/cpp/test_collections.cpp b/tests/cpp/test_collections.cpp index df2280b947..829e82abc9 100644 --- a/tests/cpp/test_collections.cpp +++ b/tests/cpp/test_collections.cpp @@ -5,9 +5,7 @@ #include "torch/script.h" #include "torch_tensorrt/torch_tensorrt.h" - TEST(CppAPITests, TestCollectionStandardTensorInput) { - std::string path = "tests/modules/standard_tensor_input_scripted.jit.pt"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); std::vector inputs; @@ -24,7 +22,6 @@ TEST(CppAPITests, TestCollectionStandardTensorInput) { mod.eval(); mod.to(torch::kCUDA); - std::vector inputs_; for (auto in : inputs) { @@ -52,7 +49,6 @@ TEST(CppAPITests, 
TestCollectionStandardTensorInput) { } TEST(CppAPITests, TestCollectionTupleInput) { - std::string path = "tests/modules/tuple_input_scripted.jit.pt"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); @@ -78,14 +74,12 @@ TEST(CppAPITests, TestCollectionTupleInput) { auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); - std::tuple input_shape_tuple(input_shape_ivalue, input_shape_ivalue); torch::jit::IValue complex_input_shape(input_shape_tuple); std::tuple input_tuple2(complex_input_shape); torch::jit::IValue complex_input_shape2(input_tuple2); - auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); compile_settings.require_full_compilation = false; compile_settings.min_block_size = 3; @@ -100,9 +94,7 @@ TEST(CppAPITests, TestCollectionTupleInput) { ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTensor(), trt_out.toTensor(), 1e-5)); } - TEST(CppAPITests, TestCollectionListInput) { - std::string path = "tests/modules/list_input_scripted.jit.pt"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); std::vector inputs; @@ -118,7 +110,6 @@ TEST(CppAPITests, TestCollectionListInput) { mod.eval(); mod.to(torch::kCUDA); - std::vector inputs_; for (auto in : inputs) { @@ -134,7 +125,6 @@ TEST(CppAPITests, TestCollectionListInput) { complex_inputs.push_back(input_list_ivalue); - auto out = mod.forward(complex_inputs); LOG_DEBUG("Finish torchscirpt forward"); @@ -146,7 +136,6 @@ TEST(CppAPITests, TestCollectionListInput) { list.push_back(input_shape_ivalue); list.push_back(input_shape_ivalue); - torch::jit::IValue complex_input_shape(list); std::tuple input_tuple2(complex_input_shape); torch::jit::IValue complex_input_shape2(input_tuple2); @@ -166,9 +155,7 @@ TEST(CppAPITests, TestCollectionListInput) { ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTensor(), trt_out.toTensor(), 1e-5)); } - TEST(CppAPITests, TestCollectionTupleInputOutput) { - std::string path = "tests/modules/tuple_input_output_scripted.jit.pt"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); @@ -183,7 +170,6 @@ TEST(CppAPITests, TestCollectionTupleInputOutput) { mod.eval(); mod.to(torch::kCUDA); - std::vector complex_inputs, complex_inputs_list; std::tuple input_tuple(in0, in0); @@ -196,7 +182,6 @@ TEST(CppAPITests, TestCollectionTupleInputOutput) { auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); - std::tuple input_shape_tuple(input_shape_ivalue, input_shape_ivalue); torch::jit::IValue complex_input_shape(input_shape_tuple); @@ -217,13 +202,13 @@ TEST(CppAPITests, TestCollectionTupleInputOutput) { LOG_DEBUG("Finish compile"); auto trt_out = trt_mod.forward(complex_inputs); - ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTuple()->elements()[0].toTensor(), trt_out.toTuple()->elements()[0].toTensor(), 1e-5)); - ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTuple()->elements()[1].toTensor(), trt_out.toTuple()->elements()[1].toTensor(), 1e-5)); + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual( + out.toTuple()->elements()[0].toTensor(), trt_out.toTuple()->elements()[0].toTensor(), 1e-5)); + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual( + out.toTuple()->elements()[1].toTensor(), trt_out.toTuple()->elements()[1].toTensor(), 1e-5)); } - TEST(CppAPITests, TestCollectionListInputOutput) { - std::string path = "tests/modules/list_input_output_scripted.jit.pt"; torch::Tensor in0 
= torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); std::vector inputs; @@ -239,7 +224,6 @@ TEST(CppAPITests, TestCollectionListInputOutput) { mod.eval(); mod.to(torch::kCUDA); - std::vector inputs_; for (auto in : inputs) { @@ -255,7 +239,6 @@ TEST(CppAPITests, TestCollectionListInputOutput) { complex_inputs.push_back(input_list_ivalue); - auto out = mod.forward(complex_inputs); LOG_DEBUG("Finish torchscirpt forward"); @@ -263,13 +246,11 @@ TEST(CppAPITests, TestCollectionListInputOutput) { auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); - c10::TypePtr elementType = input_shape_ivalue.type(); auto list = c10::impl::GenericList(elementType); list.push_back(input_shape_ivalue); list.push_back(input_shape_ivalue); - torch::jit::IValue complex_input_shape(list); std::tuple input_tuple2(complex_input_shape); torch::jit::IValue complex_input_shape2(input_tuple2); @@ -288,13 +269,13 @@ TEST(CppAPITests, TestCollectionListInputOutput) { LOG_DEBUG("Finish compile"); auto trt_out = trt_mod.forward(complex_inputs); - ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toList().vec()[0].toTensor(), trt_out.toList().vec()[0].toTensor(), 1e-5)); - ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toList().vec()[1].toTensor(), trt_out.toList().vec()[1].toTensor(), 1e-5)); + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual( + out.toList().vec()[0].toTensor(), trt_out.toList().vec()[0].toTensor(), 1e-5)); + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual( + out.toList().vec()[1].toTensor(), trt_out.toList().vec()[1].toTensor(), 1e-5)); } - TEST(CppAPITests, TestCollectionComplexModel) { - std::string path = "tests/modules/list_input_tuple_output_scripted.jit.pt"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); std::vector inputs; @@ -310,7 +291,6 @@ TEST(CppAPITests, TestCollectionComplexModel) { mod.eval(); mod.to(torch::kCUDA); - std::vector inputs_; for (auto in : inputs) { @@ -326,7 +306,6 @@ TEST(CppAPITests, TestCollectionComplexModel) { complex_inputs.push_back(input_list_ivalue); - auto out = mod.forward(complex_inputs); LOG_DEBUG("Finish torchscirpt forward"); @@ -339,7 +318,6 @@ TEST(CppAPITests, TestCollectionComplexModel) { list.push_back(input_shape_ivalue); list.push_back(input_shape_ivalue); - torch::jit::IValue complex_input_shape(list); std::tuple input_tuple2(complex_input_shape); torch::jit::IValue complex_input_shape2(input_tuple2); @@ -358,6 +336,8 @@ TEST(CppAPITests, TestCollectionComplexModel) { LOG_DEBUG("Finish compile"); auto trt_out = trt_mod.forward(complex_inputs); - ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTuple()->elements()[0].toTensor(), trt_out.toTuple()->elements()[0].toTensor(), 1e-5)); - ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTuple()->elements()[1].toTensor(), trt_out.toTuple()->elements()[1].toTensor(), 1e-5)); + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual( + out.toTuple()->elements()[0].toTensor(), trt_out.toTuple()->elements()[0].toTensor(), 1e-5)); + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual( + out.toTuple()->elements()[1].toTensor(), trt_out.toTuple()->elements()[1].toTensor(), 1e-5)); } \ No newline at end of file diff --git a/tests/cpp/test_example_tensors.cpp b/tests/cpp/test_example_tensors.cpp index 3ec8831f9d..256e6f1b59 100644 --- a/tests/cpp/test_example_tensors.cpp +++ b/tests/cpp/test_example_tensors.cpp @@ -9,7 +9,6 @@ TEST_P(CppAPITests, InputsFromTensors) { 
trt_inputs_ivalues.push_back(in.clone()); } - auto inputs = std::vector{trt_inputs_ivalues[0].toTensor()}; auto spec = torch_tensorrt::ts::CompileSpec(inputs); diff --git a/tools/linter/utils.py b/tools/linter/utils.py index 1754702f6b..8d4d75cd70 100644 --- a/tools/linter/utils.py +++ b/tools/linter/utils.py @@ -6,7 +6,7 @@ BLACKLISTED_BAZEL_TARGETS = [ "//experiments", "//tools", "//docker", "//third_party", "//bazel-bin", "//bazel-genfiles", "//bazel-out", "//bazel-TRTorch", "//bazel-Torch-TensorRT", "//bazel-torch-tensorrt", "//bazel-workspace", - "//bazel-testlogs", "//py/build", + "//bazel-tensorrt", "bazel-TensorRT", "//bazel-testlogs", "//py/build", "//py/dist", "//py/trtorch.egg-info", "//py/wheelhouse", "//examples", "//docsrc", "//docs" ] @@ -35,4 +35,4 @@ def glob_files(project, file_types): files = [] for t in file_types: files += glob.glob(project + "/**/*" + t, recursive=True) - return files \ No newline at end of file + return files From f866dba29afa5848ac67d885eaa1e083e2e00177 Mon Sep 17 00:00:00 2001 From: Bo Wang Date: Mon, 1 Aug 2022 22:16:17 -0700 Subject: [PATCH 10/16] fix: fix the bug that ListConstruct is in TRT subgraph when it's entire graph's output Signed-off-by: Bo Wang --- core/partitioning/partitioning.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/core/partitioning/partitioning.cpp b/core/partitioning/partitioning.cpp index 85626772f0..565f58c677 100644 --- a/core/partitioning/partitioning.cpp +++ b/core/partitioning/partitioning.cpp @@ -90,6 +90,16 @@ std::vector getDependencyNodes( return stk; } +void find_nontensor_output_nodes( + torch::jit::Block* block, + std::unordered_map& global_fallback_nodes) { + for (auto i : block->outputs()) { + if (!isTensor(i)) { + global_fallback_nodes.insert({i->node(), FallbackNodeType::kNON_TENSOR}); + } + } +} + void find_all_fallback_nodes( std::unordered_map& initial_fallback_nodes, std::unordered_map& global_fallback_nodes) { @@ -430,6 +440,9 @@ PartitionedGraph Partition( const PartitionInfo& partition_info, std::unordered_map& global_fallback_nodes) { LOG_DEBUG(partition_info); + // if there is nonTensor output for the entire graph, fallback the node that produces this nonTensor output + find_nontensor_output_nodes(block, global_fallback_nodes); + // segment lowering global graph into blocks LOG_DEBUG("Parititioning source module into PyTorch and TensorRT sub blocks"); PartitionedGraph segmented_blocks = segment_graph(block, partition_info, global_fallback_nodes); From 6d0b1d3404ecdac09c45e3455078709b445769b4 Mon Sep 17 00:00:00 2001 From: Bo Wang Date: Tue, 2 Aug 2022 22:39:15 -0700 Subject: [PATCH 11/16] fix: fix the error that collection input segmented into trt subgraph Signed-off-by: Bo Wang --- core/partitioning/partitioning.cpp | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/core/partitioning/partitioning.cpp b/core/partitioning/partitioning.cpp index 565f58c677..28bfd0712c 100644 --- a/core/partitioning/partitioning.cpp +++ b/core/partitioning/partitioning.cpp @@ -90,14 +90,26 @@ std::vector getDependencyNodes( return stk; } -void find_nontensor_output_nodes( +// check if the input and output of the graph is Tensor after collection is enabled. 
If it is, then fallback related +// nodes +void fallback_graph_nontensor_in_out( torch::jit::Block* block, std::unordered_map& global_fallback_nodes) { + // fallback nodes that produce entire graph's nonTensor output for (auto i : block->outputs()) { if (!isTensor(i)) { global_fallback_nodes.insert({i->node(), FallbackNodeType::kNON_TENSOR}); } } + + // fallback nodes that consume entire graph's nonTensor input + for (auto i : block->inputs()) { + if (!isTensor(i)) { + for (auto use : i->uses()) { + global_fallback_nodes.insert({use.user, FallbackNodeType::kNON_TENSOR}); + } + } + } } void find_all_fallback_nodes( @@ -202,6 +214,7 @@ void registerSegmentsOutputs(PartitionedGraph& segmented_blocks, torch::jit::Blo } } } + std::for_each(segmented_blocks.begin(), segmented_blocks.end(), [](SegmentedBlock& seg_block) { torch::jit::EliminateDeadCode(seg_block.g()); }); @@ -440,8 +453,9 @@ PartitionedGraph Partition( const PartitionInfo& partition_info, std::unordered_map& global_fallback_nodes) { LOG_DEBUG(partition_info); - // if there is nonTensor output for the entire graph, fallback the node that produces this nonTensor output - find_nontensor_output_nodes(block, global_fallback_nodes); + // if there is nonTensor input/output for the entire graph, fallback the node that consumes/produces this nonTensor + // output + fallback_graph_nontensor_in_out(block, global_fallback_nodes); // segment lowering global graph into blocks LOG_DEBUG("Parititioning source module into PyTorch and TensorRT sub blocks"); From 8b891fb18bee24d51cfabe1b7c36c693c7fb4362 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Wed, 3 Aug 2022 09:56:11 -0700 Subject: [PATCH 12/16] feat(//core/conversion/converters/evaluators): New evaluators for collections Implements evaluators for: - prim::TupleUnpack - prim::TupleConstruct - prim::TupleIndex Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/conversion/evaluators/aten.cpp | 8 -- core/conversion/evaluators/eval_util.cpp | 9 ++ core/conversion/evaluators/eval_util.h | 2 + core/conversion/evaluators/prim.cpp | 52 ++++++++- core/ir/GraphInputs.cpp | 3 - core/ir/ir.cpp | 12 +- cpp/src/compile_spec.cpp | 1 + py/torch_tensorrt/ts/_compile_spec.py | 14 +++ py/torch_tensorrt/ts/_compiler.py | 27 +++++ .../evaluators/test_prim_evaluators.cpp | 107 ++++++++++++++++++ tests/cpp/test_collections.cpp | 39 +++---- tests/util/evaluate_graph.cpp | 2 +- 12 files changed, 232 insertions(+), 44 deletions(-) diff --git a/core/conversion/evaluators/aten.cpp b/core/conversion/evaluators/aten.cpp index 4632744790..49f529c003 100644 --- a/core/conversion/evaluators/aten.cpp +++ b/core/conversion/evaluators/aten.cpp @@ -19,14 +19,6 @@ namespace conversion { namespace evaluators { namespace { -int64_t normalizeIndex(int64_t idx, int64_t list_size) { - if (idx < 0) { - // Handle negative indexing - idx = list_size + idx; - } - return idx; -} - DEFINE_GENERIC_TWO_INPUT_EVALUATOR( eq, "aten::eq", diff --git a/core/conversion/evaluators/eval_util.cpp b/core/conversion/evaluators/eval_util.cpp index 79b377cd37..742a4f4938 100644 --- a/core/conversion/evaluators/eval_util.cpp +++ b/core/conversion/evaluators/eval_util.cpp @@ -12,6 +12,15 @@ namespace core { namespace conversion { namespace evaluators { +int64_t normalizeIndex(int64_t idx, int64_t list_size) { + if (idx < 0) { + // Handle negative indexing + idx = list_size + idx; + } + return idx; +} + + // TODO: Switch back to PyTorch canonical implimentation c10::optional toIValue(const torch::jit::Value* v) { if (v->node()->kind() != 
torch::jit::prim::Constant || v->type()->cast()) { diff --git a/core/conversion/evaluators/eval_util.h b/core/conversion/evaluators/eval_util.h index 5e233b4e2d..a9c21339bb 100644 --- a/core/conversion/evaluators/eval_util.h +++ b/core/conversion/evaluators/eval_util.h @@ -13,6 +13,8 @@ at::Tensor createTensorFromList( const torch::jit::IValue& dtype, const torch::jit::IValue& device); +int64_t normalizeIndex(int64_t idx, int64_t list_size); + at::Tensor scalar_to_tensor(const at::Scalar& s, const at::Device device = at::kCPU); } // namespace evaluators diff --git a/core/conversion/evaluators/prim.cpp b/core/conversion/evaluators/prim.cpp index 7d5373a5f9..338c427ccd 100644 --- a/core/conversion/evaluators/prim.cpp +++ b/core/conversion/evaluators/prim.cpp @@ -259,6 +259,56 @@ auto prim_registrations = } }, EvalOptions().validSchemas({"prim::shape(Tensor a) -> (int[])"})}) + .evaluator({torch::jit::prim::TupleConstruct, + [](const torch::jit::Node* n, kwargs& args) -> c10::optional { + auto num_inputs = n->inputs().size(); + c10::IValue tuple = c10::ivalue::Tuple::create(); + switch (num_inputs) { + case 0: + tuple = c10::ivalue::Tuple::create(); + break; + case 1: + tuple = c10::ivalue::Tuple::create(std::move((*args.at(n->input(0)).IValue()))); + break; + case 2: { + tuple = c10::ivalue::Tuple::create( + std::move(*(args.at(n->input(0)).IValue())), + std::move(*(args.at(n->input(1)).IValue()))); + break; + } + case 3: { + tuple = c10::ivalue::Tuple::create( + std::move(*(args.at(n->input(0)).IValue())), + std::move(*(args.at(n->input(1)).IValue())), + std::move(*(args.at(n->input(2)).IValue()))); + break; + } + default: { + std::vector elems; + for (size_t i = 0; i < num_inputs; i++) { + elems.push_back(*(args.at(n->input(i)).IValue())); + } + tuple = c10::ivalue::Tuple::create(std::move(elems)); + break; + } + } + return c10::optional(std::move(tuple)); + }}) + .evaluator({torch::jit::prim::TupleIndex, + [](const torch::jit::Node* n, kwargs& args) -> c10::optional { + // Outputs is an IValue which has list of tensors which can be found in ctx->evaluated_value_map + auto tuple = args.at(n->input(0)).IValue()->toTuple(); + int64_t idx = args.at(n->input(1)).IValue()->toInt(); + int64_t norm_idx = normalizeIndex(idx, tuple->elements().size()); + return c10::optional(std::move(tuple->elements()[norm_idx])); + }, + EvalOptions().validSchemas({"prim::TupleIndex(Any tup, int i) -> (Any)"})}) + .evaluator({torch::jit::prim::TupleUnpack, + [](const torch::jit::Node* n, kwargs& args) -> c10::optional { + // Outputs is an IValue which has list of tensors which can be found in ctx->evaluated_value_map + auto output = args.at(n->input()).IValue()->toTuple(); + return c10::optional(std::move(output)); + }}) .evaluator({c10::Symbol::fromQualString("prim::unchecked_cast"), [](const torch::jit::Node* n, kwargs& args) -> c10::optional { return *(args.at(n->input(0)).IValue()); @@ -277,4 +327,4 @@ auto prim_registrations = } // namespace evaluators } // namespace conversion } // namespace core -} // namespace torch_tensorrt +} // namespace torch_tensorrt \ No newline at end of file diff --git a/core/ir/GraphInputs.cpp b/core/ir/GraphInputs.cpp index a1b1196be9..f3fa889385 100644 --- a/core/ir/GraphInputs.cpp +++ b/core/ir/GraphInputs.cpp @@ -54,7 +54,6 @@ void flatten_dfs( } GraphInputs::GraphInputs(std::vector inputs_) { - LOG_DEBUG("Construct GraphInput with ir::Input"); inputs = inputs_; collection_inputs.resize(inputs_.size()); for (size_t i = 0; i < inputs_.size(); i++) { @@ -63,8 +62,6 @@ 
GraphInputs::GraphInputs(std::vector inputs_) { } GraphInputs::GraphInputs(torch::jit::IValue& input_signature_) { - LOG_DEBUG("Construct GraphInput with IValue"); - std::vector flattened_inputs; std::vector> collection_inputs_; diff --git a/core/ir/ir.cpp b/core/ir/ir.cpp index d9b021ed8b..99bf4f97b1 100644 --- a/core/ir/ir.cpp +++ b/core/ir/ir.cpp @@ -29,7 +29,7 @@ InputSpecMap pair_input_vals_with_specs(std::vector va std::unordered_map a; for (size_t i = 0; i < vals.size(); i++) { - LOG_DEBUG("Pairing " << i << ": " << vals[i]->debugName() << " : " << specs[i]); + LOG_DEBUG("Pairing " << i << ": " << vals[i]->debugName() << ": " << specs[i]); a.insert({vals[i], specs[i]}); } return a; @@ -56,7 +56,7 @@ std::vector get_tensor_inputs( StaticParams& static_params) { std::vector input_tensors; auto inputs = g->inputs(); - LOG_DEBUG("Raw inputs size of get_tensor_inputs: " << inputs.size()); + LOG_DEBUG("Found " << inputs.size() << " inputs to graph"); for (auto in : inputs) { LOG_DEBUG("Handle input of debug name: " << in->debugName()); // Disregarding inputs that are not tensors or are static @@ -76,7 +76,7 @@ std::vector get_collection_inputs( StaticParams& static_params) { std::vector input_tensors; auto inputs = g->inputs(); - LOG_DEBUG("Raw inputs size of get_collection_inputs: " << inputs.size()); + LOG_DEBUG("Found " << inputs.size() << " inputs to graph"); for (auto in : inputs) { LOG_DEBUG("Handle input of debug name: " << in->debugName()); if (in->type()->isSubtypeOf(c10::TensorType::get()) && static_params.find(in) == static_params.end()) { @@ -86,9 +86,9 @@ std::vector get_collection_inputs( // { input_tensors.push_back(in); // push original tuple at::ArrayRef unpack_tuple = torch::jit::createTupleUnpack(in); - LOG_DEBUG("get_collection_inputs, tuple size " << unpack_tuple.size()); + LOG_DEBUG("Input tuple size " << unpack_tuple.size()); } else if (in->type()->kind() == torch::jit::TypeKind::ListType && static_params.find(in) == static_params.end()) { - LOG_DEBUG("get_collection_inputs, list use size " << in->uses().size()); + LOG_DEBUG("Input list use size " << in->uses().size()); input_tensors.push_back(in); // push original list } } @@ -227,7 +227,7 @@ CollectionTypeMap get_block_first_calc_dtypes_opt_collection(torch::jit::Block* } else if (i->type()->kind() == torch::jit::TypeKind::ListType) { // TODO: to decide the size of list and type of list element - LOG_DEBUG("get_block_first_calc_dtypes_opt ListType: use size " << i->uses().size()); + LOG_DEBUG("Number of list uses " << i->uses().size()); c10::optional tp = get_value_first_calc_dtype_opt(b, i); // std::vector> dytpes(i->uses().size()); std::vector> dytpes(i->uses().size(), tp); diff --git a/cpp/src/compile_spec.cpp b/cpp/src/compile_spec.cpp index 432b070e91..04573499e6 100644 --- a/cpp/src/compile_spec.cpp +++ b/cpp/src/compile_spec.cpp @@ -69,6 +69,7 @@ torchtrt::core::CompileSpec init_compile_spec(CompileSpec external) { return internal; } else { torch::jit::IValue converted_input_signature; + LOG_WARNING( "Input signature parsing is an experimental feature, behavior and APIs may change"); to_internal_input_signature(external.graph_inputs.input_signature, converted_input_signature); torchtrt::core::CompileSpec internal(converted_input_signature); return internal; diff --git a/py/torch_tensorrt/ts/_compile_spec.py b/py/torch_tensorrt/ts/_compile_spec.py index 0eb8a1cdce..628955c079 100644 --- a/py/torch_tensorrt/ts/_compile_spec.py +++ b/py/torch_tensorrt/ts/_compile_spec.py @@ -305,6 +305,20 @@ def 
TensorRTCompileSpec(inputs=[], torch.randn((1, 3, 224, 244)) # Use an example tensor and let torch_tensorrt infer settings ] + input_signature Union(List, Tuple, torch_tensorrt.Input, torch.Tensor): A formatted collection of input specifications for the module. Input Sizes can be specified as torch sizes, tuples or lists. dtypes can be specified using + torch datatypes or torch_tensorrt datatypes and you can use either torch devices or the torch_tensorrt device type enum to select device type. **This API should be considered beta-level stable and may change in the future** :: + + input_signature=([ + torch_tensorrt.Input((1, 3, 224, 224)), # Static NCHW input shape for input #1 + torch_tensorrt.Input( + min_shape=(1, 224, 224, 3), + opt_shape=(1, 512, 512, 3), + max_shape=(1, 1024, 1024, 3), + dtype=torch.int32 + format=torch.channel_last + ), # Dynamic input shape for input #2 + ], torch.randn((1, 3, 224, 244))) # Use an example tensor and let torch_tensorrt infer settings for input #3 + device (Union(torch_tensorrt.Device, torch.device, dict)): Target device for TensorRT engines to run on :: device=torch_tensorrt.Device("dla:1", allow_gpu_fallback=True) diff --git a/py/torch_tensorrt/ts/_compiler.py b/py/torch_tensorrt/ts/_compiler.py index 508cb8fdd0..9119c25e86 100644 --- a/py/torch_tensorrt/ts/_compiler.py +++ b/py/torch_tensorrt/ts/_compiler.py @@ -58,6 +58,19 @@ def compile(module: torch.jit.ScriptModule, torch.randn((1, 3, 224, 244)) # Use an example tensor and let torch_tensorrt infer settings ] + input_signature Union(List, Tuple, torch_tensorrt.Input, torch.Tensor): A formatted collection of input specifications for the module. Input Sizes can be specified as torch sizes, tuples or lists. dtypes can be specified using + torch datatypes or torch_tensorrt datatypes and you can use either torch devices or the torch_tensorrt device type enum to select device type. **This API should be considered beta-level stable and may change in the future** :: + + input_signature=([ + torch_tensorrt.Input((1, 3, 224, 224)), # Static NCHW input shape for input #1 + torch_tensorrt.Input( + min_shape=(1, 224, 224, 3), + opt_shape=(1, 512, 512, 3), + max_shape=(1, 1024, 1024, 3), + dtype=torch.int32 + format=torch.channel_last + ), # Dynamic input shape for input #2 + ], torch.randn((1, 3, 224, 244))) # Use an example tensor and let torch_tensorrt infer settings for input #3 device (Union(torch_tensorrt.Device, torch.device, dict)): Target device for TensorRT engines to run on :: device=torch_tensorrt.Device("dla:1", allow_gpu_fallback=True) @@ -163,6 +176,20 @@ def convert_method_to_trt_engine(module: torch.jit.ScriptModule, torch.randn((1, 3, 224, 244)) # Use an example tensor and let torch_tensorrt infer settings ] + input_signature Union(List, Tuple, torch_tensorrt.Input, torch.Tensor): A formatted collection of input specifications for the module. Input Sizes can be specified as torch sizes, tuples or lists. dtypes can be specified using + torch datatypes or torch_tensorrt datatypes and you can use either torch devices or the torch_tensorrt device type enum to select device type. 
**This API should be considered beta-level stable and may change in the future** :: + + input_signature=([ + torch_tensorrt.Input((1, 3, 224, 224)), # Static NCHW input shape for input #1 + torch_tensorrt.Input( + min_shape=(1, 224, 224, 3), + opt_shape=(1, 512, 512, 3), + max_shape=(1, 1024, 1024, 3), + dtype=torch.int32 + format=torch.channel_last + ), # Dynamic input shape for input #2 + ], torch.randn((1, 3, 224, 244))) # Use an example tensor and let torch_tensorrt infer settings for input #3 + device (Union(torch_tensorrt.Device, torch.device, dict)): Target device for TensorRT engines to run on :: device=torch_tensorrt.Device("dla:1", allow_gpu_fallback=True) diff --git a/tests/core/conversion/evaluators/test_prim_evaluators.cpp b/tests/core/conversion/evaluators/test_prim_evaluators.cpp index 0ff250f0e9..508d4eb1b0 100644 --- a/tests/core/conversion/evaluators/test_prim_evaluators.cpp +++ b/tests/core/conversion/evaluators/test_prim_evaluators.cpp @@ -51,5 +51,112 @@ TEST(Evaluators, NumToTensorEvaluatesCorrectly) { auto jit_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {}); auto trt_results = torch_tensorrt::tests::util::EvaluateGraph(g->block(), {}); + ASSERT_TRUE(jit_results[0] == trt_results[0]); +} + +TEST(Evaluators, PrimTupleConstruct1EvaluatesCorrectly) { + const auto graph = R"IR( + graph(): + %1 : int = prim::Constant[value=3]() + %tc : (int) = prim::TupleConstruct(%1) + return (%tc))IR"; + + auto g = std::make_shared(); + torch::jit::parseIR(graph, g.get()); + + auto jit_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {}); + auto trt_results = torch_tensorrt::tests::util::EvaluateGraph(g->block(), {}); + + ASSERT_TRUE(jit_results[0] == trt_results[0]); +} + +TEST(Evaluators, PrimTupleConstruct2EvaluatesCorrectly) { + const auto graph = R"IR( + graph(): + %1 : int = prim::Constant[value=3]() + %2 : int = prim::Constant[value=4]() + %tc : (int, int) = prim::TupleConstruct(%1, %2) + return (%tc))IR"; + + auto g = std::make_shared(); + torch::jit::parseIR(graph, g.get()); + + auto jit_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {}); + auto trt_results = torch_tensorrt::tests::util::EvaluateGraph(g->block(), {}); + + ASSERT_TRUE(jit_results[0] == trt_results[0]); +} + +TEST(Evaluators, PrimTupleConstruct3EvaluatesCorrectly) { + const auto graph = R"IR( + graph(): + %1 : int = prim::Constant[value=3]() + %2 : int = prim::Constant[value=4]() + %3 : int = prim::Constant[value=4]() + %tc : (int, int, int) = prim::TupleConstruct(%1, %2, %3) + return (%tc))IR"; + + auto g = std::make_shared(); + torch::jit::parseIR(graph, g.get()); + + auto jit_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {}); + auto trt_results = torch_tensorrt::tests::util::EvaluateGraph(g->block(), {}); + + ASSERT_TRUE(jit_results[0] == trt_results[0]); +} + +TEST(Evaluators, PrimTupleConstruct4EvaluatesCorrectly) { + const auto graph = R"IR( + graph(): + %1 : int = prim::Constant[value=3]() + %2 : int = prim::Constant[value=4]() + %3 : int = prim::Constant[value=3]() + %4 : int = prim::Constant[value=4]() + %tc : (int, int, int, int) = prim::TupleConstruct(%1, %2, %3, %4) + return (%tc))IR"; + + auto g = std::make_shared(); + torch::jit::parseIR(graph, g.get()); + + auto jit_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {}); + auto trt_results = torch_tensorrt::tests::util::EvaluateGraph(g->block(), {}); + + ASSERT_TRUE(jit_results[0] == trt_results[0]); +} + +TEST(Evaluators, PrimTupleUnpackEvaluatesCorrectly) { + const auto graph = R"IR( + 
graph(): + %1 : int = prim::Constant[value=3]() + %2 : int = prim::Constant[value=4]() + %tc : (int, int) = prim::TupleConstruct(%1, %2) + %tu.1 : int, %tu.2 : int = prim::TupleUnpack(%tc) + return (%tu.1, %tu.2))IR"; + + auto g = std::make_shared(); + torch::jit::parseIR(graph, g.get()); + + auto jit_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {}); + auto trt_results = torch_tensorrt::tests::util::EvaluateGraph(g->block(), {}); + + ASSERT_TRUE(jit_results[0] == trt_results[0]); +} + +TEST(Evaluators, PrimTupleIndexEvaluatesCorrectly) { + const auto graph = R"IR( + graph(): + %0 : int = prim::Constant[value=1]() + %1 : int = prim::Constant[value=3]() + %2 : int = prim::Constant[value=4]() + %tc : (int, int) = prim::TupleConstruct(%1, %2) + %ti : int = prim::TupleIndex(%tc, %0) + return (%ti))IR"; + + auto g = std::make_shared(); + torch::jit::parseIR(graph, g.get()); + + auto jit_results = torch_tensorrt::tests::util::EvaluateGraphJIT(g, {}); + auto trt_results = torch_tensorrt::tests::util::EvaluateGraph(g->block(), {}); + ASSERT_TRUE(jit_results[0] == trt_results[0]); } \ No newline at end of file diff --git a/tests/cpp/test_collections.cpp b/tests/cpp/test_collections.cpp index 829e82abc9..e48874e8bb 100644 --- a/tests/cpp/test_collections.cpp +++ b/tests/cpp/test_collections.cpp @@ -29,20 +29,18 @@ TEST(CppAPITests, TestCollectionStandardTensorInput) { } auto out = mod.forward(inputs_); - LOG_DEBUG("Finish torchscirpt forward"); std::vector input_range; input_range.push_back({in0.sizes(), torch::kF16}); input_range.push_back({in0.sizes(), torch::kF16}); torch_tensorrt::ts::CompileSpec compile_settings(input_range); compile_settings.require_full_compilation = true; - compile_settings.min_block_size = 3; + compile_settings.min_block_size = 1; // // FP16 execution compile_settings.enabled_precisions = {torch::kHalf}; // // Compile module auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); - LOG_DEBUG("Finish compile"); auto trt_out = trt_mod.forward(inputs_); ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTensor(), trt_out.toTensor(), 1e-5)); @@ -68,7 +66,6 @@ TEST(CppAPITests, TestCollectionTupleInput) { complex_inputs.push_back(input_tuple); auto out = mod.forward(complex_inputs); - LOG_DEBUG("Finish torchscirpt forward"); auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kHalf); @@ -81,14 +78,13 @@ TEST(CppAPITests, TestCollectionTupleInput) { torch::jit::IValue complex_input_shape2(input_tuple2); auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); - compile_settings.require_full_compilation = false; - compile_settings.min_block_size = 3; + compile_settings.require_full_compilation = true; + compile_settings.min_block_size = 1; // // FP16 execution compile_settings.enabled_precisions = {torch::kHalf}; // // Compile module auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); - LOG_DEBUG("Finish compile"); auto trt_out = trt_mod.forward(complex_inputs); ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTensor(), trt_out.toTensor(), 1e-5)); @@ -126,7 +122,6 @@ TEST(CppAPITests, TestCollectionListInput) { complex_inputs.push_back(input_list_ivalue); auto out = mod.forward(complex_inputs); - LOG_DEBUG("Finish torchscirpt forward"); auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kHalf); auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); @@ -141,9 +136,9 @@ TEST(CppAPITests, 
TestCollectionListInput) { torch::jit::IValue complex_input_shape2(input_tuple2); auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); - compile_settings.require_full_compilation = false; - compile_settings.min_block_size = 3; - compile_settings.torch_executed_ops.push_back("aten::__getitem__"); + compile_settings.require_full_compilation = true; + compile_settings.min_block_size = 1; + //compile_settings.torch_executed_ops.push_back("aten::__getitem__"); // // FP16 execution compile_settings.enabled_precisions = {torch::kHalf}; @@ -176,7 +171,6 @@ TEST(CppAPITests, TestCollectionTupleInputOutput) { complex_inputs.push_back(input_tuple); auto out = mod.forward(complex_inputs); - LOG_DEBUG("Finish torchscirpt forward"); auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kHalf); @@ -190,8 +184,8 @@ TEST(CppAPITests, TestCollectionTupleInputOutput) { // torch::jit::IValue complex_input_shape(list); auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); - compile_settings.require_full_compilation = false; - compile_settings.min_block_size = 3; + compile_settings.require_full_compilation = true; + compile_settings.min_block_size = 1; // compile_settings.torch_executed_ops.push_back("prim::TupleConstruct"); @@ -199,7 +193,6 @@ TEST(CppAPITests, TestCollectionTupleInputOutput) { compile_settings.enabled_precisions = {torch::kHalf}; // // Compile module auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); - LOG_DEBUG("Finish compile"); auto trt_out = trt_mod.forward(complex_inputs); ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual( @@ -240,7 +233,6 @@ TEST(CppAPITests, TestCollectionListInputOutput) { complex_inputs.push_back(input_list_ivalue); auto out = mod.forward(complex_inputs); - LOG_DEBUG("Finish torchscirpt forward"); auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kHalf); @@ -256,17 +248,16 @@ TEST(CppAPITests, TestCollectionListInputOutput) { torch::jit::IValue complex_input_shape2(input_tuple2); auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); - compile_settings.require_full_compilation = false; - compile_settings.min_block_size = 3; + compile_settings.require_full_compilation = true; + compile_settings.min_block_size = 1; // Need to skip the conversion of __getitem__ and ListConstruct - compile_settings.torch_executed_ops.push_back("aten::__getitem__"); + //compile_settings.torch_executed_ops.push_back("aten::__getitem__"); // // FP16 execution compile_settings.enabled_precisions = {torch::kHalf}; // // Compile module auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); - LOG_DEBUG("Finish compile"); auto trt_out = trt_mod.forward(complex_inputs); ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual( @@ -307,7 +298,6 @@ TEST(CppAPITests, TestCollectionComplexModel) { complex_inputs.push_back(input_list_ivalue); auto out = mod.forward(complex_inputs); - LOG_DEBUG("Finish torchscirpt forward"); auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kHalf); @@ -323,17 +313,16 @@ TEST(CppAPITests, TestCollectionComplexModel) { torch::jit::IValue complex_input_shape2(input_tuple2); auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); - compile_settings.require_full_compilation = false; - compile_settings.min_block_size = 3; + compile_settings.require_full_compilation = true; + compile_settings.min_block_size = 1; // Need to skip the conversion of 
__getitem__ and ListConstruct - compile_settings.torch_executed_ops.push_back("aten::__getitem__"); + //compile_settings.torch_executed_ops.push_back("aten::__getitem__"); // // FP16 execution compile_settings.enabled_precisions = {torch::kHalf}; // // Compile module auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); - LOG_DEBUG("Finish compile"); auto trt_out = trt_mod.forward(complex_inputs); ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual( diff --git a/tests/util/evaluate_graph.cpp b/tests/util/evaluate_graph.cpp index 7e69b454ef..5a9f10f7b0 100644 --- a/tests/util/evaluate_graph.cpp +++ b/tests/util/evaluate_graph.cpp @@ -28,7 +28,7 @@ std::vector EvaluateGraph(const torch::jit::Block* b, std::v "Test graph contains non evaluatable nodes: " << *n); auto eval = core::conversion::EvaluateNode(ctx, n); if (eval) { - if (eval.value().isTuple()) { + if (eval.value().isTuple() && n->outputs().size() > 1) { auto eval_list = eval.value().toTuple(); for (size_t i = 0; i < eval_list->elements().size(); i++) { auto eval_output = eval_list.get()->elements()[i]; From f5199355f3cc30fd3cf169fcba76cc389c327ac6 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Wed, 3 Aug 2022 22:02:19 -0700 Subject: [PATCH 13/16] feat(collections): Enable grouped inputs via partial compilation HACK: This PR enables grouped input features by leveraging partial compilation and disabling tuple and list evaluators in the case where grouped inputs are used. The intention is that this WAR is removed in the next release Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- cpp/src/compile_spec.cpp | 21 ++++++++++++++++++++- py/torch_tensorrt/ts/_compile_spec.py | 24 +++++++++++++++++++++++- py/torch_tensorrt/ts/_compiler.py | 3 +-- tests/cpp/test_collections.cpp | 12 ------------ tests/py/api/test_collections.py | 15 +++++---------- 5 files changed, 49 insertions(+), 26 deletions(-) diff --git a/cpp/src/compile_spec.cpp b/cpp/src/compile_spec.cpp index 04573499e6..8c7bb8b403 100644 --- a/cpp/src/compile_spec.cpp +++ b/cpp/src/compile_spec.cpp @@ -63,7 +63,7 @@ void to_internal_input_signature(torch::jit::IValue input_ivalue, torch::jit::IV } } -torchtrt::core::CompileSpec init_compile_spec(CompileSpec external) { +torchtrt::core::CompileSpec init_compile_spec(CompileSpec& external) { if (external.graph_inputs.inputs.size() > 0) { torchtrt::core::CompileSpec internal(to_vec_internal_inputs(external.graph_inputs.inputs)); return internal; @@ -72,6 +72,25 @@ torchtrt::core::CompileSpec init_compile_spec(CompileSpec external) { LOG_WARNING( "Input signature parsing is an experimental feature, behavior and APIs may change"); to_internal_input_signature(external.graph_inputs.input_signature, converted_input_signature); torchtrt::core::CompileSpec internal(converted_input_signature); + + TORCHTRT_CHECK(!external.require_full_compilation, \ + "Grouped inputs currently requires partial compilation to be enabled, \ + this restriction will be relaxed in a future release"); + + LOG_DEBUG("Grouped inputs currently requires additional settings to enable the feature"); + LOG_DEBUG("Adding the following ops to torch_executed_ops:" \ + << std::endl << " - aten::__getitem__" \ + << std::endl << " - prim::ListConstruct" \ + << std::endl << " - prim::ListUnpack" \ + << std::endl << " - prim::TupleIndex" \ + << std::endl << " - prim::TupleConstruct" \ + << std::endl << " - prim::TupleUnpack"); + external.torch_executed_ops.push_back("aten::__getitem__"); + 
external.torch_executed_ops.push_back("prim::ListConstruct"); + external.torch_executed_ops.push_back("prim::ListUnpack"); + external.torch_executed_ops.push_back("prim::TupleIndex"); + external.torch_executed_ops.push_back("prim::TupleConstruct"); + external.torch_executed_ops.push_back("prim::TupleUnpack"); return internal; } } diff --git a/py/torch_tensorrt/ts/_compile_spec.py b/py/torch_tensorrt/ts/_compile_spec.py index 628955c079..4cd69f794f 100644 --- a/py/torch_tensorrt/ts/_compile_spec.py +++ b/py/torch_tensorrt/ts/_compile_spec.py @@ -8,6 +8,7 @@ from torch_tensorrt.logging import Level, log from typing import Tuple, List, Dict import warnings +from copy import deepcopy def _internal_input_to_torch_class_input(i: _C.Input) -> torch.classes.tensorrt._Input: @@ -188,7 +189,9 @@ def _parse_input_signature(input_signature: Any): else: raise KeyError("Input signature contains an unsupported type {}".format(type(input_signature))) -def _parse_compile_spec(compile_spec: Dict[str, Any]) -> _ts_C.CompileSpec: +def _parse_compile_spec(compile_spec_: Dict[str, Any]) -> _ts_C.CompileSpec: + # TODO: Remove deep copy once collections does not need partial compilation + compile_spec = deepcopy(compile_spec_) info = _ts_C.CompileSpec() if len(compile_spec["inputs"]) > 0: @@ -204,6 +207,25 @@ def _parse_compile_spec(compile_spec: Dict[str, Any]) -> _ts_C.CompileSpec: signature = _parse_input_signature(compile_spec["input_signature"]) info.input_signature = _C.InputSignature(signature) # py_object + if not compile_spec["torch_fallback"]["enabled"]: + raise ValueError("Grouped inputs currently requires partial compilation to be enabled, this restriction will be relaxed in a future release") + + log(Level.Debug, "Grouped inputs currently requires additional settings to enable the feature") + log(Level.Debug, """Adding the following ops to torch_executed_ops: + - aten::__getitem__ + - prim::ListConstruct + - prim::ListUnpack + - prim::TupleIndex + - prim::TupleConstruct + - prim::TupleUnpack +""") + compile_spec["torch_fallback"]["forced_fallback_ops"].append("aten::__getitem__") + compile_spec["torch_fallback"]["forced_fallback_ops"].append("prim::ListConstruct") + compile_spec["torch_fallback"]["forced_fallback_ops"].append("prim::ListUnpack") + compile_spec["torch_fallback"]["forced_fallback_ops"].append("prim::TupleIndex") + compile_spec["torch_fallback"]["forced_fallback_ops"].append("prim::TupleConstruct") + compile_spec["torch_fallback"]["forced_fallback_ops"].append("prim::TupleUnpack") + else: raise KeyError( "Module input definitions are requried to compile module. Provide a list of torch_tensorrt.Input keyed to \"inputs\" in the compile spec" diff --git a/py/torch_tensorrt/ts/_compiler.py b/py/torch_tensorrt/ts/_compiler.py index 9119c25e86..cc5f4b24d1 100644 --- a/py/torch_tensorrt/ts/_compiler.py +++ b/py/torch_tensorrt/ts/_compiler.py @@ -103,8 +103,7 @@ def compile(module: torch.jit.ScriptModule, if require_full_compilation and (len(torch_executed_modules) > 0 or len(torch_executed_ops) > 0): raise ValueError( - "require_full_compilation is enabled however the list of modules and ops to run in torch is not empty. Found: torch_executed_ops: " - + torch_executed_ops + ", torch_executed_modules: " + torch_executed_modules) + f"require_full_compilation is enabled however the list of modules and ops to run in torch is not empty. 
Found: torch_executed_ops: {torch_executed_ops}, torch_executed_modules: {torch_executed_modules}") spec = { "inputs": inputs, diff --git a/tests/cpp/test_collections.cpp b/tests/cpp/test_collections.cpp index e48874e8bb..31495a47a7 100644 --- a/tests/cpp/test_collections.cpp +++ b/tests/cpp/test_collections.cpp @@ -34,7 +34,6 @@ TEST(CppAPITests, TestCollectionStandardTensorInput) { input_range.push_back({in0.sizes(), torch::kF16}); input_range.push_back({in0.sizes(), torch::kF16}); torch_tensorrt::ts::CompileSpec compile_settings(input_range); - compile_settings.require_full_compilation = true; compile_settings.min_block_size = 1; // // FP16 execution @@ -78,7 +77,6 @@ TEST(CppAPITests, TestCollectionTupleInput) { torch::jit::IValue complex_input_shape2(input_tuple2); auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); - compile_settings.require_full_compilation = true; compile_settings.min_block_size = 1; // // FP16 execution @@ -136,7 +134,6 @@ TEST(CppAPITests, TestCollectionListInput) { torch::jit::IValue complex_input_shape2(input_tuple2); auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); - compile_settings.require_full_compilation = true; compile_settings.min_block_size = 1; //compile_settings.torch_executed_ops.push_back("aten::__getitem__"); @@ -184,7 +181,6 @@ TEST(CppAPITests, TestCollectionTupleInputOutput) { // torch::jit::IValue complex_input_shape(list); auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); - compile_settings.require_full_compilation = true; compile_settings.min_block_size = 1; // compile_settings.torch_executed_ops.push_back("prim::TupleConstruct"); @@ -248,12 +244,8 @@ TEST(CppAPITests, TestCollectionListInputOutput) { torch::jit::IValue complex_input_shape2(input_tuple2); auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); - compile_settings.require_full_compilation = true; compile_settings.min_block_size = 1; - // Need to skip the conversion of __getitem__ and ListConstruct - //compile_settings.torch_executed_ops.push_back("aten::__getitem__"); - // // FP16 execution compile_settings.enabled_precisions = {torch::kHalf}; // // Compile module @@ -313,12 +305,8 @@ TEST(CppAPITests, TestCollectionComplexModel) { torch::jit::IValue complex_input_shape2(input_tuple2); auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); - compile_settings.require_full_compilation = true; compile_settings.min_block_size = 1; - // Need to skip the conversion of __getitem__ and ListConstruct - //compile_settings.torch_executed_ops.push_back("aten::__getitem__"); - // // FP16 execution compile_settings.enabled_precisions = {torch::kHalf}; // // Compile module diff --git a/tests/py/api/test_collections.py b/tests/py/api/test_collections.py index 603d44aebb..154145e681 100644 --- a/tests/py/api/test_collections.py +++ b/tests/py/api/test_collections.py @@ -48,8 +48,7 @@ def test_compile(self): "input_signature": ((torchtrt.Input(self.input.shape), torchtrt.Input(self.input.shape)),), "device": torchtrt.Device("gpu:0"), "enabled_precisions": {torch.float}, - "require_full_compilation": False, - "min_block_size": 3 + "min_block_size": 1 } trt_mod = torchtrt.ts.compile(self.model, **compile_spec) @@ -69,8 +68,7 @@ def test_compile(self): "input_signature": ([torchtrt.Input(self.input.shape), torchtrt.Input(self.input.shape)],), "device": torchtrt.Device("gpu:0"), "enabled_precisions": {torch.float}, - "require_full_compilation": False, - 
"min_block_size": 3 + "min_block_size": 1 } trt_mod = torchtrt.ts.compile(self.model, **compile_spec) @@ -89,8 +87,7 @@ def test_compile(self): "input_signature": ((torchtrt.Input(self.input.shape), torchtrt.Input(self.input.shape)),), "device": torchtrt.Device("gpu:0"), "enabled_precisions": {torch.float}, - "require_full_compilation": False, - "min_block_size": 3 + "min_block_size": 1 } trt_mod = torchtrt.ts.compile(self.model, **compile_spec) @@ -111,8 +108,7 @@ def test_compile(self): "input_signature": ([torchtrt.Input(self.input.shape), torchtrt.Input(self.input.shape)],), "device": torchtrt.Device("gpu:0"), "enabled_precisions": {torch.float}, - "require_full_compilation": False, - "min_block_size": 3 + "min_block_size": 1 } trt_mod = torchtrt.ts.compile(self.model, **compile_spec) @@ -134,8 +130,7 @@ def test_compile(self): "input_signature": ([torchtrt.Input(self.input.shape), torchtrt.Input(self.input.shape)],), "device": torchtrt.Device("gpu:0"), "enabled_precisions": {torch.float}, - "require_full_compilation": False, - "min_block_size": 3 + "min_block_size": 1 } trt_mod = torchtrt.ts.compile(self.model, **compile_spec) From bce8464b2ce90c1744e45587dd4be17d38fe8219 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Fri, 5 Aug 2022 22:03:06 -0700 Subject: [PATCH 14/16] feat(element_wise): Auto cast to higher precision for mismatched types Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/conversion/converters/converter_util.cpp | 7 ++++ .../converters/impl/element_wise.cpp | 2 + .../converters/test_element_wise.cpp | 37 ++++++++++++++++++- tests/util/run_graph_engine.cpp | 5 ++- 4 files changed, 47 insertions(+), 4 deletions(-) diff --git a/core/conversion/converters/converter_util.cpp b/core/conversion/converters/converter_util.cpp index 745261589e..94ac827ef4 100644 --- a/core/conversion/converters/converter_util.cpp +++ b/core/conversion/converters/converter_util.cpp @@ -65,6 +65,13 @@ nvinfer1::ILayer* add_elementwise( nvinfer1::ITensor* self, nvinfer1::ITensor* other, const std::string& name) { + if (self->getType() == nvinfer1::DataType::kFLOAT && other->getType() == nvinfer1::DataType::kINT32) { + LOG_DEBUG("Type mismatch, casting other to " << self->getType()); + other = castITensor(ctx, other, self->getType()); + } else if (self->getType() == nvinfer1::DataType::kINT32 && other->getType() == nvinfer1::DataType::kFLOAT) { + LOG_DEBUG("Type mismatch, casting self to " << other->getType()); + self = castITensor(ctx, self, other->getType()); + } // ensure self to have larger number of dimension bool swapSelfOther = false; if (self->getDimensions().nbDims < other->getDimensions().nbDims) { diff --git a/core/conversion/converters/impl/element_wise.cpp b/core/conversion/converters/impl/element_wise.cpp index 2f0c3a9d13..da9d58ef43 100644 --- a/core/conversion/converters/impl/element_wise.cpp +++ b/core/conversion/converters/impl/element_wise.cpp @@ -412,6 +412,7 @@ auto element_wise_registrations TORCHTRT_UNUSED = // Should implement self * other auto self = args[0].ITensorOrFreeze(ctx); auto other = args[1].ITensorOrFreeze(ctx); + auto mul = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kPROD, self, other, util::node_info(n)); TORCHTRT_CHECK(mul, "Unable to create mul layer from node: " << *n); @@ -426,6 +427,7 @@ auto element_wise_registrations TORCHTRT_UNUSED = // TODO: Remove with functionalization auto self = args[0].ITensorOrFreeze(ctx); auto other = scalar_to_tensor(ctx, args[1].unwrapToScalar()); + auto mul = add_elementwise(ctx, 
nvinfer1::ElementWiseOperation::kPROD, self, other, util::node_info(n)); TORCHTRT_CHECK(mul, "Unable to create mul layer from node: " << *n); diff --git a/tests/core/conversion/converters/test_element_wise.cpp b/tests/core/conversion/converters/test_element_wise.cpp index 994fb25811..939c9b7394 100644 --- a/tests/core/conversion/converters/test_element_wise.cpp +++ b/tests/core/conversion/converters/test_element_wise.cpp @@ -12,7 +12,9 @@ void pointwise_test_helper( std::vector shape1 = {5}, std::vector shape2 = {5}, bool negative_input = false, - bool int_tensors = false) { + bool int_tensors = false, + bool float_int_tensors = false, + bool int_float_tensors = false) { auto g = std::make_shared(); torch::jit::parseIR(graph_ir, g.get()); @@ -27,11 +29,24 @@ void pointwise_test_helper( if (!singleInput) { torch_inputs.push_back(at::randint(1, 5, shape2, {at::kCUDA})); } + + TORCHTRT_CHECK(!((int_tensors && (float_int_tensors || int_float_tensors)) || (float_int_tensors && int_float_tensors)), + "Invalid test configuration, only one of int_tensors, float_int_tensors, int_float_tensors can be true"); + if(int_tensors){ for(size_t i = 0UL; i < torch_inputs.size(); ++i){ torch_inputs[i] = torch_inputs[i].to(at::kInt); } + } else if(float_int_tensors) { + TORCHTRT_CHECK(!singleInput, "float_int_tensors tests require two inputs"); + torch_inputs[0] = torch_inputs[0].to(at::kFloat); + torch_inputs[1] = torch_inputs[1].to(at::kInt); + } else if (int_float_tensors) { + TORCHTRT_CHECK(!singleInput, "int_float_tensors tests require two inputs"); + torch_inputs[0] = torch_inputs[0].to(at::kInt); + torch_inputs[1] = torch_inputs[1].to(at::kFloat); } + auto params = torch_tensorrt::core::ir::get_static_params(g->inputs(), {}); auto jit_results = torch_tensorrt::tests::util::RunGraph(g, params, torch_inputs); @@ -62,6 +77,8 @@ TEST(Converters, ATenAddConvertsCorrectly) { pointwise_test_helper(graph, false, false, {4}, {3, 4}); pointwise_test_helper(graph, false, true, {3, 4, 3}, {4, 3}); pointwise_test_helper(graph, false, true, {4, 3}, {3, 4, 3}); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, true); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, false, true); } TEST(Converters, ATenAddWithAlphaConvertsCorrectly) { @@ -75,9 +92,11 @@ TEST(Converters, ATenAddWithAlphaConvertsCorrectly) { pointwise_test_helper(graph, false, false, {4}, {3, 4}); pointwise_test_helper(graph, false, true, {3, 4, 3}, {4, 3}); pointwise_test_helper(graph, false, true, {4, 3}, {3, 4, 3}); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, true); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, false, true); } -TEST(Converters, ATenAddImplicitWithAlphaConvertsCorrectly) { +TEST(Converters, ATenAddInplaceWithAlphaConvertsCorrectly) { const auto graph = R"IR( graph(%0 : Tensor, %1 : Tensor): %2 : float = prim::Constant[value=7.6]() @@ -109,6 +128,8 @@ TEST(Converters, ATenSubConvertsCorrectly) { pointwise_test_helper(graph, false, false, {4}, {3, 4}); pointwise_test_helper(graph, false, true, {3, 4, 3}, {4, 3}); pointwise_test_helper(graph, false, true, {4, 3}, {3, 4, 3}); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, true); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, false, true); } TEST(Converters, ATenMulConvertsCorrectly) { @@ -121,6 +142,8 @@ TEST(Converters, ATenMulConvertsCorrectly) { pointwise_test_helper(graph, false, false, {4}, {3, 4}); pointwise_test_helper(graph, false, true, {3, 4, 3}, {4, 3}); 
pointwise_test_helper(graph, false, true, {4, 3}, {3, 4, 3}); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, true); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, false, true); } TEST(Converters, ATenMulWithScalarConvertsCorrectly) { @@ -151,6 +174,8 @@ TEST(Converters, ATenDivConvertsCorrectly) { pointwise_test_helper(graph, false, false, {4}, {3, 4}); pointwise_test_helper(graph, false, true, {3, 4, 3}, {4, 3}); pointwise_test_helper(graph, false, true, {4, 3}, {3, 4, 3}); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, true); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, false, true); } TEST(Converters, ATenDivWithScalarConvertsCorrectly) { @@ -173,6 +198,8 @@ TEST(Converters, ATenDivRoundingFloorConvertsCorrectly) { pointwise_test_helper(graph, false, false, {4}, {3, 4}, true); pointwise_test_helper(graph, false, true, {3, 4, 3}, {4, 3}, true); pointwise_test_helper(graph, false, true, {4, 3}, {3, 4, 3}, true); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, true); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, false, true); } TEST(Converters, ATenDivRoundingTruncConvertsCorrectly) { @@ -186,6 +213,8 @@ TEST(Converters, ATenDivRoundingTruncConvertsCorrectly) { pointwise_test_helper(graph, false, false, {4}, {3, 4}, true); pointwise_test_helper(graph, false, true, {3, 4, 3}, {4, 3}, true); pointwise_test_helper(graph, false, true, {4, 3}, {3, 4, 3}, true); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, true); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, false, true); } TEST(Converters, ATenDivRoundingNoneConvertsCorrectly) { @@ -211,6 +240,8 @@ TEST(Converters, ATenPowTensorConvertsCorrectly) { pointwise_test_helper(graph, false, false, {4}, {3, 4}); pointwise_test_helper(graph, false, true, {3, 4, 3}, {4, 3}); pointwise_test_helper(graph, false, true, {4, 3}, {3, 4, 3}); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, true); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, false, true); } TEST(Converters, ATenPowScalarConvertsCorrectly) { @@ -251,6 +282,8 @@ TEST(Converters, ATenFloorDivideConvertsCorrectly) { pointwise_test_helper(graph, false, false, {4}, {3, 4}); pointwise_test_helper(graph, false, true, {3, 4, 3}, {4, 3}); pointwise_test_helper(graph, false, true, {4, 3}, {3, 4, 3}); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, true); + pointwise_test_helper(graph, false, true, {5}, {5}, false, false, false, true); } TEST(Converters, ATenFloorDivideWithScalarConvertsCorrectly) { diff --git a/tests/util/run_graph_engine.cpp b/tests/util/run_graph_engine.cpp index b0bb920768..1d77550d1d 100644 --- a/tests/util/run_graph_engine.cpp +++ b/tests/util/run_graph_engine.cpp @@ -30,6 +30,7 @@ std::vector toInputsDynamic(std::vector ten, bool d for (auto i : ten) { auto opt = core::util::toVec(i.sizes()); + auto dtype = core::util::ScalarTypeToTRTDataType(i.scalar_type()); if (dynamic_batch) { std::vector min_range(opt); @@ -38,7 +39,7 @@ std::vector toInputsDynamic(std::vector ten, bool d min_range[0] = ceil(opt[0] / 2.0); max_range[0] = 2 * opt[0]; - a.push_back(core::ir::Input(min_range, opt, max_range)); + a.push_back(core::ir::Input(min_range, opt, max_range, dtype)); } else { std::vector min_range(opt); std::vector max_range(opt); @@ -46,7 +47,7 @@ std::vector toInputsDynamic(std::vector ten, bool d min_range[1] = ceil(opt[1] / 2.0); max_range[1] = 2 * 
opt[1]; - a.push_back(core::ir::Input(min_range, opt, max_range)); + a.push_back(core::ir::Input(min_range, opt, max_range, dtype)); } } From 891440da148b3cee64e0828e3e3a7f6cfe2cb0db Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Sat, 6 Aug 2022 13:50:24 -0700 Subject: [PATCH 15/16] refactor: Disable input_signature in torchscript backend due to lack of generic interface Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- .../csrc/register_tensorrt_classes.cpp | 5 +++++ py/torch_tensorrt/ts/_compile_spec.py | 21 ++++--------------- 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp b/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp index 274b40d479..9db567ca86 100644 --- a/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp +++ b/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp @@ -26,6 +26,11 @@ void RegisterTRTCompileSpec() { static auto TORCHTRT_UNUSED TRTInputSignatureTSRegistration = torch::class_("tensorrt", "_InputSignature") .def(torch::init<>()) + .def("_set_signature_ivalue_torchbind", + [](const c10::intrusive_ptr& self, + torch::jit::IValue ival) { + self->signature_ivalue = ival; + }) .def("__str__", &torch_tensorrt::pyapi::InputSignature::to_str); ADD_FIELD_GET_SET_REGISTRATION( diff --git a/py/torch_tensorrt/ts/_compile_spec.py b/py/torch_tensorrt/ts/_compile_spec.py index 4cd69f794f..ac3465b1c4 100644 --- a/py/torch_tensorrt/ts/_compile_spec.py +++ b/py/torch_tensorrt/ts/_compile_spec.py @@ -327,20 +327,6 @@ def TensorRTCompileSpec(inputs=[], torch.randn((1, 3, 224, 244)) # Use an example tensor and let torch_tensorrt infer settings ] - input_signature Union(List, Tuple, torch_tensorrt.Input, torch.Tensor): A formatted collection of input specifications for the module. Input Sizes can be specified as torch sizes, tuples or lists. dtypes can be specified using - torch datatypes or torch_tensorrt datatypes and you can use either torch devices or the torch_tensorrt device type enum to select device type. 
**This API should be considered beta-level stable and may change in the future** :: - - input_signature=([ - torch_tensorrt.Input((1, 3, 224, 224)), # Static NCHW input shape for input #1 - torch_tensorrt.Input( - min_shape=(1, 224, 224, 3), - opt_shape=(1, 512, 512, 3), - max_shape=(1, 1024, 1024, 3), - dtype=torch.int32 - format=torch.channel_last - ), # Dynamic input shape for input #2 - ], torch.randn((1, 3, 224, 244))) # Use an example tensor and let torch_tensorrt infer settings for input #3 - device (Union(torch_tensorrt.Device, torch.device, dict)): Target device for TensorRT engines to run on :: device=torch_tensorrt.Device("dla:1", allow_gpu_fallback=True) @@ -362,7 +348,7 @@ def TensorRTCompileSpec(inputs=[], compile_spec = { "inputs": inputs, - "input_signature": input_signature, + #"input_signature": input_signature, "device": device, "disable_tf32": disable_tf32, # Force FP32 layers to use traditional as FP32 format vs the default behavior of rounding the inputs to 10-bit mantissas before multiplying, but accumulates the sum using 23-bit mantissas @@ -384,12 +370,13 @@ def TensorRTCompileSpec(inputs=[], backend_spec = torch.classes.tensorrt.CompileSpec() + if input_signature is not None: + raise ValueError("Input signature parsing is not currently supported in the TorchScript backend integration") + for i in parsed_spec.inputs: clone = _internal_input_to_torch_class_input(i) backend_spec._append_input(clone) - backend_spec._set_input_signature(parsed_spec.input_signature) - d = torch.classes.tensorrt._Device() d._set_device_type(int(parsed_spec.device.device_type)) d._set_gpu_id(parsed_spec.device.gpu_id) From 223dfd11ae3cfe0e58b229af23d28facf66e5da5 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Mon, 8 Aug 2022 16:52:01 -0700 Subject: [PATCH 16/16] chore: remove commented out code Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/compiler.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/core/compiler.cpp b/core/compiler.cpp index caee900879..7b58dbb2c1 100644 --- a/core/compiler.cpp +++ b/core/compiler.cpp @@ -394,7 +394,6 @@ std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std:: auto params = graph_and_parameters.second; auto static_params = ir::get_static_params(g->inputs(), params); // Infer the type of an input from the weights of the calculation - // auto first_use_types = ir::get_block_first_calc_dtypes_opt(g->block()); auto first_use_types = ir::get_block_first_calc_dtypes_opt_collection(g->block()); MapInputsAndDetermineDTypes(cfg, g, static_params, first_use_types);
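
A note on usage: the sketch below shows how the grouped-input path introduced in this series is exercised from Python. It assumes a hypothetical scripted module whose forward takes a tuple of two tensors (the real test models live in tests/modules/custom_models.py and are not reproduced here). Because of the workaround above, grouped inputs currently ride on partial compilation, so require_full_compilation is left at its default and min_block_size is lowered to 1, mirroring tests/py/api/test_collections.py.

    import torch
    import torch.nn as nn
    import torch_tensorrt as torchtrt
    from typing import Tuple

    class TupleInput(nn.Module):
        # hypothetical stand-in for the collection test models
        def forward(self, z: Tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
            x, y = z
            return x + y

    model = torch.jit.script(TupleInput()).eval().cuda()
    x = torch.randn(1, 3, 224, 224, device="cuda")

    compile_spec = {
        # the nested Input spec mirrors the structure of the module's arguments
        "input_signature": ((torchtrt.Input(x.shape), torchtrt.Input(x.shape)),),
        "device": torchtrt.Device("gpu:0"),
        "enabled_precisions": {torch.float},
        "min_block_size": 1,  # collection ops fall back to Torch, so keep TRT blocks small
    }

    trt_model = torchtrt.ts.compile(model, **compile_spec)
    print(torch.allclose(model((x, x)), trt_model((x, x)), atol=1e-5))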
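
On the element-wise change in converter_util.cpp: when one operand is FP32 and the other INT32, add_elementwise now casts the integer operand up to float before building the layer. This matches PyTorch's own promotion rule, which the new mixed-dtype cases in test_element_wise.cpp rely on:

    import torch

    a = torch.randn(5)                                # torch.float32
    b = torch.randint(1, 5, (5,), dtype=torch.int32)  # torch.int32
    print((a * b).dtype)                              # torch.float32 -- the result dtype the converter now matches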
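
The prim::TupleIndex evaluator reuses the normalizeIndex helper that moved into eval_util; its behavior is just Python-style negative indexing, sketched here for reference:

    def normalize_index(idx: int, size: int) -> int:
        # negative indices count from the end of the tuple
        return idx + size if idx < 0 else idx

    assert normalize_index(-1, 3) == 2
    assert normalize_index(1, 3) == 1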