From 377229485490f9ef99859aa657c61efff96703ab Mon Sep 17 00:00:00 2001
From: Cheng Hang
Date: Mon, 13 Jun 2022 17:48:59 +0800
Subject: [PATCH 1/2] Support shape analysis for dynamic fallback

Signed-off-by: Cheng Hang
---
 core/compiler.cpp                    | 10 +--
 core/partitioning/SegmentedBlock.h   | 35 +++++++++++
 core/partitioning/partitioning.cpp   |  4 +-
 core/partitioning/partitioning.h     |  2 +-
 core/partitioning/shape_analysis.cpp | 91 +++++++++++++++++++++++++---
 core/partitioning/shape_analysis.h   |  4 +-
 6 files changed, 128 insertions(+), 18 deletions(-)

diff --git a/core/compiler.cpp b/core/compiler.cpp
index fc1cc66aee..8548ef636c 100644
--- a/core/compiler.cpp
+++ b/core/compiler.cpp
@@ -222,7 +222,7 @@ void AddIfBlockToGraph(
 GraphAndMapping ConstructFallbackGraph(
     torch::jit::script::Module& new_mod,
     torch::jit::Block* block,
-    std::unordered_map<const torch::jit::Value*, torch::jit::IValue> example_tensor_map,
+    std::vector<std::unordered_map<const torch::jit::Value*, torch::jit::IValue>> example_tensor_maps,
     CompileSpec cfg,
     ir::StaticParams static_params,
     std::unordered_map<torch::jit::Node*, int>& fallback_nodes) {
@@ -231,7 +231,7 @@ GraphAndMapping ConstructFallbackGraph(
 
   auto new_g = std::make_shared<torch::jit::Graph>();
 
-  auto segmented_blocks = partitioning::Partition(block, example_tensor_map, partition_info, fallback_nodes);
+  auto segmented_blocks = partitioning::Partition(block, example_tensor_maps, partition_info, fallback_nodes);
 
   // the mapping from lowering graph => fallback global graph
   std::unordered_map<torch::jit::Value*, torch::jit::Value*> old_to_new_g;
@@ -272,7 +272,7 @@ GraphAndMapping ConstructFallbackGraph(
       std::vector<GraphAndMapping> graph_and_mappings;
       for (auto cur_block : if_node->blocks()) {
         graph_and_mappings.push_back(
-            ConstructFallbackGraph(new_mod, cur_block, example_tensor_map, cfg, static_params, fallback_nodes));
+            ConstructFallbackGraph(new_mod, cur_block, example_tensor_maps, cfg, static_params, fallback_nodes));
       }
       AddIfBlockToGraph(new_g, if_node, graph_and_mappings, old_to_new_g);
 
@@ -408,10 +408,10 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg)
   if (cfg.partition_info.enabled &&
       !(cfg.lower_info.forced_fallback_modules.size() == 0 &&
         cfg.partition_info.forced_fallback_operators.size() == 0 && isBlockConvertible)) {
-    auto input_ivalues_map = partitioning::generateRandomInputs(cfg.convert_info.inputs, first_use_types);
+    auto input_ivalues_maps = partitioning::generateRandomInputs(cfg.convert_info.inputs, first_use_types);
     std::unordered_map<torch::jit::Node*, int> fallback_nodes;
     auto graph_and_mapping =
-        ConstructFallbackGraph(new_mod, g->block(), input_ivalues_map, cfg, static_params, fallback_nodes);
+        ConstructFallbackGraph(new_mod, g->block(), input_ivalues_maps, cfg, static_params, fallback_nodes);
     new_g = graph_and_mapping.first;
     // renaming the input name of graph after fallback to ensure pytorch deserialize it correctly
     for (size_t i = 0; i < new_g->inputs().size(); ++i) {
diff --git a/core/partitioning/SegmentedBlock.h b/core/partitioning/SegmentedBlock.h
index f7d8a0b612..b692c5802a 100644
--- a/core/partitioning/SegmentedBlock.h
+++ b/core/partitioning/SegmentedBlock.h
@@ -6,6 +6,7 @@
 #include "NvInfer.h"
 #include "core/ir/ir.h"
 #include "core/partitioning/PartitionInfo.h"
+#include "core/util/trt_util.h"
 #include "torch/csrc/jit/ir/ir.h"
 
 namespace torch_tensorrt {
@@ -76,6 +77,40 @@ struct SegmentedBlock {
   void register_inshapes(std::vector<ir::Input>& in_shapes) {
     in_shapes_ = in_shapes;
   }
+
+  void register_opt_shapes(std::vector<ir::Input>& opt_shapes) {
+    assert(in_shapes_.size() == opt_shapes.size());
+    for (size_t i = 0; i < opt_shapes.size(); i++) {
+      in_shapes_[i].opt = opt_shapes[i].opt;
+    }
+  }
+
+  void register_max_shapes(std::vector<ir::Input>& max_shapes) {
+    assert(in_shapes_.size() == max_shapes.size());
+    for (size_t i = 0; i < max_shapes.size(); i++) {
+      in_shapes_[i].max = max_shapes[i].max;
+    }
+  }
+
+  void construct_dynamic_shape() {
+    for (size_t i = 0; i < in_shapes_.size(); i++) {
+      std::vector<int64_t> dyn_shape;
+      for (int j = 0; j < in_shapes_[i].input_shape.nbDims; j++) {
+        std::set<int64_t> dim;
+        dim.insert(in_shapes_[i].min.d[j]);
+        dim.insert(in_shapes_[i].opt.d[j]);
+        dim.insert(in_shapes_[i].max.d[j]);
+        if (dim.size() != 1) {
+          dyn_shape.push_back(-1);
+          in_shapes_[i].input_is_dynamic = true;
+        } else {
+          dyn_shape.push_back(in_shapes_[i].opt.d[j]);
+        }
+      }
+      in_shapes_[i].input_shape = util::toDims(dyn_shape);
+    }
+  }
+
   const std::vector<ir::Input>& in_shapes() const {
     return in_shapes_;
   }
diff --git a/core/partitioning/partitioning.cpp b/core/partitioning/partitioning.cpp
index 8fcd29f7a8..c6dc47c3e6 100644
--- a/core/partitioning/partitioning.cpp
+++ b/core/partitioning/partitioning.cpp
@@ -435,7 +435,7 @@ PartitionedGraph segment_graph(
 
 PartitionedGraph Partition(
     torch::jit::Block* block,
-    std::unordered_map<const torch::jit::Value*, torch::jit::IValue>& example_tensor_map,
+    std::vector<std::unordered_map<const torch::jit::Value*, torch::jit::IValue>>& example_tensor_maps,
     const PartitionInfo& partition_info,
     std::unordered_map<torch::jit::Node*, int>& global_fallback_nodes) {
   LOG_DEBUG(partition_info);
@@ -453,7 +453,7 @@ PartitionedGraph Partition(
   registerSegmentsOutputs(segmented_blocks, block);
 
   // run shape analysis on each segmented block
-  runShapeAnalysis(segmented_blocks, example_tensor_map, partition_info);
+  runShapeAnalysis(segmented_blocks, example_tensor_maps, partition_info);
 
   for (uint64_t i = 0; i < segmented_blocks.size(); i++) {
     segmented_blocks[i].update_id(i);
diff --git a/core/partitioning/partitioning.h b/core/partitioning/partitioning.h
index df64f582a4..89f13465ed 100644
--- a/core/partitioning/partitioning.h
+++ b/core/partitioning/partitioning.h
@@ -37,7 +37,7 @@ PartitionedGraph segment_graph(
 
 PartitionedGraph Partition(
     torch::jit::Block* block,
-    std::unordered_map<const torch::jit::Value*, torch::jit::IValue>& example_tensor_map,
+    std::vector<std::unordered_map<const torch::jit::Value*, torch::jit::IValue>>& example_tensor_map,
     const PartitionInfo& partition_info,
     std::unordered_map<torch::jit::Node*, int>& fallback_nodes);
 
diff --git a/core/partitioning/shape_analysis.cpp b/core/partitioning/shape_analysis.cpp
index d24b1f980a..e96e4042c7 100644
--- a/core/partitioning/shape_analysis.cpp
+++ b/core/partitioning/shape_analysis.cpp
@@ -8,9 +8,43 @@ namespace torch_tensorrt {
 namespace core {
 namespace partitioning {
 
-std::unordered_map<const torch::jit::Value*, torch::jit::IValue> generateRandomInputs(
+std::vector<std::unordered_map<const torch::jit::Value*, torch::jit::IValue>> generateRandomInputs(
     std::unordered_map<const torch::jit::Value*, ir::Input>& inputs,
     std::unordered_map<const torch::jit::Value*, c10::optional<at::ScalarType>>& types) {
+  std::vector<std::unordered_map<const torch::jit::Value*, torch::jit::IValue>> ivalue_maps;
+
+  bool is_dynamic = false;
+  for (auto& input : inputs) {
+    if (input.second.input_is_dynamic)
+      is_dynamic = true;
+  }
+  if (is_dynamic) {
+    LOG_WARNING("Dynamic fallback encountered");
+    std::unordered_map<const torch::jit::Value*, torch::jit::IValue> ivalue_map_min, ivalue_map_opt, ivalue_map_max;
+    for (auto& input : inputs) {
+      auto cur_min = input.second.min;
+      auto cur_opt = input.second.opt;
+      auto cur_max = input.second.max;
+      std::vector<int64_t> min_shape, opt_shape, max_shape;
+      min_shape.insert(min_shape.begin(), std::begin(cur_min.d), std::begin(cur_min.d) + cur_min.nbDims);
+      opt_shape.insert(opt_shape.begin(), std::begin(cur_opt.d), std::begin(cur_opt.d) + cur_opt.nbDims);
+      max_shape.insert(max_shape.begin(), std::begin(cur_max.d), std::begin(cur_max.d) + cur_max.nbDims);
+      auto type_opt = types[input.first];
+      auto type = at::kFloat;
+      if (type_opt) {
+        type = type_opt.value();
+      } else {
+        LOG_WARNING("Input type for doing shape analysis could not be determined, defaulting to F32");
+      }
+      auto in_min = at::randint(5, min_shape, {at::kCUDA}).to(type);
+      auto in_opt = at::randint(5, opt_shape, {at::kCUDA}).to(type);
+      auto in_max = at::randint(5, max_shape, {at::kCUDA}).to(type);
+      ivalue_map_min[input.first] = in_min.clone();
+      ivalue_map_opt[input.first] = in_opt.clone();
+      ivalue_map_max[input.first] = in_max.clone();
+    }
+    return {ivalue_map_min, ivalue_map_opt, ivalue_map_max};
+  }
   // generate random inputs for running pytorch segments
   std::unordered_map<const torch::jit::Value*, torch::jit::IValue> ivalue_map;
 
@@ -30,12 +64,13 @@ std::unordered_map<const torch::jit::Value*, torch::jit::IValue> generateRandomI
     ivalue_map[input.first] = in.clone();
     in_i++;
   }
-  return ivalue_map;
+  return {ivalue_map};
 }
 
 void getSegmentsOutputByRunning(
     SegmentedBlock& seg_block,
     std::unordered_map<const torch::jit::Value*, torch::jit::IValue>& ivalues_maps,
+    int register_iteration,
     const PartitionInfo& partition_info) {
   // create a module to run the graph
   auto g = seg_block.g();
@@ -63,6 +98,12 @@ void getSegmentsOutputByRunning(
 
   std::vector<torch::jit::IValue> jit_inputs_ivalues;
 
+  for (auto& input : seg_block.raw_inputs()) {
+    LOG_DEBUG(
+        "Register input ivalues_maps for torch::jit::Value* " << input->debugName() << ", produced from "
+                                                              << util::node_info(input->node()));
+  }
+
   // set inputs ivalues, now supports Tensor/Int to pass argumentes between different segments
   for (auto& input : seg_block.raw_inputs()) {
     TORCHTRT_CHECK(
@@ -111,6 +152,9 @@ void getSegmentsOutputByRunning(
   size_t idx = 0;
   for (auto& output : seg_block.raw_outputs()) {
     ivalues_maps[output] = jit_results[idx++];
+    LOG_DEBUG(
+        "Register output ivalues_maps for torch::jit::Value* " << output->debugName() << ", produced from "
+                                                               << util::node_info(output->node()));
   }
 
   // set input shape for each segmented block so we wil use it in conversion process
@@ -146,19 +190,50 @@ void getSegmentsOutputByRunning(
       input_types.push_back(cur_ivalue.toTensor().scalar_type());
     }
   }
-
-  seg_block.register_inshapes(input_shapes);
+  LOG_DEBUG("Begin register shape");
+  if (register_iteration == 0)
+    seg_block.register_inshapes(input_shapes);
+  else if (register_iteration == 1)
+    seg_block.register_opt_shapes(input_shapes);
+  else if (register_iteration == 2)
+    seg_block.register_max_shapes(input_shapes);
   seg_block.register_intypes(input_types);
+  LOG_DEBUG("Done");
 }
 
 void runShapeAnalysis(
     std::vector<SegmentedBlock>& segmented_blocks,
-    std::unordered_map<const torch::jit::Value*, torch::jit::IValue>& example_tensor_map,
+    std::vector<std::unordered_map<const torch::jit::Value*, torch::jit::IValue>>& example_tensor_maps,
     const PartitionInfo& partition_info) {
   // register every segment's input shape, and it's running output IValues
-  for (auto& seg_block : segmented_blocks) {
-    torch::jit::ConstantPooling(seg_block.g());
-    getSegmentsOutputByRunning(seg_block, example_tensor_map, partition_info);
+  if (example_tensor_maps.size() == 1) {
+    int i = 0;
+    for (auto& seg_block : segmented_blocks) {
+      torch::jit::ConstantPooling(seg_block.g());
+      LOG_DEBUG("Running the graph @" << i);
+      getSegmentsOutputByRunning(seg_block, example_tensor_maps[0], 0, partition_info);
+      i++;
+    }
+  } else if (example_tensor_maps.size() == 3) {
+    int i = 0;
+    for (auto& seg_block : segmented_blocks) {
+      torch::jit::ConstantPooling(seg_block.g());
+      LOG_DEBUG("Running min graph @" << i);
+      getSegmentsOutputByRunning(seg_block, example_tensor_maps[0], 0, partition_info);
+      i++;
+    }
+    for (auto& seg_block : segmented_blocks) {
+      torch::jit::ConstantPooling(seg_block.g());
+      LOG_DEBUG("Running opt graph @" << i);
+      getSegmentsOutputByRunning(seg_block, example_tensor_maps[1], 1, partition_info);
+    }
+    for (auto& seg_block : segmented_blocks) {
+      torch::jit::ConstantPooling(seg_block.g());
+      LOG_DEBUG("Running max graph @" << i);
+      getSegmentsOutputByRunning(seg_block, example_tensor_maps[2], 2, partition_info);
+    }
+    for (auto& seg_block : segmented_blocks)
+      seg_block.construct_dynamic_shape();
   }
   return;
 }
diff --git a/core/partitioning/shape_analysis.h b/core/partitioning/shape_analysis.h
index 0626490222..75bac6c874 100644
--- a/core/partitioning/shape_analysis.h
+++ b/core/partitioning/shape_analysis.h
@@ -6,13 +6,13 @@ namespace torch_tensorrt {
 namespace core {
 namespace partitioning {
 
-std::unordered_map<const torch::jit::Value*, torch::jit::IValue> generateRandomInputs(
+std::vector<std::unordered_map<const torch::jit::Value*, torch::jit::IValue>> generateRandomInputs(
     std::unordered_map<const torch::jit::Value*, ir::Input>& input_ranges,
     std::unordered_map<const torch::jit::Value*, c10::optional<at::ScalarType>>& input_types);
 
 void runShapeAnalysis(
     std::vector<SegmentedBlock>& segmented_blocks,
-    std::unordered_map<const torch::jit::Value*, torch::jit::IValue>& ivalues_maps,
+    std::vector<std::unordered_map<const torch::jit::Value*, torch::jit::IValue>>& ivalues_maps,
     const PartitionInfo& partition_info);
 
 } // namespace partitioning

From 78188707a537f101d79d9e6286df8dc01888a905 Mon Sep 17 00:00:00 2001
From: Cheng Hang
Date: Wed, 3 Aug 2022 14:03:03 +0800
Subject: [PATCH 2/2] keep 0-dim tensors

---
 core/lowering/lowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/lowering/lowering.cpp b/core/lowering/lowering.cpp
index d3296c347c..5175fbf5af 100644
--- a/core/lowering/lowering.cpp
+++ b/core/lowering/lowering.cpp
@@ -65,7 +65,7 @@ void LowerGraph(std::shared_ptr<torch::jit::Graph>& g, LowerInfo lower_info) {
   passes::RemoveNOPs(g);
   passes::AliasOperators(g);
   passes::SiluToSigmoidMultipication(g);
-  passes::RemoveSingleUse0DTensors(g);
+  // passes::RemoveSingleUse0DTensors(g);
   passes::RemoveUnnecessaryCasts(g);
   LOG_GRAPH(*g);
 }