
Commit f91b24e

toothache authored and apsonawane committed
[TensorRT] Fix DDS output bug during engine update (#26272)
### Description

Fix a bug in the TRT Execution Provider where a DDS (data-dependent shape) output tensor was not bound after an engine update.

### Motivation and Context

The `dds_output_allocator_map` is not cleared on engine update, so an output still listed there is mis-recognized as an already-known DDS output, and the output allocation is never bound to the new execution context. Script to reproduce the issue:

```python
# create an onnx model with:
#   inputs: data -> NonZero(data) -> GatherND -> output
# then run the model with onnxruntime
def create_model():
    import onnx
    from onnx import helper, TensorProto

    input = helper.make_tensor_value_info("data", TensorProto.FLOAT, ["d1", "d2"])
    output = helper.make_tensor_value_info("output", TensorProto.FLOAT, ["nzr"])

    nonzeros_node = helper.make_node("NonZero", ["data"], ["nonzeros"], "nonzeros_node")
    transpose_node = helper.make_node(
        "Transpose", ["nonzeros"], ["nonzeros_t"], "transpose_node"
    )
    gathernd_node = helper.make_node(
        "GatherND", ["data", "nonzeros_t"], ["output"], "gathernd_node"
    )

    value_info = [
        helper.make_tensor_value_info("nonzeros", TensorProto.INT64, [2, "nzr"]),
        helper.make_tensor_value_info("nonzeros_t", TensorProto.INT64, ["nzr", 2]),
    ]

    graph = helper.make_graph(
        [nonzeros_node, transpose_node, gathernd_node],
        "test_graph",
        [input],
        [output],
        value_info=value_info,
    )
    model = helper.make_model(graph)
    onnx.save(model, "model_dds.onnx")


def run_model():
    import numpy as np
    import onnxruntime as ort

    sess = ort.InferenceSession(
        "model_dds.onnx",
        providers=["TensorrtExecutionProvider", "CUDAExecutionProvider", "CPUExecutionProvider"],
    )

    print("Running with data shape (3,4)")
    data = np.random.randn(3, 4).astype(np.float32)
    sess.run(None, {"data": data})

    print("Running with data shape (5,6)")
    data = np.random.randn(5, 6).astype(np.float32)
    sess.run(None, {"data": data})


create_model()
run_model()
```

Before the change:

> IExecutionContext::enqueueV3: Error Code 3: API Usage Error (Parameter check failed, condition: mContext.profileObliviousBindings.at(profileObliviousIndex) || getPtrOrNull(mOutputAllocators, profileObliviousIndex). Neither address or allocator is set for output tensor scores. Call setOutputTensorAddress, setTensorAddress or setOutputAllocator before enqueue/execute.)
> ...
> Status Message: TensorRT EP execution context enqueue failed.
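For background on why an unbound DDS output is fatal: TensorRT requires every output of an execution context to be bound before `enqueueV3`, either to a fixed device address via `setTensorAddress` when the shape is known up front, or to an `nvinfer1::IOutputAllocator` via `setOutputAllocator` when the shape is data-dependent, as with `NonZero`. The sketch below illustrates that contract; it assumes the TensorRT 8.5+ API, and the `DDSAllocator` class with its CUDA buffer handling is illustrative, not the EP's actual allocator:

```cpp
#include <NvInfer.h>
#include <cuda_runtime_api.h>

// Illustrative DDS output allocator. During enqueueV3(), TensorRT calls
// reallocateOutput() once the real output byte size is known, then
// notifyShape() with the final dimensions.
class DDSAllocator : public nvinfer1::IOutputAllocator {
 public:
  ~DDSAllocator() override { cudaFree(buffer_); }

  void* reallocateOutput(char const* /*tensorName*/, void* /*currentMemory*/,
                         uint64_t size, uint64_t /*alignment*/) noexcept override {
    cudaFree(buffer_);  // release any previous allocation
    if (cudaMalloc(&buffer_, size) != cudaSuccess) {
      buffer_ = nullptr;  // returning nullptr tells TensorRT the allocation failed
    }
    return buffer_;
  }

  void notifyShape(char const* /*tensorName*/, nvinfer1::Dims const& dims) noexcept override {
    shape_ = dims;  // final runtime shape of the DDS output
  }

  void* buffer_{nullptr};
  nvinfer1::Dims shape_{};
};

// Every output must be bound one way or the other before enqueueV3():
//   context->setTensorAddress("output", device_ptr);    // shape known up front
//   context->setOutputAllocator("output", &allocator);  // data-dependent shape
// If neither happens on the current context, enqueueV3() fails with exactly the
// "Neither address or allocator is set for output tensor ..." error quoted above.
```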
1 parent 584e35b commit f91b24e

File tree

4 files changed: +104 −0 lines changed


onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc

Lines changed: 4 additions & 0 deletions
```diff
@@ -3976,6 +3976,10 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
       // Destroy the IExecutionContext objects before destroying an engine object, otherwise it will lead to undefined behavior.
       trt_state->context->reset();
       trt_state->engine->reset();
+
+      // Clear dds output allocator map since the engine and context will be recreated.
+      dds_output_allocator_map.clear();
+
       auto trt_config = std::unique_ptr<nvinfer1::IBuilderConfig>(trt_builder->createBuilderConfig());
       if (max_workspace_size_ > 0) {
         trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_);
```
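Why a one-line `clear()` is sufficient: when binding outputs, the EP skips allocator registration for any output name already present in `dds_output_allocator_map`, on the assumption that the allocator is already attached to the current context. After an engine update the `IExecutionContext` is recreated with no allocators attached, so a stale map entry leaves the output unbound. Below is a simplified paraphrase of that guard, not the verbatim ORT source; it reuses the illustrative `DDSAllocator` from the sketch above:

```cpp
#include <memory>
#include <string>
#include <unordered_map>

#include <NvInfer.h>

// DDSAllocator: the illustrative IOutputAllocator subclass sketched earlier.
using DDSOutputAllocatorMap =
    std::unordered_map<std::string, std::unique_ptr<DDSAllocator>>;

// Paraphrased binding step for one DDS output (not the verbatim EP code).
void BindDDSOutput(nvinfer1::IExecutionContext& context,
                   DDSOutputAllocatorMap& dds_output_allocator_map,
                   const std::string& output_name) {
  if (dds_output_allocator_map.find(output_name) == dds_output_allocator_map.end()) {
    auto allocator = std::make_unique<DDSAllocator>();
    context.setOutputAllocator(output_name.c_str(), allocator.get());
    dds_output_allocator_map.emplace(output_name, std::move(allocator));
  }
  // Before this fix: after trt_state->engine/context were reset and rebuilt,
  // output_name was still in the map, the branch above was skipped, and the
  // brand-new context reached enqueueV3() with no allocator bound for the
  // output -- the API Usage Error quoted in the description. Clearing the map
  // at engine-update time forces re-registration on the new context.
}
```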

onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc

Lines changed: 46 additions & 0 deletions
```diff
@@ -713,6 +713,52 @@ TEST(TensorrtExecutionProviderTest, TRTPluginsCustomOpTest) {
   ASSERT_TRUE(status.IsOK());
 }
 
+TEST(TensorrtExecutionProviderTest, DDSOutputTest) {
+  PathString model_name = ORT_TSTR("testdata/ort_github_issue_26272_dds.onnx");
+  SessionOptions so;
+  so.session_logid = "TensorrtExecutionProviderRunWithDDSOutput";
+  RunOptions run_options;
+  run_options.run_tag = so.session_logid;
+  InferenceSession session_object{so, GetEnvironment()};
+  auto cuda_provider = DefaultCudaExecutionProvider();
+  auto cuda_allocator = cuda_provider->CreatePreferredAllocators()[1];
+  std::vector<int64_t> dims_op_x = {3, 4};
+  std::vector<float> values_op_x(12, 0.f);  // 12 = 3 * 4
+  OrtValue ml_value_x;
+  CreateMLValue<float>(cuda_allocator, dims_op_x, values_op_x, &ml_value_x);
+
+  NameMLValMap feeds;
+  feeds.insert(std::make_pair("data", ml_value_x));
+
+  // prepare outputs
+  std::vector<std::string> output_names;
+  output_names.push_back("output");
+  std::vector<OrtValue> fetches;
+
+  OrtTensorRTProviderOptionsV2 params;
+  std::unique_ptr<IExecutionProvider> execution_provider = TensorrtExecutionProviderWithOptions(&params);
+  EXPECT_TRUE(session_object.RegisterExecutionProvider(std::move(execution_provider)).IsOK());
+  auto status = session_object.Load(model_name);
+  ASSERT_TRUE(status.IsOK());
+  status = session_object.Initialize();
+  ASSERT_TRUE(status.IsOK());
+
+  // First pass run
+  status = session_object.Run(run_options, feeds, output_names, &fetches);
+  ASSERT_TRUE(status.IsOK());
+
+  // Second pass run with new shape
+  dims_op_x = {6, 4};
+  values_op_x.resize(24, 0.f);  // 24 = 6 * 4
+  CreateMLValue<float>(cuda_allocator, dims_op_x, values_op_x, &ml_value_x);
+  feeds.clear();
+
+  feeds.insert(std::make_pair("data", ml_value_x));
+
+  status = session_object.Run(run_options, feeds, output_names, &fetches);
+  ASSERT_TRUE(status.IsOK());
+}
+
 TEST_P(TensorrtExecutionProviderCacheTest, Run) {
   // GetParam() returns the parameter of following format:
   // ##cache type##_##input shape type##
```
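A note on the shapes the test uses: the first `Run` with `{3, 4}` builds an engine covering that input shape; the second `Run` with `{6, 4}` falls outside that range, which (with default profile settings) sends the EP down the engine-update path patched above (`context->reset()` / `engine->reset()`). Without the new `dds_output_allocator_map.clear()`, that second run is exactly the case that hit the unbound-output error.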
(New file: model-generation script for the test model.)

Lines changed: 26 additions & 0 deletions

```python
import onnx
from onnx import TensorProto, helper

# Create a simple ONNX model with DDS output
input = helper.make_tensor_value_info("data", TensorProto.FLOAT, ["d1", "d2"])
output = helper.make_tensor_value_info("output", TensorProto.FLOAT, ["nzr"])

nonzeros_node = helper.make_node("NonZero", ["data"], ["nonzeros"], "nonzeros_node")
transpose_node = helper.make_node("Transpose", ["nonzeros"], ["nonzeros_t"], "transpose_node")
gathernd_node = helper.make_node("GatherND", ["data", "nonzeros_t"], ["output"], "gathernd_node")

value_info = [
    helper.make_tensor_value_info("nonzeros", TensorProto.INT64, [2, "nzr"]),
    helper.make_tensor_value_info("nonzeros_t", TensorProto.INT64, ["nzr", 2]),
]

graph = helper.make_graph(
    [nonzeros_node, transpose_node, gathernd_node],
    "test_graph",
    [input],
    [output],
    value_info=value_info,
)

model = helper.make_model(graph)
onnx.save(model, "ort_github_issue_26272_dds.onnx")
```
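Running this script regenerates `ort_github_issue_26272_dds.onnx`, the checked-in model below that `DDSOutputTest` loads from `testdata/`. The number of indices `NonZero` emits depends on the input values, which is what makes `output` a DDS tensor.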

onnxruntime/test/testdata/ort_github_issue_26272_dds.onnx

Lines changed: 28 additions & 0 deletions
(Binary file: the serialized ONNX model produced by the generation script above; contents not human-readable.)
