Optionally disable debug handle validation

kimishpatel · facebook-github-bot · commit 3c7caeaceb24 · 2025-09-10T15:07:20.000-07:00
Summary:
Often, when an aten graph has symbolic shape nodes and built-in ops like gt/lt etc.,
the 'from_node' information is lost from node.meta during re-export of such a graph. As a result we lose the connection
between edge IR nodes and aten nodes for such ops. By default we validate that every edge IR node has a corresponding
node in the aten IR, and when such validation fails the numeric debugger falls back to the edge IR as the reference graph. This
flag allows one to override that behavior and make a best-effort comparison.

Reviewed By: Gasoonjia

Differential Revision: D81784685
diff --git a/devtools/inspector/_inspector.py b/devtools/inspector/_inspector.py
@@ -1169,6 +1169,7 @@ def _consume_etrecord(self) -> None:
 
     def _get_aot_intermediate_outputs_and_op_names(
         self,
+        disable_debug_handle_valdiation: bool = False,
     ) -> Tuple[Dict[DebugHandle, Any], Dict[DebugHandle, List[str]]]:
         """
         Capture intermediate outputs only if _representative_inputs are provided
@@ -1184,6 +1185,7 @@ def _get_aot_intermediate_outputs_and_op_names(
             self._etrecord.exported_program,
             self._etrecord.export_graph_id,
             self._etrecord.edge_dialect_program,
+            disable_debug_handle_valdiation,
         ):
             export_program = self._etrecord.exported_program
         else:
@@ -1404,7 +1406,7 @@ def get_exported_program(
             else self._etrecord.graph_map.get(graph)
         )
 
-    def calculate_numeric_gap(self, distance: str = "MSE"):
+    def calculate_numeric_gap(self, distance: str = "MSE", disable_debug_handle_valdiation: bool = False):
         """
         Compares logged intermediate outputs from the exported graph (in ETRecord)
         with runtime outputs (in ETDump) using a user-specific numerical comparator.
@@ -1416,12 +1418,17 @@ def calculate_numeric_gap(self, distance: str = "MSE"):
 
         Args:
             distance: the metrics the inspector will use for gap calculation. Should be one of "MSE", "L1" and "SNR".
+            disable_debug_handle_valdiation: Often, when an aten graph has symbolic shape nodes and built-in ops like gt/lt etc.,
+            the 'from_node' information is lost from node.meta during re-export of such a graph. As a result we lose the connection
+            between edge IR nodes and aten nodes for such ops. By default we validate that every edge IR node has a corresponding
+            node in the aten IR, and when such validation fails the numeric debugger falls back to the edge IR as the reference graph.
+            Setting this flag to True overrides that behavior and makes a best-effort comparison.
 
         Returns:
             pd.DataFrame: A DataFrame listing corresponding operator intermediate outputs from both stages and their computed numerical gaps.
         """
         aot_intermediate_outputs, aot_debug_handle_to_op_names = (
-            self._get_aot_intermediate_outputs_and_op_names()
+            self._get_aot_intermediate_outputs_and_op_names(disable_debug_handle_valdiation)
         )
         if len(aot_intermediate_outputs) == 0 or len(aot_debug_handle_to_op_names) == 0:
             raise ValueError(
@@ -1451,6 +1458,12 @@ def calculate_numeric_gap(self, distance: str = "MSE"):
         ) in mapping.items():
             if aot_intermediate_output is None or runtime_intermediate_output is None:
                 continue
+            # If the aot outputs length is > 1 then the comparison fails, since we don't really have
+            # any instances where the runtime intermediate output is a tuple or list.
+            # This does not happen when the edge dialect program is the reference for comparison,
+            # but it happens in the aten graph, where ops like unbind remain undecomposed.
+            if isinstance(aot_intermediate_output, Sequence) and len(aot_intermediate_output) > 1:
+                continue
             rows.append(
                 {
                     "aot_ops": find_op_names(
diff --git a/devtools/inspector/_inspector_utils.py b/devtools/inspector/_inspector_utils.py
@@ -657,13 +657,21 @@ def _combine_aot_overlapped_intermediate_outputs(
     # Combine all AOT debug_handles into a list
     aot_combined_debug_handle = [t[0] for t in aot_map.keys()]
 
-    if set(aot_combined_debug_handle) != set(runtime_debug_handle):
-        # AOT combined debug_handle and runtime debug_handle do not match.
+    # Reason we don't check for an exact match:
+    # in some experiments where we want to rewrite the aten graph that was
+    # lowered, so as to use custom ops like int4_matmul, we lose some nodes
+    # on the graph and thus lose some debug handles. As a result we don't find
+    # an exact match within connected components.
+    if not set(aot_combined_debug_handle).issubset(set(runtime_debug_handle)):
+        # AOT combined debug_handle is not a subset of runtime debug_handle.
         return (-1,), None
 
     # Pick the last intermediate output
     last_int = runtime_debug_handle[negative_index]
     key = (last_int,)
+    if key not in aot_map:
+        # If the last intermediate output is not in the AOT map, report no match
+        return (-1,), None
     return runtime_debug_handle, aot_map[key]
 
 
@@ -965,7 +973,7 @@ def compare_intermediate_outputs(a: Any, b: Any, comparator) -> List[float]:
         # Ensure both sequences have the same length
         if len(a) != len(b):
             raise ValueError(
-                f"Sequences 'a' ({a}) and 'b' ({b}) must have the same length for comparison."
+                f"Sequences 'a' ({a}) and 'b' ({b}) must have the same length for comparison. len(a): {len(a)} len(b): {len(b)}."
             )
 
         # Compare each element in the sequences and return the list of results
@@ -990,6 +998,9 @@ def get_ancestor_node_identifiers(node: Node) -> List[str]:
     Returns: the identifiers of all its ancestor nodes
     """
 
+    if FROM_NODE_KEY not in node.meta:
+        return None
+
     node_source = node.meta[FROM_NODE_KEY]
     node_source = node_source[-1]
     ancestor_node_ids: List[str] = [f"{node_source.name}.{str(node_source.graph_id)}"]
@@ -1056,11 +1067,14 @@ def _find_n_match_node(node: Node) -> None:
         if node.op in ("output", "placeholder"):
             return
         node_id = f"{node.name}.{exported_program_graph_id}"
-        parent_node_id = get_parent_node_identifier(node)
+        parent_node_ids = get_ancestor_node_identifiers(node)
         if node_id in ancestors_node_id_to_debug_handle:
             matched_debug_handles.add(ancestors_node_id_to_debug_handle[node_id])
-        elif parent_node_id and parent_node_id in ancestors_node_id_to_debug_handle:
-            matched_debug_handles.add(ancestors_node_id_to_debug_handle[parent_node_id])
+        elif parent_node_ids:
+            for parent_node_id in parent_node_ids:
+                if parent_node_id in ancestors_node_id_to_debug_handle:
+                    matched_debug_handles.add(ancestors_node_id_to_debug_handle[parent_node_id])
+                    break
 
     bfs_trace_with_node_process(exported_program.graph_module, _find_n_match_node)
     return matched_debug_handles
@@ -1094,15 +1108,17 @@ def _equip_debug_handle(node: Node) -> None:
         if node.op in ("output", "placeholder"):
             return
         node_id = f"{node.name}.{exported_program_graph_id}"
-        parent_node_id = get_parent_node_identifier(node)
+        parent_node_ids = get_ancestor_node_identifiers(node)
+        node.meta[DEBUG_HANDLE_KEY] = UNSET_DEBUG_HANDLE
         if node_id in ancestors_node_id_to_debug_handle:
             node.meta[DEBUG_HANDLE_KEY] = ancestors_node_id_to_debug_handle[node_id]
-        elif parent_node_id and parent_node_id in ancestors_node_id_to_debug_handle:
-            node.meta[DEBUG_HANDLE_KEY] = ancestors_node_id_to_debug_handle[
-                parent_node_id
-            ]
-        else:
-            node.meta[DEBUG_HANDLE_KEY] = UNSET_DEBUG_HANDLE
+        elif parent_node_ids:
+            for parent_node_id in parent_node_ids:
+                if parent_node_id in ancestors_node_id_to_debug_handle:
+                    node.meta[DEBUG_HANDLE_KEY] = ancestors_node_id_to_debug_handle[
+                        parent_node_id
+                    ]
+                    break
 
     bfs_trace_with_node_process(exported_program.graph_module, _equip_debug_handle)
 
@@ -1111,6 +1127,7 @@ def propagate_back_debug_handle(
     exported_program: ExportedProgram,
     exported_program_graph_id: int,
     edge_dialect_program: ExportedProgram,
+    disable_debug_handle_valdiation: bool = False,
 ) -> bool:
     """
     Propagate debug handle from edge dialect program back to the exported program while maintain the correctness
@@ -1124,6 +1141,10 @@ def propagate_back_debug_handle(
     Then debug handle of op1 should be same as op1_0, and debug handle of op3 should be same as op3_0 and op3_1.
     The debug handle of op2 will be UNSET_DEBUG_HANDLE for further skipping.
 
+    disable_debug_handle_valdiation skips the _verify_graph_match() check so that a debug handle mismatch does not
+    cause an early return. A mismatch can happen when comparing against the aten graph, in which case not all debug
+    handles are matched in the aten graph. An example of this is when symbolic shape nodes are re-exported.
+
     Return: True if every debug handle in the edge dialect program has a corresponding node in the exported program, otherwise, return False.
     """
     # 1. Extract mapping from ancestor node identifiers to debug handles
@@ -1137,7 +1158,7 @@ def propagate_back_debug_handle(
     )
 
     # 3. Verify if every debug handle in edge dialect program has a corresponding node
-    if not _verify_graph_match(edge_dialect_program, matched_debug_handles):
+    if not disable_debug_handle_valdiation and not _verify_graph_match(edge_dialect_program, matched_debug_handles):
         return False
 
     # 4. Apply debug handles to the exported program
diff --git a/devtools/inspector/tests/inspector_test.py b/devtools/inspector/tests/inspector_test.py
@@ -838,6 +838,118 @@ def _gen_random_runtime_output(
     ) -> List[Union[None, List[torch.Tensor], bool, float, int, str, torch.Tensor]]:
         return [torch.randn(RAW_DATA_SIZE)]
 
+    def test_disable_debug_handle_validation_with_symbolic_shapes(self):
+        """
+        Test that demonstrates the issue with symbolic shape related nodes losing from_node info
+        during dynamic shape based export, and shows how disable_debug_handle_valdiation parameter
+        in propagate_back_debug_handle allows validation to be bypassed.
+        """
+        from executorch.devtools.inspector._inspector_utils import propagate_back_debug_handle
+
+        class SymbolicShapeModel(torch.nn.Module):
+            """Model that will have symbolic shape related operations after export."""
+
+            def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+                # This will create symbolic shape nodes during dynamic export
+                batch_size = x.shape[0]
+                x = x + torch.rand((batch_size, 1))
+                # Masking operation that creates gt/lt nodes
+                valid_mask = mask > 0.5
+                x = torch.where(valid_mask, x, torch.zeros_like(x))
+                return x
+
+        # Create model and dynamic inputs
+        model = SymbolicShapeModel()
+        batch_size = 2
+        seq_len = 4
+        x = torch.randn(batch_size, seq_len)
+        mask = torch.rand(batch_size, seq_len)
+        example_inputs = (x, mask)
+
+        # Export with dynamic shapes to create symbolic shape related nodes
+        dynamic_shapes = {
+            "x": {0: torch.export.Dim("batch_size", min=1, max=10)},
+            "mask": {0: torch.export.Dim("batch_size", min=1, max=10)},
+        }
+
+        exported_program = torch.export.export(
+            model, example_inputs, dynamic_shapes=dynamic_shapes, strict=True
+        )
+
+        """
+        In this case the original aten graph has a sym_size_int_2 node, but when we look at
+        nodes metadata in edge_program_manager, its sym_size node's from_node says
+        sym_size_int_3 which is not in the original aten graph.
+        """
+        # Create edge program - this is where from_node info can be lost for symbolic shape nodes
+        edge_program_manager: EdgeProgramManager = to_edge(exported_program)
+        edge_program_manager_copy = copy.deepcopy(edge_program_manager)
+        et_program_manager: ExecutorchProgramManager = edge_program_manager.to_executorch()
+
+        with tempfile.NamedTemporaryFile(suffix=".bin") as tmp_file:
+            etrecord_path = tmp_file.name
+
+            # Generate ETRecord with the exported program (aten graph)
+            generate_etrecord(
+                etrecord_path,
+                edge_program_manager_copy,
+                et_program_manager,
+                exported_program=exported_program,
+            )
+
+            # Create Inspector and get etrecord
+            with patch.object(
+                _inspector, "gen_etdump_object", return_value=None
+            ), patch.object(
+                EventBlock, "_gen_from_etdump"
+            ):
+                inspector_instance = Inspector(
+                    etdump_path=ETDUMP_PATH,
+                    etrecord=etrecord_path,
+                )
+
+                # Extract the necessary values from the inspector's etrecord
+                exported_program_from_etrecord = inspector_instance._etrecord.exported_program
+                export_graph_id = inspector_instance._etrecord.export_graph_id
+                edge_dialect_program = inspector_instance._etrecord.edge_dialect_program
+
+                # Ensure we have all the necessary components
+                self.assertIsNotNone(exported_program_from_etrecord)
+                self.assertIsNotNone(export_graph_id)
+                self.assertIsNotNone(edge_dialect_program)
+
+                # Test propagate_back_debug_handle with validation enabled (should fail or return False)
+                # This demonstrates the issue with symbolic shape nodes losing from_node info
+                validation_enabled_result = propagate_back_debug_handle(
+                    exported_program_from_etrecord,
+                    export_graph_id,
+                    edge_dialect_program,
+                    disable_debug_handle_valdiation=False
+                )
+
+                # With validation enabled, it should return False when from_node info is lost
+                self.assertFalse(
+                    validation_enabled_result,
+                    "propagate_back_debug_handle should return False when validation is enabled "
+                    "and symbolic shape nodes lose from_node info"
+                )
+
+                # Test propagate_back_debug_handle with validation disabled (should succeed)
+                # This shows how the disable_debug_handle_valdiation flag allows the function to work
+                validation_disabled_result = propagate_back_debug_handle(
+                    exported_program_from_etrecord,
+                    export_graph_id,
+                    edge_dialect_program,
+                    disable_debug_handle_valdiation=True
+                )
+
+                # With validation disabled, it should return True even when from_node info is lost
+                self.assertTrue(
+                    validation_disabled_result,
+                    "propagate_back_debug_handle should return True when validation is disabled, "
+                    "allowing best effort comparison even when from_node info is lost"
+                )
+
     def _gen_random_events(self) -> List[Event]:
         events = []
         for i in range(2):