Fix batch norm folding in prepare_pt2e for multiple conv->BN chains sharing the same conv weights (#2795)

subhankarpal · web-flow · commit 947306053d9f · 2025-08-19T10:16:03.000-07:00
* Fix BN folding in prepare_pt2e for multiple conv->BN chains sharing the same conv weights

* Fix variable names and format

---------

Co-authored-by: Subhankar Pal <subh@meta.com>
diff --git a/test/quantization/pt2e/test_quantize_pt2e.py b/test/quantization/pt2e/test_quantize_pt2e.py
@@ -57,6 +57,7 @@
 from torchao.quantization.pt2e.quantizer.embedding_quantizer import (  # noqa: F811
     EmbeddingQuantizer,
 )
+from torchao.testing.model_architectures import ConvWithSharedWeightInExportedModel
 from torchao.testing.pt2e._xnnpack_quantizer import (
     XNNPACKQuantizer,
     get_symmetric_quantization_config,
@@ -150,6 +151,34 @@ def validate(self, model: torch.fx.GraphModule) -> None:
             node_list,
         )
 
+    def test_chunked_bn_fusion(self):  # prepare_pt2e must leave outputs unchanged when one conv+BN pair is applied to several chunks
+        batch_size = 1
+        n_chunks = 3
+        in_channels = 1
+        out_channels = 32
+        m = ConvWithSharedWeightInExportedModel(n_chunks, in_channels, out_channels)
+        m.bn.running_var = torch.nn.Parameter(  # overwrite BN running_var with small random values (rand * 1e-2)
+            torch.rand(out_channels) * 1e-2, requires_grad=False  # NOTE(review): presumably chosen so a repeated BN fold visibly changes outputs - confirm
+        )
+
+        m.eval()  # switch to eval mode before running and exporting
+        example_inputs = (torch.rand(batch_size, n_chunks, 32, 32),)  # n_chunks channels, so each chunk along dim=1 has in_channels (1) channel
+        ref_outputs = m(*example_inputs)  # eager-mode reference
+        traced_model = torch.export.export(m, example_inputs, strict=True).module()  # exported program turned back into a callable module
+        traced_outputs = traced_model(*example_inputs)
+        prepared_model = prepare_pt2e(traced_model, XNNPACKQuantizer())  # quantizer is constructed with no config set in this test
+        prepared_outputs = prepared_model(*example_inputs)
+
+        if isinstance(ref_outputs, (tuple, list)):  # multi-output case: compare element by element
+            for ref, prepared, traced in zip(
+                ref_outputs, prepared_outputs, traced_outputs
+            ):
+                torch.testing.assert_close(ref, traced)  # export must match eager
+                torch.testing.assert_close(traced, prepared)  # prepare_pt2e must match export
+        else:
+            torch.testing.assert_close(ref_outputs, traced_outputs)  # export must match eager
+            torch.testing.assert_close(traced_outputs, prepared_outputs)  # prepare_pt2e must match export
+
     def test_wo_annotate_conv_output_quantizer(self):
         # TODO: use OP_TO_ANNOTATOR
         class BackendAQuantizer(Quantizer):
diff --git a/torchao/quantization/pt2e/utils.py b/torchao/quantization/pt2e/utils.py
@@ -671,6 +671,7 @@ def fold_bn_weights_into_conv_node(
     conv_bias_node: Optional[Node],
     bn_node: Node,
     m: GraphModule,
+    fake_fuse: bool = False,  # removes the BN nodes but doesn't change the conv weights
 ) -> None:
     # conv args: input, weight, bias, stride, padding, dilation, ...
     conv_w = _get_tensor_constant_from_node(conv_weight_node, m)
@@ -703,6 +704,16 @@ def fold_bn_weights_into_conv_node(
     if len(conv_args) == 2:
         conv_args.append(None)
 
+    if fake_fuse:
+        fused_weight, fused_bias = (
+            torch.nn.Parameter(conv_w, conv_w.requires_grad),
+            torch.nn.Parameter(conv_b, conv_b.requires_grad),
+        )
+    else:
+        fused_weight, fused_bias = fuse_conv_bn_weights(
+            conv_w, conv_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b, transpose=transpose
+        )
+
     # calling data since the fused_weight and fused_bias are nn.Parameter
     weight_attr_name = conv_weight_node.target
     assert isinstance(weight_attr_name, str)
@@ -767,6 +778,9 @@ def _fuse_conv_bn_(m: GraphModule) -> None:
     has_bn = any(_is_bn_node(n) for n in m.graph.nodes)
     if not has_bn:
         return
+
+    # track which conv weights have been fused to avoid double fusing
+    fused_convs_weight_nodes = set()
     for n in m.graph.nodes:
         if n.op != "call_function" or n.target not in (
             torch.ops.aten._native_batch_norm_legit_no_training.default,
@@ -781,9 +795,14 @@ def _fuse_conv_bn_(m: GraphModule) -> None:
         conv_weight_node = conv_node.args[1]
         conv_bias_node = conv_node.args[2] if len(conv_node.args) > 2 else None
         fold_bn_weights_into_conv_node(
-            conv_node, conv_weight_node, conv_bias_node, bn_node, m
+            conv_node,
+            conv_weight_node,
+            conv_bias_node,
+            bn_node,
+            m,
+            (conv_weight_node in fused_convs_weight_nodes),
         )
-
+        fused_convs_weight_nodes.add(conv_weight_node)
     m.graph.eliminate_dead_code()
     m.recompile()
 
diff --git a/torchao/testing/model_architectures.py b/torchao/testing/model_architectures.py
@@ -22,6 +22,27 @@ def forward(self, x):
         return x
 
 
+class ConvWithSharedWeightInExportedModel(nn.Module):  # test model: one Conv2d -> BatchNorm2d -> ReLU stack applied to every chunk of the input
+    def __init__(
+        self, n_chunks, in_channels, out_channels, kernel_size=3, stride=1, padding=1
+    ) -> None:
+        super().__init__()
+        self.n_chunks = n_chunks  # number of pieces forward() asks torch.chunk for
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding)  # single conv module reused for every chunk
+        self.bn = nn.BatchNorm2d(out_channels)  # single BN module reused for every chunk
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x) -> torch.Tensor:  # returns the per-chunk results concatenated along dim=1
+        chunks = torch.chunk(x, self.n_chunks, dim=1)  # split the input along dim=1
+        outputs = []
+        for chunk in chunks:
+            out = self.conv(chunk)  # same self.conv (same weight tensor) on each iteration
+            out = self.bn(out)  # same self.bn on each iteration
+            out = self.relu(out)
+            outputs.append(out)
+        return torch.cat(outputs, dim=1)  # join per-chunk outputs back along dim=1
+
+
 class LNLinearActivationModel(nn.Module):
     def __init__(self, fc_dim1, fc_dim2, dtype=torch.bfloat16, activation="sigmoid"):
         super().__init__()