From 8e61e4108cc2f91187b2af6eef9b18704874881c Mon Sep 17 00:00:00 2001
From: Gregory Comer <gjcomer@meta.com>
Date: Sat, 14 Mar 2026 18:10:44 -0700
Subject: [PATCH] Support output_padding in XNNPACK transposed convolution
 (#18185)

Summary:

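XNNPACK supports transposed convolutions with non-zero output padding. Wire this up through ExecuTorch (ET):

- op_conv2d.py: read `output_padding` from the convolution node (`args[7]`) and pass it as `adjustment_height` / `adjustment_width` instead of raising when it is non-zero.
- gemm_configs.py: remove the partitioner constraint that rejected transposed convolutions with non-zero output padding.
- test_conv2d.py: replace the test that asserted such convolutions are not delegated with fp32 and qs8 tests covering symmetric and asymmetric output padding, larger strides, and a range of kernel and input shapes.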

Differential Revision: D96603677
---
 backends/xnnpack/operators/op_conv2d.py       |  13 +-
 .../xnnpack/partition/config/gemm_configs.py  |  12 --
 backends/xnnpack/test/ops/test_conv2d.py      | 194 ++++++++++++++++--
 3 files changed, 181 insertions(+), 38 deletions(-)

diff --git a/backends/xnnpack/operators/op_conv2d.py b/backends/xnnpack/operators/op_conv2d.py
index 8cecb3c62ad..315bbd863cb 100644
--- a/backends/xnnpack/operators/op_conv2d.py
+++ b/backends/xnnpack/operators/op_conv2d.py
@@ -140,14 +140,11 @@ def define_node(
         stride = cast(List[int], node.args[3])
         padding = cast(List[int], node.args[4])
         dilation = cast(List[int], node.args[5])
+        output_padding = cast(List[int], node.args[7])
         if len(padding) == 1:
             padding = padding + padding
-
-        # args[7] = output padding
-        check_or_raise(
-            all(out_pad == 0 for out_pad in cast(List[int], node.args[7])),
-            "XNNPACK does not support output padding",
-        )
+        if len(output_padding) == 1:
+            output_padding = output_padding + output_padding
 
         check_or_raise(
             len(stride) == 2, "XNNPACK currently only supports 2D convolution"
@@ -165,8 +162,8 @@ def define_node(
         kwargs["group_input_channels"] = group_input_channels
         kwargs["group_output_channels"] = group_output_channels
         kwargs["groups"] = groups
-        kwargs["adjustment_height"] = 0
-        kwargs["adjustment_width"] = 0
+        kwargs["adjustment_height"] = output_padding[0]
+        kwargs["adjustment_width"] = output_padding[1]
         kwargs["flags"] = 0
 
         if is_depthwise_conv:
diff --git a/backends/xnnpack/partition/config/gemm_configs.py b/backends/xnnpack/partition/config/gemm_configs.py
index d025c8e6029..e9cb0642f9b 100644
--- a/backends/xnnpack/partition/config/gemm_configs.py
+++ b/backends/xnnpack/partition/config/gemm_configs.py
@@ -382,18 +382,6 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool:
             )
             return False
 
-        # XNNPACK does not support non-zero output padding in transposed
-        # convolutions.
-        if is_transpose and any(
-            out_pad != 0 for out_pad in cast(List[int], node.args[7])
-        ):
-            why(
-                node,
-                "XNNPACK does not support transposed convolutions with"
-                "non-zero output padding",
-            )
-            return False
-
         if (
             is_transpose
             and weight_quant_params is not None
diff --git a/backends/xnnpack/test/ops/test_conv2d.py b/backends/xnnpack/test/ops/test_conv2d.py
index 2a0a82d99b6..8ca6d893be9 100644
--- a/backends/xnnpack/test/ops/test_conv2d.py
+++ b/backends/xnnpack/test/ops/test_conv2d.py
@@ -656,17 +656,20 @@ def get_inputs(self):
                             conv_count=1,
                         )
 
-    def test_padded_output_tconv(self):
-        class TConv2d(torch.nn.Module):
-            def __init__(self):
+    def test_fp32_tconv_output_padding(self):
+        """Test transposed convolution with non-zero output padding."""
+
+        class TConv2dOutputPadding(torch.nn.Module):
+            def __init__(self, output_padding):
                 super().__init__()
+                self.transpose = True
                 self.conv = torch.nn.ConvTranspose2d(
                     in_channels=2,
                     out_channels=1,
                     kernel_size=(3, 3),
                     stride=(2, 2),
                     padding=(1, 1),
-                    output_padding=(0, 1),
+                    output_padding=output_padding,
                     dilation=(1, 1),
                     groups=1,
                     bias=True,
@@ -675,26 +678,181 @@ def __init__(self):
             def forward(self, x):
                 return self.conv(x)
 
-        m = TConv2d()
-        inputs = (torch.randn(1, 2, 8, 8),)
-        tester = Tester(m.eval(), inputs)
+            def get_inputs(self):
+                return (torch.randn(1, 2, 8, 8),)
 
-        conv_count: int = 1
-        op = "torch.ops.aten.conv_transpose2d"
+        # Test asymmetric output padding (0, 1)
+        self._test(TConv2dOutputPadding(output_padding=(0, 1)))
 
-        (tester.export().check_count({op: conv_count}).to_edge_transform_and_lower())
+        # Test symmetric output padding (1, 1)
+        self._test(TConv2dOutputPadding(output_padding=(1, 1)))
 
-        # tconv should not be offloaded to XNNPack, since output padding is not supported
-        (
-            tester.check(
-                ["executorch_exir_dialects_edge__ops_aten_convolution_default"]
+    def test_qs8_tconv_output_padding(self):
+        """Test quantized transposed convolution with non-zero output padding."""
+
+        class TConv2dOutputPadding(torch.nn.Module):
+            def __init__(self, output_padding):
+                super().__init__()
+                self.transpose = True
+                self.conv = torch.nn.ConvTranspose2d(
+                    in_channels=2,
+                    out_channels=1,
+                    kernel_size=(3, 3),
+                    stride=(2, 2),
+                    padding=(1, 1),
+                    output_padding=output_padding,
+                    dilation=(1, 1),
+                    groups=1,
+                    bias=True,
+                ).to(torch.float)
+
+            def forward(self, x):
+                return self.conv(x)
+
+            def get_inputs(self):
+                return (torch.randn(1, 2, 8, 8),)
+
+        # Test asymmetric output padding (0, 1) with quantization
+        self._test(
+            TConv2dOutputPadding(output_padding=(0, 1)),
+            quant_config=get_symmetric_quantization_config(),
+        )
+
+        # Test symmetric output padding (1, 1) with quantization
+        self._test(
+            TConv2dOutputPadding(output_padding=(1, 1)),
+            quant_config=get_symmetric_quantization_config(),
+        )
+
+    def test_fp32_tconv_output_padding_large_stride(self):
+        """Test transposed convolution with larger output padding and stride values."""
+
+        class TConv2dLargeOutputPadding(torch.nn.Module):
+            def __init__(self, stride, output_padding):
+                super().__init__()
+                self.transpose = True
+                self.conv = torch.nn.ConvTranspose2d(
+                    in_channels=8,
+                    out_channels=16,
+                    kernel_size=(5, 5),
+                    stride=stride,
+                    padding=(2, 2),
+                    output_padding=output_padding,
+                    dilation=(1, 1),
+                    groups=1,
+                    bias=True,
+                ).to(torch.float)
+
+            def forward(self, x):
+                return self.conv(x)
+
+            def get_inputs(self):
+                return (torch.randn(2, 8, 16, 16),)
+
+        # Test with stride=4 and output_padding=(3, 3) - maximum valid for stride 4
+        self._test(TConv2dLargeOutputPadding(stride=(4, 4), output_padding=(3, 3)))
+
+        # Test with stride=3 and asymmetric output_padding=(2, 1)
+        self._test(TConv2dLargeOutputPadding(stride=(3, 3), output_padding=(2, 1)))
+
+        # Test with asymmetric stride and output_padding
+        self._test(TConv2dLargeOutputPadding(stride=(4, 3), output_padding=(3, 2)))
+
+    def test_fp32_tconv_output_padding_various_shapes(self):
+        """Test transposed convolution with output padding on various input shapes."""
+
+        class TConv2dVariousShapes(torch.nn.Module):
+            def __init__(
+                self,
+                in_channels,
+                out_channels,
+                kernel_size,
+                stride,
+                padding,
+                output_padding,
+                height,
+                width,
+            ):
+                super().__init__()
+                self.transpose = True
+                self.height = height
+                self.width = width
+                self.in_channels = in_channels
+                self.conv = torch.nn.ConvTranspose2d(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    kernel_size=kernel_size,
+                    stride=stride,
+                    padding=padding,
+                    output_padding=output_padding,
+                    dilation=(1, 1),
+                    groups=1,
+                    bias=True,
+                ).to(torch.float)
+
+            def forward(self, x):
+                return self.conv(x)
+
+            def get_inputs(self):
+                return (torch.randn(1, self.in_channels, self.height, self.width),)
+
+        # Test with larger kernel (7x7), stride=2, output_padding=1
+        self._test(
+            TConv2dVariousShapes(
+                in_channels=3,
+                out_channels=32,
+                kernel_size=(7, 7),
+                stride=(2, 2),
+                padding=(3, 3),
+                output_padding=(1, 1),
+                height=32,
+                width=32,
             )
-            .check_not(["torch.ops.higher_order.executorch_call_delegate"])
-            .to_executorch()
-            .serialize()
-            .run_method_and_compare_outputs(qtol=1)
         )
 
+        # Test with rectangular kernel and asymmetric output padding
+        self._test(
+            TConv2dVariousShapes(
+                in_channels=16,
+                out_channels=8,
+                kernel_size=(3, 5),
+                stride=(2, 3),
+                padding=(1, 2),
+                output_padding=(1, 2),
+                height=24,
+                width=32,
+            )
+        )
+
+        # Test with small spatial dimensions but larger output padding
+        self._test(
+            TConv2dVariousShapes(
+                in_channels=4,
+                out_channels=4,
+                kernel_size=(4, 4),
+                stride=(4, 4),
+                padding=(0, 0),
+                output_padding=(3, 3),
+                height=4,
+                width=4,
+            )
+        )
+
+        # Test with batch size > 1 and asymmetric everything
+        model = TConv2dVariousShapes(
+            in_channels=6,
+            out_channels=12,
+            kernel_size=(5, 3),
+            stride=(3, 2),
+            padding=(2, 1),
+            output_padding=(2, 1),
+            height=20,
+            width=15,
+        )
+        # Override get_inputs to use larger batch
+        model.get_inputs = lambda: (torch.randn(4, 6, 20, 15),)
+        self._test(model)
+
     def test_dq_conv2d(self) -> None:
         model = Conv2d(
             in_channels=3,