Closed
Changes from all commits (51 commits)
b2e7f54  Summary: (namgyu-youn, Aug 11, 2025)
c5faa07  fix ruff (namgyu-youn, Aug 11, 2025)
ddeb027  separate single/multi linear toy model (namgyu-youn, Aug 12, 2025)
2aafd64  Summary: (namgyu-youn, Aug 11, 2025)
68e4482  Merge branch 'main' into refactor-toymodel (namgyu-youn, Aug 14, 2025)
6e88012  fix CI error after rebase (namgyu-youn, Aug 17, 2025)
6fd9672  update 3-linear model to 2-linear model (namgyu-youn, Aug 22, 2025)
98dd997  Merge branch 'main' into refactor-toymodel (namgyu-youn, Aug 23, 2025)
1656126  revert: observer shape (namgyu-youn, Aug 26, 2025)
994b507  revert: toy model for tutorials (namgyu-youn, Aug 26, 2025)
0ced363  update dtype, device handling in ToyTwoLinearModel (namgyu-youn, Aug 26, 2025)
6b03dc3  fix: test module for `create_model_and_input_data()` (namgyu-youn, Aug 26, 2025)
6b4eaa8  revert: toy model for tutorials (namgyu-youn, Aug 29, 2025)
ee7b0f4  fix: uniform args (device & dtype) in ToyModel (namgyu-youn, Aug 29, 2025)
c8320a7  remove overused args: `sequence_length` (namgyu-youn, Aug 30, 2025)
b6a752e  revert edge-case to source: `test_awq.py` (namgyu-youn, Aug 30, 2025)
f3f0abd  refactor: inline for clear understanding (namgyu-youn, Sep 3, 2025)
ed04949  revert tutorials to source (namgyu-youn, Sep 4, 2025)
ad535e2  fix input handler for batch (namgyu-youn, Sep 4, 2025)
3a87497  enable single linear to use batch size (namgyu-youn, Sep 6, 2025)
776b28b  fix old args in quantization API (namgyu-youn, Sep 6, 2025)
b5802c5  specify for clarify: device, dtype (namgyu-youn, Sep 8, 2025)
f2dd213  fix single linear: batch size & dimension shape (namgyu-youn, Sep 10, 2025)
8cc0a3b  specify dtype, device across toy models (namgyu-youn, Sep 10, 2025)
1345264  Merge branch 'main' into refactor-toymodel (namgyu-youn, Sep 11, 2025)
6dc439b  add dtype, device after rebase (namgyu-youn, Sep 11, 2025)
350d442  Merge pull request #2 from namgyu-youn/main (namgyu-youn, Sep 11, 2025)
01b1fb2  fix lint (namgyu-youn, Sep 16, 2025)
93e94b3  Merge branch 'main' into refactor-toymodel (namgyu-youn, Sep 18, 2025)
0e47cbb  fix pre-commit after rebase (namgyu-youn, Sep 18, 2025)
1140b05  fix ImportError after rebase (namgyu-youn, Sep 19, 2025)
94610a7  make toy model use direct dtype, device (namgyu-youn, Sep 23, 2025)
ac55a29  fix incorrect attributes (namgyu-youn, Sep 25, 2025)
59260a2  revert to original dtype (namgyu-youn, Sep 25, 2025)
54f7fee  remove default dtype, device (namgyu-youn, Sep 27, 2025)
74ee61c  fix incorrect dtype (namgyu-youn, Sep 28, 2025)
26c5b2e  Merge branch 'main' into refactor-toymodel (namgyu-youn, Oct 2, 2025)
0b4a545  fix typo in toy model (namgyu-youn, Oct 3, 2025)
bfc2345  fix incorrect device (namgyu-youn, Oct 3, 2025)
9058159  revert quantization flows to version 1 (namgyu-youn, Oct 3, 2025)
3efd5f4  fix incorrect attributes (namgyu-youn, Oct 7, 2025)
ddb6b0d  update toy model args (namgyu-youn, Oct 8, 2025)
ef6c3f4  add new code with toy model (namgyu-youn, Oct 8, 2025)
7630988  Merge branch 'main' into refactor-toymodel (namgyu-youn, Oct 8, 2025)
d89f1d6  fix pre-commit format (namgyu-youn, Oct 9, 2025)
0c1d4d7  Merge branch 'main' into refactor-toymodel (namgyu-youn, Oct 16, 2025)
86a81a9  fix after rebase (namgyu-youn, Oct 17, 2025)
22d341b  fix lint (namgyu-youn, Oct 20, 2025)
c6cc73c  Merge branch 'main' into refactor-toymodel (namgyu-youn, Oct 31, 2025)
9110fd1  fix after rebase (namgyu-youn, Oct 31, 2025)
0bb1938  Merge branch 'main' into refactor-toymodel (namgyu-youn, Oct 31, 2025)
50 changes: 21 additions & 29 deletions benchmarks/benchmark_aq.py
@@ -16,32 +16,7 @@
_replace_with_custom_fn_if_matches_filter,
quantize_,
)


class ToyLinearModel(torch.nn.Module):
"""Single linear for m * k * n problem size"""

def __init__(
self, m=64, n=32, k=64, has_bias=False, dtype=torch.float, device="cuda"
):
super().__init__()
self.m = m
self.dtype = dtype
self.device = device
self.linear = torch.nn.Linear(k, n, bias=has_bias).to(
dtype=self.dtype, device=self.device
)

def example_inputs(self):
return (
torch.randn(
self.m, self.linear.in_features, dtype=self.dtype, device=self.device
),
)

def forward(self, x):
x = self.linear(x)
return x
from torchao.testing.model_architectures import ToySingleLinearModel


def _get_ref_change_linear_weights_to_woqtensors(deprecated_tenosr_subclass):
@@ -69,14 +44,26 @@ def _ref_change_linear_weights_to_woqtensors(model, filter_fn=None, **kwargs):


@torch.no_grad
def _bench_quantized_tensor_subclass_perf(api, config, M, N, K):
m = ToyLinearModel(
def _bench_quantized_tensor_subclass_perf(api, ref_api, M, N, K, kwargs=None):
if kwargs is None:
kwargs = {}

m = ToySingleLinearModel(
M, N, K, has_bias=True, dtype=torch.bfloat16, device="cuda"
).eval()
m_bf16 = copy.deepcopy(m)
m_ref = copy.deepcopy(m)
example_inputs = m.example_inputs(batch_size=M)

api(m, **kwargs)

# reference
example_inputs = m.example_inputs()

api(m, config) # Pass both model and config
res = m(*example_inputs)
ref = m_ref(*example_inputs)

assert torch.equal(res, ref)

# perf comparison
from torchao.utils import benchmark_model
@@ -95,6 +82,11 @@ def _bench_quantized_tensor_subclass_perf(api, config, M, N, K):
benchmark_model(m, WARMUP, example_inputs)
elapsed_time = benchmark_model(m, RUNS, example_inputs)

torch._dynamo.reset()
m_bf16 = torch.compile(m_bf16, mode="max-autotune", fullgraph=True)
benchmark_model(m_bf16, WARMUP, example_inputs)
bf16_elapsed_time = benchmark_model(m_bf16, RUNS, example_inputs)

print(
f"{(M, N, K)}: elapsed time: {elapsed_time}, bf16 elapsed time: {bf16_elapsed_time}"
)
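For reference, the ToySingleLinearModel imported above lives in torchao/testing/model_architectures.py, which this diff does not display. A minimal sketch of its interface, inferred only from the call sites in this file (three positional size arguments plus has_bias, dtype, device, and an example_inputs(batch_size=...) method), might look like the following; the parameter names and defaults here are assumptions, not the authoritative definition:

import torch


class ToySingleLinearModel(torch.nn.Module):
    """Sketch: a single linear layer for an (m x k) @ (k x n) problem size."""

    def __init__(self, m, n, k, has_bias=False, dtype=torch.float32, device="cpu"):
        super().__init__()
        self.m = m  # default batch dimension for example_inputs
        self.dtype = dtype
        self.device = device
        # Build the layer directly with the requested dtype/device instead of
        # calling .to(...) afterwards, matching this PR's uniform-args cleanup.
        self.linear = torch.nn.Linear(k, n, bias=has_bias, dtype=dtype, device=device)

    def example_inputs(self, batch_size=None):
        # Random activations of shape (batch_size, k) on the model's own
        # dtype/device, so callers no longer thread those through.
        batch_size = self.m if batch_size is None else batch_size
        return (
            torch.randn(
                batch_size,
                self.linear.in_features,
                dtype=self.dtype,
                device=self.device,
            ),
        )

    def forward(self, x):
        return self.linear(x)

Under this sketch, the benchmark's ToySingleLinearModel(M, N, K, has_bias=True, dtype=torch.bfloat16, device="cuda").eval() followed by m.example_inputs(batch_size=M) reproduces the shapes the deleted local ToyLinearModel used to generate.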
31 changes: 13 additions & 18 deletions test/dtypes/test_affine_quantized_float.py
@@ -38,6 +38,7 @@
choose_qparams_affine,
)
from torchao.quantization.quantize_.common import KernelPreference
from torchao.testing.model_architectures import ToyTwoLinearModel
from torchao.utils import (
is_sm_at_least_89,
is_sm_at_least_90,
@@ -48,18 +49,6 @@
torch.manual_seed(0)


class ToyLinearModel(torch.nn.Module):
def __init__(self, in_features, out_features):
super().__init__()
self.linear1 = torch.nn.Linear(in_features, out_features, bias=False)
self.linear2 = torch.nn.Linear(out_features, in_features, bias=False)

def forward(self, x):
x = self.linear1(x)
x = self.linear2(x)
return x


class TestAffineQuantizedFloat8Compile(InductorTestCase):
@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
@unittest.skipIf(
@@ -121,8 +110,8 @@ def test_fp8_linear_variants(
),
}

# Create a linear layer with bfloat16 dtype
model = ToyLinearModel(K, N).eval().to(dtype).to("cuda")
# Create a linear layer
model = ToyTwoLinearModel(K, N, K, device="cuda", dtype=dtype).eval()

quantized_model = copy.deepcopy(model)
factory = mode_map[mode]()
@@ -179,7 +168,9 @@ def test_per_row_with_float32(self):
AssertionError,
match="PerRow quantization only works for bfloat16 precision",
):
model = ToyLinearModel(64, 64).eval().to(torch.float32).to("cuda")
model = ToyTwoLinearModel(
64, 64, 64, device="cuda", dtype=torch.float32
).eval()
quantize_(
model,
Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()),
@@ -192,7 +183,7 @@
@common_utils.parametrize("mode", ["dynamic", "weight-only", "static"])
def test_serialization(self, mode: str):
# Create and quantize the model
model = ToyLinearModel(16, 32).to(device="cuda")
model = ToyTwoLinearModel(16, 32, 16, device="cuda", dtype=torch.float32)

mode_map = {
"dynamic": partial(
@@ -224,7 +215,9 @@ def test_serialization(self, mode: str):

# Create a new model and load the state dict
with torch.device("meta"):
new_model = ToyLinearModel(16, 32)
new_model = ToyTwoLinearModel(
16, 32, 16, device="cuda", dtype=torch.float32
)
if mode == "static":
quantize_(new_model, factory)
new_model.load_state_dict(loaded_state_dict, assign=True)
@@ -266,7 +259,9 @@
)
def test_fp8_weight_dimension_warning(self):
# Create model with incompatible dimensions (not multiples of 16)
model = ToyLinearModel(10, 25).cuda() # 10x25 and 25x10 weights
model = ToyTwoLinearModel(
10, 25, 10, device="cuda", dtype=torch.float32
) # 10x25 and 25x10 weights

# Set up logging capture
with self.assertLogs(
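ToyTwoLinearModel is the two-layer counterpart from torchao/testing/model_architectures.py, also not shown in this diff. A rough sketch consistent with the call sites in this file (ToyTwoLinearModel(K, N, K, device=..., dtype=...) and the "10x25 and 25x10 weights" comment) follows; the parameter names are hypothetical:

import torch


class ToyTwoLinearModel(torch.nn.Module):
    """Sketch: two bias-free linears, input_dim -> hidden_dim -> output_dim."""

    def __init__(self, input_dim, hidden_dim, output_dim, device="cpu", dtype=torch.float32):
        super().__init__()
        self.device = device
        self.dtype = dtype
        # Both layers are created directly on the requested device/dtype.
        self.linear1 = torch.nn.Linear(
            input_dim, hidden_dim, bias=False, dtype=dtype, device=device
        )
        self.linear2 = torch.nn.Linear(
            hidden_dim, output_dim, bias=False, dtype=dtype, device=device
        )

    def example_inputs(self, batch_size=1):
        # Random activations of shape (batch_size, input_dim) matching the
        # model's own device/dtype.
        return (
            torch.randn(
                batch_size,
                self.linear1.in_features,
                dtype=self.dtype,
                device=self.device,
            ),
        )

    def forward(self, x):
        return self.linear2(self.linear1(x))

With ToyTwoLinearModel(K, N, K, device="cuda", dtype=dtype), this reproduces the deleted local ToyLinearModel(K, N) pair of Linear(K, N) and Linear(N, K), while folding the old .to(dtype).to("cuda") calls into construction.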
26 changes: 5 additions & 21 deletions test/integration/test_integration.py
@@ -69,6 +69,7 @@
from torchao.quantization.utils import (
compute_error as SQNR,
)
from torchao.testing.model_architectures import ToyTwoLinearModel
from torchao.testing.utils import skip_if_rocm
from torchao.utils import (
benchmark_model,
@@ -1910,30 +1911,13 @@ def test_get_model_size_aqt(self, api, test_device, test_dtype):


class TestBenchmarkModel(unittest.TestCase):
class ToyLinearModel(torch.nn.Module):
def __init__(self, m=64, n=32, k=64):
super().__init__()
self.linear1 = torch.nn.Linear(m, n, bias=False)
self.linear2 = torch.nn.Linear(n, k, bias=False)

def example_inputs(self, batch_size=1, dtype=torch.float32, device="cpu"):
return (
torch.randn(
batch_size, self.linear1.in_features, dtype=dtype, device=device
),
)

def forward(self, x):
x = self.linear1(x)
x = self.linear2(x)
return x

def run_benchmark_model(self, device):
# params
dtype = torch.bfloat16
m = self.ToyLinearModel(1024, 1024, 1024).eval().to(dtype).to(device)
m = ToyTwoLinearModel(
1024, 1024, 1024, device=device, dtype=torch.bfloat16
).eval()
m_bf16 = copy.deepcopy(m)
example_inputs = m.example_inputs(dtype=dtype, device=device)
example_inputs = m.example_inputs()
m_bf16 = torch.compile(m_bf16, mode="max-autotune")
num_runs = 1
return benchmark_model(m_bf16, num_runs, example_inputs)
17 changes: 3 additions & 14 deletions test/quantization/quantize_/workflows/float8/test_float8_tensor.py
@@ -25,6 +25,7 @@
from torchao.quantization.quantize_.common import KernelPreference
from torchao.quantization.quantize_.workflows.float8.float8_tensor import Float8Tensor
from torchao.quantization.utils import compute_error
from torchao.testing.model_architectures import ToyTwoLinearModel
from torchao.testing.utils import TorchAOIntegrationTestCase
from torchao.utils import (
_is_fbgemm_gpu_genai_available,
@@ -38,18 +39,6 @@
torch._dynamo.config.cache_size_limit = 128


class ToyLinearModel(torch.nn.Module):
def __init__(self, in_features, out_features):
super().__init__()
self.linear1 = torch.nn.Linear(in_features, out_features, bias=False)
self.linear2 = torch.nn.Linear(out_features, in_features, bias=False)

def forward(self, x):
x = self.linear1(x)
x = self.linear2(x)
return x


class ToyConvModel(torch.nn.Module):
def __init__(
self, dim, in_channels, out_channels, kernel_size, bias, padding, dtype, device
@@ -145,7 +134,7 @@ def test_fp8_linear_variants(
input_tensor = torch.randn(*M, K, dtype=dtype, device="cuda")

# Create a linear layer with bfloat16 dtype
model = ToyLinearModel(K, N).eval().to(dtype).to("cuda")
model = ToyTwoLinearModel(K, N, K, device="cuda", dtype=dtype).eval()

quantized_model = copy.deepcopy(model)

@@ -333,7 +322,7 @@ def test_kernel_preference_numerical_equivalence(self, granularity, sizes):
dtype = torch.bfloat16
input_tensor = torch.randn(*M, K, dtype=dtype, device="cuda")
# Create a linear layer with bfloat16 dtype
model = ToyLinearModel(K, N).eval().to(dtype).to("cuda")
model = ToyTwoLinearModel(K, N, K, device="cuda", dtype=dtype).eval()

# reference kernel preference and results
# we are using KernelPreference.TORCH as the reference