Closed
Changes from all commits (51 commits)
b2e7f54  Summary: (namgyu-youn, Aug 11, 2025)
c5faa07  fix ruff (namgyu-youn, Aug 11, 2025)
ddeb027  separate single/multi linear toy model (namgyu-youn, Aug 12, 2025)
2aafd64  Summary: (namgyu-youn, Aug 11, 2025)
68e4482  Merge branch 'main' into refactor-toymodel (namgyu-youn, Aug 14, 2025)
6e88012  fix CI error after rebase (namgyu-youn, Aug 17, 2025)
6fd9672  update 3-linear model to 2-linear model (namgyu-youn, Aug 22, 2025)
98dd997  Merge branch 'main' into refactor-toymodel (namgyu-youn, Aug 23, 2025)
1656126  revert: observer shape (namgyu-youn, Aug 26, 2025)
994b507  revert: toy model for tutorials (namgyu-youn, Aug 26, 2025)
0ced363  update dtype, device handling in ToyTwoLinearModel (namgyu-youn, Aug 26, 2025)
6b03dc3  fix: test module for `create_model_and_input_data()` (namgyu-youn, Aug 26, 2025)
6b4eaa8  revert: toy model for tutorials (namgyu-youn, Aug 29, 2025)
ee7b0f4  fix: uniform args (device & dtype) in ToyModel (namgyu-youn, Aug 29, 2025)
c8320a7  remove overused args: `sequence_length` (namgyu-youn, Aug 30, 2025)
b6a752e  revert edge-case to source: `test_awq.py` (namgyu-youn, Aug 30, 2025)
f3f0abd  refactor: inline for clear understanding (namgyu-youn, Sep 3, 2025)
ed04949  revert tutorials to source (namgyu-youn, Sep 4, 2025)
ad535e2  fix input handler for batch (namgyu-youn, Sep 4, 2025)
3a87497  enable single linear to use batch size (namgyu-youn, Sep 6, 2025)
776b28b  fix old args in quantization API (namgyu-youn, Sep 6, 2025)
b5802c5  specify for clarify: device, dtype (namgyu-youn, Sep 8, 2025)
f2dd213  fix single linear: batch size & dimension shape (namgyu-youn, Sep 10, 2025)
8cc0a3b  specify dtype, device across toy models (namgyu-youn, Sep 10, 2025)
1345264  Merge branch 'main' into refactor-toymodel (namgyu-youn, Sep 11, 2025)
6dc439b  add dtype, device after rebase (namgyu-youn, Sep 11, 2025)
350d442  Merge pull request #2 from namgyu-youn/main (namgyu-youn, Sep 11, 2025)
01b1fb2  fix lint (namgyu-youn, Sep 16, 2025)
93e94b3  Merge branch 'main' into refactor-toymodel (namgyu-youn, Sep 18, 2025)
0e47cbb  fix pre-commit after rebase (namgyu-youn, Sep 18, 2025)
1140b05  fix ImportError after rebase (namgyu-youn, Sep 19, 2025)
94610a7  make toy model use direct dtype, device (namgyu-youn, Sep 23, 2025)
ac55a29  fix incorrect attributes (namgyu-youn, Sep 25, 2025)
59260a2  revert to original dtype (namgyu-youn, Sep 25, 2025)
54f7fee  remove default dtype, device (namgyu-youn, Sep 27, 2025)
74ee61c  fix incorrect dtype (namgyu-youn, Sep 28, 2025)
26c5b2e  Merge branch 'main' into refactor-toymodel (namgyu-youn, Oct 2, 2025)
0b4a545  fix typo in toy model (namgyu-youn, Oct 3, 2025)
bfc2345  fix incorrect device (namgyu-youn, Oct 3, 2025)
9058159  revert quantization flows to version 1 (namgyu-youn, Oct 3, 2025)
3efd5f4  fix incorrect attributes (namgyu-youn, Oct 7, 2025)
ddb6b0d  update toy model args (namgyu-youn, Oct 8, 2025)
ef6c3f4  add new code with toy model (namgyu-youn, Oct 8, 2025)
7630988  Merge branch 'main' into refactor-toymodel (namgyu-youn, Oct 8, 2025)
d89f1d6  fix pre-commit format (namgyu-youn, Oct 9, 2025)
0c1d4d7  Merge branch 'main' into refactor-toymodel (namgyu-youn, Oct 16, 2025)
86a81a9  fix after rebase (namgyu-youn, Oct 17, 2025)
22d341b  fix lint (namgyu-youn, Oct 20, 2025)
c6cc73c  Merge branch 'main' into refactor-toymodel (namgyu-youn, Oct 31, 2025)
9110fd1  fix after rebase (namgyu-youn, Oct 31, 2025)
0bb1938  Merge branch 'main' into refactor-toymodel (namgyu-youn, Oct 31, 2025)
50 changes: 21 additions & 29 deletions benchmarks/benchmark_aq.py
@@ -16,32 +16,7 @@
_replace_with_custom_fn_if_matches_filter,
quantize_,
)


class ToyLinearModel(torch.nn.Module):
"""Single linear for m * k * n problem size"""

def __init__(
self, m=64, n=32, k=64, has_bias=False, dtype=torch.float, device="cuda"
):
super().__init__()
self.m = m
self.dtype = dtype
self.device = device
self.linear = torch.nn.Linear(k, n, bias=has_bias).to(
dtype=self.dtype, device=self.device
)

def example_inputs(self):
return (
torch.randn(
self.m, self.linear.in_features, dtype=self.dtype, device=self.device
),
)

def forward(self, x):
x = self.linear(x)
return x
from torchao.testing.model_architectures import ToySingleLinearModel


def _get_ref_change_linear_weights_to_woqtensors(deprecated_tenosr_subclass):
@@ -69,14 +44,26 @@ def _ref_change_linear_weights_to_woqtensors(model, filter_fn=None, **kwargs):


@torch.no_grad
def _bench_quantized_tensor_subclass_perf(api, config, M, N, K):
m = ToyLinearModel(
def _bench_quantized_tensor_subclass_perf(api, ref_api, M, N, K, kwargs=None):
if kwargs is None:
kwargs = {}

m = ToySingleLinearModel(
M, N, K, has_bias=True, dtype=torch.bfloat16, device="cuda"
).eval()
m_bf16 = copy.deepcopy(m)
m_ref = copy.deepcopy(m)
example_inputs = m.example_inputs(batch_size=M)

api(m, **kwargs)

# reference
example_inputs = m.example_inputs()

api(m, config) # Pass both model and config
res = m(*example_inputs)
ref = m_ref(*example_inputs)

assert torch.equal(res, ref)

# perf comparison
from torchao.utils import benchmark_model
@@ -95,6 +82,11 @@ def _bench_quantized_tensor_subclass_perf(api, config, M, N, K):
benchmark_model(m, WARMUP, example_inputs)
elapsed_time = benchmark_model(m, RUNS, example_inputs)

torch._dynamo.reset()
m_bf16 = torch.compile(m_bf16, mode="max-autotune", fullgraph=True)
benchmark_model(m_bf16, WARMUP, example_inputs)
bf16_elapsed_time = benchmark_model(m_bf16, RUNS, example_inputs)

print(
f"{(M, N, K)}: elapsed time: {elapsed_time}, bf16 elapsed time: {bf16_elapsed_time}"
)
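For reference, the ToySingleLinearModel imported above lives in torchao/testing/model_architectures.py, which this diff does not display. A minimal sketch of its interface, inferred only from the call sites in this file (three positional size arguments plus has_bias, dtype, device, and an example_inputs(batch_size=...) method), might look like the following; the parameter names and defaults here are assumptions, not the authoritative definition:

import torch


class ToySingleLinearModel(torch.nn.Module):
    """Sketch: a single linear layer for an (m x k) @ (k x n) problem size."""

    def __init__(self, m, n, k, has_bias=False, dtype=torch.float32, device="cpu"):
        super().__init__()
        self.m = m  # default batch dimension for example_inputs
        self.dtype = dtype
        self.device = device
        # Build the layer directly with the requested dtype/device instead of
        # calling .to(...) afterwards, matching this PR's uniform-args cleanup.
        self.linear = torch.nn.Linear(k, n, bias=has_bias, dtype=dtype, device=device)

    def example_inputs(self, batch_size=None):
        # Random activations of shape (batch_size, k) on the model's own
        # dtype/device, so callers no longer thread those through.
        batch_size = self.m if batch_size is None else batch_size
        return (
            torch.randn(
                batch_size,
                self.linear.in_features,
                dtype=self.dtype,
                device=self.device,
            ),
        )

    def forward(self, x):
        return self.linear(x)

Under this sketch, the benchmark's ToySingleLinearModel(M, N, K, has_bias=True, dtype=torch.bfloat16, device="cuda").eval() followed by m.example_inputs(batch_size=M) reproduces the shapes the deleted local ToyLinearModel used to generate.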
31 changes: 13 additions & 18 deletions test/dtypes/test_affine_quantized_float.py
@@ -38,6 +38,7 @@
choose_qparams_affine,
)
from torchao.quantization.quantize_.common import KernelPreference
from torchao.testing.model_architectures import ToyTwoLinearModel
from torchao.utils import (
is_sm_at_least_89,
is_sm_at_least_90,
@@ -48,18 +49,6 @@
torch.manual_seed(0)


class ToyLinearModel(torch.nn.Module):
def __init__(self, in_features, out_features):
super().__init__()
self.linear1 = torch.nn.Linear(in_features, out_features, bias=False)
self.linear2 = torch.nn.Linear(out_features, in_features, bias=False)

def forward(self, x):
x = self.linear1(x)
x = self.linear2(x)
return x


class TestAffineQuantizedFloat8Compile(InductorTestCase):
@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
@unittest.skipIf(
@@ -121,8 +110,8 @@ def test_fp8_linear_variants(
),
}

# Create a linear layer with bfloat16 dtype
model = ToyLinearModel(K, N).eval().to(dtype).to("cuda")
# Create a linear layer
model = ToyTwoLinearModel(K, N, K, device="cuda", dtype=dtype).eval()

quantized_model = copy.deepcopy(model)
factory = mode_map[mode]()
@@ -179,7 +168,9 @@ def test_per_row_with_float32(self):
AssertionError,
match="PerRow quantization only works for bfloat16 precision",
):
model = ToyLinearModel(64, 64).eval().to(torch.float32).to("cuda")
model = ToyTwoLinearModel(
64, 64, 64, device="cuda", dtype=torch.float32
).eval()
quantize_(
model,
Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()),
@@ -192,7 +183,7 @@
@common_utils.parametrize("mode", ["dynamic", "weight-only", "static"])
def test_serialization(self, mode: str):
# Create and quantize the model
model = ToyLinearModel(16, 32).to(device="cuda")
model = ToyTwoLinearModel(16, 32, 16, device="cuda", dtype=torch.float32)

mode_map = {
"dynamic": partial(
@@ -224,7 +215,9 @@ def test_serialization(self, mode: str):

# Create a new model and load the state dict
with torch.device("meta"):
new_model = ToyLinearModel(16, 32)
new_model = ToyTwoLinearModel(
16, 32, 16, device="cuda", dtype=torch.float32
)
if mode == "static":
quantize_(new_model, factory)
new_model.load_state_dict(loaded_state_dict, assign=True)
@@ -266,7 +259,9 @@
)
def test_fp8_weight_dimension_warning(self):
# Create model with incompatible dimensions (not multiples of 16)
model = ToyLinearModel(10, 25).cuda() # 10x25 and 25x10 weights
model = ToyTwoLinearModel(
10, 25, 10, device="cuda", dtype=torch.float32
) # 10x25 and 25x10 weights

# Set up logging capture
with self.assertLogs(
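ToyTwoLinearModel is the two-layer counterpart from torchao/testing/model_architectures.py, also not shown in this diff. A rough sketch consistent with the call sites in this file (ToyTwoLinearModel(K, N, K, device=..., dtype=...) and the "10x25 and 25x10 weights" comment) follows; the parameter names are hypothetical:

import torch


class ToyTwoLinearModel(torch.nn.Module):
    """Sketch: two bias-free linears, input_dim -> hidden_dim -> output_dim."""

    def __init__(self, input_dim, hidden_dim, output_dim, device="cpu", dtype=torch.float32):
        super().__init__()
        self.device = device
        self.dtype = dtype
        # Both layers are created directly on the requested device/dtype.
        self.linear1 = torch.nn.Linear(
            input_dim, hidden_dim, bias=False, dtype=dtype, device=device
        )
        self.linear2 = torch.nn.Linear(
            hidden_dim, output_dim, bias=False, dtype=dtype, device=device
        )

    def example_inputs(self, batch_size=1):
        # Random activations of shape (batch_size, input_dim) matching the
        # model's own device/dtype.
        return (
            torch.randn(
                batch_size,
                self.linear1.in_features,
                dtype=self.dtype,
                device=self.device,
            ),
        )

    def forward(self, x):
        return self.linear2(self.linear1(x))

With ToyTwoLinearModel(K, N, K, device="cuda", dtype=dtype), this reproduces the deleted local ToyLinearModel(K, N) pair of Linear(K, N) and Linear(N, K), while folding the old .to(dtype).to("cuda") calls into construction.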
26 changes: 5 additions & 21 deletions test/integration/test_integration.py
@@ -69,6 +69,7 @@
from torchao.quantization.utils import (
compute_error as SQNR,
)
from torchao.testing.model_architectures import ToyTwoLinearModel
from torchao.testing.utils import skip_if_rocm
from torchao.utils import (
benchmark_model,
@@ -1910,30 +1911,13 @@ def test_get_model_size_aqt(self, api, test_device, test_dtype):


class TestBenchmarkModel(unittest.TestCase):
class ToyLinearModel(torch.nn.Module):
def __init__(self, m=64, n=32, k=64):
super().__init__()
self.linear1 = torch.nn.Linear(m, n, bias=False)
self.linear2 = torch.nn.Linear(n, k, bias=False)

def example_inputs(self, batch_size=1, dtype=torch.float32, device="cpu"):
return (
torch.randn(
batch_size, self.linear1.in_features, dtype=dtype, device=device
),
)

def forward(self, x):
x = self.linear1(x)
x = self.linear2(x)
return x

def run_benchmark_model(self, device):
# params
dtype = torch.bfloat16
m = self.ToyLinearModel(1024, 1024, 1024).eval().to(dtype).to(device)
m = ToyTwoLinearModel(
1024, 1024, 1024, device=device, dtype=torch.bfloat16
).eval()
m_bf16 = copy.deepcopy(m)
example_inputs = m.example_inputs(dtype=dtype, device=device)
example_inputs = m.example_inputs()
m_bf16 = torch.compile(m_bf16, mode="max-autotune")
num_runs = 1
return benchmark_model(m_bf16, num_runs, example_inputs)
17 changes: 3 additions & 14 deletions test/quantization/quantize_/workflows/float8/test_float8_tensor.py
@@ -25,6 +25,7 @@
from torchao.quantization.quantize_.common import KernelPreference
from torchao.quantization.quantize_.workflows.float8.float8_tensor import Float8Tensor
from torchao.quantization.utils import compute_error
from torchao.testing.model_architectures import ToyTwoLinearModel
from torchao.testing.utils import TorchAOIntegrationTestCase
from torchao.utils import (
_is_fbgemm_gpu_genai_available,
@@ -38,18 +39,6 @@
torch._dynamo.config.cache_size_limit = 128


class ToyLinearModel(torch.nn.Module):
def __init__(self, in_features, out_features):
super().__init__()
self.linear1 = torch.nn.Linear(in_features, out_features, bias=False)
self.linear2 = torch.nn.Linear(out_features, in_features, bias=False)

def forward(self, x):
x = self.linear1(x)
x = self.linear2(x)
return x


class ToyConvModel(torch.nn.Module):
def __init__(
self, dim, in_channels, out_channels, kernel_size, bias, padding, dtype, device
@@ -145,7 +134,7 @@ def test_fp8_linear_variants(
input_tensor = torch.randn(*M, K, dtype=dtype, device="cuda")

# Create a linear layer with bfloat16 dtype
model = ToyLinearModel(K, N).eval().to(dtype).to("cuda")
model = ToyTwoLinearModel(K, N, K, device="cuda", dtype=dtype).eval()

quantized_model = copy.deepcopy(model)

@@ -333,7 +322,7 @@ def test_kernel_preference_numerical_equivalence(self, granularity, sizes):
dtype = torch.bfloat16
input_tensor = torch.randn(*M, K, dtype=dtype, device="cuda")
# Create a linear layer with bfloat16 dtype
model = ToyLinearModel(K, N).eval().to(dtype).to("cuda")
model = ToyTwoLinearModel(K, N, K, device="cuda", dtype=dtype).eval()

# reference kernel preference and results
# we are using KernelPreference.TORCH as the reference