
Commit 16913bc

Merge branch 'main' into jcaip/fix-module-device
2 parents: ff68bb3 + 6b0bca4

13 files changed: +778 −326 lines


README.md

Lines changed: 1 addition & 1 deletion

@@ -254,7 +254,7 @@ If you believe there's other CUDA kernels we should be taking a closer look at p
 
 TorchAO is integrated into some of the leading open-source libraries including:
 
-* Unsloth for QAT, blog post coming soon!
+* Unsloth now supports QAT: [Read blog](https://docs.unsloth.ai/new/quantization-aware-training-qat) and [guide](https://docs.unsloth.ai/new/quantization-aware-training-qat#qat--lora-finetuning).
 * HuggingFace transformers with a [builtin inference backend](https://huggingface.co/docs/transformers/main/quantization/torchao) and [low bit optimizers](https://github.com/huggingface/transformers/pull/31865)
 * HuggingFace diffusers best practices with `torch.compile` and TorchAO in a standalone repo [diffusers-torchao](https://github.com/huggingface/diffusers/blob/main/docs/source/en/quantization/torchao.md)
 * vLLM for LLM serving: [usage](https://docs.vllm.ai/en/latest/features/quantization/torchao.html), [detailed docs](https://docs.pytorch.org/ao/main/torchao_vllm_integration.html)

test/dtypes/test_affine_quantized_float.py

Lines changed: 2 additions & 2 deletions

@@ -152,7 +152,7 @@ def test_invalid_granularity(self):
     def test_mismatched_granularity(self):
         with pytest.raises(
             ValueError,
-            match="Different granularities for activation and weight are not supported",
+            match="Unsupported granularity types",
         ):
             Float8DynamicActivationFloat8WeightConfig(
                 granularity=(PerTensor(), PerRow())
@@ -165,7 +165,7 @@ def test_unsupported_granularity(self):
         class UnsupportedGranularity:
             pass
 
-        with pytest.raises(ValueError, match="Invalid granularity types"):
+        with pytest.raises(ValueError, match="Unsupported granularity types"):
             Float8DynamicActivationFloat8WeightConfig(
                 granularity=(UnsupportedGranularity(), UnsupportedGranularity()),
             )
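For orientation (not part of the diff): both invalid combinations now fail config validation with the same message, so a minimal reproduction, assuming the public torchao.quantization names used elsewhere in this commit, looks like the sketch below.

import pytest
from torchao.quantization import (
    Float8DynamicActivationFloat8WeightConfig,
    PerRow,
    PerTensor,
)

# Mismatched (activation, weight) granularity types now hit the same
# "Unsupported granularity types" check as a completely unknown type.
with pytest.raises(ValueError, match="Unsupported granularity types"):
    Float8DynamicActivationFloat8WeightConfig(granularity=(PerTensor(), PerRow()))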

test/prototype/mx_formats/test_mx_tensor.py

Lines changed: 2 additions & 8 deletions

@@ -116,8 +116,6 @@ def test_some_zeros(elem_dtype):
     _test_mx(data, elem_dtype, block_size)
 
 
-# TODO(future PR): fix and reenable this test
-@pytest.mark.skip(reason="does not pass on B200 yet")
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 def test_to_mx_rceil():
     # nan
@@ -131,11 +129,7 @@ def test_to_mx_rceil():
         ],
         dtype=torch.uint32,
     ).view(torch.float32)
-    # fmt: on
-    ground_truth_scale = torch.tensor([255], dtype=torch.uint8).view(
-        torch.float8_e8m0fnu
-    )
-    # fmt: off
+
     ground_truth_fp8 = torch.tensor(
         [
             127, 0, 0, 0, 0, 0, 0, 0,
@@ -149,7 +143,7 @@ def test_to_mx_rceil():
     data_mx = MXTensor.to_mx(
         data_hp, torch.float8_e4m3fn, 32, ScaleCalculationMode.RCEIL
     )
-    torch.testing.assert_close(data_mx.scale, ground_truth_scale)
+    assert torch.isnan(data_mx.scale)
     assert torch.isnan(data_mx.qdata[0])
     assert torch.all(data_mx.qdata[1:] == 0)
     # fp32 denorm

test/quantization/quantize_/workflows/float8/test_float8_tensor.py

Lines changed: 158 additions & 14 deletions

@@ -18,6 +18,7 @@
 from torchao.quantization import (
     Float8DynamicActivationFloat8WeightConfig,
     Float8WeightOnlyConfig,
+    PerBlock,
     PerRow,
     PerTensor,
     quantize_,
@@ -30,6 +31,7 @@
     _is_fbgemm_gpu_genai_available,
     is_sm_at_least_89,
     is_sm_at_least_90,
+    is_sm_at_least_100,
     torch_version_at_least,
 )
 
@@ -38,17 +40,39 @@
 
 
 class ToyLinearModel(torch.nn.Module):
-    def __init__(self, in_features, out_features):
+    def __init__(self, in_features, out_features, bias):
         super().__init__()
-        self.linear1 = torch.nn.Linear(in_features, out_features, bias=False)
-        self.linear2 = torch.nn.Linear(out_features, in_features, bias=False)
+        self.linear1 = torch.nn.Linear(in_features, out_features, bias=bias)
+        self.linear2 = torch.nn.Linear(out_features, in_features, bias=bias)
 
     def forward(self, x):
        x = self.linear1(x)
        x = self.linear2(x)
        return x
 
 
+class ToyConvModel(torch.nn.Module):
+    def __init__(
+        self, dim, in_channels, out_channels, kernel_size, bias, padding, dtype, device
+    ):
+        super().__init__()
+        convs = {1: torch.nn.Conv1d, 2: torch.nn.Conv2d, 3: torch.nn.Conv3d}
+        self.conv = convs[dim](
+            in_channels,
+            out_channels,
+            kernel_size,
+            bias=bias,
+            padding=padding,
+            dtype=dtype,
+            device=device,
+        )
+        if dim == 3:
+            self.conv = self.conv.to(memory_format=torch.channels_last_3d)
+
+    def forward(self, x):
+        return self.conv(x)
+
+
 # TODO: move tests in test_affine_quantized_float.py here after we migrated all implementations
 @unittest.skipIf(not torch_version_at_least("2.8.0"), "Need pytorch 2.8+")
 @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
@@ -64,7 +88,10 @@ def setUp(self):
     @common_utils.parametrize("dtype", [torch.bfloat16, torch.float32])
     @common_utils.parametrize("mode", ["dynamic", "weight-only"])
     @common_utils.parametrize("compile", [True, False])
-    @common_utils.parametrize("granularity", [PerTensor(), PerRow()])
+    @common_utils.parametrize(
+        "granularity",
+        [PerTensor(), PerRow(), (PerBlock((1, 128)), PerBlock((128, 128)))],
+    )
     @common_utils.parametrize(
         "kernel_preference",
         [KernelPreference.AUTO, KernelPreference.TORCH, KernelPreference.FBGEMM],
@@ -74,9 +101,11 @@ def setUp(self):
         "sizes",
         [
             ((128,), 256, 128),
-            ((32, 128), 64, 256),
+            ((32, 128), 256, 512),
         ],
     )
+    @common_utils.parametrize("bias", [False, True])
+    @torch.no_grad()
     def test_fp8_linear_variants(
         self,
         dtype: torch.dtype,
@@ -85,14 +114,36 @@ def test_fp8_linear_variants(
         granularity,
         kernel_preference: KernelPreference,
         sizes: Tuple,
+        bias: bool,
     ):
-        if (
-            isinstance(granularity, PerTensor)
-            and kernel_preference == KernelPreference.FBGEMM
-        ):
-            return unittest.skip(
-                "per tensor with fbgemm kernel preferece does not work yet"
-            )
+        if isinstance(granularity, PerTensor):
+            if kernel_preference is KernelPreference.FBGEMM:
+                return unittest.skip(
+                    "per tensor with fbgemm kernel preference does not work yet"
+                )
+            elif mode == "weight-only":
+                return unittest.skip("unimplemented")
+
+        elif granularity == (PerBlock((1, 128)), PerBlock((128, 128))):
+            if dtype is not torch.bfloat16:
+                return unittest.skip("unimplemented")
+            elif mode != "dynamic":
+                return unittest.skip("unimplemented")
+            elif kernel_preference not in (
+                KernelPreference.AUTO,
+                KernelPreference.TORCH,
+            ):
+                return unittest.skip("unimplemented")
+
+        if bias is True:
+            sizes_to_keep = ((128,), 256, 128)
+            if (
+                sizes != sizes_to_keep
+                or kernel_preference is not KernelPreference.TORCH
+            ):
+                return unittest.skip(
+                    "cut down on number of options to save test time"
+                )
 
         error_message = None
         if isinstance(granularity, PerRow):
@@ -122,7 +173,7 @@ def test_fp8_linear_variants(
         input_tensor = torch.randn(*M, K, dtype=dtype, device="cuda")
 
         # Create a linear layer with bfloat16 dtype
-        model = ToyLinearModel(K, N).eval().to(dtype).to("cuda")
+        model = ToyLinearModel(K, N, bias).eval().to(dtype).to("cuda")
 
         quantized_model = copy.deepcopy(model)
 
@@ -137,6 +188,20 @@ def test_fp8_linear_variants(
 
         quantize_(quantized_model, config)
 
+        # ensure weight scaling is what we expect
+        qs1 = quantized_model.linear1.weight.scale
+        qs2 = quantized_model.linear2.weight.scale
+        if granularity == PerTensor():
+            assert qs1.shape == (1, 1)
+            assert qs2.shape == (1, 1)
+        elif granularity == PerRow():
+            assert qs1.shape == (N, 1)
+            assert qs2.shape == (K, 1)
+        else:
+            assert granularity == (PerBlock((1, 128)), PerBlock((128, 128)))
+            assert qs1.shape == (N // 128, K // 128)
+            assert qs2.shape == (K // 128, N // 128)
+
         if compile:
             quantized_model = torch.compile(quantized_model, fullgraph=True)
 
@@ -148,6 +213,85 @@ def test_fp8_linear_variants(
             f"Quantization error is too high got a SQNR of {error}"
         )
 
+    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(
+        not is_sm_at_least_100(), "Requires GPU with compute capability >= 10.0"
+    )
+    @common_utils.parametrize("dtype", [torch.bfloat16, torch.float32])
+    @common_utils.parametrize("compile", [True, False])
+    @common_utils.parametrize("granularity", [PerTensor()])
+    @common_utils.parametrize("inference_mode", [True, False])
+    @common_utils.parametrize(
+        "kernel_preference",
+        [KernelPreference.AUTO],
+    )
+    # only test for 3D conv for now
+    # Inputs are (N, C_in, C_out, D, H, W)
+    @common_utils.parametrize(
+        "sizes",
+        [
+            (4, 16, 64, 32, 32, 32),
+        ],
+    )
+    def test_fp8_conv_variants(
+        self,
+        dtype: torch.dtype,
+        compile: bool,
+        granularity,
+        inference_mode: bool,
+        kernel_preference: KernelPreference,
+        sizes: Tuple,
+    ):
+        if (not _is_fbgemm_gpu_genai_available()) or (not is_sm_at_least_100()):
+            return unittest.skip(
+                "Requires fbgemm_gpu_genai and sm version >= 10.0 to run "
+                "fbgemm kernel preference test"
+            )
+
+        dim = 3
+        N, C_in, C_out, D, H, W = sizes
+        kernel_size = 3
+
+        # Note: this is channel last memory format
+        input_tensor = torch.randn(N, C_in, D, H, W, dtype=dtype, device="cuda")
+        input_tensor = input_tensor.to(memory_format=torch.channels_last_3d)
+
+        # Create a linear layer with bfloat16 dtype
+        model = ToyConvModel(
+            dim,
+            C_in,
+            C_out,
+            kernel_size,
+            bias=False,
+            padding=0,
+            dtype=dtype,
+            device="cuda",
+        ).eval()
+
+        quantized_model = copy.deepcopy(model)
+
+        config = Float8DynamicActivationFloat8WeightConfig(
+            granularity=granularity,
+            kernel_preference=kernel_preference,
+        )
+
+        _is_conv3d = lambda m, fqn: isinstance(m, torch.nn.Conv3d)
+
+        quantize_(quantized_model, config, filter_fn=_is_conv3d)
+
+        if compile:
+            quantized_model = torch.compile(quantized_model, fullgraph=True)
+
+        inference_mode_ctx = torch.inference_mode() if inference_mode else nullcontext()
+        with inference_mode_ctx:
+            output_original = model(input_tensor)
+            output_quantized = quantized_model(input_tensor)
+
+        error = compute_error(output_original, output_quantized)
+        assert compute_error(output_original, output_quantized) > 20, (
+            f"Quantization error is too high got a SQNR of {error}"
+        )
+
     @common_utils.parametrize("granularity", [PerTensor(), PerRow()])
     @unittest.skipIf(
         not is_sm_at_least_90(),
@@ -231,7 +375,7 @@ def test_kernel_preference_numerical_equivalence(self, granularity, sizes):
         dtype = torch.bfloat16
         input_tensor = torch.randn(*M, K, dtype=dtype, device="cuda")
         # Create a linear layer with bfloat16 dtype
-        model = ToyLinearModel(K, N).eval().to(dtype).to("cuda")
+        model = ToyLinearModel(K, N, bias=False).eval().to(dtype).to("cuda")
 
         # reference kernel preference and results
         # we are using KerenelPreference.TORCH as the reference
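Beyond the test plumbing, the new (PerBlock((1, 128)), PerBlock((128, 128))) granularity is exercised through the public quantize_ API. A minimal usage sketch follows; it is my own example rather than code from this commit, and it assumes a CUDA device, bfloat16 weights, and dimensions divisible by 128, as the test requires.

import torch
from torchao.quantization import (
    Float8DynamicActivationFloat8WeightConfig,
    PerBlock,
    quantize_,
)

# 1x128 activation blocks and 128x128 weight blocks, matching the test's
# granularity tuple.
model = torch.nn.Sequential(torch.nn.Linear(512, 256, bias=False))
model = model.to(torch.bfloat16).cuda().eval()

config = Float8DynamicActivationFloat8WeightConfig(
    granularity=(PerBlock((1, 128)), PerBlock((128, 128))),
)
quantize_(model, config)

# Per the new shape assertions, a (256, 512) weight carries one scale per
# 128x128 block, i.e. a (2, 4) scale tensor.
assert model[0].weight.scale.shape == (256 // 128, 512 // 128)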

test/quantization/test_quant_primitives.py

Lines changed: 46 additions & 0 deletions

@@ -14,9 +14,11 @@
     MappingType,
     ZeroPointDomain,
     _choose_qparams_affine_tinygemm,
+    _choose_scale_float8,
     _fake_quantize_affine,
     _fake_quantize_affine_cachemask,
     _maybe_expand_scale_to_tensor_shape,
+    _quantize_affine_float8,
     choose_qparams_affine,
     dequantize_affine,
     quantize_affine,
@@ -55,6 +57,23 @@ def check_idempotent(self, fn, *args, **kwargs):
     return output1
 
 
+# from https://github.com/pytorch/pytorch/blob/7563f61cc8a40a5ba21a498a2d98895b4eec3f39/test/test_scaled_matmul_cuda.py#L100
+# with scale modified to be the inverse of the version in PT core
+def _tensor_to_scale_block(
+    x: torch.Tensor,
+    float8_dtype: torch.dtype,
+    block_outer: int,
+    block_inner: int,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    x = x.unflatten(1, (-1, block_inner)).unflatten(0, (-1, block_outer))
+    amax = x.abs().amax(dim=[1, 3], keepdim=True).float()
+    scale = amax / torch.finfo(float8_dtype).max
+    x = x.div(scale).to(float8_dtype)
+    x = x.flatten(2, 3).flatten(0, 1)
+    scale = scale.flatten(2, 3).flatten(0, 1)
+    return x, scale
+
+
 # Legacy tinygemm ops
 def _get_groupwise_affine_qparams(
     w,
@@ -798,6 +817,33 @@ def test_maybe_expand_scale_to_tensor_shape(self):
         self.assertEqual(new_scale5.shape, torch.Size([3, 2, 8]))
         self.assertEqual(new_scale5.unique(dim=-1).shape, torch.Size([3, 2, 2]))
 
+    def test_float8_blockwise_scaling(self):
+        M, K = 512, 1024
+        hp_tensor = torch.randn(M, K, dtype=torch.float)
+        # make the scales from some of the blocks obviously different
+        hp_tensor[0:128, 0:128] *= 3.0
+        hp_tensor[0:128, 128:256] *= 7.0
+        hp_tensor[128:256, 0:128] *= 2.0
+        hp_tensor[128:256, 128:256] *= 100.0
+
+        block_size = (128, 128)
+
+        scale = _choose_scale_float8(
+            hp_tensor,
+            float8_dtype=torch.float8_e4m3fn,
+            block_size=block_size,
+            hp_value_lb=None,
+            hp_value_ub=None,
+        )
+        data = _quantize_affine_float8(hp_tensor, scale, torch.float8_e4m3fn)
+
+        ref_data, ref_scale = _tensor_to_scale_block(
+            hp_tensor, torch.float8_e4m3fn, 128, 128
+        )
+
+        torch.testing.assert_close(scale, ref_scale, atol=0, rtol=0)
+        torch.testing.assert_close(data.float(), ref_data.float(), atol=0, rtol=0)
+
 
 if __name__ == "__main__":
     unittest.main()
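As a reading aid, not part of the diff: the reference helper takes one amax per 128x128 block, sets scale = amax / torch.finfo(float8_e4m3fn).max, and quantizes by dividing, so dequantization is a per-block multiply; the comment above notes this is the inverse of the PT core helper it was copied from. A compact standalone sketch of the same block math:

import torch

x = torch.randn(256, 256)
blk = 128
fp8_max = torch.finfo(torch.float8_e4m3fn).max  # 448.0 for e4m3fn

# Group into 128x128 tiles and take one amax per tile.
tiles = x.unflatten(1, (-1, blk)).unflatten(0, (-1, blk))
amax = tiles.abs().amax(dim=[1, 3], keepdim=True).float()
scale = amax / fp8_max  # one scale per block, dequant convention
q = tiles.div(scale).to(torch.float8_e4m3fn)  # quantize: divide by scale
# dequantize: q.float() * scale recovers each block up to fp8 rounding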

test/sparsity/test_marlin.py

Lines changed: 2 additions & 0 deletions

@@ -39,6 +39,8 @@ def setUp(self):
             .half()
             .cuda()
         )
+        for param in self.model.parameters():
+            param.requires_grad = False
 
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="Need CUDA available")
     @skip_if_rocm("ROCm enablement in progress")
