
Commit b7d6069

Update
[ghstack-poisoned]
2 parents: 1c8adb4 + 4463b79


5 files changed: +10 lines, -6 lines


test/quantization/quantize_/workflows/int4/test_int4_marlin_sparse_tensor.py

Lines changed: 4 additions & 0 deletions
@@ -21,6 +21,7 @@
 )
 from torchao.quantization.utils import compute_error
 from torchao.sparsity.sparse_api import apply_fake_sparsity
+from torchao.testing.utils import skip_if_rocm
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_8,
 )
@@ -38,6 +39,7 @@ class TestInt4MarlinSparseTensor(TestCase):
     def setUp(self):
         self.GPU_DEVICES = ["cuda"] if torch.cuda.is_available() else []

+    @skip_if_rocm("ROCm enablement in progress")
     @parametrize("config", [BF16_ACT_CONFIG])
     @parametrize(
         "sizes",
@@ -65,6 +67,7 @@ def test_linear(self, config, sizes):
         quantized_and_compiled = compiled_linear(input)
         self.assertTrue(compute_error(original, quantized_and_compiled) > 20)

+    @skip_if_rocm("ROCm enablement in progress")
     @unittest.skip("Fix later")
     @parametrize("config", [BF16_ACT_CONFIG])
     def test_to_device(self, config):
@@ -81,6 +84,7 @@ def test_to_device(self, config):
             quantize_(linear, config)
             linear.to(device)

+    @skip_if_rocm("ROCm enablement in progress")
     @parametrize("config", [BF16_ACT_CONFIG])
     def test_module_path(self, config):
         linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
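
For reference, a minimal sketch of how the newly gated tests are expected to behave (not part of the commit). It assumes skip_if_rocm(msg) from torchao.testing.utils returns a decorator that skips the test on ROCm builds; the class and test body below are illustrative only.

# Hedged sketch, not part of the commit: gating a CUDA-only test on ROCm.
import torch
from torch.testing._internal.common_utils import TestCase, run_tests

from torchao.testing.utils import skip_if_rocm  # import added by this commit's diff


class ExampleMarlinSparseTest(TestCase):  # illustrative name, not from the diff
    @skip_if_rocm("ROCm enablement in progress")  # assumption: skips only on ROCm builds
    def test_linear_bf16(self):
        if not torch.cuda.is_available():
            self.skipTest("CUDA not available")
        linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
        self.assertEqual(linear.weight.dtype, torch.bfloat16)


if __name__ == "__main__":
    run_tests()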

test/quantization/quantize_/workflows/int4/test_int4_preshuffled_tensor.py

Lines changed: 1 addition & 1 deletion
@@ -33,8 +33,8 @@
     version=2,
 )

+# only 128 group_size is supported
 FP8_ACT_CONFIG = Float8DynamicActivationInt4WeightConfig(
-    group_size=128,
     packing_format="preshuffled",
 )
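
As a consequence of the removed argument above, group_size can no longer be passed to the config. A small sketch (illustrative, assuming the config is a plain dataclass, as the field-style defaults in the quant_api.py hunk later in this diff suggest):

# Hedged sketch, not part of the commit: the config no longer accepts group_size.
from torchao.quantization.quant_api import Float8DynamicActivationInt4WeightConfig

# Still valid: only the packing format is configurable.
config = Float8DynamicActivationInt4WeightConfig(packing_format="preshuffled")

# Assumption: since the group_size field was removed, passing it now fails.
try:
    Float8DynamicActivationInt4WeightConfig(group_size=128)
except TypeError as e:
    print(f"group_size is no longer a config field: {e}")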

test/quantization/test_qat.py

Lines changed: 1 addition & 1 deletion
@@ -1927,7 +1927,7 @@ def test_quantize_api_fp8_int4(self):
         quantize_(model, QATConfig(Float8DynamicActivationInt4WeightConfig(), step="convert"))
         """
         self._test_quantize_api_against_ptq(
-            Float8DynamicActivationInt4WeightConfig(group_size=128),
+            Float8DynamicActivationInt4WeightConfig(),
             target_prepare_sqnr=15,
             target_convert_sqnr=float("inf"),
         )

torchao/csrc/cuda/mx_kernels/mxfp8_quantize.cuh

Lines changed: 1 addition & 1 deletion
@@ -451,7 +451,7 @@ __device__ __forceinline__ OType torchao_quantize_value(float input_value,
  * Template parameters ensure compile-time array size checking for safety
  */
 template <typename OType, int NUM_VALUES, ScaleCalculationMode ScalingMode>
-__device__ __forceinline__ float
+__device__ __forceinline__ void
 quantize_block(float amax, e8m0_t &out_scale,
                const float (&input_values)[NUM_VALUES],
                OType (&output_values)[NUM_VALUES]) {

torchao/quantization/quant_api.py

Lines changed: 3 additions & 3 deletions
@@ -1156,13 +1156,13 @@ def _int4_weight_only_transform(
 class Float8DynamicActivationInt4WeightConfig(AOBaseConfig):
     """Configuration for apply float8 dynamic per row quantization and int4
     per group weight quantization to linear
+    (only group_size 128 is supported right now since underlying kernel used only supports 128
+    and above and no benefits of making it bigger)

     Args:
-        `group_size`: group size for groupwise quantization for weight
         `packing_format`: how the weight is packed, only preshuffled is supported
     """

-    group_size: int = 128
     packing_format: PackingFormat = "preshuffled"


@@ -1174,13 +1174,13 @@ def _float8_dynamic_activation_int4_weight_transform(
         "applying int8 weight only quant requires module to have weight attribute"
         + " but {module} does not have one"
     )
-    group_size = config.group_size
     packing_format = config.packing_format

     assert packing_format == "preshuffled", (
         f"only preshuffled packing_format supported right now, got: {packing_format}"
     )
     weight = module.weight
+    group_size = 128
     block_size = tuple([1 for _ in range(weight.ndim - 1)] + [group_size])
     new_weight = Int4PreshuffledTensor.from_hp(
         module.weight,
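
Taken together, the quant_api.py change above hardcodes the group size inside the transform instead of reading it from the config. A hedged sketch (not part of the commit) of the net effect on callers, following the block_size construction shown in the diff:

# Hedged sketch, not part of the commit: group_size is fixed at 128 internally.
import torch

from torchao.quantization.quant_api import Float8DynamicActivationInt4WeightConfig

# Only the packing format remains configurable on the config itself.
config = Float8DynamicActivationInt4WeightConfig(packing_format="preshuffled")

# Inside _float8_dynamic_activation_int4_weight_transform, group_size is now 128,
# so a 2-D weight of shape (out_features, in_features) gets block_size (1, 128):
weight = torch.randn(256, 512, dtype=torch.bfloat16)
group_size = 128
block_size = tuple([1 for _ in range(weight.ndim - 1)] + [group_size])
assert block_size == (1, 128)

# Applying the config is unchanged, e.g. quantize_(linear, config) as in the tests
# above, but it needs a GPU with the preshuffled int4 kernels, so it is not run here.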
