|
31 | 31 | "QuantizationType", |
32 | 32 | "QuantizationStrategy", |
33 | 33 | "QuantizationArgs", |
34 | | - "round_to_quantized_type", |
| 34 | + "round_to_quantized_type_args", |
| 35 | + "round_to_quantized_type_dtype", |
35 | 36 | "ActivationOrdering", |
36 | 37 | "DynamicType", |
37 | 38 | ] |
@@ -392,47 +393,57 @@ def get_observer(self) -> str: |
392 | 393 | model_config = ConfigDict(extra="forbid") |
393 | 394 |
|
394 | 395 |
|
395 | | -def _round_dtype(tensor: torch.Tensor, dtype: torch.dtype): |
| 396 | +def round_to_quantized_type_dtype( |
| 397 | + tensor: torch.Tensor, dtype: torch.dtype |
| 398 | +) -> torch.Tensor: |
| 399 | + """ |
| 400 | + Rounds an input tensor to the nearest quantized representation given a dtype. |
| 401 | + The original dtype is kept post-rounding. |
| 402 | +
|
| 403 | + :param tensor: tensor to round |
| 404 | + :param dtype: dtype to use for rounding |
| 405 | + :return: rounded tensor |
| 406 | + """ |
| 407 | + original_dtype = tensor.dtype |
396 | 408 | if torch.is_floating_point(torch.tensor([], dtype=dtype)): |
397 | 409 | finfo = torch.finfo(dtype) |
398 | | - return torch.clamp(tensor, finfo.min, finfo.max).to(dtype) |
| 410 | + rounded = torch.clamp(tensor, finfo.min, finfo.max).to(dtype) |
399 | 411 | else: |
400 | 412 | iinfo = torch.iinfo(dtype) |
401 | | - return torch.round(torch.clamp(tensor, iinfo.min, iinfo.max)) |
402 | | - |
| 413 | + rounded = torch.round(torch.clamp(tensor, iinfo.min, iinfo.max)) |
403 | 414 |
|
404 | | -def _round_args(tensor: torch.Tensor, args: QuantizationArgs): |
405 | | - if args.type == QuantizationType.FLOAT: |
406 | | - if args.num_bits == 8: |
407 | | - return tensor.to(FP8_E4M3_DATA.dtype) |
408 | | - elif args.num_bits == 4: |
409 | | - return FP4_E2M1_DATA.cast_to_fp4(tensor) |
410 | | - else: |
411 | | - raise NotImplementedError("Only num_bits in (4, 8) are supported") |
412 | | - elif args.type == QuantizationType.INT: |
413 | | - return torch.round(tensor) |
414 | | - else: |
415 | | - raise ValueError(f"Invalid quantization type {args.type}") |
| 415 | + return rounded.to(original_dtype) |
416 | 416 |
|
417 | 417 |
|
418 | | -def round_to_quantized_type( |
| 418 | +def round_to_quantized_type_args( |
419 | 419 | tensor: torch.Tensor, |
420 | | - args: Optional[QuantizationArgs] = None, |
421 | | - dtype: Optional[torch.dtype] = None, |
| 420 | + args: QuantizationArgs, |
| 421 | + min: torch.Tensor, |
| 422 | + max: torch.Tensor, |
422 | 423 | ) -> torch.Tensor: |
423 | 424 | """ |
424 | | - Rounds each element of the input tensor to the nearest quantized representation, |
425 | | - keeping to original dtype. This can be done given QuantizationArgs or dtype |
| 425 | + Rounds an input tensor to the nearest quantized representation given |
| 426 | + quantization args. The original dtype is kept post-rounding. |
426 | 427 |
|
427 | 428 | :param tensor: tensor to round |
428 | | - :param args: QuantizationArgs to pull appropriate dtype from |
429 | | - :param dtype: dtype to use for rounding |
| 429 | + :param args: quantization args to use for rounding |
| 430 | + :param min: min value to use for clamping |
| 431 | + :param max: max value to use for clamping |
430 | 432 | :return: rounded tensor |
431 | 433 | """ |
| 434 | + |
432 | 435 | original_dtype = tensor.dtype |
433 | | - if dtype is not None: |
434 | | - rounded = _round_dtype(tensor=tensor, dtype=dtype) |
435 | | - elif args is not None: |
436 | | - rounded = _round_args(tensor=tensor, args=args) |
| 436 | + tensor = torch.clamp(tensor, min, max) |
| 437 | + if args.type == QuantizationType.FLOAT: |
| 438 | + if args.num_bits == 8: |
| 439 | + rounded = tensor.to(FP8_E4M3_DATA.dtype) |
| 440 | + elif args.num_bits == 4: |
| 441 | + rounded = FP4_E2M1_DATA.cast_to_fp4(tensor) |
| 442 | + else: |
| 443 | + raise NotImplementedError("Only num_bits in (4, 8) are supported") |
| 444 | + elif args.type == QuantizationType.INT: |
| 445 | + rounded = torch.round(tensor) |
| 446 | + else: |
| 447 | + raise ValueError(f"Invalid quantization type {args.type}") |
437 | 448 |
|
438 | 449 | return rounded.to(original_dtype) |
0 commit comments