
Commit d2a0328

add inference only roofline

1 parent 596da93 · commit d2a0328

File tree

3 files changed: 167 additions, 33 deletions

.gitignore
benchmarks/float8/float8_roofline.py
torchao/testing/training/roofline_utils.py

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -34,6 +34,7 @@ aten/build/
 aten/src/ATen/Config.h
 aten/src/ATen/cuda/CUDAConfig.h
 benchmarks/.data
+benchmarks/data
 caffe2/cpp_test/
 dist/
 docs/build/

benchmarks/float8/float8_roofline.py

Lines changed: 57 additions & 21 deletions

@@ -66,6 +66,8 @@
 from torchao.testing.training.roofline_utils import (
     get_float8_mem_sympy,
     get_gemm_time_sympy,
+    get_inference_float8_mem_sympy,
+    get_inference_gemm_time_sympy,
 )
 from torchao.utils import is_MI300

@@ -206,21 +208,32 @@ def run(
     n_limit: Optional[int] = None,
     float8_recipe_name: Optional[str] = None,
     mx_recipe_name: Optional[str] = None,
+    nvfp4_recipe_name: Optional[str] = None,
     enable_fusion_modeling: bool = False,
+    inference_only: bool = False,
 ):
     """
     Args:
     * `do_benchmarks`: if True, gemm and e2e fwd+bwd of LNLinearSigmoid are benchmarked
     * `shape_gen_name`: `llama`, `pow2`, `pow2_extended`, or `sweep`
     * `gemm_cache_filename (optional)`: file to cache gemm benchmark results
     * `n_limit (optional)`: if specified, only runs `n_limit` iterations
+    * `float8_recipe_name (optional)`: float8 quantization recipe
+    * `mx_recipe_name (optional)`: MX format recipe
+    * `nvfp4_recipe_name (optional)`: NVFP4 format recipe
     * `enable_fusion_modeling`: if False uses Linear, if True uses LNLinearSigmoid and models the fusion of float8 overhead
+    * `inference_only`: if True, only models inference (forward pass), not training
     """

-    assert not ((float8_recipe_name is not None) and (mx_recipe_name is not None)), (
-        "unsupported"
+    # Handle recipe specification
+    recipe_count = sum(
+        x is not None for x in [float8_recipe_name, mx_recipe_name, nvfp4_recipe_name]
     )
-    if float8_recipe_name is None and mx_recipe_name is None:
+
+    # Ensure only one recipe type is specified for single runs
+    assert recipe_count <= 1, "Only one recipe type can be specified at a time"
+
+    if recipe_count == 0:
         float8_recipe_name = "tensorwise"

     print(f"GPU: {torch.cuda.get_device_name(0)}")
@@ -230,28 +243,48 @@ def run(
     print(f"shape_gen_name: {shape_gen_name}")
     print(f"float8_recipe_name: {float8_recipe_name}")
     print(f"mx_recipe_name: {mx_recipe_name}")
+    print(f"nvfp4_recipe_name: {nvfp4_recipe_name}")
     print(f"enable_fusion_modeling: {enable_fusion_modeling}")
+    print(f"inference_only: {inference_only}")

     M, K, N = sympy.symbols("M K N")

-    fp8_ovhd_time_sympy = get_float8_mem_sympy(
-        M,
-        K,
-        N,
-        float8_recipe_name,
-        mx_recipe_name,
-        enable_fusion_modeling,
-    )
-    bf16_gemm_time_sympy = get_gemm_time_sympy(
-        M, K, N, torch.bfloat16, None, None, None
-    )
-    lowp_input_dtype = torch.float8_e4m3fn
-    if mx_recipe_name == "mxfp4_cutlass":
-        lowp_input_dtype = torch.float4_e2m1fn_x2
+    # Choose functions based on inference_only flag
+    if inference_only:
+        fp8_ovhd_time_sympy = get_inference_float8_mem_sympy(
+            M, K, N, float8_recipe_name, mx_recipe_name, nvfp4_recipe_name
+        )
+        bf16_gemm_time_sympy = get_inference_gemm_time_sympy(
+            M, K, N, torch.bfloat16, None, None
+        )
+        if nvfp4_recipe_name is not None:
+            # Use FP4 for NVFP4 format
+            fp8_gemm_time_sympy = get_inference_gemm_time_sympy(
+                M, K, N, torch.float4_e2m1fn_x2, float8_recipe_name, nvfp4_recipe_name
+            )
+        else:
+            fp8_gemm_time_sympy = get_inference_gemm_time_sympy(
+                M, K, N, torch.float8_e4m3fn, float8_recipe_name, None
+            )
+    else:
+        fp8_ovhd_time_sympy = get_float8_mem_sympy(
+            M,
+            K,
+            N,
+            float8_recipe_name,
+            mx_recipe_name,
+            enable_fusion_modeling,
+        )
+        bf16_gemm_time_sympy = get_gemm_time_sympy(
+            M, K, N, torch.bfloat16, None, None, None
+        )
+        lowp_input_dtype = torch.float8_e4m3fn
+        if mx_recipe_name == "mxfp4_cutlass":
+            lowp_input_dtype = torch.float4_e2m1fn_x2

-    fp8_gemm_time_sympy = get_gemm_time_sympy(
-        M, K, N, lowp_input_dtype, float8_recipe_name, mx_recipe_name, None
-    )
+        fp8_gemm_time_sympy = get_gemm_time_sympy(
+            M, K, N, lowp_input_dtype, float8_recipe_name, mx_recipe_name, None
+        )
     print("bf16_gemm_time_sympy", bf16_gemm_time_sympy)
     print("fp8_gemm_time_sympy", fp8_gemm_time_sympy)
     print("fp8_ovhd_time_sympy", fp8_ovhd_time_sympy)
@@ -397,6 +430,9 @@ def run(
             m_fp8_dyn = torch.compile(m_fp8_dyn)
             b_fp8_e2e_time_s = get_gpu_kernel_time(m_fp8_dyn, x, grad_output)

+        # Calculate roofline speedup
+        roofline_speedup = r_bf16_gemm_time_s / (r_fp8_gemm_time_s + r_fp8_ovhd_time_s)
+
         results.append(
             [
                 M_val,
@@ -409,7 +445,7 @@ def run(
                 r_fp8_ovhd_time_s,
                 # roofline - gemm + overhead, and speedup
                 r_fp8_gemm_time_s + r_fp8_ovhd_time_s,
-                r_bf16_gemm_time_s / (r_fp8_gemm_time_s + r_fp8_ovhd_time_s),
+                roofline_speedup,
                 # benchmarks - gemm
                 b_bf16_gemm_time_s,
                 b_fp8_gemm_time_s,
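For a sense of how the new flag is meant to be driven, here is a sketch of an inference-only call to run(). All keyword names appear in the diff above; any other arguments the script may require (for example an output file) are omitted, and the values are illustrative. If the script exposes run() through a fire-style command-line interface, the same names would be passed as flags.

# sketch only: roofline-only, inference-only sweep over llama shapes
run(
    do_benchmarks=False,  # skip GPU kernel benchmarks, keep the analytical roofline
    shape_gen_name="llama",
    float8_recipe_name="rowwise",  # at most one of float8/mx/nvfp4 recipes, per the new assert
    inference_only=True,
)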

torchao/testing/training/roofline_utils.py

Lines changed: 109 additions & 12 deletions

@@ -12,6 +12,9 @@
 BYTES_PER_EL_FLOAT4 = 0.5
 BYTES_PER_EL_FLOAT8 = 1
 BYTES_PER_EL_BF16 = 2
+BYTES_PER_EL_FLOAT8_E8M0 = 1
+BYTES_PER_EL_FLOAT32 = 4
+BYTES_PER_EL_FLOAT4 = 0.5

 gpu_name_to_specs = {
     "NVIDIA H100": {
@@ -241,7 +244,7 @@ def get_individual_gemm_time_sympy(
     elif dtype is torch.float4_e2m1fn_x2:
         peak_tops = specs["fp4_peak_tops"]
     else:
-        assert False, "unsupported"
+        assert False, f"unsupported dtype: {dtype}"
     compute_gemm_time_s = gemm_ops / peak_tops / specs["pct_achievable_gemm_tops"]

     # memory bound
@@ -274,7 +277,7 @@ def get_individual_gemm_time_sympy(
     elif dtype is torch.float4_e2m1fn_x2:
         bytes_rw = num_reads * BYTES_PER_EL_FLOAT4 + num_writes * BYTES_PER_EL_BF16
     else:
-        assert False, "unsupported"
+        assert False, f"unsupported dtype: {dtype}"
     mem_gemm_time_s = (
         bytes_rw / specs["peak_mem_bw_bytes_sec"] / specs["pct_achievable_mem_bw"]
     )
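The two hunks above touch the two sides of the roofline model inside get_individual_gemm_time_sympy: a compute-bound time from peak matmul throughput and a memory-bound time from peak bandwidth. The combination step is outside these hunks, but the standard roofline estimate takes whichever bound is slower, along the lines of this sketch (an assumption, not shown in this diff):

import sympy

compute_gemm_time_s, mem_gemm_time_s = sympy.symbols(
    "compute_gemm_time_s mem_gemm_time_s", positive=True
)

# assumed roofline combination: the kernel can run no faster than either
# its compute bound or its memory bound, so take the larger of the two times
gemm_time_s = sympy.Max(compute_gemm_time_s, mem_gemm_time_s)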
@@ -376,27 +379,56 @@ def get_inference_tensor_memory_traffic_ovhd_s(
     dim1,
     tensor_role: str,
     float8_recipe_name: Optional[str],
+    mx_recipe_name: Optional[str],
     fuse_with_prev=False,
 ) -> List[Union[sympy.Symbol, float]]:
     """
     Inference version of `get_tensor_memory_traffic_ovhd_s`.
     The only thing happening here is we quantize the activation.
     """
-    assert float8_recipe_name == "rowwise", "unsupported"
     assert fuse_with_prev is False, "unsupported"
+    assert tensor_role == "input", "inference only quantizes input activations"

     # assumes input bf16, output f8
     numel = dim0 * dim1

     res_bytes = None

-    assert tensor_role == "input"
-    # x_bf16 = ...
-    # kernel 1: x_bf16 -> x_fp8
-    kernel_1_rw = BYTES_PER_EL_BF16 * numel + BYTES_PER_EL_FLOAT8 * numel
-    res_bytes = [
-        kernel_1_rw,
-    ]
+    if float8_recipe_name == "tensorwise":
+        # x_bf16 = ...
+        # kernel 1: x_bf16 -> max_abs_stage_1 -> tmp
+        # kernel 2 (mem traffic not modeled): tmp -> max_abs_stage_2 -> max_abs
+        # kernel 3: x_bf16, max_abs -> to_float8 -> x_fp8
+        # kernel 1: read numel, write 0 (assume size(tmp) ~ 0)
+        kernel_1_rw = BYTES_PER_EL_BF16 * numel
+        # kernel 3: read in bf16, write in float8
+        kernel_3_rw = BYTES_PER_EL_BF16 * numel + BYTES_PER_EL_FLOAT8 * numel
+        res_bytes = [kernel_1_rw, kernel_3_rw]
+
+    elif float8_recipe_name == "rowwise":
+        # x_bf16 = ...
+        # kernel 1: x_bf16 -> x_fp8 (with per-row scaling)
+        kernel_1_rw = BYTES_PER_EL_BF16 * numel + BYTES_PER_EL_FLOAT8 * numel
+        # add in the bytes for scale writes
+        kernel_1_rw += BYTES_PER_EL_FLOAT32 * dim0
+        res_bytes = [kernel_1_rw]
+
+    elif mx_recipe_name in ("mxfp8_emulated", "mxfp8_cublas", "mxfp8_cublas_rceil"):
+        # x_bf16 = ...
+        # kernel 1: x_bf16 -> x_mxfp8 (block-wise scaling for inference)
+        kernel_1_rw = BYTES_PER_EL_BF16 * numel + BYTES_PER_EL_FLOAT8 * numel
+        # add in the bytes for scale writes
+        kernel_1_rw += BYTES_PER_EL_FLOAT8_E8M0 * dim0 * (dim1 // 32)
+        res_bytes = [kernel_1_rw]
+
+    else:
+        # For NVFP4, assume minimal overhead since it's primarily a compute format
+        # x_bf16 = ...
+        # kernel 1: x_bf16 -> x_nvfp4 (per-tensor scaling for inference)
+        kernel_1_rw = BYTES_PER_EL_BF16 * numel + BYTES_PER_EL_FLOAT4 * numel
+        # add minimal scaling overhead (per-tensor scale)
+        kernel_1_rw += BYTES_PER_EL_FLOAT32  # single scale factor
+        res_bytes = [kernel_1_rw]

     # convert from bytes to seconds
     res_s = [
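To make the per-recipe traffic model above concrete, the following sketch (not part of the commit) plugs an illustrative activation shape into the same byte formulas; the hunk then divides these byte counts by achievable memory bandwidth to turn them into seconds.

# illustrative activation: dim0 x dim1 = 16384 x 4096, quantized from bf16
BYTES_PER_EL_BF16 = 2
BYTES_PER_EL_FLOAT8 = 1
BYTES_PER_EL_FLOAT32 = 4
BYTES_PER_EL_FLOAT8_E8M0 = 1

dim0, dim1 = 16384, 4096
numel = dim0 * dim1  # 67,108,864 elements

# rowwise float8: read bf16, write fp8, plus one fp32 scale per row
rowwise_bytes = (
    BYTES_PER_EL_BF16 * numel
    + BYTES_PER_EL_FLOAT8 * numel
    + BYTES_PER_EL_FLOAT32 * dim0
)  # 201,392,128 bytes

# mxfp8: read bf16, write fp8, plus one e8m0 scale per 32-element block
mxfp8_bytes = (
    BYTES_PER_EL_BF16 * numel
    + BYTES_PER_EL_FLOAT8 * numel
    + BYTES_PER_EL_FLOAT8_E8M0 * dim0 * (dim1 // 32)
)  # 203,423,744 bytes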
@@ -410,11 +442,75 @@ def get_inference_tensor_memory_traffic_ovhd_s(
     return res_s


+# def get_inference_tensor_memory_traffic_ovhd_bytes(
+#     dim0,
+#     dim1,
+#     tensor_role: str,
+#     float8_recipe_name: Optional[str],
+#     mx_recipe_name: Optional[str],
+#     fuse_with_prev=False,
+# ) -> int:
+#     """
+#     Get total bytes transferred for inference quantization overhead (bytes only, no time conversion).
+#     """
+#     assert fuse_with_prev is False, "unsupported"
+#     assert tensor_role == "input", "inference only quantizes input activations"
+
+#     numel = dim0 * dim1
+
+#     if float8_recipe_name == "tensorwise":
+#         # kernel 1: read numel in bf16
+#         kernel_1_rw = BYTES_PER_EL_BF16 * numel
+#         # kernel 3: read in bf16, write in float8
+#         kernel_3_rw = BYTES_PER_EL_BF16 * numel + BYTES_PER_EL_FLOAT8 * numel
+#         total_bytes = kernel_1_rw + kernel_3_rw
+
+#     elif float8_recipe_name == "rowwise":
+#         # kernel 1: read bf16, write fp8 + scales
+#         kernel_1_rw = BYTES_PER_EL_BF16 * numel + BYTES_PER_EL_FLOAT8 * numel
+#         kernel_1_rw += BYTES_PER_EL_FLOAT32 * dim0
+#         total_bytes = kernel_1_rw
+
+#     elif mx_recipe_name in ("mxfp8_emulated", "mxfp8_cublas", "mxfp8_cublas_rceil"):
+#         # kernel 1: read bf16, write fp8 + block scales
+#         kernel_1_rw = BYTES_PER_EL_BF16 * numel + BYTES_PER_EL_FLOAT8 * numel
+#         kernel_1_rw += BYTES_PER_EL_FLOAT8_E8M0 * dim0 * (dim1 // 32)
+#         total_bytes = kernel_1_rw
+
+#     else:
+#         raise ValueError(f"Unsupported recipe for inference roofline: float8={float8_recipe_name}, mx={mx_recipe_name}")
+
+#     return total_bytes
+
+
+# def get_inference_float8_mem_bytes_sympy(
+#     M,
+#     K,
+#     N,
+#     float8_recipe_name: Optional[str],
+#     mx_recipe_name: Optional[str] = None,
+# ):
+#     """Get total bytes transferred for inference FP8 quantization overhead."""
+#     # input @ weight_t = output
+#     # MxK @ KxN => MxN
+#     total_bytes = get_inference_tensor_memory_traffic_ovhd_bytes(
+#         M,
+#         K,
+#         tensor_role="input",
+#         float8_recipe_name=float8_recipe_name,
+#         mx_recipe_name=mx_recipe_name,
+#         fuse_with_prev=False,
+#     )
+#     return total_bytes
+
+
 def get_inference_float8_mem_sympy(
     M,
     K,
     N,
     float8_recipe_name: Optional[str],
+    mx_recipe_name: Optional[str] = None,
+    nvfp4_recipe_name: Optional[str] = None,
     gpu_name: Optional[str] = None,
 ):
     specs = get_specs(gpu_name)
@@ -426,6 +522,7 @@ def get_inference_float8_mem_sympy(
         K,
         tensor_role="input",
         float8_recipe_name=float8_recipe_name,
+        mx_recipe_name=mx_recipe_name,
         fuse_with_prev=False,
     )
     res = sum([*fwd_fp8_input_mem])
@@ -438,9 +535,9 @@ def get_inference_gemm_time_sympy(
     N: sympy.Symbol,
     dtype,
     float8_recipe_name: Optional[str],
-    gpu_name: Optional[str],
+    nvfp4_recipe_name: Optional[str] = None,
+    gpu_name: Optional[str] = None,
 ):
-    assert float8_recipe_name == "rowwise" or float8_recipe_name is None, "unsupported"
     # note: this function is currently not super accurate for small shapes:
     # when M,K,N <= 1k,1k,1k it undercounts by around 2x
     gemm_output_time_s = get_individual_gemm_time_sympy(M, K, N, dtype, None, gpu_name)
