Update custom op registration and add e2e testing

adabeyta · adabeyta · commit 6ec29a1857ae · 2025-09-26T19:50:29.000Z
diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py
@@ -139,6 +139,21 @@ def test_custom_compile_config(
     run_model(compilation_config, model, model_kwargs)
 
 
+@pytest.mark.parametrize(
+    "optimization_level",
+    [CompilationLevel.NO_COMPILATION, CompilationLevel.PIECEWISE],
+)
+def test_fp8_kv_scale_compile(optimization_level: int):
+    model = "Qwen/Qwen2-0.5B"
+    model_kwargs = {
+        "quantization": "fp8",
+        "kv_cache_dtype": "fp8_e4m3",
+        "calculate_kv_scales": True,
+        "max_model_len": 512,
+    }
+    run_model(optimization_level, model, model_kwargs)
+
+
 def test_inductor_graph_partition_attn_fusion(caplog_vllm):
     if not is_torch_equal_or_newer("2.9.0.dev"):
         pytest.skip("inductor graph partition is only available "
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
@@ -586,10 +586,8 @@ def maybe_calc_kv_scales_fake(
 direct_register_custom_op(
     op_name="maybe_calc_kv_scales",
     op_func=maybe_calc_kv_scales,
-    mutates_args=[],
+    mutates_args=["query", "key", "value"],
     fake_impl=maybe_calc_kv_scales_fake,
-    dispatch_key=current_platform.dispatch_key,
-    tags=tag_cudagraph_unsafe,
 )