
Commit eef2a99

[Executorch][llama] Renamed quantized_kv_cache to custom_kv_cache
Because the old name was a misnomer.

Differential Revision: [D71833067](https://our.internmc.facebook.com/intern/diff/D71833067/)

ghstack-source-id: 276640305
Pull Request resolved: #9944
1 parent 88944a8 commit eef2a99
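
Only the module path changes with this rename; the transform functions and cache classes keep their names. Below is a minimal, illustrative sketch of an import updated to the new path (all imported names are taken from the diffs that follow; the commented usage line and the `model` variable are hypothetical):

# Import the KV-cache source transforms from the renamed module.
from executorch.examples.models.llama.source_transformation.custom_kv_cache import (
    CustomKVCache,
    QuantizedCacheType,
    QuantizedKVCache,
    replace_kv_cache_with_custom_kv_cache,
    replace_kv_cache_with_quantized_kv_cache,
)

# Hypothetical usage: swap the eager KVCache modules for the custom
# implementation before export.
# model = replace_kv_cache_with_custom_kv_cache(model)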

File tree

7 files changed: +15 −15 lines changed

examples/models/llama/TARGETS

Lines changed: 5 additions & 5 deletions
@@ -108,7 +108,7 @@ runtime.python_library(
         "source_transformation/pre_quantization.py",
         "source_transformation/prune_vocab.py",
         "source_transformation/quantize.py",
-        "source_transformation/quantized_kv_cache.py",
+        "source_transformation/custom_kv_cache.py",
         "source_transformation/rms_norm.py",
         "source_transformation/rope.py",
         "source_transformation/sdpa.py",
@@ -208,9 +208,9 @@ runtime.python_library(
 )
 
 runtime.python_library(
-    name = "quantized_kv_cache",
+    name = "custom_kv_cache",
     srcs = [
-        "source_transformation/quantized_kv_cache.py",
+        "source_transformation/custom_kv_cache.py",
     ],
     _is_external_target = True,
     visibility = ["//executorch/..."],
@@ -240,7 +240,7 @@ runtime.python_test(
         "//executorch/extension/llm/custom_ops:custom_ops_aot_lib",
     ],
     deps = [
-        ":quantized_kv_cache",
+        ":custom_kv_cache",
         "//caffe2:torch",
         "//executorch/examples/models/llama:llama_transformer",
     ],
@@ -255,7 +255,7 @@ runtime.python_test(
         "//executorch/extension/llm/custom_ops:custom_ops_aot_lib",
     ],
     deps = [
-        ":quantized_kv_cache",
+        ":custom_kv_cache",
         ":sdpa",
         "//caffe2:torch",
         "//executorch/examples/models/llama:llama_transformer",

examples/models/llama/export_llama_lib.py

Lines changed: 4 additions & 4 deletions
@@ -59,14 +59,14 @@
 )
 
 from .source_transformation.attention import replace_attention_to_attention_sha
+from .source_transformation.custom_kv_cache import (
+    replace_kv_cache_with_custom_kv_cache,
+    replace_kv_cache_with_quantized_kv_cache,
+)
 from .source_transformation.quantize import (
     get_quant_embedding_transform,
     get_quant_weight_transform,
 )
-from .source_transformation.quantized_kv_cache import (
-    replace_kv_cache_with_custom_kv_cache,
-    replace_kv_cache_with_quantized_kv_cache,
-)
 from .source_transformation.rms_norm import replace_rms_norm_with_native_rms_norm
 
 from .source_transformation.rope import materialze_broadcast_of_rope_freq_cis

examples/models/llama/source_transformation/test_quantized_kv_cache.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
 
 from executorch.examples.models.llama.attention import KVCache
 
-from executorch.examples.models.llama.source_transformation.quantized_kv_cache import (
+from executorch.examples.models.llama.source_transformation.custom_kv_cache import (
     QuantizedCacheType,
     QuantizedKVCache,
 )

examples/models/llama/source_transformation/test_sdpa_with_quantized_kv_cache.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
 
 from executorch.examples.models.llama.attention import KVCache
 
-from executorch.examples.models.llama.source_transformation.quantized_kv_cache import (
+from executorch.examples.models.llama.source_transformation.custom_kv_cache import (
     CustomKVCache,
     QuantizedCacheType,
     QuantizedKVCache,

examples/models/llava/export_llava.py

Lines changed: 3 additions & 3 deletions
@@ -20,13 +20,13 @@
     build_args_parser,
     get_quantizer_and_quant_params,
 )
+from executorch.examples.models.llama.source_transformation.custom_kv_cache import (
+    replace_kv_cache_with_custom_kv_cache,
+)
 from executorch.examples.models.llama.source_transformation.quantize import (
     EmbeddingQuantHandler,
     get_quant_weight_transform,
 )
-from executorch.examples.models.llama.source_transformation.quantized_kv_cache import (
-    replace_kv_cache_with_custom_kv_cache,
-)
 from executorch.examples.models.llama.source_transformation.sdpa import (
     replace_sdpa_with_custom_op,
 )

examples/models/llava/model.py

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@
 from executorch.examples.models.llama.llama_transformer import Transformer
 from executorch.examples.models.llama.model_args import ModelArgs
 
-from executorch.examples.models.llama.source_transformation.quantized_kv_cache import (
+from executorch.examples.models.llama.source_transformation.custom_kv_cache import (
     replace_kv_cache_with_custom_kv_cache,
 )
 from executorch.examples.models.llama.source_transformation.sdpa import (
