Merge pull request vllm-project#32 from luo-cheng2021/luocheng/pa-kv-u8-desc

ilya-lavrenov · web-flow · commit 2e5648ab25a0 · 2024-04-23T11:29:19.000+04:00
[CPU] Add comment for u8 kvcache layout
diff --git a/vllm/executor/openvino_executor.py b/vllm/executor/openvino_executor.py
@@ -60,6 +60,9 @@ def __init__(
         self.head_size = model_config.get_head_size()
         if device_config.device.type == "cpu":
             if cache_config.cache_dtype == "u8":
+                # Scale, zero point and quantized data are stored together; the two
+                # f32 values add 8 bytes per head. Layout per token, per head:
+                # |scale(f32)|zeropoint(f32)|quantized data(u8,idx_1)|quantized data(u8,idx_2)|...|quantized data(u8,idx_head_size)|
                 self.head_size += 8
         self.num_layers = model_config.get_num_layers(parallel_config)
         self.num_heads = model_config.get_num_kv_heads(parallel_config)
@@ -191,6 +194,9 @@ def get_cache_block_size(
         head_size = model_config.get_head_size()
         if device_config.device.type == "cpu":
             if cache_dtype == "u8":
+                # Scale, zero point and quantized data are stored together; the two
+                # f32 values add 8 bytes per head. Layout per token, per head:
+                # |scale(f32)|zeropoint(f32)|quantized data(u8,idx_1)|quantized data(u8,idx_2)|...|quantized data(u8,idx_head_size)|
                 head_size += 8
         num_heads = model_config.get_num_kv_heads(parallel_config)
         num_layers = model_config.get_num_layers(parallel_config)