Skip to content

Commit 2e5648a

Browse files
Merge pull request vllm-project#32 from luo-cheng2021/luocheng/pa-kv-u8-desc
[CPU] Add comment for u8 kvcache layout
2 parents 469a4d0 + 560c2ce commit 2e5648a

File tree

1 file changed

+6
-0
lines changed

1 file changed

+6
-0
lines changed

vllm/executor/openvino_executor.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,9 @@ def __init__(
6060
self.head_size = model_config.get_head_size()
6161
if device_config.device.type == "cpu":
6262
if cache_config.cache_dtype == "u8":
63+
# Scale, zero point and quantized data are stored together, so each head needs 8 extra bytes for the two f32 values.
64+
# The layout per token, per head, is:
65+
# |scale(f32)|zeropoint(f32)|quantized data(u8,idx_1)|quantized data(u8,idx_2)|...|quantized data(u8,idx_head_size)|
6366
self.head_size += 8
6467
self.num_layers = model_config.get_num_layers(parallel_config)
6568
self.num_heads = model_config.get_num_kv_heads(parallel_config)
@@ -191,6 +194,9 @@ def get_cache_block_size(
191194
head_size = model_config.get_head_size()
192195
if device_config.device.type == "cpu":
193196
if cache_dtype == "u8":
197+
# Scale, zero point and quantized data are stored together, so each head needs 8 extra bytes for the two f32 values.
198+
# The layout per token, per head, is:
199+
# |scale(f32)|zeropoint(f32)|quantized data(u8,idx_1)|quantized data(u8,idx_2)|...|quantized data(u8,idx_head_size)|
194200
head_size += 8
195201
num_heads = model_config.get_num_kv_heads(parallel_config)
196202
num_layers = model_config.get_num_layers(parallel_config)

0 commit comments

Comments
 (0)