Skip to content

Commit b83bab1

Browse files
authored
gguf-py : fix and simplify quantized shape round-trip (#7483)
* gguf-py : fix and simplify quantized shape round-trip
* gguf-py : remove unused import
1 parent: d041d2c · commit: b83bab1

File tree

5 files changed

+27
-14
lines changed

5 files changed

+27
-14
lines changed

convert-hf-to-gguf.py

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -313,11 +313,10 @@ def write_tensors(self):
313313
data = data.astype(np.float32)
314314
data_qtype = gguf.GGMLQuantizationType.F32
315315

316-
block_size, type_size = gguf.GGML_QUANT_SIZES[data_qtype]
316+
shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
317+
317318
# reverse shape to make it similar to the internal ggml dimension order
318-
shape_str = f"""{{{', '.join(str(n) for n in reversed(
319-
(*data.shape[:-1], data.shape[-1] * data.dtype.itemsize // type_size * block_size))
320-
)}}}"""
319+
shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
321320

322321
# n_dims is implicit in the shape
323322
logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")

gguf-py/gguf/gguf_reader.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,8 @@
1212
import numpy as np
1313
import numpy.typing as npt
1414

15+
from .quants import quant_shape_to_byte_shape
16+
1517
if __name__ == "__main__":
1618
import sys
1719
from pathlib import Path
@@ -251,6 +253,7 @@ def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None:
251253
tensor_names.add(tensor_name)
252254
ggml_type = GGMLQuantizationType(raw_dtype[0])
253255
n_elems = int(np.prod(dims))
256+
np_dims = tuple(reversed(dims.tolist()))
254257
block_size, type_size = GGML_QUANT_SIZES[ggml_type]
255258
n_bytes = n_elems * type_size // block_size
256259
data_offs = int(start_offs + offset_tensor[0])
@@ -279,14 +282,15 @@ def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None:
279282
else:
280283
item_count = n_bytes
281284
item_type = np.uint8
285+
np_dims = quant_shape_to_byte_shape(np_dims, ggml_type)
282286
tensors.append(ReaderTensor(
283287
name = tensor_name,
284288
tensor_type = ggml_type,
285289
shape = dims,
286290
n_elements = n_elems,
287291
n_bytes = n_bytes,
288292
data_offset = data_offs,
289-
data = self._get(data_offs, item_type, item_count),
293+
data = self._get(data_offs, item_type, item_count).reshape(np_dims),
290294
field = field,
291295
))
292296
self.tensors = tensors

gguf-py/gguf/gguf_writer.py

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,6 @@
1313
import numpy as np
1414

1515
from .constants import (
16-
GGML_QUANT_SIZES,
1716
GGUF_DEFAULT_ALIGNMENT,
1817
GGUF_MAGIC,
1918
GGUF_VERSION,
@@ -26,6 +25,8 @@
2625
TokenType,
2726
)
2827

28+
from .quants import quant_shape_from_byte_shape
29+
2930
logger = logging.getLogger(__name__)
3031

3132

@@ -229,10 +230,7 @@ def add_tensor_info(
229230
else:
230231
dtype = raw_dtype
231232
if tensor_dtype == np.uint8:
232-
block_size, type_size = GGML_QUANT_SIZES[raw_dtype]
233-
if tensor_shape[-1] % type_size != 0:
234-
raise ValueError(f"Quantized tensor row size ({tensor_shape[-1]}) is not a multiple of {dtype.name} type size ({type_size})")
235-
tensor_shape = tuple(tensor_shape[:-1]) + (tensor_shape[-1] // type_size * block_size,)
233+
tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype)
236234
n_dims = len(tensor_shape)
237235
self.ti_data += self._pack("I", n_dims)
238236
for i in range(n_dims):

gguf-py/gguf/quants.py

Lines changed: 15 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
from __future__ import annotations
2-
from typing import Callable
2+
from typing import Callable, Sequence
33

44
from numpy.typing import DTypeLike
55

@@ -9,6 +9,20 @@
99
import numpy as np
1010

1111

12+
def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
13+
block_size, type_size = GGML_QUANT_SIZES[quant_type]
14+
if shape[-1] % block_size != 0:
15+
raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})")
16+
return (*shape[:-1], shape[-1] // block_size * type_size)
17+
18+
19+
def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
20+
block_size, type_size = GGML_QUANT_SIZES[quant_type]
21+
if shape[-1] % type_size != 0:
22+
raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})")
23+
return (*shape[:-1], shape[-1] // type_size * block_size)
24+
25+
1226
# same as ggml_compute_fp32_to_bf16 in ggml-impl.h
1327
def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
1428
n = n.astype(np.float32, copy=False).view(np.int32)

gguf-py/scripts/gguf-new-metadata.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -118,9 +118,7 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new
118118

119119
for tensor in reader.tensors:
120120
total_bytes += tensor.n_bytes
121-
# Dimensions are written in reverse order, so flip them first
122-
shape = np.flipud(tensor.shape).tolist()
123-
writer.add_tensor_info(tensor.name, shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
121+
writer.add_tensor_info(tensor.name, tensor.data.shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
124122

125123
bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
126124

0 commit comments

Comments
 (0)