Commit dfc80ee

gguf.py updates to support v4
1 parent 6db4788 commit dfc80ee

2 files changed

Lines changed: 28 additions & 12 deletions

README.md

Lines changed: 11 additions & 0 deletions
@@ -108,6 +108,17 @@ Optional CUDA/ROCm backends can be enabled with `-DUSE_CUDA=ON` / `-DUSE_ROCM=ON
 
 `t81-convert`, `t81-gguf`, and `t81-qat` automate quantize→export→train flows with progress reporting and validation hooks. Browse [docs/references/cli-usage.md](docs/references/cli-usage.md), [docs/diagrams/cli-workflows-mermaid.md](docs/diagrams/cli-workflows-mermaid.md), and [examples/cli-examples.md](examples/cli-examples.md) for recipes.
 
+## GGUF v4 compliance
+
+t81’s GGUF exports already mirror the llama.cpp conventions; v4’s mandatory `gguf_header` additions are worth calling out for everybody writing their own converter:
+
+- **Header bump** – write `version = 4` instead of 3 so llama.cpp accepts the file and no longer fails with “unsupported version”.
+- **Global alignment metadata** – after `tensor_count`/`kv_count`, emit `alignment` (default 32, a power of two) and `reserved` (0) before the metadata block, and compute tensor padding with `GGML_PAD(size, alignment)` so every tensor data block ends on that boundary.
+- **Tensor padding & metadata rules** – rely on the new alignment field instead of optional metadata keys, keep `general.alignment` a `uint32_t` power of two, and let missing or invalid values fail fast instead of corrupting mmapped loads.
+- **Implementation note** – `struct gguf_header { char magic[4]; uint32_t version; uint64_t tensor_count, kv_count; uint32_t alignment; uint32_t reserved; };` plus explicit `fwrite(&alignment, sizeof(uint32_t), 1, f); fwrite(&reserved, sizeof(uint32_t), 1, f);` immediately after writing `kv_count` is enough to match the official layout.
+
+The eight extra header bytes are negligible even for huge models, but they unlock ARM64-friendly alignment, predictable metadata parsing, and wide compatibility with upcoming llama.cpp releases.
+
 ## Use cases
 
 - Ternary LLM weight quantization and GGUF exports for Hugging Face + `llama.cpp`.
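
To ground the bullets in the new README section, here is a minimal, self-contained Python sketch of the v4 header layout they describe. It is illustrative only, not the project's exporter: `pack_header` and `ggml_pad` are hypothetical names, while the `"<4sIQQII"` field order, the magic/version/alignment/reserved values, and the `GGML_PAD(size, alignment)` rounding rule come from the section above and the `t81/gguf.py` diff below.

```python
import struct

GGUF_MAGIC = b"GGUF"
GGUF_VERSION = 4          # v4 header bump
DEFAULT_ALIGNMENT = 32    # must be a power of two

# Little-endian v4 layout: magic, version, tensor_count, kv_count,
# alignment, reserved -- the same "<4sIQQII" layout gguf.py now uses.
HEADER_STRUCT = struct.Struct("<4sIQQII")


def ggml_pad(size: int, alignment: int = DEFAULT_ALIGNMENT) -> int:
    """Round size up to the next multiple of alignment, like GGML_PAD."""
    return (size + alignment - 1) & ~(alignment - 1)


def pack_header(tensor_count: int, kv_count: int,
                alignment: int = DEFAULT_ALIGNMENT) -> bytes:
    """Pack a GGUF v4 header; the reserved field is always written as zero."""
    if alignment <= 0 or alignment & (alignment - 1):
        raise ValueError("alignment must be a power of two")
    return HEADER_STRUCT.pack(GGUF_MAGIC, GGUF_VERSION,
                              tensor_count, kv_count, alignment, 0)


if __name__ == "__main__":
    header = pack_header(tensor_count=2, kv_count=5)
    assert HEADER_STRUCT.size == len(header) == 32   # fixed-size v4 header
    assert ggml_pad(1000) == 1024                    # payloads land on 32-byte boundaries
```

On the write path, each tensor payload would then be padded out to `ggml_pad(len(payload))` bytes so the next tensor's data starts on an aligned boundary.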

t81/gguf.py

Lines changed: 17 additions & 12 deletions
@@ -20,10 +20,10 @@
 import t81lib
 from .nn import Linear as TernaryLinear
 
-HEADER_STRUCT = struct.Struct("<4sIIIIQQQQ")
 HEADER_MAGIC = b"GGUF"
-HEADER_VERSION = 0x00000003
+HEADER_VERSION = 4
 HEADER_ALIGNMENT = 32
+HEADER_STRUCT = struct.Struct("<4sIQQII")
 HEADER_SIZE = HEADER_STRUCT.size
 
 GGML_TYPE_TQ1_0 = 250
@@ -236,9 +236,6 @@ def write_gguf(
         len(tensor_payloads),
         metadata_count,
         HEADER_ALIGNMENT,
-        HEADER_ALIGNMENT,
-        tensor_infos_offset,
-        tensor_data_offset,
         0,
     )
 
@@ -257,7 +254,11 @@ def write_gguf(
     handle.write(tensor_data_section)
 
 
-def _parse_metadata(buffer: bytes, offset: int, count: int) -> Mapping[str, Any]:
+def _parse_metadata(
+    buffer: bytes,
+    offset: int,
+    count: int,
+) -> tuple[Mapping[str, Any], int]:
     metadata: dict[str, Any] = {}
     cursor = offset
     for _ in range(count):
@@ -285,7 +286,7 @@ def _parse_metadata(buffer: bytes, offset: int, count: int) -> Mapping[str, Any]
             cursor = value_end + 1
         else:
             raise ValueError(f"unsupported metadata value type {value_type}")
-    return metadata
+    return metadata, cursor
 
 
 def _parse_tensor_infos(buffer: bytes, offset: int, count: int) -> list[_TensorInfo]:
@@ -343,15 +344,19 @@ def read_gguf(path: str | Path, *, dequantize: bool = True) -> Mapping[str, torc
         version,
         num_tensors,
         metadata_kv_count,
-        _metadata_alignment,
-        _tensor_alignment,
-        tensor_infos_offset,
-        tensor_data_offset,
+        alignment,
+        reserved,
     ) = HEADER_STRUCT.unpack_from(buffer, 0)
     if magic != HEADER_MAGIC or version != HEADER_VERSION:
         raise ValueError("GGUF header mismatch")
+    if alignment != HEADER_ALIGNMENT:
+        raise ValueError("unsupported GGUF alignment")
+    if reserved != 0:
+        raise ValueError("reserved header field must be zero")
 
-    metadata = _parse_metadata(buffer, HEADER_SIZE, metadata_kv_count)
+    metadata, metadata_end = _parse_metadata(buffer, HEADER_SIZE, metadata_kv_count)
+    metadata_size = metadata_end - HEADER_SIZE
+    tensor_infos_offset = _align(HEADER_SIZE + metadata_size)
     block_rows = int(metadata.get("quantization.block_size", GGUF_QUANT_BLOCK_ROWS))
     tensor_infos = _parse_tensor_infos(buffer, tensor_infos_offset, num_tensors)
     sorted_infos = sorted(tensor_infos, key=lambda info: info.offset)
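
As a reader-side companion to the hunks above, the following sketch shows the validation the two new `uint32` fields make possible. The names are hypothetical, and the power-of-two check follows the README's general rule, whereas `read_gguf` in the diff pins `alignment` to `HEADER_ALIGNMENT` exactly; the point is the same fail-fast behavior.

```python
import struct

HEADER_MAGIC = b"GGUF"
HEADER_VERSION = 4
HEADER_STRUCT = struct.Struct("<4sIQQII")


def read_header(buffer: bytes) -> tuple[int, int, int]:
    """Unpack a v4 header and return (tensor_count, kv_count, alignment)."""
    magic, version, tensor_count, kv_count, alignment, reserved = (
        HEADER_STRUCT.unpack_from(buffer, 0)
    )
    if magic != HEADER_MAGIC or version != HEADER_VERSION:
        raise ValueError("GGUF header mismatch")
    # v4 rules: alignment is a non-zero power of two, reserved stays zero.
    if alignment == 0 or alignment & (alignment - 1):
        raise ValueError("alignment must be a non-zero power of two")
    if reserved != 0:
        raise ValueError("reserved header field must be zero")
    return tensor_count, kv_count, alignment


if __name__ == "__main__":
    buf = HEADER_STRUCT.pack(HEADER_MAGIC, HEADER_VERSION, 2, 5, 32, 0)
    print(read_header(buf))  # -> (2, 5, 32)
```

After this check, the tensor-info block begins at the first aligned offset past the metadata, which is what `tensor_infos_offset = _align(HEADER_SIZE + metadata_size)` computes in the diff (assuming `_align` rounds up to `HEADER_ALIGNMENT`).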

0 commit comments
