begin metal implementation

t81dev · t81dev · commit 6db53f0eda38 · 2026-01-19T10:21:34.000-05:00
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 build/
+build-*/
 dist/
 *.o
 *.obj
@@ -14,6 +15,13 @@ pipx_home/
 pipx_logs/
 t81lib.egg-info/
 
+# CMake artifacts
+CMakeCache.txt
+CMakeFiles/
+cmake_install.cmake
+CTestTestfile.cmake
+Makefile
+
 # Python runtime artifacts
 __pycache__/
 *.py[cod]
diff --git a/AGENTS.md b/AGENTS.md
@@ -63,3 +63,5 @@ This file helps AI agents discover and understand how to work with this reposito
 - Hardened the SIMD detection helpers in `include/t81/core/detail/simd.hpp` with CPUID/xgetbv fallbacks, documented the `add_trytes_*` overflow semantics, and made NEON runtime checks opt-out via `T81_DISABLE_NEON`.
 - Added the `compression-first` GGUF export profile (metadata + CLI flags), plus `scripts/gguf_benchmark.py` and CLI docs that walk FP16 to ternary GGUF before/after measurements.
 - Added `examples/ternary_phi3_ptq_qat_demo.ipynb` to showcase Phi-3-mini PTQ/QAT size, latency, and perplexity comparisons in one compact notebook.
+- Added Metal pack/quantize kernels (`src/linalg/pack_kernel.metal`, `src/linalg/pack_metal.mm`) plus `include/t81/linalg/pack_gpu.hpp` and Python binding dispatch so PTQ packing can run on Apple Metal when enabled.
+- Documented GGUF helper APIs (`read_gguf`, `repack_gguf`, `dequantize_gguf`) plus the experimental TQ1_1 note in the GGUF and Python docs.
diff --git a/docs/python-api.md b/docs/python-api.md
@@ -13,6 +13,15 @@ This page is the landing spot for the auto-generated Python reference. It is pro
 | `t81.convert` / `t81.gguf` | call the conversion/GGUF helpers programmatically | `from t81 import convert, gguf` |
 | `t81.hardware` | explore ternary hardware emulation helpers | `from t81 import hardware` |
 
+## GGUF helpers (quick reference)
+
+The `t81.gguf` module exposes streaming and compatibility utilities beyond the CLI wrappers:
+
+- `t81.gguf.write_gguf` to emit GGUF bundles from converted models.
+- `t81.gguf.read_gguf` to stream tensor payloads and metadata without loading the full file.
+- `t81.gguf.repack_gguf` to re-quantize existing float16/float32 GGUF bundles into TQ1_0/TQ2_0.
+- `t81.gguf.dequantize_gguf` (plus `t81.dequantize_gguf_to_float`) to rewrite ternary bundles into float GGUF files for broader runtime compatibility.
+
 ## Generating the docs
 
 1. Install the tooling (ideally in a virtual environment):
diff --git a/docs/python-cookbook.md b/docs/python-cookbook.md
@@ -63,3 +63,22 @@ t81-gguf --input model.t81 --validate
 ```
 
 This recipe shows how Python experiments (scripts, notebooks) complement the CLI docs in `docs/references/cli-usage.md`.
+
+## 4. Inspect, repack, or dequantize GGUF bundles in Python
+
+```python
+import numpy as np
+from t81 import gguf
+
+# Stream metadata and tensors without loading the full file into RAM.
+payload, metadata = gguf.read_gguf("model-tq1.gguf", return_metadata=True)
+print(metadata.get("general.architecture"))
+
+# Repack a float GGUF into ternary (float tensors only).
+gguf.repack_gguf("model-f16.gguf", "model-tq1.gguf", quant="TQ1_0", threshold=0.45)
+
+# Convert a ternary GGUF back to float for runtimes without TQ support.
+gguf.dequantize_gguf("model-tq1.gguf", "model-f16.gguf", dtype=np.float16)
+```
+
+Use `t81.dequantize_gguf_to_float` (exported from the top-level `t81` package rather than `t81.gguf`) when you always want float32 output. Separately, set `T81_ENABLE_TQ1_1=1` before using the experimental `tq1_1-draft` profile.
diff --git a/docs/references/gguf.md b/docs/references/gguf.md
@@ -18,3 +18,13 @@ t81 convert meta-llama/Llama-3.2-3B-Instruct llama3.2-3b-t81.gguf \
 ### Export profiles
 
 For a no-knobs compression-first export, use the `compression-first` profile via the CLI (`--gguf-profile` or `--profile`). It stamps `t81.profile=compression-first` in metadata and pins the GGUF quant scheme to TQ1_0 for maximum compression.
+
+### Experimental TQ1_1 profile
+
+`tq1_1-draft` is available for header-size testing only. It requires `T81_ENABLE_TQ1_1=1` and writes payloads that are not yet loadable by llama.cpp, so use it for experiments rather than production GGUF bundles.
+
+### Repacking + dequantizing existing GGUF files
+
+`t81.gguf.repack_gguf` re-quantizes an existing GGUF file (float tensors only) and preserves the metadata, so you can take a float32 or float16 bundle and emit a ternary one without running the full conversion pipeline. For compatibility with runtimes that do not support ternary types, `t81.gguf.dequantize_gguf` (and the convenience `t81.dequantize_gguf_to_float`) converts TQ1_0/TQ2_0 payloads into float32 or float16 GGUF files.
+
+If you need to inspect a GGUF without loading everything into RAM, `t81.gguf.read_gguf` streams metadata and tensor payloads from the file handle and can return raw bytes instead of dequantized tensors.
diff --git a/include/t81/linalg/pack_gpu.hpp b/include/t81/linalg/pack_gpu.hpp
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <span>
+
+#ifndef T81LIB_USE_METAL
+#define T81LIB_USE_METAL 0
+#endif
+
+namespace t81::linalg::detail {
+
+#if T81LIB_USE_METAL  // Declarations exist only when the macro is non-zero; it defaults to 0 earlier in this header.
+void metal_quantize_to_trits(std::span<const float> src,  // Quantizes src floats into dst using threshold. NOTE(review): definition presumably in src/linalg/pack_metal.mm; confirm.
+                             std::span<std::int8_t> dst,  // The Python binding passes a dst span with the same length as src.
+                             float threshold);
+
+void metal_pack_dense_matrix(std::span<const float> src,  // Quantizes and packs a dense float matrix into bytes; the binding passes rows * cols floats in src.
+                             std::span<std::uint8_t> dst,  // Packed output bytes; the binding sizes this from its (rows, limbs_per_row, limb_bytes) array.
+                             int rows,
+                             int cols,
+                             float threshold);
+#endif
+
+} // namespace t81::linalg::detail
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
@@ -64,7 +64,8 @@ endif()
 
 if(USE_METAL)
     target_sources(t81lib_python PRIVATE
-        ${CMAKE_CURRENT_SOURCE_DIR}/../src/linalg/gemm_metal.mm)
+        ${CMAKE_CURRENT_SOURCE_DIR}/../src/linalg/gemm_metal.mm
+        ${CMAKE_CURRENT_SOURCE_DIR}/../src/linalg/pack_metal.mm)
 
     set(METAL_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/../src/linalg/gemm_kernel.metal)
     set(METAL_AIR ${CMAKE_CURRENT_BINARY_DIR}/gemm_kernel.air)
@@ -79,6 +80,20 @@ if(USE_METAL)
     add_custom_target(gemm_metal_shader DEPENDS ${METAL_LIB})
     add_dependencies(t81lib_python gemm_metal_shader)
     target_compile_definitions(t81lib_python PRIVATE GEMM_METAL_LIBRARY_PATH=\"${METAL_LIB}\")
+
+    set(PACK_METAL_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/../src/linalg/pack_kernel.metal)  # Shader source for the pack/quantize kernels.
+    set(PACK_METAL_AIR ${CMAKE_CURRENT_BINARY_DIR}/pack_kernel.air)  # Intermediate file written by `xcrun metal -c`.
+    set(PACK_METAL_LIB ${CMAKE_CURRENT_BINARY_DIR}/pack_kernel.metallib)  # Final library file produced in the build tree.
+
+    add_custom_command(OUTPUT ${PACK_METAL_LIB}  # Two-step shader build: .metal -> .air -> .metallib.
+        COMMAND xcrun metal -c -o ${PACK_METAL_AIR} ${PACK_METAL_SOURCE}
+        COMMAND xcrun metallib -o ${PACK_METAL_LIB} ${PACK_METAL_AIR}
+        DEPENDS ${PACK_METAL_SOURCE}  # Re-run the commands when the shader source changes.
+        COMMENT "Compiling Metal pack shader")
+
+    add_custom_target(pack_metal_shader DEPENDS ${PACK_METAL_LIB})  # Named target that drives the custom command above.
+    add_dependencies(t81lib_python pack_metal_shader)  # Build the shader library before the Python extension target.
+    target_compile_definitions(t81lib_python PRIVATE PACK_METAL_LIBRARY_PATH=\"${PACK_METAL_LIB}\")  # Passes the build-tree metallib path to the sources as a string macro.
 endif()
 
 set_target_properties(t81lib_python PROPERTIES
diff --git a/python/bindings.cpp b/python/bindings.cpp
@@ -24,6 +24,7 @@
 #include <t81/io/format.hpp>
 #include <t81/linalg/gemm.hpp>
 #include <t81/linalg/gemm_gpu.hpp>
+#include <t81/linalg/pack_gpu.hpp>
 #include <t81/tensor_metadata.hpp>
 #include <t81/sparse/simple.hpp>
 #include <t81/t81lib.hpp>
@@ -426,6 +427,17 @@ namespace {
         const std::size_t total = static_cast<std::size_t>(std::max<py::ssize_t>(info.size, 0));
         const auto src = static_cast<const float *>(info.ptr);
         const auto dst = static_cast<std::int8_t *>(output.request().ptr);
+#if T81LIB_USE_METAL
+        if (t81::linalg::detail::metal_available()) {
+            try {
+                std::span<const float> src_span{src, total};
+                std::span<std::int8_t> dst_span{dst, total};
+                t81::linalg::detail::metal_quantize_to_trits(src_span, dst_span, threshold);
+                return output;
+            } catch (const std::exception &) {
+            }
+        }
+#endif
         for (std::size_t index = 0; index < total; ++index) {
             dst[index] = quantize_trit(src[index], threshold);
         }
@@ -463,8 +475,25 @@ namespace {
         py::array_t<std::uint8_t> packed(
             {static_cast<std::size_t>(rows), static_cast<std::size_t>(limbs_per_row), limb_bytes});
         const auto *src = static_cast<const float *>(info.ptr);
-        auto *dst = static_cast<std::uint8_t *>(packed.request().ptr);
+        auto packed_info = packed.request(true);
+        auto *dst = static_cast<std::uint8_t *>(packed_info.ptr);
         const std::size_t row_stride = static_cast<std::size_t>(limbs_per_row) * limb_bytes;
+#if T81LIB_USE_METAL
+        if (t81::linalg::detail::metal_available()) {
+            try {
+                const std::size_t total_src =
+                    static_cast<std::size_t>(rows) * static_cast<std::size_t>(cols);
+                const std::size_t total_dst =
+                    static_cast<std::size_t>(std::max<py::ssize_t>(packed_info.size, 0));
+                std::span<const float> src_span{src, total_src};
+                std::span<std::uint8_t> dst_span{dst, total_dst};
+                t81::linalg::detail::metal_pack_dense_matrix(src_span, dst_span,
+                                                             rows, cols, threshold);
+                return packed;
+            } catch (const std::exception &) {
+            }
+        }
+#endif
         for (int row = 0; row < rows; ++row) {
             const auto *row_ptr =
                 src + static_cast<std::size_t>(row) * static_cast<std::size_t>(cols);
diff --git a/src/linalg/pack_kernel.metal b/src/linalg/pack_kernel.metal
@@ -0,0 +1,76 @@
+#include <metal_stdlib>
+using namespace metal;
+
+struct PackParams {  // Parameters pack_dense_matrix_kernel reads from buffer(2). NOTE(review): layout presumably mirrors a host-side struct; confirm.
+    uint rows;  // Number of matrix rows.
+    uint cols;  // Floats per row; also the row stride used when indexing src.
+    uint limbs_per_row;  // Output limbs per row; threads with gid >= rows * limbs_per_row exit early.
+    uint trits_per_limb;  // Column stride between consecutive limbs of a row.
+    uint limb_bytes;  // Bytes written per limb; each byte encodes three trits.
+    float threshold;  // Cutoff forwarded to quantize_trit.
+};
+
+struct QuantParams {  // Parameters quantize_trits_kernel reads from buffer(2). NOTE(review): layout presumably mirrors a host-side struct; confirm.
+    uint count;  // Number of elements to quantize; threads with gid >= count exit early.
+    float threshold;  // Cutoff forwarded to quantize_trit.
+};
+
+static inline int quantize_trit(float value, float threshold) {  // Maps a float to one of +1, 0, -1 by comparing against +/-threshold.
+    float clamped = clamp(value, -1.0f, 1.0f);  // Restrict to [-1, 1] first, so a threshold above 1 can never yield +1 or -1.
+    if (clamped >= threshold) {  // Tested before the negative branch, so +1 wins when both tests hold (possible only if threshold <= 0).
+        return 1;
+    }
+    if (clamped <= -threshold) {
+        return -1;
+    }
+    return 0;  // Neither cutoff was reached.
+}
+
+kernel void quantize_trits_kernel(device const float *src [[buffer(0)]],  // One thread per element: writes the trit of src[gid] to dst[gid].
+                                  device char *dst [[buffer(1)]],
+                                  constant QuantParams &params [[buffer(2)]],
+                                  uint gid [[thread_position_in_grid]]) {
+    if (gid >= params.count) {  // Guard so the dispatched grid may exceed count without out-of-range access.
+        return;
+    }
+    const int trit = quantize_trit(src[gid], params.threshold);
+    dst[gid] = static_cast<char>(trit);  // Stored as a signed byte holding -1, 0, or +1.
+}
+
+kernel void pack_dense_matrix_kernel(device const float *src [[buffer(0)]],  // One thread per output limb: quantizes source floats and packs three trits into each output byte.
+                                     device uchar *dst [[buffer(1)]],
+                                     constant PackParams &params [[buffer(2)]],
+                                     uint gid [[thread_position_in_grid]]) {
+    const uint total_limbs = params.rows * params.limbs_per_row;
+    if (gid >= total_limbs) {  // Guard so the dispatched grid may exceed the limb count; also exits every thread when limbs_per_row is 0.
+        return;
+    }
+
+    const uint row = gid / params.limbs_per_row;  // Split the flat thread index into (row, limb).
+    const uint limb = gid % params.limbs_per_row;
+    const uint base_col = limb * params.trits_per_limb;  // First source column covered by this limb.
+    const uint out_offset = (row * params.limbs_per_row + limb) * params.limb_bytes;  // Byte offset of this limb in dst; equals gid * limb_bytes.
+
+    for (uint tryte_idx = 0; tryte_idx < params.limb_bytes; ++tryte_idx) {  // One iteration per output byte. NOTE(review): reads 3 * limb_bytes columns without checking trits_per_limb; confirm the host sets trits_per_limb == 3 * limb_bytes.
+        const uint trit_base = tryte_idx * 3u;
+        int t0 = 0;  // Trits default to 0, so columns past the end of the row pack as zero.
+        int t1 = 0;
+        int t2 = 0;
+
+        uint col = base_col + trit_base;
+        if (col < params.cols) {  // Only read source columns that exist in this row.
+            t0 = quantize_trit(src[row * params.cols + col], params.threshold);  // src is indexed row-major with stride cols; the index is computed in uint.
+        }
+        col = base_col + trit_base + 1u;
+        if (col < params.cols) {
+            t1 = quantize_trit(src[row * params.cols + col], params.threshold);
+        }
+        col = base_col + trit_base + 2u;
+        if (col < params.cols) {
+            t2 = quantize_trit(src[row * params.cols + col], params.threshold);
+        }
+
+        const int tryte = t0 + 3 * t1 + 9 * t2 + 13;  // Base-3 digits with t0 least significant; +13 shifts the range -13..13 to 0..26.
+        dst[out_offset + tryte_idx] = static_cast<uchar>(tryte);
+    }
+}
diff --git a/src/linalg/pack_metal.mm b/src/linalg/pack_metal.mm
diff --git a/src/llama.cpp b/src/llama.cpp