t81dev
diff --git a/‎.github/workflows/ci.yml‎
Lines changed: 70 additions & 9 deletions b/‎.github/workflows/ci.yml‎
Lines changed: 70 additions & 9 deletions
diff --git a/‎AGENTS.md‎
Lines changed: 1 addition & 0 deletions b/‎AGENTS.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎DEVELOPMENT.md‎
Lines changed: 56 additions & 8 deletions b/‎DEVELOPMENT.md‎
Lines changed: 56 additions & 8 deletions
diff --git a/‎README.md‎
Lines changed: 25 additions & 2 deletions b/‎README.md‎
Lines changed: 25 additions & 2 deletions
diff --git a/‎docs/ROADMAP.md‎
Lines changed: 21 additions & 11 deletions b/‎docs/ROADMAP.md‎
Lines changed: 21 additions & 11 deletions
diff --git a/‎docs/index.md‎
Lines changed: 27 additions & 0 deletions b/‎docs/index.md‎
Lines changed: 27 additions & 0 deletions
@@ -31,47 +31,67 @@ jobs:
   build-and-test:
     name: Matrix build & tests
     needs: format-and-lint
-    runs-on: ubuntu-latest
+    runs-on: ${{ matrix.compiler.os }}
     strategy:
       fail-fast: false
       matrix:
         compiler:
           - name: gcc
             c: gcc
             cxx: g++
+            os: ubuntu-latest
           - name: clang
             c: clang
             cxx: clang++
+            os: ubuntu-latest
+          - name: appleclang
+            c: clang
+            cxx: clang++
+            os: macos-14
         configuration:
           - name: minimal
             python_bindings: OFF
           - name: python
             python_bindings: ON
-        avx:
-          - name: avx
-            flags: "-mavx512f"
+            python_version: "3.11"
+        simd:
           - name: scalar
             flags: "-mno-avx"
+          - name: avx2
+            flags: "-mavx2"
+        build_type:
+          - Release
+          - Debug
+        exclude:
+          - compiler:
+              name: appleclang
+            simd:
+              name: avx2
     env:
-      BUILD_DIR: build-${{ matrix.compiler.name }}-${{ matrix.configuration.name }}-${{ matrix.avx.name }}
-      ARTIFACT_LABEL: ci-${{ matrix.compiler.name }}-${{ matrix.configuration.name }}-${{ matrix.avx.name }}
+      BUILD_DIR: build-${{ matrix.compiler.os }}-${{ matrix.compiler.name }}-${{ matrix.configuration.name }}-${{ matrix.simd.name }}-${{ matrix.build_type }}
+      ARTIFACT_LABEL: ci-${{ matrix.compiler.os }}-${{ matrix.compiler.name }}-${{ matrix.configuration.name }}-${{ matrix.simd.name }}-${{ matrix.build_type }}
     steps:
       - uses: actions/checkout@v4
+      - name: Setup Python
+        if: matrix.configuration.name == 'python'
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.configuration.python_version }}
       - name: Configure
         run: |
           cmake -S . -B "$BUILD_DIR" \
             -DT81LIB_BUILD_TESTS=ON \
             -DT81LIB_BUILD_PYTHON_BINDINGS=${{ matrix.configuration.python_bindings }} \
+            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
             -DCMAKE_C_COMPILER=${{ matrix.compiler.c }} \
             -DCMAKE_CXX_COMPILER=${{ matrix.compiler.cxx }} \
-            -DCMAKE_C_FLAGS="${{ matrix.avx.flags }}" \
-            -DCMAKE_CXX_FLAGS="${{ matrix.avx.flags }}"
+            ${{ runner.os == 'Linux' && format('-DCMAKE_C_FLAGS={0} -DCMAKE_CXX_FLAGS={0}', matrix.simd.flags) || '' }}
       - name: Build
         run: cmake --build "$BUILD_DIR" --parallel
       - name: Test
         run: ctest --test-dir "$BUILD_DIR" --output-on-failure
       - name: Python binding + GGUF regression tests
-        if: matrix.configuration.name == 'python'
+        if: matrix.configuration.name == 'python' && runner.os == 'Linux'
         run: |
           set -euo pipefail
           mkdir -p artifacts
@@ -91,6 +111,47 @@ jobs:
           name: ${{ env.ARTIFACT_LABEL }}
           path: artifacts
 
+  # Builds an instrumented Debug configuration, runs the CTest suite, and
+  # uploads gcovr XML + HTML reports as the `coverage-report` artifact.
+  coverage:
+    name: Coverage report
+    runs-on: ubuntu-latest
+    # Runs only after the build/test matrix job has succeeded.
+    needs: build-and-test
+    env:
+      # Out-of-source build tree; also excluded from the gcovr report below.
+      BUILD_DIR: build-coverage
+    steps:
+      - uses: actions/checkout@v4
+      # Python is used here to pip-install gcovr; the bindings stay OFF below.
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Install coverage tooling
+        run: |
+          python3 -m pip install --upgrade pip
+          python3 -m pip install gcovr
+      # --coverage instruments the objects; -O0 -g keeps line mapping accurate.
+      - name: Configure
+        run: |
+          cmake -S . -B "$BUILD_DIR" \
+            -DT81LIB_BUILD_TESTS=ON \
+            -DT81LIB_BUILD_PYTHON_BINDINGS=OFF \
+            -DCMAKE_BUILD_TYPE=Debug \
+            -DCMAKE_C_FLAGS="--coverage -O0 -g" \
+            -DCMAKE_CXX_FLAGS="--coverage -O0 -g"
+      - name: Build
+        run: cmake --build "$BUILD_DIR" --parallel
+      - name: Test
+        run: ctest --test-dir "$BUILD_DIR" --output-on-failure
+      # Each report format is given its own output path. `--output` is a
+      # single-valued fallback, so repeating it keeps only the last value and
+      # the XML report would not land in artifacts/coverage.xml.
+      - name: Generate coverage report
+        run: |
+          mkdir -p artifacts
+          gcovr --root . \
+            --exclude "$BUILD_DIR/.*" \
+            --xml artifacts/coverage.xml \
+            --html-details artifacts/coverage.html
+      - name: Upload coverage artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-report
+          path: artifacts
+
   gpu-cuda:
     name: CUDA GPU smoke
     runs-on: ubuntu-latest
 
@@ -62,3 +62,4 @@ This file helps AI agents discover and understand how to work with this reposito
 - Enhanced `tests/python/test_gguf.py` with quant-parameterized round-trip checks, metadata assertions, and a regression case for invalid quant identifiers to spotlight the GGUF helpers before future agents touch them.
 - Hardened the SIMD detection helpers in `include/t81/core/detail/simd.hpp` with CPUID/xgetbv fallbacks, documented the `add_trytes_*` overflow semantics, and made NEON runtime checks opt-out via `T81_DISABLE_NEON`.
 - Added the `compression-first` GGUF export profile (metadata + CLI flags), plus `scripts/gguf_benchmark.py` and CLI docs that walk FP16 to ternary GGUF before/after measurements.
+- Added `examples/ternary_phi3_ptq_qat_demo.ipynb` to showcase Phi-3-mini PTQ/QAT size, latency, and perplexity comparisons in one compact notebook.
@@ -1,4 +1,4 @@
-#Developer Onboarding Guide
+# Developer Onboarding Guide
 
 This project already boasts extensive documentation (`README.md`, `docs/index.md`, `AGENTS.md`, `docs/ROADMAP.md`), but this guide focuses on the concrete steps a new contributor needs to take so they can build, test, and iterate quickly.
 
@@ -11,15 +11,43 @@ cd t81lib
 
 The repository uses CMake for every build target, so keep a spare build directory for each configuration (e.g., `build`, `build-python`).
 
-## 2. Native C++ build & tests
+## 2. Quickstart matrix
+
+Pick the smallest path that matches your task:
+
+**C++ core only** (no Python bindings)
+
+```bash
+cmake -S . -B build -DT81LIB_BUILD_TESTS=ON
+cmake --build build
+ctest --test-dir build
+```
+
+**C++ + Python bindings (pybind11)**
+
+```bash
+cmake -S . -B build-python -DT81LIB_BUILD_PYTHON_BINDINGS=ON -DT81LIB_BUILD_TESTS=ON
+cmake --build build-python
+PYTHONPATH=build-python python tests/python/test_bindings.py
+```
+
+**CLI helpers (torch/transformers extras)**
+
+```bash
+pipx install ".[torch]"
+pipx ensurepath
+t81 --help
+```
+
+## 3. Native C++ build & tests
 
 ```bash
 ./run-tests.sh
 ```
 
 `run-tests.sh` (located in the repo root) configures CMake with `-DT81LIB_BUILD_TESTS=ON`, builds the default targets, and runs `ctest`. The script respects the `BUILD_DIR` environment variable, so you can run `BUILD_DIR=build-debug ./run-tests.sh` if you want a custom path.
 
-## 3. Python bindings
+## 4. Python bindings
 
 ```bash
 ./build-python.sh
@@ -37,19 +65,39 @@ pipx ensurepath
 
 The console scripts `t81-convert`, `t81-gguf`, and `t81-qat` become available after the pipx installation.
 
-## 4. CLI helpers
+## 5. Common workflows
+
+Use the scripts in the repo root for the most common tasks:
+
+```bash
+# clean configure + build + ctest
+./run-tests.sh
+
+# configure + build the pybind11 module
+./build-python.sh
+
+# custom build dir
+BUILD_DIR=build-debug ./run-tests.sh
+```
+
+Expected outputs:
+
+- `run-tests.sh` prints the CMake configure summary, build steps, and `ctest` results.
+- `build-python.sh` configures and builds the pybind11 binding target and leaves the extension module in the build directory.
+
+## 6. CLI helpers
 
 All CLI workflow documentation lives in `docs/references/cli-usage.md`, and the Mermaid diagrams are in `docs/diagrams/cli-workflows-mermaid.md`. Consult those docs for flag explanations, input requirements, and usage examples before writing CLI-focused contributions.
 
-## 5. Documentation & roadmap
+## 7. Documentation & roadmap
 
 If you're updating architecture or proposing a major change, refer to `docs/ROADMAP.md` for the current vision and the recommended initiatives that maintainers are tracking. Document your work in the nearest relevant doc (README, docs/index, AGENTS, etc.).
 
-## 6. Developer container
+## 8. Developer container
 
-VS Code users can open this repo in the configured `.devcontainer` to get a reproducible environment with CMake, Ninja, Clang, Python, and pipx pre-installed. After opening the folder, select **Reopen in Container** and let VS Code build the container once.
+The preferred setup is a native local toolchain. Use the `.devcontainer` only if you need a fully reproducible VS Code + Docker environment with CMake, Ninja, Clang, Python, and pipx pre-installed. After opening the folder in VS Code, select **Reopen in Container** and let the container build once.
 
-## 7. Additional tips
+## 9. Additional tips
 
 * Keep your changes small and test locally before opening a PR.
 * Run `clang-format` on files you touch (see `.clang-format` for the style configuration).
 
@@ -14,6 +14,29 @@ README.md — Visitor-facing overview, focused onboarding, and first-steps guida
 packed ternary GEMMs, Python bindings, and quantization helpers to deterministic numerics and ternary-aware
 AI workflows.
 
+**Featured demo** — [Ternary Quantization on Phi-3-mini: PTQ + QAT (8-10x compression, <10% PPL degradation)](examples/ternary_phi3_ptq_qat_demo.ipynb)
+
+## Getting started for Torch users
+
+If you are arriving from PyTorch or Hugging Face, start here and treat `t81` as your single entry point:
+
+```bash
+python3 -m venv .venv
+source .venv/bin/activate
+pip install ".[torch]"
+```
+
+```python
+import t81 as t8
+
+tensor = t8.torch.TernaryTensor.from_float(weight, threshold=0.45)
+output = tensor.matmul_input(input_tensor, bias=bias)
+```
+
+Result: runs 2-4x faster on CPU than FP16 with packed ternary kernels.
+
+Next steps: `t8.nn.Linear` for drop-in layers, `t8.convert`/`t8.gguf` for programmatic conversion, and `t81 convert`/`t81 gguf` for CLI flows.
+
 ## Minimum viable success
 
 ```cpp
@@ -121,7 +144,7 @@ Optional CUDA/ROCm backends can be enabled with `-DUSE_CUDA=ON` / `-DUSE_ROCM=ON
 
 ## CLI helpers
 
-`t81 convert`, `t81 gguf`, `t81 info`, and `t81-qat` automate quantize→export→train flows with progress reporting and validation hooks (the legacy `t81-convert`/`t81-gguf` names still work). Browse [docs/references/cli-usage.md](docs/references/cli-usage.md), [docs/diagrams/cli-workflows-mermaid.md](docs/diagrams/cli-workflows-mermaid.md), and [examples/cli-examples.md](examples/cli-examples.md) for recipes.
+`t81 convert`, `t81 gguf`, `t81 info`, and `t81-qat` automate quantize→export→train flows with progress reporting and validation hooks. The legacy hyphenated names `t81-convert`/`t81-gguf` remain available for backward compatibility. Browse [docs/references/cli-usage.md](docs/references/cli-usage.md), [docs/diagrams/cli-workflows-mermaid.md](docs/diagrams/cli-workflows-mermaid.md), and [examples/cli-examples.md](examples/cli-examples.md) for recipes.
 
 ### Large models & GGUF streaming
 
@@ -153,7 +176,7 @@ For a zero-disk workaround you can also dequantize on the fly (via `t81.dequanti
 - Deterministic numerics and research-grade arithmetic built atop the same core.
 - Ternary hardware simulation and energy modeling (see [docs/hardware.md](docs/hardware.md)).
 
-See [docs/use-cases.md](docs/use-cases.md) for demos, notebooks, and experiments that spotlight these flows.
+See [Use Cases & Demos](docs/use-cases.md) for real-world workflows, including the Phi-3-mini ternary notebook.
 
 ## Examples
 
 
@@ -56,35 +56,45 @@ The codebase is thoughtfully commented, consistently formatted by `clang-format`
 
 ## 3. Next Steps Recommendations
 
+### Progress update (2024)
+
+Recent work has delivered parts of this roadmap:
+
+* **Recommendation 1** — quickstart matrix + common workflows added to `DEVELOPMENT.md`.
+* **Recommendation 2** — CI matrix expanded for OS/build types and SIMD guards; Python tests standardized on Linux.
+* **Recommendation 3** — Python entry-points table added to `docs/python-api.md` and `docs/python-cookbook.md`, with links from `docs/index.md`.
+
+Remaining items are listed below with the next steps still required.
+
 ### Recommendation 1: Streamline the Developer Onboarding Experience
 
 * **Why**: New contributors struggle with CMake options, Python extras, and building bindings.
 * **Benefits**: Faster first-time builds, fewer setup questions, more contributions.
 * **Effort**: Medium.
 * **Implementation**:
-  1. Add `DEVELOPMENT.md` with step-by-step instructions (clone, configure with `T81LIB_BUILD_PYTHON_BINDINGS`, `pipx` options, CLI usage).
-  2. Provide helper scripts or Makefile targets (`run-tests.sh`, `build-python.sh`) that wrap the most common commands.
-  3. Deliver a `.devcontainer` (VS Code + Docker) to let contributors spin up a configured environment without manual dependency juggling.
+  1. Expand `DEVELOPMENT.md` with a quickstart matrix (CMake-only vs. bindings vs. torch extras) and explicit `T81LIB_BUILD_PYTHON_BINDINGS` examples. **Done.**
+  2. Add a short "common workflows" section that references `run-tests.sh` and `build-python.sh`, plus expected outputs/flags. **Done.**
+  3. Decide whether a `.devcontainer` is still needed or document the current preferred local setup to avoid duplicate paths. **Done.**
 
 ### Recommendation 2: Expand CI and Code Quality Automation
 
 * **Why**: The existing CI could do more to prevent regressions and enforce style across languages.
 * **Benefits**: Higher confidence in main, earlier detection of regressions, better cross-platform coverage.
 * **Effort**: Medium.
 * **Implementation**:
-  1. Extend `.github/workflows/ci.yml` with a build matrix (GCC/Clang, binding vs. minimal configuration, AVX vs. scalar) and run both C++ and Python test suites.
-  2. Add format/lint steps (`clang-format` check, `ruff`/`black` for Python) to gate style.
-  3. Publish coverage/artifacts (e.g., via Codecov or GHA artifacts) so maintainers can monitor test completeness.
+  1. Extend `.github/workflows/ci.yml` with a richer build matrix (GCC/Clang, bindings vs. minimal configuration, AVX vs. scalar) and run both C++ and Python test suites. **Done (matrix + SIMD guard updates).**
+  2. Add format/lint steps (`clang-format` check, `ruff`/`black` for Python) to gate style. **Done.**
+  3. Publish coverage/artifacts (e.g., via Codecov or GHA artifacts) so maintainers can monitor test completeness. **Done (coverage artifact upload).**
 
 ### Recommendation 3: Unify and Document the Python API Surface
 
 * **Why**: Python users currently discover helpers across `t81lib`, `t81`, and CLI docs.
 * **Benefits**: Easier discoverability, faster adoption, clearer path from C++ bindings to Torch wrappers.
 * **Effort**: Low-Medium.
 * **Implementation**:
-  1. Integrate Sphinx or MkDocs into `docs/` to auto-generate Python API reference from docstrings and tie it to the existing docs site.
-  2. Add a “Python Cookbook” doc with recipes showing how to combine `t81lib.pack_dense_matrix`, `t81.torch.TernaryTensor`, and CLI helpers.
-  3. Consider re-exporting the binding objects via the higher-level `t81` module so users can `import t81` and access the full quantization stack.
+  1. Expand MkDocs coverage by generating the Python API reference via mkdocstrings and ensuring key modules are linked from `docs/index.md`. **Done (entry-point links + extra directives).**
+  2. Keep the “Python Cookbook” up to date with end-to-end recipes (bindings + `t81.torch` + CLI), and add a short "choose your entry point" table. **Done (entry points table).**
+  3. Validate that the `t81` re-exports stay in sync with `t81lib` bindings and add a quick API surface checklist. **Done (checklist added).**
 
 ### Recommendation 4: Introduce a Standardized Quantization-Aware Training Benchmark
 
@@ -93,8 +103,8 @@ The codebase is thoughtfully commented, consistently formatted by `clang-format`
 * **Effort**: High.
 * **Implementation**:
   1. Define a benchmark (e.g., small BERT or ViT on GLUE/CIFAR subsets) with FP32, PTQ, and QAT runs.
-  2. Create a `scripts/` benchmark script that trains the model, applies `t81` quantization, and logs accuracy, model size, and latency.
-  3. Document the benchmark/results in a new `BENCHMARKS.md` (link from `README.md`) so the community can reproduce and compare.
+  2. Extend the existing `scripts/` benchmark tooling to log accuracy, model size, and latency in a standardized JSON schema.
+  3. Update `BENCHMARKS.md` with reproducible baseline results and link the dataset/model artifacts used for comparisons.
 
 ### Recommendation 5: Harden the GPU Tensor Metadata Path
 
 
@@ -7,6 +7,32 @@ docs/index.md — Primary landing page for the documentation set.
 This landing page highlights the most helpful resources for people discovering `t81lib` or wanting
 to understand the balanced ternary engine without digging through specs immediately.
 
+## Featured example
+
+Try the compact, end-to-end PTQ + QAT notebook that measures size, latency, and perplexity on Phi-3-mini:
+[`examples/ternary_phi3_ptq_qat_demo.ipynb`](../examples/ternary_phi3_ptq_qat_demo.ipynb).
+
+**Featured demo** — [Try Phi-3-mini PTQ + QAT](../examples/ternary_phi3_ptq_qat_demo.ipynb)
+
+## Getting started for Torch users
+
+If you are arriving from PyTorch or Hugging Face, use `t81` as the single entry point and alias it once:
+
+```bash
+python3 -m venv .venv
+source .venv/bin/activate
+pip install ".[torch]"
+```
+
+```python
+import t81 as t8
+
+tensor = t8.torch.TernaryTensor.from_float(weight, threshold=0.45)
+output = tensor.matmul_input(input_tensor, bias=bias)
+```
+
+From here: `t8.nn.Linear` for drop-in layers, `t8.convert`/`t8.gguf` for scripted conversion, and `t81 convert`/`t81 gguf` for CLI workflows.
+
 ## Core resources
 
 - **Landing & Quick Start** — [`README.md`](../README.md) contains the hero content, badges, and a comprehensive quick
@@ -22,6 +48,7 @@ to understand the balanced ternary engine without digging through specs immediat
   [`examples/ternary_sparse_preview.py`](../examples/ternary_sparse_preview.py) for runnable workflows.
 - **Python API reference** — [`docs/python-api.md`](python-api.md) lays out how MkDocs plus `mkdocstrings` auto-generate the binding reference.
 - **Python cookbook** — [`docs/python-cookbook.md`](python-cookbook.md) gathers recipes that mix `t81lib.pack_dense_matrix`, `t81.torch.TernaryTensor`, and the CLI helpers.
+- **Python entry points** — [`docs/python-api.md`](python-api.md) and [`docs/python-cookbook.md`](python-cookbook.md) now include a quick table showing which module to import for each workflow.
 - **Python install paths** — [`docs/python-install.md`](python-install.md) explains pip/pipx builds, validation tips, and CLI helper installs.
 - **PyTorch how-to** — [`docs/torch.md`](torch.md) walks through `t81.torch`, `t81.nn`, conversion helpers, and how the CLI scripts mirror the Python flows.
 - **CLI reference** — [`docs/references/cli-usage.md`](references/cli-usage.md) lists the unified `t81 convert`/`t81 gguf` helpers (with legacy `t81-convert`/`t81-gguf` aliases) plus `t81-qat`