
Commit 64d06d4

more benchmarks
1 parent f1617cb commit 64d06d4

6 files changed

Lines changed: 181 additions & 5 deletions


.gitignore

Lines changed: 4 additions & 0 deletions
@@ -21,3 +21,7 @@ __pycache__/
 # Benchmark outputs & downloads
 data/
 benchmarks/
+*.gguf
+.cache/
+.matplotlib/
+.fontconfig/

BENCHMARKS.md

Lines changed: 111 additions & 2 deletions
@@ -1,7 +1,11 @@
-#Quantization Benchmark Suite
+# Quantization Benchmark Suite
 
 This repository now provides a reproducible benchmark that compares FP32,
-post - training ternary quantization(PTQ), and quantization - aware training(QAT) through a small Fashion-MNIST classifier. The script is located at `scripts/ternary_quantization_benchmark.py` and is designed to log accuracy, latency, and storage so you can understand the benefits of moving from float32 weights to ternary-trained representations.
+post-training ternary quantization (PTQ), and quantization-aware training (QAT)
+through a small Fashion-MNIST classifier. The script is located at
+`scripts/ternary_quantization_benchmark.py` and is designed to log accuracy,
+latency, and storage so you can understand the benefits of moving from float32
+weights to ternary-trained representations.
 
 ## Benchmark matrix
 
@@ -61,6 +65,67 @@ Expected output:
 - Console summary with size, accuracy/loss, and images/s for baseline/PTQ/QAT.
 - `benchmarks/vit_cifar10_baseline.json` with stage metrics and model metadata.
 
+### Fast-mode recipes (quick baselines)
+
+Use these when you want a low-latency run to confirm the pipeline without
+waiting for full PTQ/QAT loops.
+
+ViT size + accuracy baseline (skip throughput, minimal eval):
+
+```bash
+python scripts/vit_ptq_qat_benchmark.py \
+  --model-id google/vit-base-patch16-224 \
+  --device cpu \
+  --threshold 0.45 \
+  --batch-size 16 \
+  --max-train-samples 256 \
+  --max-eval-samples 128 \
+  --eval-batches 1 \
+  --max-eval-batches 1 \
+  --skip-throughput \
+  --json-output benchmarks/vit_cifar10_quick.json
+```
+
+Observed output (CPU, size-only run with `--max-eval-batches 0` + `--skip-throughput`):
+- baseline size: 0.32 GiB
+- PTQ size: 0.03 GiB
+- accuracy/loss/images_per_s: 0.0 (skipped)
+
+Phi-3 baseline PPL only (skip latency + PTQ PPL/QAT):
+
+```bash
+python scripts/phi3_ptq_qat_benchmark.py \
+  --model-id microsoft/Phi-3-mini-4k-instruct \
+  --device cpu \
+  --dtype float32 \
+  --max-eval-tokens 512 \
+  --eval-texts 16 \
+  --max-new-tokens 16 \
+  --skip-latency \
+  --skip-ptq-ppl \
+  --json-output benchmarks/phi3_baseline_ppl.json
+```
+
+Status: PTQ PPL + short QAT pending (CPU-only PTQ conversion exceeded 2h locally). Resume on GPU:
+
+```bash
+python scripts/phi3_ptq_qat_benchmark.py \
+  --model-id microsoft/Phi-3-mini-4k-instruct \
+  --device auto \
+  --dtype bfloat16 \
+  --threshold 0.45 \
+  --max-eval-tokens 128 \
+  --eval-texts 2 \
+  --max-new-tokens 0 \
+  --skip-latency \
+  --run-qat \
+  --qat-steps 5 \
+  --train-split 'train[:10]' \
+  --json-output benchmarks/phi3_ptq_qat_fast.json
+```
+
+Note: PTQ still runs on CPU (`t81.torch` fallback), so keep enough host RAM available.
+
 ### 4) GGUF export + load check
 
 ```bash
@@ -148,6 +213,50 @@ Each row contains:
 
 Use this CSV to plot accuracy vs. storage or compare latency across the three modes.
 
+## JSON artifact schema (ViT + Phi-3)
+
+The ViT and Phi-3 scripts emit JSON when you pass `--json-output`. These files
+are intended to be committed alongside baseline numbers.
+
+ViT JSON keys (from `scripts/vit_ptq_qat_benchmark.py`):
+
+```json
+{
+  "model_id": "google/vit-base-patch16-224",
+  "dataset": "cifar10",
+  "device": "cpu",
+  "threshold": 0.45,
+  "baseline": {"size_gib": 0.00, "accuracy": 0.0, "loss": 0.0, "images_per_s": 0.0},
+  "ptq": {"size_gib": 0.00, "accuracy": 0.0, "loss": 0.0, "images_per_s": 0.0},
+  "qat": null
+}
+```
+
+Phi-3 JSON keys (from `scripts/phi3_ptq_qat_benchmark.py`):
+
+```json
+{
+  "model_id": "microsoft/Phi-3-mini-4k-instruct",
+  "dataset": "wikitext-2-raw-v1",
+  "device": "cpu",
+  "dtype": "float32",
+  "threshold": 0.45,
+  "max_eval_tokens": 1024,
+  "eval_texts": 32,
+  "max_new_tokens": 64,
+  "skip_latency": true,
+  "skip_ptq_ppl": false,
+  "run_qat": false,
+  "qat_steps": 5,
+  "train_split": "train[:1%]",
+  "learning_rate": 5e-5,
+  "compression_ratio": 0.0,
+  "baseline": {"size_gib": 0.00, "ppl": 0.0, "tok_s": 0.0},
+  "ptq": {"size_gib": 0.00, "ppl": null, "tok_s": 0.0},
+  "qat": null
+}
+```
+
 ## Diagrams
 
 View the [benchmark comparison diagram](docs/diagrams/benchmarks.mermaid.md) for a quick latency/storage summary that highlights the 15–22× wins.
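
As a quick way to consume these artifacts, here is a minimal sketch (not part of this commit) that reads a JSON file emitted above and prints a size comparison; it assumes only the keys shown in the schemas and the output path used in the fast-mode recipes:

```python
# Sketch: summarize a benchmark JSON artifact (schema as documented above).
import json
from pathlib import Path


def summarize(path: str) -> None:
    data = json.loads(Path(path).read_text())
    baseline, ptq = data["baseline"], data["ptq"]
    print(f"{data['model_id']} ({data['device']})")
    print(f"  baseline size: {baseline['size_gib']:.2f} GiB")
    print(f"  PTQ size:      {ptq['size_gib']:.2f} GiB")
    if ptq["size_gib"]:  # guard against size-only / skipped runs
        print(f"  compression:   {baseline['size_gib'] / ptq['size_gib']:.1f}x")


if __name__ == "__main__":
    summarize("benchmarks/vit_cifar10_quick.json")  # path from the recipe above
```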

docs/ROADMAP.md

Lines changed: 2 additions & 0 deletions
@@ -65,13 +65,15 @@ Recent work has delivered parts of this roadmap:
 * **Recommendation 3** — Python entry-points table added to `docs/python-api.md` and `docs/python-cookbook.md`, with links from `docs/index.md`. **In progress (benchmark visibility added in `README.md`, `BENCHMARKS.md`, and the Phi-3 notebook).**
 * **GGUF compatibility** — Phi-3 export validated (`phi3-tq1-fixed12.gguf`); QKV split experiment reverted for llama.cpp parity.
 * **QAT benchmark groundwork** — ViT CIFAR-10 PTQ/QAT script added with size-only baseline captured; Phi-3 baseline PPL captured (PTQ PPL/QAT pending).
+* **GPU fallback safety** — `t81.torch` now warns + falls back to CPU for PTQ when tensors originate on GPU; smoke test added and troubleshooting docs updated.
 
 ### Status timeline (recent highlights)
 
 * Python entry-point discoverability refreshed (docs landing page + cookbook + API entry table).
 * Phi-3 GGUF export validated with llama.cpp baseline metrics captured for reference.
 * CLI documentation updated to call out Phi-3 GGUF compatibility expectations.
 * ViT size-only baseline logged; Phi-3 baseline PPL captured with PTQ PPL/QAT queued.
+* GPU fallback behavior documented; `.gitignore` hardened against GGUF/cache artifacts.
 
 ### High-impact next priorities (effort vs. impact)
 
docs/troubleshooting.md

Lines changed: 1 addition & 0 deletions
@@ -36,6 +36,7 @@ This guide complements [README.md](../README.md) and the other reference pages b
 - **Progress bar missing?** The progress reporting relies on `tqdm`; install it (`pip install tqdm`) if the CLI skips bars or prints raw percentages.
 - **Meta device / accelerate offload errors.** When converting large Hugging Face checkpoints with the default `device_map=auto`, Accelerate may place many layers onto disk/`meta`. If `t81 convert`/`t81 gguf` (or the legacy `t81-convert`/`t81-gguf` scripts) later tries to call `.to("cpu")`, you’ll hit `NotImplementedError: Cannot copy out of meta tensor` or `RuntimeError: You can't move a model that has some modules offloaded to cpu or disk.` Always rerun with `--force-cpu-device-map` or `--device-map none/cpu` so the checkpoints stay in host RAM, and set `ACCELERATE_DISABLE=1` or `HF_ACCELERATE_DISABLE=1` before launching the CLI so no accelerate hooks re-enable offloading. This makes every `nn.Linear` serializable and avoids the meta-device save failure that occurs after the “Some parameters are on the meta device” log.
 - **Large GGUF conversions.** Extremely large ternary bundles (Gemma 3.x / Llama 3.x) may exhaust RAM with older readers because the whole file was loaded before parsing. The new `t81.gguf.read_gguf` implementation parses metadata, tensor infos, and tensor payloads directly from the file handle, seeks to each sorted tensor offset, and never slices the entire bundle into memory. When you still hit memory pressure or Matplotlib font-cache warnings, define `MPLCONFIGDIR=$PWD/data/cache/matplotlib` and `FONTCONFIG_PATH=$PWD/data/cache/fontconfig`, prefer `--force-cpu-device-map`, and keep `ACCELERATE_DISABLE=1`/`HF_ACCELERATE_DISABLE=1` set before rerunning the CLI so every tensor stays on the CPU.
+- **GPU PTQ fallback.** `t81.torch.TernaryTensor.from_float` currently quantizes on CPU; when your model lives on GPU it will warn and move tensors to CPU for PTQ, then return outputs to the original device. Keep enough host RAM available and avoid meta/offload tensors (`device_map=auto`) if you plan to run PTQ PPL or short QAT loops.
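
A minimal sketch of what that fallback looks like in practice (not part of this commit; assumes a CUDA-enabled PyTorch build and the warning text added in `t81/torch/__init__.py`):

```python
# Sketch: trigger and observe the GPU -> CPU PTQ fallback described above.
import warnings

import torch
import t81.torch as t81_torch

weight = torch.randn(3, 16, device="cuda")  # weights start on the GPU
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always", RuntimeWarning)
    ternary = t81_torch.TernaryTensor.from_float(weight, threshold=0.45)
assert any("moving tensor" in str(w.message) for w in caught)  # fallback warned

rhs = torch.randn(16, 4, device="cuda")
out = torch.matmul(ternary, rhs)  # GEMM runs on CPU; output returns to CUDA
assert out.device.type == "cuda"
```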
 
 ## Testing & benchmarking hiccups

t81/torch/__init__.py

Lines changed: 21 additions & 3 deletions
@@ -8,6 +8,7 @@
 from __future__ import annotations
 
 from typing import Any, Callable, Dict, Mapping, Optional, Sequence
+import warnings
 
 import numpy as np
 import torch
@@ -50,8 +51,17 @@ def _quantize_tensor(tensor: torch.Tensor, threshold: float = 0.5) -> torch.Tens
 
 
 def _to_cpu_float(tensor: torch.Tensor) -> torch.Tensor:
+    if tensor.is_meta:
+        raise NotImplementedError(
+            "t81.trit does not support meta tensors; load weights on CPU or disable offload."
+        )
     if tensor.device.type != "cpu":
-        raise NotImplementedError("t81.trit currently only supports CPU tensors")
+        warnings.warn(
+            f"t81.trit runs on CPU; moving tensor from {tensor.device} to CPU.",
+            RuntimeWarning,
+            stacklevel=2,
+        )
+        return tensor.detach().to(device="cpu", dtype=torch.float32)
     return tensor.detach().to(dtype=torch.float32, copy=False)
 
 
@@ -272,7 +282,11 @@ def forward(ctx, ternary_weight: TernaryTensor, rhs: torch.Tensor) -> torch.Tens
         rhs_cpu = _to_cpu_float(rhs)
         ctx.save_for_backward(rhs_cpu)
         ctx.ternary = ternary_weight
-        return ternary_weight._compute_gemm(rhs_cpu)
+        ctx.rhs_device = rhs.device
+        output = ternary_weight._compute_gemm(rhs_cpu)
+        if rhs.device.type != "cpu":
+            output = output.to(rhs.device)
+        return output
 
     @staticmethod
     def backward(ctx, grad_output: torch.Tensor) -> tuple[None, torch.Tensor]:
@@ -281,8 +295,12 @@ def backward(ctx, grad_output: torch.Tensor) -> tuple[None, torch.Tensor]:
             _limbs_to_trits(ctx.ternary._packed, ctx.ternary._rows, ctx.ternary._k_limbs)
             .astype(np.float32)
         )[:, : ctx.ternary._k_actual]
+        grad_output_cpu = grad_output.to(device="cpu")
         # Gradient for rhs follows the usual matmul gradient formula.
-        grad_rhs = weight_float.transpose(-2, -1).matmul(grad_output)
+        grad_rhs = weight_float.transpose(-2, -1).matmul(grad_output_cpu)
+        rhs_device = getattr(ctx, "rhs_device", torch.device("cpu"))
+        if rhs_device.type != "cpu":
+            grad_rhs = grad_rhs.to(rhs_device)
         return None, grad_rhs
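
Taken together, these changes let GPU activations flow through the CPU-only ternary GEMM in both directions. A minimal sketch of the round-trip including the gradient path (not part of this commit; assumes a CUDA device and that `torch.matmul` on a `TernaryTensor` dispatches to the autograd function above):

```python
import torch
import t81.torch as t81_torch

# CPU weights quantized to ternary; activations live on the GPU.
weight = torch.linspace(-1.0, 1.0, steps=48, dtype=torch.float32).reshape(3, 16)
ternary = t81_torch.TernaryTensor.from_float(weight, threshold=0.45)

rhs = torch.randn(16, 4, device="cuda", requires_grad=True)
out = torch.matmul(ternary, rhs)  # forward: rhs copied to CPU, output moved back
out.sum().backward()              # backward: grad computed on CPU, moved to CUDA

assert out.device.type == "cuda"
assert rhs.grad is not None and rhs.grad.device.type == "cuda"
```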

tests/python/test_torch_ternary.py

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+import warnings
+
+import pytest
+
+
+torch = pytest.importorskip("torch")
+
+
+def _best_device() -> torch.device | None:
+    if torch.cuda.is_available():
+        return torch.device("cuda")
+    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+        return torch.device("mps")
+    return None
+
+
+def test_ternary_tensor_cpu_roundtrip():
+    import t81.torch as t81_torch
+
+    weight = torch.linspace(-1.0, 1.0, steps=48, dtype=torch.float32).reshape(3, 16)
+    ternary = t81_torch.TernaryTensor.from_float(weight, threshold=0.45)
+    rhs = torch.randn(16, 4, dtype=torch.float32)
+    out = torch.matmul(ternary, rhs)
+    assert out.shape == (3, 4)
+    assert out.device.type == "cpu"
+
+
+def test_ternary_tensor_gpu_fallback_warning():
+    device = _best_device()
+    if device is None:
+        pytest.skip("No GPU/MPS device available for fallback test.")
+
+    import t81.torch as t81_torch
+
+    weight = torch.linspace(-1.0, 1.0, steps=48, dtype=torch.float32, device=device).reshape(3, 16)
+    with warnings.catch_warnings(record=True) as caught:
+        warnings.simplefilter("always", RuntimeWarning)
+        ternary = t81_torch.TernaryTensor.from_float(weight, threshold=0.45)
+    assert any("moving tensor" in str(item.message) for item in caught)
+    rhs = torch.randn(16, 4, dtype=torch.float32, device=device)
+    out = torch.matmul(ternary, rhs)
+    assert out.device.type == device.type
