Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
3098 commits
Select commit Hold shift + click to select a range
e36c789
test(mlx): cover Model KV capture/restore, prompt-cache warm, and LoR…
Snider Jun 20, 2026
94eff4f
test(mlx): drive the no-cgo native text-model Generate/Chat/stream on…
Snider Jun 20, 2026
8239b2d
test(gemma4): cover the greedy-token Go-graph fallback arms
Snider Jun 20, 2026
b4d4311
test(cmd-mlx): cover generate state/trace model-load error branches
Snider Jun 20, 2026
358196d
test(cmd-mlx): cover SFT metrics line-protocol sink tail on e2b-4bit
Snider Jun 20, 2026
685d1ae
test(mlx): cover adapter block-cache service, continuity wiring, nati…
Snider Jun 20, 2026
2e9633d
test(cmd-mlx): cover writeTuningProfile mkdir/write error branches
Snider Jun 20, 2026
e03a0db
test(gemma4): add prompt-cache suffix-forward coverage
Snider Jun 20, 2026
c16030d
test(metal): cover SplitPrefill unsupported-arch path on gemma4
Snider Jun 20, 2026
947ea35
test(mlx): cover SpeculativePair Metrics/Err facade on the live pair
Snider Jun 20, 2026
78a29e3
test(gemma4): cover targetKVByLayerType + draftStepActivations guards
Snider Jun 20, 2026
5c6429b
test(gemma4): cover ForwardMasked final-logit-softcapping arm
Snider Jun 20, 2026
5626bab
fix(mlx): use GC wrapper not runtime.GC in coverage tests (AX-6)
Snider Jun 20, 2026
c88b9c0
test(gemma4): cover weights.go pure load-time logic
Snider Jun 20, 2026
7a83e42
test(gemma4): cover Gemma4Backend.Head edge arms
Snider Jun 20, 2026
d4aa508
perf(tokenizer): reduce decodeGPT2Bytes allocs 1->0/op (AX-11)
Snider Jun 20, 2026
e339fb0
perf(gguf): add appendQuantizeQ* forms for reused output buffers (AX-11)
Snider Jun 20, 2026
ef1026c
perf(gguf): reduce writeQuantizedGGUFTensorStream allocs 12315->31/op…
Snider Jun 20, 2026
254cd08
perf(native): cut DecodeForwardArchICBQuant projResident B/op 153890-…
Snider Jun 20, 2026
0f241bd
test(gguf): prove streamed GGUF is byte-identical to buffered under s…
Snider Jun 20, 2026
799bd40
perf(native): cut DecodeForwardICBQuant projResident B/op 141595->140…
Snider Jun 20, 2026
80d9ce4
perf(native): cut DecodeForwardArchICB projResident B/op 161257->1609…
Snider Jun 20, 2026
2cd71cb
perf(native): cut DecodeForwardICB projResident B/op 150324->150205 (…
Snider Jun 20, 2026
b4ad87f
bench(safetensors): add load-path benches
Snider Jun 20, 2026
d9e4a32
perf(safetensors): decode header into typed entries, not map[string]any
Snider Jun 20, 2026
623e850
perf(safetensors): drop redundant whole-buffer string copies in the l…
Snider Jun 20, 2026
0078931
perf(safetensors): presize Encode's data buffer to the exact total
Snider Jun 20, 2026
d9943f9
perf(native): cut DecodeForwardArchICB resident B/op (slices.Grow, AX…
Snider Jun 20, 2026
267a9da
perf(native): cut DecodeForwardICB resident B/op (slices.Grow, AX-11)
Snider Jun 20, 2026
23dc536
fix(safetensors): restore the missing-shape rejection in Parse
Snider Jun 20, 2026
ae763f8
perf(score): cut DoubleMetaphone allocs 3->2/op (AX-11)
Snider Jun 20, 2026
f564347
perf(score): cut assonance fallback allocs 60->0/op (AX-11)
Snider Jun 20, 2026
85a043b
perf(score): cut tokeniseWords allocs 6->2/op (AX-11)
Snider Jun 20, 2026
2d8cc03
perf(agent): reduce NewStateIndex allocs 1006->6/op (AX-11)
Snider Jun 20, 2026
0b57e0d
perf(score): cut meter scratch alloc 1->0/op (AX-11)
Snider Jun 20, 2026
6f76582
perf(score): cut PseudoJargon allocs 21->1/op (AX-11)
Snider Jun 20, 2026
cfaaa60
perf(score): cut RhymeDensity nonEmptyLines allocs 13->12/op (AX-11)
Snider Jun 20, 2026
a7a080b
perf(score): allocation-free internal Metaphone code path (AX-11)
Snider Jun 20, 2026
9c791c5
perf(openai): reduce serveOpenAIResponseStream allocs 580->70/op (AX-11)
Snider Jun 20, 2026
64447bd
perf(merge): stream hashFile sha256, cut Packs B/op 8.5M->188K/op (AX…
Snider Jun 20, 2026
18ecf11
perf(model): reduce readModelConfigAt allocs 12->11/op (AX-11)
Snider Jun 20, 2026
293a2bd
perf(openai): reduce serveOllamaStream allocs 540->32/op (AX-11)
Snider Jun 20, 2026
a551b11
perf(hf): reduce PlanFits allocs 7->6/op (AX-11)
Snider Jun 20, 2026
edfcfe4
perf(hf): cut planFitEntries combined query+ids growslice (AX-11)
Snider Jun 20, 2026
8c2fb89
perf(grpo): reduce buildGRPOUpdate allocs 4->3/op (AX-11)
Snider Jun 20, 2026
de80a6d
perf(train): reduce sftTruncateAndReturn allocs 2->1/row (AX-11)
Snider Jun 20, 2026
b8f87ad
perf(memorypretrain): reduce buildNode allocs 112->106/op (AX-11)
Snider Jun 20, 2026
fce862c
bench(spine): cover toProbeEvent per-token probe path + cloneProbeMeta
Snider Jun 20, 2026
8767b63
perf(minimax-m2): cut find*Ref miss-path allocs to 0/op (AX-11)
Snider Jun 20, 2026
d046dc0
bench(spine): cover prompt, lora_config, model_info surfaces
Snider Jun 20, 2026
584bfe3
test(memorypretrain): add FFN-build + cluster-ID enrichment benches (…
Snider Jun 20, 2026
cc284ff
bench(gguf): add QuantizeModelPack bench
Snider Jun 20, 2026
b302745
bench(kv): add Load-path benches for un-benched restore surface
Snider Jun 20, 2026
419e3df
bench(bundle): add Snapshot-path benches
Snider Jun 20, 2026
09a07ab
perf(gguf): reduce QuantizeModelPack buildStreamingGGUFQuantizedTenso…
Snider Jun 20, 2026
082c961
bench(lora): add FuseIntoPack bench
Snider Jun 20, 2026
96f2710
bench(model): add ResolveQuant bench
Snider Jun 20, 2026
62fa8bb
perf(model): reduce deriveQuantFromIndex allocs 1->0/op (AX-11)
Snider Jun 20, 2026
94e7ab4
bench(agent): add Load-path benches over synthetic state-index/memvid…
Snider Jun 20, 2026
46c6a42
perf(lora): reduce fuseBaseWeightSidecars allocs 10->0/op on dense fu…
Snider Jun 20, 2026
068911f
perf(lora): reduce fuseJoinCanonicalTarget allocs 3->2/op (AX-11)
Snider Jun 20, 2026
7b85ebd
bench(memorypretrain): add outer build + corpus/bank load-path benches
Snider Jun 20, 2026
505bf02
bench(chaptersmoke): add resolve-path bench
Snider Jun 20, 2026
70a395a
bench(compute): add Read-path bench
Snider Jun 20, 2026
78df8b3
bench(minimax-m2): add expert-dequant/load benches
Snider Jun 20, 2026
ca6bd0d
perf(minimax-m2): reduce LoadPackedExperts allocs 1038->670/op (AX-11)
Snider Jun 20, 2026
2a5e6b2
bench(gguf): add metadata-copy bench over realistic-size tokenizer
Snider Jun 20, 2026
6ea4989
perf(gguf): stream copyLocalFile metadata copy, cut B/op 4212662->436…
Snider Jun 20, 2026
f4244c9
style(gguf): gofmt the metadata-copy bench doc comment
Snider Jun 20, 2026
8dee7c3
perf(minimax-m2): reduce LoadPackedExperts allocs 670->622/op (AX-11)
Snider Jun 20, 2026
66d9ee2
bench(distill): add LoadDistillCheckpointMetadata bench (AX-11)
Snider Jun 20, 2026
07450ca
bench(grpo): add LoadGRPOCheckpointMetadata bench (AX-11)
Snider Jun 20, 2026
ddfe2bf
bench(train): add SFT checkpoint-metadata loader benches (AX-11)
Snider Jun 20, 2026
146a05b
bench(native): add real-e2b decode-loop bench (AX-11)
Snider Jun 20, 2026
4f4d6a6
perf(safetensors): open shard once for multi-ref reads, cut reopens/a…
Snider Jun 20, 2026
8e06107
bench(safetensors): add multi-ref read bench
Snider Jun 20, 2026
a526c79
perf(score): reduce RhymeDensity allocs 12->3/op (AX-11)
Snider Jun 20, 2026
6896d9b
bench(mlx): add real-e2b Generate bench (AX-11 Pass-2 attribution)
Snider Jun 20, 2026
2d4006f
perf(score): reduce newTokenContext allocs 6->4/op (AX-11)
Snider Jun 20, 2026
21247df
perf(score): reduce punFromTokens allocs 4->3/op (AX-11)
Snider Jun 20, 2026
b121378
test(score): add AX-11 alloc baselines for the Score() load path
Snider Jun 20, 2026
597f979
perf(score): reduce letterTokens allocs 8->3/op (AX-11)
Snider Jun 20, 2026
cf7c22e
perf(score): cut Hostility core.Split allocs 5->4/op (AX-11)
Snider Jun 20, 2026
d4ff825
perf(score): reduce LEK split allocs 5->3/op (AX-11)
Snider Jun 20, 2026
bb646b4
perf(score): guard RhymeDensity upfront Upper on single-line input (A…
Snider Jun 20, 2026
34ed323
perf(native): zero-trampoline encoder-send seam for the two hot Metal…
Snider Jun 20, 2026
68cc032
perf(metal): reduce FixedKVCache.retireAfterNextEval allocs/token 2->…
Snider Jun 20, 2026
e1836c4
bench(metal): add FixedKVCache retire bench
Snider Jun 20, 2026
628369b
perf(gemma4): reuse compiledDecodeForward array scratch, cut per-laye…
Snider Jun 20, 2026
a7ab1d4
bench(gemma4): add compiledDecodeForward bench
Snider Jun 20, 2026
1a5cdf1
build(go-mlx): add go-i18n as external workspace module
Snider Jun 20, 2026
89a5f34
chore(go-mlx): remove dead internal/tokenizer (dup of pkg/tokenizer, …
Snider Jun 20, 2026
95fdc3d
perf(openai): reduce serveOpenAIResponseStream allocs 70->61/op (AX-11)
Snider Jun 20, 2026
f67c3f1
bench(gemma4): add Go-graph attention FixedKVCache decode bench (AX-11)
Snider Jun 20, 2026
dd3a02e
perf(native): heterogeneous-shape ICB recorder (per-layer dFF/headDim…
Snider Jun 20, 2026
7c15f76
perf(metal): reduce evalOutputs per-token alloc (AX-11)
Snider Jun 20, 2026
e4afdff
perf(gguf): eliminate []ggufTensorInfo intermediate, cut ReadInfo 122…
Snider Jun 20, 2026
bee166e
perf(tokenizer): reduce DecodeToken solo-marker allocs 1->0/op (AX-11)
Snider Jun 20, 2026
6415b91
perf(model): reduce inspectModelPackTokenizer allocs 47->43/op (AX-11)
Snider Jun 20, 2026
a54e3eb
perf(kvconv): reduce rootTurboQuantPayloads allocs 132->106/op (AX-11)
Snider Jun 20, 2026
61b6081
perf(merge): reduce Packs allocs 194->128/op (AX-11)
Snider Jun 20, 2026
20b1ca0
bench(profile): add NormalizeArchitecture/transformers-name/resolve b…
Snider Jun 20, 2026
a37d43f
perf(merge): marshalMergedHeader nil-map guard for full json parity
Snider Jun 20, 2026
4c7b1cb
perf(model): reduce inspectModelPackTaskProfiles normalize-scan alloc…
Snider Jun 20, 2026
0cd72d3
bench(autoround): add SignRound rounding-search quantize bench (AX-11)
Snider Jun 20, 2026
cfc365d
bench(mlx): add real-e2b Decode/CaptureKVChunks/GenerateChunks/Inspec…
Snider Jun 20, 2026
f51adc1
test(gemma4): AX-11 alloc benches for Forward (real-e2b) + DecodeForw…
Snider Jun 20, 2026
d92bea8
perf(gemma4): hoist Step embedHook override out of the per-token path…
Snider Jun 20, 2026
799613e
test(session): real-e2b Wake/Sleep continuity benches + byte-identity…
Snider Jun 20, 2026
db94fde
bench(native): add real-e2b model.GenerateSampled/Generate contract b…
Snider Jun 20, 2026
d873884
perf(model): reuse Sampler.Sample softmax/rank scratch — kill per-tok…
Snider Jun 20, 2026
cfea6b5
perf(metal): reduce snapshotKVCaches alloc 577->502/op + B/op 53M->39…
Snider Jun 20, 2026
c90da54
bench(model/gemma4,model/mistral): add AX-11 alloc baselines for two …
Snider Jun 20, 2026
161246b
perf(distill): hand-rolled BatchCacheKey emitter, drop reflect-JSON a…
Snider Jun 20, 2026
7b5be6f
docs(metal): correct CaptureKV bench record — ns/op was a memprofiler…
Snider Jun 20, 2026
503f2c7
perf(distill): exact-size BatchCacheKey emitter buffer, B/op 1472->35…
Snider Jun 20, 2026
517dbfc
bench(mlx): add real-e2b GenerateSpeculative bench
Snider Jun 20, 2026
21012f2
perf(score): reduce alliteration/assonance FromTokens allocs 3->2/op …
Snider Jun 20, 2026
4e68164
perf(score): reduce lekDegeneration allocs 3->2/op (AX-11)
Snider Jun 20, 2026
394db53
bench(mlx): assert byte-identical IDs in GenerateSpeculative bench
Snider Jun 20, 2026
3597ff1
perf(score): reduce punFromTokens allocs 3->2/op (AX-11)
Snider Jun 20, 2026
7d2de30
perf(native): PLE-aware arch ICB — E2B/E4B on the encode-bypass (AX-11)
Snider Jun 20, 2026
c744d22
refactor(native): split arch ICB into held recorder + per-token repla…
Snider Jun 20, 2026
52e8c60
perf(native): incremental ICB session — per-token decode encode-bypas…
Snider Jun 20, 2026
11e8e3b
perf(native): per-layer rope in the arch ICB — gemma4 sliding/proport…
Snider Jun 20, 2026
893f33d
test(native): per-layer rope ICB parity — sliding/global different th…
Snider Jun 20, 2026
568e370
perf(native): per-layer head dim in the arch ICB — real gemma4 E2B on…
Snider Jun 20, 2026
6bcde90
refactor(native): de-gemma the serving session — Gemma4Session → Arch…
Snider Jun 21, 2026
0e1bd36
refactor(model): arch declaration lives at the pkg/model root, not th…
Snider Jun 21, 2026
fbf7699
test(native): guard the ArchSession name — fail on a model-named serv…
Snider Jun 21, 2026
5e626c7
refactor(native): de-gemma the general weight containers — Gemma4BF16…
Snider Jun 21, 2026
25cb423
refactor(model): LoadedModel/LoadedLayer/LoadedMoE → pkg/model root (…
Snider Jun 21, 2026
9bee4a5
refactor(model): model_type loader registry — RegisterLoader/LookupLo…
Snider Jun 21, 2026
df30a9f
refactor(mistral): pkg/model/mistral.Load → *LoadedModel + registry (…
Snider Jun 21, 2026
ce633f5
feat(native): generic reactive directory loader — LoadDir via model.L…
Snider Jun 21, 2026
ff255a6
port(model/gemma4): literal copy of metal's config parser into pkg/mo…
Snider Jun 21, 2026
f550cf7
port(model/gemma4): wire gemma4.Load to the copied parser — the faith…
Snider Jun 21, 2026
7aefd9b
port(model/gemma4): metal-vs-neutral config parity test + // Deprecat…
Snider Jun 21, 2026
e78029a
cleanup(native): retire the dead per-arch loaders/assemblers — reacti…
Snider Jun 21, 2026
af13988
refactor(model): extract TransformerConfig to the pkg/model root — en…
Snider Jun 21, 2026
1c2ea3d
refactor(model): extract the weight-shape inference engine to pkg/mod…
Snider Jun 21, 2026
3f67678
refactor(model): extract QuantConfig to the pkg/model root — engine o…
Snider Jun 21, 2026
8c0833c
chore(deps): bump external/go-io 24333e1 → dev (baec68d)
Snider Jun 21, 2026
d08b101
feat(model): reactive arch contract — ArchConfig + ArchSpec registry …
Snider Jun 21, 2026
377653b
feat(model): generic model.Assemble driven by arch.Layer + WeightName…
Snider Jun 21, 2026
3ea8e32
feat(model): reactive model.Load orchestration + model_type probe (en…
Snider Jun 21, 2026
8ce5474
feat(model): gemma4 + mistral register ArchSpec; native loads via mod…
Snider Jun 21, 2026
19fa546
refactor(model): delete the per-arch loaders — gemma4 + mistral are c…
Snider Jun 21, 2026
02c8d39
chore(model): delete the retired RegisterLoader registry — the reacti…
Snider Jun 21, 2026
a12c16e
refactor(model): purge gemma/mistral/g4 from the pkg/model engine roo…
Snider Jun 21, 2026
bfb4ad4
refactor(model): use core.NewRegistry for the arch registry + lift ge…
Snider Jun 21, 2026
5e89f30
refactor(model): hoist quant validation to model.QuantConfig.Validate…
Snider Jun 21, 2026
33e76d3
feat(model/gemma4): neutral vision weight-name canonicalisation (visi…
Snider Jun 21, 2026
065f26e
feat(model/gemma4): neutral vision config inference from weight shape…
Snider Jun 21, 2026
56f4ddb
feat(model/gemma4): LoadedVision assemble — the vision loader output …
Snider Jun 21, 2026
3c337ca
feat(native): MatRowsBF16 — composed multi-row projection (vision for…
Snider Jun 21, 2026
49a75a5
feat(native): VisionPatchEmbed — SigLIP patch embedding (vision forwa…
Snider Jun 21, 2026
26a3d4c
feat(native): VisionSDPA — decomposed full attention (vision forward …
Snider Jun 21, 2026
24cd891
feat(native): VisionEncoderLayer + decouple vision tests from pkg/met…
Snider Jun 21, 2026
e70c54c
feat(native): VisionTower — full SigLIP forward, port complete (visio…
Snider Jun 21, 2026
65ab018
feat(native): VisionInjectFeatures — image-placeholder splice (multim…
Snider Jun 21, 2026
e430890
feat(native): audio Conformer FeedForward — start of the audio tower …
Snider Jun 21, 2026
350d7b3
feat(native): audio Conformer LightConv — GLU + causal depthwise conv…
Snider Jun 21, 2026
019add0
feat(native): RunUnaryBF16 + SigmoidBF16/SiLUBF16 — byte-parity bf16 …
Snider Jun 21, 2026
23df196
fix(native): AudioFeedForward now BYTE-IDENTICAL to metal (byte-ident…
Snider Jun 21, 2026
398c6f2
fix(native): AudioLightConv now BYTE-IDENTICAL to metal (byte-identic…
Snider Jun 21, 2026
7c4b01b
feat(native): SoftmaxF32 — byte-parity block softmax kernel (byte-ide…
Snider Jun 21, 2026
eb432ae
feat(native): LayerNormBF16 — byte-parity layer-norm kernel (byte-ide…
Snider Jun 21, 2026
618463b
feat(native): Conv2dBF16 — byte-parity NHWC 2-D conv (byte-identical …
Snider Jun 21, 2026
163333c
feat(native): AudioSubsample — byte-identical Conformer subsampler (b…
Snider Jun 21, 2026
41cb62d
feat(native): MatMulF32 — byte-parity fused steel GEMM, unblocks the …
Snider Jun 21, 2026
e8a1466
feat(native): AudioAttention byte-identical + split-K f32 GEMM (byte-…
Snider Jun 21, 2026
1095847
feat(native): per-linear activation clamps for the audio tower (byte-…
Snider Jun 21, 2026
4fb9057
feat(native): fp32 audio tower — byte-identical to the REAL metal For…
Snider Jun 21, 2026
cbe13ba
feat(native): fp32 subsampler completes the byte-identical audio towe…
Snider Jun 21, 2026
07885fc
test(native): durable TestAudioLayer (fp32) vs the REAL metal Gemma4A…
Snider Jun 21, 2026
6638028
feat(native): port the gemma4 audio feature extractor (log-mel STFT) …
Snider Jun 21, 2026
4ddaa6d
feat(native): MTP speculative-decode logic (token-identical) — perf g…
Snider Jun 21, 2026
986b70d
feat(native): SDPACausalBF16 — bd256 causal attention for the MTP ver…
Snider Jun 21, 2026
89e5600
feat(native): DecodeLayerBatchedKV — the MTP batched verify forward (…
Snider Jun 22, 2026
178488c
feat(native): wire the batched verify into MTPDecode (one pass over t…
Snider Jun 22, 2026
9f32775
test(native): MTP batched decode is token-identical to greedy Generat…
Snider Jun 22, 2026
0bae7b2
test(native): bench the batched verify vs sequential — v1 is overhead…
Snider Jun 22, 2026
7de5909
feat(native): MatMulBF16NT — fused bf16 steel GEMM, byte-identical to…
Snider Jun 22, 2026
dfb7495
fix(native): keep the MTP draft cache aligned with the committed run …
Snider Jun 22, 2026
8cf81b9
feat(native): open native training — the linear-layer VJP, gradient-c…
Snider Jun 22, 2026
d8d4cb4
feat(native): training VJP #2 — RMSNorm backward, gradient-checked
Snider Jun 22, 2026
0756e4c
feat(native): training VJP #3 — gelu(gate)·up activation backward, gr…
Snider Jun 22, 2026
28fd1d4
feat(native): compose a full MLP-block backward from the VJPs, gradie…
Snider Jun 22, 2026
eee75a2
feat(native): training VJP #4 — softmax backward, gradient-checked (g…
Snider Jun 22, 2026
4c62405
feat(native): training VJP #5 — RoPE backward (inverse rotation), gra…
Snider Jun 22, 2026
0a7cede
feat(native): native training step works end-to-end — cross-entropy +…
Snider Jun 22, 2026
306f283
feat(native): compose the attention-block backward from the VJPs, gra…
Snider Jun 22, 2026
f669582
fix(native): drop duplicate transposeF32 — reuse vision.go's (build w…
Snider Jun 22, 2026
9bce99a
feat(native): LoRA adapter + SFT loop — frozen base trains, cross-ent…
Snider Jun 22, 2026
3e3fe82
feat(native): compose the full attention-BLOCK backward — a complete …
Snider Jun 22, 2026
442ba81
feat(native): full-stack backward chains across layers — 2-layer+head…
Snider Jun 22, 2026
0d8efc8
feat(native): multi-head GQA attention backward, gradient-checked (to…
Snider Jun 22, 2026
d614054
feat(native): gemma4 QK-norm backward, gradient-checked (gemma4-speci…
Snider Jun 22, 2026
b454c79
feat(native): ArchSession activation-saving forward — the named train…
Snider Jun 22, 2026
c4fa942
feat(native): multi-head GQA attention BLOCK backward — the real gemm…
Snider Jun 22, 2026
7102028
feat(native): forward-match instrument — host layer forward verified …
Snider Jun 22, 2026
37ee504
feat(native): real-model SFT — train on a REAL ArchSession, cross-ent…
Snider Jun 22, 2026
ba59d28
feat(native): full-stack projection LoRA on a REAL ArchSession — cros…
Snider Jun 22, 2026
3f66875
test(native): session + kvconv byte-identical to the whole-sequence f…
Snider Jun 22, 2026
ca11849
feat(native): conversation continuity — SerializeState/RestoreState t…
Snider Jun 22, 2026
df4247d
feat(native): automatic prompt caching — reuse the warm KV prefix, to…
Snider Jun 22, 2026
b73352e
feat(native): cache compaction — evict to recent N, re-prefill, token…
Snider Jun 22, 2026
3647188
feat(native): full-stack LoRA across ALL layers — backward chains the…
Snider Jun 22, 2026
615e9d1
feat(native): LoRA fuse-into-model — the train→serve bridge, byte-exa…
Snider Jun 22, 2026
00368c1
feat(model): register gemma3 in the native reactive loader — model-zo…
Snider Jun 22, 2026
3c9788e
feat(model): register dense qwen3 in the native reactive loader (mode…
Snider Jun 22, 2026
57eb714
feat(model): native Mamba-2 SSD selective scan — the first SSM mixer …
Snider Jun 22, 2026
c468bdf
feat(model): native Mamba-2 causal depthwise conv1d with conv-state r…
Snider Jun 22, 2026
1664799
feat(model): native Mamba-2 full block — in-proj→conv→scan→gated-norm…
Snider Jun 22, 2026
832e085
feat(model): native Mamba-2 recurrent decode — model + session, decod…
Snider Jun 22, 2026
78bb6fa
feat(model): native Mamba-2 checkpoint loader — first SSM family load…
Snider Jun 22, 2026
d560f01
test(model): mamba2 benchmark files (AX-11) — scan/conv/block/model, …
Snider Jun 22, 2026
0bf2d70
feat(model): mamba2 satisfies the serve contract (model.SessionModel)…
Snider Jun 22, 2026
6d8b197
feat(native): wire mamba2 into the serve loader — first SSM end-to-en…
Snider Jun 22, 2026
15de2f2
perf(mamba2): device steel GEMM for the block projections — the bench…
Snider Jun 22, 2026
a55f3d1
feat(model): native RWKV-7 WKV7 recurrence — second SSM/FLA family, g…
Snider Jun 22, 2026
4086fae
feat(model): native RWKV-7 time-mix block + device-GEMM seam — projec…
Snider Jun 22, 2026
4eca79b
perf(rwkv7): wire native device GEMM for the block projections (symme…
Snider Jun 22, 2026
0391285
feat(model): native gated delta-rule recurrence — the Qwen 3.6 FLA mi…
Snider Jun 22, 2026
d5f4596
feat(model): native Qwen 3.6 gated-delta block — conv→GQA→delta-rule→…
Snider Jun 22, 2026
222402c
perf(qwen3): wire native device GEMM for the gated-delta projections …
Snider Jun 22, 2026
4306112
feat(model): native ComposedModel orchestration, cut 1 — hybrid block…
Snider Jun 22, 2026
2ff6a3a
feat(model): ComposedModel cut 2 — full-attention mixer + hybrid deco…
Snider Jun 22, 2026
1d65747
feat(model): ComposedModel loader — Qwen 3.6 hybrid checkpoint loads …
Snider Jun 22, 2026
7df3870
feat(native): Qwen 3.6 serves end-to-end — composed serve wrapper + c…
Snider Jun 22, 2026
2513388
feat(model): qwen3_6_moe — Mixture-of-Experts FFN, the MoE variant cl…
Snider Jun 22, 2026
cd9b48e
fix(mamba2): gated norm is gate-BEFORE — real-checkpoint smoke vs HF …
Snider Jun 22, 2026
eb22607
test(deltanet): cross-check gated-delta recurrence vs HF Qwen3-Next —…
Snider Jun 22, 2026
0a73a5f
test(native): unblock the suite — registry-arc API cleanup (g4.Assemb…
Snider Jun 22, 2026
29306de
fix(native): gemma4 KV-sharing broke the quant decode — gate K/V size…
Snider Jun 22, 2026
6eddab6
fix(native): Generate bypassed the ICB replay — 13.7→79.8 tok/s on e2…
Snider Jun 22, 2026
d7186bf
perf(native): resident no-copy PLE projection — 79.8→95.4 tok/s on e2…
Snider Jun 22, 2026
1769d7d
test(native): ICB-vs-per-op cross-load A/B — the cross-load decode dr…
Snider Jun 22, 2026
4e36be1
perf(native): batch the PLE projection chain — 5 GPU round-trips → 1,…
Snider Jun 22, 2026
c493de7
perf(native): 4-bit embed dequant fast path + per-group affine hoist …
Snider Jun 22, 2026
f881726
diag(native): per-piece decode GPU-time split — host is at the floor,…
Snider Jun 22, 2026
2eb1081
perf(native): skip the monotonic logit softcap for argmax callers — 1…
Snider Jun 22, 2026
7a874c1
diag(native): ICB replay GPU-span — the gap is barrier-serialisation …
Snider Jun 22, 2026
f843fa3
perf(metal): inspectAttention skips the redundant single-owner head c…
Snider Jun 22, 2026
cff0f00
fix(kimi): load MoE experts quantized when the checkpoint quantizes t…
Snider Jun 22, 2026
ddff908
test(lora): coverage-close adapter.go + fuse.go residual branches
Snider Jun 22, 2026
708f149
perf(native): skip ICB barriers on independent secondary consumers — …
Snider Jun 22, 2026
7dd1659
diag(native): no-barrier ICB ceiling — barriers ARE the whole gap (ke…
Snider Jun 22, 2026
972ae8c
native work
Snider Jun 23, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
15 changes: 2 additions & 13 deletions .codecov.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,11 @@ coverage:
status:
project:
default:
target: 80%
threshold: 1%
target: 98%
threshold: 8%
patch:
default:
target: 70%

ignore:
# Hardware/native runtime paths need a separate Metal-backed integration gate.
- "go/*_darwin.go"
- "go/register_metal.go"
- "go/internal/metal/**"

# Adapter shells and sidecars are tested, but not part of the core library gate.
- "go/training.go"
- "go/mlxlm/**"
- "go/pkg/daemon/**"
- "go/pkg/memvid/cli/**"
- "go/cmd/**"
- "go/tests/**"
12 changes: 0 additions & 12 deletions .forgejo/workflows/security-scan.yml

This file was deleted.

27 changes: 0 additions & 27 deletions .forgejo/workflows/test.yml

This file was deleted.

20 changes: 17 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,18 +1,27 @@
# Build artifacts
build/
bin/
*.dylib
*.so
*.a

# `go build ./go/cmd/mlx/` without -o lands the binary at repo root.
# Convention is `go build -o bin/mlx` (bin/ already ignored above);
# this catches the shortcut form too.
/mlx

# CMake
CMakeCache.txt
CMakeFiles/
cmake_install.cmake
Makefile

# CMake install output (keep headers for Go module consumers)
dist/*
!dist/include/
# CMake install output
dist/

# Local Go build/test shortcuts
/go/mlx
/*.test

# IDE
.idea/
Expand All @@ -22,6 +31,11 @@ dist/*
# macOS
.DS_Store

# lthn/desktop frontend dist — copied at build time by
# scripts/make-app-bundle.sh, embedded in cmd/mlx via go:embed.
# Single source of truth lives in lthn/desktop/frontend/.
go/cmd/mlx/frontend/dist/

# Knowledge base
KB/
.core/
Expand Down
16 changes: 16 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,19 @@
path = external/go-io
url = https://github.com/dappcore/go-io.git
branch = dev
[submodule "external/go-ai"]
path = external/go-ai
url = https://github.com/dappcore/go-ai.git
branch = dev
[submodule "external/go-ml"]
path = external/go-ml
url = https://github.com/dappcore/go-ml.git
branch = dev
[submodule "external/go-cgo"]
path = external/go-cgo
url = https://github.com/dappcore/go-cgo.git
branch = dev
[submodule "external/go-i18n"]
path = external/go-i18n
url = https://github.com/dappcore/go-i18n.git
branch = dev
20 changes: 15 additions & 5 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ All Go code lives under `go/`:
`nomlxlm` removes it)
- `go/cmd/violet/` and `go/pkg/daemon/` — local Violet Unix-socket sidecar
- `cpp/` — C++ side companion (CLion-side worktree)
- `lib/mlx/` — upstream MLX submodule pinned at `v0.30.1`
- `lib/mlx/` — upstream MLX submodule pinned at `v0.31.1`
- `patches/` — local patches against `lib/mlx` (manual apply only)
- `docs/`, `examples/` — markdown documentation and per-feature usage examples

Expand All @@ -25,6 +25,15 @@ Unsupported builds compile against the `*_stub.go` files and a stub
`MetalAvailable() bool` that returns false. Do not move CGO code out of
`go/internal/metal/`.

The native path targets [macOS Tahoe 26.0+](https://developer.apple.com/documentation/macos-release-notes/macos-26-release-notes)
on Apple Silicon. The floor is intentional: the Metal 4 API generation this
runner is built around shipped with macOS 26, including lower-overhead command
encoding, explicit compilation control, tensor resources, and machine-learning
passes. Keep build and test invocations aligned with that floor by passing
`-ldflags "-extldflags=-mmacosx-version-min=26.0"` when compiling native code.
See `docs/operator/deployment.md` and `docs/operator/metallib-and-variants.md`
for the full reference chain.

## Conventions

- UK English in code, comments, and docs (colour, organisation, behaviour)
Expand All @@ -47,10 +56,11 @@ model downloads.

## Sandboxing Notes

Before handing off, run the repository gates from the brief with `GOWORK=off`.
On sandboxed systems, set `GOCACHE` to a writable directory such as
`/tmp/codex-go-mlx-cache` so Go can compile without touching the user
cache. If the sandbox cannot resolve the bundled `mlx.metallib`, apply
Before handing off, run the repository gates from the checked-in workspace; do
not use `GOWORK=off` unless the user explicitly asks for an isolated module
check. On sandboxed systems, set `GOCACHE` to a writable directory such as
`/tmp/codex-go-mlx-cache` so Go can compile without touching the user cache.
If the sandbox cannot resolve the bundled `mlx.metallib`, apply
`patches/mlx-metallib-path.patch` inside `lib/mlx` to enable the
`MLX_METALLIB_PATH` env-var override (not auto-applied).

Expand Down
7 changes: 4 additions & 3 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,17 +44,18 @@ After Mantis #1241, all Go code lives under `go/`:
```
go/ Go module root (dappco.re/go/mlx)
*.go Public root API: model, tokenizer, compute, training, eval, distill, GRPO, hf-fit, merge, gguf-quantize, kv-snapshot, lora-fuse
cmd/mlx/ CLI tool (built with `-o core-mlx`; consumers rename: lthn-mlx)
cmd/violet/ Unix-socket sidecar daemon
internal/metal/ All CGO code (mlx-c bindings)
mlxlm/ CGO-free Python subprocess backend
pkg/daemon/ Daemon implementation
pkg/memvid/ Memvid storage CLI
pkg/memvid/ Deprecated State codec compatibility shim
tests/ Integration tests
cpp/ C++ side (CLion-side companion)
docs/ Markdown documentation
examples/ Per-feature usage examples (markdown)
external/ Vendored core libraries
lib/mlx/ Upstream mlx submodule (pinned at v0.30.1)
lib/mlx/ Upstream mlx submodule (pinned at v0.31.1)
patches/ Local patches to lib/mlx (not auto-applied)
```

Expand Down Expand Up @@ -127,7 +128,7 @@ Architecture is detected from `config.json` (`model_type`) for safetensors and f

## Submodule Patches

`lib/mlx` is pinned at upstream tag `v0.30.1`. Local patches that we do not upstream live in `patches/` as standalone diff files (e.g. `patches/mlx-metallib-path.patch` for the `MLX_METALLIB_PATH` env-var override). Patches are not auto-applied — run them inside the submodule manually when their function is needed:
`lib/mlx` is pinned at upstream tag `v0.31.1`. Local patches that we do not upstream live in `patches/` as standalone diff files (e.g. `patches/mlx-metallib-path.patch` for the `MLX_METALLIB_PATH` env-var override). Patches are not auto-applied — run them inside the submodule manually when their function is needed:

```bash
git -C lib/mlx apply ../../patches/mlx-metallib-path.patch
Expand Down
119 changes: 119 additions & 0 deletions CLAUDE.operator.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# CLAUDE.operator.md

Operator-facing guidance for **running** `lthn-mlx` in production. Companion to `CLAUDE.md` (developer-facing — architecture, build, contribute). If you arrived here mid-session needing to deploy, troubleshoot, or reason about distribution, you're in the right doc. If you arrived needing to add a model decoder or change cgo bindings, go to `CLAUDE.md`.

The operator audience is a future Cladius / Athena / Hephaestus session, *or* a human operator (Snider, ops-side) doing a deploy. Same mental model serves both — the difference is just whether the reader can edit code on the spot.

## Read order

1. **This file**, skim through "Operating principles" — calibrates what the binary is and isn't.
2. **`docs/operator/deployment.md`** — what you ship, how it runs, what to bind to.
3. **`docs/operator/metallib-and-variants.md`** — the variant question, the bundling strategy, the active CWD-resolution panic.
4. **`docs/operator/troubleshooting.md`** — the failure modes in lifecycle order, with fixes.
5. **`docs/operator/index.md`** — the full operator doc set + what's planned.

If you have ~3 minutes, read this file. If you have ~30 minutes, read all five.

## What lthn-mlx is

A single-process boundary that wraps native Apple Metal GPU inference (via mlx-c CGO bindings) and serves it as OpenAI / Anthropic / Ollama-compatible HTTP. Snider's framing, made explicit on 2026-05-25:

> **"The actual model is the binary, the rest is package."**

This is the load-bearing architecture decision. Everything that wants inference — `lthn` desktop, `pkg/lemma` in lthn/desktop, providers in `go-ai`, any OpenAI-compatible Python / TypeScript / curl client — talks to `lthn-mlx` over HTTP. There is no in-process library substitute for production. The binary is the boundary.

**One process. One model. One HTTP listener.** That's the unit. Multi-model deployments mean multiple processes on different ports plus a router in front (the `pkg/lemma` client is the canonical Go-side router).

The binary is built from `dappco.re/go/mlx/cmd/mlx`, default output name `core-mlx`, consumers rename to `lthn-mlx`. Module path is `dappco.re/go/mlx`.

## Operating principles

These are the load-bearing facts an operator needs in working memory. Each one shapes a deployment decision.

### 1. Apple Silicon only

`darwin/arm64`. No Linux. No Intel macOS. The CGO files carry `//go:build darwin && arm64`; a stub returns `MetalAvailable() = false` everywhere else. M1 / M2 / M3 / M4, any chip class, any deployment running macOS 13 or later — one binary serves them all (modulo the metallib variant matrix; see principle 5).

If the deployment target isn't Apple Silicon, you don't want `lthn-mlx` — you want a different go-inference backend (`go-rocm` for AMD GPUs, or the CGO-free `mlxlm` subprocess backend bundled in the same repo for Python-on-anything).

### 2. The binary needs the metallib

`mlx.metallib` (~107 MB, MetalLib v1.2.9, the compiled GPU kernel archive) must be findable at runtime. Today, until the bundling work lands, this means **setting `MLX_METALLIB_PATH` to an absolute path** before invoking. Not setting it is the single most common deployment failure — the binary starts, `/v1/health` passes, then panics inside `mlx_metal_load_library` on the first GPU dispatch.

```bash
export MLX_METALLIB_PATH=/opt/lthn-mlx/lib/mlx.metallib
lthn-mlx serve --model /opt/lthn-mlx/models/lemer-lite --addr :11434
```

The permanent fix is Path B bundling (embed via `//go:embed`, load via `-[MTLDevice newLibraryWithData:error:]`). Until that ships, treat the env var as mandatory deployment config. See `docs/operator/metallib-and-variants.md` for the rationale and `docs/operator/troubleshooting.md` for the panic signature.

### 3. Model loads lazily

`lthn-mlx serve` starts in under a second. The model loads on the **first request that needs it**, not at process start. This means:

- Liveness probes against `/v1/health` pass before the model is loaded. They are not readiness probes.
- The first inference request after start takes 2-15 seconds depending on model size and storage speed.
- For consistent first-request latency, pre-warm in the service manager's post-start hook with a one-token completion (see deployment.md).

There is no on-disk lock, no PID file, no recovery state. Restart is safe; the new process starts cold and lazy-loads. The service manager is responsible for single-instance enforcement.

### 4. HTTP surface is trusted-network only

`lthn-mlx serve` has no authentication, no rate limiting, no TLS. Default bind is `:11434` (matches Ollama). Bind to `127.0.0.1:11434` for same-machine, `0.0.0.0:11434` for LAN. **Production LAN exposure sits behind a reverse proxy** that handles auth and TLS (Caddy, nginx).

If you need authenticated remote access, that lives in `pkg/lemma` (the Go client) plus a tunnel / proxy / auth-gateway — not in `lthn-mlx` itself. Don't try to add auth to the serve binary; it would violate the boundary rule and duplicate work already done one layer up.

### 5. Variants matter at the toolchain axis, not the chip axis

Snider's question of 2026-05-25: "if the lib is different for different apple versions, we need to know the variants that need building." The chip family (M1/M2/M3/M4) is **not** a variant axis — Apple's Metal driver handles forward-compatibility from a single archive. What actually varies is the build-host toolchain: Metal language version ≥4.0 + macOS SDK ≥26.2 (Xcode 26+) unlocks the NAX kernel family for M4-class tensor coprocessors.

**Practical ship matrix:**

| Variant | Build host | Runs on | Use case |
|---------|------------|---------|----------|
| `mlx-baseline.metallib` | Any modern Xcode, deployment-min 13 | M1-M4 on macOS 13+ | Default ship today |
| `mlx-nax.metallib` | Xcode 26+, deployment-min 26 | M4-class on macOS 26+ only | Deferred to M4 optimisation lane |

Ship the baseline. The NAX variant is a future M4 fast-path optimisation, not a today-decision. Full evidence and the open questions (driver-side load behaviour for higher `min`, NAX dispatch gating on non-M4) in `docs/operator/metallib-and-variants.md`.

### 6. Unified memory is the budget

On Apple Silicon there is no separate VRAM line item — the GPU and CPU share unified memory. The process budget includes: model weights, KV cache (scales linearly with `--context`), MLX allocator cache, plus everything else macOS is doing. A 7B model quantised to 4-bit needs ~5 GB resident; the KV cache at the default 131k context can add several gigabytes more.

Tuning knobs live in `dappco.re/go/mlx` at the package level (`SetMemoryLimit`, `SetCacheLimit`, `SetWiredLimit`, `ClearCache`, `GetActiveMemory`, `GetPeakMemory`). They are **not** exposed as `serve` flags today — if you need them on the bundled CLI, file a feature ticket against `cmd/mlx/serve.go`. For now, custom integrations on top of `openai.NewMuxWithAdmin` can wire them directly.

Activity Monitor's "Memory" column is the right place to watch the process. `/v1/cache/stats` reports MLX's allocator view.

### 7. Graceful shutdown is signal-driven

SIGINT and SIGTERM both trigger `http.Server.Shutdown` with `--shutdown-timeout` (default 10s) as the drain deadline. After the deadline, the process exits. There is no explicit model-unload step — the OS reclaims Metal allocations on exit.

If you have long-running generations and need them to drain cleanly on bounce, raise `--shutdown-timeout` (30s-60s). If you need explicit teardown for an exotic daemon scenario, wire the `Sleep` admin callback in a custom integration.

## Mental model in one paragraph

`lthn-mlx serve` is a stateless OpenAI-compatible HTTP server backed by Apple Metal GPU inference, single-model per process, lazy-load on first request, signal-driven graceful shutdown, requires a findable `mlx.metallib` (env var until bundling lands), no built-in auth or TLS, designed for trusted-network use, with a `pkg/lemma`-shaped routing layer one level up for multi-model or remote-access patterns. The architecture insists on the binary as the only process boundary — everything else is packages talking to it over HTTP.

That paragraph plus the seven principles is the working mental model. Everything else in `docs/operator/` fills in the operator's view of specific concerns.

## What this doc does not cover

- **How the inference works inside.** That's `docs/architecture.md`, `docs/runtime/`, `docs/memory/`. Developer-side.
- **How to add a model architecture.** That's a decoder under `go/internal/metal/`. Developer-side.
- **How training works.** That's `docs/training.md`, `docs/distillation.md`, `docs/grpo.md`. Production-bench / research-side.
- **GOAL.md production-bench lane.** Separate concern with its own canonical brief.
- **Memory limits & cache tuning as a knob set.** Stubbed in `docs/operator/performance-tuning.md` — not yet written. Source of truth meanwhile: `go/internal/metal/backend.go:10-12` and the `mlx.Set*` package surface.

## When the docs and reality disagree

This doc and `docs/operator/*` describe behaviour. Behaviour changes. If you find a discrepancy between what `lthn-mlx serve` actually does and what these docs claim, **the code is right and the docs are wrong**. Fix the doc, or open a PR that adds a comment block to the responsible source file pointing back to `docs/operator/`.

The maintenance discipline lives in `docs/operator/index.md` under "Maintenance discipline." Read it if you're about to merge a PR that touches `cmd/mlx/serve.go`, `go/openai/openai.go`, `go/openai/admin.go`, or `go/internal/metal/backend.go` — those four files are the operator-visible surface.

## Files this doc set ships

- `CLAUDE.operator.md` (this file) — operator mental model
- `docs/operator/index.md` — operator doc index + planned slots
- `docs/operator/deployment.md` — what you ship + how it runs
- `docs/operator/metallib-and-variants.md` — bundling strategy + variant matrix
- `docs/operator/troubleshooting.md` — lifecycle-phase failure modes
10 changes: 8 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@ cmake_minimum_required(VERSION 3.24)
# CMAKE_OSX_DEPLOYMENT_TARGET has to be in place before the first project()
# call: project() configures the toolchain and picks the value up at that
# point, so setting it afterwards leaves the first configure run without it.
# It is a cache entry, so -DCMAKE_OSX_DEPLOYMENT_TARGET=<version> on the
# command line still takes precedence over this default.
set(CMAKE_OSX_DEPLOYMENT_TARGET "26.0" CACHE STRING "Minimum macOS version")

project(mlx)

# Project-wide C++ default: C++23 is required (no silent fallback to an older
# standard) and compiler extensions stay enabled.
set(CMAKE_CXX_STANDARD 23)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS ON)

# Presumably compiler-cache (ccache/sccache-style) configuration, judging by
# the file name -- see cmake/CompilerCache.cmake for what it actually sets.
include("${CMAKE_CURRENT_LIST_DIR}/cmake/CompilerCache.cmake")

if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_SOURCE_DIR}/dist" CACHE PATH "" FORCE)
Expand All @@ -11,13 +16,14 @@ endif()
# Build switches, presumably consumed by the fetched MLX / mlx-c projects
# (TODO confirm against their CMake options). FORCE overwrites any value
# already in the cache, including one supplied with -D.
set(MLX_BUILD_GGUF ON CACHE BOOL "" FORCE)
set(MLX_BUILD_SAFETENSORS ON CACHE BOOL "" FORCE)
set(MLX_C_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)
# NOTE(review): BUILD_SHARED_LIBS is assigned twice in a row. Both lines use
# FORCE, so the later OFF value is the one left in the cache; the ON line
# looks like the superseded side of a change -- confirm and drop it.
set(BUILD_SHARED_LIBS ON CACHE BOOL "" FORCE)
set(BUILD_SHARED_LIBS OFF CACHE BOOL "" FORCE)

# Installed binaries search for their dependent libraries relative to the
# location of the binary that loads them.
set(CMAKE_INSTALL_RPATH "@loader_path")

include(FetchContent)

# NOTE(review): MLX_C_GIT_TAG is set twice without FORCE. A non-FORCE
# set(... CACHE ...) does not replace an entry that already exists, so if both
# lines run, the first value ("v0.4.1") is the one kept. The first line looks
# like the superseded side of a change -- confirm which revision is intended.
# NOTE(review): the trailing comment below mentions MLX 0.31.2, while CLAUDE.md
# states lib/mlx is pinned at v0.31.1 -- confirm which is current.
set(MLX_C_GIT_TAG "v0.4.1" CACHE STRING "")
set(MLX_C_GIT_TAG "fba4470" CACHE STRING "") # mlx-c main: bindings regenerated for MLX 0.31.2 (v0.6.0 predates the 0.31.2 FFT API)
# FETCHCONTENT_SOURCE_DIR_<NAME> makes FetchContent use this local checkout as
# the source of the `mlx` dependency instead of downloading it.
set(FETCHCONTENT_SOURCE_DIR_MLX "${CMAKE_CURRENT_SOURCE_DIR}/lib/mlx" CACHE PATH "Local patched MLX source")

FetchContent_Declare(
mlx-c
Expand Down
Loading