diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 5c53b2ba..4a6bc331 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -414,6 +414,22 @@ jobs:
           name: jacoco-report
           path: target/site/jacoco/jacoco.xml
           if-no-files-found: ignore
+      - name: Run PIT mutation tests
+        run: mvn --batch-mode --no-transfer-progress test-compile org.pitest:pitest-maven:mutationCoverage -Dmaven.javadoc.skip=true
+      - name: Extract PIT survivors
+        if: always()  # still runs when an earlier step (e.g. the PIT run itself) has failed
+        run: |
+          echo "=== PIT Survived Mutations ==="
+          for html_file in $(find target/pit-reports -name "*.html" -type f 2>/dev/null | sort); do  # 2>/dev/null keeps find quiet on errors such as a missing report directory
+            if grep -q "SURVIVED" "$html_file"; then  # only print reports that mention a surviving mutant
+              echo "Found survivors in $html_file:"
+              grep -B 2 -A 3 "SURVIVED" "$html_file"  # 2 lines before / 3 lines after each match for context
+              echo ""
+            fi
+          done
+      - uses: actions/upload-artifact@v7
+        if: always()  # upload the report directory even if an earlier step failed
+        with: { name: pit-reports, path: target/pit-reports/ }
       - name: Memory after tests
         if: always()
         run: free -h
diff --git a/.mvn/jvm.config b/.mvn/jvm.config
new file mode 100644
index 00000000..504456f9
--- /dev/null
+++ b/.mvn/jvm.config
@@ -0,0 +1,10 @@
+--add-exports=jdk.compiler/com.sun.tools.javac.api=ALL-UNNAMED
+--add-exports=jdk.compiler/com.sun.tools.javac.file=ALL-UNNAMED
+--add-exports=jdk.compiler/com.sun.tools.javac.main=ALL-UNNAMED
+--add-exports=jdk.compiler/com.sun.tools.javac.model=ALL-UNNAMED
+--add-exports=jdk.compiler/com.sun.tools.javac.parser=ALL-UNNAMED
+--add-exports=jdk.compiler/com.sun.tools.javac.processing=ALL-UNNAMED
+--add-exports=jdk.compiler/com.sun.tools.javac.tree=ALL-UNNAMED
+--add-exports=jdk.compiler/com.sun.tools.javac.util=ALL-UNNAMED
+--add-opens=jdk.compiler/com.sun.tools.javac.code=ALL-UNNAMED
+--add-opens=jdk.compiler/com.sun.tools.javac.comp=ALL-UNNAMED
diff --git a/.mvn/jvm.config.license b/.mvn/jvm.config.license
new file mode 100644
index 00000000..b918686f
--- /dev/null
+++ b/.mvn/jvm.config.license
@@ -0,0 +1,3 @@
+SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+
+SPDX-License-Identifier: MIT
diff --git a/CLAUDE.md b/CLAUDE.md
index b6bec371..8f48354e 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -6,7 +6,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
 
 Java bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) via JNI, providing a high-level API for LLM inference in Java. The Java layer communicates with a native C++ library through JNI.
 
-Current llama.cpp pinned version: **b9442**
+Current llama.cpp pinned version: **b9495**
 
 ## Upgrading CUDA Version
 
@@ -702,21 +702,36 @@ interim measure until that work lands.
 
 ## Open TODOs
 
-- **[URGENT] Replace jqwik with QuickTheories.** Upstream is openly hostile to the AI-assisted workflow this project uses (jqwik 1.10.0 added a deliberate prompt-injection string to test stdout; jqwik 1.10.1 release notes added: *"This project is not meant to be used by any 'AI' coding agents at all."*). See the "jqwik prompt-injection in test output" section above for context and links. Replace the one jqwik test class in this repo (`LlamaParameterProperties`) with one of (in order of preference):
-  - **QuickTheories** (`org.quicktheories:quicktheories`, MIT) — preferred. Native JUnit Jupiter (5/6); plain `@Test` methods with `qt().forAll(...).check(...)` bodies. No `@RunWith`, no JUnit Vintage engine. Property-based generation with shrinking preserved; the fluent DSL (`integers().between(...)`, `floats().between(...)`, `strings().basicLatinAlphabet().ofLengthBetween(...)`) covers every constraint the current jqwik tests use.
-  - **junit-quickcheck** (`com.pholser:junit-quickcheck-core` + `-generators`) — closest annotation match to jqwik but requires the JUnit Vintage engine alongside Jupiter; only use if the QuickTheories DSL turns out to be a poor fit.
-  - A minimal hand-rolled `@ParameterizedTest` + `@MethodSource`/`@ArgumentsSource` approach using JUnit Jupiter that is already on the classpath. Lower dependency count; loses shrinking and built-in generators.
-
-  Remove the jqwik dependency from `pom.xml` (and the `jqwik.version` property), drop the jqwik bullet from any test-frameworks documentation, and verify CI is green with the replacement. Until this lands, the doc-only warning section above is the interim mitigation.
+- **DO NOT UPGRADE jqwik past 1.9.3.** jqwik 1.10.0 added a deliberate anti-AI prompt-injection string to test stdout; the 1.10.1 user guide states the library "is not meant to be used by any 'AI' coding agents at all." 1.9.3 is the last pre-disclosure release and is the pinned version for this repo. Any CI / Dependabot / contributor PR that bumps `jqwik.version` past 1.9.3 must be rejected. The library is otherwise actively maintained, and staying on the current pin is the deliberate steady-state choice; replacement candidates (QuickTheories, junit-quickcheck, hand-rolled `@ParameterizedTest`) were evaluated and rejected because all available alternatives have either been dormant since 2019 or are strictly worse on the integration / shrinking axis. See the "jqwik prompt-injection in test output" section above for the full incident reference.
 
 - **`@VisibleForTesting` audit.** No usages currently. Walk the production tree for package-private/protected methods or fields that exist purely so tests can reach them, and either annotate (`com.google.common.annotations.VisibleForTesting`) or move into the test source tree.
-- **Strict null-safety with Maven hard-check.** Nullability annotations today are sporadic and from `org.jetbrains.annotations`. Migrate to JSpecify (`org.jspecify:jspecify`) and add Error Prone + NullAway in the compiler plugin so the build fails on potential NPEs (the BitcoinAddressFinder pom.xml already does this and is a working reference).
-- **At least one LogCaptor smoke test.** SLF4J + Logback are wired in (`OSInfo` uses an SLF4J logger; `LlamaLoader` deliberately uses `System.err` for bootstrap). Add a `LogCaptor.forClass(OSInfo.class)` test that confirms a known log message actually fires through the configured pipeline, so a future logback misconfiguration is caught at test time rather than silently swallowed.
-
-- **Expose `common_params::skip_download` via `ModelParameters.setSkipDownload(boolean)`.** Added in b9437 (`--skip-download` CLI flag); when set, `common_params_handle_models()` returns `false` instead of attempting any HF download, and `common_download_file_single()` returns `-2` on missing-file / ETag-mismatch. Useful for air-gapped / pre-staged-model deployments where any outbound network call is a failure mode. Pair with handling `common_skip_download_exception` from the JNI side so the Java caller sees a typed "model unavailable" failure instead of a generic load error.
+- **Null-safety refinement.** JSpecify + NullAway are now enforced at compile time in **strict JSpecify mode** with the extra options `CheckOptionalEmptiness`, `AcknowledgeRestrictiveAnnotations`, `AcknowledgeAndroidRecent`, `AssertsEnabled` (see `pom.xml`); `@NullMarked` is declared at module level in `module-info.java` (moved there from the per-package `package-info.java` files in 9528e79); JDK module exports in `.mvn/jvm.config`. The legacy `org.jetbrains.annotations` dep has been removed; all nullability annotations are JSpecify. Public-API methods that may legitimately have no value use `Optional<T>` rather than `@Nullable T` (`ChatResponse.getFirstMessage`, `ChatMessage.getParts`, `ChatRequest.buildToolsJson`). Open follow-up: review remaining unannotated public API surfaces for places where `@Nullable` would be more precise than the implicit non-null default.
+
+- **Further-strictness open points (cross-repo, not yet done).** Items below are tracked across all four Bernard-Ladenthin Java repos and can be picked up incrementally:
+  - **SpotBugs `effort=Max` + `threshold=Low`** — currently default effort/threshold. Raising both surfaces more findings (and takes longer per build). Worth a one-off experiment to triage what appears before committing.
+  - ~~**Error Prone bug-pattern promotions to `ERROR`**~~ — **DONE** in 855f447 ("Promote 12 Error Prone bug patterns to ERROR + enable -Xlint:all (no -Werror under release=8)"). Twelve high-confidence patterns are now promoted via `-Xep:<Name>:ERROR` args in `pom.xml` (`BoxedPrimitiveEquality`, `EqualsHashCode`, `EqualsIncompatibleType`, `IdentityBinaryExpression`, `SelfAssignment`, `SelfComparison`, `SelfEquals`, `DeadException`, `FormatString`, `InvalidPatternSyntax`, `OptionalEquality`, `ImpossibleNullComparison`).
+  - ~~**`javac -Werror` + `-Xlint:all,-serial,-options`**~~ — **DONE for this repo** in 3e2efbb ("Turn on javac -Werror"; earlier `-Xlint:all` setup in 855f447) with `-Xlint:all,-serial,-options,-classfile,-processing`. Approximately 20 distinct Error Prone warnings were addressed before flipping the switch: EqualsGetClass on `Pair` (instanceof); MissingOverride on `PoolingType` / `RopeScalingType`; JdkObsolete in `LlamaLoader` (`LinkedList` → `ArrayList`); StringSplitter in `LlamaLoader` (inline suppress — the empty-entry quirk is harmless because we explicitly skip blanks); 3× StringCaseLocaleUsage in `OSInfo` (added `Locale.ROOT`); EmptyCatch in `OSInfo.isAlpineLinux` (rationale comment added); FutureReturnValueIgnored in `LlamaModel.completeAsync` (deliberate fire-and-forget callback, suppressed); Finalize on `LlamaModel.finalize` (intentional finalizer-attack guard, suppressed); MixedMutabilityReturnType in 4 parser methods (`Collections.emptyList()` → `new ArrayList<>()`); EnumOrdinal in `InferenceParameters.setMiroStat` (wire format requires the ordinal, suppressed with rationale); EscapedEntity in `InferenceParameters` javadoc (`&lt;` → `<` inside `@code`); 4× TypeParameterUnusedInFormals on the self-typing builder idiom (suppressed); AnnotateFormatMethod on `Java8CompatibilityHelper.formatted` (callers pass runtime templates, suppressed); SafeVarargs + varargs on `Java8CompatibilityHelper.listOf`. Cross-repo: streambuffer + plugin already done; BAF has a separate catalogued warning list.
+  - ~~**`-parameters` javac arg**~~ — **DONE** in 4350cf2 ("Trivial strictness bundle: -parameters, --release, OnlyNullMarked"). `<parameters>true</parameters>` is set in `maven-compiler-plugin` config; real parameter names are now baked into bytecode.
+  - ~~**`--release N`** instead of `-source N -target N`~~ — **DONE** in 4350cf2 (same bundle commit). `<release>8</release>` is wired in `maven-compiler-plugin`, forcing the API surface to actually match the target JDK.
+  - ~~**Mutation-testing threshold enforcement (PIT)**~~ — **DONE** in 62f8a00 ("Wire PIT mutation testing narrowed to Pair") plus bb93a8f (docs) and 3bfa51f (README badge). `streambuffer` enforces 100 % mutation coverage over its whole package. **This repo and `llamacpp-ai-index-maven-plugin` / `BitcoinAddressFinder` use a "single class, full plumbing" pattern**: PIT is wired in `pom.xml` and runs on every CI build (in the `test-java-linux-x86_64` job) with `<mutationThreshold>100</mutationThreshold>`, but `<targetClasses>` is narrowed to `net.ladenthin.llama.Pair`. The intent is to keep the wiring exercised and the gate live without forcing every class up to 100 % mutation coverage at once. Expand `<targetClasses>` incrementally as classes reach parity (README TODO tracks this).
+  - **Checker Framework as a second static-nullness pass** — **DONE for this repo** in c63870b ("Add Checker Framework Nullness Checker as a 2nd static-nullness pass") (and `streambuffer`, `llamacpp-ai-index-maven-plugin`). The Nullness Checker (4.1.0) is wired in `pom.xml` and runs alongside NullAway. `toJsonString` uses `@PolyNull` (with a NullAway-suppress because NullAway has no PolyNull); native-method constructor calls in `LlamaModel` carry `@SuppressWarnings("method.invocation")`; `Pair.equals` and `Usage.equals` declare `@Nullable Object`; `LlamaSystemProperties` getters return `@Nullable String` to match javadoc; `getPackage()` and resource-stream null derefs are guarded. Remaining cross-repo work: `BitcoinAddressFinder`.
+  - **JPMS `module-info.java` with `@NullMarked` at module level** — **DONE for this repo** in 0fd066a ("Add JPMS module descriptor for the java-llama.cpp JNI bindings") and 9528e79 ("Move @NullMarked to module level + fix Java version badge to 8+") (also done in `streambuffer` and `llamacpp-ai-index-maven-plugin`); remaining cross-repo work covers `BitcoinAddressFinder`. The module `net.ladenthin.llama` exports the three hand-written public packages (`net.ladenthin.llama`, `.args`, `.json`). The native libraries shipped under `/net/ladenthin/llama/{OS}/{ARCH}/` continue to load through `LlamaLoader.class.getResourceAsStream(...)` because that lookup runs against the loader's own module, which is this module, so no `opens` directive is needed. Two-execution `maven-compiler-plugin` pattern (release 8 for sources, release 9 for `module-info.java`); the resulting jar carries `module-info.class` at its root and is backward-compatible with Java 8 classpath consumers. Module-level `@NullMarked` was initially deferred and then adopted in 9528e79: the annotation now lives on the module descriptor instead of the per-package `package-info.java` files, mirroring the layout the sister repos converged on.
+  - ~~**Banned-API enforcement**~~ — **DONE** in 8baae0c ("Add Maven Enforcer with the four standard rules; pin slf4j-api") for `bannedDependencies`/`dependencyConvergence`, and 329d764 ("test(archunit): ban System.exit, new Random, Thread.sleep in production") for the `banned-api-checker`-style runtime bans (implemented as ArchUnit rules rather than the standalone plugin). Maven Enforcer `bannedDependencies` excludes `commons-logging`, `log4j:log4j`, old hamcrest split artifacts, and legacy `junit:junit`/`junit:junit-dep`. e6069da additionally bans `sun.*`/`com.sun.*`/`jdk.internal.*` imports in production.
+  - **Additional ArchUnit rules to consider** — layered-architecture rules (`layeredArchitecture().consideringAllDependencies()`), per-module banned-imports lists, public-API-surface constraints (no public mutable static state, etc.). Partial progress: 7b6667d ("test(archunit): public non-static fields must be final (LlamaOutput compliant)") covers the "no public field that is not final" sub-rule.
+- ~~**At least one LogCaptor smoke test.** SLF4J + Logback are wired in (`OSInfo` uses an SLF4J logger; `LlamaLoader` deliberately uses `System.err` for bootstrap). Add a `LogCaptor.forClass(OSInfo.class)` test that confirms a known log message actually fires through the configured pipeline, so a future logback misconfiguration is caught at test time rather than silently swallowed.~~ **DONE** in `LoggingSmokeTest` (two tests): (1) `slf4jPipelineEmits` directly emits a known INFO event through `LoggerFactory.getLogger(OSInfo.class)` and asserts LogCaptor saw it — catches broken SLF4J binding / misrouted Logback config; (2) `getHardwareNameLogsError_whenProcessRunnerThrows` swaps `OSInfo.processRunner` with a stub that throws `IOException`, then asserts the production `error("Error while running uname -m", e)` line at `OSInfo.java:299` was captured — pins the production log call as part of the contract.
+
+- ~~**Expose `common_params::skip_download` via `ModelParameters.setSkipDownload(boolean)`.**~~ **DONE**: `ModelFlag.SKIP_DOWNLOAD` + `ModelParameters.setSkipDownload(boolean)` + `ModelParameters.hasFlag(ModelFlag)` ship as a strict-addition Java API. Upstream raises `common_skip_download_exception` inside `common_download_file_single`, but it is caught inside upstream `common_params_parse_ex` (`common/arg.cpp:476`) and surfaces only as a `false` return from `common_params_parse` &mdash; so the JNI never sees the exception directly. The Java layer therefore uses a heuristic in `SkipDownloadFailureTranslator`: when `SKIP_DOWNLOAD` is set AND the JNI throws `LlamaException("Failed to parse model parameters")`, the failure is translated to a typed public `ModelUnavailableException` (extends the now-public `LlamaException`). 7 unit tests in `LlamaModelSkipDownloadTest` cover the round-trip + every translation edge case (skip-set + parse-failed → typed; skip-set + unrelated message → passthrough; skip-not-set + parse-failed → passthrough; null message → passthrough). No JNI / native rebuild required.
 
 - **Expose `--spec-draft-backend-sampling` toggle via `ModelParameters.setSpecDraftBackendSampling(boolean)`.** Added in b9437 (env `LLAMA_ARG_SPEC_DRAFT_BACKEND_SAMPLING`). Backend sampling for the speculative draft is enabled by default upstream but auto-disabled on `LLAMA_SPLIT_MODE_TENSOR` setups; an explicit Java-side setter lets callers force-disable it for benchmarking or for backends with sampler bugs. Add only after a real user request &mdash; this is plumbing that mostly matters for speculative-decoding power users.
 
+- **Expose runtime reasoning control via `InferenceParameters.setReasoningControl(boolean)` + `LlamaModel.endReasoning(...)`.** Added in b9444&#x2013;b9490: new `common_params_sampling::reasoning_control` flag arms the budget sampler so reasoning can be ended at runtime, and new `common_sampler_reasoning_budget_force(common_sampler *)` triggers the end-of-thinking token injection on the next sample. Upstream also adds a `POST /v1/chat/completions/control` server endpoint accepting `{"id": "...", "action": "reasoning_end"}`. Java mapping would be: (a) `InferenceParameters.setReasoningControl(boolean)` arms the sampler on the inference run, (b) a new `LlamaModel.endReasoning(int slotId)` (or per-streaming-task-id) JNI method calls the upstream `common_sampler_reasoning_budget_force` against the slot's sampler. Useful for interactive UIs that want a "skip thinking and answer now" button. Add only after a real user request &mdash; relevant only for reasoning-trained models (DeepSeek-R1, Qwen3-Thinking, GPT-OSS-Reasoner, etc.).
+
+- **Expose `llama_context_params::n_outputs_max` via `ModelParameters.setMaxOutputs(int)`.** Added in b9444&#x2013;b9490 (default `-1` = derived from `n_batch`). Caps the number of output slots allocated per context; relevant for memory-constrained setups that always run with `logits_all=false` and want to prevent over-allocation when `n_batch` is large. Trivial JNI plumbing (one `cparams` field passthrough); add when a user reports OOM on context creation tied to output slot pre-allocation.
+
+- **Expose Multi-Token Prediction toggle via `ModelParameters.setMtp(boolean)`.** Existed since the Qwen3.5 MTP work; b9444&#x2013;b9490 extends it to Step-3.5. CLI flags `--mtp`/`--no-mtp` (env `LLAMA_ARG_MTP`) control whether the draft head runs alongside the main model for accelerated decoding. Java setter would route to `common_params_speculative::type = COMMON_SPECULATIVE_TYPE_DRAFT_MTP`. Add only after a real user request &mdash; relevant only for MTP-trained models.
+
+- **Expose `llama_vocab::get_suppress_tokens()` via `LlamaModel.getSuppressTokens()`.** Added in b9490&#x2013;b9495 alongside the new `tokenizer.ggml.suppress_tokens` GGUF key and the `LLM_KV_TOKENIZER_SUPPRESS_TOKENS` constant. When a GGUF declares this array, upstream stores it on `llama_vocab::impl::suppress_tokens` and exposes it via the new `llama_vocab::get_suppress_tokens()` accessor. The bias is **applied automatically** inside the model forward graph &mdash; the Gemma4 Unified graph (`src/models/gemma4.cpp`) reads the list and adds a `-INFINITY` logit bias to those token IDs via a new `llm_graph_input_logits_bias` input so the model cannot emit them (used to block `<image|>` / `<audio|>` placeholders). A Java mirror would be `public int[] getSuppressTokens()` on `LlamaModel`: a read-only inspector returning the suppression list for debugging or for callers running their own sampling who want to replicate the same bias. Value is low (the bias is auto-applied, Java callers cannot change it; java-llama.cpp does not expose custom logit-bias hooks at this level); cost is trivial (one JNI passthrough + a `getSuppressTokens()` Java method). Add only after a real user request &mdash; same posture as the b9444&#x2013;b9490 follow-ups (`setReasoningControl`, `setMaxOutputs`, `setMtp`) queued above.
+
 - **`@VisibleForTesting` design-fit review.** Complement to the audit above: for every existing or planned `@VisibleForTesting` usage, ask whether widening access is the cleanest path to testability. Common alternatives that should be preferred when applicable: (a) inject the dependency through the constructor and have the test pass a stub or fake; (b) extract the tested behaviour into a separate testable helper class with public methods; (c) restructure the production API so what the test wants to verify is observable through normal public methods. Only keep the annotation where these alternatives are materially worse. `@VisibleForTesting` should be the last resort, not the first.
 
 - **Package hierarchy review.** Walk the full `src/main/java/.../` tree and assess whether the current package layout still expresses the design intent. Look for: classes that have drifted into the wrong package as the codebase grew; flat "kitchen-sink" packages that should be split (high class count, mixed concerns); deeply nested packages that fragment cohesive components; circular dependencies between packages; missing seams where a sub-package boundary would prevent leaking implementation details. Produce a target tree as a separate planning step BEFORE making any moves — large package refactors are expensive to review and easy to do twice if the target isn't clear up front.
@@ -724,3 +739,18 @@ interim measure until that work lands.
 - **Class and method naming review (pair with the package hierarchy work).** While the package hierarchy review is in flight, also audit class and method names for the same kinds of drift: stale names that no longer describe what the class actually does after years of growth; over-abbreviated or cryptic identifiers (`Utils`, `Helper`, `Mgr`, `do*`, `process*`) that hide responsibilities; method names whose verbs do not match the actual side effects (named `get*` but writes, named `is*` but mutates, etc.); name collisions across packages that force qualified imports everywhere. Renames are far cheaper to do INSIDE a package-restructure commit than as standalone follow-ups (one IDE refactor pass touches both the move and the rename), so capture name changes in the same target tree as the package plan rather than as a separate later step.
 
 - **Abstract the Java and test writing guidelines to a workspace-level shared layer.** The Java code-writing rules and test-writing conventions referenced from this CLAUDE.md (`CODE_WRITING_GUIDE.md`, `TEST_WRITING_GUIDE.md` where present, and the `.claude/skills/java-tdd-guide/SKILL.md` skill) are already nearly identical across all 4 Bernard-Ladenthin Java repos (`BitcoinAddressFinder`, `llamacpp-ai-index-maven-plugin`, `streambuffer`, `java-llama.cpp`) and the duplication will drift over time. Lift them into a single workspace-level location that AI assistants pick up regardless of which repo they were opened in: the canonical Java conventions go into a workspace-wide Claude skill (e.g. `~/.claude/skills/java-tdd-guide/SKILL.md` already exists as the seed); per-repo `CLAUDE.md` only keeps repo-specific supplements (build commands, module layout, project-specific testing notes) and points at the shared skill instead of duplicating the rules. Same plan covers any other workspace-level seams (shared editor config, shared `.spotbugs-exclude.xml` fragments for cross-repo idioms, shared GitHub-workflow templates). Capture the canonical version BEFORE deleting the per-repo files; do not delete files in this pass.
+
+- **Feature backlog from similar projects.** See [`docs/feature-investigation-similar-projects.md`](docs/feature-investigation-similar-projects.md) for the consolidated investigation across the 5 pure-Java sibling runtimes ([llama3.java](https://github.com/mukel/llama3.java), [gemma4.java](https://github.com/mukel/gemma4.java), [gptoss.java](https://github.com/mukel/gptoss.java), [qwen35.java](https://github.com/mukel/qwen35.java), [nemotron3.java](https://github.com/mukel/nemotron3.java)) plus the dormant alternative JNI binding [llamacpp4j](https://github.com/sebicom/llamacpp4j). The doc captures 18 candidate items grouped into cross-cutting themes (UTF-8 streaming boundary safety, thinking-channel router, operator timing line, jbang single-file example, README system-properties table, etc.) and per-repo unique findings (Harmony channel decoder, Qwen empty-`<think>` injection, llama_state_* save/load, llama_adapter_lora_* hot-apply, etc.), each with effort sizing (XS / S / M / L) and a prioritised backlog. **Recommended first batch** (items 1, 3, 4, 5): UTF-8 boundary-safe streaming decoder + per-run timing line + one jbang-runnable example + a README system-properties table; ~1-2 days total, no JNI changes.
+
+- **Evaluate GraalVM Native Image as an alternative distribution target.** Reference: [GraalVM Native Image](https://www.graalvm.org/latest/reference-manual/native-image/). The pure-Java sibling projects in the README's "Similar Projects" list (mukel's `llama3.java` / `gemma4.java` / `gptoss.java` / `qwen35.java` / `nemotron3.java`) demonstrate that single-jar, no-JNI Java inference is viable for individual model architectures. Native Image opens an orthogonal direction for THIS project: AOT-compile the Java layer + JNI bridge to a self-contained binary that bundles `libjllama.so` (or the per-OS equivalent) and starts in milliseconds without a JVM, which would make jllama usable in CLI tools, serverless functions, and short-lived processes where JVM startup is the dominant cost.
+
+  **What to investigate before committing**:
+  - **JNI-loading shape.** Native Image supports JNI but requires `--enable-native-access=ALL-UNNAMED` + reflection/JNI configuration files (`reflect-config.json`, `jni-config.json`, `resource-config.json`) describing every class/method/field reachable across the JNI boundary. The 17 native methods in `jllama.cpp` plus the JNI-side `FindClass` / `GetFieldID` / `GetMethodID` calls at `JNI_OnLoad` need to be mapped. The GraalVM tracing agent (`-agentlib:native-image-agent=config-output-dir=...`) can auto-generate the config during a representative test run, but the `LlamaLoader` JAR-extraction path needs at least one resource-config rule for `net/ladenthin/llama/{OS}/{ARCH}/lib*.so`.
+  - **Native-library packaging.** The current `LlamaLoader` extracts the OS-specific `.so`/`.dll`/`.dylib` from the JAR to a tmp dir at first use. Native Image needs the same file when the AOT-compiled binary runs, so either (a) ship the native lib alongside the produced binary as a sidecar file and adjust `LlamaLoader` to find it in the same directory, or (b) embed the native lib as a resource and keep the existing extract-to-tmpdir flow (which Native Image supports via `resource-config.json`).
+  - **CUDA / Metal / OpenCL backend selection.** Today the choice between CPU-only / `cuda13-linux-x86-64` / `opencl-android-aarch64` JARs is at Maven-classifier time. Native Image would need either one binary per backend (multiplying the release matrix) or a runtime selector inside `LlamaLoader` that picks among bundled backend libs. The latter is a bigger refactor.
+  - **Startup-time benchmark to justify the work.** Measure cold-start of a current java-llama.cpp `LlamaModel(new ModelParameters().setModel("...").setNPredict(1))` invocation: how much is JVM startup + class load vs JNI load + model parse + tokenize + 1 token? If JVM startup is &lt; 10 % of cold-start, Native Image yields little. If JVM startup is &gt; 50 %, it's a clear win for CLI / serverless use cases.
+  - **Maintenance cost.** Native Image adds a second build matrix (per OS × per backend × per JDK) and a new failure surface (Native Image config drift when a llama.cpp version bump adds new JNI-reachable types). Should ship only with a CI job that exercises the Native Image build on at least one OS, otherwise the config files will rot silently.
+
+  **Out of scope until evidence supports it**: actually implementing any of the above. This entry exists so that when someone asks "can I ship java-llama.cpp as a single 30 MB binary?" the answer points to a concrete investigation plan rather than restarting from zero.
+
+- **Adopt a standard `CLAUDE.md` template/tool for cross-repo consistency.** The four Bernard-Ladenthin Java repos (`BitcoinAddressFinder`, `llamacpp-ai-index-maven-plugin`, `streambuffer`, `java-llama.cpp`) each carry their own hand-grown `CLAUDE.md`; section ordering, headings, and conventions have already drifted between them. Evaluate adopting a standardised template — for example [`centminmod/my-claude-code-setup` `CLAUDE-template-1.md`](https://github.com/centminmod/my-claude-code-setup/blob/master/CLAUDE-template-1.md) — so every repo's `CLAUDE.md` shares the same top-level structure (project overview, build/test commands, conventions, open TODOs, …) and so future edits land in predictable places. Pairs with the "Abstract the Java and test writing guidelines to a workspace-level shared layer" TODO above: the template covers the per-repo structure, the workspace skill covers the shared content. Capture the template choice and the migration plan BEFORE rewriting any existing `CLAUDE.md`; do not rewrite files in this pass.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d5fe64cf..057c03ee 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -114,7 +114,7 @@ set(LLAMA_BUILD_APP OFF CACHE BOOL "" FORCE)
 FetchContent_Declare(
 	llama.cpp
 	GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
-	GIT_TAG        b9442
+	GIT_TAG        b9495
 )
 FetchContent_MakeAvailable(llama.cpp)
 
diff --git a/README.md b/README.md
index b3a8b110..8fc69bea 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,13 @@
 **Build:**  
-![Java 11+](https://img.shields.io/badge/Java-11%2B-informational)  
-![JUnit](https://img.shields.io/badge/tested%20with-JUnit4-yellow)  
+![Java 8+](https://img.shields.io/badge/Java-8%2B-informational)  
+![Platform](https://img.shields.io/badge/Platform-Linux%20%7C%20macOS%20%7C%20Windows%20%7C%20Android-lightgrey)  
+[![JPMS](https://img.shields.io/badge/JPMS-modular%20JAR-25A162)](https://openjdk.org/projects/jigsaw/)  
+![JUnit](https://img.shields.io/badge/tested%20with-JUnit6-25A162)  
+[![JSpecify](https://img.shields.io/badge/JSpecify-1.0.0%20%40NullMarked-25A162)](https://jspecify.dev)  
+[![NullAway](https://img.shields.io/badge/NullAway-strict%20JSpecify-25A162)](https://github.com/uber/NullAway)  
+[![Checker Framework](https://img.shields.io/badge/Checker%20Framework-Nullness-25A162)](https://checkerframework.org)  
+[![Error Prone](https://img.shields.io/badge/Error%20Prone-12%20patterns%20at%20ERROR-25A162)](https://errorprone.info)  
+[![Maven Enforcer](https://img.shields.io/badge/Maven%20Enforcer-strict-25A162)](https://maven.apache.org/enforcer/)  
 [![jqwik](https://img.shields.io/badge/tested%20with-jqwik-1f6feb)](https://jqwik.net)  
 [![ArchUnit](https://img.shields.io/badge/tested%20with-ArchUnit-c71a36)](https://www.archunit.org)  
 [![SpotBugs](https://img.shields.io/badge/analyzed%20with-SpotBugs-3b5998)](https://spotbugs.github.io)  
@@ -8,7 +15,7 @@
 [![Lincheck](https://img.shields.io/badge/tested%20with-Lincheck-7F52FF)](https://github.com/JetBrains/lincheck)  
 [![vmlens](https://img.shields.io/badge/tested%20with-vmlens-ff6f00)](https://vmlens.com)  
 [![JMH](https://img.shields.io/badge/benchmarked%20with-JMH-25A162)](https://openjdk.org/projects/code-tools/jmh/)  
-[![llama.cpp b9442](https://img.shields.io/badge/llama.cpp-%23b9442-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b9442)  
+[![llama.cpp b9495](https://img.shields.io/badge/llama.cpp-%23b9495-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b9495)  
 [![Publish](https://github.com/bernardladenthin/java-llama.cpp/actions/workflows/publish.yml/badge.svg)](https://github.com/bernardladenthin/java-llama.cpp/actions/workflows/publish.yml)  
 [![CodeQL](https://github.com/bernardladenthin/java-llama.cpp/actions/workflows/codeql.yml/badge.svg)](https://github.com/bernardladenthin/java-llama.cpp/actions/workflows/codeql.yml)  
 
@@ -16,10 +23,7 @@
 [![Coverage Status](https://coveralls.io/repos/github/bernardladenthin/java-llama.cpp/badge.svg?branch=main)](https://coveralls.io/github/bernardladenthin/java-llama.cpp?branch=main)  
 [![codecov](https://codecov.io/gh/bernardladenthin/java-llama.cpp/graph/badge.svg)](https://codecov.io/gh/bernardladenthin/java-llama.cpp)  
 [![JaCoCo](https://img.shields.io/codecov/c/github/bernardladenthin/java-llama.cpp?label=JaCoCo&logo=java)](https://codecov.io/gh/bernardladenthin/java-llama.cpp)  
-<!--
-PIT mutation testing is not configured for this repository.
-Do not add a PIT badge here unless PIT is wired into pom.xml + CI.
--->
+[![PIT Mutation](https://img.shields.io/badge/PIT%20mutation-100%25%20(1%20class)-brightgreen)](https://github.com/bernardladenthin/java-llama.cpp/actions/workflows/publish.yml)  
 
 **Quality:**  
 [![Quality Gate](https://sonarcloud.io/api/project_badges/measure?project=bernardladenthin_java-llama.cpp&metric=alert_status)](https://sonarcloud.io/dashboard?id=bernardladenthin_java-llama.cpp)  
@@ -84,6 +88,8 @@ Inference of Meta's LLaMA model (and others) in pure C/C++.
 4. [Android](#importing-in-android)
 5. [Feature Ideas](#feature-ideas)
 
+> ⚠️ **DO NOT UPGRADE jqwik past 1.9.3.** jqwik 1.10.0 added an anti-AI prompt-injection string to test stdout; the 1.10.1 user guide states the library "is not meant to be used by any 'AI' coding agents at all." 1.9.3 is the last pre-disclosure release and is the pinned version. See `CLAUDE.md` section "jqwik prompt-injection in test output" for the full context.
+
 ## Features
 
 - Text completion (blocking and streaming) with full control over sampling parameters.
@@ -487,6 +493,10 @@ android {
 keep class net.ladenthin.llama.** { *; }
 ```
 
+## TODO
+
+- **Expand PIT mutation-testing scope.** PIT is wired in `pom.xml` and runs on every CI build (in the `test-java-linux-x86_64` job) with `<mutationThreshold>100</mutationThreshold>`, but `<targetClasses>` is currently narrowed to a single class (`Pair`). The intent is to exercise the wiring and gate against regressions on that single class today; widen `<targetClasses>` incrementally as additional classes reach mutation-test parity. Final target: `<param>net.ladenthin.llama.*</param>` matching the streambuffer pattern.
+
 ## Feature Ideas
 
 Forward-looking ideas being tracked for this fork:
@@ -523,6 +533,17 @@ The system's updated C++ runtime will be used instead, resolving the crash.
 
 ## Similar Projects / Usage
 
+**Bindings / wrappers**
+
 - [LLaMAndroid](https://github.com/Rattlyy/LLaMAndroid/tree/main/app) — Android app demonstrating usage of llama.cpp bindings.
 - [llama-stack-client-kotlin](https://github.com/ogx-ai/llama-stack-client-kotlin) — Kotlin client for the Llama Stack API.
 - [llama.cpp-android-tutorial](https://github.com/JackZeng0208/llama.cpp-android-tutorial) — Step-by-step tutorial for running llama.cpp on Android.
+- [llamacpp4j](https://github.com/sebicom/llamacpp4j) — alternative Java/JNI binding to llama.cpp (SWIG-generated facade); pre-GGUF, dormant since 2023 but historically the other Java JNI option.
+
+**Pure-Java single-model inference (no JNI / no llama.cpp)** — Alfonso² Peterssen's `*.java` family of standalone, dependency-free Java inference runtimes, one per model architecture. Useful when JNI is unavailable (e.g. some sandboxes / GraalVM native-image scenarios) or when you want a single jar with no native side at all. Different design point from this project, which prioritises GGUF compatibility and llama.cpp performance via JNI.
+
+- [llama3.java](https://github.com/mukel/llama3.java) — Llama 3 / 3.1 / 3.2 inference.
+- [gemma4.java](https://github.com/mukel/gemma4.java) — Gemma 4 (and earlier Gemma 2/3) inference.
+- [gptoss.java](https://github.com/mukel/gptoss.java) — GPT-OSS architecture inference.
+- [qwen35.java](https://github.com/mukel/qwen35.java) — Qwen 3.5 inference.
+- [nemotron3.java](https://github.com/mukel/nemotron3.java) — NVIDIA Nemotron-3 inference.
diff --git a/docs/feature-investigation-similar-projects.md b/docs/feature-investigation-similar-projects.md
new file mode 100644
index 00000000..748f747a
--- /dev/null
+++ b/docs/feature-investigation-similar-projects.md
@@ -0,0 +1,199 @@
+# Feature Investigation — ideas from pure-Java sibling runtimes and `llamacpp4j`
+
+Comparison sources (all surveyed in one pass for this document):
+
+| Repo | Shape | License | Survey notes |
+|------|-------|---------|--------------|
+| [mukel/llama3.java](https://github.com/mukel/llama3.java) | Pure Java, single-file (~3.4k LOC), Vector API + GraalVM Native Image | MIT | Llama 3 / 3.1 / 3.2 |
+| [mukel/gemma4.java](https://github.com/mukel/gemma4.java) | Pure Java, single-file (~3.9k LOC) | Apache 2.0 | Gemma 4 + earlier Gemma 2/3 |
+| [mukel/gptoss.java](https://github.com/mukel/gptoss.java) | Pure Java, single-file | Apache 2.0 | OpenAI GPT-OSS (Harmony chat format) |
+| [mukel/qwen35.java](https://github.com/mukel/qwen35.java) | Pure Java, single-file | Apache 2.0 | Qwen 3.5 dense + MoE |
+| [mukel/nemotron3.java](https://github.com/mukel/nemotron3.java) | Pure Java, single-file | Apache 2.0 | NVIDIA Nemotron-3 (dense + MoE + recurrent SSM) |
+| [sebicom/llamacpp4j](https://github.com/sebicom/llamacpp4j) | Alternative JNI binding (SWIG-generated facade over `llama.h`) | unspecified | **Dormant** — 1 commit (2023-07-04), pre-GGUF (llama.cpp build 491), no LICENSE, no tests, no CI |
+
+The 5 `mukel` projects are written by the same author (Alfonso² Peterssen), share a single-file template, and re-implement GGUF parsing + tensor kernels in pure Java. They are NOT direct competitors to `java-llama.cpp` (which delegates inference to llama.cpp via JNI); they are interesting because they have **better operator-facing ergonomics** at the CLI and example layers.
+
+`llamacpp4j` is the only other Java-side JNI binding to llama.cpp; the survey looked specifically for API-shape ideas and capabilities not currently exposed here.
+
+Effort sizing (mirrors [`feature-investigation-llama-stack-client-kotlin.md`](feature-investigation-llama-stack-client-kotlin.md)):
+
+| Size | Calendar effort (1 engineer) | Description |
+|------|------------------------------|-------------|
+| XS   | < 0.5 day                    | Trivial Java-side change, no JNI |
+| S    | 0.5 – 2 days                 | Java surface + minor JNI/JSON wiring |
+| M    | 2 – 5 days                   | New JNI methods, native plumbing, tests |
+| L    | 1 – 2 weeks                  | New native subsystem or large API surface |
+
+---
+
+## 1. What this project already covers
+
+The following are confirmed present in `java-llama.cpp` as of this survey — flagged so we do not re-investigate them:
+
+| Capability | Status |
+|---|---|
+| `setSkipDownload(boolean)` + typed `ModelUnavailableException` | ✅ (commit `37754d4`) |
+| Reasoning-format toggle, reasoning-budget tokens | ✅ (`InferenceParameters#setReasoningFormat` etc.) |
+| Tool calls + custom chat templates | ✅ |
+| Speculative draft model | ✅ |
+| Multimodal vision (mmproj) | ✅ |
+| Infill (fill-in-the-middle) | ✅ |
+| Streaming via `LlamaIterator` / Reactive Streams `Publisher` | ✅ |
+| `CompletableFuture` async + `CancellationToken` | ✅ |
+| `LoadProgressCallback` model-load progress | ✅ |
+
+---
+
+## 2. Cross-cutting themes — universal across the 5 `mukel` projects
+
+These ideas appear in every (or nearly every) `mukel` runtime; portability across reasoning-model families makes them the **highest-leverage** items.
+
+### 2.1 Streaming UTF-8 decoder for multi-byte boundary safety  *(S, medium-high priority)*
+
+Sources: `qwen35.java` (`StreamingDecoder`, L2929–2987), `nemotron3.java`, `gemma4.java`.
+
+GGUF byte-fallback tokenisation can split a single Unicode codepoint across two consecutive token pieces. `LlamaIterator` callers today can receive a `LlamaOutput.text` value containing a partial UTF-8 sequence and either render mojibake (CJK, emoji) or hand-roll their own buffering. The `mukel` runtimes wrap the token stream in a small decoder that holds back trailing bytes until a complete codepoint is available, then flushes.
+
+- **Why**: silent correctness bug for non-ASCII users; ~50-LOC fix.
+- **Shape**: `Utf8BoundaryStreamingDecoder` helper in the Java layer (no JNI change); optional `setUtf8BoundarySafe(true)` opt-in on `InferenceParameters`, or always-on inside `LlamaIterator`.
+- **Test**: use any of the existing CJK / emoji prompts; assert no partial codepoint ever crosses the iterator boundary.
+
+### 2.2 Tri-state thinking-channel router for reasoning models  *(S, medium priority)*
+
+Sources: `gemma4.java`, `gptoss.java` (Harmony channels), `qwen35.java`, `nemotron3.java`.
+
+A `--think off|on|inline` flag with three semantics: **`off`** strips reasoning tokens from the visible stream (and from chat history), **`on`** (default) routes them to a separate sink (e.g. stderr in CLI examples), **`inline`** interleaves them in the main output. Pairs cleanly with this project's existing `setReasoningFormat`/`setReasoningBudgetTokens`.
+
+- **Why**: every reasoning model in this project's test matrix (Qwen3-0.6B, plus any GPT-OSS / Gemma / Nemotron load) exposes thought tokens, but operators currently hand-roll the routing.
+- **Shape**: helper class `ThinkingChannelRouter` (or analogous) that consumes a `LlamaIterator` and produces two streams (visible / reasoning), plus an enum knob on `InferenceParameters`.
+- **gptoss specifically**: needs a Harmony-channel state machine that recognises `<|start|>`, `<|channel|>`, `<|message|>`, `<|end|>` and exposes `analysis` / `commentary` / `final` channels separately. Worth shipping as a separate `HarmonyChannelDecoder` if GPT-OSS users materialise. *(M for the Harmony variant; S for the generic `<think>` variant.)*
+
+### 2.3 Interactive chat REPL with slash commands  *(XS, low-medium priority)*
+
+Sources: `llama3.java`, `gemma4.java`, `gptoss.java`, `qwen35.java`, `nemotron3.java`.
+
+`/quit`, `/exit`, and `/context` (the last of which prints `used / max / remaining` tokens for the current chat session). Users currently have to Ctrl-C out of `ChatExample`.
+
+- **Shape**: a `ChatRepl` example under `src/test/java/examples/`. No new production API surface — it composes existing `LlamaModel` calls.
+- **Effort**: 1 new file, ~150 LOC.
+
+### 2.4 ANSI colour auto-detection honouring `NO_COLOR` + `TERM=dumb`  *(XS, low priority)*
+
+Sources: `gemma4.java`, `gptoss.java`, `qwen35.java`, `nemotron3.java`.
+
+Tri-state `--color on|off|auto` helper that honours the [`NO_COLOR`](https://no-color.org) informal standard, detects `TERM=dumb`, and falls back to no-colour when `System.console()` is `null`. ~15 LOC; useful in every example CLI that prints reasoning tokens or perf summaries in a different style.
+
+### 2.5 Operator-grade timing line on stderr  *(XS, medium priority)*
+
+Sources: `qwen35.java`, `nemotron3.java`.
+
+After every generation: a one-line `prompt: X tok/s (P tokens) | generation: Y tok/s (G tokens) | context: U/M` summary to stderr. `LlamaModel.getTimings()` already has all the inputs; no example formats them.
+
+### 2.6 `AutoCloseable Timer.log("label")` idiom  *(XS, low priority)*
+
+Sources: `gemma4.java` (`Timer` class, L320–333), `qwen35.java`.
+
+`try (var t = Timer.log("Load tensors")) { ... }` prints `Load tensors: 312 ms` to stderr on close. 12-line helper. The project already times model load + JNI init + first-token latency in ad-hoc places; one helper would unify them. Friendly to `LogCaptor` (already wired in tests).
+
+### 2.7 `jbang`-runnable single-file example  *(XS, medium priority)*
+
+Sources: all 5 `mukel` runtimes.
+
+Ship a self-contained `Example.java` with the `///usr/bin/env jbang` shebang and `//DEPS net.ladenthin:llama:5.0.0`. Lowers the "try it once" barrier from `mvn dependency:get + classpath wrangling` to one curl-and-run line. Pairs naturally with publishing on Maven Central.
+
+### 2.8 Documented system-properties table in the README  *(XS, medium priority)*
+
+Sources: all `mukel` runtimes (each documents its own `-D…` knobs alongside `--flag` parameters).
+
+Currently the `LlamaSystemProperties` setters (`net.ladenthin.llama.lib.path`, `.tmpdir`, `.osinfo.architecture`, `.test.ngl`, the per-test `.vision.*` and `.nomic.path` properties) are scattered across `CLAUDE.md`, source javadoc, and test setup. A single README table listing every supported property + default + meaning improves discoverability.
+
+---
+
+## 3. Per-repo unique ideas
+
+### 3.1 `llama3.java`
+
+- **`--echo` debug mode** *(XS, low)* — dump every token to stderr separately from `--stream`. Useful for teaching / first-time-user debugging.
+- **`-Dllama.VectorBitSize=0|128|256|512`** *(XS, low)* — runtime knob to pin SIMD width / benchmark when multiple ISA variants are co-located. Equivalent for this project: a system property selecting GGML CPU backend variant when multiple are on the library path.
+
+### 3.2 `gemma4.java`
+
+- **README note about `llama-quantize --pure`** *(XS, low)* — mixed-quant GGUF files (e.g. `Q4_0` with embedded `F16` tensors) cause subtle issues that users discover only by trawling the upstream issue tracker. Surface the workaround in the troubleshooting section.
+
+### 3.3 `gptoss.java`
+
+- **`Reasoning: low|medium|high` system-message injection** *(S, high if GPT-OSS users present)* — add `InferenceParameters.setReasoningEffort(LOW|MEDIUM|HIGH)` that synthesises the Harmony `Reasoning: X` line. Encodes a contract operators currently discover only by reading the Jinja template.
+- See also Harmony channel decoder under §2.2.
+
+### 3.4 `qwen35.java`
+
+- **"Empty `<think></think>` injection" to *disable* thinking on Qwen models** *(S, medium)* — prefill the assistant header with `<think>\n\n</think>\n\n` so the model produces only the visible answer with zero reasoning tokens, regardless of whether llama.cpp's `reasoning_format` understands the family. Complements existing `setReasoningFormat` / `setReasoningBudgetTokens`. Should land as a `ChatRequest` option or a thin Qwen-aware preset.
+
+### 3.5 `nemotron3.java`
+
+- All unique-value findings overlap with §2 themes; no Nemotron-specific item warranted its own row beyond what §2.1 / §2.2 already cover.
+
+### 3.6 `llamacpp4j`
+
+`llamacpp4j` is dormant (single commit, July 2023, pre-GGUF era) and its design is largely uninteresting (SWIG-generated facade with opaque `SWIGTYPE_p_*` pointers leaking through). The *useful* ideas come from the underlying `llama.h` API surface that SWIG happens to expose, not from anything Sebicom designed:
+
+- **`llama_state_*` save/load API** *(M, medium)* — `llama_copy_state_data`, `llama_set_state_data`, `llama_save_session_file` / `llama_load_session_file`. Useful for prompt-warm-start, multi-tenant resumption, and benchmarking. `ModelParameters` doesn't currently surface KV-cache snapshotting as a first-class Java API.
+- **`llama_apply_lora_*` hot-apply at runtime** *(M, medium)* — adapter hot-swap without reloading the base model (common multi-tenant pattern). Use the modern `llama_adapter_lora_*` API, not the deprecated file-based one Sebicom exposes.
+- **`llama_model_quantize` exposure** *(S, low)* — one-line wrapper that converts FP16 → Q4/Q5/Q8 GGUF in-process. Lets Java apps build a "download FP16 → quantize for this device" path without shelling out.
+- **`llama_print_system_info()` wrapper** *(XS, low)* — trivial diagnostic that prints a CPU-capability line such as `AVX = 1 | AVX2 = 1 | …`. Useful for bug reports.
+
+**Explicitly skip from `llamacpp4j`**: the SWIG-generated facade itself (brittle, opaque pointer types leak), the `mainn(argv)` shortcut that forwards to `llama.cpp`'s reference CLI, the single-OS prebuilt `.so` checked into git, the README-documented "install JAR into local Maven repo" workflow. `java-llama.cpp`'s JSON-over-JNI + classifier-based packaging is strictly better.
+
+---
+
+## 4. Explicitly out of scope
+
+Recurring "don't port" themes across all 6 sources:
+
+- **Pure-Java tensor kernels / GGUF parser / quantization classes** — redundant with llama.cpp; the entire raison d'être of this project is to *delegate* these to the upstream C++.
+- **GraalVM Native Image AOT model preloading** — already captured as its own design-investigation TODO in `CLAUDE.md`; not duplicated here.
+- **Reimplementations of samplers** (`ToppSampler`, `CategoricalSampler`) — llama.cpp's sampler chain already covers TOP_P, TYP_P, MIN_P, XTC, DRY, etc.
+- **Single-file `jbang` distribution of the whole library** — wrong shape for a JNI library that ships per-OS classifier JARs. *(A single-file `jbang` *example* per §2.7 is fine; the library itself stays multi-module.)*
+- **Hard-coded per-model chat-template token strings** (e.g. Gemma's `<|turn>` / `<|think|>`) — llama.cpp's chat-template engine handles these generically.
+
+---
+
+## 5. Prioritised backlog (top picks across all 6 sources)
+
+Sorted by `priority × (1 / effort)`. Items in **bold** are the recommended first batch.
+
+| # | Item | Source(s) | Effort | Priority |
+|---|------|-----------|:--:|:--:|
+| 1 | **UTF-8 boundary-safe streaming decoder** | §2.1 | S | medium-high |
+| 2 | **Tri-state thinking-channel router** (generic `<think>`) | §2.2 | S | medium |
+| 3 | **Operator-grade per-run timing line on stderr** | §2.5 | XS | medium |
+| 4 | **`jbang`-runnable single-file example** | §2.7 | XS | medium |
+| 5 | **System-properties table in README** | §2.8 | XS | medium |
+| 6 | Empty `<think></think>` injection (Qwen) | §3.4 | S | medium |
+| 7 | `llama_state_*` save/load Java API | §3.6 | M | medium |
+| 8 | `llama_adapter_lora_*` hot-apply API | §3.6 | M | medium |
+| 9 | Chat REPL with `/quit /exit /context` | §2.3 | XS | low-medium |
+| 10 | Harmony channel decoder for GPT-OSS | §2.2 | M | conditional (ship when GPT-OSS users ask) |
+| 11 | `Reasoning: X` system-message injection | §3.3 | S | conditional |
+| 12 | ANSI colour auto-detection helper | §2.4 | XS | low |
+| 13 | `AutoCloseable Timer.log()` idiom | §2.6 | XS | low |
+| 14 | `llama_print_system_info()` wrapper | §3.6 | XS | low |
+| 15 | `llama_model_quantize` Java surface | §3.6 | S | low |
+| 16 | README note on `llama-quantize --pure` | §3.2 | XS | low |
+| 17 | `--echo` debug knob in example | §3.1 | XS | low |
+| 18 | `-Dllama.VectorBitSize`-style ISA knob | §3.1 | XS | low |
+
+Items 1–5 are the recommended first batch — none requires JNI changes and each closes a documented operator pain point.
+
+---
+
+## 6. Recommended next action
+
+Implement items 1, 3, 4, and 5 — the recommended first batch minus item 2, which is deferred as a follow-up — in one focused "operator-facing ergonomics" commit:
+
+- UTF-8 boundary-safe streaming decoder (genuine correctness fix)
+- Per-run timing line (cheap operator signal)
+- One `jbang`-runnable example file
+- README system-properties table
+
+Estimated total: ~1–2 days of work, zero JNI changes, all backed by Java-only tests. Items 2 and 6–8 are good follow-ups once a real user asks.
diff --git a/docs/history/llama-cpp-breaking-changes.md b/docs/history/llama-cpp-breaking-changes.md
index 473dbc71..b55636d1 100644
--- a/docs/history/llama-cpp-breaking-changes.md
+++ b/docs/history/llama-cpp-breaking-changes.md
@@ -283,3 +283,23 @@ Used during `llama.cpp` version bumps: when upgrading, scan this file from the r
 | ~b9437–b9442 | `src/llama.cpp` | `llama_prepare_model_devices()` iGPU collection now appends only the FIRST `GGML_BACKEND_DEVICE_TYPE_IGPU` device (prevents duplicate iGPU registration on multi-iGPU hosts). Behavioural fix, single-line caller in `jllama.cpp` unchanged, no project source changes required |
 | ~b9437–b9442 | `tools/ui/embed.cpp` + `tools/ui/src/...` (Svelte) | Webasset embedder tightened printf format specifiers (`%lu` &#x2192; `%zu` and `PRIx64`); UI settings split `custom` into `customJson` + `customCss`; runtime CSS injection via `<svelte:head>`. Project does not ship the upstream UI, no impact |
 | ~b9437–b9442 | `gguf-py/`, `conversion/` (Python) | New `_set_vocab_whitespace()` helper and `add_normalizer_lowercase()` GGUF writer for the new whitespace tokenizer + lowercase normalizer keys (mirrors the vocab additions above); jina-v2 Roberta-tokenizer path now branches to whitespace when `tokenizer.json` declares a `Whitespace` pre-tokenizer. Python-side only, no impact on the Java/JNI build |
+| ~b9442–b9444 | `.github/workflows/build-cpu.yml` (upstream CI) | Upstream's CPU-build CI trigger paths narrowed to `**/*.h`, `**/*.hpp`, `**/*.c`, `**/*.cpp` (dropped `**/*.cu`, `**/*.cuh`, `**/*.swift`, `**/*.m`, `**/*.metal`, `**/*.comp`, `**/*.glsl`, `**/*.wgsl`) so GPU/Metal/Vulkan/WebGPU/Swift source edits no longer trigger the CPU build. Upstream-only CI plumbing; this project consumes none of upstream's workflow files and has its own `publish.yml`, no impact |
+| ~b9442–b9444 | `tools/server/server-http.cpp` | `If-None-Match` conditional-GET handling now also accepts the weak ETag form `W/"..."` (previously only a byte-for-byte match against the strong ETag was accepted); `304 Not Modified` is returned for either form. This is the standalone `llama-server` HTTP tool, which is not linked into the JNI build (`libllama` + `libcommon` only); no project source changes required and no new Java API surface to expose |
+| ~b9444–b9490 | `common/common.cpp` | `common_prompt_batch_decode()` signature changed: new `int n_new` parameter added between `all_tokens` and `n_past`. Callers must pass the count of newly-decoded tokens for the batch. Only called inside upstream `tools/server/server-context.cpp` (compiled directly into jllama); no project source changes required &mdash; the new signature flows through transparently |
+| ~b9444–b9490 | `include/llama.h` | `llama_set_warmup()` deprecated via `LLAMA_DEPRECATED` macro (warmup is now handled internally during model load + first decode). Not called from `jllama.cpp` or any project source &mdash; absorbed inside upstream-compiled code, no project changes required. If a future jllama feature needs explicit warmup control, it must not call this deprecated function and should use its upstream replacement instead |
+| ~b9444–b9490 | `include/llama.h` + `src/llama-context.cpp` | New `llama_context_params::n_outputs_max` field (default `-1` = derived from `n_batch`). Limits the number of output slots allocated per context; useful for low-memory setups that always request `logits_all=false`. Not exposed by project today &mdash; consider adding `ModelParameters.setMaxOutputs(int)` if a user requests fine-grained control. Tracked under Open TODOs |
+| ~b9444–b9490 | `common/arg.cpp` + `common/common.cpp` | `common_params_handle_models()` no longer sets `hf_opts.download_mmproj = true` unconditionally; instead uses `opts.download_mmproj = !params.no_mmproj` so the new `--no-mmproj` flag suppresses the multimodal projector download. Not called from project source &mdash; arg parsing happens upstream, no project changes required |
+| ~b9444–b9490 | `common/sampling.h` + `common/sampling.cpp` | New `common_sampler_reasoning_budget_force(common_sampler *)` API that triggers the budget sampler to inject the end-of-thinking token on the next sample. Paired with new `common_params_sampling::reasoning_control` bool: when set, arms the budget sampler so external code (e.g. a server control endpoint) can end reasoning at runtime. Not used by project today &mdash; would pair with a future `InferenceParameters.setReasoningControl(boolean)` setter and a `LlamaModel.endReasoning(...)` helper. Tracked under Open TODOs |
+| ~b9444–b9490 | `common/common.h` + `common/arg.cpp` | New `common_params::sse_ping_interval` (int32, env `LLAMA_ARG_SSE_PING_INTERVAL`, CLI `--sse-ping-interval`); server emits SSE keep-alive comments at this interval. Server-only; project does not run the upstream HTTP server (uses a direct in-process API), no Java setter required |
+| ~b9444–b9490 | `tools/server/server-http.cpp` | New `POST /v1/chat/completions/control` endpoint accepting `{"id": "...", "action": "reasoning_end"}` &mdash; tells a streaming completion to wrap up reasoning early. Server-only; not linked into the JNI build (`libllama` + `libcommon` only), no project source changes required. If exposed in Java, would map to a new `LlamaModel.endReasoning(String taskId)` method that calls `common_sampler_reasoning_budget_force` on the slot's sampler. Tracked under Open TODOs |
+| ~b9444–b9490 | `src/llama-hparams.h` + `src/llama-model.cpp` | Internal renames: `hparams::recurrent_layer_arr` &#x2192; `hparams::is_recr_impl`; `hparams::swa_layers` &#x2192; `hparams::is_swa_impl`. Internal helper fields not part of the public API; not referenced by `jllama.cpp` or any project source, no changes required |
+| ~b9444–b9490 | `src/llama-arch.h` + `src/llama-arch.cpp` + `gguf-py/` | New `LLM_KV_HIDDEN_ACT` GGUF key (`%s.hidden_act`) for ModernBert SwiGLU/GeGLU activation selection; new `LLM_KV_ATTENTION_RECURRENT_LAYERS` key for hybrid (recurrent + attention) models. Additive vocabulary keys consumed automatically when loading a GGUF that sets them; no project source or Java API changes required |
+| ~b9444–b9490 | `src/llama-arch.h` + `src/models/*.cpp` (new) | New model architectures: `LLM_ARCH_MELLUM` (JetBrains code-completion), `LLM_ARCH_EXAONE4_5` (LG AI multimodal), `LLM_ARCH_STEP3P7` (StepFun Step-3.7 with MTP support); `LLM_ARCH_QWEN3NEXT`/`LLM_ARCH_QWEN35`/`LLM_ARCH_QWEN35MOE` removed from `llama_model_saver_supports_arch()` allowlist. New tokenizer pre-types: `LLAMA_VOCAB_PRE_TYPE_GRANITE_EMB_MULTI = 54`, `LLAMA_VOCAB_PRE_TYPE_MELLUM2 = 55`. All additive at the architecture level &mdash; consumed automatically when loading a matching GGUF, no project source or Java API changes required |
+| ~b9444–b9490 | `common/arg.cpp` | New `--mtp` / `--no-mtp` flags (env `LLAMA_ARG_MTP`) now apply to Step-3.5 in addition to the existing Qwen3.5 coverage. Multi-Token Prediction is consumed inside upstream-compiled server TUs; project does not expose an MTP setter today (would map to `ModelParameters.setMtp(boolean)`). Tracked under Open TODOs if a user requests it |
+| ~b9444–b9490 | upstream build / verification | Local build with `GIT_TAG b9490` was verified clean: `cmake -B build` configures cleanly; `cmake --build build --config Release -j$(nproc)` links `libjllama.so` with zero warnings on `jllama.cpp` or any project translation unit. All breaking changes in this range are absorbed inside upstream-compiled translation units (`common.cpp`, `arg.cpp`, `llama.cpp`, `server-*.cpp`, `download.cpp`); no project source edits required for the version bump itself |
+| ~b9490–b9495 | `include/llama.h` + `src/llama-ext.h` + `src/llama-context.{h,cpp}` + `src/llama-cparams.h` + `src/llama-graph.{h,cpp}` + `common/speculative.{h,cpp}` + `src/models/{qwen35,qwen35moe,step35}.cpp` | Mass terminology rename: `pre_norm` &#x2192; `nextn` everywhere the pre-final-norm hidden state is referenced. Affects the public API: `llama_set_embeddings_pre_norm()` &#x2192; `llama_set_embeddings_nextn()`, `llama_get_embeddings_pre_norm()` &#x2192; `llama_get_embeddings_nextn()`, `llama_get_embeddings_pre_norm_ith()` &#x2192; `llama_get_embeddings_nextn_ith()`. Internal: `cparams.embeddings_pre_norm` &#x2192; `cparams.embeddings_nextn`, `cparams.embeddings_pre_norm_masked` &#x2192; `cparams.embeddings_nextn_masked`, `llm_graph_result::t_h_pre_norm` &#x2192; `t_h_nextn`, `common_speculative_need_embd_pre_norm()` &#x2192; `common_speculative_need_embd_nextn()`. Qwen3.5 / Qwen3.5-MoE / Step-3.5 model graphs moved the final norm before extracting `t_h_nextn` (was after extracting the pre-norm hidden state). Project does not call any of these MTP-specific APIs directly &mdash; all references stay inside upstream-compiled translation units (`speculative.cpp`, `llama-context.cpp`, `server-context.cpp`, model TUs). Verified by grep across `src/main/cpp/*.{cpp,hpp}`: zero matches for any `pre_norm` / `nextn` / `embeddings_pre_norm*` / `t_h_pre_norm*` symbol. No project source changes required |
+| ~b9490–b9495 | `ggml/src/ggml-cuda/common.cuh` + 10 CUDA kernel files | New `GGML_CUDA_RESTRICT` macro replaces `__restrict__` on kernel parameter pointers. PDL (Programmatic Dependent Launch) on Hopper requires `__restrict__` to be disabled per [llama.cpp PR #24030](https://github.com/ggml-org/llama.cpp/pull/24030); the macro expands to nothing under `GGML_CUDA_USE_PDL && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER`, otherwise to `__restrict__`. Kernel signatures change from direct `T * __restrict__ x` parameters to `T * x_ptr` parameter + an internal `T * GGML_CUDA_RESTRICT x = x_ptr;` alias line; `GGML_UNUSED_VARS` calls in fallback branches updated to reference the `_ptr` names. Internal CUDA backend change; project does not compile any CUDA kernels in the JNI build (CUDA build uses upstream sources unchanged via FetchContent). No project source changes required |
+| ~b9490–b9495 | `src/llama-arch.{h,cpp}` + `src/llama-vocab.{h,cpp}` + `gguf-py/gguf/constants.py` + `gguf-py/gguf/gguf_writer.py` | New `LLM_KV_TOKENIZER_SUPPRESS_TOKENS` GGUF key (`tokenizer.ggml.suppress_tokens`). When a GGUF declares this array, the loader stores it on `llama_vocab::impl::suppress_tokens` and exposes it via new `llama_vocab::get_suppress_tokens()` accessor. The Gemma4 model graph (`src/models/gemma4.cpp`) reads this list and appends a `-INFINITY` logit bias to those token IDs at the end of the forward graph (new `llm_graph_input_logits_bias` class). Additive: existing models without the key produce an empty `suppress_tokens` vector and the bias-add branch is skipped. Mirrors a HuggingFace transformers `suppress_tokens` parameter; specifically used for Gemma4 Unified to prevent the model from emitting `<image|>` / `<audio|>` placeholder tokens. No project source or Java API changes required &mdash; the bias is applied automatically when a Gemma4U GGUF is loaded |
+| ~b9490–b9495 | `gguf-py/gguf/constants.py` + `gguf-py/gguf/tensor_mapping.py` + `tools/mtmd/clip-impl.h` + `tools/mtmd/clip-model.h` + `tools/mtmd/clip.cpp` + new `tools/mtmd/models/gemma4uv.cpp` + new `tools/mtmd/models/gemma4ua.cpp` + `tools/mtmd/mtmd-audio.{h,cpp}` + `tools/mtmd/mtmd.cpp` + `conversion/__init__.py` + `conversion/gemma.py` | New Gemma4 Unified vision + audio variant (`Gemma4UnifiedForConditionalGeneration`). Adds new projector types `PROJECTOR_TYPE_GEMMA4UV` and `PROJECTOR_TYPE_GEMMA4UA` (vision uses bigger patch size with token merging done on the conv layer; audio is encoder-free, raw 16 kHz waveform chunked into 640-sample frames). New `V_ENC_EMBD_PATCH_NORM` tensor enum (`v.patch_norm.{bid}`) and 3 indexed `patch_norm_{1,2,3}_{w,b}` weights on `clip_model` (Gemma4U uses standard PyTorch LayerNorm rather than RMSNorm before/after the patch embedding). New `mtmd_audio_preprocessor_gemma4ua` mel-major waveform packer (40 ms / 16 kHz frames; no FFT, no filterbank). Multimodal additions are routed through upstream `mtmd-cli` / `mtmd-debug` binaries that the project does not link; the JNI build links `libllama` + `libcommon` only. Additive at the GGUF / projector loader level: existing GGUFs without these projector types continue to load through the previous code paths. No project source or Java API changes required |
+| ~b9490–b9495 | `tools/ui/` (`package.json`, `src/lib/components/app/content/MarkdownContent/`, new `MermaidPreview.svelte`, new `DialogMermaidPreview.svelte`, new constants / icons / rehype plugins) | Upstream `llama-server` web UI gains Mermaid diagram rendering: new `mermaid@^11.15` dependency, lazy-loaded; new rehype plugin chain (`rehype-mermaid-pre`, `rehype-enhance-mermaid-blocks`) converts ` ```mermaid ` code fences to `<pre class="mermaid">` and wraps them with copy / preview action buttons; the existing single-file `MarkdownContent.svelte` is split into a `.svelte` + sibling `.css` / `markdown-utils.ts` / `markdown-handlers.ts` so the new mermaid renderer can share helpers. Project does not compile or ship the upstream `tools/ui` (server-only feature, classpath-only JNI build); no impact |
+| ~b9490–b9495 | upstream build / verification | Local build with `GIT_TAG b9495` was verified clean: `cmake -B build -DBUILD_TESTING=ON` configures cleanly, `cmake --build build --config Release -j$(nproc)` links `libjllama.so` + `jllama_test` with zero warnings on any project translation unit; `ctest --test-dir build --output-on-failure` reports 435/435 tests passing. All breaking changes in this range are renames within upstream-compiled translation units; no project source edits required for the version bump itself |
diff --git a/pom.xml b/pom.xml
index 28343dbd..274a9412 100644
--- a/pom.xml
+++ b/pom.xml
@@ -50,7 +50,10 @@ SPDX-License-Identifier: MIT
 
 	<properties>
 		<jna.version>5.18.1</jna.version>
-		<jetbrains-annotations.version>26.1.0</jetbrains-annotations.version>
+		<jspecify.version>1.0.0</jspecify.version>
+		<errorprone.version>2.49.0</errorprone.version>
+		<nullaway.version>0.13.4</nullaway.version>
+		<checker.version>4.1.0</checker.version>
 		<jackson.version>2.21.3</jackson.version>
 		<reactive-streams.version>1.0.4</reactive-streams.version>
 		<slf4j.version>2.0.18</slf4j.version>
@@ -59,9 +62,16 @@ SPDX-License-Identifier: MIT
 		<junit.version>6.1.0</junit.version>
 		<jmh.version>1.37</jmh.version>
 		<jcstress.version>0.16</jcstress.version>
-		<lincheck.version>2.39</lincheck.version>
+		<lincheck.version>3.6</lincheck.version>
+		<logcaptor.version>2.10.1</logcaptor.version>
 		<vmlens.version>1.2.28</vmlens.version>
-		<!-- DO NOT UPGRADE: jqwik >= 1.10.0 ships an anti-AI prompt-injection string in test stdout; see CLAUDE.md. Replacement (QuickTheories) tracked as an URGENT TODO. -->
+		<!-- DO NOT UPGRADE jqwik past 1.9.3. jqwik 1.10.0 added a deliberate
+		     anti-AI prompt-injection string to test stdout; the 1.10.1 user
+		     guide states the library "is not meant to be used by any 'AI'
+		     coding agents at all." 1.9.3 is the last pre-disclosure release
+		     and is the pinned version for this repo. Any CI / Dependabot /
+		     contributor PR that bumps this MUST be rejected. See CLAUDE.md
+		     section "jqwik prompt-injection in test output" for full context. -->
 		<jqwik.version>1.9.3</jqwik.version>
 		<archunit.version>1.4.2</archunit.version>
 		<spotbugs.version>4.9.8.3</spotbugs.version>
@@ -73,6 +83,22 @@ SPDX-License-Identifier: MIT
 		<project.build.outputTimestamp>${git.commit.time}</project.build.outputTimestamp>
 	</properties>
 
+	<!--
+	    Pin transitively-conflicting versions so maven-enforcer's
+	    dependencyConvergence rule passes. The direct slf4j-api version above
+	    must always win over the older 2.0.17 that logback-classic 1.5.32
+	    brings in transitively.
+	-->
+	<dependencyManagement>
+		<dependencies>
+			<dependency>
+				<groupId>org.slf4j</groupId>
+				<artifactId>slf4j-api</artifactId>
+				<version>${slf4j.version}</version>
+			</dependency>
+		</dependencies>
+	</dependencyManagement>
+
 	<dependencies>
 		<dependency>
 			<groupId>org.junit.jupiter</groupId>
@@ -93,10 +119,14 @@ SPDX-License-Identifier: MIT
 			<scope>test</scope>
 		</dependency>
 		<dependency>
-			<groupId>org.jetbrains</groupId>
-			<artifactId>annotations</artifactId>
-			<version>${jetbrains-annotations.version}</version>
-			<scope>compile</scope>
+			<groupId>org.jspecify</groupId>
+			<artifactId>jspecify</artifactId>
+			<version>${jspecify.version}</version>
+		</dependency>
+		<dependency>
+			<groupId>org.checkerframework</groupId>
+			<artifactId>checker-qual</artifactId>
+			<version>${checker.version}</version>
 		</dependency>
 		<dependency>
 			<groupId>com.fasterxml.jackson.core</groupId>
@@ -153,11 +183,18 @@ SPDX-License-Identifier: MIT
 			<scope>test</scope>
 		</dependency>
 		<dependency>
-			<groupId>org.jetbrains.kotlinx</groupId>
-			<artifactId>lincheck-jvm</artifactId>
+			<groupId>org.jetbrains.lincheck</groupId>
+			<artifactId>lincheck</artifactId>
 			<version>${lincheck.version}</version>
 			<scope>test</scope>
 		</dependency>
+		<!-- Test-only SLF4J/Logback log-capture for the LoggingSmokeTest. -->
+		<dependency>
+			<groupId>io.github.hakky54</groupId>
+			<artifactId>logcaptor</artifactId>
+			<version>${logcaptor.version}</version>
+			<scope>test</scope>
+		</dependency>
 	</dependencies>
 
 	<build>
@@ -218,6 +255,11 @@ SPDX-License-Identifier: MIT
 					<artifactId>maven-surefire-plugin</artifactId>
 					<version>3.5.5</version>
 				</plugin>
+				<plugin>
+					<groupId>org.apache.maven.plugins</groupId>
+					<artifactId>maven-enforcer-plugin</artifactId>
+					<version>3.6.3</version>
+				</plugin>
 				<plugin>
 					<groupId>org.codehaus.mojo</groupId>
 					<artifactId>exec-maven-plugin</artifactId>
@@ -228,6 +270,11 @@ SPDX-License-Identifier: MIT
 					<artifactId>jacoco-maven-plugin</artifactId>
 					<version>0.8.14</version>
 				</plugin>
+				<plugin>
+					<groupId>org.pitest</groupId>
+					<artifactId>pitest-maven</artifactId>
+					<version>1.25.1</version>
+				</plugin>
 				<plugin>
 					<groupId>org.sonatype.central</groupId>
 					<artifactId>central-publishing-maven-plugin</artifactId>
@@ -236,6 +283,44 @@ SPDX-License-Identifier: MIT
 			</plugins>
 		</pluginManagement>
 		<plugins>
+			<plugin>
+				<groupId>org.apache.maven.plugins</groupId>
+				<artifactId>maven-enforcer-plugin</artifactId>
+				<executions>
+					<execution>
+						<id>enforce</id>
+						<goals>
+							<goal>enforce</goal>
+						</goals>
+						<configuration>
+							<rules>
+								<requireMavenVersion>
+									<version>[3.6.3,)</version>
+								</requireMavenVersion>
+								<requireJavaVersion>
+									<version>[21,)</version><!-- build JDK floor: testRelease=21 in maven-compiler-plugin needs javac 21+ -->
+								</requireJavaVersion>
+								<dependencyConvergence/>
+								<bannedDependencies>
+									<excludes>
+										<!-- Use SLF4J directly; never commons-logging. -->
+										<exclude>commons-logging:commons-logging</exclude>
+										<!-- Legacy / EoL logging frameworks. -->
+										<exclude>log4j:log4j</exclude>
+										<!-- Old hamcrest split artifacts; use hamcrest:hamcrest 2.x+. -->
+										<exclude>org.hamcrest:hamcrest-core</exclude>
+										<exclude>org.hamcrest:hamcrest-library</exclude>
+										<exclude>org.hamcrest:hamcrest-all</exclude>
+										<!-- Legacy JUnit 3 / 4; we use Jupiter (junit:junit-jupiter). -->
+										<exclude>junit:junit</exclude>
+										<exclude>junit:junit-dep</exclude>
+									</excludes>
+								</bannedDependencies>
+							</rules>
+						</configuration>
+					</execution>
+				</executions>
+			</plugin>
 			<plugin>
 				<groupId>io.github.git-commit-id</groupId>
 				<artifactId>git-commit-id-maven-plugin</artifactId>
@@ -259,16 +344,105 @@ SPDX-License-Identifier: MIT
 				<groupId>org.apache.maven.plugins</groupId>
 				<artifactId>maven-compiler-plugin</artifactId>
 				<configuration>
-					<source>1.8</source>
-					<target>1.8</target>
-					<testSource>21</testSource>
-					<testTarget>21</testTarget>
+					<release>8</release>
+					<testRelease>21</testRelease>
+					<showWarnings>true</showWarnings>
+					<parameters>true</parameters>
+					<compilerArgs>
+						<!--
+						  -Xlint:all surfaces every category of javac warning. Excluded:
+						    -serial    : missing-serialVersionUID warnings on Serializable classes
+						    -options   : bootclasspath / target-mismatch chatter under release=8
+						    -classfile : noise when javac reads our own .class output mid-build
+						    -processing: "No processor claimed any of these annotations" — the
+						                 animal-sniffer @IgnoreJRERequirement is consumed by the
+						                 maven-plugin (not a JSR-269 processor), so javac flags
+						                 it every build
+						  -Werror is on: the ElementType.MODULE blocker is gone now that
+						  @NullMarked lives only in module-info.java (compiled at release 9 in
+						  a separate execution); see module-info.java's javadoc.
+						-->
+						<arg>-Xlint:all,-serial,-options,-classfile,-processing</arg>
+						<arg>-Werror</arg>
+						<!--
+						  Checker Framework Nullness Checker runs as a second nullness pass
+						  alongside NullAway. CF is generics-aware and stricter than NullAway,
+						  so it acts as a second-opinion verifier on the same JSpecify
+						  annotations.
+						-->
+						<arg>-processor</arg>
+						<arg>org.checkerframework.checker.nullness.NullnessChecker</arg>
+						<arg>-XDaddTypeAnnotationsToSymbol=true</arg>
+						<arg>-XDcompilePolicy=simple</arg>
+						<arg>--should-stop=ifError=FLOW</arg>
+						<arg>-Xplugin:ErrorProne -Xep:NullAway:ERROR -XepOpt:NullAway:OnlyNullMarked=true -XepOpt:NullAway:JSpecifyMode=true -XepOpt:NullAway:CheckOptionalEmptiness=true -XepOpt:NullAway:AcknowledgeRestrictiveAnnotations=true -XepOpt:NullAway:AcknowledgeAndroidRecent=true -XepOpt:NullAway:AssertsEnabled=true -Xep:BoxedPrimitiveEquality:ERROR -Xep:EqualsHashCode:ERROR -Xep:EqualsIncompatibleType:ERROR -Xep:IdentityBinaryExpression:ERROR -Xep:SelfAssignment:ERROR -Xep:SelfComparison:ERROR -Xep:SelfEquals:ERROR -Xep:DeadException:ERROR -Xep:FormatString:ERROR -Xep:InvalidPatternSyntax:ERROR -Xep:OptionalEquality:ERROR -Xep:ImpossibleNullComparison:ERROR</arg>
+					</compilerArgs>
+					<annotationProcessorPaths>
+						<path>
+							<groupId>com.google.errorprone</groupId>
+							<artifactId>error_prone_core</artifactId>
+							<version>${errorprone.version}</version>
+						</path>
+						<path>
+							<groupId>com.uber.nullaway</groupId>
+							<artifactId>nullaway</artifactId>
+							<version>${nullaway.version}</version>
+						</path>
+						<path>
+							<groupId>org.checkerframework</groupId>
+							<artifactId>checker</artifactId>
+							<version>${checker.version}</version>
+						</path>
+					</annotationProcessorPaths>
 				</configuration>
 				<executions>
+					<execution>
+						<id>default-compile</id>
+						<configuration>
+							<!-- The default compile pass targets Java 8 bytecode for the library
+							     sources. module-info.java cannot be compiled at release 8 (it is a
+							     Java 9+ construct) and is handled by the separate execution below. -->
+							<excludes>
+								<exclude>module-info.java</exclude>
+							</excludes>
+						</configuration>
+					</execution>
+					<execution>
+						<id>module-info-compile</id>
+						<phase>compile</phase>
+						<goals>
+							<goal>compile</goal>
+						</goals>
+						<configuration>
+							<!-- module-info.java is the only Java 9+ source in this project; compile
+							     it at release 9 (the minimum version that understands modules) without
+							     the Error Prone / NullAway / Checker Framework processors that target
+							     ordinary source files only. Java 8 runtimes silently ignore the
+							     resulting module-info.class at the JAR root, so existing classpath
+							     consumers continue to work unchanged. LlamaLoader loads native libs
+							     via Class.getResourceAsStream on its own class object, which resolves
+							     to this module and therefore does not require any opens directives. -->
+							<release>9</release>
+							<includes>
+								<include>module-info.java</include>
+							</includes>
+							<compilerArgs combine.self="override"/>
+							<annotationProcessorPaths combine.self="override"/>
+						</configuration>
+					</execution>
 					<execution>
 						<id>default-testCompile</id>
 						<configuration>
-							<annotationProcessorPaths>
+							<!-- Drop -parameters for test compile so jcstress reflection doesn't
+							     pick up parameter names from MethodParameters. -->
+							<parameters>false</parameters>
+							<compilerArgs combine.self="override">
+								<arg>-XDaddTypeAnnotationsToSymbol=true</arg>
+								<arg>-XDcompilePolicy=simple</arg>
+								<arg>--should-stop=ifError=FLOW</arg>
+								<arg>-Xplugin:ErrorProne -Xep:NullAway:OFF -Xep:GuardedBy:OFF</arg>
+							</compilerArgs>
+							<annotationProcessorPaths combine.children="append">
 								<path>
 									<groupId>org.openjdk.jcstress</groupId>
 									<artifactId>jcstress-core</artifactId>
@@ -431,6 +605,35 @@ SPDX-License-Identifier: MIT
 					<classpathScope>test</classpathScope>
 				</configuration>
 			</plugin>
+			<plugin>
+				<!--
+				  PIT mutation testing is INTENTIONALLY scoped to a single class
+				  (Pair) so that the wiring is exercised on every CI build with a
+				  100% mutation threshold gate, without forcing every other class
+				  up to 100% mutation coverage at once. Expand targetClasses
+				  incrementally as classes reach mutation-test parity (see README
+				  TODO).
+				-->
+				<groupId>org.pitest</groupId>
+				<artifactId>pitest-maven</artifactId>
+				<dependencies>
+					<dependency>
+						<groupId>org.pitest</groupId>
+						<artifactId>pitest-junit5-plugin</artifactId>
+						<version>1.2.3</version>
+					</dependency>
+				</dependencies>
+				<configuration>
+					<targetClasses>
+						<param>net.ladenthin.llama.Pair</param>
+					</targetClasses>
+					<targetTests>
+						<param>net.ladenthin.llama.PairTest</param>
+					</targetTests>
+					<mutationThreshold>100</mutationThreshold>
+					<timeoutConstant>30000</timeoutConstant>
+				</configuration>
+			</plugin>
 		</plugins>
 	</build>
 
@@ -489,6 +692,11 @@ SPDX-License-Identifier: MIT
 								<goal>compile</goal>
 							</goals>
 							<configuration>
+								<!-- Same rationale as default-compile: this pass targets release 8;
+								     module-info.java is Java 9+ and is handled by module-info-compile. -->
+								<excludes>
+									<exclude>module-info.java</exclude>
+								</excludes>
 								<compilerArgs>
 									<arg>-h</arg>
 									<arg>src/main/cpp</arg>
@@ -565,6 +773,11 @@ SPDX-License-Identifier: MIT
 								<goal>compile</goal>
 							</goals>
 							<configuration>
+								<!-- Same rationale as default-compile: this pass targets release 8;
+								     module-info.java is Java 9+ and is handled by module-info-compile. -->
+								<excludes>
+									<exclude>module-info.java</exclude>
+								</excludes>
 								<compilerArgs>
 									<arg>-h</arg>
 									<arg>src/main/cpp</arg>
diff --git a/src/main/java/module-info.java b/src/main/java/module-info.java
new file mode 100644
index 00000000..b899ea65
--- /dev/null
+++ b/src/main/java/module-info.java
@@ -0,0 +1,47 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+// SPDX-FileCopyrightText: 2023-2025 Konstantin Herud
+//
+// SPDX-License-Identifier: MIT
+
+/**
+ * JPMS module descriptor for the java-llama.cpp JNI bindings.
+ *
+ * <p>Exports the three hand-written public packages
+ * ({@code net.ladenthin.llama}, {@code net.ladenthin.llama.args},
+ * {@code net.ladenthin.llama.json}). The native libraries shipped under
+ * {@code /net/ladenthin/llama/{OS}/{ARCH}/} are loaded by
+ * {@link net.ladenthin.llama.LlamaLoader} via
+ * {@link Class#getResourceAsStream(String)} on its own class object, so the resources
+ * are looked up in this module and do <em>not</em> need to be {@code opens}'d.</p>
+ *
+ * <p>JSpecify {@code @NullMarked} is declared at the module level here so that no source
+ * file compiled at {@code --release 8} references the JSpecify annotation type directly.
+ * Otherwise javac would emit an unsuppressible {@code unknown enum constant
+ * ElementType.MODULE} classfile-read warning for each source compiled at release 8 that
+ * resolves {@code @NullMarked} ({@code @NullMarked} carries
+ * {@code @Target({MODULE, PACKAGE, TYPE})} and Java 8 does not know about
+ * {@code ElementType.MODULE}). Confining the reference to {@code module-info.java} —
+ * which compiles at {@code --release 9} — keeps that warning out of the build entirely.</p>
+ *
+ * <p>{@code requires static org.jspecify} is needed only at compile time of this
+ * descriptor; JSpecify annotations carry {@code RetentionPolicy.CLASS} so module-path
+ * consumers never need jspecify on their runtime path. Checker Framework qualifiers and
+ * the Codehaus animal-sniffer annotation are likewise compile-time only. Jackson, SLF4J,
+ * and Reactive Streams API are referenced from ordinary sources only; javac in the
+ * separate {@code module-info-compile} execution compiles {@code module-info.java} in
+ * isolation and therefore does not need their module names. NOTE(review): without matching
+ * {@code requires} clauses this module cannot read those dependencies when it runs on the
+ * module path; confirm module-path support. Consumers on the classpath are unaffected.</p>
+ *
+ * <p>This descriptor compiles at {@code --release 9}; the rest of the source compiles
+ * at {@code --release 8}. Java 8 runtimes silently ignore {@code module-info.class} at
+ * the JAR root.</p>
+ */
+@org.jspecify.annotations.NullMarked
+module net.ladenthin.llama {
+    requires static org.jspecify;
+
+    exports net.ladenthin.llama;
+    exports net.ladenthin.llama.args;
+    exports net.ladenthin.llama.json;
+}
diff --git a/src/main/java/net/ladenthin/llama/ChatMessage.java b/src/main/java/net/ladenthin/llama/ChatMessage.java
index 319e65ce..c581c034 100644
--- a/src/main/java/net/ladenthin/llama/ChatMessage.java
+++ b/src/main/java/net/ladenthin/llama/ChatMessage.java
@@ -7,6 +7,8 @@
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
+import java.util.Optional;
+import org.jspecify.annotations.Nullable;
 
 /**
  * A single message in a chat conversation: a role ({@code "user"}, {@code "assistant"},
@@ -30,9 +32,9 @@ public final class ChatMessage {
 
     private final String role;
     private final String content;
-    private final String toolCallId;
+    private final @Nullable String toolCallId;
     private final List<ToolCall> toolCalls;
-    private final List<ContentPart> parts;
+    private final @Nullable List<ContentPart> parts;
 
     /**
      * Plain user/assistant/system message.
@@ -52,7 +54,7 @@ public ChatMessage(String role, String content) {
      * @param toolCallId for tool-result turns ({@code role="tool"}), the id of the originating call; {@code null} otherwise
      * @param toolCalls  for assistant tool-call turns, the list of calls; empty otherwise
      */
-    public ChatMessage(String role, String content, String toolCallId, List<ToolCall> toolCalls) {
+    public ChatMessage(String role, String content, @Nullable String toolCallId, List<ToolCall> toolCalls) {
         this(role, content, toolCallId, toolCalls, null);
     }
 
@@ -75,7 +77,11 @@ public ChatMessage(String role, List<ContentPart> parts) {
     }
 
     private ChatMessage(
-            String role, String content, String toolCallId, List<ToolCall> toolCalls, List<ContentPart> parts) {
+            String role,
+            String content,
+            @Nullable String toolCallId,
+            List<ToolCall> toolCalls,
+            @Nullable List<ContentPart> parts) {
         this.role = role;
         this.content = content;
         this.toolCallId = toolCallId;
@@ -155,7 +161,7 @@ public String getContent() {
      * Tool-call id for tool-result turns.
      * @return the originating tool call id, or {@code null} for non-tool messages
      */
-    public String getToolCallId() {
+    public @Nullable String getToolCallId() {
         return toolCallId;
     }
 
@@ -169,17 +175,17 @@ public List<ToolCall> getToolCalls() {
 
     /**
      * Multimodal content parts accessor.
-     * @return an unmodifiable list of text and image parts, or {@code null} for
-     *         legacy text-only messages built via {@link #ChatMessage(String, String)}
+     * @return an {@link Optional} holding the unmodifiable list of text and image parts; empty
+     *         for legacy text-only messages built via {@link #ChatMessage(String, String)}
      */
-    public List<ContentPart> getParts() {
-        return parts == null ? null : Collections.unmodifiableList(parts);
+    public Optional<List<ContentPart>> getParts() {
+        return parts == null ? Optional.empty() : Optional.of(Collections.unmodifiableList(parts));
     }
 
     /**
      * Whether this message carries multimodal parts (i.e. was constructed via
      * {@link #ChatMessage(String, List)} or {@link #userMultimodal(ContentPart...)}).
-     * @return {@code true} when {@link #getParts()} is non-null
+     * @return {@code true} when {@link #getParts()} returns a present {@code Optional}
      */
     public boolean hasParts() {
         return parts != null;
diff --git a/src/main/java/net/ladenthin/llama/ChatRequest.java b/src/main/java/net/ladenthin/llama/ChatRequest.java
index c40779bd..c7e9622e 100644
--- a/src/main/java/net/ladenthin/llama/ChatRequest.java
+++ b/src/main/java/net/ladenthin/llama/ChatRequest.java
@@ -10,7 +10,9 @@
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
+import java.util.Optional;
 import java.util.function.Consumer;
+import org.jspecify.annotations.Nullable;
 
 /**
  * Builder for a typed chat completion call.
@@ -28,9 +30,9 @@ public final class ChatRequest {
 
     private final List<ChatMessage> messages = new ArrayList<ChatMessage>();
     private final List<ToolDefinition> tools = new ArrayList<ToolDefinition>();
-    private String toolChoice;
+    private @Nullable String toolChoice;
     private int maxToolRounds = 8;
-    private Consumer<InferenceParameters> paramsCustomizer;
+    private @Nullable Consumer<InferenceParameters> paramsCustomizer;
 
     /** Construct an empty request; populate via the setters. */
     public ChatRequest() {
@@ -75,7 +77,7 @@ public ChatRequest addTool(ToolDefinition tool) {
      * @param toolChoice the hint string, or {@code null} to clear
      * @return this builder
      */
-    public ChatRequest setToolChoice(String toolChoice) {
+    public ChatRequest setToolChoice(@Nullable String toolChoice) {
         this.toolChoice = toolChoice;
         return this;
     }
@@ -103,7 +105,7 @@ public ChatRequest setMaxToolRounds(int maxToolRounds) {
      * @param customizer the customizer; {@code null} clears any prior customizer
      * @return this builder
      */
-    public ChatRequest setInferenceCustomizer(Consumer<InferenceParameters> customizer) {
+    public ChatRequest setInferenceCustomizer(@Nullable Consumer<InferenceParameters> customizer) {
         this.paramsCustomizer = customizer;
         return this;
     }
@@ -128,7 +130,7 @@ public List<ToolDefinition> getTools() {
      * Tool choice accessor.
      * @return the {@code tool_choice} hint, or {@code null} when unset
      */
-    public String getToolChoice() {
+    public @Nullable String getToolChoice() {
         return toolChoice;
     }
 
@@ -153,8 +155,9 @@ public String buildMessagesJson() {
             ObjectNode obj = MAPPER.createObjectNode();
             obj.put("role", m.getRole());
             obj.put("content", m.getContent() == null ? "" : m.getContent());
-            if (m.getToolCallId() != null) {
-                obj.put("tool_call_id", m.getToolCallId());
+            final String toolCallId = m.getToolCallId();
+            if (toolCallId != null) {
+                obj.put("tool_call_id", toolCallId);
             }
             if (!m.getToolCalls().isEmpty()) {
                 ArrayNode tc = MAPPER.createArrayNode();
@@ -176,13 +179,12 @@ public String buildMessagesJson() {
     }
 
     /**
-     * Build the OAI-style {@code tools} array as a JSON string. Returns {@code null}
-     * when no tools were added.
+     * Build the OAI-style {@code tools} array as a JSON string.
      *
-     * @return the JSON array as a string, or {@code null} when there are no tools
+     * @return the JSON array as a string, or {@link Optional#empty()} when no tools were added
      */
-    public String buildToolsJson() {
-        if (tools.isEmpty()) return null;
+    public Optional<String> buildToolsJson() {
+        if (tools.isEmpty()) return Optional.empty();
         ArrayNode arr = MAPPER.createArrayNode();
         for (ToolDefinition t : tools) {
             ObjectNode entry = MAPPER.createObjectNode();
@@ -198,7 +200,7 @@ public String buildToolsJson() {
             entry.set("function", fn);
             arr.add(entry);
         }
-        return arr.toString();
+        return Optional.of(arr.toString());
     }
 
     /**
diff --git a/src/main/java/net/ladenthin/llama/ChatResponse.java b/src/main/java/net/ladenthin/llama/ChatResponse.java
index 65054220..23fe5eab 100644
--- a/src/main/java/net/ladenthin/llama/ChatResponse.java
+++ b/src/main/java/net/ladenthin/llama/ChatResponse.java
@@ -6,6 +6,7 @@
 
 import java.util.Collections;
 import java.util.List;
+import java.util.Optional;
 
 /**
  * Typed result of {@link LlamaModel#chat(ChatRequest)} and
@@ -59,10 +60,10 @@ public List<ChatChoice> getChoices() {
 
     /**
      * Convenience accessor for the first assistant message.
-     * @return the first choice's message, or {@code null} when there are no choices
+     * @return the first choice's message, or {@link Optional#empty()} when there are no choices
      */
-    public ChatMessage getFirstMessage() {
-        return choices.isEmpty() ? null : choices.get(0).getMessage();
+    public Optional<ChatMessage> getFirstMessage() {
+        return choices.isEmpty() ? Optional.empty() : Optional.of(choices.get(0).getMessage());
     }
 
     /**
@@ -70,8 +71,7 @@ public ChatMessage getFirstMessage() {
      * @return the first choice's message content, or {@code ""} when there are no choices
      */
     public String getFirstContent() {
-        ChatMessage m = getFirstMessage();
-        return m == null ? "" : m.getContent();
+        return getFirstMessage().map(ChatMessage::getContent).orElse("");
     }
 
     /**
diff --git a/src/main/java/net/ladenthin/llama/CliParameters.java b/src/main/java/net/ladenthin/llama/CliParameters.java
index c9374a8b..9904848b 100644
--- a/src/main/java/net/ladenthin/llama/CliParameters.java
+++ b/src/main/java/net/ladenthin/llama/CliParameters.java
@@ -10,7 +10,7 @@
 import java.util.List;
 import java.util.Map;
 import net.ladenthin.llama.args.CliArg;
-import org.jetbrains.annotations.Nullable;
+import org.jspecify.annotations.Nullable;
 
 abstract class CliParameters {
 
@@ -28,7 +28,9 @@ abstract class CliParameters {
      * @param <T>   the concrete subtype of this builder
      * @return this builder
      */
-    @SuppressWarnings("unchecked")
+    // Self-typing builder idiom: caller fixes T to its concrete subtype so chained
+    // calls return the concrete builder, not CliParameters.
+    @SuppressWarnings({"unchecked", "TypeParameterUnusedInFormals"})
     protected final <T extends CliParameters> T putScalar(String key, Object value) {
         parameters.put(key, String.valueOf(value));
         return (T) this;
@@ -43,7 +45,8 @@ protected final <T extends CliParameters> T putScalar(String key, Object value)
      * @param <T>   the concrete subtype of this builder
      * @return this builder
      */
-    @SuppressWarnings("unchecked")
+    // Self-typing builder idiom — see putScalar above.
+    @SuppressWarnings({"unchecked", "TypeParameterUnusedInFormals"})
     protected final <T extends CliParameters> T putEnum(String key, CliArg value) {
         parameters.put(key, value.getArgValue());
         return (T) this;
diff --git a/src/main/java/net/ladenthin/llama/ContentPart.java b/src/main/java/net/ladenthin/llama/ContentPart.java
index ba6ee49a..a7689b35 100644
--- a/src/main/java/net/ladenthin/llama/ContentPart.java
+++ b/src/main/java/net/ladenthin/llama/ContentPart.java
@@ -10,6 +10,7 @@
 import java.util.Base64;
 import java.util.Locale;
 import java.util.Objects;
+import org.jspecify.annotations.Nullable;
 
 /**
  * One piece of a {@link ChatMessage}'s multimodal content array: either a text
@@ -42,10 +43,10 @@ public enum Type {
     }
 
     private final Type type;
-    private final String text;
-    private final String imageUrl;
+    private final @Nullable String text;
+    private final @Nullable String imageUrl;
 
-    private ContentPart(Type type, String text, String imageUrl) {
+    private ContentPart(Type type, @Nullable String text, @Nullable String imageUrl) {
         this.type = type;
         this.text = text;
         this.imageUrl = imageUrl;
@@ -139,7 +140,7 @@ public Type getType() {
      * Text accessor (only set for {@link Type#TEXT}).
      * @return the text fragment, or {@code null} for {@link Type#IMAGE_URL} parts
      */
-    public String getText() {
+    public @Nullable String getText() {
         return text;
     }
 
@@ -147,7 +148,7 @@ public String getText() {
      * Image URL accessor (only set for {@link Type#IMAGE_URL}).
      * @return the URL or data URI, or {@code null} for {@link Type#TEXT} parts
      */
-    public String getImageUrl() {
+    public @Nullable String getImageUrl() {
         return imageUrl;
     }
 }
diff --git a/src/main/java/net/ladenthin/llama/InferenceParameters.java b/src/main/java/net/ladenthin/llama/InferenceParameters.java
index e5bdae9e..b73fba76 100644
--- a/src/main/java/net/ladenthin/llama/InferenceParameters.java
+++ b/src/main/java/net/ladenthin/llama/InferenceParameters.java
@@ -10,6 +10,7 @@
 import java.util.Map;
 import net.ladenthin.llama.args.ContinuationMode;
 import net.ladenthin.llama.args.MiroStat;
+import org.jspecify.annotations.Nullable;
 import net.ladenthin.llama.args.ReasoningFormat;
 import net.ladenthin.llama.args.Sampler;
 
@@ -253,6 +254,10 @@ public InferenceParameters setPresencePenalty(float presencePenalty) {
      * @param mirostat the MiroStat sampling strategy
      * @return this builder
      */
+    // .ordinal() is intentional here: the llama.cpp server expects the integer
+    // ordinal of the MiroStat enum (0 = OFF, 1 = V1, 2 = V2) on the wire. The
+    // declared order of MiroStat.values() matches the upstream contract.
+    @SuppressWarnings("EnumOrdinal")
     public InferenceParameters setMiroStat(MiroStat mirostat) {
         return putScalar(PARAM_MIROSTAT, mirostat.ordinal());
     }
@@ -559,7 +564,7 @@ public InferenceParameters setChatTemplateKwargs(java.util.Map<String, String> k
      * @param messages a list of user/assistant message pairs (role as key, content as value)
      * @return this builder
      */
-    public InferenceParameters setMessages(String systemMessage, List<Pair<String, String>> messages) {
+    public InferenceParameters setMessages(@Nullable String systemMessage, List<Pair<String, String>> messages) {
         parameters.put(
                 PARAM_MESSAGES,
                 serializer.buildMessages(systemMessage, messages).toString());
@@ -571,7 +576,7 @@ public InferenceParameters setMessages(String systemMessage, List<Pair<String, S
      * with non-null {@link ChatMessage#getParts()} are serialized as OAI array-form
      * {@code content} (text + image_url parts). Plain text messages emit the legacy
      * string-form {@code content}, so this overload is also a drop-in replacement
-     * for the {@code List&lt;Pair&gt;} variant when callers prefer the typed
+     * for the {@code List<Pair>} variant when callers prefer the typed
      * {@link ChatMessage} surface.
      * <p>
      * Image parts require the model to have a multimodal projector loaded via
diff --git a/src/main/java/net/ladenthin/llama/Java8CompatibilityHelper.java b/src/main/java/net/ladenthin/llama/Java8CompatibilityHelper.java
new file mode 100644
index 00000000..3062d704
--- /dev/null
+++ b/src/main/java/net/ladenthin/llama/Java8CompatibilityHelper.java
@@ -0,0 +1,140 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+package net.ladenthin.llama;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+/**
+ * Wrapper methods for Java 9+ APIs to provide Java 1.8 compatibility.
+ * This class centralizes all compatibility layer logic and can be mocked for testing.
+ *
+ * <p>Mirrors the pattern used by the sister repo's
+ * {@code net.ladenthin.maven.llamacpp.aiindex.Java8CompatibilityHelper}: each consuming
+ * class declares an instance field
+ * {@code private final Java8CompatibilityHelper compatibilityHelper = new Java8CompatibilityHelper();}
+ * and routes Java 9+ idioms through it. The build's {@code --release 8} compiler arg
+ * (see {@code pom.xml}) prevents accidental direct use of post-8 APIs in production code.
+ */
+public class Java8CompatibilityHelper {
+
+    /** Creates a new {@link Java8CompatibilityHelper}; the helper holds no state. */
+    public Java8CompatibilityHelper() {
+        // no-op: this class declares no fields, so there is nothing to initialize
+    }
+
+    /**
+     * Wrapper for {@code String#isBlank()} (Java 11+).
+     * Returns {@code true} if the string is empty or consists solely of code points that
+     * {@link Character#isWhitespace(int)} accepts (the JDK definition), else {@code false}.
+     *
+     * @param str the string to check; must not be {@code null}
+     * @return {@code true} if the string is empty or blank, {@code false} otherwise
+     * @throws NullPointerException if {@code str} is {@code null}
+     */
+    public boolean isBlank(final String str) {
+        return str.codePoints().allMatch(Character::isWhitespace);
+    }
+
+    /**
+     * Wrapper for {@code String#formatted(Object...)} (Java 15+).
+     * Delegates to {@link String#format(String, Object...)}, so the default format locale applies.
+     *
+     * @param format the format string
+     * @param args   the arguments referenced by the format specifiers in the format string
+     * @return a formatted string
+     */
+    // Not annotated @FormatMethod because callers may pass a runtime format string;
+    // marking this @FormatMethod would propagate FormatStringAnnotation to every caller.
+    @SuppressWarnings("AnnotateFormatMethod")
+    public String formatted(final String format, final Object... args) {
+        return String.format(format, args);
+    }
+
+    /**
+     * Wrapper for {@code Files#readString(Path)} (Java 11+): reads the whole file as UTF-8.
+     * Unlike the JDK method, malformed bytes are replaced (U+FFFD) rather than reported as an error.
+     *
+     * @param path the path to the file to read
+     * @return the file content as a string
+     * @throws IOException if an I/O error occurs reading from the file
+     */
+    public String readString(final Path path) throws IOException {
+        return new String(Files.readAllBytes(path), StandardCharsets.UTF_8);
+    }
+
+    /**
+     * Wrapper for {@code Files#writeString(Path, CharSequence, Charset)} (Java 11+).
+     * Writes a string to a file using the specified charset; the file is created or truncated.
+     *
+     * @param path    the path to the file to write
+     * @param content the string content to write
+     * @param charset the charset to encode the content with; defaults to UTF-8 if {@code null}
+     * @throws IOException if an I/O error occurs writing to the file
+     */
+    public void writeString(final Path path, final String content, final @org.jspecify.annotations.Nullable Charset charset)
+            throws IOException {
+        final Charset targetCharset = charset != null ? charset : StandardCharsets.UTF_8;
+        Files.write(path, content.getBytes(targetCharset));
+    }
+
+    /**
+     * Wrapper for {@code Stream#toList()} (Java 16+).
+     * Collects stream elements into a {@link List}; unlike the JDK method, immutability is not guaranteed.
+     *
+     * @param stream the stream to collect
+     * @param <T>    the element type
+     * @return a list containing the stream elements
+     */
+    public <T> List<T> toList(final Stream<T> stream) {
+        return stream.collect(Collectors.toList());
+    }
+
+    /**
+     * Wrapper for {@code List#of(Object...)} (Java 9+).
+     * Returns {@link Arrays#asList} of the elements: a fixed-size, array-backed view, not an immutable copy.
+     *
+     * @param elements the elements to include in the list
+     * @param <T>      the element type
+     * @return a list containing the specified elements
+     */
+    // @SafeVarargs suppresses the warning at the listOf declaration; @SuppressWarnings
+    // is additionally needed because javac still flags the forwarded Arrays.asList(...)
+    // call as a possible-heap-pollution site even though Arrays.asList is itself
+    // @SafeVarargs in the JDK.
+    @SafeVarargs
+    @SuppressWarnings({"unchecked", "varargs"})
+    public final <T> List<T> listOf(final T... elements) {
+        return Arrays.asList(elements);
+    }
+
+    // Intentionally NOT wrapped:
+    // - Optional.isEmpty() (Java 11+) — use !opt.isPresent() inline instead. NullAway's
+    //   CheckOptionalEmptiness recognises Optional.isPresent() / isEmpty() directly as
+    //   null-narrowing for a subsequent .get(); a helper method call breaks that flow
+    //   analysis. The two extra characters of !opt.isPresent() are worth the safety.
+    // - Optional.orElseThrow() no-arg (Java 10+) — use orElseThrow(() -> new ...) with
+    //   an explicit exception type and message at each call site. A generic wrapper
+    //   would lose the per-site context that makes the failure debuggable.
+
+    /**
+     * Wrapper for {@code ByteArrayOutputStream#toString(Charset)} (Java 10+).
+     * Decodes a copy of the accumulated bytes with the given charset; the buffer is not reset.
+     *
+     * @param baos    the buffer; must not be {@code null}
+     * @param charset the charset to decode with; must not be {@code null}
+     * @return the decoded string
+     */
+    public String toString(final ByteArrayOutputStream baos, final Charset charset) {
+        return new String(baos.toByteArray(), charset);
+    }
+}
diff --git a/src/main/java/net/ladenthin/llama/JsonParameters.java b/src/main/java/net/ladenthin/llama/JsonParameters.java
index 98bc2ebb..a2cf18e4 100644
--- a/src/main/java/net/ladenthin/llama/JsonParameters.java
+++ b/src/main/java/net/ladenthin/llama/JsonParameters.java
@@ -9,6 +9,7 @@
 import java.util.Map;
 import net.ladenthin.llama.args.CliArg;
 import net.ladenthin.llama.json.ParameterJsonSerializer;
+import org.checkerframework.checker.nullness.qual.PolyNull;
 
 /**
  * The Java library re-uses most of the llama.cpp server code, which mostly works with JSONs. Thus, the complexity and
@@ -42,7 +43,11 @@ public String toString() {
         return builder.toString();
     }
 
-    String toJsonString(String text) {
+    // @PolyNull lets the Checker Framework see that null in returns null and non-null
+    // in returns non-null. NullAway has no equivalent qualifier and reads the return as
+    // @NonNull (under @NullMarked), so we suppress the NullAway-only complaint here.
+    @SuppressWarnings("NullAway")
+    @PolyNull String toJsonString(@PolyNull String text) {
         if (text == null) return null;
         return serializer.toJsonString(text);
     }
@@ -59,7 +64,11 @@ String toJsonString(String text) {
      * @param <T>   the concrete subtype of this builder
      * @return this builder
      */
-    @SuppressWarnings("unchecked")
+    // Self-typing builder idiom: the caller fixes T to its own concrete subtype
+    // so that chained calls return the concrete builder instead of JsonParameters.
+    // This deliberately uses T only in the return type and is not the
+    // "TypeParameterUnusedInFormals" anti-pattern Error Prone warns about.
+    @SuppressWarnings({"unchecked", "TypeParameterUnusedInFormals"})
     protected final <T extends JsonParameters> T putScalar(String key, Object value) {
         parameters.put(key, String.valueOf(value));
         return (T) this;
@@ -74,7 +83,8 @@ protected final <T extends JsonParameters> T putScalar(String key, Object value)
      * @param <T>   the concrete subtype of this builder
      * @return this builder
      */
-    @SuppressWarnings("unchecked")
+    // Self-typing builder idiom — see putScalar above.
+    @SuppressWarnings({"unchecked", "TypeParameterUnusedInFormals"})
     protected final <T extends JsonParameters> T putEnum(String key, CliArg value) {
         parameters.put(key, value.getArgValue());
         return (T) this;
diff --git a/src/main/java/net/ladenthin/llama/LlamaException.java b/src/main/java/net/ladenthin/llama/LlamaException.java
index de2e0c05..ebc3c864 100644
--- a/src/main/java/net/ladenthin/llama/LlamaException.java
+++ b/src/main/java/net/ladenthin/llama/LlamaException.java
@@ -5,12 +5,33 @@
 
 package net.ladenthin.llama;
 
-class LlamaException extends RuntimeException {
+/**
+ * Base unchecked exception raised by the JNI layer when a llama.cpp operation
+ * fails. Specific failure modes may extend this class with typed subclasses
+ * (e.g. {@link ModelUnavailableException}).
+ *
+ * <p>This was historically package-private; it was promoted to {@code public}
+ * to allow external callers to {@code catch} the typed subclasses by their
+ * common base. Existing callers that caught {@link RuntimeException} continue
+ * to work unchanged.</p>
+ */
+public class LlamaException extends RuntimeException {
 
+    /**
+     * Creates a new {@link LlamaException} with the given message.
+     *
+     * @param message the detail message; may be {@code null}
+     */
     public LlamaException(String message) {
         super(message);
     }
 
+    /**
+     * Creates a new {@link LlamaException} with the given message and cause.
+     *
+     * @param message the detail message; may be {@code null}
+     * @param cause   the underlying cause; may be {@code null}
+     */
     public LlamaException(String message, Throwable cause) {
         super(message, cause);
     }
diff --git a/src/main/java/net/ladenthin/llama/LlamaIterable.java b/src/main/java/net/ladenthin/llama/LlamaIterable.java
index 88d61769..2e4f1d36 100644
--- a/src/main/java/net/ladenthin/llama/LlamaIterable.java
+++ b/src/main/java/net/ladenthin/llama/LlamaIterable.java
@@ -5,8 +5,6 @@
 
 package net.ladenthin.llama;
 
-import org.jetbrains.annotations.NotNull;
-
 /**
  * An {@link Iterable} wrapper around {@link LlamaIterator} returned by
  * {@link LlamaModel#generate(InferenceParameters)} and {@link LlamaModel#generateChat(InferenceParameters)}.
@@ -34,7 +32,6 @@ public final class LlamaIterable implements Iterable<LlamaOutput>, AutoCloseable
         this.iterator = iterator;
     }
 
-    @NotNull
     @Override
     public LlamaIterator iterator() {
         return iterator;
diff --git a/src/main/java/net/ladenthin/llama/LlamaLoader.java b/src/main/java/net/ladenthin/llama/LlamaLoader.java
index 6df905be..06b29ee8 100644
--- a/src/main/java/net/ladenthin/llama/LlamaLoader.java
+++ b/src/main/java/net/ladenthin/llama/LlamaLoader.java
@@ -13,10 +13,10 @@
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.nio.file.StandardCopyOption;
-import java.util.LinkedList;
+import java.util.ArrayList;
 import java.util.List;
 import java.util.stream.Stream;
-import org.jetbrains.annotations.Nullable;
+import org.jspecify.annotations.Nullable;
 
 /**
  * Set the system properties {@code net.ladenthin.llama.lib.path} /
@@ -86,7 +86,7 @@ private static void cleanPath(Path path) {
     }
 
     private static void loadNativeLibrary(String name) {
-        List<String> triedPaths = new LinkedList<>();
+        List<String> triedPaths = new ArrayList<>();
 
         String nativeLibName = System.mapLibraryName(name);
         String nativeLibPath = systemProperties.getLibPath();
@@ -112,7 +112,11 @@ private static void loadNativeLibrary(String name) {
 
         // Try to load the library from java.library.path
         String javaLibraryPath = System.getProperty("java.library.path", "");
-        for (String ldPath : javaLibraryPath.split(File.pathSeparator)) {
+        // String.split's "trailing empties dropped" quirk is benign here because
+        // we explicitly skip empty entries with the isEmpty() check below.
+        @SuppressWarnings("StringSplitter")
+        final String[] ldPaths = javaLibraryPath.split(File.pathSeparator);
+        for (String ldPath : ldPaths) {
             if (ldPath.isEmpty()) {
                 continue;
             }
@@ -164,8 +168,7 @@ public static boolean loadNativeLibrary(Path path) {
         }
     }
 
-    @Nullable
-    private static Path extractFile(String sourceDirectory, String fileName, String targetDirectory) {
+    private static @Nullable Path extractFile(String sourceDirectory, String fileName, String targetDirectory) {
         String nativeLibraryFilePath = sourceDirectory + "/" + fileName;
 
         Path extractedFilePath = Paths.get(targetDirectory, fileName);
@@ -188,6 +191,10 @@ private static Path extractFile(String sourceDirectory, String fileName, String
             // Check whether the contents are properly copied from the resource folder
             try (InputStream nativeIn = LlamaLoader.class.getResourceAsStream(nativeLibraryFilePath);
                     InputStream extractedLibIn = Files.newInputStream(extractedFilePath)) {
+                if (nativeIn == null) {
+                    System.err.println(String.format("Native library resource missing at %s", nativeLibraryFilePath));
+                    return null;
+                }
                 if (!contentsEquals(nativeIn, extractedLibIn)) {
                     System.err.println(String.format("Failed to write a native library file at %s", extractedFilePath));
                     return null;
@@ -245,7 +252,12 @@ static File getTempDir() {
     }
 
     static String getNativeResourcePath() {
-        String packagePath = LlamaLoader.class.getPackage().getName().replace('.', '/');
+        final Package pkg = LlamaLoader.class.getPackage();
+        // LlamaLoader is in a named package, so Class.getPackage() is never null here.
+        if (pkg == null) {
+            throw new IllegalStateException("LlamaLoader.class.getPackage() returned null");
+        }
+        String packagePath = pkg.getName().replace('.', '/');
         return String.format("/%s/%s", packagePath, OSInfo.getNativeLibFolderPathForCurrentOS());
     }
 
diff --git a/src/main/java/net/ladenthin/llama/LlamaModel.java b/src/main/java/net/ladenthin/llama/LlamaModel.java
index 5ec3f077..d5e21071 100644
--- a/src/main/java/net/ladenthin/llama/LlamaModel.java
+++ b/src/main/java/net/ladenthin/llama/LlamaModel.java
@@ -10,12 +10,15 @@
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Objects;
+import java.util.Optional;
 import java.util.concurrent.CompletableFuture;
 import java.util.function.BiConsumer;
 import net.ladenthin.llama.args.LogFormat;
 import net.ladenthin.llama.json.ChatResponseParser;
 import net.ladenthin.llama.json.CompletionResponseParser;
 import net.ladenthin.llama.json.RerankResponseParser;
+import org.jspecify.annotations.Nullable;
 
 /**
  * This class is a wrapper around the llama.cpp functionality.
@@ -52,10 +55,21 @@ public class LlamaModel implements AutoCloseable {
      * </ul>
      *
      * @param parameters the set of options
-     * @throws LlamaException if no model could be loaded from the given file path
-     */
+     * @throws ModelUnavailableException if {@link ModelParameters#setSkipDownload(boolean)
+     *                                   setSkipDownload(true)} (or
+     *                                   {@link net.ladenthin.llama.args.ModelFlag#SKIP_DOWNLOAD})
+     *                                   is set and the configured model file is missing or invalid
+     * @throws LlamaException            for any other load failure
+     */
+    // loadModel is a native method; it does not call back into Java with this,
+    // so the @UnderInitialization receiver warning is a CF false positive.
+    @SuppressWarnings("method.invocation")
     public LlamaModel(ModelParameters parameters) {
-        loadModel(parameters.toArray());
+        try {
+            loadModel(parameters.toArray());
+        } catch (LlamaException e) {
+            throw SkipDownloadFailureTranslator.translate(parameters, e);
+        }
     }
 
     /**
@@ -68,11 +82,19 @@ public LlamaModel(ModelParameters parameters) {
      * @param progress   load progress sink; {@code null} disables the callback
      * @throws LlamaException if loading fails or the callback aborts
      */
+    // loadModel / loadModelWithProgress are native methods; they do not call back
+    // into Java with this, so the @UnderInitialization receiver warning is a CF
+    // false positive.
+    @SuppressWarnings("method.invocation")
     public LlamaModel(ModelParameters parameters, LoadProgressCallback progress) {
-        if (progress == null) {
-            loadModel(parameters.toArray());
-        } else {
-            loadModelWithProgress(parameters.toArray(), progress);
+        try {
+            if (progress == null) {
+                loadModel(parameters.toArray());
+            } else {
+                loadModelWithProgress(parameters.toArray(), progress);
+            }
+        } catch (LlamaException e) {
+            throw SkipDownloadFailureTranslator.translate(parameters, e);
         }
     }
 
@@ -232,7 +254,11 @@ public CompletableFuture<String> completeAsync(InferenceParameters parameters) {
      */
     public CompletableFuture<String> completeAsync(InferenceParameters parameters, CancellationToken token) {
         CompletableFuture<String> future = CompletableFuture.supplyAsync(() -> complete(parameters, token));
-        future.whenComplete((result, ex) -> {
+        // whenComplete returns a new stage that we deliberately discard: this is a
+        // fire-and-forget cancellation callback attached to `future`, which is what
+        // the caller observes.
+        @SuppressWarnings("FutureReturnValueIgnored")
+        final CompletableFuture<String> cancelHook = future.whenComplete((result, ex) -> {
             if (ex instanceof java.util.concurrent.CancellationException) {
                 token.cancel();
             }
@@ -380,7 +406,7 @@ public void close() {
      * deleted, since the attack vector disappears together with finalization.
      * </p>
      */
-    @SuppressWarnings({"deprecation", "removal"})
+    @SuppressWarnings({"deprecation", "removal", "Finalize"})
     @Override
     protected final void finalize() {
         // no-op
@@ -520,14 +546,14 @@ public String chatCompleteText(InferenceParameters parameters) {
      */
     public ChatResponse chat(ChatRequest request) {
         InferenceParameters params = new InferenceParameters("").setMessagesJson(request.buildMessagesJson());
-        String toolsJson = request.buildToolsJson();
-        if (toolsJson != null) {
+        request.buildToolsJson().ifPresent(toolsJson -> {
             params.setToolsJson(toolsJson);
-            if (request.getToolChoice() != null) {
-                params.setToolChoice(request.getToolChoice());
+            final String toolChoice = request.getToolChoice();
+            if (toolChoice != null) {
+                params.setToolChoice(toolChoice);
             }
             params.setUseChatTemplate(true);
-        }
+        });
         request.applyCustomizer(params);
         String raw = chatComplete(params);
         return chatParser.parseResponse(raw);
@@ -551,13 +577,21 @@ public ChatResponse chat(ChatRequest request) {
      *         (or the last response when the round cap is hit)
      */
     public ChatResponse chatWithTools(ChatRequest request, java.util.Map<String, ToolHandler> handlers) {
-        ChatResponse last = null;
-        for (int round = 0; round < request.getMaxToolRounds(); round++) {
-            last = chat(request);
-            ChatMessage assistant = last.getFirstMessage();
-            if (assistant == null || assistant.getToolCalls().isEmpty()) {
+        final int maxRounds = request.getMaxToolRounds();
+        if (maxRounds < 1) {
+            throw new IllegalArgumentException(
+                    "ChatRequest.maxToolRounds must be >= 1 (got " + maxRounds + "); "
+                            + "chatWithTools always issues at least one chat call.");
+        }
+        ChatResponse last = chat(request);
+        for (int round = 1; round < maxRounds; round++) {
+            Optional<ChatMessage> assistantOpt = last.getFirstMessage();
+            // NOTE: inline !isPresent() here (not compatibilityHelper.isEmpty) so NullAway's
+            //       CheckOptionalEmptiness recognises this as null-narrowing for the .get() below.
+            if (!assistantOpt.isPresent() || assistantOpt.get().getToolCalls().isEmpty()) {
                 return last;
             }
+            ChatMessage assistant = assistantOpt.get();
             request.addMessage(assistant);
             for (ToolCall call : assistant.getToolCalls()) {
                 ToolHandler handler = handlers.get(call.getName());
@@ -576,6 +610,7 @@ public ChatResponse chatWithTools(ChatRequest request, java.util.Map<String, Too
                 }
                 request.addMessage(ChatMessage.toolResult(call.getId(), result));
             }
+            last = chat(request);
         }
         return last;
     }
@@ -807,7 +842,7 @@ public String restoreSlot(int slotId, String filepath) {
      */
     public native boolean configureParallelInference(String configJson) throws LlamaException;
 
-    native String handleSlotAction(int action, int slotId, String filename) throws LlamaException;
+    native String handleSlotAction(int action, int slotId, @Nullable String filename) throws LlamaException;
 
     native String handleChatCompletions(String params) throws LlamaException;
 
diff --git a/src/main/java/net/ladenthin/llama/LlamaOutput.java b/src/main/java/net/ladenthin/llama/LlamaOutput.java
index 25e16ded..b6294da9 100644
--- a/src/main/java/net/ladenthin/llama/LlamaOutput.java
+++ b/src/main/java/net/ladenthin/llama/LlamaOutput.java
@@ -8,7 +8,6 @@
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;
-import org.jetbrains.annotations.NotNull;
 
 /**
  * An output of the LLM providing access to the generated text and the associated probabilities. You have to configure
@@ -20,7 +19,6 @@ public final class LlamaOutput {
      * The last bit of generated text that is representable as text (i.e., cannot be individual utf-8 multibyte code
      * points).
      */
-    @NotNull
     public final String text;
 
     /**
@@ -30,7 +28,6 @@ public final class LlamaOutput {
      * <p>
      * Note, that you have to configure {@link InferenceParameters#setNProbs(int)} in order for probabilities to be returned.
      */
-    @NotNull
     public final Map<String, Float> probabilities;
 
     /**
@@ -38,7 +35,6 @@ public final class LlamaOutput {
      * Empty when {@link InferenceParameters#setNProbs(int)} is not configured or the native
      * response did not include {@code completion_probabilities}.
      */
-    @NotNull
     public final List<TokenLogprob> logprobs;
 
     /** Whether this is the final token of the generation. */
@@ -48,7 +44,6 @@ public final class LlamaOutput {
      * The reason generation stopped. {@link StopReason#NONE} on intermediate streaming tokens.
      * Only meaningful when {@link #stop} is {@code true}.
      */
-    @NotNull
     public final StopReason stopReason;
 
     /**
@@ -60,10 +55,10 @@ public final class LlamaOutput {
      * @param stopReason    the stop reason ({@link StopReason#NONE} on intermediate tokens)
      */
     public LlamaOutput(
-            @NotNull String text,
-            @NotNull Map<String, Float> probabilities,
+            String text,
+            Map<String, Float> probabilities,
             boolean stop,
-            @NotNull StopReason stopReason) {
+            StopReason stopReason) {
         this(text, probabilities, Collections.<TokenLogprob>emptyList(), stop, stopReason);
     }
 
@@ -77,11 +72,11 @@ public LlamaOutput(
      * @param stopReason    the stop reason ({@link StopReason#NONE} on intermediate tokens)
      */
     public LlamaOutput(
-            @NotNull String text,
-            @NotNull Map<String, Float> probabilities,
-            @NotNull List<TokenLogprob> logprobs,
+            String text,
+            Map<String, Float> probabilities,
+            List<TokenLogprob> logprobs,
             boolean stop,
-            @NotNull StopReason stopReason) {
+            StopReason stopReason) {
         this.text = text;
         this.probabilities = probabilities;
         this.logprobs = logprobs;
diff --git a/src/main/java/net/ladenthin/llama/LlamaSystemProperties.java b/src/main/java/net/ladenthin/llama/LlamaSystemProperties.java
index 62488318..3d30a5f0 100644
--- a/src/main/java/net/ladenthin/llama/LlamaSystemProperties.java
+++ b/src/main/java/net/ladenthin/llama/LlamaSystemProperties.java
@@ -5,6 +5,8 @@
 
 package net.ladenthin.llama;
 
+import org.jspecify.annotations.Nullable;
+
 /**
  * Resolves library-specific system properties under the {@link #PREFIX} domain prefix.
  */
@@ -16,7 +18,7 @@ public LlamaSystemProperties() {}
     /** Common system-property prefix for all library-specific overrides. */
     public static final String PREFIX = "net.ladenthin.llama";
 
-    private String getProperty(String suffix) {
+    private @Nullable String getProperty(String suffix) {
         return System.getProperty(PREFIX + suffix);
     }
 
@@ -25,7 +27,7 @@ private String getProperty(String suffix) {
      *
      * @return the configured library directory, or {@code null} if unset
      */
-    public String getLibPath() {
+    public @Nullable String getLibPath() {
         return getProperty(".lib.path");
     }
 
@@ -34,7 +36,7 @@ public String getLibPath() {
      *
      * @return the configured library file name, or {@code null} if unset
      */
-    public String getLibName() {
+    public @Nullable String getLibName() {
         return getProperty(".lib.name");
     }
 
@@ -44,7 +46,7 @@ public String getLibName() {
      *
      * @return the configured temp directory, or {@code null} if unset
      */
-    public String getTmpDir() {
+    public @Nullable String getTmpDir() {
         return getProperty(".tmpdir");
     }
 
@@ -53,7 +55,7 @@ public String getTmpDir() {
      *
      * @return the configured architecture override, or {@code null} if unset
      */
-    public String getOsinfoArchitecture() {
+    public @Nullable String getOsinfoArchitecture() {
         return getProperty(".osinfo.architecture");
     }
 
@@ -62,7 +64,7 @@ public String getOsinfoArchitecture() {
      *
      * @return the configured GPU layer count as a string, or {@code null} if unset
      */
-    public String getTestNgl() {
+    public @Nullable String getTestNgl() {
         return getProperty(".test.ngl");
     }
 }
diff --git a/src/main/java/net/ladenthin/llama/ModelParameters.java b/src/main/java/net/ladenthin/llama/ModelParameters.java
index 65fb026c..d0afb196 100644
--- a/src/main/java/net/ladenthin/llama/ModelParameters.java
+++ b/src/main/java/net/ladenthin/llama/ModelParameters.java
@@ -1418,13 +1418,49 @@ public ModelParameters clearFlag(ModelFlag flag) {
         return this;
     }
 
+    /**
+     * Returns whether the flag's CLI switch ({@code flag.getCliFlag()}) is a key of this builder's parameters.
+     *
+     * @param flag the flag to query
+     * @return {@code true} if {@link #setFlag(ModelFlag)} was called for {@code flag} and
+     *         {@link #clearFlag(ModelFlag)} has not since removed it; {@code false} otherwise
+     */
+    public boolean hasFlag(ModelFlag flag) {
+        return parameters.containsKey(flag.getCliFlag());
+    }
+
+    /**
+     * Toggles skip-download mode: model files are validated but never fetched (default: {@code false}).
+     *
+     * <p>With the flag set, the upstream loader makes no outbound network call to obtain
+     * the configured model. A missing or invalid model file (e.g. ETag mismatch) then makes
+     * {@link LlamaModel#LlamaModel(ModelParameters)} throw a typed
+     * {@link ModelUnavailableException}, letting callers tell an air-gapped miss apart
+     * from a genuine misconfiguration.</p>
+     *
+     * <p>Intended for air-gapped / pre-staged-model deployments where any outbound
+     * network call is itself a failure mode.</p>
+     *
+     * @param skip {@code true} to set {@link ModelFlag#SKIP_DOWNLOAD} and skip downloads,
+     *             {@code false} to clear the flag and allow downloads
+     * @return this builder
+     */
+    public ModelParameters setSkipDownload(boolean skip) {
+        if (!skip) {
+            clearFlag(ModelFlag.SKIP_DOWNLOAD);
+            return this;
+        }
+        setFlag(ModelFlag.SKIP_DOWNLOAD);
+        return this;
+    }
+
     /**
      * Returns whether the given parameter key has not been explicitly set.
      *
      * @param key the parameter key without the {@code --} prefix
      * @return {@code true} if the key is absent from the configured parameters
      */
-    public boolean isDefault(String key) {
+    public boolean isUnset(String key) {
         return !parameters.containsKey("--" + key);
     }
 }
diff --git a/src/main/java/net/ladenthin/llama/ModelUnavailableException.java b/src/main/java/net/ladenthin/llama/ModelUnavailableException.java
new file mode 100644
index 00000000..dfa57fad
--- /dev/null
+++ b/src/main/java/net/ladenthin/llama/ModelUnavailableException.java
@@ -0,0 +1,43 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama;
+
+import net.ladenthin.llama.args.ModelFlag;
+
+/**
+ * Thrown by {@link LlamaModel#LlamaModel(ModelParameters)} when
+ * {@link ModelFlag#SKIP_DOWNLOAD} (or {@link ModelParameters#setSkipDownload(boolean)
+ * setSkipDownload(true)}) is set and the configured model file is missing or
+ * invalid &#x2014; i.e. the loader would have had to download a replacement but is
+ * forbidden to.
+ *
+ * <p>Lets air-gapped / pre-staged-model deployments distinguish &quot;model file
+ * absent&quot; from generic configuration errors. Upstream raises
+ * {@code common_skip_download_exception} which is caught inside
+ * {@code common_params_parse_ex} and surfaces as a {@code false} return; the
+ * Java layer combines that with the {@code SKIP_DOWNLOAD} flag to recognise the
+ * skip-download case and translate it to this typed exception.</p>
+ */
+public class ModelUnavailableException extends LlamaException {
+
+    /**
+     * Creates a new {@link ModelUnavailableException} with the given message.
+     *
+     * @param message the detail message; may be {@code null}
+     */
+    public ModelUnavailableException(String message) {
+        super(message);
+    }
+
+    /**
+     * Creates a new {@link ModelUnavailableException} with the given message and cause.
+     *
+     * @param message the detail message; may be {@code null}
+     * @param cause   the underlying cause; may be {@code null}
+     */
+    public ModelUnavailableException(String message, Throwable cause) {
+        super(message, cause);
+    }
+}
diff --git a/src/main/java/net/ladenthin/llama/OSInfo.java b/src/main/java/net/ladenthin/llama/OSInfo.java
index 222574d7..cf40d5f9 100644
--- a/src/main/java/net/ladenthin/llama/OSInfo.java
+++ b/src/main/java/net/ladenthin/llama/OSInfo.java
@@ -227,7 +227,7 @@ private static boolean isRunningAndroid() {
      * @return {@code true} if the JVM identifies itself as Android
      */
     public static boolean isAndroidRuntime() {
-        return System.getProperty("java.runtime.name", "").toLowerCase().contains("android");
+        return System.getProperty("java.runtime.name", "").toLowerCase(Locale.ROOT).contains("android");
     }
 
     /**
@@ -237,7 +237,7 @@ public static boolean isAndroidRuntime() {
      */
     public static boolean isAndroidTermux() {
         try {
-            return processRunner.runAndWaitFor("uname -o").toLowerCase().contains("android");
+            return processRunner.runAndWaitFor("uname -o").toLowerCase(Locale.ROOT).contains("android");
         } catch (InterruptedException e) {
             Thread.currentThread().interrupt();
             return false;
@@ -257,7 +257,7 @@ public static boolean isAndroidTermux() {
     public static boolean isMusl() {
         Path mapFilesDir = Paths.get("/proc/self/map_files");
         try (Stream<Path> dirStream = Files.list(mapFilesDir)) {
-            return dirStream.map(OSInfo::toRealPathOrEmpty).anyMatch(s -> s.toLowerCase()
+            return dirStream.map(OSInfo::toRealPathOrEmpty).anyMatch(s -> s.toLowerCase(Locale.ROOT)
                     .contains("musl"));
         } catch (Exception ignored) {
             // fall back to checking for alpine linux in the event we're using an older kernel which
@@ -282,6 +282,9 @@ private static boolean isAlpineLinux() {
         try (Stream<String> osLines = Files.lines(Paths.get("/etc/os-release"))) {
             return osLines.anyMatch(l -> l.startsWith("ID") && l.contains("alpine"));
         } catch (Exception ignored2) {
+            // Treat any I/O / parse failure as "not Alpine" — the file is absent on
+            // non-Linux hosts and unreadable in sandboxed Linux runtimes; either
+            // way the answer is the same and there is nothing meaningful to log.
         }
         return false;
     }
diff --git a/src/main/java/net/ladenthin/llama/Pair.java b/src/main/java/net/ladenthin/llama/Pair.java
index 6dbe96e3..ceff22f0 100644
--- a/src/main/java/net/ladenthin/llama/Pair.java
+++ b/src/main/java/net/ladenthin/llama/Pair.java
@@ -6,6 +6,7 @@
 package net.ladenthin.llama;
 
 import java.util.Objects;
+import org.jspecify.annotations.Nullable;
 
 /**
  * A generic immutable key-value pair.
@@ -53,11 +54,10 @@ public int hashCode() {
     }
 
     @Override
-    public boolean equals(Object obj) {
+    public boolean equals(@Nullable Object obj) {
         if (this == obj) return true;
-        if (obj == null) return false;
-        if (getClass() != obj.getClass()) return false;
-        Pair other = (Pair) obj;
+        if (!(obj instanceof Pair)) return false;
+        Pair<?, ?> other = (Pair<?, ?>) obj;
         return Objects.equals(key, other.key) && Objects.equals(value, other.value);
     }
 
diff --git a/src/main/java/net/ladenthin/llama/ProcessRunner.java b/src/main/java/net/ladenthin/llama/ProcessRunner.java
index 61a93f68..0a54c10d 100644
--- a/src/main/java/net/ladenthin/llama/ProcessRunner.java
+++ b/src/main/java/net/ladenthin/llama/ProcessRunner.java
@@ -12,6 +12,9 @@
 import java.util.concurrent.TimeUnit;
 
 class ProcessRunner {
+
+    private final Java8CompatibilityHelper compatibilityHelper = new Java8CompatibilityHelper();
+
     String runAndWaitFor(String command) throws IOException, InterruptedException {
         Process p = Runtime.getRuntime().exec(splitArgs(command));
         p.waitFor();
@@ -37,7 +40,7 @@ private static String[] splitArgs(String command) {
         return command.split(" ");
     }
 
-    private static String getProcessOutput(Process process) throws IOException {
+    private String getProcessOutput(Process process) throws IOException {
         try (InputStream in = process.getInputStream()) {
             int readLen;
             ByteArrayOutputStream b = new ByteArrayOutputStream();
@@ -45,7 +48,7 @@ private static String getProcessOutput(Process process) throws IOException {
             while ((readLen = in.read(buf, 0, buf.length)) >= 0) {
                 b.write(buf, 0, readLen);
             }
-            return b.toString(StandardCharsets.UTF_8);
+            return compatibilityHelper.toString(b, StandardCharsets.UTF_8);
         }
     }
 }
diff --git a/src/main/java/net/ladenthin/llama/Session.java b/src/main/java/net/ladenthin/llama/Session.java
index 41a96200..8d0188ea 100644
--- a/src/main/java/net/ladenthin/llama/Session.java
+++ b/src/main/java/net/ladenthin/llama/Session.java
@@ -8,6 +8,7 @@
 import java.util.Collections;
 import java.util.List;
 import java.util.function.Consumer;
+import org.jspecify.annotations.Nullable;
 
 /**
  * Thin multi-turn conversation wrapper over a {@link LlamaModel} slot. Maintains an
@@ -31,9 +32,9 @@ public final class Session implements AutoCloseable {
 
     private final LlamaModel model;
     private final int slotId;
-    private final String systemMessage;
+    private final @Nullable String systemMessage;
     private final List<Pair<String, String>> turns = new ArrayList<Pair<String, String>>();
-    private final Consumer<InferenceParameters> paramsCustomizer;
+    private final @Nullable Consumer<InferenceParameters> paramsCustomizer;
     private final Object lock = new Object();
     private boolean streamingActive;
 
@@ -45,7 +46,7 @@ public final class Session implements AutoCloseable {
      * @param slotId the slot id used by {@link #save(String)} / {@link #restore(String)}
      * @param systemMessage optional system prompt (may be {@code null} or empty)
      */
-    public Session(LlamaModel model, int slotId, String systemMessage) {
+    public Session(LlamaModel model, int slotId, @Nullable String systemMessage) {
         this(model, slotId, systemMessage, null);
     }
 
@@ -58,7 +59,11 @@ public Session(LlamaModel model, int slotId, String systemMessage) {
      * @param systemMessage optional system prompt
      * @param paramsCustomizer applied to each request's parameters; may be {@code null}
      */
-    public Session(LlamaModel model, int slotId, String systemMessage, Consumer<InferenceParameters> paramsCustomizer) {
+    public Session(
+            LlamaModel model,
+            int slotId,
+            @Nullable String systemMessage,
+            @Nullable Consumer<InferenceParameters> paramsCustomizer) {
         this.model = model;
         this.slotId = slotId;
         this.systemMessage = systemMessage;
diff --git a/src/main/java/net/ladenthin/llama/SkipDownloadFailureTranslator.java b/src/main/java/net/ladenthin/llama/SkipDownloadFailureTranslator.java
new file mode 100644
index 00000000..3c6ec985
--- /dev/null
+++ b/src/main/java/net/ladenthin/llama/SkipDownloadFailureTranslator.java
@@ -0,0 +1,69 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama;
+
+import net.ladenthin.llama.args.ModelFlag;
+
+/**
+ * Pure-Java translator from the generic {@link LlamaException} raised by the JNI
+ * loader to the typed {@link ModelUnavailableException} when
+ * {@link ModelFlag#SKIP_DOWNLOAD} is set and the load failed because the
+ * configured model file was missing or invalid.
+ *
+ * <p>Lives outside {@link LlamaModel} so that unit tests can exercise the
+ * translation heuristic without triggering {@code LlamaModel}'s
+ * {@link LlamaLoader} static initializer (which loads the JNI library and is
+ * not available in CPU-only / non-native test environments).</p>
+ *
+ * <h2>Why a heuristic and not a direct exception catch</h2>
+ *
+ * <p>Upstream raises {@code common_skip_download_exception} inside
+ * {@code common_download_file_single} when {@code --skip-download} is set and
+ * the file is missing or has a stale ETag. However that exception is caught
+ * INSIDE upstream's own {@code common_params_parse_ex} (at
+ * {@code common/arg.cpp:476}) and surfaces only as a {@code false} return
+ * from {@code common_params_parse}. The JNI layer reports the {@code false}
+ * return as a generic {@link LlamaException} with the message
+ * {@value #LOAD_PARSE_FAILED_MESSAGE}. The Java layer therefore cannot catch
+ * the C++ exception directly and instead recognises the combined signal:
+ * {@code SKIP_DOWNLOAD} flag set + JNI message matches.</p>
+ */
+final class SkipDownloadFailureTranslator {
+
+    /**
+     * Substring used by the JNI bridge when {@code common_params_parse} returns
+     * {@code false}; matched at the Java layer to recognise the
+     * {@code SKIP_DOWNLOAD} case.
+     */
+    static final String LOAD_PARSE_FAILED_MESSAGE = "Failed to parse model parameters";
+
+    private SkipDownloadFailureTranslator() {
+        // utility — not instantiable
+    }
+
+    /**
+     * Translates a generic load failure into a typed
+     * {@link ModelUnavailableException} when the user opted into
+     * {@link ModelFlag#SKIP_DOWNLOAD} and the JNI surfaced the
+     * {@value #LOAD_PARSE_FAILED_MESSAGE} message; otherwise returns the
+     * original exception unchanged so the caller can re-throw it as-is.
+     *
+     * @param parameters the parameters passed to the failing constructor
+     * @param original   the original load failure to translate or pass through
+     * @return a {@link ModelUnavailableException} when the heuristic matches;
+     *         otherwise the original {@code LlamaException}
+     */
+    static LlamaException translate(ModelParameters parameters, LlamaException original) {
+        if (parameters.hasFlag(ModelFlag.SKIP_DOWNLOAD)
+                && original.getMessage() != null
+                && original.getMessage().contains(LOAD_PARSE_FAILED_MESSAGE)) {
+            return new ModelUnavailableException(
+                    "Model unavailable: --skip-download is set but the configured model file is missing or "
+                            + "invalid (no download attempted).",
+                    original);
+        }
+        return original;
+    }
+}
diff --git a/src/main/java/net/ladenthin/llama/StopReason.java b/src/main/java/net/ladenthin/llama/StopReason.java
index 32f51be6..c31c2809 100644
--- a/src/main/java/net/ladenthin/llama/StopReason.java
+++ b/src/main/java/net/ladenthin/llama/StopReason.java
@@ -5,6 +5,8 @@
 
 package net.ladenthin.llama;
 
+import org.jspecify.annotations.Nullable;
+
 /**
  * The reason why token generation stopped for a {@link LlamaOutput}.
  *
@@ -31,9 +33,9 @@ public enum StopReason {
     /** Token budget exhausted. Server {@code "stop_type"} value: {@code "limit"}. */
     MAX_TOKENS("limit");
 
-    private final String stopType;
+    private final @Nullable String stopType;
 
-    StopReason(String stopType) {
+    StopReason(@Nullable String stopType) {
         this.stopType = stopType;
     }
 
@@ -43,7 +45,7 @@ public enum StopReason {
      *
      * @return the stop-type string, or {@code null} for {@link #NONE}
      */
-    public String getStopType() {
+    public @Nullable String getStopType() {
         return stopType;
     }
 
@@ -55,7 +57,7 @@ public String getStopType() {
      * @param stopType the raw stop-type string, or {@code null} / empty for absent field
      * @return the corresponding {@link StopReason}, or {@link #NONE} if unrecognised
      */
-    public static StopReason fromStopType(String stopType) {
+    public static StopReason fromStopType(@Nullable String stopType) {
         if (stopType == null) return NONE;
         switch (stopType) {
             case "eos":
diff --git a/src/main/java/net/ladenthin/llama/Timings.java b/src/main/java/net/ladenthin/llama/Timings.java
index 3fe8048d..0910a9fe 100644
--- a/src/main/java/net/ladenthin/llama/Timings.java
+++ b/src/main/java/net/ladenthin/llama/Timings.java
@@ -5,6 +5,7 @@
 package net.ladenthin.llama;
 
 import com.fasterxml.jackson.databind.JsonNode;
+import org.jspecify.annotations.Nullable;
 
 /**
  * Per-completion timing data parsed from a llama.cpp result {@code timings} block.
@@ -70,7 +71,7 @@ public Timings(
      * @param node the {@code timings} object node; may be a missing-node
      * @return a populated {@link Timings} (all-zero when {@code node} is missing/null)
      */
-    public static Timings fromJson(JsonNode node) {
+    public static Timings fromJson(@Nullable JsonNode node) {
         if (node == null || node.isMissingNode() || node.isNull()) {
             return new Timings(0, 0, 0.0, 0.0, 0, 0.0, 0.0, 0, 0);
         }
diff --git a/src/main/java/net/ladenthin/llama/Usage.java b/src/main/java/net/ladenthin/llama/Usage.java
index 771a4d32..9708a5e3 100644
--- a/src/main/java/net/ladenthin/llama/Usage.java
+++ b/src/main/java/net/ladenthin/llama/Usage.java
@@ -4,6 +4,8 @@
 
 package net.ladenthin.llama;
 
+import org.jspecify.annotations.Nullable;
+
 /**
  * Token-usage counters, modeled after the OpenAI / Llama Stack {@code usage} block.
  * <p>
@@ -52,7 +54,7 @@ public long getTotalTokens() {
     }
 
     @Override
-    public boolean equals(Object o) {
+    public boolean equals(@Nullable Object o) {
         if (this == o) return true;
         if (!(o instanceof Usage)) return false;
         Usage u = (Usage) o;
diff --git a/src/main/java/net/ladenthin/llama/args/ModelFlag.java b/src/main/java/net/ladenthin/llama/args/ModelFlag.java
index 7850e867..af5807d5 100644
--- a/src/main/java/net/ladenthin/llama/args/ModelFlag.java
+++ b/src/main/java/net/ladenthin/llama/args/ModelFlag.java
@@ -107,7 +107,19 @@ public enum ModelFlag {
     MMPROJ_AUTO("--mmproj-auto"),
 
     /** Offload the mmproj vision projection model to the GPU. */
-    MMPROJ_OFFLOAD("--mmproj-offload");
+    MMPROJ_OFFLOAD("--mmproj-offload"),
+
+    /**
+     * Skip any model file download — only validation is performed. Useful for air-gapped or
+     * pre-staged-model deployments where any outbound network call is a failure mode.
+     *
+     * <p>When this flag is set and the configured model file is missing or invalid (e.g. ETag
+     * mismatch), upstream throws {@code common_skip_download_exception} during arg parsing,
+     * which is caught inside {@code common_params_parse_ex} and surfaces as a {@code false}
+     * return; the Java layer translates that combined signal into a typed
+     * {@link net.ladenthin.llama.ModelUnavailableException}.</p>
+     */
+    SKIP_DOWNLOAD("--skip-download");
 
     private final String cliFlag;
 
diff --git a/src/main/java/net/ladenthin/llama/args/PoolingType.java b/src/main/java/net/ladenthin/llama/args/PoolingType.java
index 0182c48a..ce948029 100644
--- a/src/main/java/net/ladenthin/llama/args/PoolingType.java
+++ b/src/main/java/net/ladenthin/llama/args/PoolingType.java
@@ -91,6 +91,7 @@ public enum PoolingType implements CliArg {
      *
      * @return the pooling type string (e.g. {@code "mean"}, {@code "cls"})
      */
+    @Override
     public String getArgValue() {
         return argValue;
     }
diff --git a/src/main/java/net/ladenthin/llama/args/RopeScalingType.java b/src/main/java/net/ladenthin/llama/args/RopeScalingType.java
index 8d73657d..09e88383 100644
--- a/src/main/java/net/ladenthin/llama/args/RopeScalingType.java
+++ b/src/main/java/net/ladenthin/llama/args/RopeScalingType.java
@@ -29,6 +29,7 @@ public enum RopeScalingType implements CliArg {
         this.argValue = value;
     }
 
+    @Override
     public String getArgValue() {
         return argValue;
     }
diff --git a/src/main/java/net/ladenthin/llama/args/package-info.java b/src/main/java/net/ladenthin/llama/args/package-info.java
new file mode 100644
index 00000000..18542d5e
--- /dev/null
+++ b/src/main/java/net/ladenthin/llama/args/package-info.java
@@ -0,0 +1,11 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+/**
+ * Typed enums for CLI-arg-valued options consumed by {@link net.ladenthin.llama.CliParameters}.
+ *
+ * <p>JSpecify {@code @NullMarked} is declared at module level in
+ * {@code module-info.java} and applies to this package transitively.
+ */
+package net.ladenthin.llama.args;
diff --git a/src/main/java/net/ladenthin/llama/json/ChatResponseParser.java b/src/main/java/net/ladenthin/llama/json/ChatResponseParser.java
index 93e7cd55..6cb71e24 100644
--- a/src/main/java/net/ladenthin/llama/json/ChatResponseParser.java
+++ b/src/main/java/net/ladenthin/llama/json/ChatResponseParser.java
@@ -162,7 +162,9 @@ public ChatResponse parseResponse(String json) {
     }
 
     private List<ChatChoice> parseChoices(JsonNode arr) {
-        if (!arr.isArray() || arr.size() == 0) return Collections.emptyList();
+        // Mutable ArrayList on both branches keeps the return-type contract consistent
+        // (Error Prone MixedMutabilityReturnType).
+        if (!arr.isArray() || arr.size() == 0) return new ArrayList<>();
         List<ChatChoice> out = new ArrayList<ChatChoice>(arr.size());
         for (JsonNode c : arr) {
             int index = c.path("index").asInt(0);
@@ -180,7 +182,7 @@ private List<ChatChoice> parseChoices(JsonNode arr) {
     }
 
     private List<ToolCall> parseToolCalls(JsonNode arr) {
-        if (!arr.isArray() || arr.size() == 0) return Collections.emptyList();
+        if (!arr.isArray() || arr.size() == 0) return new ArrayList<>();
         List<ToolCall> out = new ArrayList<ToolCall>(arr.size());
         for (JsonNode tc : arr) {
             String id = tc.path("id").asText("");
diff --git a/src/main/java/net/ladenthin/llama/json/CompletionResponseParser.java b/src/main/java/net/ladenthin/llama/json/CompletionResponseParser.java
index 35ca8f5d..f195eebc 100644
--- a/src/main/java/net/ladenthin/llama/json/CompletionResponseParser.java
+++ b/src/main/java/net/ladenthin/llama/json/CompletionResponseParser.java
@@ -160,7 +160,9 @@ public Map<String, Float> parseProbabilities(JsonNode root) {
     public List<TokenLogprob> parseLogprobs(JsonNode root) {
         JsonNode array = root.path("completion_probabilities");
         if (!array.isArray() || array.size() == 0) {
-            return Collections.emptyList();
+            // Return a mutable empty ArrayList to keep the return type consistent
+            // with the non-empty branch below (Error Prone MixedMutabilityReturnType).
+            return new ArrayList<>();
         }
         List<TokenLogprob> result = new ArrayList<TokenLogprob>(array.size());
         for (JsonNode entry : array) {
diff --git a/src/main/java/net/ladenthin/llama/json/ParameterJsonSerializer.java b/src/main/java/net/ladenthin/llama/json/ParameterJsonSerializer.java
index cd6f949a..e469aa39 100644
--- a/src/main/java/net/ladenthin/llama/json/ParameterJsonSerializer.java
+++ b/src/main/java/net/ladenthin/llama/json/ParameterJsonSerializer.java
@@ -9,6 +9,7 @@
 import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.fasterxml.jackson.databind.node.ArrayNode;
+import org.jspecify.annotations.Nullable;
 import com.fasterxml.jackson.databind.node.ObjectNode;
 import java.io.IOException;
 import java.util.Collection;
@@ -79,7 +80,7 @@ public String toJsonString(String value) {
      * @return a Jackson {@link ArrayNode} of {@code {"role", "content"}} message objects
      * @throws IllegalArgumentException if any message has an invalid role
      */
-    public ArrayNode buildMessages(String systemMessage, List<Pair<String, String>> messages) {
+    public ArrayNode buildMessages(@Nullable String systemMessage, List<Pair<String, String>> messages) {
         ArrayNode arr = OBJECT_MAPPER.createArrayNode();
         if (systemMessage != null && !systemMessage.isEmpty()) {
             ObjectNode sys = OBJECT_MAPPER.createObjectNode();
@@ -118,15 +119,18 @@ public ArrayNode buildMessages(List<ChatMessage> messages) {
             msg.put("role", message.getRole());
             if (message.hasParts()) {
                 ArrayNode parts = OBJECT_MAPPER.createArrayNode();
-                for (ContentPart p : message.getParts()) {
+                for (ContentPart p : message.getParts().orElseThrow(
+                        () -> new IllegalStateException("hasParts() was true but getParts() was empty"))) {
                     ObjectNode part = OBJECT_MAPPER.createObjectNode();
                     if (p.getType() == ContentPart.Type.TEXT) {
                         part.put("type", "text");
-                        part.put("text", p.getText());
+                        final String text = p.getText();
+                        part.put("text", text != null ? text : "");
                     } else {
                         part.put("type", "image_url");
                         ObjectNode imageUrl = OBJECT_MAPPER.createObjectNode();
-                        imageUrl.put("url", p.getImageUrl());
+                        final String url = p.getImageUrl();
+                        imageUrl.put("url", url != null ? url : "");
                         part.set("image_url", imageUrl);
                     }
                     parts.add(part);
diff --git a/src/main/java/net/ladenthin/llama/json/RerankResponseParser.java b/src/main/java/net/ladenthin/llama/json/RerankResponseParser.java
index f5d63a1d..346e4c5b 100644
--- a/src/main/java/net/ladenthin/llama/json/RerankResponseParser.java
+++ b/src/main/java/net/ladenthin/llama/json/RerankResponseParser.java
@@ -61,7 +61,9 @@ public List<Pair<String, Float>> parse(String json) {
      */
     public List<Pair<String, Float>> parse(JsonNode arr) {
         if (!arr.isArray() || arr.size() == 0) {
-            return Collections.emptyList();
+            // Mutable empty list keeps the return-type contract consistent
+            // (Error Prone MixedMutabilityReturnType).
+            return new ArrayList<>();
         }
         List<Pair<String, Float>> results = new ArrayList<Pair<String, Float>>();
         for (JsonNode entry : arr) {
diff --git a/src/main/java/net/ladenthin/llama/json/package-info.java b/src/main/java/net/ladenthin/llama/json/package-info.java
new file mode 100644
index 00000000..0d68fa73
--- /dev/null
+++ b/src/main/java/net/ladenthin/llama/json/package-info.java
@@ -0,0 +1,11 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+/**
+ * JSON serialization helpers for {@link net.ladenthin.llama} request / response shapes.
+ *
+ * <p>JSpecify {@code @NullMarked} is declared at module level in
+ * {@code module-info.java} and applies to this package transitively.
+ */
+package net.ladenthin.llama.json;
diff --git a/src/main/java/net/ladenthin/llama/package-info.java b/src/main/java/net/ladenthin/llama/package-info.java
new file mode 100644
index 00000000..ca7cad49
--- /dev/null
+++ b/src/main/java/net/ladenthin/llama/package-info.java
@@ -0,0 +1,18 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+/**
+ * Java bindings for llama.cpp.
+ *
+ * <p>JSpecify {@code @NullMarked} is declared at module level in
+ * {@code module-info.java} and applies transitively to every package
+ * in this module: every parameter, return value, and field is non-null
+ * unless explicitly annotated {@code @Nullable}. NullAway and the
+ * Checker Framework Nullness Checker both enforce this at compile
+ * time via the configured Error Prone compiler plugin (see
+ * {@code pom.xml}). Public-API methods that may legitimately have
+ * no value prefer {@code java.util.Optional<T>} over
+ * {@code @Nullable T}.
+ */
+package net.ladenthin.llama;
diff --git a/src/test/java/net/ladenthin/llama/CancellationTokenLincheckTest.java b/src/test/java/net/ladenthin/llama/CancellationTokenLincheckTest.java
index ac62e425..4119e832 100644
--- a/src/test/java/net/ladenthin/llama/CancellationTokenLincheckTest.java
+++ b/src/test/java/net/ladenthin/llama/CancellationTokenLincheckTest.java
@@ -4,8 +4,8 @@
 package net.ladenthin.llama;
 
 import org.jetbrains.kotlinx.lincheck.LinChecker;
-import org.jetbrains.kotlinx.lincheck.annotations.Operation;
-import org.jetbrains.kotlinx.lincheck.strategy.managed.modelchecking.ModelCheckingOptions;
+import org.jetbrains.lincheck.datastructures.ModelCheckingOptions;
+import org.jetbrains.lincheck.datastructures.Operation;
 import org.junit.jupiter.api.Test;
 
 /**
diff --git a/src/test/java/net/ladenthin/llama/ChatResponseTest.java b/src/test/java/net/ladenthin/llama/ChatResponseTest.java
index 1383d5e3..9769a7e8 100644
--- a/src/test/java/net/ladenthin/llama/ChatResponseTest.java
+++ b/src/test/java/net/ladenthin/llama/ChatResponseTest.java
@@ -5,7 +5,6 @@
 package net.ladenthin.llama;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertNull;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
 import java.util.List;
@@ -61,7 +60,7 @@ public void parsesToolCalls() {
                 + "]},\"finish_reason\":\"tool_calls\"}],"
                 + "\"usage\":{\"prompt_tokens\":3,\"completion_tokens\":7}}";
         ChatResponse r = parser.parseResponse(json);
-        ChatMessage m = r.getFirstMessage();
+        ChatMessage m = r.getFirstMessage().orElseThrow();
         assertEquals("assistant", m.getRole());
         List<ToolCall> tc = m.getToolCalls();
         assertEquals(2, tc.size());
@@ -80,7 +79,7 @@ public void parsesObjectShapedArguments() {
                 + "{\"name\":\"f\",\"arguments\":{\"a\":1,\"b\":2}}}]},"
                 + "\"finish_reason\":\"tool_calls\"}]}";
         ChatResponse r = parser.parseResponse(json);
-        String args = r.getFirstMessage().getToolCalls().get(0).getArgumentsJson();
+        String args = r.getFirstMessage().orElseThrow().getToolCalls().get(0).getArgumentsJson();
         // exact text isn't guaranteed, but must contain both fields
         assertTrue(args.contains("\"a\":1"), "expected serialized object, got: " + args);
         assertTrue(args.contains("\"b\":2"));
@@ -112,7 +111,7 @@ public void buildMessagesJsonRoundTripsToolTurns() {
     @Test
     public void buildToolsJsonEmptyWhenNoTools() {
         ChatRequest req = new ChatRequest().addMessage("user", "hi");
-        assertNull(req.buildToolsJson());
+        assertTrue(req.buildToolsJson().isEmpty());
     }
 
     @Test
@@ -120,7 +119,7 @@ public void buildToolsJsonInlinesParameterSchema() {
         ChatRequest req = new ChatRequest()
                 .addTool(new ToolDefinition(
                         "echo", "Echo a string", "{\"type\":\"object\",\"properties\":{\"s\":{\"type\":\"string\"}}}"));
-        String tools = req.buildToolsJson();
+        String tools = req.buildToolsJson().orElseThrow();
         assertTrue(tools.contains("\"type\":\"function\""), tools);
         assertTrue(tools.contains("\"name\":\"echo\""), tools);
         assertTrue(tools.contains("\"properties\""), tools);
diff --git a/src/test/java/net/ladenthin/llama/LlamaArchitectureTest.java b/src/test/java/net/ladenthin/llama/LlamaArchitectureTest.java
index 01a192b1..711646f9 100644
--- a/src/test/java/net/ladenthin/llama/LlamaArchitectureTest.java
+++ b/src/test/java/net/ladenthin/llama/LlamaArchitectureTest.java
@@ -3,12 +3,16 @@
 // SPDX-License-Identifier: MIT
 package net.ladenthin.llama;
 
+import static com.tngtech.archunit.lang.syntax.ArchRuleDefinition.fields;
 import static com.tngtech.archunit.lang.syntax.ArchRuleDefinition.noClasses;
+import static com.tngtech.archunit.library.dependencies.SlicesRuleDefinition.slices;
 
 import com.tngtech.archunit.core.importer.ImportOption;
 import com.tngtech.archunit.junit.AnalyzeClasses;
 import com.tngtech.archunit.junit.ArchTest;
 import com.tngtech.archunit.lang.ArchRule;
+import java.util.Random;
+import org.slf4j.Logger;
 
 @AnalyzeClasses(packages = "net.ladenthin.llama", importOptions = ImportOption.DoNotIncludeTests.class)
 public class LlamaArchitectureTest {
@@ -34,4 +38,98 @@ public class LlamaArchitectureTest {
             .should()
             .dependOnClassesThat()
             .resideInAnyPackage("org.junit..", "net.jqwik..", "com.tngtech.archunit..");
+
+    /**
+     * Every SLF4J {@link Logger} field follows the {@code private static final} idiom.
+     */
+    @ArchTest
+    static final ArchRule loggersArePrivateStaticFinal = fields()
+            .that()
+            .haveRawType(Logger.class)
+            .should()
+            .bePrivate()
+            .andShould()
+            .beStatic()
+            .andShould()
+            .beFinal();
+
+    /**
+     * No dependency cycles between sibling sub-packages (e.g. {@code args} and {@code json}).
+     * Root-package classes match no slice, so cycles through the root package are not detected.
+     */
+    @ArchTest
+    static final ArchRule noPackageCycles = slices()
+            .matching("net.ladenthin.llama.(*)..")
+            .should()
+            .beFreeOfCycles();
+
+    /**
+     * Production code must not import unsupported / internal JDK packages.
+     * These are not part of the Java SE API and may change or disappear without notice.
+     * {@code OSInfo} is vendored from xerial/sqlite-jdbc and was already audited;
+     * if it ever pulls in sun.*, this rule fails and forces a re-audit.
+     */
+    @ArchTest
+    static final ArchRule noInternalJdkImports = noClasses()
+            .that()
+            .resideInAPackage("net.ladenthin.llama..")
+            .should()
+            .dependOnClassesThat()
+            .resideInAnyPackage("sun..", "com.sun..", "jdk.internal..");
+
+    /**
+     * Public mutable state is forbidden: any non-static field declared
+     * {@code public} must also be {@code final}. {@link LlamaOutput} is an
+     * immutable value class with {@code public final} fields — that pattern
+     * remains allowed because the fields ARE final.
+     */
+    @ArchTest
+    static final ArchRule noPublicMutableFields = fields()
+            .that()
+            .arePublic()
+            .and()
+            .areNotStatic()
+            .should()
+            .beFinal();
+
+    /**
+     * Production code must not call {@link System#exit(int)}; throw an exception instead.
+     */
+    @ArchTest
+    static final ArchRule noSystemExit = noClasses()
+            .that()
+            .resideInAPackage("net.ladenthin.llama..")
+            .should()
+            .callMethod(System.class, "exit", int.class)
+            .allowEmptyShould(true);
+
+    /**
+     * Production code must not construct {@link java.util.Random}; {@code Random} is a non-cryptographic
+     * PRNG (CWE-338). Use {@link java.security.SecureRandom} or {@link java.util.concurrent.ThreadLocalRandom}
+     * depending on whether cryptographic strength or thread-local fast jitter is needed.
+     */
+    @ArchTest
+    static final ArchRule noNewRandom = noClasses()
+            .that()
+            .resideInAPackage("net.ladenthin.llama..")
+            .should()
+            .callConstructor(Random.class)
+            .orShould()
+            .callConstructor(Random.class, long.class)
+            .allowEmptyShould(true);
+
+    /**
+     * Production code must not call {@link Thread#sleep(long)} / {@link Thread#sleep(long, int)};
+     * prefer {@link java.util.concurrent.BlockingQueue#poll(long, java.util.concurrent.TimeUnit)} or
+     * {@link java.util.concurrent.locks.Condition#await(long, java.util.concurrent.TimeUnit)}.
+     */
+    @ArchTest
+    static final ArchRule noThreadSleep = noClasses()
+            .that()
+            .resideInAPackage("net.ladenthin.llama..")
+            .should()
+            .callMethod(Thread.class, "sleep", long.class)
+            .orShould()
+            .callMethod(Thread.class, "sleep", long.class, int.class)
+            .allowEmptyShould(true);
 }
diff --git a/src/test/java/net/ladenthin/llama/LlamaModelSkipDownloadTest.java b/src/test/java/net/ladenthin/llama/LlamaModelSkipDownloadTest.java
new file mode 100644
index 00000000..dcf4eae5
--- /dev/null
+++ b/src/test/java/net/ladenthin/llama/LlamaModelSkipDownloadTest.java
@@ -0,0 +1,102 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertInstanceOf;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertSame;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import net.ladenthin.llama.args.ModelFlag;
+import org.junit.jupiter.api.DisplayName;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Unit tests for the {@code SKIP_DOWNLOAD} plumbing on {@link ModelParameters} and the
+ * paired translation in {@link SkipDownloadFailureTranslator}.
+ *
+ * <p>These tests do NOT load the native library &#x2014; they exercise pure Java logic:
+ * the boolean-setter round-trip via {@link ModelParameters#hasFlag(ModelFlag)} and the
+ * static translation heuristic that promotes a generic {@link LlamaException} to a typed
+ * {@link ModelUnavailableException} when the {@link ModelFlag#SKIP_DOWNLOAD} flag is
+ * set.</p>
+ */
+public class LlamaModelSkipDownloadTest {
+
+    /** Default constructor used by JUnit Jupiter. */
+    public LlamaModelSkipDownloadTest() {
+        // no-op
+    }
+
+    @Test
+    @DisplayName("setSkipDownload(true) sets the SKIP_DOWNLOAD flag")
+    public void setSkipDownload_true_setsFlag() {
+        ModelParameters p = new ModelParameters().setSkipDownload(true);
+        assertTrue(p.hasFlag(ModelFlag.SKIP_DOWNLOAD));
+    }
+
+    @Test
+    @DisplayName("setSkipDownload(false) clears the SKIP_DOWNLOAD flag")
+    public void setSkipDownload_false_clearsFlag() {
+        ModelParameters p = new ModelParameters().setSkipDownload(true).setSkipDownload(false);
+        assertFalse(p.hasFlag(ModelFlag.SKIP_DOWNLOAD)); // flag was set first, so this proves the clear
+    }
+
+    @Test
+    @DisplayName("hasFlag returns false by default")
+    public void hasFlag_byDefault_returnsFalse() {
+        assertFalse(new ModelParameters().hasFlag(ModelFlag.SKIP_DOWNLOAD)); // fresh instance, no setter called
+    }
+
+    @Test
+    @DisplayName("translate: SKIP_DOWNLOAD set + 'Failed to parse' message -> ModelUnavailableException")
+    public void translate_skipDownloadSetAndParseFailed_returnsTypedException() {
+        ModelParameters p = new ModelParameters().setSkipDownload(true);
+        LlamaException original = new LlamaException("Failed to parse model parameters");
+
+        LlamaException translated = SkipDownloadFailureTranslator.translate(p, original);
+
+        assertInstanceOf(ModelUnavailableException.class, translated);
+        assertNotNull(translated.getMessage()); // guards the contains() check below against a null message
+        assertTrue(
+                translated.getMessage().contains("--skip-download"),
+                "message should mention the --skip-download flag for caller diagnosis");
+        assertSame(original, translated.getCause(), "original exception should be preserved as cause");
+    }
+
+    @Test
+    @DisplayName("translate: SKIP_DOWNLOAD set but unrelated message -> original exception passes through")
+    public void translate_skipDownloadSetButUnrelatedMessage_returnsOriginal() {
+        ModelParameters p = new ModelParameters().setSkipDownload(true);
+        LlamaException original = new LlamaException("could not allocate VRAM"); // no "Failed to parse" text
+
+        LlamaException translated = SkipDownloadFailureTranslator.translate(p, original);
+
+        assertSame(original, translated); // identity check: same instance, not a copy or wrapper
+    }
+
+    @Test
+    @DisplayName("translate: SKIP_DOWNLOAD NOT set -> original exception passes through even on parse-failed")
+    public void translate_skipDownloadNotSet_returnsOriginal() {
+        ModelParameters p = new ModelParameters(); // skip-download not set
+        LlamaException original = new LlamaException("Failed to parse model parameters");
+
+        LlamaException translated = SkipDownloadFailureTranslator.translate(p, original);
+
+        assertSame(original, translated); // identity check: same instance, not a copy or wrapper
+    }
+
+    @Test
+    @DisplayName("translate: null message -> original exception passes through")
+    public void translate_nullMessage_returnsOriginal() {
+        ModelParameters p = new ModelParameters().setSkipDownload(true);
+        LlamaException original = new LlamaException((String) null); // cast presumably picks an overload; confirm
+
+        LlamaException translated = SkipDownloadFailureTranslator.translate(p, original);
+
+        assertSame(original, translated); // also shows translate() tolerates a null message without throwing
+    }
+}
diff --git a/src/test/java/net/ladenthin/llama/LlamaModelTest.java b/src/test/java/net/ladenthin/llama/LlamaModelTest.java
index 6b3255b6..2605f627 100644
--- a/src/test/java/net/ladenthin/llama/LlamaModelTest.java
+++ b/src/test/java/net/ladenthin/llama/LlamaModelTest.java
@@ -362,7 +362,7 @@ public void testTypedChat() {
         ChatResponse r = model.chat(req);
         assertNotNull(r);
         assertFalse(r.getChoices().isEmpty());
-        assertNotNull(r.getFirstMessage());
+        assertTrue(r.getFirstMessage().isPresent());
         assertTrue(r.getUsage().getTotalTokens() > 0);
     }
 
diff --git a/src/test/java/net/ladenthin/llama/LoggingSmokeTest.java b/src/test/java/net/ladenthin/llama/LoggingSmokeTest.java
new file mode 100644
index 00000000..82e884d5
--- /dev/null
+++ b/src/test/java/net/ladenthin/llama/LoggingSmokeTest.java
@@ -0,0 +1,63 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.IOException;
+import nl.altindag.log.LogCaptor;
+import org.junit.jupiter.api.Test;
+import org.slf4j.LoggerFactory;
+
+@ClaudeGenerated(
+        purpose = "Smoke-test the SLF4J + Logback pipeline so a future binding or "
+                + "configuration regression is caught at test time rather than silently "
+                + "swallowing logs in production.")
+public class LoggingSmokeTest {
+
+    /**
+     * Direct binding/routing check: emit a known event through the configured
+     * pipeline and assert LogCaptor saw it. Fails if SLF4J binds to NOPLogger
+     * or if Logback is misconfigured to drop INFO from this logger.
+     */
+    @Test
+    public void slf4jPipelineEmits() {
+        try (LogCaptor captor = LogCaptor.forClass(OSInfo.class)) {
+            LoggerFactory.getLogger(OSInfo.class).info("smoke"); // same logger class the captor is attached to
+            assertTrue(
+                    captor.getInfoLogs().contains("smoke"),
+                    "SLF4J pipeline did not deliver INFO event to LogCaptor; "
+                            + "binding or Logback config is broken");
+        }
+    }
+
+    /**
+     * Production call-site check: trigger {@link OSInfo#getHardwareName()} on a
+     * stub {@link ProcessRunner} that throws, and assert the catch-block's
+     * {@code error} log is captured. Pins the production log line as part of
+     * the contract &#x2014; an accidental refactor that drops the logger call fails
+     * this test.
+     */
+    @Test
+    public void getHardwareNameLogsError_whenProcessRunnerThrows() {
+        ProcessRunner original = OSInfo.processRunner; // saved so the finally block can restore it
+        try (LogCaptor captor = LogCaptor.forClass(OSInfo.class)) {
+            OSInfo.processRunner = new ProcessRunner() { // NOTE(review): static state; confirm no parallel tests
+                @Override
+                String runAndWaitFor(String command) throws IOException {
+                    throw new IOException("boom"); // every command fails
+                }
+            };
+            assertEquals("unknown", OSInfo.getHardwareName()); // value expected when the runner throws
+            assertTrue(
+                    captor.getErrorLogs().stream()
+                            .anyMatch(m -> m.contains("Error while running uname -m")),
+                    "expected error log 'Error while running uname -m' was not captured");
+        } finally {
+            OSInfo.processRunner = original; // restore even if an assertion above failed
+        }
+    }
+}
diff --git a/src/test/java/net/ladenthin/llama/ModelParametersExtendedTest.java b/src/test/java/net/ladenthin/llama/ModelParametersExtendedTest.java
index bef4a3d4..1f4dc4f2 100644
--- a/src/test/java/net/ladenthin/llama/ModelParametersExtendedTest.java
+++ b/src/test/java/net/ladenthin/llama/ModelParametersExtendedTest.java
@@ -1058,22 +1058,22 @@ public void testToArrayComplexCombination() {
     }
 
     // -------------------------------------------------------------------------
-    // isDefault — extended
+    // isUnset — extended
     // -------------------------------------------------------------------------
 
     @Test
     public void testIsDefaultForCtxSize() {
         ModelParameters p = new ModelParameters();
-        assertTrue(p.isDefault("ctx-size"));
+        assertTrue(p.isUnset("ctx-size"));
         p.setCtxSize(2048);
-        assertFalse(p.isDefault("ctx-size"));
+        assertFalse(p.isUnset("ctx-size"));
     }
 
     @Test
     public void testIsDefaultForFlagOnly() {
         ModelParameters p = new ModelParameters();
-        assertTrue(p.isDefault("flash-attn"));
+        assertTrue(p.isUnset("flash-attn"));
         p.enableFlashAttn();
-        assertFalse(p.isDefault("flash-attn"));
+        assertFalse(p.isUnset("flash-attn"));
     }
 }
diff --git a/src/test/java/net/ladenthin/llama/ModelParametersTest.java b/src/test/java/net/ladenthin/llama/ModelParametersTest.java
index 59a295cf..7bd8630e 100644
--- a/src/test/java/net/ladenthin/llama/ModelParametersTest.java
+++ b/src/test/java/net/ladenthin/llama/ModelParametersTest.java
@@ -23,7 +23,7 @@
                 + "correct CLI argument formatting for enum-based setters (PoolingType, RopeScalingType, "
                 + "CacheType, GpuSplitMode, NumaStrategy, MiroStat) and composite-value setters "
                 + "(loraScaled, controlVectorScaled, controlVectorLayerRange), semicolon-separated "
-                + "lowercase sampler list, isDefault key-presence check, and the CliParameters base "
+                + "lowercase sampler list, isUnset key-presence check, and the CliParameters base "
                 + "behaviour: toString omits 'null' for flag-only entries, toArray always prepends an "
                 + "empty argv[0] string and omits values for null-valued flags.")
 public class ModelParametersTest {
@@ -185,25 +185,25 @@ public void testSetControlVectorLayerRangeSameStartEnd() {
     }
 
     // -------------------------------------------------------------------------
-    // isDefault
+    // isUnset
     // -------------------------------------------------------------------------
 
     @Test
     public void testIsDefaultTrueWhenNotSet() {
         ModelParameters p = new ModelParameters();
-        assertTrue(p.isDefault("threads"));
+        assertTrue(p.isUnset("threads"));
     }
 
     @Test
     public void testIsDefaultFalseWhenSet() {
         ModelParameters p = new ModelParameters().setThreads(4);
-        assertFalse(p.isDefault("threads"));
+        assertFalse(p.isUnset("threads"));
     }
 
     @Test
     public void testIsDefaultFalseAfterFlagOnly() {
         ModelParameters p = new ModelParameters().enableEmbedding();
-        assertFalse(p.isDefault("embedding"));
+        assertFalse(p.isUnset("embedding"));
     }
 
     // -------------------------------------------------------------------------
diff --git a/src/test/java/net/ladenthin/llama/MultimodalMessagesTest.java b/src/test/java/net/ladenthin/llama/MultimodalMessagesTest.java
index 523f098c..9fb5cafc 100644
--- a/src/test/java/net/ladenthin/llama/MultimodalMessagesTest.java
+++ b/src/test/java/net/ladenthin/llama/MultimodalMessagesTest.java
@@ -32,7 +32,7 @@ public class MultimodalMessagesTest {
     public void hasPartsIsFalseForLegacyConstructor() {
         ChatMessage m = new ChatMessage("user", "hello");
         assertFalse(m.hasParts());
-        assertEquals(null, m.getParts());
+        assertTrue(m.getParts().isEmpty());
     }
 
     @Test
@@ -40,7 +40,7 @@ public void hasPartsIsTrueForPartsConstructor() {
         ChatMessage m = new ChatMessage(
                 "user", Arrays.asList(ContentPart.text("hi"), ContentPart.imageUrl("data:image/png;base64,AAAA")));
         assertTrue(m.hasParts());
-        assertEquals(2, m.getParts().size());
+        assertEquals(2, m.getParts().orElseThrow().size());
     }
 
     @Test
@@ -60,9 +60,10 @@ public void userMultimodalFactoryBuildsUserMessage() {
         ChatMessage m = ChatMessage.userMultimodal(
                 ContentPart.text("what is this?"), ContentPart.imageUrl("data:image/jpeg;base64,Y"));
         assertEquals("user", m.getRole());
-        assertEquals(2, m.getParts().size());
-        assertEquals(ContentPart.Type.TEXT, m.getParts().get(0).getType());
-        assertEquals(ContentPart.Type.IMAGE_URL, m.getParts().get(1).getType());
+        List<ContentPart> parts = m.getParts().orElseThrow();
+        assertEquals(2, parts.size());
+        assertEquals(ContentPart.Type.TEXT, parts.get(0).getType());
+        assertEquals(ContentPart.Type.IMAGE_URL, parts.get(1).getType());
     }
 
     @Test
@@ -80,7 +81,7 @@ public void nullPartsListIsRejected() {
     public void getPartsListIsUnmodifiable() {
         ChatMessage m = ChatMessage.userMultimodal(ContentPart.text("x"));
         try {
-            m.getParts().add(ContentPart.text("y"));
+            m.getParts().orElseThrow().add(ContentPart.text("y"));
             fail("getParts() must return an unmodifiable list");
         } catch (UnsupportedOperationException expected) {
             // ok
diff --git a/src/test/java/net/ladenthin/llama/PairTest.java b/src/test/java/net/ladenthin/llama/PairTest.java
index 5a591561..d04819d0 100644
--- a/src/test/java/net/ladenthin/llama/PairTest.java
+++ b/src/test/java/net/ladenthin/llama/PairTest.java
@@ -7,6 +7,7 @@
 
 import static org.junit.jupiter.api.Assertions.*;
 
+import java.util.Objects;
 import org.junit.jupiter.api.Test;
 
 public class PairTest {
@@ -107,6 +108,16 @@ public void testHashCodeWithNull() {
         assertNotNull(pair.hashCode());
     }
 
+    @Test
+    public void testHashCodeMatchesObjectsHash() {
+        // Pins hashCode() to Objects.hash(key, value) exactly.
+        // Without this, PIT's PrimitiveReturnsMutator survives by replacing
+        // the return with 0 - the existing assertNotNull tests cannot detect
+        // that because hashCode()'s primitive int autoboxes to a non-null Integer.
+        Pair<String, Integer> pair = new Pair<>("key", 123); // same key/value literals as the expected hash below
+        assertEquals(Objects.hash("key", 123), pair.hashCode()); // JUnit order: expected first, actual second
+    }
+
     @Test
     public void testToString() {
         Pair<String, Integer> pair = new Pair<>("testKey", 42);
diff --git a/src/test/java/net/ladenthin/llama/args/ModelFlagTest.java b/src/test/java/net/ladenthin/llama/args/ModelFlagTest.java
index 4b01e6a7..86b46b04 100644
--- a/src/test/java/net/ladenthin/llama/args/ModelFlagTest.java
+++ b/src/test/java/net/ladenthin/llama/args/ModelFlagTest.java
@@ -48,6 +48,7 @@ public static Collection<Object[]> data() {
             {ModelFlag.NO_CLEAR_IDLE, "--no-cache-idle-slots"},
             {ModelFlag.MMPROJ_AUTO, "--mmproj-auto"},
             {ModelFlag.MMPROJ_OFFLOAD, "--mmproj-offload"},
+            {ModelFlag.SKIP_DOWNLOAD, "--skip-download"},
         });
     }
 
@@ -63,7 +64,7 @@ public void testGetCliFlag(ModelFlag flag, String expectedCliFlag) {
 
     @Test
     public void testEnumCount() {
-        assertEquals(31, ModelFlag.values().length);
+        assertEquals(32, ModelFlag.values().length);
     }
 
     @ParameterizedTest(name = "{0} -> {1}")