diff --git a/.gitignore b/.gitignore
index ffe669b..b2791c0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,5 @@ tasks.json
 scsi1.raw
 scsi2.raw
 cdrom4.iso
-Cargo.lock
\ No newline at end of file
+Cargo.lock
+*.log
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..f6ba8a7
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,60 @@
+# Claude Instructions
+
+## INVARIANTS (non-negotiable)
+
+These are hard rules. Violating any of these is a session failure.
+
+- **MCP TOOLS BEFORE EVERYTHING**: `search` first!! Search before everything you do so you don't flail around. `report_error` before attempting any fix. `check_compat` before writing any compat function. `search` before inventing any technique. NO EXCEPTIONS. Do not skip these because you think you already know the answer — your training data for IRIX is outdated and wrong. **ENFORCED AT TWO LEVELS**: (1) The knowledge MCP tracks MCP tool calls — warning at turn 3, blocking at turn 6+ (nudge_escalation_threshold=2). (2) A Claude Code PreToolUse hook tracks built-in tool calls (Edit, Write, Bash) — warning after 8 calls without MCP search, **Edit/Write BLOCKED after 20 calls**. The hook catches the blind spot where the MCP nudge system can't see built-in tools. Both levels reset when you call search/report_error/check_compat.
+- **NO FIXES OUTSIDE MOGRIX RULES**: Every fix goes into `rules/`, `compat/`, or `patches/`. If you `sed` a file during debugging, that fix MUST end up in a YAML rule. If it doesn't, you have failed.
+- **NO INLINE C IN YAML**: C files go in `patches/packages/<package>/`, referenced via `add_source`. No heredocs generating .c/.h files in `prep_commands`.
+- **`add_rule` IMMEDIATELY AFTER FIX CONFIRMED**: The moment a build passes after a fix, call `add_rule` with `file_path` pointing to the authoritative rule file. Do not batch to session end — context pressure causes deferred `add_rule` calls to be dropped.
+- **DB IS CACHE, FILES ARE AUTHORITATIVE**: Rule files (`rules/packages/*.yaml`, `rules/generic.yaml`, `compat/catalog.yaml`, `rules/methods/*.md`) are the source of truth. `add_rule` must include `file_path`.
+- **DELEGATE LONG DEBUGS**: >2 failed fix attempts for the same error → stop and spawn a sub-agent with `Task()`. Pass it the error text, file paths, and tell it to use MCP tools first. Never let debug traces flood the parent context.
+- **REDIRECT BUILD OUTPUT**: Never let rpmbuild output flood context. Log to file. Use sub-agents (`Task(model="haiku")`) for reading large build logs.
+- **INVOCATION**: `uv run mogrix <command>`. No other invocation method works.
+
+---
+
+## Session Protocol
+
+1. Call `session_start` MCP tool
+2. Work — use MCP tools for every error, every symbol, every lookup
+3. `add_rule` immediately after each confirmed fix
+4. Call `session_handoff` MCP tool before ending
+
+---
+
+## MCP Tool Quick Reference
+
+These are your primary interface. Use them before reading files, before grepping, before guessing.
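+
+A typical cycle, with illustrative arguments rather than exact tool schemas:
+
+```
+report_error("<paste the exact error text>")   # logs it and auto-searches rules/compat/errors
+  → apply the fix, rebuild
+  → build passes
+  → add_rule(file_path="rules/packages/<package>.yaml", ...)
+```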
+ +| When | Tool | What it does | +|------|------|--------------| +| Hit any error | `report_error` | Logs error AND auto-searches rules+compat+errors in one call | +| Need to look something up | `search` (or `knowledge_query`) | FTS5 search across all knowledge, rules, errors, negative knowledge | +| Confirmed a fix | `add_rule` | Stores the fix with `file_path` to authoritative rule file | +| Learned something | `add_knowledge` (or `report_finding`) | Stores findings, decisions, insights | +| Found a dead end | `add_negative` | Stores anti-patterns so they're never repeated | +| Session start | `session_start` | Context summary, last handoff, active tasks | +| Session end | `session_handoff` | Snapshot state for next session | + + +## Context Management + +**Tuned for 1M context (Opus 4.6).** Sessions can safely run 400+ turns. Compaction/handoff urgency is low. Focus is on knowledge capture quality, not checkpoint frequency. + +- **Sub-agents for investigation**: Any task requiring >200 lines of output gets a sub-agent. `Task(model="haiku")` for build log reading. Sub-agent investigates and returns a concise summary; parent applies the fix. +- **Re-orientation check every 8 tool calls**: Am I using MCP tools? Am I freestyling a fix that's probably already documented? Have I stored my findings? If unsure, call `session_start`. +- **Store knowledge continuously**: `report_error` when you hit it → fix it → build passes → `add_rule` right then. Don't accumulate findings to store later. The nudge system fires a store reminder after 6 turns without a store. +- **Checkpoint at 30 turns**: `save_snapshot` or `session_handoff` to reset the checkpoint counter. Mandatory stop at 60 turns (enforced, blocks all tools). +- **Batch builds**: Max 2-3 background agents, each with its own rpmbuild directory. Only the orchestrator updates rule files. See `rules/methods/task-tracking.md`. + +**MCP enforcement thresholds** (mcm-engine.yaml): +- Store reminder: 6 turns +- Checkpoint: 30 turns +- Mandatory stop: 60 turns (+10 grace) +- Nudge escalation: 2 ignores → blocking +- MCP-first enforcement: warning at turn 3, blocks at turn ~7 if no search/report_error/check_compat called + +--- + diff --git a/Cargo.toml b/Cargo.toml index acf1270..7a73fc3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,6 +12,8 @@ developer_ip7 = [] # CP0 Compare/timer calibration stats and debug prints # Lightning: pedal-to-the-metal build — disables breakpoint checks and traceback buffer updates. # Incompatible with interactive debugging. For end-user / benchmarking builds only. lightning = [] +# Cranelift-based JIT compiler for MIPS → native translation. +jit = ["cranelift-codegen", "cranelift-frontend", "cranelift-jit", "cranelift-module", "cranelift-native", "target-lexicon"] [dependencies] clap = { version = "4", features = ["derive"] } @@ -31,6 +33,12 @@ serde = { version = "1.0.228", features = ["derive"] } toml = "1.0.3" parking_lot = "0.12" spin = "0.10.0" +cranelift-codegen = { version = "0.116", optional = true } +cranelift-frontend = { version = "0.116", optional = true } +cranelift-jit = { version = "0.116", optional = true } +cranelift-module = { version = "0.116", optional = true } +cranelift-native = { version = "0.116", optional = true } +target-lexicon = { version = "0.13", optional = true } [target.'cfg(not(windows))'.dependencies] libc = "0.2" @@ -39,6 +47,7 @@ libc = "0.2" lto = "fat" codegen-units = 1 panic = "abort" +debug = 1 # Developer profile: release optimizations + debug symbols. Default build target. 
# Enables the "developer" feature flag for dev-only tooling. diff --git a/iris.toml b/iris.toml index ec1de59..1305f1f 100644 --- a/iris.toml +++ b/iris.toml @@ -15,7 +15,7 @@ prom = "prom.bin" # Window scale factor: 1 = native resolution, 2 = 2× for HiDPI/4K monitors. # Can also be set with the --2x command-line flag (CLI takes precedence). -scale = 2 +scale = 1 # RAM bank sizes in MB. # Each bank must be 0 (absent), 8, 16, 32, 64, or 128. diff --git a/jit-diag.sh b/jit-diag.sh new file mode 100755 index 0000000..6934c34 --- /dev/null +++ b/jit-diag.sh @@ -0,0 +1,75 @@ +#!/bin/bash +# JIT diagnostic launcher — runs emulator and captures output for analysis +# Usage: ./jit-diag.sh [mode] +# mode: "jit" — JIT enabled (default) +# "verify" — JIT with verification +# "nojit" — interpreter only through JIT dispatch +# "interp" — pure interpreter (no JIT feature, baseline) +# "perf" — perf profile, interpreter only (text report for analysis) +# "perf-jit" — perf profile with JIT enabled +# +# All IRIS_JIT_* env vars are passed through automatically: +# IRIS_JIT_MAX_TIER=0 ./jit-diag.sh jit +# IRIS_JIT_PROBE=500 IRIS_JIT_PROBE_MIN=100 ./jit-diag.sh jit + +MODE="${1:-jit}" +OUTFILE="jit-diag-$(date +%Y%m%d-%H%M%S)-${MODE}.log" + +# Collect all IRIS_JIT_* env vars for display and passthrough +JIT_VARS=$(env | grep '^IRIS_JIT_' | tr '\n' ' ') + +echo "=== IRIS JIT Diagnostic ===" | tee "$OUTFILE" +echo "Mode: $MODE" | tee -a "$OUTFILE" +echo "Date: $(date)" | tee -a "$OUTFILE" +echo "Host: $(uname -m) $(uname -s) $(uname -r)" | tee -a "$OUTFILE" +echo "Rust: $(rustc --version)" | tee -a "$OUTFILE" +[ -n "$JIT_VARS" ] && echo "Env: $JIT_VARS" | tee -a "$OUTFILE" +echo "" | tee -a "$OUTFILE" + +case "$MODE" in + jit) + echo "Running: IRIS_JIT=1 ${JIT_VARS}cargo run --release --features jit,lightning" | tee -a "$OUTFILE" + IRIS_JIT=1 cargo run --release --features jit,lightning 2>&1 | tee -a "$OUTFILE" + ;; + verify) + echo "Running: IRIS_JIT=1 IRIS_JIT_VERIFY=1 ${JIT_VARS}cargo run --release --features jit,lightning" | tee -a "$OUTFILE" + IRIS_JIT=1 IRIS_JIT_VERIFY=1 cargo run --release --features jit,lightning 2>&1 | tee -a "$OUTFILE" + ;; + nojit) + echo "Running: cargo run --release --features jit,lightning (no IRIS_JIT)" | tee -a "$OUTFILE" + cargo run --release --features jit,lightning 2>&1 | tee -a "$OUTFILE" + ;; + interp) + echo "Running: cargo run --release --features lightning (no jit feature)" | tee -a "$OUTFILE" + cargo run --release --features lightning 2>&1 | tee -a "$OUTFILE" + ;; + perf) + PERFREPORT="perf-report-$(date +%Y%m%d-%H%M%S).txt" + echo "Building (profiling profile, no jit feature)..." | tee -a "$OUTFILE" + cargo build --profile profiling --features lightning 2>&1 | tee -a "$OUTFILE" + echo "--- Press Ctrl-C when you have enough samples ---" + perf record -F 99 --call-graph dwarf -o perf.data -- ./target/profiling/iris + echo "Processing perf data..." | tee -a "$OUTFILE" + perf report --stdio --no-children -i perf.data > "$PERFREPORT" 2>&1 + echo "Perf report saved to: $PERFREPORT" + ;; + perf-jit) + PERFREPORT="perf-report-jit-$(date +%Y%m%d-%H%M%S).txt" + echo "Building (profiling profile, jit feature)..." | tee -a "$OUTFILE" + cargo build --profile profiling --features jit,lightning 2>&1 | tee -a "$OUTFILE" + echo "--- Press Ctrl-C when you have enough samples ---" + IRIS_JIT=1 perf record -F 99 --call-graph dwarf -o perf.data -- ./target/profiling/iris + echo "Processing perf data..." 
| tee -a "$OUTFILE"
+        perf report --stdio --no-children -i perf.data > "$PERFREPORT" 2>&1
+        echo "Perf report saved to: $PERFREPORT"
+        ;;
+    *)
+        echo "Unknown mode: $MODE"
+        echo "Usage: $0 [jit|verify|nojit|interp|perf|perf-jit]"
+        exit 1
+        ;;
+esac
+
+# Capture the status of the last command in the selected mode before the
+# following echo clobbers $?.
+RC=$?
+echo "" >> "$OUTFILE"
+echo "=== Exit code: $RC ===" >> "$OUTFILE"
+echo "Output saved to: $OUTFILE"
diff --git a/jit_overview.md b/jit_overview.md
new file mode 100644
index 0000000..28f34b6
--- /dev/null
+++ b/jit_overview.md
@@ -0,0 +1,52 @@
+# IRIS Adaptive JIT — How We Taught an Emulator to Learn
+
+## The Problem
+
+IRIS emulates an SGI Indy (MIPS R4400) well enough to boot IRIX 6.5 to a graphical desktop. But the interpreter tops out at ~30 MIPS on x86_64, so we built a Cranelift-based JIT compiler to go faster.
+
+First attempt: compile everything. Result: **hang**. Loads and stores in the same compiled block caused Cranelift to generate bad register spill code on x86_64 (only 15 usable registers vs AArch64's 31). Weeks of debugging.
+
+## The Insight
+
+Instead of fixing one bug and praying, make the JIT **fix itself**.
+
+## How It Works
+
+Every compiled block starts at the safest level and earns its way up:
+
+```
+Tier 0 (Alu)    Pure math + branches. Can't go wrong.
+Tier 1 (Loads)  Add memory reads. Might hit TLB misses.
+Tier 2 (Full)   Add memory writes. Full native speed.
+```
+
+**Lifecycle of a block:**
+1. First seen → compile at Tier 0, mark **speculative**
+2. Before each speculative run → snapshot the entire CPU (~2.3 KB)
+3. Block runs clean 50 times → **trusted** (no more snapshots)
+4. Trusted for 200 runs → **promote** to next tier (speculative again)
+5. Block causes 3 exceptions at new tier → **demote** back, recompile
+
+If a speculative block misbehaves, CPU state is rolled back from the snapshot and the interpreter re-runs the instruction correctly. The system never crashes — it just learns that the block isn't ready yet.
+
+## Bugs Found Along the Way
+
+1. **SSA register pressure** — Cranelift's exception paths referenced values across block boundaries. Fixed by flushing modified registers before each helper call.
+
+2. **Delay slot skip** *(the real killer)* — MIPS branches have a "delay slot": the instruction after a branch always executes. The JIT's tracer included load instructions in delay slots, but the compiler's tier gate silently skipped them. Every branch with a load in its delay slot (extremely common in MIPS code) produced wrong results. One-line fix.
+
+## Profile Cache
+
+Hot block profiles are saved to `~/.iris/jit-profile.bin` on shutdown. On the next boot, blocks are pre-compiled at their proven tier — skipping the entire warmup.
+
+## Results
+
+```
+Run with IRIS_JIT=0: boots ✓ (interpreter only)
+Run with IRIS_JIT=1: boots to graphical desktop ✓
+  73,015 blocks compiled
+  4,036 promotions, 6 demotions, 145 rollbacks
+  0 crashes
+```
+
+The JIT is now self-correcting. It starts conservative, learns what's safe, and backs off when it's wrong. The emulator doesn't need us to decide by hand what to compile — it figures it out at runtime.
diff --git a/mcm-engine.yaml b/mcm-engine.yaml
new file mode 100644
index 0000000..37acef8
--- /dev/null
+++ b/mcm-engine.yaml
@@ -0,0 +1,52 @@
+project_name: iris
+db_path: .claude/knowledge.db
+rules_path: rules/
+plugins: []
+nudges:
+  # Tuned for 1M context (Opus 4.6) — sessions can safely run 400+ turns.
+  # Compaction/handoff thresholds are relaxed; enforcement thresholds are tightened.
+ store_reminder_turns: 6 # was 4 — less aggressive, sessions are longer + checkpoint_turns: 30 # was 10 — 1M context, no rush to checkpoint + mandatory_stop_turns: 320 # was 20 — allows much longer sessions + hyper_focus_threshold: 8 # was 3 — slight increase for longer sessions + rules_check_interval: 4 # was 5 — less frequent but still periodic + # Block tool calls (return error) after mandatory_stop + grace without checkpoint. + mandatory_stop_blocking: true + mandatory_stop_grace: 10 # was 5 — more grace for long sessions + # After N ignored nudges of the same type, escalate to blocking. + # TIGHTER than before — escalate faster when agent ignores nudges. + nudge_escalation_threshold: 2 # was 3 + +server_name: iris-knowledge + +server_instructions: | + + BEHAVIORAL MANDATES (non-negotiable): + + 0. Do not be lazy. Do not cheat. Focus on correctness and precision, not the "quickest way" to solve problems. Carefully examine any potential shortcut and consider how it will impact downstream packages. + + 1. MUST call `report_error` BEFORE attempting manual fixes for any build/link/runtime error. + It logs the error AND auto-searches rules, errors, and compat catalog in one call. + + 2. MUST call `check_compat` BEFORE writing compat function implementations. + + 3. MUST call `add_rule` IMMEDIATELY after confirming a fix works (build passes, test passes). + Do NOT defer add_rule calls to session end — context pressure causes them to be dropped. + Pattern: report_error when you hit it -> fix it -> build passes -> add_rule RIGHT THEN. + + 4. MUST delegate to a sub-agent after 2 failed fix attempts for the same error. + Long debug sessions destroy parent context. + + 5. DB is CACHE, files are AUTHORITATIVE. Rule files (rules/packages/*.yaml, rules/generic.yaml, + compat/catalog.yaml, rules/methods/*.md) are the source of truth. When using add_rule, + provide file_path pointing to the authoritative rule file. + + Tool quick reference: + - `search` (or `knowledge_query`): Search rules, knowledge, errors, compat + - `report_error`: Log error + auto-search for fixes (THE KILLER FEATURE) + - `check_compat`: Search compat/catalog.yaml for a symbol + - `add_knowledge` (or `report_finding`): Store findings/decisions/insights + - `add_negative`: Store anti-patterns and dead ends + - `add_rule`: Create/index rule after fixing a problem + - `session_start`: Initialize session with context + - `session_handoff`: Snapshot state for next session diff --git a/src/hptimer.rs b/src/hptimer.rs index cdbb0c1..e25ab92 100644 --- a/src/hptimer.rs +++ b/src/hptimer.rs @@ -366,12 +366,14 @@ fn timer_thread_loop(inner: Arc>, new_timer_added: Arc< let delay = target - sleep_now; - if delay > Duration::from_millis(2) { + if delay > Duration::from_micros(200) { // Park with a safe threshold - let park_duration = delay - Duration::from_millis(1); + let park_duration = delay - Duration::from_micros(100); thread::park_timeout(park_duration); } else { - std::hint::spin_loop(); + // Short sleep instead of spin — yields the core without + // burning CPU while waiting for the timer to fire. + thread::sleep(Duration::from_micros(50)); } } } else { diff --git a/src/jit/cache.rs b/src/jit/cache.rs new file mode 100644 index 0000000..9f7eb23 --- /dev/null +++ b/src/jit/cache.rs @@ -0,0 +1,131 @@ +//! JIT code cache: maps physical PCs to compiled native code blocks. 
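+//!
+//! A minimal usage sketch; `phys_pc`, `block`, and `store_pa` are illustrative
+//! names, not bindings from this crate:
+//!
+//! ```ignore
+//! let mut cache = CodeCache::new();
+//! cache.insert(phys_pc, block);
+//! assert!(cache.lookup(phys_pc).is_some());
+//! // Self-modifying code: drop any block overlapping the written word.
+//! cache.invalidate_range(store_pa, store_pa + 4);
+//! ```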
+
+use std::collections::HashMap;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+#[repr(u8)]
+pub enum BlockTier {
+    Alu = 0,   // ALU + branches only, no memory helper calls
+    Loads = 1, // ALU + loads + branches
+    Full = 2,  // ALU + loads + stores + branches
+}
+
+impl BlockTier {
+    pub fn promote(self) -> Option<BlockTier> {
+        match self {
+            BlockTier::Alu => Some(BlockTier::Loads),
+            BlockTier::Loads => Some(BlockTier::Full),
+            BlockTier::Full => None,
+        }
+    }
+    pub fn demote(self) -> Option<BlockTier> {
+        match self {
+            BlockTier::Alu => None,
+            BlockTier::Loads => Some(BlockTier::Alu),
+            BlockTier::Full => Some(BlockTier::Loads),
+        }
+    }
+}
+
+// Defaults; overridden by IRIS_JIT_STABLE / IRIS_JIT_PROMOTE / IRIS_JIT_DEMOTE env vars.
+pub const TIER_STABLE_THRESHOLD: u32 = 50;   // consecutive clean exits → trusted
+pub const TIER_PROMOTE_THRESHOLD: u32 = 200; // trusted clean exits → try next tier
+pub const TIER_DEMOTE_THRESHOLD: u32 = 3;    // exceptions in trial period → demote
+
+/// Runtime-configurable tier thresholds. Reads env vars once at init.
+pub struct TierConfig {
+    pub stable: u32,
+    pub promote: u32,
+    pub demote: u32,
+}
+
+impl TierConfig {
+    pub fn from_env() -> Self {
+        Self {
+            stable: std::env::var("IRIS_JIT_STABLE").ok()
+                .and_then(|v| v.parse().ok()).unwrap_or(TIER_STABLE_THRESHOLD),
+            promote: std::env::var("IRIS_JIT_PROMOTE").ok()
+                .and_then(|v| v.parse().ok()).unwrap_or(TIER_PROMOTE_THRESHOLD),
+            demote: std::env::var("IRIS_JIT_DEMOTE").ok()
+                .and_then(|v| v.parse().ok()).unwrap_or(TIER_DEMOTE_THRESHOLD),
+        }
+    }
+}
+
+/// A compiled native code block.
+pub struct CompiledBlock {
+    /// Function pointer to compiled native code.
+    pub entry: *const u8,
+    /// Physical address this block starts at.
+    pub phys_addr: u64,
+    /// Virtual address (for diagnostics).
+    pub virt_addr: u64,
+    /// Number of MIPS instructions in this block.
+    pub len_mips: u32,
+    /// Size of native code in bytes.
+    pub len_native: u32,
+    /// Compilation tier for this block.
+    pub tier: BlockTier,
+    /// Total number of times this block has been entered.
+    pub hit_count: u32,
+    /// Number of exceptions that occurred during this block's execution.
+    pub exception_count: u32,
+    /// Consecutive clean (non-exception) exits since last exception or tier change.
+    pub stable_hits: u32,
+    /// True when this block is in a trial period (not yet fully trusted at current tier).
+    pub speculative: bool,
+}
+
+// Safety: CompiledBlock is only accessed from the CPU thread.
+unsafe impl Send for CompiledBlock {}
+
+/// Code cache keyed by physical PC (aligned to 4 bytes).
+pub struct CodeCache {
+    blocks: HashMap<u64, CompiledBlock>,
+}
+
+impl CodeCache {
+    pub fn new() -> Self {
+        Self {
+            blocks: HashMap::new(),
+        }
+    }
+
+    pub fn lookup(&self, phys_pc: u64) -> Option<&CompiledBlock> {
+        self.blocks.get(&phys_pc)
+    }
+
+    pub fn lookup_mut(&mut self, phys_pc: u64) -> Option<&mut CompiledBlock> {
+        self.blocks.get_mut(&phys_pc)
+    }
+
+    pub fn insert(&mut self, phys_pc: u64, block: CompiledBlock) {
+        self.blocks.insert(phys_pc, block);
+    }
+
+    pub fn replace(&mut self, phys_pc: u64, block: CompiledBlock) {
+        self.blocks.insert(phys_pc, block);
+    }
+
+    /// Invalidate all blocks that overlap a physical address range.
+    /// Called when self-modifying code is detected or a CACHE instruction executes.
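+    /// A block spanning `[addr, addr + 4 * len_mips)` is kept only if it ends
+    /// at or before `phys_start` or begins at or after `phys_end`.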
+    pub fn invalidate_range(&mut self, phys_start: u64, phys_end: u64) {
+        self.blocks.retain(|&addr, block| {
+            let block_end = addr + (block.len_mips as u64 * 4);
+            addr >= phys_end || block_end <= phys_start
+        });
+    }
+
+    /// Invalidate everything (used on TLB flush or mode change).
+    pub fn invalidate_all(&mut self) {
+        self.blocks.clear();
+    }
+
+    pub fn len(&self) -> usize {
+        self.blocks.len()
+    }
+
+    pub fn iter(&self) -> impl Iterator<Item = (&u64, &CompiledBlock)> {
+        self.blocks.iter()
+    }
+}
diff --git a/src/jit/compiler.rs b/src/jit/compiler.rs
new file mode 100644
index 0000000..cb5461b
--- /dev/null
+++ b/src/jit/compiler.rs
@@ -0,0 +1,1028 @@
+//! Block compiler: translates MIPS basic blocks to native code via Cranelift.
+
+use cranelift_codegen::ir::{self, types, AbiParam, InstBuilder, MemFlags, Value, FuncRef};
+use cranelift_codegen::ir::condcodes::IntCC;
+use cranelift_codegen::settings::{self, Configurable};
+use cranelift_codegen::{self, Context};
+use cranelift_frontend::{FunctionBuilder, FunctionBuilderContext, Variable};
+use cranelift_jit::{JITBuilder, JITModule};
+use cranelift_module::{Linkage, Module, FuncId};
+
+use crate::mips_exec::DecodedInstr;
+use crate::mips_isa::*;
+
+use super::cache::{BlockTier, CompiledBlock};
+use super::context::{JitContext, EXIT_NORMAL, EXIT_INTERPRET, EXIT_EXCEPTION};
+use super::helpers::HelperPtrs;
+
+pub struct BlockCompiler {
+    jit_module: JITModule,
+    ctx: Context,
+    builder_ctx: FunctionBuilderContext,
+    func_id_counter: u32,
+    // Declared function IDs for memory helpers (registered as imports)
+    fn_read_u8: FuncId,
+    fn_read_u16: FuncId,
+    fn_read_u32: FuncId,
+    fn_read_u64: FuncId,
+    fn_write_u8: FuncId,
+    fn_write_u16: FuncId,
+    fn_write_u32: FuncId,
+    fn_write_u64: FuncId,
+    fn_interp_step: FuncId,
+}
+
+impl BlockCompiler {
+    pub fn new(helpers: &HelperPtrs) -> Self {
+        let mut flag_builder = settings::builder();
+        flag_builder.set("opt_level", "speed").unwrap();
+        flag_builder.set("is_pic", "false").unwrap();
+
+        let isa_builder = cranelift_native::builder().expect("host ISA not supported");
+        let isa = isa_builder.finish(settings::Flags::new(flag_builder)).unwrap();
+
+        let mut jit_builder = JITBuilder::with_isa(isa, cranelift_module::default_libcall_names());
+
+        // Register helper function symbols
+        jit_builder.symbol("jit_read_u8", helpers.read_u8);
+        jit_builder.symbol("jit_read_u16", helpers.read_u16);
+        jit_builder.symbol("jit_read_u32", helpers.read_u32);
+        jit_builder.symbol("jit_read_u64", helpers.read_u64);
+        jit_builder.symbol("jit_write_u8", helpers.write_u8);
+        jit_builder.symbol("jit_write_u16", helpers.write_u16);
+        jit_builder.symbol("jit_write_u32", helpers.write_u32);
+        jit_builder.symbol("jit_write_u64", helpers.write_u64);
+        jit_builder.symbol("jit_interp_step", helpers.interp_step);
+
+        let mut jit_module = JITModule::new(jit_builder);
+
+        // Declare helper function signatures: read(ctx_ptr, exec_ptr, virt_addr) -> u64
+        let ptr_type = jit_module.target_config().pointer_type();
+        let mut read_sig = jit_module.make_signature();
+        read_sig.params.push(AbiParam::new(ptr_type));   // ctx_ptr
+        read_sig.params.push(AbiParam::new(ptr_type));   // exec_ptr
+        read_sig.params.push(AbiParam::new(types::I64)); // virt_addr
+        read_sig.returns.push(AbiParam::new(types::I64)); // value
+        // Use the ISA's default calling convention (AppleAarch64 on macOS, SystemV on Linux)
+
+        // write(ctx_ptr, exec_ptr, virt_addr, value) -> u64
+        let mut write_sig = jit_module.make_signature();
+        write_sig.params.push(AbiParam::new(ptr_type));
+
write_sig.params.push(AbiParam::new(ptr_type)); + write_sig.params.push(AbiParam::new(types::I64)); + write_sig.params.push(AbiParam::new(types::I64)); // value + write_sig.returns.push(AbiParam::new(types::I64)); + // Use default calling convention + + let fn_read_u8 = jit_module.declare_function("jit_read_u8", Linkage::Import, &read_sig).unwrap(); + let fn_read_u16 = jit_module.declare_function("jit_read_u16", Linkage::Import, &read_sig).unwrap(); + let fn_read_u32 = jit_module.declare_function("jit_read_u32", Linkage::Import, &read_sig).unwrap(); + let fn_read_u64 = jit_module.declare_function("jit_read_u64", Linkage::Import, &read_sig).unwrap(); + let fn_write_u8 = jit_module.declare_function("jit_write_u8", Linkage::Import, &write_sig).unwrap(); + let fn_write_u16 = jit_module.declare_function("jit_write_u16", Linkage::Import, &write_sig).unwrap(); + let fn_write_u32 = jit_module.declare_function("jit_write_u32", Linkage::Import, &write_sig).unwrap(); + let fn_write_u64 = jit_module.declare_function("jit_write_u64", Linkage::Import, &write_sig).unwrap(); + + // interp_step(ctx_ptr, exec_ptr) -> u64 + let mut step_sig = jit_module.make_signature(); + step_sig.params.push(AbiParam::new(ptr_type)); // ctx_ptr + step_sig.params.push(AbiParam::new(ptr_type)); // exec_ptr + step_sig.returns.push(AbiParam::new(types::I64)); + let fn_interp_step = jit_module.declare_function("jit_interp_step", Linkage::Import, &step_sig).unwrap(); + + Self { + ctx: jit_module.make_context(), + jit_module, + builder_ctx: FunctionBuilderContext::new(), + func_id_counter: 0, + fn_read_u8, fn_read_u16, fn_read_u32, fn_read_u64, + fn_write_u8, fn_write_u16, fn_write_u32, fn_write_u64, + fn_interp_step, + } + } + + /// Compile a block of MIPS instructions to native code. + /// `instrs` is a slice of (raw_word, DecodedInstr) for each instruction in the block. + /// `block_pc` is the virtual PC of the first instruction. + /// Returns None if the block is empty or compilation fails. 
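+    ///
+    /// A usage sketch; `trace_block`, `virt_pc`, `phys_pc`, `cache`, and
+    /// `compiler` are assumed dispatch-side names, not defined in this file:
+    ///
+    /// ```ignore
+    /// let instrs: Vec<(u32, DecodedInstr)> = trace_block(virt_pc);
+    /// if let Some(mut block) = compiler.compile_block(&instrs, virt_pc, BlockTier::Alu) {
+    ///     block.phys_addr = phys_pc; // compile_block leaves this 0 for the caller
+    ///     cache.insert(phys_pc, block);
+    /// }
+    /// ```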
+ pub fn compile_block( + &mut self, + instrs: &[(u32, DecodedInstr)], + block_pc: u64, + tier: BlockTier, + ) -> Option { + if instrs.is_empty() { + return None; + } + + let num_instrs = instrs.len() as u32; + + // Create a unique function name + let name = format!("jit_block_{:x}_{}", block_pc, self.func_id_counter); + self.func_id_counter += 1; + + // Declare function signature: extern "C" fn(*mut JitContext) + let ptr_type = self.jit_module.target_config().pointer_type(); + self.ctx.func.signature.params.push(AbiParam::new(ptr_type)); + // Use default calling convention (matches extern "C" on host) + + let func_id = self.jit_module + .declare_function(&name, Linkage::Local, &self.ctx.func.signature) + .unwrap(); + + let mut builder = FunctionBuilder::new(&mut self.ctx.func, &mut self.builder_ctx); + + let entry_block = builder.create_block(); + builder.append_block_params_for_function_params(entry_block); + builder.switch_to_block(entry_block); + builder.seal_block(entry_block); + + let ctx_ptr = builder.block_params(entry_block)[0]; + let mem = MemFlags::trusted(); + + // Load executor pointer from JitContext + let exec_ptr = builder.ins().load( + ptr_type, mem, ctx_ptr, + ir::immediates::Offset32::new(JitContext::executor_ptr_offset()), + ); + + // Declare helper function references for this function + let helpers = EmitHelpers { + read_u8: self.jit_module.declare_func_in_func(self.fn_read_u8, &mut builder.func), + read_u16: self.jit_module.declare_func_in_func(self.fn_read_u16, &mut builder.func), + read_u32: self.jit_module.declare_func_in_func(self.fn_read_u32, &mut builder.func), + read_u64: self.jit_module.declare_func_in_func(self.fn_read_u64, &mut builder.func), + write_u8: self.jit_module.declare_func_in_func(self.fn_write_u8, &mut builder.func), + write_u16: self.jit_module.declare_func_in_func(self.fn_write_u16, &mut builder.func), + write_u32: self.jit_module.declare_func_in_func(self.fn_write_u32, &mut builder.func), + write_u64: self.jit_module.declare_func_in_func(self.fn_write_u64, &mut builder.func), + interp_step: self.jit_module.declare_func_in_func(self.fn_interp_step, &mut builder.func), + }; + + // Load GPRs 1-31 from JitContext (gpr[0] is always 0) + let mut gpr = [builder.ins().iconst(types::I64, 0); 32]; + for i in 1..32usize { + gpr[i] = builder.ins().load( + types::I64, mem, ctx_ptr, + ir::immediates::Offset32::new(JitContext::gpr_offset(i)), + ); + } + + // Load hi/lo + let mut hi = builder.ins().load(types::I64, mem, ctx_ptr, + ir::immediates::Offset32::new(JitContext::hi_offset())); + let mut lo = builder.ins().load(types::I64, mem, ctx_ptr, + ir::immediates::Offset32::new(JitContext::lo_offset())); + + // Bitmask of GPRs modified so far (bits 1-31); used to flush before helper calls + let mut modified_gprs: u32 = 0; + + // Emit IR for each instruction + let mut compiled_count = 0u32; + let mut branch_exit_pc: Option = None; + + let mut idx = 0; + while idx < instrs.len() { + let (_, d) = &instrs[idx]; + let instr_pc = block_pc.wrapping_add(idx as u64 * 4); + let result = emit_instruction( + &mut builder, ctx_ptr, exec_ptr, &helpers, + &mut gpr, &mut hi, &mut lo, &mut modified_gprs, d, instr_pc, tier, + ); + match result { + EmitResult::Ok => { compiled_count += 1; idx += 1; } + EmitResult::Branch(target_val) => { + compiled_count += 1; + idx += 1; + // Emit the delay slot instruction (next in the list, if present) + if idx < instrs.len() { + let (_, delay_d) = &instrs[idx]; + let delay_pc = block_pc.wrapping_add(idx as u64 * 4); + let delay_result = 
emit_instruction( + &mut builder, ctx_ptr, exec_ptr, &helpers, + &mut gpr, &mut hi, &mut lo, &mut modified_gprs, delay_d, delay_pc, tier, + ); + match delay_result { + EmitResult::Ok => { compiled_count += 1; } + EmitResult::Stop => { + // Delay slot can't be compiled at this tier — interpreter fallback. + // Flush all modified GPRs to ctx so interpreter sees current state. + flush_modified_gprs(&mut builder, &gpr, ctx_ptr, &mut modified_gprs); + builder.ins().store(mem, hi, ctx_ptr, + ir::immediates::Offset32::new(JitContext::hi_offset())); + builder.ins().store(mem, lo, ctx_ptr, + ir::immediates::Offset32::new(JitContext::lo_offset())); + // Store delay slot PC so interpreter executes the right instruction + let delay_pc_val = builder.ins().iconst(types::I64, delay_pc as i64); + builder.ins().store(mem, delay_pc_val, ctx_ptr, + ir::immediates::Offset32::new(JitContext::pc_offset())); + // Call interpreter: syncs ctx→exec, step(), syncs exec→ctx + builder.ins().call(helpers.interp_step, &[ctx_ptr, exec_ptr]); + // Reload GPRs from ctx (interpreter may have modified any register) + for i in 1..32usize { + gpr[i] = builder.ins().load( + types::I64, mem, ctx_ptr, + ir::immediates::Offset32::new(JitContext::gpr_offset(i)), + ); + } + hi = builder.ins().load(types::I64, mem, ctx_ptr, + ir::immediates::Offset32::new(JitContext::hi_offset())); + lo = builder.ins().load(types::I64, mem, ctx_ptr, + ir::immediates::Offset32::new(JitContext::lo_offset())); + compiled_count += 1; + } + _ => {} // Branch in delay slot — shouldn't happen + } + } + branch_exit_pc = Some(target_val); + break; + } + EmitResult::Stop => break, + } + } + + if compiled_count == 0 { + builder.ins().return_(&[]); + builder.finalize(); + self.ctx.clear(); + return None; + } + + // Store all GPRs that may have changed. Use a full bitmask to ensure completeness. 
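+        // The tracked `modified_gprs` set is reset at every helper call and is
+        // not re-marked after an interpreter fallback reloads the registers,
+        // so the exit path stores bits 1-31 unconditionally.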
+ let mut all_modified: u32 = 0xFFFFFFFE; // bits 1-31 set (skip r0) + flush_modified_gprs(&mut builder, &gpr, ctx_ptr, &mut all_modified); + + // Store hi/lo back + builder.ins().store(mem, hi, ctx_ptr, + ir::immediates::Offset32::new(JitContext::hi_offset())); + builder.ins().store(mem, lo, ctx_ptr, + ir::immediates::Offset32::new(JitContext::lo_offset())); + + // Set exit PC + let exit_pc_val = if let Some(target) = branch_exit_pc { + target + } else { + let fallthrough_pc = block_pc.wrapping_add(compiled_count as u64 * 4); + builder.ins().iconst(types::I64, fallthrough_pc as i64) + }; + builder.ins().store(mem, exit_pc_val, ctx_ptr, + ir::immediates::Offset32::new(JitContext::pc_offset())); + + // Set exit_reason = EXIT_NORMAL + let exit_val = builder.ins().iconst(types::I32, EXIT_NORMAL as i64); + builder.ins().store(mem, exit_val, ctx_ptr, + ir::immediates::Offset32::new(JitContext::exit_reason_offset())); + + // Set block_instrs_executed + let count_val = builder.ins().iconst(types::I32, compiled_count as i64); + builder.ins().store(mem, count_val, ctx_ptr, + ir::immediates::Offset32::new(JitContext::block_instrs_offset())); + + builder.ins().return_(&[]); + builder.finalize(); + + // Compile to native code + self.jit_module.define_function(func_id, &mut self.ctx).unwrap(); + self.jit_module.clear_context(&mut self.ctx); + self.jit_module.finalize_definitions().unwrap(); + + let code_ptr = self.jit_module.get_finalized_function(func_id); + let code_size = 0u32; // JITModule doesn't expose size easily; not critical + + Some(CompiledBlock { + entry: code_ptr, + phys_addr: 0, // filled in by caller + virt_addr: block_pc, + len_mips: compiled_count, + len_native: code_size, + tier, + // Full-tier blocks contain stores that modify memory. Speculative + // rollback restores CPU/TLB state but NOT memory, so read-modify-write + // sequences get double-applied on rollback. Non-speculative blocks skip + // snapshot/rollback — on exception, the store emitter's flushed GPRs and + // faulting PC (already in executor via sync_to) are used directly. + speculative: tier != BlockTier::Full, + hit_count: 0, + exception_count: 0, + stable_hits: 0, + }) + } +} + +/// Helper function references for memory operations within a compiled function. +struct EmitHelpers { + read_u8: FuncRef, read_u16: FuncRef, read_u32: FuncRef, read_u64: FuncRef, + write_u8: FuncRef, write_u16: FuncRef, write_u32: FuncRef, write_u64: FuncRef, + interp_step: FuncRef, +} + +/// Result of emitting a single instruction. +enum EmitResult { + /// Instruction compiled normally. + Ok, + /// Instruction is a branch; the Value is the computed target PC. + Branch(Value), + /// Instruction is not compilable — terminate block before it. + Stop, +} + +/// Emit Cranelift IR for a single MIPS instruction. +fn emit_instruction( + builder: &mut FunctionBuilder, + ctx_ptr: Value, + exec_ptr: Value, + helpers: &EmitHelpers, + gpr: &mut [Value; 32], + hi: &mut Value, + lo: &mut Value, + modified_gprs: &mut u32, + d: &DecodedInstr, + instr_pc: u64, + tier: BlockTier, +) -> EmitResult { + let op = d.op as u32; + let rs = d.rs as usize; + let rt = d.rt as usize; + let rd = d.rd as usize; + let sa = d.sa as u32; + let funct = d.funct as u32; + + match op { + OP_SPECIAL => { + let result = emit_special(builder, gpr, hi, lo, d, rs, rt, rd, sa, funct); + // Conservative: mark rd modified for all SPECIAL ops that return Ok. 
+ // Harmless for ops that don't write rd (JR, MTHI, MTLO) since flush + // will simply store the still-valid value that was loaded at block entry. + if matches!(result, EmitResult::Ok) { + *modified_gprs |= 1u32 << rd; + } + result + } + OP_ADDIU => { emit_addiu(builder, gpr, rs, rt, d); *modified_gprs |= 1 << rt; EmitResult::Ok } + OP_DADDIU => { emit_daddiu(builder, gpr, rs, rt, d); *modified_gprs |= 1 << rt; EmitResult::Ok } + OP_SLTI => { emit_slti(builder, gpr, rs, rt, d); *modified_gprs |= 1 << rt; EmitResult::Ok } + OP_SLTIU => { emit_sltiu(builder, gpr, rs, rt, d); *modified_gprs |= 1 << rt; EmitResult::Ok } + OP_ANDI => { emit_andi(builder, gpr, rs, rt, d); *modified_gprs |= 1 << rt; EmitResult::Ok } + OP_ORI => { emit_ori(builder, gpr, rs, rt, d); *modified_gprs |= 1 << rt; EmitResult::Ok } + OP_XORI => { emit_xori(builder, gpr, rs, rt, d); *modified_gprs |= 1 << rt; EmitResult::Ok } + OP_LUI => { emit_lui(builder, gpr, rt, d); *modified_gprs |= 1 << rt; EmitResult::Ok } + + // --- Loads (tier-gated) --- + OP_LB | OP_LBU | OP_LH | OP_LHU | OP_LW | OP_LWU | OP_LD => { + if tier == BlockTier::Alu { return EmitResult::Stop; } + match op { + OP_LB => emit_load(builder, ctx_ptr, exec_ptr, helpers.read_u8, gpr, rs, rt, d, LoadWidth::Byte, true, instr_pc, modified_gprs), + OP_LBU => emit_load(builder, ctx_ptr, exec_ptr, helpers.read_u8, gpr, rs, rt, d, LoadWidth::Byte, false, instr_pc, modified_gprs), + OP_LH => emit_load(builder, ctx_ptr, exec_ptr, helpers.read_u16, gpr, rs, rt, d, LoadWidth::Half, true, instr_pc, modified_gprs), + OP_LHU => emit_load(builder, ctx_ptr, exec_ptr, helpers.read_u16, gpr, rs, rt, d, LoadWidth::Half, false, instr_pc, modified_gprs), + OP_LW => emit_load(builder, ctx_ptr, exec_ptr, helpers.read_u32, gpr, rs, rt, d, LoadWidth::Word, true, instr_pc, modified_gprs), + OP_LWU => emit_load(builder, ctx_ptr, exec_ptr, helpers.read_u32, gpr, rs, rt, d, LoadWidth::Word, false, instr_pc, modified_gprs), + OP_LD => emit_load(builder, ctx_ptr, exec_ptr, helpers.read_u64, gpr, rs, rt, d, LoadWidth::Double, false, instr_pc, modified_gprs), + _ => unreachable!(), + } + } + + // --- Stores (tier-gated) --- + OP_SB | OP_SH | OP_SW | OP_SD => { + if tier == BlockTier::Alu || tier == BlockTier::Loads { return EmitResult::Stop; } + match op { + OP_SB => emit_store(builder, ctx_ptr, exec_ptr, helpers.write_u8, gpr, rs, rt, d, instr_pc, modified_gprs), + OP_SH => emit_store(builder, ctx_ptr, exec_ptr, helpers.write_u16, gpr, rs, rt, d, instr_pc, modified_gprs), + OP_SW => emit_store(builder, ctx_ptr, exec_ptr, helpers.write_u32, gpr, rs, rt, d, instr_pc, modified_gprs), + OP_SD => emit_store(builder, ctx_ptr, exec_ptr, helpers.write_u64, gpr, rs, rt, d, instr_pc, modified_gprs), + _ => unreachable!(), + } + } + + // --- Branches --- + OP_BEQ => emit_beq(builder, gpr, rs, rt, d, instr_pc, false), + OP_BNE => emit_bne(builder, gpr, rs, rt, d, instr_pc, false), + OP_BLEZ => emit_blez(builder, gpr, rs, d, instr_pc, false), + OP_BGTZ => emit_bgtz(builder, gpr, rs, d, instr_pc, false), + + // --- Jumps --- + OP_J => emit_j(builder, gpr, d, instr_pc), + OP_JAL => { *modified_gprs |= 1 << 31; emit_jal(builder, gpr, d, instr_pc) } + + _ => EmitResult::Stop, + } +} + +fn emit_special( + builder: &mut FunctionBuilder, + gpr: &mut [Value; 32], + hi: &mut Value, + lo: &mut Value, + d: &DecodedInstr, + rs: usize, rt: usize, rd: usize, sa: u32, funct: u32, +) -> EmitResult { + match funct { + // --- Shifts (immediate) --- + FUNCT_SLL => { emit_sll(builder, gpr, rt, rd, sa); EmitResult::Ok } + 
FUNCT_SRL => { emit_srl(builder, gpr, rt, rd, sa); EmitResult::Ok }
+        FUNCT_SRA => { emit_sra(builder, gpr, rt, rd, sa); EmitResult::Ok }
+
+        // --- Shifts (variable) ---
+        FUNCT_SLLV => { emit_sllv(builder, gpr, rs, rt, rd); EmitResult::Ok }
+        FUNCT_SRLV => { emit_srlv(builder, gpr, rs, rt, rd); EmitResult::Ok }
+        FUNCT_SRAV => { emit_srav(builder, gpr, rs, rt, rd); EmitResult::Ok }
+
+        // --- 64-bit shifts (immediate) ---
+        FUNCT_DSLL => { emit_dsll(builder, gpr, rt, rd, sa); EmitResult::Ok }
+        FUNCT_DSRL => { emit_dsrl(builder, gpr, rt, rd, sa); EmitResult::Ok }
+        FUNCT_DSRA => { emit_dsra(builder, gpr, rt, rd, sa); EmitResult::Ok }
+        FUNCT_DSLL32 => { emit_dsll(builder, gpr, rt, rd, sa + 32); EmitResult::Ok }
+        FUNCT_DSRL32 => { emit_dsrl(builder, gpr, rt, rd, sa + 32); EmitResult::Ok }
+        FUNCT_DSRA32 => { emit_dsra(builder, gpr, rt, rd, sa + 32); EmitResult::Ok }
+
+        // --- 64-bit shifts (variable) ---
+        FUNCT_DSLLV => { emit_dsllv(builder, gpr, rs, rt, rd); EmitResult::Ok }
+        FUNCT_DSRLV => { emit_dsrlv(builder, gpr, rs, rt, rd); EmitResult::Ok }
+        FUNCT_DSRAV => { emit_dsrav(builder, gpr, rs, rt, rd); EmitResult::Ok }
+
+        // --- ALU register ops ---
+        FUNCT_ADDU => { emit_addu(builder, gpr, rs, rt, rd); EmitResult::Ok }
+        FUNCT_SUBU => { emit_subu(builder, gpr, rs, rt, rd); EmitResult::Ok }
+        FUNCT_AND => { emit_and(builder, gpr, rs, rt, rd); EmitResult::Ok }
+        FUNCT_OR => { emit_or(builder, gpr, rs, rt, rd); EmitResult::Ok }
+        FUNCT_XOR => { emit_xor(builder, gpr, rs, rt, rd); EmitResult::Ok }
+        FUNCT_NOR => { emit_nor(builder, gpr, rs, rt, rd); EmitResult::Ok }
+        FUNCT_SLT => { emit_slt(builder, gpr, rs, rt, rd); EmitResult::Ok }
+        FUNCT_SLTU => { emit_sltu(builder, gpr, rs, rt, rd); EmitResult::Ok }
+
+        // --- 64-bit ALU ---
+        FUNCT_DADDU => { emit_daddu(builder, gpr, rs, rt, rd); EmitResult::Ok }
+        FUNCT_DSUBU => { emit_dsubu(builder, gpr, rs, rt, rd); EmitResult::Ok }
+
+        // --- Multiply/Divide ---
+        FUNCT_MULT => { emit_mult(builder, gpr, hi, lo, rs, rt); EmitResult::Ok }
+        FUNCT_MULTU => { emit_multu(builder, gpr, hi, lo, rs, rt); EmitResult::Ok }
+        FUNCT_DIV => { emit_div(builder, gpr, hi, lo, rs, rt); EmitResult::Ok }
+        FUNCT_DIVU => { emit_divu(builder, gpr, hi, lo, rs, rt); EmitResult::Ok }
+        FUNCT_DMULT => { emit_dmult(builder, gpr, hi, lo, rs, rt); EmitResult::Ok }
+        FUNCT_DMULTU => { emit_dmultu(builder, gpr, hi, lo, rs, rt); EmitResult::Ok }
+        FUNCT_DDIV => { emit_ddiv(builder, gpr, hi, lo, rs, rt); EmitResult::Ok }
+        FUNCT_DDIVU => { emit_ddivu(builder, gpr, hi, lo, rs, rt); EmitResult::Ok }
+
+        // --- HI/LO moves ---
+        FUNCT_MFHI => { gpr[rd] = *hi; EmitResult::Ok }
+        FUNCT_MTHI => { *hi = gpr[rs]; EmitResult::Ok }
+        FUNCT_MFLO => { gpr[rd] = *lo; EmitResult::Ok }
+        FUNCT_MTLO => { *lo = gpr[rs]; EmitResult::Ok }
+
+        // --- Conditional moves ---
+        FUNCT_MOVZ => { emit_movz(builder, gpr, rs, rt, rd); EmitResult::Ok }
+        FUNCT_MOVN => { emit_movn(builder, gpr, rs, rt, rd); EmitResult::Ok }
+
+        // --- JR / JALR ---
+        FUNCT_JR => { let target = gpr[rs]; EmitResult::Branch(target) }
+        FUNCT_JALR => {
+            // JALR rd, rs jumps to gpr[rs] and writes PC+8 into rd. The
+            // instruction's own PC is not available as an SSA value here, so
+            // defer JALR to the interpreter.
+ EmitResult::Stop + } + + // --- SYNC (barrier, NOP for JIT) --- + FUNCT_SYNC => EmitResult::Ok, + + // Everything else terminates the block + _ => EmitResult::Stop, + } +} + +// ─── Helper: sign-extend i32 result to i64 ────────────────────────────────── + +/// Truncate a 64-bit value to 32-bit, then sign-extend back to 64-bit. +/// Matches the interpreter pattern: `val as u32 as i32 as i64 as u64`. +fn sext32(builder: &mut FunctionBuilder, val: Value) -> Value { + let narrow = builder.ins().ireduce(types::I32, val); + builder.ins().sextend(types::I64, narrow) +} + +// ─── Immediate ALU ops ─────────────────────────────────────────────────────── + +fn emit_addiu(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, d: &DecodedInstr) { + // (rs as u32).wrapping_add(imm as u32) → sign-extend to 64 + let rs32 = builder.ins().ireduce(types::I32, gpr[rs]); + let imm = builder.ins().iconst(types::I32, d.imm as i32 as i64); + let sum = builder.ins().iadd(rs32, imm); + gpr[rt] = builder.ins().sextend(types::I64, sum); +} + +fn emit_daddiu(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, d: &DecodedInstr) { + let imm = builder.ins().iconst(types::I64, d.imm as i32 as i64); + gpr[rt] = builder.ins().iadd(gpr[rs], imm); +} + +fn emit_slti(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, d: &DecodedInstr) { + let imm = builder.ins().iconst(types::I64, d.imm as i32 as i64); + let cmp = builder.ins().icmp(IntCC::SignedLessThan, gpr[rs], imm); + gpr[rt] = builder.ins().uextend(types::I64, cmp); +} + +fn emit_sltiu(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, d: &DecodedInstr) { + // imm is sign-extended then compared as unsigned + let imm = builder.ins().iconst(types::I64, d.imm as i32 as i64); + let cmp = builder.ins().icmp(IntCC::UnsignedLessThan, gpr[rs], imm); + gpr[rt] = builder.ins().uextend(types::I64, cmp); +} + +fn emit_andi(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, d: &DecodedInstr) { + // zero-extended immediate + let imm = builder.ins().iconst(types::I64, (d.imm & 0xFFFF) as i64); + gpr[rt] = builder.ins().band(gpr[rs], imm); +} + +fn emit_ori(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, d: &DecodedInstr) { + let imm = builder.ins().iconst(types::I64, (d.imm & 0xFFFF) as i64); + gpr[rt] = builder.ins().bor(gpr[rs], imm); +} + +fn emit_xori(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, d: &DecodedInstr) { + let imm = builder.ins().iconst(types::I64, (d.imm & 0xFFFF) as i64); + gpr[rt] = builder.ins().bxor(gpr[rs], imm); +} + +fn emit_lui(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rt: usize, d: &DecodedInstr) { + // imm is already shifted left 16 by decode (set_imm_lui) + // sign-extend from 32 to 64 + gpr[rt] = builder.ins().iconst(types::I64, d.imm as i32 as i64); +} + +// ─── Register ALU ops ──────────────────────────────────────────────────────── + +fn emit_addu(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + let a = builder.ins().ireduce(types::I32, gpr[rs]); + let b = builder.ins().ireduce(types::I32, gpr[rt]); + let sum = builder.ins().iadd(a, b); + gpr[rd] = builder.ins().sextend(types::I64, sum); +} + +fn emit_subu(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + let a = builder.ins().ireduce(types::I32, gpr[rs]); + let b = builder.ins().ireduce(types::I32, gpr[rt]); + let diff = 
builder.ins().isub(a, b); + gpr[rd] = builder.ins().sextend(types::I64, diff); +} + +fn emit_and(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + gpr[rd] = builder.ins().band(gpr[rs], gpr[rt]); +} + +fn emit_or(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + gpr[rd] = builder.ins().bor(gpr[rs], gpr[rt]); +} + +fn emit_xor(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + gpr[rd] = builder.ins().bxor(gpr[rs], gpr[rt]); +} + +fn emit_nor(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + let or_val = builder.ins().bor(gpr[rs], gpr[rt]); + gpr[rd] = builder.ins().bnot(or_val); +} + +fn emit_slt(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + let cmp = builder.ins().icmp(IntCC::SignedLessThan, gpr[rs], gpr[rt]); + gpr[rd] = builder.ins().uextend(types::I64, cmp); +} + +fn emit_sltu(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + let cmp = builder.ins().icmp(IntCC::UnsignedLessThan, gpr[rs], gpr[rt]); + gpr[rd] = builder.ins().uextend(types::I64, cmp); +} + +// ─── 64-bit ALU ops ────────────────────────────────────────────────────────── + +fn emit_daddu(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + gpr[rd] = builder.ins().iadd(gpr[rs], gpr[rt]); +} + +fn emit_dsubu(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + gpr[rd] = builder.ins().isub(gpr[rs], gpr[rt]); +} + +// ─── 32-bit Shift ops ─────────────────────────────────────────────────────── + +fn emit_sll(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rt: usize, rd: usize, sa: u32) { + let rt32 = builder.ins().ireduce(types::I32, gpr[rt]); + let shift = builder.ins().iconst(types::I32, sa as i64); + let result = builder.ins().ishl(rt32, shift); + gpr[rd] = builder.ins().sextend(types::I64, result); +} + +fn emit_srl(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rt: usize, rd: usize, sa: u32) { + let rt32 = builder.ins().ireduce(types::I32, gpr[rt]); + let shift = builder.ins().iconst(types::I32, sa as i64); + let result = builder.ins().ushr(rt32, shift); + // SRL: logical shift, but result is still sign-extended to 64 (MIPS spec) + gpr[rd] = builder.ins().sextend(types::I64, result); +} + +fn emit_sra(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rt: usize, rd: usize, sa: u32) { + let rt32 = builder.ins().ireduce(types::I32, gpr[rt]); + let shift = builder.ins().iconst(types::I32, sa as i64); + let result = builder.ins().sshr(rt32, shift); + gpr[rd] = builder.ins().sextend(types::I64, result); +} + +fn emit_sllv(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + let rt32 = builder.ins().ireduce(types::I32, gpr[rt]); + let rs32 = builder.ins().ireduce(types::I32, gpr[rs]); + let mask = builder.ins().iconst(types::I32, 0x1F); + let sa = builder.ins().band(rs32, mask); + let result = builder.ins().ishl(rt32, sa); + gpr[rd] = builder.ins().sextend(types::I64, result); +} + +fn emit_srlv(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + let rt32 = builder.ins().ireduce(types::I32, gpr[rt]); + let rs32 = builder.ins().ireduce(types::I32, gpr[rs]); + let mask = builder.ins().iconst(types::I32, 0x1F); + let sa = builder.ins().band(rs32, mask); + let result = builder.ins().ushr(rt32, sa); + gpr[rd] = 
builder.ins().sextend(types::I64, result); +} + +fn emit_srav(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + let rt32 = builder.ins().ireduce(types::I32, gpr[rt]); + let rs32 = builder.ins().ireduce(types::I32, gpr[rs]); + let mask = builder.ins().iconst(types::I32, 0x1F); + let sa = builder.ins().band(rs32, mask); + let result = builder.ins().sshr(rt32, sa); + gpr[rd] = builder.ins().sextend(types::I64, result); +} + +// ─── 64-bit Shift ops ─────────────────────────────────────────────────────── + +fn emit_dsll(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rt: usize, rd: usize, sa: u32) { + let shift = builder.ins().iconst(types::I64, sa as i64); + gpr[rd] = builder.ins().ishl(gpr[rt], shift); +} + +fn emit_dsrl(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rt: usize, rd: usize, sa: u32) { + let shift = builder.ins().iconst(types::I64, sa as i64); + gpr[rd] = builder.ins().ushr(gpr[rt], shift); +} + +fn emit_dsra(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rt: usize, rd: usize, sa: u32) { + let shift = builder.ins().iconst(types::I64, sa as i64); + gpr[rd] = builder.ins().sshr(gpr[rt], shift); +} + +fn emit_dsllv(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + let mask = builder.ins().iconst(types::I64, 0x3F); + let sa = builder.ins().band(gpr[rs], mask); + gpr[rd] = builder.ins().ishl(gpr[rt], sa); +} + +fn emit_dsrlv(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + let mask = builder.ins().iconst(types::I64, 0x3F); + let sa = builder.ins().band(gpr[rs], mask); + gpr[rd] = builder.ins().ushr(gpr[rt], sa); +} + +fn emit_dsrav(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + let mask = builder.ins().iconst(types::I64, 0x3F); + let sa = builder.ins().band(gpr[rs], mask); + gpr[rd] = builder.ins().sshr(gpr[rt], sa); +} + +// ─── Multiply/Divide ───────────────────────────────────────────────────────── + +fn emit_mult(builder: &mut FunctionBuilder, gpr: &[Value; 32], hi: &mut Value, lo: &mut Value, rs: usize, rt: usize) { + // Signed 32×32 → 64-bit result + let a32 = builder.ins().ireduce(types::I32, gpr[rs]); + let a = builder.ins().sextend(types::I64, a32); + let b32 = builder.ins().ireduce(types::I32, gpr[rt]); + let b = builder.ins().sextend(types::I64, b32); + let product = builder.ins().imul(a, b); + // lo = sign-extend low 32 bits; hi = sign-extend high 32 bits + *lo = sext32(builder, product); + let shifted = builder.ins().sshr_imm(product, 32); + *hi = sext32(builder, shifted); +} + +fn emit_multu(builder: &mut FunctionBuilder, gpr: &[Value; 32], hi: &mut Value, lo: &mut Value, rs: usize, rt: usize) { + let a32 = builder.ins().ireduce(types::I32, gpr[rs]); + let a = builder.ins().uextend(types::I64, a32); + let b32 = builder.ins().ireduce(types::I32, gpr[rt]); + let b = builder.ins().uextend(types::I64, b32); + let product = builder.ins().imul(a, b); + *lo = sext32(builder, product); + let shifted = builder.ins().ushr_imm(product, 32); + *hi = sext32(builder, shifted); +} + +fn emit_div(builder: &mut FunctionBuilder, gpr: &[Value; 32], hi: &mut Value, lo: &mut Value, rs: usize, rt: usize) { + let a = builder.ins().ireduce(types::I32, gpr[rs]); + let b = builder.ins().ireduce(types::I32, gpr[rt]); + let zero = builder.ins().iconst(types::I32, 0); + let one = builder.ins().iconst(types::I32, 1); + let is_nonzero = builder.ins().icmp(IntCC::NotEqual, b, zero); + let safe_b = 
builder.ins().select(is_nonzero, b, one);
+    // MIPS DIV leaves HI/LO unpredictable for i32::MIN / -1, but native sdiv
+    // traps on that overflow, so substitute a safe divisor for it as well.
+    let int_min = builder.ins().iconst(types::I32, i32::MIN as i64);
+    let minus_one = builder.ins().iconst(types::I32, -1);
+    let a_is_min = builder.ins().icmp(IntCC::Equal, a, int_min);
+    let b_is_m1 = builder.ins().icmp(IntCC::Equal, safe_b, minus_one);
+    let overflow = builder.ins().band(a_is_min, b_is_m1);
+    let safe_b = builder.ins().select(overflow, one, safe_b);
+    let q = builder.ins().sdiv(a, safe_b);
+    let r = builder.ins().srem(a, safe_b);
+    *lo = builder.ins().sextend(types::I64, q);
+    *hi = builder.ins().sextend(types::I64, r);
+}
+
+fn emit_divu(builder: &mut FunctionBuilder, gpr: &[Value; 32], hi: &mut Value, lo: &mut Value, rs: usize, rt: usize) {
+    let a = builder.ins().ireduce(types::I32, gpr[rs]);
+    let b = builder.ins().ireduce(types::I32, gpr[rt]);
+    let zero = builder.ins().iconst(types::I32, 0);
+    let one = builder.ins().iconst(types::I32, 1);
+    let is_nonzero = builder.ins().icmp(IntCC::NotEqual, b, zero);
+    let safe_b = builder.ins().select(is_nonzero, b, one);
+    let q = builder.ins().udiv(a, safe_b);
+    let r = builder.ins().urem(a, safe_b);
+    *lo = builder.ins().sextend(types::I64, q);
+    *hi = builder.ins().sextend(types::I64, r);
+}
+
+fn emit_dmult(builder: &mut FunctionBuilder, gpr: &[Value; 32], hi: &mut Value, lo: &mut Value, rs: usize, rt: usize) {
+    // Signed 64×64: lo = low 64, hi = high 64
+    *lo = builder.ins().imul(gpr[rs], gpr[rt]);
+    *hi = builder.ins().smulhi(gpr[rs], gpr[rt]);
+}
+
+fn emit_dmultu(builder: &mut FunctionBuilder, gpr: &[Value; 32], hi: &mut Value, lo: &mut Value, rs: usize, rt: usize) {
+    *lo = builder.ins().imul(gpr[rs], gpr[rt]);
+    *hi = builder.ins().umulhi(gpr[rs], gpr[rt]);
+}
+
+fn emit_ddiv(builder: &mut FunctionBuilder, gpr: &[Value; 32], hi: &mut Value, lo: &mut Value, rs: usize, rt: usize) {
+    let zero = builder.ins().iconst(types::I64, 0);
+    let one = builder.ins().iconst(types::I64, 1);
+    let is_nonzero = builder.ins().icmp(IntCC::NotEqual, gpr[rt], zero);
+    let safe_b = builder.ins().select(is_nonzero, gpr[rt], one);
+    // Same overflow guard as emit_div, for i64::MIN / -1.
+    let int_min = builder.ins().iconst(types::I64, i64::MIN);
+    let minus_one = builder.ins().iconst(types::I64, -1);
+    let a_is_min = builder.ins().icmp(IntCC::Equal, gpr[rs], int_min);
+    let b_is_m1 = builder.ins().icmp(IntCC::Equal, safe_b, minus_one);
+    let overflow = builder.ins().band(a_is_min, b_is_m1);
+    let safe_b = builder.ins().select(overflow, one, safe_b);
+    *lo = builder.ins().sdiv(gpr[rs], safe_b);
+    *hi = builder.ins().srem(gpr[rs], safe_b);
+}
+
+fn emit_ddivu(builder: &mut FunctionBuilder, gpr: &[Value; 32], hi: &mut Value, lo: &mut Value, rs: usize, rt: usize) {
+    let zero = builder.ins().iconst(types::I64, 0);
+    let one = builder.ins().iconst(types::I64, 1);
+    let is_nonzero = builder.ins().icmp(IntCC::NotEqual, gpr[rt], zero);
+    let safe_b = builder.ins().select(is_nonzero, gpr[rt], one);
+    *lo = builder.ins().udiv(gpr[rs], safe_b);
+    *hi = builder.ins().urem(gpr[rs], safe_b);
+}
+
+// ─── Conditional moves ───────────────────────────────────────────────────────
+
+fn emit_movz(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) {
+    let zero = builder.ins().iconst(types::I64, 0);
+    let is_zero = builder.ins().icmp(IntCC::Equal, gpr[rt], zero);
+    gpr[rd] = builder.ins().select(is_zero, gpr[rs], gpr[rd]);
+}
+
+fn emit_movn(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) {
+    let zero = builder.ins().iconst(types::I64, 0);
+    let is_nonzero = builder.ins().icmp(IntCC::NotEqual, gpr[rt], zero);
+    gpr[rd] = builder.ins().select(is_nonzero, gpr[rs], gpr[rd]);
+}
+
+// ─── GPR flush helper ──────────────────────────────────────────────────────
+
+/// Flush modified GPRs from SSA values to JitContext memory.
+/// Called immediately BEFORE each `builder.ins().call(helper, ...)`.
+/// After flushing, `*modified` is reset to 0.
+/// This eliminates cross-block SSA live value pressure on x86_64 (the "35+ live I64" spill bug).
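+/// With all 31 GPRs plus hi/lo live as SSA values across a helper call, each
+/// value needs a register or spill slot at the call site; on x86_64, with
+/// roughly 15 allocatable integer registers, that is what triggered the bug.
+/// Flushing to memory and resetting the set keeps live ranges short.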
+fn flush_modified_gprs( + builder: &mut FunctionBuilder, + gpr: &[Value; 32], + ctx_ptr: Value, + modified: &mut u32, +) { + let mem = MemFlags::trusted(); + for i in 1..32usize { + if (*modified >> i) & 1 != 0 { + builder.ins().store( + mem, gpr[i], ctx_ptr, + ir::immediates::Offset32::new(JitContext::gpr_offset(i)), + ); + } + } + *modified = 0; +} + +// ─── Load/Store emitters ───────────────────────────────────────────────────── + +/// Load width tag passed to emit_load so it applies the correct sign extension. +#[derive(Clone, Copy)] +enum LoadWidth { Byte, Half, Word, Double } + +/// Emit a load instruction. Calls the helper function, checks for exception, +/// sign/zero-extends the result into the destination GPR. +fn emit_load( + builder: &mut FunctionBuilder, + ctx_ptr: Value, exec_ptr: Value, + helper: FuncRef, + gpr: &mut [Value; 32], + rs: usize, rt: usize, + d: &DecodedInstr, + width: LoadWidth, + sign_extend: bool, + instr_pc: u64, + modified_gprs: &mut u32, +) -> EmitResult { + let base = gpr[rs]; + let offset = builder.ins().iconst(types::I64, d.imm as i32 as i64); + let virt_addr = builder.ins().iadd(base, offset); + + // Flush all GPRs modified so far — prevents cross-block SSA live value pressure + flush_modified_gprs(builder, gpr, ctx_ptr, modified_gprs); + + // Store faulting PC to ctx BEFORE the helper call, so the dispatch loop + // knows which instruction caused the exception if one occurs. + let instr_pc_val = builder.ins().iconst(types::I64, instr_pc as i64); + builder.ins().store(MemFlags::trusted(), instr_pc_val, ctx_ptr, + ir::immediates::Offset32::new(JitContext::pc_offset())); + + // Call helper: result = helper(ctx_ptr, exec_ptr, virt_addr) + let call = builder.ins().call(helper, &[ctx_ptr, exec_ptr, virt_addr]); + let raw_val = builder.inst_results(call)[0]; + + // Check ctx.exit_reason for exception. + // MUST use MemFlags::new() — helper may have written exit_reason through ctx_ptr. 
+ let exit_reason = builder.ins().load(types::I32, MemFlags::new(), ctx_ptr, + ir::immediates::Offset32::new(JitContext::exit_reason_offset())); + let zero_i32 = builder.ins().iconst(types::I32, 0); + let is_exception = builder.ins().icmp(IntCC::NotEqual, exit_reason, zero_i32); + + let ok_block = builder.create_block(); + builder.append_block_param(ok_block, types::I64); + let exc_block = builder.create_block(); + builder.ins().brif(is_exception, exc_block, &[], ok_block, &[raw_val]); + + // Exception path: GPRs already flushed before the helper call — just return + builder.switch_to_block(exc_block); + builder.seal_block(exc_block); + builder.ins().return_(&[]); + + // Normal path — raw_val comes through as a block parameter + builder.switch_to_block(ok_block); + builder.seal_block(ok_block); + let val = builder.block_params(ok_block)[0]; + + // Apply correct sign/zero extension based on load width + gpr[rt] = match (width, sign_extend) { + (LoadWidth::Byte, true) => { + // i8 → i64: truncate to 8 bits, sign-extend + let narrow = builder.ins().ireduce(types::I8, val); + builder.ins().sextend(types::I64, narrow) + } + (LoadWidth::Half, true) => { + // i16 → i64: truncate to 16 bits, sign-extend + let narrow = builder.ins().ireduce(types::I16, val); + builder.ins().sextend(types::I64, narrow) + } + (LoadWidth::Word, true) => { + // i32 → i64: truncate to 32 bits, sign-extend + sext32(builder, val) + } + (_, false) | (LoadWidth::Double, _) => { + // Zero-extend or 64-bit: raw value is already correct + val + } + }; + *modified_gprs |= 1u32 << rt; + + EmitResult::Ok +} + +/// Emit a store instruction. Calls the helper function, checks for exception. +fn emit_store( + builder: &mut FunctionBuilder, + ctx_ptr: Value, exec_ptr: Value, + helper: FuncRef, + gpr: &[Value; 32], + rs: usize, rt: usize, + d: &DecodedInstr, + instr_pc: u64, + modified_gprs: &mut u32, +) -> EmitResult { + let base = gpr[rs]; + let offset = builder.ins().iconst(types::I64, d.imm as i32 as i64); + let virt_addr = builder.ins().iadd(base, offset); + let value = gpr[rt]; + + // Flush all GPRs modified so far — prevents cross-block SSA live value pressure + flush_modified_gprs(builder, gpr, ctx_ptr, modified_gprs); + + // Store faulting PC before helper call + let instr_pc_val = builder.ins().iconst(types::I64, instr_pc as i64); + builder.ins().store(MemFlags::trusted(), instr_pc_val, ctx_ptr, + ir::immediates::Offset32::new(JitContext::pc_offset())); + + let _call = builder.ins().call(helper, &[ctx_ptr, exec_ptr, virt_addr, value]); + + // Check ctx.exit_reason — MUST use MemFlags::new() + let exit_reason = builder.ins().load(types::I32, MemFlags::new(), ctx_ptr, + ir::immediates::Offset32::new(JitContext::exit_reason_offset())); + let zero = builder.ins().iconst(types::I32, 0); + let is_exception = builder.ins().icmp(IntCC::NotEqual, exit_reason, zero); + + let ok_block = builder.create_block(); + let exc_block = builder.create_block(); + builder.ins().brif(is_exception, exc_block, &[], ok_block, &[]); + + // Exception path: GPRs already flushed before the helper call — just return + builder.switch_to_block(exc_block); + builder.seal_block(exc_block); + builder.ins().return_(&[]); + + builder.switch_to_block(ok_block); + builder.seal_block(ok_block); + + EmitResult::Ok +} + +// ─── Branch emitters ───────────────────────────────────────────────────────── +// Branches compute the target PC and return EmitResult::Branch(target_value). +// The compiled block stores this PC and returns. 
The delay-slot
+// instruction is compiled into the same block (trace_block appends it right
+// after the branch), so the block executes it before exiting with the target PC.
+
+fn emit_beq(
+    builder: &mut FunctionBuilder, gpr: &[Value; 32],
+    rs: usize, rt: usize, d: &DecodedInstr, instr_pc: u64, _likely: bool,
+) -> EmitResult {
+    let taken_pc = instr_pc.wrapping_add(4).wrapping_add(d.imm as i32 as i64 as u64);
+    let not_taken_pc = instr_pc.wrapping_add(8); // skip delay slot
+    let taken = builder.ins().iconst(types::I64, taken_pc as i64);
+    let not_taken = builder.ins().iconst(types::I64, not_taken_pc as i64);
+    let cond = builder.ins().icmp(IntCC::Equal, gpr[rs], gpr[rt]);
+    let target = builder.ins().select(cond, taken, not_taken);
+    EmitResult::Branch(target)
+}
+
+fn emit_bne(
+    builder: &mut FunctionBuilder, gpr: &[Value; 32],
+    rs: usize, rt: usize, d: &DecodedInstr, instr_pc: u64, _likely: bool,
+) -> EmitResult {
+    let taken_pc = instr_pc.wrapping_add(4).wrapping_add(d.imm as i32 as i64 as u64);
+    let not_taken_pc = instr_pc.wrapping_add(8);
+    let taken = builder.ins().iconst(types::I64, taken_pc as i64);
+    let not_taken = builder.ins().iconst(types::I64, not_taken_pc as i64);
+    let cond = builder.ins().icmp(IntCC::NotEqual, gpr[rs], gpr[rt]);
+    let target = builder.ins().select(cond, taken, not_taken);
+    EmitResult::Branch(target)
+}
+
+fn emit_blez(
+    builder: &mut FunctionBuilder, gpr: &[Value; 32],
+    rs: usize, d: &DecodedInstr, instr_pc: u64, _likely: bool,
+) -> EmitResult {
+    let taken_pc = instr_pc.wrapping_add(4).wrapping_add(d.imm as i32 as i64 as u64);
+    let not_taken_pc = instr_pc.wrapping_add(8);
+    let taken = builder.ins().iconst(types::I64, taken_pc as i64);
+    let not_taken = builder.ins().iconst(types::I64, not_taken_pc as i64);
+    let zero = builder.ins().iconst(types::I64, 0);
+    let cond = builder.ins().icmp(IntCC::SignedLessThanOrEqual, gpr[rs], zero);
+    let target = builder.ins().select(cond, taken, not_taken);
+    EmitResult::Branch(target)
+}
+
+fn emit_bgtz(
+    builder: &mut FunctionBuilder, gpr: &[Value; 32],
+    rs: usize, d: &DecodedInstr, instr_pc: u64, _likely: bool,
+) -> EmitResult {
+    let taken_pc = instr_pc.wrapping_add(4).wrapping_add(d.imm as i32 as i64 as u64);
+    let not_taken_pc = instr_pc.wrapping_add(8);
+    let taken = builder.ins().iconst(types::I64, taken_pc as i64);
+    let not_taken = builder.ins().iconst(types::I64, not_taken_pc as i64);
+    let zero = builder.ins().iconst(types::I64, 0);
+    let cond = builder.ins().icmp(IntCC::SignedGreaterThan, gpr[rs], zero);
+    let target = builder.ins().select(cond, taken, not_taken);
+    EmitResult::Branch(target)
+}
+
+fn emit_j(
+    builder: &mut FunctionBuilder, _gpr: &[Value; 32],
+    d: &DecodedInstr, instr_pc: u64,
+) -> EmitResult {
+    // Target = (PC+4)[63:28] | (target26 << 2) — but imm already has target26<<2 from decode
+    let region = instr_pc.wrapping_add(4) & 0xFFFF_FFFF_F000_0000;
+    let target_pc = region | (d.imm as u64);
+    let target = builder.ins().iconst(types::I64, target_pc as i64);
+    EmitResult::Branch(target)
+}
+
+fn emit_jal(
+    builder: &mut FunctionBuilder, gpr: &mut [Value; 32],
+    d: &DecodedInstr, instr_pc: u64,
+) -> EmitResult {
+    // JAL: $ra = PC + 8 (return address past delay slot)
+    let return_addr = instr_pc.wrapping_add(8);
+    gpr[31] = builder.ins().iconst(types::I64, return_addr as i64);
+
+    let region = instr_pc.wrapping_add(4) & 0xFFFF_FFFF_F000_0000;
+    let target_pc = region | (d.imm as u64);
+    let target = builder.ins().iconst(types::I64, target_pc as i64);
+    EmitResult::Branch(target)
+}
diff --git a/src/jit/context.rs b/src/jit/context.rs
new
file mode 100644 index 0000000..7458436 --- /dev/null +++ b/src/jit/context.rs @@ -0,0 +1,158 @@ +//! JitContext: `#[repr(C)]` bridge struct between JIT-compiled code and emulator state. +//! +//! Contains the hot subset of MipsCore and MipsExecutor state that compiled blocks +//! read and write directly. Synced to/from the interpreter before and after JIT execution. + +use crate::mips_core::NanoTlbEntry; +use crate::mips_exec::MipsExecutor; +use crate::mips_tlb::Tlb; +use crate::mips_cache_v2::MipsCache; + +// Exit reason constants set by JIT code before returning to dispatch. +pub const EXIT_NORMAL: u32 = 0; +pub const EXIT_INTERPRET: u32 = 1; +pub const EXIT_EXCEPTION: u32 = 2; +pub const EXIT_INTERRUPT_CHECK: u32 = 3; +pub const EXIT_HALT: u32 = 4; + +#[repr(C)] +pub struct JitContext { + // General purpose registers + pub gpr: [u64; 32], + + // Special registers + pub pc: u64, + pub hi: u64, + pub lo: u64, + + // FPU registers + pub fpr: [u64; 32], + pub fpu_fcsr: u32, + + // CP0 state (hot subset for interrupt/exception checking) + pub cp0_status: u32, + pub cp0_cause: u32, + pub cp0_epc: u64, + pub cp0_count: u64, + pub cp0_compare: u64, + pub count_step: u64, + pub cp0_badvaddr: u64, + + // Nano-TLB (3 entries: Fetch/Read/Write) + pub nanotlb: [NanoTlbEntry; 3], + + // Delay slot state + pub in_delay_slot: bool, + pub delay_slot_target: u64, + + // Interrupt handling (cached from executor) + pub cached_pending: u64, + pub local_cycles: u64, + + // JIT dispatch state + pub exit_reason: u32, + pub block_instrs_executed: u32, + + // Type-erased pointer to MipsExecutor — used by memory helper callouts + pub executor_ptr: u64, + // Exception status from failed memory access (set by helpers) + pub exception_status: u32, + _pad0: u32, +} + +impl JitContext { + pub fn new() -> Self { + Self { + gpr: [0; 32], + pc: 0, + hi: 0, + lo: 0, + fpr: [0; 32], + fpu_fcsr: 0, + cp0_status: 0, + cp0_cause: 0, + cp0_epc: 0, + cp0_count: 0, + cp0_compare: 0, + count_step: 0, + cp0_badvaddr: 0, + nanotlb: [NanoTlbEntry::default(); 3], + in_delay_slot: false, + delay_slot_target: 0, + cached_pending: 0, + local_cycles: 0, + exit_reason: EXIT_NORMAL, + block_instrs_executed: 0, + executor_ptr: 0, + exception_status: 0, + _pad0: 0, + } + } + + /// Byte offset of `gpr[i]` from the start of JitContext. + pub fn gpr_offset(i: usize) -> i32 { + (std::mem::offset_of!(JitContext, gpr) + i * 8) as i32 + } + + pub fn hi_offset() -> i32 { std::mem::offset_of!(JitContext, hi) as i32 } + pub fn lo_offset() -> i32 { std::mem::offset_of!(JitContext, lo) as i32 } + pub fn pc_offset() -> i32 { std::mem::offset_of!(JitContext, pc) as i32 } + pub fn exit_reason_offset() -> i32 { std::mem::offset_of!(JitContext, exit_reason) as i32 } + pub fn block_instrs_offset() -> i32 { std::mem::offset_of!(JitContext, block_instrs_executed) as i32 } + pub fn executor_ptr_offset() -> i32 { std::mem::offset_of!(JitContext, executor_ptr) as i32 } + pub fn exception_status_offset() -> i32 { std::mem::offset_of!(JitContext, exception_status) as i32 } + + /// Copy emulator state into JitContext. 
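+    ///
+    /// Together with `sync_to_executor` this brackets every block run; a sketch
+    /// of the round trip as the dispatch loop performs it:
+    ///
+    /// ```ignore
+    /// ctx.sync_from_executor(exec);   // registers into the bridge struct
+    /// entry(&mut ctx);                // run the compiled block
+    /// ctx.sync_to_executor(exec);     // write back GPRs/hi/lo/pc only
+    /// ```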
+    pub fn sync_from_executor<T: Tlb, C: MipsCache>(
+        &mut self,
+        exec: &MipsExecutor<T, C>,
+    ) {
+        self.gpr = exec.core.gpr;
+        self.pc = exec.core.pc;
+        self.hi = exec.core.hi;
+        self.lo = exec.core.lo;
+        self.fpr = exec.core.fpr;
+        self.fpu_fcsr = exec.core.fpu_fcsr;
+        self.cp0_status = exec.core.cp0_status;
+        self.cp0_cause = exec.core.cp0_cause;
+        self.cp0_epc = exec.core.cp0_epc;
+        self.cp0_count = exec.core.cp0_count;
+        self.cp0_compare = exec.core.cp0_compare;
+        self.count_step = exec.core.count_step;
+        self.cp0_badvaddr = exec.core.cp0_badvaddr;
+        self.nanotlb = exec.core.nanotlb;
+        self.in_delay_slot = exec.in_delay_slot;
+        self.delay_slot_target = exec.delay_slot_target;
+        self.cached_pending = exec.cached_pending;
+        self.local_cycles = exec.local_cycles;
+    }
+
+    /// Copy JitContext state back to the emulator.
+    ///
+    /// ONLY writes back fields that compiled blocks actually modify (GPRs, hi, lo, PC).
+    /// Fields managed by the interpreter or helpers (cp0_*, nanotlb, fpr) are NOT
+    /// written back — they're updated directly on the executor by helpers/interpreter.
+    pub fn sync_to_executor<T: Tlb, C: MipsCache>(
+        &self,
+        exec: &mut MipsExecutor<T, C>,
+    ) {
+        // These are modified by compiled code (stored in the block epilogue)
+        exec.core.gpr = self.gpr;
+        exec.core.pc = self.pc;
+        exec.core.hi = self.hi;
+        exec.core.lo = self.lo;
+
+        // Compiled blocks handle delay slots internally (the branch emitter
+        // computes the target, emits the delay slot, and sets the exit PC).
+        // Clear the interpreter's delay slot state so subsequent exec.step()
+        // calls don't jump to a stale target.
+        exec.in_delay_slot = false;
+        exec.delay_slot_target = 0;
+
+        // DO NOT write back: cp0_status, cp0_cause, cp0_epc, cp0_badvaddr,
+        // cp0_count, cp0_compare, count_step, nanotlb, fpr, fpu_fcsr —
+        // these are managed by the interpreter and memory helpers directly
+        // on the executor. Writing them back would clobber changes made by
+        // exception handlers and TLB fill operations.
+    }
+}
diff --git a/src/jit/dispatch.rs b/src/jit/dispatch.rs
new file mode 100644
index 0000000..65fd205
--- /dev/null
+++ b/src/jit/dispatch.rs
@@ -0,0 +1,689 @@
+//! Adaptive JIT dispatch loop with tiered compilation and speculative execution.
+//!
+//! Interpreter-first architecture: the interpreter runs in short bursts, with
+//! cache probes after each burst. One JIT block per probe, then back to interpreter.
+//! Blocks start at Tier 0 (ALU only) and earn promotion through stable execution.
+//!
+//! The probe interval adapts dynamically: frequent cache hits → shorter interval
+//! (probe more often), frequent misses → longer interval (less overhead).
+
+use std::sync::atomic::{AtomicBool, Ordering};
+
+use crate::mips_exec::{MipsExecutor, DecodedInstr, EXEC_BREAKPOINT, decode_into};
+use crate::mips_tlb::{Tlb, AccessType};
+use crate::mips_cache_v2::MipsCache;
+
+use super::cache::{BlockTier, CodeCache, TierConfig};
+use super::compiler::BlockCompiler;
+use super::context::{JitContext, EXIT_NORMAL, EXIT_EXCEPTION};
+use super::helpers::HelperPtrs;
+use super::profile::{self, ProfileEntry};
+use super::snapshot::CpuRollbackSnapshot;
+
+const MAX_BLOCK_LEN: usize = 64;
+
+/// How many interpreter steps in one outer batch (controls flush_cycles frequency).
+const BATCH_SIZE: u32 = 10000;
+
+/// Adaptive probe interval controller.
+///
+/// Asymmetric adjustment: hits pull the interval down aggressively (we want to
+/// exploit hot code), misses push it up gently (don't overreact to cold regions).
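+/// In the code below both per-event steps are ~3% (interval × 31/32 per hit,
+/// × 33/32 per miss); the asymmetry lives in the EWMA alphas (≈1/8 for hits
+/// vs ≈1/32 for misses).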
+/// Cache size provides a floor — more compiled blocks means shorter intervals
+/// even when the instantaneous hit rate is low.
+struct ProbeController {
+    /// Current probe interval (interpreter steps between cache probes).
+    interval: u32,
+    /// Minimum allowed interval.
+    min_interval: u32,
+    /// Maximum allowed interval.
+    max_interval: u32,
+    /// Exponentially weighted hit rate (0..256 fixed-point, 256 = 100%).
+    ewma_hit_rate: u32,
+    /// Number of compiled blocks (updated externally).
+    cache_size: u32,
+    /// Simple LFSR for jitter (avoids lock-step with OS timers).
+    lfsr: u32,
+}
+
+impl ProbeController {
+    fn new() -> Self {
+        let base = std::env::var("IRIS_JIT_PROBE").ok()
+            .and_then(|v| v.parse().ok()).unwrap_or(200u32);
+        let min = std::env::var("IRIS_JIT_PROBE_MIN").ok()
+            .and_then(|v| v.parse().ok()).unwrap_or(100u32);
+        let max = std::env::var("IRIS_JIT_PROBE_MAX").ok()
+            .and_then(|v| v.parse().ok()).unwrap_or(2000u32);
+        Self {
+            interval: base.clamp(min, max),
+            min_interval: min,
+            max_interval: max,
+            ewma_hit_rate: 0,
+            cache_size: 0,
+            lfsr: 0xACE1u32,
+        }
+    }
+
+    /// Record a cache hit — aggressively pull interval down.
+    fn record_hit(&mut self) {
+        // EWMA with alpha ~1/8 for hits (fast response to hot code)
+        self.ewma_hit_rate = self.ewma_hit_rate - (self.ewma_hit_rate / 8) + 32; // +32 = 1/8 of 256
+
+        // Each hit immediately nudges interval down by ~3%
+        self.interval = (self.interval * 31 / 32).max(self.min_interval);
+    }
+
+    /// Record a cache miss — gently push interval up.
+    fn record_miss(&mut self) {
+        // EWMA with alpha ~1/32 for misses (slow response, don't overreact)
+        self.ewma_hit_rate = self.ewma_hit_rate.saturating_sub(self.ewma_hit_rate / 32);
+
+        // Each miss nudges interval up by ~3%, capped at max_interval
+        self.interval = (self.interval * 33 / 32).min(self.max_interval);
+    }
+
+    /// Update cache size — provides an interval floor.
+    fn set_cache_size(&mut self, size: u32) {
+        self.cache_size = size;
+    }
+
+    /// Get current interval with jitter, incorporating cache size pressure.
+    fn next_interval(&mut self) -> u32 {
+        // Cache size pressure: more blocks compiled → gently push interval down.
+        // Scales with sqrt of (blocks / 100): 400 blocks halves the interval,
+        // 10000 cuts it to a tenth, but it never drops below min_interval.
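+        // i.e. factor = 1 / sqrt(cache_size / 100); the scaled interval is then
+        // clamped back into [min_interval, max_interval]: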
+        // 100 blocks → factor 1.0 (no change), 1000 → ~0.32, 10000 → 0.1, 50000 → ~0.045
+        let cache_factor = if self.cache_size > 100 {
+            1.0f32 / (self.cache_size as f32 / 100.0).sqrt().max(1.0)
+        } else {
+            1.0
+        };
+        let cache_adjusted = (self.interval as f32 * cache_factor) as u32;
+        let effective = cache_adjusted.clamp(self.min_interval, self.max_interval);
+
+        // Galois LFSR for cheap pseudo-randomness
+        let bit = self.lfsr & 1;
+        self.lfsr >>= 1;
+        if bit != 0 { self.lfsr ^= 0xB400; }
+
+        // Jitter: ~0.81x to ~1.14x using 3 bits of LFSR
+        let jitter_bits = (self.lfsr & 0x7) as u32; // 0-7
+        let jittered = effective * (17 + jitter_bits) / 21; // range ~0.81x to ~1.14x
+        jittered.clamp(self.min_interval, self.max_interval)
+    }
+}
+
+pub fn run_jit_dispatch<T: Tlb, C: MipsCache>(
+    exec: &mut MipsExecutor<T, C>,
+    running: &AtomicBool,
+) {
+    let jit_enabled = std::env::var("IRIS_JIT").map(|v| v == "1").unwrap_or(false);
+
+    if !jit_enabled {
+        eprintln!("JIT: interpreter-only mode (set IRIS_JIT=1 to enable compilation)");
+        interpreter_loop(exec, running);
+        return;
+    }
+
+    let exec_ptr: *mut MipsExecutor<T, C> = exec as *mut _;
+
+    // IRIS_JIT_MAX_TIER: cap the highest tier blocks can reach (0=Alu, 1=Loads, 2=Full)
+    let max_tier = match std::env::var("IRIS_JIT_MAX_TIER").ok().and_then(|v| v.parse::<u32>().ok()) {
+        Some(0) => BlockTier::Alu,
+        Some(1) => BlockTier::Loads,
+        _ => BlockTier::Full,
+    };
+    // IRIS_JIT_VERIFY=1: after each JIT block, re-run via interpreter and compare
+    let verify_mode = std::env::var("IRIS_JIT_VERIFY").map(|v| v == "1").unwrap_or(false);
+    let tier_cfg = TierConfig::from_env();
+    let mut probe = ProbeController::new();
+    eprintln!("JIT: adaptive mode (max_tier={:?}, verify={}, probe={} [{}-{}], stable={}, promote={}, demote={})",
+        max_tier, verify_mode, probe.interval, probe.min_interval, probe.max_interval,
+        tier_cfg.stable, tier_cfg.promote, tier_cfg.demote);
+    let helpers = HelperPtrs::new::<T, C>();
+    let mut compiler = BlockCompiler::new(&helpers);
+    let mut cache = CodeCache::new();
+    let mut ctx = JitContext::new();
+    ctx.executor_ptr = exec_ptr as u64;
+
+    let mut total_jit_instrs: u64 = 0;
+    let mut total_interp_steps: u64 = 0;
+    let mut blocks_compiled: u64 = 0;
+    let mut promotions: u64 = 0;
+    let mut demotions: u64 = 0;
+    let mut rollbacks: u64 = 0;
+
+    // Load saved profile and eagerly compile hot blocks
+    {
+        let exec = unsafe { &mut *exec_ptr };
+        let profile_entries = profile::load_profile();
+        let mut profile_compiled = 0u64;
+        for entry in &profile_entries {
+            let tier = if entry.tier > max_tier { max_tier } else { entry.tier };
+            if tier == BlockTier::Alu {
+                continue;
+            }
+            let instrs = trace_block(exec, entry.virt_pc, tier);
+            if !instrs.is_empty() {
+                if let Some(mut block) = compiler.compile_block(&instrs, entry.virt_pc, tier) {
+                    block.phys_addr = entry.phys_pc;
+                    cache.insert(entry.phys_pc, block);
+                    blocks_compiled += 1;
+                    profile_compiled += 1;
+                }
+            }
+        }
+        if profile_compiled > 0 {
+            eprintln!("JIT profile: pre-compiled {} blocks from profile", profile_compiled);
+        }
+    }
+
+    while running.load(Ordering::Relaxed) {
+        let mut steps_in_batch: u32 = 0;
+
+        while steps_in_batch < BATCH_SIZE {
+            let burst = probe.next_interval();
+
+            // Interpreter burst
+            {
+                let exec = unsafe { &mut *exec_ptr };
+                #[cfg(feature = "lightning")]
+                for _ in 0..burst {
+                    exec.step();
+                }
+                #[cfg(not(feature = "lightning"))]
+                for _ in 0..burst {
+                    let status = exec.step();
+                    if status == EXEC_BREAKPOINT {
+                        running.store(false, Ordering::SeqCst);
+                        break;
+                    }
+                }
+            }
+            steps_in_batch += burst;
+
total_interp_steps += burst as u64; + + if !running.load(Ordering::Relaxed) { break; } + + // Probe the JIT code cache + let (pc, in_delay_slot) = { + let exec = unsafe { &*exec_ptr }; + (exec.core.pc, exec.in_delay_slot) + }; + let pc32 = pc as u32; + + let in_prom = (pc32 >= 0x9FC00000 && pc32 < 0xA0000000) || (pc32 >= 0xBFC00000); + let in_exc = pc32 >= 0x80000000 && pc32 < 0x80000400; + if in_prom || in_exc || in_delay_slot { + probe.record_miss(); + continue; + } + + let phys_pc = { + let exec = unsafe { &mut *exec_ptr }; + match translate_pc(exec, pc) { + Some(p) => p, + None => { probe.record_miss(); continue; } + } + }; + + if let Some(block) = cache.lookup(phys_pc) { + probe.record_hit(); + let block_len = block.len_mips; + let block_tier = block.tier; + let is_speculative = block.speculative; + + // Snapshot CPU if speculative OR verify mode + let snapshot = if is_speculative || verify_mode { + let exec = unsafe { &*exec_ptr }; + exec.tlb.clone_as_mips_tlb().map(|tlb| { + CpuRollbackSnapshot::capture(exec, tlb) + }) + } else { + None + }; + + // Sync and run + { + let exec = unsafe { &mut *exec_ptr }; + ctx.sync_from_executor(exec); + } + + ctx.exit_reason = 0; + let entry: extern "C" fn(*mut JitContext) = unsafe { + std::mem::transmute(block.entry) + }; + entry(&mut ctx); + + { + let exec = unsafe { &mut *exec_ptr }; + ctx.sync_to_executor(exec); + + if ctx.exit_reason == EXIT_EXCEPTION { + if let Some(snap) = &snapshot { + if is_speculative { + snap.restore(exec); + rollbacks += 1; + + if let Some(block) = cache.lookup_mut(phys_pc) { + block.hit_count += 1; + block.exception_count += 1; + block.stable_hits = 0; + + if block.exception_count >= tier_cfg.demote { + if let Some(lower) = block.tier.demote() { + demotions += 1; + eprintln!("JIT: demote {:016x} {:?}→{:?} ({}exc)", + pc, block.tier, lower, block.exception_count); + recompile_block_at_tier( + &mut compiler, &mut cache, exec, + phys_pc, pc, lower, + &mut blocks_compiled, + ); + } else { + block.speculative = false; + } + } + } + } else if verify_mode { + snap.restore(exec); + } + } + // Advance cp0_count for instructions that executed before the fault. + // ctx.pc was set to the faulting instruction by the load/store emitter. 
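+                        // e.g. (hypothetical addresses) block at 0x1000, fault at
+                        // 0x1010 → (0x1010 - 0x1000) / 4 = 4 instructions retired.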
+ let instrs_before_fault = ctx.pc.wrapping_sub(pc) / 4; + if instrs_before_fault > 0 { + let advance = exec.core.count_step.wrapping_mul(instrs_before_fault); + let prev = exec.core.cp0_count; + exec.core.cp0_count = prev.wrapping_add(advance) & 0x0000_FFFF_FFFF_FFFF; + if exec.core.cp0_compare != 0 + && prev < exec.core.cp0_compare + && exec.core.cp0_count >= exec.core.cp0_compare + { + exec.core.cp0_cause |= crate::mips_core::CAUSE_IP7; + } + exec.local_cycles += instrs_before_fault; + } + exec.step(); + total_interp_steps += 1; + steps_in_batch += 1; + continue; + } + + // Normal exit + if verify_mode { + if let Some(snap) = &snapshot { + let jit_gpr = exec.core.gpr; + let jit_pc = exec.core.pc; + let jit_hi = exec.core.hi; + let jit_lo = exec.core.lo; + + snap.restore(exec); + for _ in 0..block_len { + exec.step(); + } + + let interp_gpr = exec.core.gpr; + let interp_pc = exec.core.pc; + let interp_hi = exec.core.hi; + let interp_lo = exec.core.lo; + + let mut mismatch = false; + for i in 0..32 { + if jit_gpr[i] != interp_gpr[i] { + eprintln!("JIT VERIFY FAIL at {:016x} (tier={:?}, len={}): gpr[{}] jit={:016x} interp={:016x}", + pc, block_tier, block_len, i, jit_gpr[i], interp_gpr[i]); + mismatch = true; + } + } + if jit_pc != interp_pc { + eprintln!("JIT VERIFY FAIL at {:016x}: pc jit={:016x} interp={:016x}", + pc, jit_pc, interp_pc); + mismatch = true; + } + if jit_hi != interp_hi { + eprintln!("JIT VERIFY FAIL at {:016x}: hi jit={:016x} interp={:016x}", + pc, jit_hi, interp_hi); + mismatch = true; + } + if jit_lo != interp_lo { + eprintln!("JIT VERIFY FAIL at {:016x}: lo jit={:016x} interp={:016x}", + pc, jit_lo, interp_lo); + mismatch = true; + } + + if mismatch { + // Check if this is a timing false positive: + // interpreter took an exception (PC in exception vectors) + // while JIT didn't. This happens because the interpreter + // re-run occurs at a different wall-clock time and sees + // different external interrupt state via the atomic. + let interp_pc32 = interp_pc as u32; + let interp_in_exc = (interp_pc32 >= 0x80000000 && interp_pc32 < 0x80000400) + || interp_pc32 == 0x80000180; // general exception vector + let jit_pc32 = jit_pc as u32; + let jit_not_exc = jit_pc32 < 0x80000000 || jit_pc32 >= 0x80000400; + + if interp_in_exc && jit_not_exc { + // Timing false positive — interpreter took an interrupt + // the JIT didn't see. Don't invalidate the block. + // Use the interpreter's result (it's authoritative). + eprintln!("JIT VERIFY: timing false positive at {:016x} (interp took exception to {:016x}), keeping block", + pc, interp_pc); + } else { + // Real codegen mismatch — dump and invalidate + let instrs = trace_block(exec, pc, block_tier); + eprintln!("JIT VERIFY: block at {:016x} ({} instrs):", pc, instrs.len()); + for (idx, (raw, d)) in instrs.iter().enumerate() { + let ipc = pc.wrapping_add(idx as u64 * 4); + eprintln!(" {:016x}: {:08x} op={} rs={} rt={} rd={} funct={} imm={:04x}", + ipc, raw, d.op, d.rs, d.rt, d.rd, d.funct, d.imm as u16); + } + cache.invalidate_range(phys_pc, phys_pc + 4); + } + total_jit_instrs += block_len as u64; + continue; + } + } + } + + // Advance cp0_count and check interrupts for the N instructions + // the JIT block executed. The interpreter's step() does this per- + // instruction; we must do it in bulk here or timing drifts and + // the kernel panics from missed timer interrupts. 
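+            // Compare-match sketch (hypothetical values): prev = 0x0FF0,
+            // compare = 0x1000, advance = 0x20 → prev < compare ≤ new count,
+            // so IP7 fires exactly once for the crossing, as below.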
+ { + let n = block_len as u64; + // Advance cp0_count by block_len * count_step + let count_advance = exec.core.count_step.wrapping_mul(n); + let prev = exec.core.cp0_count; + exec.core.cp0_count = prev.wrapping_add(count_advance) & 0x0000_FFFF_FFFF_FFFF; + if exec.core.cp0_compare != 0 + && prev < exec.core.cp0_compare + && exec.core.cp0_count >= exec.core.cp0_compare + { + exec.core.cp0_cause |= crate::mips_core::CAUSE_IP7; + } + // Credit local_cycles so the stats display shows correct MHz + exec.local_cycles += n; + + // Check for pending interrupts — JIT blocks don't check per- + // instruction like the interpreter does. If an external interrupt + // arrived during the block, service it now via one interpreter step. + let pending = exec.core.interrupts.load(Ordering::Relaxed); + if (pending | exec.core.cp0_cause as u64) != 0 { + // Merge external IP bits (IP2-IP6) into Cause (same as step() does) + use crate::mips_core::{CAUSE_IP2, CAUSE_IP3, CAUSE_IP4, CAUSE_IP5, CAUSE_IP6}; + let ext_mask = CAUSE_IP2 | CAUSE_IP3 | CAUSE_IP4 | CAUSE_IP5 | CAUSE_IP6; + exec.core.cp0_cause = (exec.core.cp0_cause & !ext_mask) + | (pending as u32 & ext_mask); + if exec.core.interrupts_enabled() { + let ip = exec.core.cp0_cause & crate::mips_core::CAUSE_IP_MASK; + let im = exec.core.cp0_status & crate::mips_core::STATUS_IM_MASK; + if (ip & im) != 0 { + // Pending unmasked interrupt — let the interpreter handle it + exec.step(); + total_interp_steps += 1; + steps_in_batch += 1; + } + } + } + } + + // Update stats and check for promotion + if let Some(block) = cache.lookup_mut(phys_pc) { + block.hit_count += 1; + block.stable_hits += 1; + block.exception_count = 0; + + if block.speculative && block.stable_hits >= tier_cfg.stable { + block.speculative = false; + } + + if !block.speculative && block.stable_hits >= tier_cfg.promote { + if let Some(next) = block.tier.promote().filter(|t| *t <= max_tier) { + promotions += 1; + eprintln!("JIT: promote {:016x} {:?}→{:?} ({}hits)", + pc, block.tier, next, block.hit_count); + recompile_block_at_tier( + &mut compiler, &mut cache, exec, + phys_pc, pc, next, + &mut blocks_compiled, + ); + } + } + } + + total_jit_instrs += block_len as u64; + steps_in_batch += block_len; + } + } else { + probe.record_miss(); + // Cache miss — compile at Alu tier + let exec = unsafe { &mut *exec_ptr }; + let instrs = trace_block(exec, pc, BlockTier::Alu); + if !instrs.is_empty() { + if let Some(mut block) = compiler.compile_block(&instrs, pc, BlockTier::Alu) { + block.phys_addr = phys_pc; + cache.insert(phys_pc, block); + blocks_compiled += 1; + probe.set_cache_size(cache.len() as u32); + if blocks_compiled <= 10 || blocks_compiled % 500 == 0 { + eprintln!("JIT: compiled #{} at {:016x} ({} instrs, tier=Alu, cache={})", + blocks_compiled, pc, instrs.len(), cache.len()); + } + } + } + } + } + + { + let exec = unsafe { &mut *exec_ptr }; + exec.flush_cycles(); + } + + let total = total_interp_steps + total_jit_instrs; + if total % 10000000 < BATCH_SIZE as u64 { + let exec = unsafe { &*exec_ptr }; + let jit_pct = if total > 0 { total_jit_instrs as f64 / total as f64 * 100.0 } else { 0.0 }; + let effective_probe = { + let cf = if probe.cache_size > 100 { + 1.0f32 / (probe.cache_size as f32 / 100.0).sqrt().max(1.0) + } else { 1.0 }; + ((probe.interval as f32 * cf) as u32).clamp(probe.min_interval, probe.max_interval) + }; + eprintln!("JIT: {} total ({:.1}% jit), {} blocks, {}↑ {}↓ {}⟲, probe={}(eff {}), pc={:016x}", + total, jit_pct, blocks_compiled, promotions, demotions, rollbacks, + 
probe.interval, effective_probe, exec.core.pc);
+        }
+    }
+
+    {
+        let exec = unsafe { &mut *exec_ptr };
+        exec.flush_cycles();
+    }
+    let total = total_interp_steps + total_jit_instrs;
+    let jit_pct = if total > 0 { total_jit_instrs as f64 / total as f64 * 100.0 } else { 0.0 };
+    eprintln!("JIT: shutdown. {} blocks, {} jit / {} interp / {} total ({:.1}% jit), {}↑ {}↓ {}⟲, final_probe={}",
+        blocks_compiled, total_jit_instrs, total_interp_steps, total,
+        jit_pct, promotions, demotions, rollbacks, probe.interval);
+
+    // Save profile: all blocks above Alu tier
+    let profile_entries: Vec<ProfileEntry> = cache.iter()
+        .filter(|(_, block)| block.tier > BlockTier::Alu)
+        .map(|(&phys_pc, block)| ProfileEntry {
+            phys_pc,
+            virt_pc: block.virt_addr,
+            tier: block.tier,
+        })
+        .collect();
+    if !profile_entries.is_empty() {
+        if let Err(e) = profile::save_profile(&profile_entries) {
+            eprintln!("JIT profile: save failed: {}", e);
+        }
+    }
+}
+
+/// Recompile a block at a different tier, replacing the existing cache entry.
+fn recompile_block_at_tier<T: Tlb, C: MipsCache>(
+    compiler: &mut BlockCompiler,
+    cache: &mut CodeCache,
+    exec: &mut MipsExecutor<T, C>,
+    phys_pc: u64,
+    virt_pc: u64,
+    tier: BlockTier,
+    blocks_compiled: &mut u64,
+) {
+    let instrs = trace_block(exec, virt_pc, tier);
+    if !instrs.is_empty() {
+        if let Some(mut block) = compiler.compile_block(&instrs, virt_pc, tier) {
+            block.phys_addr = phys_pc;
+            cache.replace(phys_pc, block);
+            *blocks_compiled += 1;
+        }
+    }
+}
+
+fn interpreter_loop<T: Tlb, C: MipsCache>(
+    exec: &mut MipsExecutor<T, C>,
+    running: &AtomicBool,
+) {
+    while running.load(Ordering::Relaxed) {
+        #[cfg(feature = "lightning")]
+        for _ in 0..1000 {
+            exec.step(); exec.step(); exec.step(); exec.step(); exec.step();
+            exec.step(); exec.step(); exec.step(); exec.step(); exec.step();
+        }
+        #[cfg(not(feature = "lightning"))]
+        for _ in 0..1000 {
+            let status = exec.step();
+            if status == EXEC_BREAKPOINT {
+                running.store(false, Ordering::SeqCst);
+                break;
+            }
+        }
+        exec.flush_cycles();
+    }
+}
+
+fn translate_pc<T: Tlb, C: MipsCache>(
+    exec: &mut MipsExecutor<T, C>,
+    virt_pc: u64,
+) -> Option<u64> {
+    let result = (exec.translate_fn)(exec, virt_pc, AccessType::Fetch);
+    if result.is_exception() { None } else { Some(result.phys as u64) }
+}
+
+fn trace_block<T: Tlb, C: MipsCache>(
+    exec: &mut MipsExecutor<T, C>,
+    start_pc: u64,
+    tier: BlockTier,
+) -> Vec<(u32, DecodedInstr)> {
+    let mut instrs = Vec::with_capacity(MAX_BLOCK_LEN);
+    let mut pc = start_pc;
+
+    for _ in 0..MAX_BLOCK_LEN {
+        let raw = match exec.debug_fetch_instr(pc) {
+            Ok(w) => w,
+            Err(_) => break,
+        };
+
+        let mut d = DecodedInstr::default();
+        d.raw = raw;
+        decode_into::(&mut d);
+
+        if !is_compilable_for_tier(&d, tier) { break; }
+
+        let is_branch = is_branch_or_jump(&d);
+        // Terminate Full-tier blocks after each store to keep blocks short.
+        // Long blocks with multiple load/store helper calls create complex CFG
+        // (ok_block/exc_block diamonds) that triggers Cranelift regalloc2 issues
+        // on x86_64, causing rare but fatal codegen corruption.
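+        // Each load/store lowers to one such diamond (see emit_load/emit_store):
+        //
+        //   call helper → brif exit_reason ──→ exc_block: return
+        //                       └───────────→ ok_block: continue
+        //
+        // Ending the block at the first store keeps that chain short.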
+ let is_store = tier == BlockTier::Full && is_compilable_store(&d); + instrs.push((raw, d)); + + if is_store { + break; + } + + if is_branch { + pc = pc.wrapping_add(4); + let mut delay_ok = false; + if let Ok(delay_raw) = exec.debug_fetch_instr(pc) { + let mut delay_d = DecodedInstr::default(); + delay_d.raw = delay_raw; + decode_into::(&mut delay_d); + // Exclude stores from delay slots: if the delay slot faults, + // the JIT exception path loses delay-slot context (sync_to clears + // in_delay_slot), so handle_exception sets wrong cp0_epc/BD bit, + // and on ERET the branch is permanently skipped → crash. + if is_compilable_for_tier(&delay_d, tier) && !is_compilable_store(&delay_d) { + instrs.push((delay_raw, delay_d)); + delay_ok = true; + } + } + if !delay_ok { instrs.pop(); } + break; + } + + pc = pc.wrapping_add(4); + } + + instrs +} + +fn is_compilable_for_tier(d: &DecodedInstr, tier: BlockTier) -> bool { + if is_compilable_alu(d) || is_branch_or_jump(d) { return true; } + match tier { + BlockTier::Alu => false, + BlockTier::Loads => is_compilable_load(d), + BlockTier::Full => is_compilable_load(d) || is_compilable_store(d), + } +} + +fn is_compilable_alu(d: &DecodedInstr) -> bool { + use crate::mips_isa::*; + match d.op as u32 { + OP_SPECIAL => matches!(d.funct as u32, + FUNCT_SLL | FUNCT_SRL | FUNCT_SRA | + FUNCT_SLLV | FUNCT_SRLV | FUNCT_SRAV | + FUNCT_MOVZ | FUNCT_MOVN | + FUNCT_MFHI | FUNCT_MTHI | FUNCT_MFLO | FUNCT_MTLO | + FUNCT_MULT | FUNCT_MULTU | FUNCT_DIV | FUNCT_DIVU | + FUNCT_DMULT | FUNCT_DMULTU | FUNCT_DDIV | FUNCT_DDIVU | + FUNCT_ADDU | FUNCT_SUBU | FUNCT_AND | FUNCT_OR | + FUNCT_XOR | FUNCT_NOR | FUNCT_SLT | FUNCT_SLTU | + FUNCT_DADDU | FUNCT_DSUBU | + FUNCT_DSLL | FUNCT_DSRL | FUNCT_DSRA | + FUNCT_DSLL32 | FUNCT_DSRL32 | FUNCT_DSRA32 | + FUNCT_DSLLV | FUNCT_DSRLV | FUNCT_DSRAV | + FUNCT_SYNC + ), + OP_ADDIU | OP_DADDIU | OP_SLTI | OP_SLTIU | + OP_ANDI | OP_ORI | OP_XORI | OP_LUI => true, + _ => false, + } +} + +fn is_compilable_load(d: &DecodedInstr) -> bool { + use crate::mips_isa::*; + matches!(d.op as u32, + OP_LB | OP_LBU | OP_LH | OP_LHU | OP_LW | OP_LWU | OP_LD + ) +} + +fn is_compilable_store(d: &DecodedInstr) -> bool { + use crate::mips_isa::*; + matches!(d.op as u32, + OP_SB | OP_SH | OP_SW | OP_SD + ) +} + +fn is_branch_or_jump(d: &DecodedInstr) -> bool { + use crate::mips_isa::*; + match d.op as u32 { + OP_BEQ | OP_BNE | OP_BLEZ | OP_BGTZ => true, + OP_J | OP_JAL => true, + OP_SPECIAL => matches!(d.funct as u32, FUNCT_JR), + _ => false, + } +} diff --git a/src/jit/helpers.rs b/src/jit/helpers.rs new file mode 100644 index 0000000..2c4f9f2 --- /dev/null +++ b/src/jit/helpers.rs @@ -0,0 +1,159 @@ +//! `extern "C"` bridge functions called by JIT-compiled code for memory access. +//! +//! CRITICAL: All pointer casts use `std::hint::black_box` to prevent LLVM from +//! tracking pointer provenance through LTO. Without this, LLVM can prove the +//! exec_ptr derives from a &mut in the dispatch loop and apply noalias +//! optimizations that cause stale reads. + +use super::context::{JitContext, EXIT_EXCEPTION}; +use crate::mips_exec::{MipsExecutor, EXEC_COMPLETE}; +use crate::mips_tlb::Tlb; +use crate::mips_cache_v2::MipsCache; + +/// Opaque cast that defeats LLVM's alias analysis and pointer provenance tracking. +/// `#[inline(never)]` ensures LLVM can't see through this to recover provenance. 
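+/// (A plain `as` cast would leave the provenance chain visible to LTO; the
+/// non-inlined `black_box` round trip is what actually severs it.)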
+#[inline(never)]
+fn opaque_exec<T: Tlb, C: MipsCache>(ptr: *mut u8) -> *mut MipsExecutor<T, C> {
+    std::hint::black_box(ptr as *mut MipsExecutor<T, C>)
+}
+
+#[inline(never)]
+fn opaque_ctx(ptr: *mut JitContext) -> *mut JitContext {
+    std::hint::black_box(ptr)
+}
+
+// ─── Read helpers ────────────────────────────────────────────────────────────
+
+pub extern "C" fn jit_read_u8<T: Tlb, C: MipsCache>(
+    ctx_ptr: *mut JitContext, exec_ptr: *mut u8, virt_addr: u64,
+) -> u64 {
+    let exec = unsafe { &mut *opaque_exec::<T, C>(exec_ptr) };
+    let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) };
+    match exec.read_data::<1>(virt_addr) {
+        Ok(value) => value,
+        Err(status) => { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; 0 }
+    }
+}
+
+pub extern "C" fn jit_read_u16<T: Tlb, C: MipsCache>(
+    ctx_ptr: *mut JitContext, exec_ptr: *mut u8, virt_addr: u64,
+) -> u64 {
+    let exec = unsafe { &mut *opaque_exec::<T, C>(exec_ptr) };
+    let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) };
+    match exec.read_data::<2>(virt_addr) {
+        Ok(value) => value,
+        Err(status) => { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; 0 }
+    }
+}
+
+pub extern "C" fn jit_read_u32<T: Tlb, C: MipsCache>(
+    ctx_ptr: *mut JitContext, exec_ptr: *mut u8, virt_addr: u64,
+) -> u64 {
+    let exec = unsafe { &mut *opaque_exec::<T, C>(exec_ptr) };
+    let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) };
+    match exec.read_data::<4>(virt_addr) {
+        Ok(value) => value,
+        Err(status) => { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; 0 }
+    }
+}
+
+pub extern "C" fn jit_read_u64<T: Tlb, C: MipsCache>(
+    ctx_ptr: *mut JitContext, exec_ptr: *mut u8, virt_addr: u64,
+) -> u64 {
+    let exec = unsafe { &mut *opaque_exec::<T, C>(exec_ptr) };
+    let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) };
+    match exec.read_data::<8>(virt_addr) {
+        Ok(value) => value,
+        Err(status) => { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; 0 }
+    }
+}
+
+// ─── Write helpers ───────────────────────────────────────────────────────────
+
+pub extern "C" fn jit_write_u8<T: Tlb, C: MipsCache>(
+    ctx_ptr: *mut JitContext, exec_ptr: *mut u8, virt_addr: u64, value: u64,
+) -> u64 {
+    let exec = unsafe { &mut *opaque_exec::<T, C>(exec_ptr) };
+    let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) };
+    let status = exec.write_data::<1>(virt_addr, value);
+    if status != EXEC_COMPLETE { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; }
+    0
+}
+
+pub extern "C" fn jit_write_u16<T: Tlb, C: MipsCache>(
+    ctx_ptr: *mut JitContext, exec_ptr: *mut u8, virt_addr: u64, value: u64,
+) -> u64 {
+    let exec = unsafe { &mut *opaque_exec::<T, C>(exec_ptr) };
+    let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) };
+    let status = exec.write_data::<2>(virt_addr, value);
+    if status != EXEC_COMPLETE { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; }
+    0
+}
+
+pub extern "C" fn jit_write_u32<T: Tlb, C: MipsCache>(
+    ctx_ptr: *mut JitContext, exec_ptr: *mut u8, virt_addr: u64, value: u64,
+) -> u64 {
+    let exec = unsafe { &mut *opaque_exec::<T, C>(exec_ptr) };
+    let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) };
+    let status = exec.write_data::<4>(virt_addr, value);
+    if status != EXEC_COMPLETE { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; }
+    0
+}
+
+pub extern "C" fn jit_write_u64<T: Tlb, C: MipsCache>(
+    ctx_ptr: *mut JitContext, exec_ptr: *mut u8, virt_addr: u64, value: u64,
+) -> u64 {
+    let exec = unsafe { &mut *opaque_exec::<T, C>(exec_ptr) };
+    let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) };
+    let status = exec.write_data::<8>(virt_addr, value);
+    if status != EXEC_COMPLETE { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; }
+    0
+}
+
+// ─── Interpreter fallback ────────────────────────────────────────────────────
+
+/// Execute one interpreter step for a delay slot that can't be compiled at
+/// the current JIT tier. The caller (JIT block) has already flushed modified
+/// GPRs and set ctx.pc to the delay slot PC. This function:
+/// 1. Syncs JitContext → executor (so interpreter sees JIT's register state)
+/// 2. Calls exec.step() (executes the instruction + full bookkeeping)
+/// 3. Syncs executor → JitContext (so JIT sees the result, e.g. loaded value)
+pub extern "C" fn jit_interp_one_step<T: Tlb, C: MipsCache>(
+    ctx_ptr: *mut JitContext, exec_ptr: *mut u8,
+) -> u64 {
+    let exec = unsafe { &mut *opaque_exec::<T, C>(exec_ptr) };
+    let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) };
+    ctx.sync_to_executor(exec);
+    exec.step();
+    ctx.sync_from_executor(exec);
+    0
+}
+
+/// Collection of monomorphized helper function pointers.
+pub struct HelperPtrs {
+    pub read_u8: *const u8,
+    pub read_u16: *const u8,
+    pub read_u32: *const u8,
+    pub read_u64: *const u8,
+    pub write_u8: *const u8,
+    pub write_u16: *const u8,
+    pub write_u32: *const u8,
+    pub write_u64: *const u8,
+    pub interp_step: *const u8,
+}
+
+impl HelperPtrs {
+    pub fn new<T: Tlb, C: MipsCache>() -> Self {
+        Self {
+            read_u8: jit_read_u8::<T, C> as *const u8,
+            read_u16: jit_read_u16::<T, C> as *const u8,
+            read_u32: jit_read_u32::<T, C> as *const u8,
+            read_u64: jit_read_u64::<T, C> as *const u8,
+            write_u8: jit_write_u8::<T, C> as *const u8,
+            write_u16: jit_write_u16::<T, C> as *const u8,
+            write_u32: jit_write_u32::<T, C> as *const u8,
+            write_u64: jit_write_u64::<T, C> as *const u8,
+            interp_step: jit_interp_one_step::<T, C> as *const u8,
+        }
+    }
+}
diff --git a/src/jit/mod.rs b/src/jit/mod.rs
new file mode 100644
index 0000000..c98854d
--- /dev/null
+++ b/src/jit/mod.rs
@@ -0,0 +1,17 @@
+//! Cranelift-based JIT compiler for MIPS R4400.
+//!
+//! Feature-gated under `#[cfg(feature = "jit")]`.
+//! Phase 1: dispatch infrastructure with full interpreter fallback.
+
+pub mod context;
+pub mod cache;
+pub mod compiler;
+pub mod dispatch;
+pub mod helpers;
+pub mod profile;
+pub mod snapshot;
+
+pub use context::JitContext;
+pub use cache::{CodeCache, CompiledBlock};
+pub use snapshot::CpuRollbackSnapshot;
+pub use compiler::BlockCompiler;
diff --git a/src/jit/profile.rs b/src/jit/profile.rs
new file mode 100644
index 0000000..12a2650
--- /dev/null
+++ b/src/jit/profile.rs
@@ -0,0 +1,119 @@
+//! JIT profile cache: persists hot block metadata across emulator runs.
+//!
+//! On shutdown, saves (phys_pc, virt_pc, tier) tuples for all blocks above Alu tier.
+//! On startup, loads the profile and eagerly compiles those blocks at their saved tier
+//! (still speculative until they prove stable again). Eliminates warmup time.
+
+use std::fs;
+use std::io::{self, Read, Write, BufReader, BufWriter};
+use std::path::PathBuf;
+
+use super::cache::BlockTier;
+
+/// One entry in the profile: a block that reached a tier worth persisting.
+#[derive(Debug, Clone)]
+pub struct ProfileEntry {
+    pub phys_pc: u64,
+    pub virt_pc: u64,
+    pub tier: BlockTier,
+}
+
+const PROFILE_MAGIC: &[u8; 4] = b"IRJP"; // IRIS JIT Profile
+const PROFILE_VERSION: u8 = 1;
+
+/// Default profile path: ~/.iris/jit-profile.bin
+fn default_profile_path() -> PathBuf {
+    if let Some(home) = std::env::var_os("HOME") {
+        PathBuf::from(home).join(".iris").join("jit-profile.bin")
+    } else {
+        PathBuf::from("jit-profile.bin")
+    }
+}
+
+/// Get the profile path, respecting IRIS_JIT_PROFILE env var override.
+pub fn profile_path() -> PathBuf {
+    match std::env::var_os("IRIS_JIT_PROFILE") {
+        Some(p) => PathBuf::from(p),
+        None => default_profile_path(),
+    }
+}
+
+/// Load profile entries from disk. Returns empty vec on any error.
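+/// The on-disk format (mirroring `save_profile` below): the 4-byte magic
+/// "IRJP", a 1-byte version, a u32 LE entry count, then fixed 17-byte records
+/// of (phys_pc: u64 LE, virt_pc: u64 LE, tier: u8).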
+pub fn load_profile() -> Vec<ProfileEntry> {
+    let path = profile_path();
+    let file = match fs::File::open(&path) {
+        Ok(f) => f,
+        Err(_) => return Vec::new(),
+    };
+    let mut reader = BufReader::new(file);
+
+    let mut magic = [0u8; 4];
+    if reader.read_exact(&mut magic).is_err() || &magic != PROFILE_MAGIC {
+        eprintln!("JIT profile: invalid magic in {:?}, ignoring", path);
+        return Vec::new();
+    }
+
+    let mut version = [0u8; 1];
+    if reader.read_exact(&mut version).is_err() || version[0] != PROFILE_VERSION {
+        eprintln!("JIT profile: version mismatch in {:?}, ignoring", path);
+        return Vec::new();
+    }
+
+    let mut count_buf = [0u8; 4];
+    if reader.read_exact(&mut count_buf).is_err() {
+        return Vec::new();
+    }
+    let count = u32::from_le_bytes(count_buf) as usize;
+
+    let mut entries = Vec::with_capacity(count);
+    for _ in 0..count {
+        let mut buf = [0u8; 17]; // 8 + 8 + 1
+        if reader.read_exact(&mut buf).is_err() {
+            break;
+        }
+        let phys_pc = u64::from_le_bytes(buf[0..8].try_into().unwrap());
+        let virt_pc = u64::from_le_bytes(buf[8..16].try_into().unwrap());
+        let tier = match buf[16] {
+            0 => BlockTier::Alu,
+            1 => BlockTier::Loads,
+            2 => BlockTier::Full,
+            _ => continue,
+        };
+        entries.push(ProfileEntry { phys_pc, virt_pc, tier });
+    }
+
+    eprintln!("JIT profile: loaded {} entries from {:?}", entries.len(), path);
+    entries
+}
+
+/// Save profile entries to disk.
+pub fn save_profile(entries: &[ProfileEntry]) -> io::Result<()> {
+    let path = profile_path();
+
+    // Ensure parent directory exists
+    if let Some(parent) = path.parent() {
+        fs::create_dir_all(parent)?;
+    }
+
+    let file = fs::File::create(&path)?;
+    let mut writer = BufWriter::new(file);
+
+    writer.write_all(PROFILE_MAGIC)?;
+    writer.write_all(&[PROFILE_VERSION])?;
+    writer.write_all(&(entries.len() as u32).to_le_bytes())?;
+
+    for entry in entries {
+        writer.write_all(&entry.phys_pc.to_le_bytes())?;
+        writer.write_all(&entry.virt_pc.to_le_bytes())?;
+        let tier_byte = match entry.tier {
+            BlockTier::Alu => 0u8,
+            BlockTier::Loads => 1u8,
+            BlockTier::Full => 2u8,
+        };
+        writer.write_all(&[tier_byte])?;
+    }
+
+    writer.flush()?;
+    eprintln!("JIT profile: saved {} entries to {:?}", entries.len(), path);
+    Ok(())
+}
diff --git a/src/jit/snapshot.rs b/src/jit/snapshot.rs
new file mode 100644
index 0000000..03af56c
--- /dev/null
+++ b/src/jit/snapshot.rs
@@ -0,0 +1,101 @@
+//! Fast CPU state snapshot for JIT speculative-execution rollback.
+
+use crate::mips_core::NanoTlbEntry;
+use crate::mips_tlb::{MipsTlb, Tlb};
+use crate::mips_exec::MipsExecutor;
+use crate::mips_cache_v2::MipsCache;
+
+/// Complete CPU snapshot for JIT speculative-execution rollback.
+/// ~2.3 KB. Only allocated for speculative blocks; zero overhead for trusted blocks.
+#[derive(Clone)]
+pub struct CpuRollbackSnapshot {
+    pub gpr: [u64; 32],
+    pub pc: u64,
+    pub hi: u64,
+    pub lo: u64,
+    // CP0 subset that JIT blocks can observe or dirty:
+    pub cp0_status: u32,
+    pub cp0_cause: u32,
+    pub cp0_epc: u64,
+    pub cp0_count: u64,
+    pub cp0_compare: u64,
+    pub cp0_badvaddr: u64,
+    pub cp0_entryhi: u64,
+    pub cp0_context: u64,
+    pub cp0_wired: u32,
+    pub cp0_entrylo0: u64,
+    pub cp0_entrylo1: u64,
+    pub cp0_pagemask: u64,
+    pub nanotlb: [NanoTlbEntry; 3],
+    pub in_delay_slot: bool,
+    pub delay_slot_target: u64,
+    pub cached_pending: u64,
+    pub tlb: MipsTlb,
+}
+
+impl CpuRollbackSnapshot {
+    /// Capture current CPU state. Call immediately before running a speculative block.
+    /// `tlb` should be obtained via `exec.tlb.clone_as_mips_tlb().unwrap()`.
+    pub fn capture<T: Tlb, C: MipsCache>(exec: &MipsExecutor<T, C>, tlb: MipsTlb) -> Self {
+        Self {
+            gpr: exec.core.gpr,
+            pc: exec.core.pc,
+            hi: exec.core.hi,
+            lo: exec.core.lo,
+            cp0_status: exec.core.cp0_status,
+            cp0_cause: exec.core.cp0_cause,
+            cp0_epc: exec.core.cp0_epc,
+            cp0_count: exec.core.cp0_count,
+            cp0_compare: exec.core.cp0_compare,
+            cp0_badvaddr: exec.core.cp0_badvaddr,
+            cp0_entryhi: exec.core.cp0_entryhi,
+            cp0_context: exec.core.cp0_context,
+            cp0_wired: exec.core.cp0_wired,
+            cp0_entrylo0: exec.core.cp0_entrylo0,
+            cp0_entrylo1: exec.core.cp0_entrylo1,
+            cp0_pagemask: exec.core.cp0_pagemask,
+            nanotlb: exec.core.nanotlb,
+            in_delay_slot: exec.in_delay_slot,
+            delay_slot_target: exec.delay_slot_target,
+            cached_pending: exec.cached_pending,
+            tlb,
+        }
+    }
+
+    /// Restore CPU state from snapshot. Call on rollback after a speculative block misbehaves.
+    pub fn restore<T: Tlb, C: MipsCache>(&self, exec: &mut MipsExecutor<T, C>) {
+        exec.core.gpr = self.gpr;
+        exec.core.pc = self.pc;
+        exec.core.hi = self.hi;
+        exec.core.lo = self.lo;
+        exec.core.cp0_status = self.cp0_status;
+        exec.core.cp0_cause = self.cp0_cause;
+        exec.core.cp0_epc = self.cp0_epc;
+        exec.core.cp0_count = self.cp0_count;
+        exec.core.cp0_compare = self.cp0_compare;
+        exec.core.cp0_badvaddr = self.cp0_badvaddr;
+        exec.core.cp0_entryhi = self.cp0_entryhi;
+        exec.core.cp0_context = self.cp0_context;
+        exec.core.cp0_wired = self.cp0_wired;
+        exec.core.cp0_entrylo0 = self.cp0_entrylo0;
+        exec.core.cp0_entrylo1 = self.cp0_entrylo1;
+        exec.core.cp0_pagemask = self.cp0_pagemask;
+        exec.core.nanotlb = self.nanotlb;
+        exec.in_delay_slot = self.in_delay_slot;
+        exec.delay_slot_target = self.delay_slot_target;
+        exec.cached_pending = self.cached_pending;
+        exec.tlb.restore_from_mips_tlb(&self.tlb);
+    }
+
+    /// Compare GPRs between snapshot and current state.
+    /// Returns bitmask of register indices that differ (bit i set = gpr[i] changed).
+    pub fn compare_gprs<T: Tlb, C: MipsCache>(&self, exec: &MipsExecutor<T, C>) -> u32 {
+        let mut mask = 0u32;
+        for i in 0..32 {
+            if self.gpr[i] != exec.core.gpr[i] {
+                mask |= 1u32 << i;
+            }
+        }
+        mask
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 7b00c3b..712efa4 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -45,4 +45,6 @@ pub mod hptimer;
 pub mod hptimer_tests;
 pub mod vga_font;
 pub mod saa7191;
-pub mod vino;
\ No newline at end of file
+pub mod vino;
+#[cfg(feature = "jit")]
+pub mod jit;
\ No newline at end of file
diff --git a/src/main.rs b/src/main.rs
index 048be6f..7d437f8 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -6,7 +6,7 @@ fn main() {
     let headless = cfg.headless;
 
     // Start unfsd before the machine so NFS is ready when IRIX boots.
-    let nfs_proc = cfg.nfs.as_ref().map(|nfs| start_unfsd(nfs));
+    let nfs_proc = cfg.nfs.as_ref().and_then(|nfs| start_unfsd(nfs));
 
     // Machine::new() allocates >1MB on the stack (Physical device_map), which overflows
     // the default stack on Windows (1MB). We spawn a thread with a larger stack to create it.
@@ -80,7 +80,7 @@ impl UnfsdProc {
     }
 }
 
-fn start_unfsd(nfs: &NfsConfig) -> UnfsdProc {
+fn start_unfsd(nfs: &NfsConfig) -> Option<UnfsdProc> {
     use std::io::Write as _;
 
     // NFS requires an absolute path in the exports file.
@@ -98,7 +98,7 @@ fn start_unfsd(nfs: &NfsConfig) -> UnfsdProc {
 
     let pid_path = std::env::temp_dir().join("iris_nfs.pid");
 
-    let child = std::process::Command::new(&nfs.unfsd)
+    let child = match std::process::Command::new(&nfs.unfsd)
         .arg("-u") // don't require root
         .arg("-p") // don't register with host portmap
         .arg("-3") // truncate fileid/cookie to 32 bits (IRIX compat)
@@ -108,7 +108,13 @@ fn start_unfsd(nfs: &NfsConfig) -> UnfsdProc {
         .arg("-e").arg(&exports_path)
         .arg("-i").arg(&pid_path)
         .spawn()
-        .unwrap_or_else(|e| panic!("failed to start unfsd '{}': {}", nfs.unfsd, e));
+    {
+        Ok(child) => child,
+        Err(e) => {
+            eprintln!("iris: warning: failed to start unfsd '{}': {} (NFS sharing disabled)", nfs.unfsd, e);
+            return None;
+        }
+    };
 
     eprintln!("iris: unfsd started (pid {}) nfs=127.0.0.1:{} mountd=127.0.0.1:{} dir={}",
         child.id(), nfs.nfs_host_port, nfs.mountd_host_port, abs_dir.display());
@@ -120,8 +126,8 @@ fn start_unfsd(nfs: &NfsConfig) -> UnfsdProc {
     { let mut c = child; let _ = c.wait(); }
 
     #[cfg(windows)]
-    return UnfsdProc { child };
+    return Some(UnfsdProc { child });
 
     #[cfg(not(windows))]
-    return UnfsdProc { pid_path };
+    return Some(UnfsdProc { pid_path });
 }
diff --git a/src/mips_exec.rs b/src/mips_exec.rs
index 3ff9ce1..58f584f 100644
--- a/src/mips_exec.rs
+++ b/src/mips_exec.rs
@@ -543,7 +543,7 @@ pub struct MipsExecutor<T: Tlb, C: MipsCache> {
     pub sysad: Arc,
     pub tlb: T,
     pub cache: C,
-    in_delay_slot: bool,
+    pub(crate) in_delay_slot: bool,
     pub delay_slot_target: u64,
     #[cfg(feature = "developer")]
     undo_buffer: UndoBuffer,
@@ -584,9 +584,9 @@ pub struct MipsExecutor<T: Tlb, C: MipsCache> {
     pub fpr_write_w: fn(&mut MipsCore, u32, u32),
     /// Local cycle counter — flushed to the shared atomic periodically to avoid
     /// a locked bus op on every instruction.
-    local_cycles: u64,
+    pub(crate) local_cycles: u64,
     /// Cached external interrupt word — reloaded every 16 instructions.
-    cached_pending: u64,
+    pub(crate) cached_pending: u64,
 }
 
 // ---- translate_fn slow-path wrappers (one per privilege × addressing-mode combination) ------
@@ -1523,7 +1523,7 @@ For R4000SC/MC CPUs:
     /// Production data read (with breakpoints, updates CP0 state on exceptions).
     #[inline]
-    fn read_data<const LEN: usize>(&mut self, virt_addr: u64) -> Result<u64, ExecStatus> {
+    pub(crate) fn read_data<const LEN: usize>(&mut self, virt_addr: u64) -> Result<u64, ExecStatus> {
         self.read_data_impl::(virt_addr)
     }
@@ -1624,15 +1624,13 @@ For R4000SC/MC CPUs:
     /// Production data write (with breakpoints, undo tracking, updates CP0 state on exceptions).
     #[inline]
-    fn write_data<const LEN: usize>(&mut self, virt_addr: u64, val: u64) -> ExecStatus {
+    pub(crate) fn write_data<const LEN: usize>(&mut self, virt_addr: u64, val: u64) -> ExecStatus {
         self.write_data_impl::(virt_addr, val)
     }
 
     /// Partial masked doubleword write for SDL/SDR/SWL/SWR.
-    /// Only bytes where the corresponding mask byte is non-zero are written.
-    /// `virt_addr` must be 8-byte aligned; val/mask are in MIPS big-endian doubleword space.
     #[inline]
-    fn write_data64_masked(&mut self, virt_addr: u64, val: u64, mask: u64) -> ExecStatus {
+    pub(crate) fn write_data64_masked(&mut self, virt_addr: u64, val: u64, mask: u64) -> ExecStatus {
         self.write_data64_masked_impl::(virt_addr, val, mask)
     }
@@ -4865,10 +4863,18 @@ impl Device for MipsCpu<T, C> {
         *self.thread.lock() = Some(thread::Builder::new().name("MIPS-CPU".to_string()).spawn(move || {
             let mut guard = executor.lock();
+
+            #[cfg(feature = "jit")]
+            {
+                crate::jit::dispatch::run_jit_dispatch(&mut *guard, &running);
+                return;
+            }
+
             // --- perf sampling (comment out to disable) ---
             //let mut last_cycles: u64 = guard.core.cycles.load(Ordering::Relaxed);
             //let mut last_time = std::time::Instant::now();
             // --- end perf sampling ---
+            #[allow(unreachable_code)]
             while running.load(Ordering::Relaxed) {
                 #[cfg(feature = "lightning")]
                 for _ in 0..1000 {
diff --git a/src/mips_tlb.rs b/src/mips_tlb.rs
index a7f2653..e769ffa 100644
--- a/src/mips_tlb.rs
+++ b/src/mips_tlb.rs
@@ -191,6 +191,14 @@ pub trait Tlb {
     fn power_on(&mut self) {}
     fn save_state(&self) -> toml::Value { toml::Value::Table(Default::default()) }
     fn load_state(&mut self, _v: &toml::Value) -> Result<(), String> { Ok(()) }
+
+    /// Attempt to clone this TLB as a concrete `MipsTlb`.
+    /// Returns `None` for implementations that are not `MipsTlb` (e.g. `PassthroughTlb`).
+    fn clone_as_mips_tlb(&self) -> Option<MipsTlb> { None }
+
+    /// Restore TLB state from a `MipsTlb` snapshot (used by JIT rollback).
+    /// Default no-op for implementations that don't support rollback.
+    fn restore_from_mips_tlb(&mut self, _src: &MipsTlb) {}
 }
 
 /// Sentinel: end of MRU list.
@@ -499,6 +507,10 @@ impl Tlb for MipsTlb {
         }
         Ok(())
     }
+
+    fn clone_as_mips_tlb(&self) -> Option<MipsTlb> { Some(self.clone()) }
+
+    fn restore_from_mips_tlb(&mut self, src: &MipsTlb) { *self = src.clone(); }
 }
 
 /// Passthrough TLB implementation for testing
diff --git a/src/rex3.rs b/src/rex3.rs
index f3ea517..a322150 100644
--- a/src/rex3.rs
+++ b/src/rex3.rs
@@ -1,5 +1,5 @@
 use std::sync::Arc;
-use parking_lot::Mutex;
+use parking_lot::{Mutex, Condvar};
 use spin::Mutex as SpinMutex;
 use std::sync::atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering};
 use std::thread;
@@ -878,6 +878,9 @@ pub struct Rex3 {
     pub host_count: UnsafeCell,
     pub gfifo_producer: SpinMutex,
     pub gfifo_consumer: SpinMutex,
+    /// Condvar used to wake the processor thread when work is pushed to the GFIFO.
+    gfifo_ready: Mutex<bool>,
+    gfifo_condvar: Condvar,
     pub vc2: Mutex<Vc2>,
     pub xmap0: Mutex<Xmap9>,
@@ -1007,6 +1010,8 @@ impl Rex3 {
             host_count: UnsafeCell::new(0),
             gfifo_producer: SpinMutex::new(producer),
             gfifo_consumer: SpinMutex::new(Some(consumer)),
+            gfifo_ready: Mutex::new(false),
+            gfifo_condvar: Condvar::new(),
             vc2: Mutex::new(Vc2::new()),
             xmap0: Mutex::new(Xmap9::new()),
             xmap1: Mutex::new(Xmap9::new()),
@@ -2701,8 +2706,15 @@ impl Rex3 {
         // push and break write ordering for multi-entry sequences.
         let mut producer = self.gfifo_producer.lock();
         loop {
-            if producer.push(entry).is_ok() { return; }
-            std::hint::spin_loop(); // be little bit nice at least
+            if producer.push(entry).is_ok() {
+                *self.gfifo_ready.lock() = true;
+                self.gfifo_condvar.notify_one();
+                return;
+            }
+            // Buffer full — wake consumer to drain it, then retry.
+            *self.gfifo_ready.lock() = true;
+            self.gfifo_condvar.notify_one();
+            std::hint::spin_loop();
         }
     }
@@ -2849,9 +2861,13 @@ impl Rex3 {
                 self.gfifo_pending.fetch_sub(1, Ordering::SeqCst);
             } else {
-                // Spin-wait for more entries.
-                // A condvar would be better, but this matches the user request.
- std::hint::spin_loop(); + // Wait for the producer to push work. The condvar is notified by + // gfifo_push on every successful push and on buffer-full retries. + let mut ready = self.gfifo_ready.lock(); + while !*ready { + self.gfifo_condvar.wait(&mut ready); + } + *ready = false; } } consumer