From c0c248fcc819729c698421e2b0546f7ccfe959db Mon Sep 17 00:00:00 2001 From: Eric Dodd Date: Thu, 2 Apr 2026 14:01:39 -0400 Subject: [PATCH 1/5] initial jit --- Cargo.toml | 8 + src/jit/cache.rs | 59 +++++ src/jit/compiler.rs | 559 ++++++++++++++++++++++++++++++++++++++++++++ src/jit/context.rs | 143 ++++++++++++ src/jit/dispatch.rs | 188 +++++++++++++++ src/jit/helpers.rs | 26 +++ src/jit/mod.rs | 14 ++ src/lib.rs | 4 +- src/mips_exec.rs | 14 +- 9 files changed, 1011 insertions(+), 4 deletions(-) create mode 100644 src/jit/cache.rs create mode 100644 src/jit/compiler.rs create mode 100644 src/jit/context.rs create mode 100644 src/jit/dispatch.rs create mode 100644 src/jit/helpers.rs create mode 100644 src/jit/mod.rs diff --git a/Cargo.toml b/Cargo.toml index acf1270..5d02def 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,6 +12,8 @@ developer_ip7 = [] # CP0 Compare/timer calibration stats and debug prints # Lightning: pedal-to-the-metal build — disables breakpoint checks and traceback buffer updates. # Incompatible with interactive debugging. For end-user / benchmarking builds only. lightning = [] +# Cranelift-based JIT compiler for MIPS → native translation. 
+jit = ["cranelift-codegen", "cranelift-frontend", "cranelift-jit", "cranelift-module", "cranelift-native", "target-lexicon"] [dependencies] clap = { version = "4", features = ["derive"] } @@ -31,6 +33,12 @@ serde = { version = "1.0.228", features = ["derive"] } toml = "1.0.3" parking_lot = "0.12" spin = "0.10.0" +cranelift-codegen = { version = "0.116", optional = true } +cranelift-frontend = { version = "0.116", optional = true } +cranelift-jit = { version = "0.116", optional = true } +cranelift-module = { version = "0.116", optional = true } +cranelift-native = { version = "0.116", optional = true } +target-lexicon = { version = "0.13", optional = true } [target.'cfg(not(windows))'.dependencies] libc = "0.2" diff --git a/src/jit/cache.rs b/src/jit/cache.rs new file mode 100644 index 0000000..3dd44e8 --- /dev/null +++ b/src/jit/cache.rs @@ -0,0 +1,59 @@ +//! JIT code cache: maps physical PCs to compiled native code blocks. + +use std::collections::HashMap; + +/// A compiled native code block. +pub struct CompiledBlock { + /// Function pointer to compiled native code. + pub entry: *const u8, + /// Physical address this block starts at. + pub phys_addr: u64, + /// Virtual address (for diagnostics). + pub virt_addr: u64, + /// Number of MIPS instructions in this block. + pub len_mips: u32, + /// Size of native code in bytes. + pub len_native: u32, +} + +// Safety: CompiledBlock is only accessed from the CPU thread. +unsafe impl Send for CompiledBlock {} + +/// Code cache keyed by physical PC (aligned to 4 bytes). +pub struct CodeCache { + blocks: HashMap, +} + +impl CodeCache { + pub fn new() -> Self { + Self { + blocks: HashMap::new(), + } + } + + pub fn lookup(&self, phys_pc: u64) -> Option<&CompiledBlock> { + self.blocks.get(&phys_pc) + } + + pub fn insert(&mut self, phys_pc: u64, block: CompiledBlock) { + self.blocks.insert(phys_pc, block); + } + + /// Invalidate all blocks that overlap a physical address range. 
+ /// Called when self-modifying code is detected or CACHE instruction executes. + pub fn invalidate_range(&mut self, phys_start: u64, phys_end: u64) { + self.blocks.retain(|&addr, block| { + let block_end = addr + (block.len_mips as u64 * 4); + addr >= phys_end || block_end <= phys_start + }); + } + + /// Invalidate everything (used on TLB flush or mode change). + pub fn invalidate_all(&mut self) { + self.blocks.clear(); + } + + pub fn len(&self) -> usize { + self.blocks.len() + } +} diff --git a/src/jit/compiler.rs b/src/jit/compiler.rs new file mode 100644 index 0000000..755025c --- /dev/null +++ b/src/jit/compiler.rs @@ -0,0 +1,559 @@ +//! Block compiler: translates MIPS basic blocks to native code via Cranelift. + +use cranelift_codegen::ir::{self, types, AbiParam, InstBuilder, MemFlags, Value}; +use cranelift_codegen::ir::condcodes::IntCC; +use cranelift_codegen::settings::{self, Configurable}; +use cranelift_codegen::{self, Context}; +use cranelift_frontend::{FunctionBuilder, FunctionBuilderContext, Variable}; +use cranelift_jit::{JITBuilder, JITModule}; +use cranelift_module::{Linkage, Module}; + +use crate::mips_exec::DecodedInstr; +use crate::mips_isa::*; + +use super::cache::CompiledBlock; +use super::context::{JitContext, EXIT_NORMAL, EXIT_INTERPRET}; + +pub struct BlockCompiler { + jit_module: JITModule, + ctx: Context, + builder_ctx: FunctionBuilderContext, + func_id_counter: u32, +} + +impl BlockCompiler { + pub fn new() -> Self { + let mut flag_builder = settings::builder(); + flag_builder.set("opt_level", "speed").unwrap(); + flag_builder.set("is_pic", "false").unwrap(); + + let isa_builder = cranelift_native::builder().expect("host ISA not supported"); + let isa = isa_builder.finish(settings::Flags::new(flag_builder)).unwrap(); + + let jit_builder = JITBuilder::with_isa(isa, cranelift_module::default_libcall_names()); + let jit_module = JITModule::new(jit_builder); + + Self { + ctx: jit_module.make_context(), + jit_module, + builder_ctx: 
FunctionBuilderContext::new(), + func_id_counter: 0, + } + } + + /// Compile a block of MIPS instructions to native code. + /// `instrs` is a slice of (raw_word, DecodedInstr) for each instruction in the block. + /// `block_pc` is the virtual PC of the first instruction. + /// Returns None if the block is empty or compilation fails. + pub fn compile_block( + &mut self, + instrs: &[(u32, DecodedInstr)], + block_pc: u64, + ) -> Option { + if instrs.is_empty() { + return None; + } + + let num_instrs = instrs.len() as u32; + + // Create a unique function name + let name = format!("jit_block_{:x}_{}", block_pc, self.func_id_counter); + self.func_id_counter += 1; + + // Declare function signature: extern "C" fn(*mut JitContext) + let ptr_type = self.jit_module.target_config().pointer_type(); + self.ctx.func.signature.params.push(AbiParam::new(ptr_type)); + self.ctx.func.signature.call_conv = cranelift_codegen::isa::CallConv::SystemV; + + let func_id = self.jit_module + .declare_function(&name, Linkage::Local, &self.ctx.func.signature) + .unwrap(); + + let mut builder = FunctionBuilder::new(&mut self.ctx.func, &mut self.builder_ctx); + + let entry_block = builder.create_block(); + builder.append_block_params_for_function_params(entry_block); + builder.switch_to_block(entry_block); + builder.seal_block(entry_block); + + let ctx_ptr = builder.block_params(entry_block)[0]; + let mem = MemFlags::trusted(); + + // Load GPRs 1-31 from JitContext (gpr[0] is always 0) + let mut gpr = [builder.ins().iconst(types::I64, 0); 32]; + for i in 1..32usize { + gpr[i] = builder.ins().load( + types::I64, mem, ctx_ptr, + ir::immediates::Offset32::new(JitContext::gpr_offset(i)), + ); + } + + // Load hi/lo + let mut hi = builder.ins().load(types::I64, mem, ctx_ptr, + ir::immediates::Offset32::new(JitContext::hi_offset())); + let mut lo = builder.ins().load(types::I64, mem, ctx_ptr, + ir::immediates::Offset32::new(JitContext::lo_offset())); + + // Emit IR for each instruction + let mut 
compiled_count = 0u32; + for (_, d) in instrs { + if !emit_instruction(&mut builder, ctx_ptr, &mut gpr, &mut hi, &mut lo, d) { + break; + } + compiled_count += 1; + } + + if compiled_count == 0 { + // Nothing was compilable — clean up and return None + builder.ins().return_(&[]); + builder.finalize(); + self.ctx.clear(); + return None; + } + + // Store GPRs back (skip r0) + for i in 1..32usize { + builder.ins().store(mem, gpr[i], ctx_ptr, + ir::immediates::Offset32::new(JitContext::gpr_offset(i))); + } + + // Store hi/lo back + builder.ins().store(mem, hi, ctx_ptr, + ir::immediates::Offset32::new(JitContext::hi_offset())); + builder.ins().store(mem, lo, ctx_ptr, + ir::immediates::Offset32::new(JitContext::lo_offset())); + + // Set exit PC = block_pc + 4 * compiled_count + let exit_pc = block_pc.wrapping_add(compiled_count as u64 * 4); + let exit_pc_val = builder.ins().iconst(types::I64, exit_pc as i64); + builder.ins().store(mem, exit_pc_val, ctx_ptr, + ir::immediates::Offset32::new(JitContext::pc_offset())); + + // Set exit_reason = EXIT_NORMAL + let exit_val = builder.ins().iconst(types::I32, EXIT_NORMAL as i64); + builder.ins().store(mem, exit_val, ctx_ptr, + ir::immediates::Offset32::new(JitContext::exit_reason_offset())); + + // Set block_instrs_executed + let count_val = builder.ins().iconst(types::I32, compiled_count as i64); + builder.ins().store(mem, count_val, ctx_ptr, + ir::immediates::Offset32::new(JitContext::block_instrs_offset())); + + builder.ins().return_(&[]); + builder.finalize(); + + // Compile to native code + self.jit_module.define_function(func_id, &mut self.ctx).unwrap(); + self.jit_module.clear_context(&mut self.ctx); + self.jit_module.finalize_definitions().unwrap(); + + let code_ptr = self.jit_module.get_finalized_function(func_id); + let code_size = 0u32; // JITModule doesn't expose size easily; not critical + + Some(CompiledBlock { + entry: code_ptr, + phys_addr: 0, // filled in by caller + virt_addr: block_pc, + len_mips: 
compiled_count, + len_native: code_size, + }) + } +} + +/// Emit Cranelift IR for a single MIPS instruction. +/// Returns true if the instruction was compiled, false if it should terminate the block. +fn emit_instruction( + builder: &mut FunctionBuilder, + ctx_ptr: Value, + gpr: &mut [Value; 32], + hi: &mut Value, + lo: &mut Value, + d: &DecodedInstr, +) -> bool { + let op = d.op as u32; + let rs = d.rs as usize; + let rt = d.rt as usize; + let rd = d.rd as usize; + let sa = d.sa as u32; + let funct = d.funct as u32; + + match op { + OP_SPECIAL => emit_special(builder, gpr, hi, lo, d, rs, rt, rd, sa, funct), + OP_ADDIU => { emit_addiu(builder, gpr, rs, rt, d); true } + OP_DADDIU => { emit_daddiu(builder, gpr, rs, rt, d); true } + OP_SLTI => { emit_slti(builder, gpr, rs, rt, d); true } + OP_SLTIU => { emit_sltiu(builder, gpr, rs, rt, d); true } + OP_ANDI => { emit_andi(builder, gpr, rs, rt, d); true } + OP_ORI => { emit_ori(builder, gpr, rs, rt, d); true } + OP_XORI => { emit_xori(builder, gpr, rs, rt, d); true } + OP_LUI => { emit_lui(builder, gpr, rt, d); true } + _ => false, // Non-ALU instruction — terminate block + } +} + +fn emit_special( + builder: &mut FunctionBuilder, + gpr: &mut [Value; 32], + hi: &mut Value, + lo: &mut Value, + d: &DecodedInstr, + rs: usize, rt: usize, rd: usize, sa: u32, funct: u32, +) -> bool { + match funct { + // --- Shifts (immediate) --- + FUNCT_SLL => { emit_sll(builder, gpr, rt, rd, sa); true } + FUNCT_SRL => { emit_srl(builder, gpr, rt, rd, sa); true } + FUNCT_SRA => { emit_sra(builder, gpr, rt, rd, sa); true } + + // --- Shifts (variable) --- + FUNCT_SLLV => { emit_sllv(builder, gpr, rs, rt, rd); true } + FUNCT_SRLV => { emit_srlv(builder, gpr, rs, rt, rd); true } + FUNCT_SRAV => { emit_srav(builder, gpr, rs, rt, rd); true } + + // --- 64-bit shifts (immediate) --- + FUNCT_DSLL => { emit_dsll(builder, gpr, rt, rd, sa); true } + FUNCT_DSRL => { emit_dsrl(builder, gpr, rt, rd, sa); true } + FUNCT_DSRA => { emit_dsra(builder, gpr, 
rt, rd, sa); true } + FUNCT_DSLL32 => { emit_dsll(builder, gpr, rt, rd, sa + 32); true } + FUNCT_DSRL32 => { emit_dsrl(builder, gpr, rt, rd, sa + 32); true } + FUNCT_DSRA32 => { emit_dsra(builder, gpr, rt, rd, sa + 32); true } + + // --- 64-bit shifts (variable) --- + FUNCT_DSLLV => { emit_dsllv(builder, gpr, rs, rt, rd); true } + FUNCT_DSRLV => { emit_dsrlv(builder, gpr, rs, rt, rd); true } + FUNCT_DSRAV => { emit_dsrav(builder, gpr, rs, rt, rd); true } + + // --- ALU register ops --- + FUNCT_ADDU => { emit_addu(builder, gpr, rs, rt, rd); true } + FUNCT_SUBU => { emit_subu(builder, gpr, rs, rt, rd); true } + FUNCT_AND => { emit_and(builder, gpr, rs, rt, rd); true } + FUNCT_OR => { emit_or(builder, gpr, rs, rt, rd); true } + FUNCT_XOR => { emit_xor(builder, gpr, rs, rt, rd); true } + FUNCT_NOR => { emit_nor(builder, gpr, rs, rt, rd); true } + FUNCT_SLT => { emit_slt(builder, gpr, rs, rt, rd); true } + FUNCT_SLTU => { emit_sltu(builder, gpr, rs, rt, rd); true } + + // --- 64-bit ALU --- + FUNCT_DADDU => { emit_daddu(builder, gpr, rs, rt, rd); true } + FUNCT_DSUBU => { emit_dsubu(builder, gpr, rs, rt, rd); true } + + // --- Multiply/Divide --- + FUNCT_MULT => { emit_mult(builder, gpr, hi, lo, rs, rt); true } + FUNCT_MULTU => { emit_multu(builder, gpr, hi, lo, rs, rt); true } + FUNCT_DIV => { emit_div(builder, gpr, hi, lo, rs, rt); true } + FUNCT_DIVU => { emit_divu(builder, gpr, hi, lo, rs, rt); true } + FUNCT_DMULT => { emit_dmult(builder, gpr, hi, lo, rs, rt); true } + FUNCT_DMULTU => { emit_dmultu(builder, gpr, hi, lo, rs, rt); true } + FUNCT_DDIV => { emit_ddiv(builder, gpr, hi, lo, rs, rt); true } + FUNCT_DDIVU => { emit_ddivu(builder, gpr, hi, lo, rs, rt); true } + + // --- HI/LO moves --- + FUNCT_MFHI => { gpr[rd] = *hi; true } + FUNCT_MTHI => { *hi = gpr[rs]; true } + FUNCT_MFLO => { gpr[rd] = *lo; true } + FUNCT_MTLO => { *lo = gpr[rs]; true } + + // --- Conditional moves --- + FUNCT_MOVZ => { emit_movz(builder, gpr, rs, rt, rd); true } + FUNCT_MOVN => { 
emit_movn(builder, gpr, rs, rt, rd); true } + + // --- SYNC (barrier, NOP for JIT) --- + FUNCT_SYNC => true, + + // Everything else terminates the block + _ => false, + } +} + +// ─── Helper: sign-extend i32 result to i64 ────────────────────────────────── + +/// Truncate a 64-bit value to 32-bit, then sign-extend back to 64-bit. +/// Matches the interpreter pattern: `val as u32 as i32 as i64 as u64`. +fn sext32(builder: &mut FunctionBuilder, val: Value) -> Value { + let narrow = builder.ins().ireduce(types::I32, val); + builder.ins().sextend(types::I64, narrow) +} + +// ─── Immediate ALU ops ─────────────────────────────────────────────────────── + +fn emit_addiu(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, d: &DecodedInstr) { + // (rs as u32).wrapping_add(imm as u32) → sign-extend to 64 + let rs32 = builder.ins().ireduce(types::I32, gpr[rs]); + let imm = builder.ins().iconst(types::I32, d.imm as i32 as i64); + let sum = builder.ins().iadd(rs32, imm); + gpr[rt] = builder.ins().sextend(types::I64, sum); +} + +fn emit_daddiu(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, d: &DecodedInstr) { + let imm = builder.ins().iconst(types::I64, d.imm as i32 as i64); + gpr[rt] = builder.ins().iadd(gpr[rs], imm); +} + +fn emit_slti(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, d: &DecodedInstr) { + let imm = builder.ins().iconst(types::I64, d.imm as i32 as i64); + let cmp = builder.ins().icmp(IntCC::SignedLessThan, gpr[rs], imm); + gpr[rt] = builder.ins().uextend(types::I64, cmp); +} + +fn emit_sltiu(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, d: &DecodedInstr) { + // imm is sign-extended then compared as unsigned + let imm = builder.ins().iconst(types::I64, d.imm as i32 as i64); + let cmp = builder.ins().icmp(IntCC::UnsignedLessThan, gpr[rs], imm); + gpr[rt] = builder.ins().uextend(types::I64, cmp); +} + +fn emit_andi(builder: &mut FunctionBuilder, gpr: 
&mut [Value; 32], rs: usize, rt: usize, d: &DecodedInstr) { + // zero-extended immediate + let imm = builder.ins().iconst(types::I64, (d.imm & 0xFFFF) as i64); + gpr[rt] = builder.ins().band(gpr[rs], imm); +} + +fn emit_ori(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, d: &DecodedInstr) { + let imm = builder.ins().iconst(types::I64, (d.imm & 0xFFFF) as i64); + gpr[rt] = builder.ins().bor(gpr[rs], imm); +} + +fn emit_xori(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, d: &DecodedInstr) { + let imm = builder.ins().iconst(types::I64, (d.imm & 0xFFFF) as i64); + gpr[rt] = builder.ins().bxor(gpr[rs], imm); +} + +fn emit_lui(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rt: usize, d: &DecodedInstr) { + // imm is already shifted left 16 by decode (set_imm_lui) + // sign-extend from 32 to 64 + gpr[rt] = builder.ins().iconst(types::I64, d.imm as i32 as i64); +} + +// ─── Register ALU ops ──────────────────────────────────────────────────────── + +fn emit_addu(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + let a = builder.ins().ireduce(types::I32, gpr[rs]); + let b = builder.ins().ireduce(types::I32, gpr[rt]); + let sum = builder.ins().iadd(a, b); + gpr[rd] = builder.ins().sextend(types::I64, sum); +} + +fn emit_subu(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + let a = builder.ins().ireduce(types::I32, gpr[rs]); + let b = builder.ins().ireduce(types::I32, gpr[rt]); + let diff = builder.ins().isub(a, b); + gpr[rd] = builder.ins().sextend(types::I64, diff); +} + +fn emit_and(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + gpr[rd] = builder.ins().band(gpr[rs], gpr[rt]); +} + +fn emit_or(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + gpr[rd] = builder.ins().bor(gpr[rs], gpr[rt]); +} + +fn emit_xor(builder: &mut FunctionBuilder, gpr: &mut 
[Value; 32], rs: usize, rt: usize, rd: usize) { + gpr[rd] = builder.ins().bxor(gpr[rs], gpr[rt]); +} + +fn emit_nor(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + let or_val = builder.ins().bor(gpr[rs], gpr[rt]); + gpr[rd] = builder.ins().bnot(or_val); +} + +fn emit_slt(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + let cmp = builder.ins().icmp(IntCC::SignedLessThan, gpr[rs], gpr[rt]); + gpr[rd] = builder.ins().uextend(types::I64, cmp); +} + +fn emit_sltu(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + let cmp = builder.ins().icmp(IntCC::UnsignedLessThan, gpr[rs], gpr[rt]); + gpr[rd] = builder.ins().uextend(types::I64, cmp); +} + +// ─── 64-bit ALU ops ────────────────────────────────────────────────────────── + +fn emit_daddu(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + gpr[rd] = builder.ins().iadd(gpr[rs], gpr[rt]); +} + +fn emit_dsubu(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + gpr[rd] = builder.ins().isub(gpr[rs], gpr[rt]); +} + +// ─── 32-bit Shift ops ─────────────────────────────────────────────────────── + +fn emit_sll(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rt: usize, rd: usize, sa: u32) { + let rt32 = builder.ins().ireduce(types::I32, gpr[rt]); + let shift = builder.ins().iconst(types::I32, sa as i64); + let result = builder.ins().ishl(rt32, shift); + gpr[rd] = builder.ins().sextend(types::I64, result); +} + +fn emit_srl(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rt: usize, rd: usize, sa: u32) { + let rt32 = builder.ins().ireduce(types::I32, gpr[rt]); + let shift = builder.ins().iconst(types::I32, sa as i64); + let result = builder.ins().ushr(rt32, shift); + // SRL: logical shift, but result is still sign-extended to 64 (MIPS spec) + gpr[rd] = builder.ins().sextend(types::I64, result); +} + +fn 
emit_sra(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rt: usize, rd: usize, sa: u32) { + let rt32 = builder.ins().ireduce(types::I32, gpr[rt]); + let shift = builder.ins().iconst(types::I32, sa as i64); + let result = builder.ins().sshr(rt32, shift); + gpr[rd] = builder.ins().sextend(types::I64, result); +} + +fn emit_sllv(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + let rt32 = builder.ins().ireduce(types::I32, gpr[rt]); + let rs32 = builder.ins().ireduce(types::I32, gpr[rs]); + let mask = builder.ins().iconst(types::I32, 0x1F); + let sa = builder.ins().band(rs32, mask); + let result = builder.ins().ishl(rt32, sa); + gpr[rd] = builder.ins().sextend(types::I64, result); +} + +fn emit_srlv(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + let rt32 = builder.ins().ireduce(types::I32, gpr[rt]); + let rs32 = builder.ins().ireduce(types::I32, gpr[rs]); + let mask = builder.ins().iconst(types::I32, 0x1F); + let sa = builder.ins().band(rs32, mask); + let result = builder.ins().ushr(rt32, sa); + gpr[rd] = builder.ins().sextend(types::I64, result); +} + +fn emit_srav(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + let rt32 = builder.ins().ireduce(types::I32, gpr[rt]); + let rs32 = builder.ins().ireduce(types::I32, gpr[rs]); + let mask = builder.ins().iconst(types::I32, 0x1F); + let sa = builder.ins().band(rs32, mask); + let result = builder.ins().sshr(rt32, sa); + gpr[rd] = builder.ins().sextend(types::I64, result); +} + +// ─── 64-bit Shift ops ─────────────────────────────────────────────────────── + +fn emit_dsll(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rt: usize, rd: usize, sa: u32) { + let shift = builder.ins().iconst(types::I64, sa as i64); + gpr[rd] = builder.ins().ishl(gpr[rt], shift); +} + +fn emit_dsrl(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rt: usize, rd: usize, sa: u32) { + let shift = 
builder.ins().iconst(types::I64, sa as i64); + gpr[rd] = builder.ins().ushr(gpr[rt], shift); +} + +fn emit_dsra(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rt: usize, rd: usize, sa: u32) { + let shift = builder.ins().iconst(types::I64, sa as i64); + gpr[rd] = builder.ins().sshr(gpr[rt], shift); +} + +fn emit_dsllv(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + let mask = builder.ins().iconst(types::I64, 0x3F); + let sa = builder.ins().band(gpr[rs], mask); + gpr[rd] = builder.ins().ishl(gpr[rt], sa); +} + +fn emit_dsrlv(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + let mask = builder.ins().iconst(types::I64, 0x3F); + let sa = builder.ins().band(gpr[rs], mask); + gpr[rd] = builder.ins().ushr(gpr[rt], sa); +} + +fn emit_dsrav(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + let mask = builder.ins().iconst(types::I64, 0x3F); + let sa = builder.ins().band(gpr[rs], mask); + gpr[rd] = builder.ins().sshr(gpr[rt], sa); +} + +// ─── Multiply/Divide ───────────────────────────────────────────────────────── + +fn emit_mult(builder: &mut FunctionBuilder, gpr: &[Value; 32], hi: &mut Value, lo: &mut Value, rs: usize, rt: usize) { + // Signed 32×32 → 64-bit result + let a32 = builder.ins().ireduce(types::I32, gpr[rs]); + let a = builder.ins().sextend(types::I64, a32); + let b32 = builder.ins().ireduce(types::I32, gpr[rt]); + let b = builder.ins().sextend(types::I64, b32); + let product = builder.ins().imul(a, b); + // lo = sign-extend low 32 bits; hi = sign-extend high 32 bits + *lo = sext32(builder, product); + let shifted = builder.ins().sshr_imm(product, 32); + *hi = sext32(builder, shifted); +} + +fn emit_multu(builder: &mut FunctionBuilder, gpr: &[Value; 32], hi: &mut Value, lo: &mut Value, rs: usize, rt: usize) { + let a32 = builder.ins().ireduce(types::I32, gpr[rs]); + let a = builder.ins().uextend(types::I64, a32); + let b32 = 
builder.ins().ireduce(types::I32, gpr[rt]); + let b = builder.ins().uextend(types::I64, b32); + let product = builder.ins().imul(a, b); + *lo = sext32(builder, product); + let shifted = builder.ins().ushr_imm(product, 32); + *hi = sext32(builder, shifted); +} + +fn emit_div(builder: &mut FunctionBuilder, gpr: &[Value; 32], hi: &mut Value, lo: &mut Value, rs: usize, rt: usize) { + let a = builder.ins().ireduce(types::I32, gpr[rs]); + let b = builder.ins().ireduce(types::I32, gpr[rt]); + let zero = builder.ins().iconst(types::I32, 0); + let one = builder.ins().iconst(types::I32, 1); + let is_nonzero = builder.ins().icmp(IntCC::NotEqual, b, zero); + let safe_b = builder.ins().select(is_nonzero, b, one); + let q = builder.ins().sdiv(a, safe_b); + let r = builder.ins().srem(a, safe_b); + *lo = builder.ins().sextend(types::I64, q); + *hi = builder.ins().sextend(types::I64, r); +} + +fn emit_divu(builder: &mut FunctionBuilder, gpr: &[Value; 32], hi: &mut Value, lo: &mut Value, rs: usize, rt: usize) { + let a = builder.ins().ireduce(types::I32, gpr[rs]); + let b = builder.ins().ireduce(types::I32, gpr[rt]); + let zero = builder.ins().iconst(types::I32, 0); + let one = builder.ins().iconst(types::I32, 1); + let is_nonzero = builder.ins().icmp(IntCC::NotEqual, b, zero); + let safe_b = builder.ins().select(is_nonzero, b, one); + let q = builder.ins().udiv(a, safe_b); + let r = builder.ins().urem(a, safe_b); + *lo = builder.ins().sextend(types::I64, q); + *hi = builder.ins().sextend(types::I64, r); +} + +fn emit_dmult(builder: &mut FunctionBuilder, gpr: &[Value; 32], hi: &mut Value, lo: &mut Value, rs: usize, rt: usize) { + // Signed 64×64: lo = low 64, hi = high 64 + *lo = builder.ins().imul(gpr[rs], gpr[rt]); + *hi = builder.ins().smulhi(gpr[rs], gpr[rt]); +} + +fn emit_dmultu(builder: &mut FunctionBuilder, gpr: &[Value; 32], hi: &mut Value, lo: &mut Value, rs: usize, rt: usize) { + *lo = builder.ins().imul(gpr[rs], gpr[rt]); + *hi = builder.ins().umulhi(gpr[rs], gpr[rt]); 
+} + +fn emit_ddiv(builder: &mut FunctionBuilder, gpr: &[Value; 32], hi: &mut Value, lo: &mut Value, rs: usize, rt: usize) { + let zero = builder.ins().iconst(types::I64, 0); + let one = builder.ins().iconst(types::I64, 1); + let is_nonzero = builder.ins().icmp(IntCC::NotEqual, gpr[rt], zero); + let safe_b = builder.ins().select(is_nonzero, gpr[rt], one); + *lo = builder.ins().sdiv(gpr[rs], safe_b); + *hi = builder.ins().srem(gpr[rs], safe_b); +} + +fn emit_ddivu(builder: &mut FunctionBuilder, gpr: &[Value; 32], hi: &mut Value, lo: &mut Value, rs: usize, rt: usize) { + let zero = builder.ins().iconst(types::I64, 0); + let one = builder.ins().iconst(types::I64, 1); + let is_nonzero = builder.ins().icmp(IntCC::NotEqual, gpr[rt], zero); + let safe_b = builder.ins().select(is_nonzero, gpr[rt], one); + *lo = builder.ins().udiv(gpr[rs], safe_b); + *hi = builder.ins().urem(gpr[rs], safe_b); +} + +// ─── Conditional moves ─────────────────────────────────────────────────────── + +fn emit_movz(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + let zero = builder.ins().iconst(types::I64, 0); + let is_zero = builder.ins().icmp(IntCC::Equal, gpr[rt], zero); + gpr[rd] = builder.ins().select(is_zero, gpr[rs], gpr[rd]); +} + +fn emit_movn(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt: usize, rd: usize) { + let zero = builder.ins().iconst(types::I64, 0); + let is_nonzero = builder.ins().icmp(IntCC::NotEqual, gpr[rt], zero); + gpr[rd] = builder.ins().select(is_nonzero, gpr[rs], gpr[rd]); +} diff --git a/src/jit/context.rs b/src/jit/context.rs new file mode 100644 index 0000000..8d70181 --- /dev/null +++ b/src/jit/context.rs @@ -0,0 +1,143 @@ +//! JitContext: `#[repr(C)]` bridge struct between JIT-compiled code and emulator state. +//! +//! Contains the hot subset of MipsCore and MipsExecutor state that compiled blocks +//! read and write directly. Synced to/from the interpreter before and after JIT execution. 
+ +use crate::mips_core::NanoTlbEntry; +use crate::mips_exec::MipsExecutor; +use crate::mips_tlb::Tlb; +use crate::mips_cache_v2::MipsCache; + +// Exit reason constants set by JIT code before returning to dispatch. +pub const EXIT_NORMAL: u32 = 0; +pub const EXIT_INTERPRET: u32 = 1; +pub const EXIT_EXCEPTION: u32 = 2; +pub const EXIT_INTERRUPT_CHECK: u32 = 3; +pub const EXIT_HALT: u32 = 4; + +#[repr(C)] +pub struct JitContext { + // General purpose registers + pub gpr: [u64; 32], + + // Special registers + pub pc: u64, + pub hi: u64, + pub lo: u64, + + // FPU registers + pub fpr: [u64; 32], + pub fpu_fcsr: u32, + + // CP0 state (hot subset for interrupt/exception checking) + pub cp0_status: u32, + pub cp0_cause: u32, + pub cp0_epc: u64, + pub cp0_count: u64, + pub cp0_compare: u64, + pub count_step: u64, + pub cp0_badvaddr: u64, + + // Nano-TLB (3 entries: Fetch/Read/Write) + pub nanotlb: [NanoTlbEntry; 3], + + // Delay slot state + pub in_delay_slot: bool, + pub delay_slot_target: u64, + + // Interrupt handling (cached from executor) + pub cached_pending: u64, + pub local_cycles: u64, + + // JIT dispatch state + pub exit_reason: u32, + pub block_instrs_executed: u32, +} + +impl JitContext { + pub fn new() -> Self { + Self { + gpr: [0; 32], + pc: 0, + hi: 0, + lo: 0, + fpr: [0; 32], + fpu_fcsr: 0, + cp0_status: 0, + cp0_cause: 0, + cp0_epc: 0, + cp0_count: 0, + cp0_compare: 0, + count_step: 0, + cp0_badvaddr: 0, + nanotlb: [NanoTlbEntry::default(); 3], + in_delay_slot: false, + delay_slot_target: 0, + cached_pending: 0, + local_cycles: 0, + exit_reason: EXIT_NORMAL, + block_instrs_executed: 0, + } + } + + /// Byte offset of `gpr[i]` from the start of JitContext. 
+ pub fn gpr_offset(i: usize) -> i32 { + (std::mem::offset_of!(JitContext, gpr) + i * 8) as i32 + } + + pub fn hi_offset() -> i32 { std::mem::offset_of!(JitContext, hi) as i32 } + pub fn lo_offset() -> i32 { std::mem::offset_of!(JitContext, lo) as i32 } + pub fn pc_offset() -> i32 { std::mem::offset_of!(JitContext, pc) as i32 } + pub fn exit_reason_offset() -> i32 { std::mem::offset_of!(JitContext, exit_reason) as i32 } + pub fn block_instrs_offset() -> i32 { std::mem::offset_of!(JitContext, block_instrs_executed) as i32 } + + /// Copy emulator state into JitContext. + pub fn sync_from_executor( + &mut self, + exec: &MipsExecutor, + ) { + self.gpr = exec.core.gpr; + self.pc = exec.core.pc; + self.hi = exec.core.hi; + self.lo = exec.core.lo; + self.fpr = exec.core.fpr; + self.fpu_fcsr = exec.core.fpu_fcsr; + self.cp0_status = exec.core.cp0_status; + self.cp0_cause = exec.core.cp0_cause; + self.cp0_epc = exec.core.cp0_epc; + self.cp0_count = exec.core.cp0_count; + self.cp0_compare = exec.core.cp0_compare; + self.count_step = exec.core.count_step; + self.cp0_badvaddr = exec.core.cp0_badvaddr; + self.nanotlb = exec.core.nanotlb; + self.in_delay_slot = exec.in_delay_slot; + self.delay_slot_target = exec.delay_slot_target; + self.cached_pending = exec.cached_pending; + self.local_cycles = exec.local_cycles; + } + + /// Copy JitContext state back to the emulator. 
+ pub fn sync_to_executor( + &self, + exec: &mut MipsExecutor, + ) { + exec.core.gpr = self.gpr; + exec.core.pc = self.pc; + exec.core.hi = self.hi; + exec.core.lo = self.lo; + exec.core.fpr = self.fpr; + exec.core.fpu_fcsr = self.fpu_fcsr; + exec.core.cp0_status = self.cp0_status; + exec.core.cp0_cause = self.cp0_cause; + exec.core.cp0_epc = self.cp0_epc; + exec.core.cp0_count = self.cp0_count; + exec.core.cp0_compare = self.cp0_compare; + exec.core.count_step = self.count_step; + exec.core.cp0_badvaddr = self.cp0_badvaddr; + exec.core.nanotlb = self.nanotlb; + exec.in_delay_slot = self.in_delay_slot; + exec.delay_slot_target = self.delay_slot_target; + exec.cached_pending = self.cached_pending; + exec.local_cycles = self.local_cycles; + } +} diff --git a/src/jit/dispatch.rs b/src/jit/dispatch.rs new file mode 100644 index 0000000..5067eeb --- /dev/null +++ b/src/jit/dispatch.rs @@ -0,0 +1,188 @@ +//! JIT dispatch loop: traces, compiles, and executes MIPS basic blocks. + +use std::sync::atomic::{AtomicBool, Ordering}; + +use crate::mips_exec::{MipsExecutor, DecodedInstr, ExecStatus, EXEC_BREAKPOINT, EXEC_IS_EXCEPTION, decode_into}; +use crate::mips_tlb::{Tlb, AccessType}; +use crate::mips_cache_v2::MipsCache; + +use super::cache::CodeCache; +use super::compiler::BlockCompiler; +use super::context::{JitContext, EXIT_NORMAL, EXIT_INTERPRET}; + +/// Maximum number of instructions per compiled block. +const MAX_BLOCK_LEN: usize = 64; + +/// Run the JIT dispatch loop. Replaces the inner `while running` loop in MipsCpu::start(). 
+pub fn run_jit_dispatch( + exec: &mut MipsExecutor, + running: &AtomicBool, +) { + let mut compiler = BlockCompiler::new(); + let mut cache = CodeCache::new(); + let mut ctx = JitContext::new(); + let mut steps_since_flush: u32 = 0; + + while running.load(Ordering::Relaxed) { + let pc = exec.core.pc; + + // Translate PC to physical address for cache lookup + let phys_pc = match translate_pc(exec, pc) { + Some(p) => p, + None => { + // Translation failed — let interpreter handle the exception + exec.step(); + steps_since_flush += 1; + if steps_since_flush >= 1000 { + exec.flush_cycles(); + steps_since_flush = 0; + } + continue; + } + }; + + if let Some(block) = cache.lookup(phys_pc) { + // Cache hit — execute compiled block + let entry: extern "C" fn(*mut JitContext) = unsafe { + std::mem::transmute(block.entry) + }; + let block_len = block.len_mips; + + ctx.sync_from_executor(exec); + entry(&mut ctx); + ctx.sync_to_executor(exec); + + // Advance cp0_count by block length + let count_advance = exec.core.count_step.wrapping_mul(block_len as u64); + let prev = exec.core.cp0_count; + exec.core.cp0_count = prev.wrapping_add(count_advance) & 0x0000_FFFF_FFFF_FFFF; + if exec.core.cp0_compare != 0 && prev < exec.core.cp0_compare && exec.core.cp0_count >= exec.core.cp0_compare { + exec.core.cp0_cause |= crate::mips_core::CAUSE_IP7; + } + + exec.local_cycles += block_len as u64; + steps_since_flush += block_len; + + match ctx.exit_reason { + EXIT_NORMAL => {} + EXIT_INTERPRET => { + // The block ended before an uncompilable instruction. + // PC is set to the uncompilable instruction — interpret it. 
+                    exec.step();
+                    steps_since_flush += 1;
+                }
+                _ => {}
+            }
+
+            // Check interrupts between blocks
+            exec.cached_pending = unsafe {
+                let ptr = std::ptr::addr_of!(exec.core.interrupts) as *const std::sync::atomic::AtomicU64;
+                (*ptr).load(Ordering::Relaxed)
+            };
+            // NOTE(review): cp0_cause also carries ExcCode and other non-IP bits, so this
+            // test can be true on almost every iteration, forcing an interpreter step after
+            // each block — confirm whether this should mask to the IP bits (CAUSE_IP*) only.
+            if (exec.cached_pending | exec.core.cp0_cause as u64) != 0 {
+                // Let the interpreter handle the interrupt
+                exec.step();
+                steps_since_flush += 1;
+            }
+        } else {
+            // Cache miss — try to trace and compile a block
+            let instrs = trace_block(exec, pc);
+            if instrs.is_empty() {
+                // First instruction isn't compilable — interpret it
+                exec.step();
+                steps_since_flush += 1;
+            } else {
+                if let Some(mut block) = compiler.compile_block(&instrs, pc) {
+                    block.phys_addr = phys_pc;
+                    cache.insert(phys_pc, block);
+                    // Next iteration will hit the cache
+                } else {
+                    // Compilation failed — interpret one instruction
+                    exec.step();
+                    steps_since_flush += 1;
+                }
+            }
+        }
+
+        if steps_since_flush >= 1000 {
+            exec.flush_cycles();
+            steps_since_flush = 0;
+        }
+    }
+
+    exec.flush_cycles();
+}
+
+// NOTE(review): generic parameters below were re-added by hand; the angle-bracket text
+// was lost in extraction (`&mut MipsExecutor,` / `-> Option {`). Bounds inferred from the
+// `mips_tlb::Tlb` / `mips_cache_v2::MipsCache` imports — confirm against MipsExecutor's
+// actual declaration before applying.
+
+/// Translate a virtual PC to a physical address for code cache lookup.
+/// Returns `None` when translation raises an exception (TLB miss, etc.),
+/// in which case the caller falls back to the interpreter to take the fault.
+fn translate_pc<T: Tlb, C: MipsCache>(
+    exec: &mut MipsExecutor<T, C>,
+    virt_pc: u64,
+) -> Option<u64> {
+    let result = (exec.translate_fn)(exec, virt_pc, AccessType::Fetch);
+    if result.is_exception() {
+        None
+    } else {
+        Some(result.phys as u64)
+    }
+}
+
+/// Trace a basic block: walk instructions from `start_pc`, collecting compilable
+/// instructions until we hit a non-compilable op, a branch, or the max block size.
+fn trace_block<T: Tlb, C: MipsCache>(
+    exec: &mut MipsExecutor<T, C>,
+    start_pc: u64,
+) -> Vec<(u32, DecodedInstr)> {
+    let mut instrs = Vec::with_capacity(MAX_BLOCK_LEN);
+    let mut pc = start_pc;
+
+    for _ in 0..MAX_BLOCK_LEN {
+        // Fetch instruction word without side effects
+        let raw = match exec.debug_fetch_instr(pc) {
+            Ok(w) => w,
+            Err(_) => break, // fetch failed (unmapped, etc.)
+ }; + + // Decode into a DecodedInstr + let mut d = DecodedInstr::default(); + d.raw = raw; + decode_into::(&mut d); + + // Check if this instruction is compilable (ALU only in Phase 2) + if !is_compilable(&d) { + break; + } + + instrs.push((raw, d)); + pc = pc.wrapping_add(4); + } + + instrs +} + +/// Returns true if the instruction can be compiled by the JIT (Phase 2: integer ALU only). +fn is_compilable(d: &DecodedInstr) -> bool { + use crate::mips_isa::*; + let op = d.op as u32; + let funct = d.funct as u32; + + match op { + OP_SPECIAL => matches!(funct, + FUNCT_SLL | FUNCT_SRL | FUNCT_SRA | + FUNCT_SLLV | FUNCT_SRLV | FUNCT_SRAV | + FUNCT_MOVZ | FUNCT_MOVN | + FUNCT_MFHI | FUNCT_MTHI | FUNCT_MFLO | FUNCT_MTLO | + FUNCT_MULT | FUNCT_MULTU | FUNCT_DIV | FUNCT_DIVU | + FUNCT_DMULT | FUNCT_DMULTU | FUNCT_DDIV | FUNCT_DDIVU | + FUNCT_ADDU | FUNCT_SUBU | FUNCT_AND | FUNCT_OR | + FUNCT_XOR | FUNCT_NOR | FUNCT_SLT | FUNCT_SLTU | + FUNCT_DADDU | FUNCT_DSUBU | + FUNCT_DSLL | FUNCT_DSRL | FUNCT_DSRA | + FUNCT_DSLL32 | FUNCT_DSRL32 | FUNCT_DSRA32 | + FUNCT_DSLLV | FUNCT_DSRLV | FUNCT_DSRAV | + FUNCT_SYNC + ), + OP_ADDIU | OP_DADDIU | OP_SLTI | OP_SLTIU | + OP_ANDI | OP_ORI | OP_XORI | OP_LUI => true, + _ => false, + } +} diff --git a/src/jit/helpers.rs b/src/jit/helpers.rs new file mode 100644 index 0000000..ed79649 --- /dev/null +++ b/src/jit/helpers.rs @@ -0,0 +1,26 @@ +//! `extern "C"` bridge functions called by JIT-compiled code. +//! +//! Phase 1: stubs only. These are populated in Phase 3+ when compiled blocks +//! need to call back into the interpreter for memory access, exceptions, etc. + +use super::context::JitContext; + +/// Read memory via the interpreter's full memory subsystem. +/// Called by JIT-compiled code when a memory load hits the slow path. 
+pub extern "C" fn jit_helper_read_data( + _ctx: *mut JitContext, + _virt_addr: u64, + _size: u32, +) -> u64 { + unimplemented!("JIT memory read helper not yet implemented") +} + +/// Write memory via the interpreter's full memory subsystem. +pub extern "C" fn jit_helper_write_data( + _ctx: *mut JitContext, + _virt_addr: u64, + _value: u64, + _size: u32, +) -> u32 { + unimplemented!("JIT memory write helper not yet implemented") +} diff --git a/src/jit/mod.rs b/src/jit/mod.rs new file mode 100644 index 0000000..3e8efc8 --- /dev/null +++ b/src/jit/mod.rs @@ -0,0 +1,14 @@ +//! Cranelift-based JIT compiler for MIPS R4400. +//! +//! Feature-gated under `#[cfg(feature = "jit")]`. +//! Phase 1: dispatch infrastructure with full interpreter fallback. + +pub mod context; +pub mod cache; +pub mod compiler; +pub mod dispatch; +pub mod helpers; + +pub use context::JitContext; +pub use cache::{CodeCache, CompiledBlock}; +pub use compiler::BlockCompiler; diff --git a/src/lib.rs b/src/lib.rs index 7b00c3b..712efa4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -45,4 +45,6 @@ pub mod hptimer; pub mod hptimer_tests; pub mod vga_font; pub mod saa7191; -pub mod vino; \ No newline at end of file +pub mod vino; +#[cfg(feature = "jit")] +pub mod jit; \ No newline at end of file diff --git a/src/mips_exec.rs b/src/mips_exec.rs index 3ff9ce1..ced3311 100644 --- a/src/mips_exec.rs +++ b/src/mips_exec.rs @@ -543,7 +543,7 @@ pub struct MipsExecutor { pub sysad: Arc, pub tlb: T, pub cache: C, - in_delay_slot: bool, + pub(crate) in_delay_slot: bool, pub delay_slot_target: u64, #[cfg(feature = "developer")] undo_buffer: UndoBuffer, @@ -584,9 +584,9 @@ pub struct MipsExecutor { pub fpr_write_w: fn(&mut MipsCore, u32, u32), /// Local cycle counter — flushed to the shared atomic periodically to avoid /// a locked bus op on every instruction. - local_cycles: u64, + pub(crate) local_cycles: u64, /// Cached external interrupt word — reloaded every 16 instructions. 
-    cached_pending: u64,
+    pub(crate) cached_pending: u64,
 }
 
 // ---- translate_fn slow-path wrappers (one per privilege × addressing-mode combination) ------
@@ -4865,10 +4865,18 @@ impl Device for MipsCpu<
         *self.thread.lock() = Some(thread::Builder::new().name("MIPS-CPU".to_string()).spawn(move || {
             let mut guard = executor.lock();
+
+            #[cfg(feature = "jit")]
+            {
+                crate::jit::dispatch::run_jit_dispatch(&mut *guard, &running);
+                return;
+            }
+
             // --- perf sampling (comment out to disable) ---
             //let mut last_cycles: u64 = guard.core.cycles.load(Ordering::Relaxed);
             //let mut last_time = std::time::Instant::now();
             // --- end perf sampling ---
+            #[allow(unreachable_code)]
             while running.load(Ordering::Relaxed) {
                 #[cfg(feature = "lightning")]
                 for _ in 0..1000 {

From ae2781a06f552530741fced0a98237bada6ba4ca Mon Sep 17 00:00:00 2001
From: Eric Dodd
Date: Fri, 3 Apr 2026 07:52:21 -0400
Subject: [PATCH 2/5] working on jit

---
 jit-diag.sh         |  45 ++++
 src/jit/compiler.rs | 507 ++++++++++++++++++++++++++++++++++++++------
 src/jit/context.rs  |  43 ++--
 src/jit/dispatch.rs | 313 ++++++++++++++++++---------
 src/jit/helpers.rs  | 150 +++++++++--
 src/mips_exec.rs    |   8 +-
 6 files changed, 863 insertions(+), 203 deletions(-)
 create mode 100755 jit-diag.sh

diff --git a/jit-diag.sh b/jit-diag.sh
new file mode 100755
index 0000000..79a1950
--- /dev/null
+++ b/jit-diag.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+# JIT diagnostic launcher — runs emulator and captures output for analysis
+# Usage: ./jit-diag.sh [mode]
+#   mode: "jit"    — JIT enabled (default)
+#         "verify" — JIT with verification
+#         "nojit"  — interpreter only through JIT dispatch
+#         "interp" — pure interpreter (no JIT feature, baseline)
+
+MODE="${1:-jit}"
+OUTFILE="jit-diag-$(date +%Y%m%d-%H%M%S)-${MODE}.log"
+
+echo "=== IRIS JIT Diagnostic ===" | tee "$OUTFILE"
+echo "Mode: $MODE" | tee -a "$OUTFILE"
+echo "Date: $(date)" | tee -a "$OUTFILE"
+echo "Host: $(uname -m) $(uname -s) $(uname -r)" | tee -a "$OUTFILE"
+echo "Rust: $(rustc --version)" | tee -a "$OUTFILE"
+echo "" | tee -a "$OUTFILE"
+
+case "$MODE" in
+  jit)
+    echo "Running: IRIS_JIT=1 cargo run --release --features jit,lightning" | tee -a "$OUTFILE"
+    IRIS_JIT=1 cargo run --release --features jit,lightning 2>&1 | tee -a "$OUTFILE"; RC=${PIPESTATUS[0]}
+    ;;
+  verify)
+    echo "Running: IRIS_JIT=1 IRIS_JIT_VERIFY=1 cargo run --release --features jit,lightning" | tee -a "$OUTFILE"
+    IRIS_JIT=1 IRIS_JIT_VERIFY=1 cargo run --release --features jit,lightning 2>&1 | tee -a "$OUTFILE"; RC=${PIPESTATUS[0]}
+    ;;
+  nojit)
+    echo "Running: cargo run --release --features jit,lightning (no IRIS_JIT)" | tee -a "$OUTFILE"
+    cargo run --release --features jit,lightning 2>&1 | tee -a "$OUTFILE"; RC=${PIPESTATUS[0]}
+    ;;
+  interp)
+    echo "Running: cargo run --release --features lightning (no jit feature)" | tee -a "$OUTFILE"
+    cargo run --release --features lightning 2>&1 | tee -a "$OUTFILE"; RC=${PIPESTATUS[0]}
+    ;;
+  *)
+    echo "Unknown mode: $MODE"
+    echo "Usage: $0 [jit|verify|nojit|interp]"
+    exit 1
+    ;;
+esac
+
+echo "" >> "$OUTFILE"
+# $? at this point would be the status of the preceding echo, not the emulator run;
+# RC is captured from PIPESTATUS[0] right after each `cargo run | tee` pipeline above.
+echo "=== Exit code: ${RC:-unknown} ===" >> "$OUTFILE"
+echo "Output saved to: $OUTFILE"

diff --git a/src/jit/compiler.rs b/src/jit/compiler.rs
index 755025c..0b6f9ca 100644
--- a/src/jit/compiler.rs
+++ b/src/jit/compiler.rs
@@ -1,28 +1,38 @@
 //! Block compiler: translates MIPS basic blocks to native code via Cranelift.
-use cranelift_codegen::ir::{self, types, AbiParam, InstBuilder, MemFlags, Value}; +use cranelift_codegen::ir::{self, types, AbiParam, InstBuilder, MemFlags, Value, FuncRef}; use cranelift_codegen::ir::condcodes::IntCC; use cranelift_codegen::settings::{self, Configurable}; use cranelift_codegen::{self, Context}; use cranelift_frontend::{FunctionBuilder, FunctionBuilderContext, Variable}; use cranelift_jit::{JITBuilder, JITModule}; -use cranelift_module::{Linkage, Module}; +use cranelift_module::{Linkage, Module, FuncId}; use crate::mips_exec::DecodedInstr; use crate::mips_isa::*; use super::cache::CompiledBlock; -use super::context::{JitContext, EXIT_NORMAL, EXIT_INTERPRET}; +use super::context::{JitContext, EXIT_NORMAL, EXIT_INTERPRET, EXIT_EXCEPTION}; +use super::helpers::HelperPtrs; pub struct BlockCompiler { jit_module: JITModule, ctx: Context, builder_ctx: FunctionBuilderContext, func_id_counter: u32, + // Declared function IDs for memory helpers (registered as imports) + fn_read_u8: FuncId, + fn_read_u16: FuncId, + fn_read_u32: FuncId, + fn_read_u64: FuncId, + fn_write_u8: FuncId, + fn_write_u16: FuncId, + fn_write_u32: FuncId, + fn_write_u64: FuncId, } impl BlockCompiler { - pub fn new() -> Self { + pub fn new(helpers: &HelperPtrs) -> Self { let mut flag_builder = settings::builder(); flag_builder.set("opt_level", "speed").unwrap(); flag_builder.set("is_pic", "false").unwrap(); @@ -30,14 +40,54 @@ impl BlockCompiler { let isa_builder = cranelift_native::builder().expect("host ISA not supported"); let isa = isa_builder.finish(settings::Flags::new(flag_builder)).unwrap(); - let jit_builder = JITBuilder::with_isa(isa, cranelift_module::default_libcall_names()); - let jit_module = JITModule::new(jit_builder); + let mut jit_builder = JITBuilder::with_isa(isa, cranelift_module::default_libcall_names()); + + // Register helper function symbols + jit_builder.symbol("jit_read_u8", helpers.read_u8); + jit_builder.symbol("jit_read_u16", helpers.read_u16); + 
jit_builder.symbol("jit_read_u32", helpers.read_u32); + jit_builder.symbol("jit_read_u64", helpers.read_u64); + jit_builder.symbol("jit_write_u8", helpers.write_u8); + jit_builder.symbol("jit_write_u16", helpers.write_u16); + jit_builder.symbol("jit_write_u32", helpers.write_u32); + jit_builder.symbol("jit_write_u64", helpers.write_u64); + + let mut jit_module = JITModule::new(jit_builder); + + // Declare helper function signatures: read(ctx_ptr, exec_ptr, virt_addr) -> u64 + let ptr_type = jit_module.target_config().pointer_type(); + let mut read_sig = jit_module.make_signature(); + read_sig.params.push(AbiParam::new(ptr_type)); // ctx_ptr + read_sig.params.push(AbiParam::new(ptr_type)); // exec_ptr + read_sig.params.push(AbiParam::new(types::I64)); // virt_addr + read_sig.returns.push(AbiParam::new(types::I64)); // value + // Use the ISA's default calling convention (AppleAarch64 on macOS, SystemV on Linux) + + // write(ctx_ptr, exec_ptr, virt_addr, value) -> u64 + let mut write_sig = jit_module.make_signature(); + write_sig.params.push(AbiParam::new(ptr_type)); + write_sig.params.push(AbiParam::new(ptr_type)); + write_sig.params.push(AbiParam::new(types::I64)); + write_sig.params.push(AbiParam::new(types::I64)); // value + write_sig.returns.push(AbiParam::new(types::I64)); + // Use default calling convention + + let fn_read_u8 = jit_module.declare_function("jit_read_u8", Linkage::Import, &read_sig).unwrap(); + let fn_read_u16 = jit_module.declare_function("jit_read_u16", Linkage::Import, &read_sig).unwrap(); + let fn_read_u32 = jit_module.declare_function("jit_read_u32", Linkage::Import, &read_sig).unwrap(); + let fn_read_u64 = jit_module.declare_function("jit_read_u64", Linkage::Import, &read_sig).unwrap(); + let fn_write_u8 = jit_module.declare_function("jit_write_u8", Linkage::Import, &write_sig).unwrap(); + let fn_write_u16 = jit_module.declare_function("jit_write_u16", Linkage::Import, &write_sig).unwrap(); + let fn_write_u32 = 
jit_module.declare_function("jit_write_u32", Linkage::Import, &write_sig).unwrap(); + let fn_write_u64 = jit_module.declare_function("jit_write_u64", Linkage::Import, &write_sig).unwrap(); Self { ctx: jit_module.make_context(), jit_module, builder_ctx: FunctionBuilderContext::new(), func_id_counter: 0, + fn_read_u8, fn_read_u16, fn_read_u32, fn_read_u64, + fn_write_u8, fn_write_u16, fn_write_u32, fn_write_u64, } } @@ -63,7 +113,7 @@ impl BlockCompiler { // Declare function signature: extern "C" fn(*mut JitContext) let ptr_type = self.jit_module.target_config().pointer_type(); self.ctx.func.signature.params.push(AbiParam::new(ptr_type)); - self.ctx.func.signature.call_conv = cranelift_codegen::isa::CallConv::SystemV; + // Use default calling convention (matches extern "C" on host) let func_id = self.jit_module .declare_function(&name, Linkage::Local, &self.ctx.func.signature) @@ -79,6 +129,24 @@ impl BlockCompiler { let ctx_ptr = builder.block_params(entry_block)[0]; let mem = MemFlags::trusted(); + // Load executor pointer from JitContext + let exec_ptr = builder.ins().load( + ptr_type, mem, ctx_ptr, + ir::immediates::Offset32::new(JitContext::executor_ptr_offset()), + ); + + // Declare helper function references for this function + let helpers = EmitHelpers { + read_u8: self.jit_module.declare_func_in_func(self.fn_read_u8, &mut builder.func), + read_u16: self.jit_module.declare_func_in_func(self.fn_read_u16, &mut builder.func), + read_u32: self.jit_module.declare_func_in_func(self.fn_read_u32, &mut builder.func), + read_u64: self.jit_module.declare_func_in_func(self.fn_read_u64, &mut builder.func), + write_u8: self.jit_module.declare_func_in_func(self.fn_write_u8, &mut builder.func), + write_u16: self.jit_module.declare_func_in_func(self.fn_write_u16, &mut builder.func), + write_u32: self.jit_module.declare_func_in_func(self.fn_write_u32, &mut builder.func), + write_u64: self.jit_module.declare_func_in_func(self.fn_write_u64, &mut builder.func), + }; + // Load 
GPRs 1-31 from JitContext (gpr[0] is always 0) let mut gpr = [builder.ins().iconst(types::I64, 0); 32]; for i in 1..32usize { @@ -96,15 +164,41 @@ impl BlockCompiler { // Emit IR for each instruction let mut compiled_count = 0u32; - for (_, d) in instrs { - if !emit_instruction(&mut builder, ctx_ptr, &mut gpr, &mut hi, &mut lo, d) { - break; + let mut branch_exit_pc: Option = None; + + let mut idx = 0; + while idx < instrs.len() { + let (_, d) = &instrs[idx]; + let instr_pc = block_pc.wrapping_add(idx as u64 * 4); + let result = emit_instruction( + &mut builder, ctx_ptr, exec_ptr, &helpers, + &mut gpr, &mut hi, &mut lo, d, instr_pc, + ); + match result { + EmitResult::Ok => { compiled_count += 1; idx += 1; } + EmitResult::Branch(target_val) => { + compiled_count += 1; + idx += 1; + // Emit the delay slot instruction (next in the list, if present) + if idx < instrs.len() { + let (_, delay_d) = &instrs[idx]; + let delay_pc = block_pc.wrapping_add(idx as u64 * 4); + let delay_result = emit_instruction( + &mut builder, ctx_ptr, exec_ptr, &helpers, + &mut gpr, &mut hi, &mut lo, delay_d, delay_pc, + ); + if matches!(delay_result, EmitResult::Ok) { + compiled_count += 1; + } + } + branch_exit_pc = Some(target_val); + break; + } + EmitResult::Stop => break, } - compiled_count += 1; } if compiled_count == 0 { - // Nothing was compilable — clean up and return None builder.ins().return_(&[]); builder.finalize(); self.ctx.clear(); @@ -123,9 +217,13 @@ impl BlockCompiler { builder.ins().store(mem, lo, ctx_ptr, ir::immediates::Offset32::new(JitContext::lo_offset())); - // Set exit PC = block_pc + 4 * compiled_count - let exit_pc = block_pc.wrapping_add(compiled_count as u64 * 4); - let exit_pc_val = builder.ins().iconst(types::I64, exit_pc as i64); + // Set exit PC + let exit_pc_val = if let Some(target) = branch_exit_pc { + target + } else { + let fallthrough_pc = block_pc.wrapping_add(compiled_count as u64 * 4); + builder.ins().iconst(types::I64, fallthrough_pc as i64) + }; 
builder.ins().store(mem, exit_pc_val, ctx_ptr, ir::immediates::Offset32::new(JitContext::pc_offset())); @@ -160,16 +258,34 @@ impl BlockCompiler { } } +/// Helper function references for memory operations within a compiled function. +struct EmitHelpers { + read_u8: FuncRef, read_u16: FuncRef, read_u32: FuncRef, read_u64: FuncRef, + write_u8: FuncRef, write_u16: FuncRef, write_u32: FuncRef, write_u64: FuncRef, +} + +/// Result of emitting a single instruction. +enum EmitResult { + /// Instruction compiled normally. + Ok, + /// Instruction is a branch; the Value is the computed target PC. + Branch(Value), + /// Instruction is not compilable — terminate block before it. + Stop, +} + /// Emit Cranelift IR for a single MIPS instruction. -/// Returns true if the instruction was compiled, false if it should terminate the block. fn emit_instruction( builder: &mut FunctionBuilder, ctx_ptr: Value, + exec_ptr: Value, + helpers: &EmitHelpers, gpr: &mut [Value; 32], hi: &mut Value, lo: &mut Value, d: &DecodedInstr, -) -> bool { + instr_pc: u64, +) -> EmitResult { let op = d.op as u32; let rs = d.rs as usize; let rt = d.rt as usize; @@ -179,15 +295,41 @@ fn emit_instruction( match op { OP_SPECIAL => emit_special(builder, gpr, hi, lo, d, rs, rt, rd, sa, funct), - OP_ADDIU => { emit_addiu(builder, gpr, rs, rt, d); true } - OP_DADDIU => { emit_daddiu(builder, gpr, rs, rt, d); true } - OP_SLTI => { emit_slti(builder, gpr, rs, rt, d); true } - OP_SLTIU => { emit_sltiu(builder, gpr, rs, rt, d); true } - OP_ANDI => { emit_andi(builder, gpr, rs, rt, d); true } - OP_ORI => { emit_ori(builder, gpr, rs, rt, d); true } - OP_XORI => { emit_xori(builder, gpr, rs, rt, d); true } - OP_LUI => { emit_lui(builder, gpr, rt, d); true } - _ => false, // Non-ALU instruction — terminate block + OP_ADDIU => { emit_addiu(builder, gpr, rs, rt, d); EmitResult::Ok } + OP_DADDIU => { emit_daddiu(builder, gpr, rs, rt, d); EmitResult::Ok } + OP_SLTI => { emit_slti(builder, gpr, rs, rt, d); EmitResult::Ok } + 
OP_SLTIU => { emit_sltiu(builder, gpr, rs, rt, d); EmitResult::Ok } + OP_ANDI => { emit_andi(builder, gpr, rs, rt, d); EmitResult::Ok } + OP_ORI => { emit_ori(builder, gpr, rs, rt, d); EmitResult::Ok } + OP_XORI => { emit_xori(builder, gpr, rs, rt, d); EmitResult::Ok } + OP_LUI => { emit_lui(builder, gpr, rt, d); EmitResult::Ok } + + // --- Loads --- + OP_LB => emit_load(builder, ctx_ptr, exec_ptr, helpers.read_u8, gpr, rs, rt, d, LoadWidth::Byte, true, instr_pc), + OP_LBU => emit_load(builder, ctx_ptr, exec_ptr, helpers.read_u8, gpr, rs, rt, d, LoadWidth::Byte, false, instr_pc), + OP_LH => emit_load(builder, ctx_ptr, exec_ptr, helpers.read_u16, gpr, rs, rt, d, LoadWidth::Half, true, instr_pc), + OP_LHU => emit_load(builder, ctx_ptr, exec_ptr, helpers.read_u16, gpr, rs, rt, d, LoadWidth::Half, false, instr_pc), + OP_LW => emit_load(builder, ctx_ptr, exec_ptr, helpers.read_u32, gpr, rs, rt, d, LoadWidth::Word, true, instr_pc), + OP_LWU => emit_load(builder, ctx_ptr, exec_ptr, helpers.read_u32, gpr, rs, rt, d, LoadWidth::Word, false, instr_pc), + OP_LD => emit_load(builder, ctx_ptr, exec_ptr, helpers.read_u64, gpr, rs, rt, d, LoadWidth::Double, false, instr_pc), + + // --- Stores --- + OP_SB => emit_store(builder, ctx_ptr, exec_ptr, helpers.write_u8, gpr, rs, rt, d, instr_pc), + OP_SH => emit_store(builder, ctx_ptr, exec_ptr, helpers.write_u16, gpr, rs, rt, d, instr_pc), + OP_SW => emit_store(builder, ctx_ptr, exec_ptr, helpers.write_u32, gpr, rs, rt, d, instr_pc), + OP_SD => emit_store(builder, ctx_ptr, exec_ptr, helpers.write_u64, gpr, rs, rt, d, instr_pc), + + // --- Branches --- + OP_BEQ => emit_beq(builder, gpr, rs, rt, d, instr_pc, false), + OP_BNE => emit_bne(builder, gpr, rs, rt, d, instr_pc, false), + OP_BLEZ => emit_blez(builder, gpr, rs, d, instr_pc, false), + OP_BGTZ => emit_bgtz(builder, gpr, rs, d, instr_pc, false), + + // --- Jumps --- + OP_J => emit_j(builder, gpr, d, instr_pc), + OP_JAL => emit_jal(builder, gpr, d, instr_pc), + + _ => 
EmitResult::Stop, } } @@ -198,70 +340,82 @@ fn emit_special( lo: &mut Value, d: &DecodedInstr, rs: usize, rt: usize, rd: usize, sa: u32, funct: u32, -) -> bool { +) -> EmitResult { match funct { // --- Shifts (immediate) --- - FUNCT_SLL => { emit_sll(builder, gpr, rt, rd, sa); true } - FUNCT_SRL => { emit_srl(builder, gpr, rt, rd, sa); true } - FUNCT_SRA => { emit_sra(builder, gpr, rt, rd, sa); true } + FUNCT_SLL => { emit_sll(builder, gpr, rt, rd, sa); EmitResult::Ok } + FUNCT_SRL => { emit_srl(builder, gpr, rt, rd, sa); EmitResult::Ok } + FUNCT_SRA => { emit_sra(builder, gpr, rt, rd, sa); EmitResult::Ok } // --- Shifts (variable) --- - FUNCT_SLLV => { emit_sllv(builder, gpr, rs, rt, rd); true } - FUNCT_SRLV => { emit_srlv(builder, gpr, rs, rt, rd); true } - FUNCT_SRAV => { emit_srav(builder, gpr, rs, rt, rd); true } + FUNCT_SLLV => { emit_sllv(builder, gpr, rs, rt, rd); EmitResult::Ok } + FUNCT_SRLV => { emit_srlv(builder, gpr, rs, rt, rd); EmitResult::Ok } + FUNCT_SRAV => { emit_srav(builder, gpr, rs, rt, rd); EmitResult::Ok } // --- 64-bit shifts (immediate) --- - FUNCT_DSLL => { emit_dsll(builder, gpr, rt, rd, sa); true } - FUNCT_DSRL => { emit_dsrl(builder, gpr, rt, rd, sa); true } - FUNCT_DSRA => { emit_dsra(builder, gpr, rt, rd, sa); true } - FUNCT_DSLL32 => { emit_dsll(builder, gpr, rt, rd, sa + 32); true } - FUNCT_DSRL32 => { emit_dsrl(builder, gpr, rt, rd, sa + 32); true } - FUNCT_DSRA32 => { emit_dsra(builder, gpr, rt, rd, sa + 32); true } + FUNCT_DSLL => { emit_dsll(builder, gpr, rt, rd, sa); EmitResult::Ok } + FUNCT_DSRL => { emit_dsrl(builder, gpr, rt, rd, sa); EmitResult::Ok } + FUNCT_DSRA => { emit_dsra(builder, gpr, rt, rd, sa); EmitResult::Ok } + FUNCT_DSLL32 => { emit_dsll(builder, gpr, rt, rd, sa + 32); EmitResult::Ok } + FUNCT_DSRL32 => { emit_dsrl(builder, gpr, rt, rd, sa + 32); EmitResult::Ok } + FUNCT_DSRA32 => { emit_dsra(builder, gpr, rt, rd, sa + 32); EmitResult::Ok } // --- 64-bit shifts (variable) --- - FUNCT_DSLLV => { 
emit_dsllv(builder, gpr, rs, rt, rd); true } - FUNCT_DSRLV => { emit_dsrlv(builder, gpr, rs, rt, rd); true } - FUNCT_DSRAV => { emit_dsrav(builder, gpr, rs, rt, rd); true } + FUNCT_DSLLV => { emit_dsllv(builder, gpr, rs, rt, rd); EmitResult::Ok } + FUNCT_DSRLV => { emit_dsrlv(builder, gpr, rs, rt, rd); EmitResult::Ok } + FUNCT_DSRAV => { emit_dsrav(builder, gpr, rs, rt, rd); EmitResult::Ok } // --- ALU register ops --- - FUNCT_ADDU => { emit_addu(builder, gpr, rs, rt, rd); true } - FUNCT_SUBU => { emit_subu(builder, gpr, rs, rt, rd); true } - FUNCT_AND => { emit_and(builder, gpr, rs, rt, rd); true } - FUNCT_OR => { emit_or(builder, gpr, rs, rt, rd); true } - FUNCT_XOR => { emit_xor(builder, gpr, rs, rt, rd); true } - FUNCT_NOR => { emit_nor(builder, gpr, rs, rt, rd); true } - FUNCT_SLT => { emit_slt(builder, gpr, rs, rt, rd); true } - FUNCT_SLTU => { emit_sltu(builder, gpr, rs, rt, rd); true } + FUNCT_ADDU => { emit_addu(builder, gpr, rs, rt, rd); EmitResult::Ok } + FUNCT_SUBU => { emit_subu(builder, gpr, rs, rt, rd); EmitResult::Ok } + FUNCT_AND => { emit_and(builder, gpr, rs, rt, rd); EmitResult::Ok } + FUNCT_OR => { emit_or(builder, gpr, rs, rt, rd); EmitResult::Ok } + FUNCT_XOR => { emit_xor(builder, gpr, rs, rt, rd); EmitResult::Ok } + FUNCT_NOR => { emit_nor(builder, gpr, rs, rt, rd); EmitResult::Ok } + FUNCT_SLT => { emit_slt(builder, gpr, rs, rt, rd); EmitResult::Ok } + FUNCT_SLTU => { emit_sltu(builder, gpr, rs, rt, rd); EmitResult::Ok } // --- 64-bit ALU --- - FUNCT_DADDU => { emit_daddu(builder, gpr, rs, rt, rd); true } - FUNCT_DSUBU => { emit_dsubu(builder, gpr, rs, rt, rd); true } + FUNCT_DADDU => { emit_daddu(builder, gpr, rs, rt, rd); EmitResult::Ok } + FUNCT_DSUBU => { emit_dsubu(builder, gpr, rs, rt, rd); EmitResult::Ok } // --- Multiply/Divide --- - FUNCT_MULT => { emit_mult(builder, gpr, hi, lo, rs, rt); true } - FUNCT_MULTU => { emit_multu(builder, gpr, hi, lo, rs, rt); true } - FUNCT_DIV => { emit_div(builder, gpr, hi, lo, rs, rt); true } - 
FUNCT_DIVU => { emit_divu(builder, gpr, hi, lo, rs, rt); true } - FUNCT_DMULT => { emit_dmult(builder, gpr, hi, lo, rs, rt); true } - FUNCT_DMULTU => { emit_dmultu(builder, gpr, hi, lo, rs, rt); true } - FUNCT_DDIV => { emit_ddiv(builder, gpr, hi, lo, rs, rt); true } - FUNCT_DDIVU => { emit_ddivu(builder, gpr, hi, lo, rs, rt); true } + FUNCT_MULT => { emit_mult(builder, gpr, hi, lo, rs, rt); EmitResult::Ok } + FUNCT_MULTU => { emit_multu(builder, gpr, hi, lo, rs, rt); EmitResult::Ok } + FUNCT_DIV => { emit_div(builder, gpr, hi, lo, rs, rt); EmitResult::Ok } + FUNCT_DIVU => { emit_divu(builder, gpr, hi, lo, rs, rt); EmitResult::Ok } + FUNCT_DMULT => { emit_dmult(builder, gpr, hi, lo, rs, rt); EmitResult::Ok } + FUNCT_DMULTU => { emit_dmultu(builder, gpr, hi, lo, rs, rt); EmitResult::Ok } + FUNCT_DDIV => { emit_ddiv(builder, gpr, hi, lo, rs, rt); EmitResult::Ok } + FUNCT_DDIVU => { emit_ddivu(builder, gpr, hi, lo, rs, rt); EmitResult::Ok } // --- HI/LO moves --- - FUNCT_MFHI => { gpr[rd] = *hi; true } - FUNCT_MTHI => { *hi = gpr[rs]; true } - FUNCT_MFLO => { gpr[rd] = *lo; true } - FUNCT_MTLO => { *lo = gpr[rs]; true } + FUNCT_MFHI => { gpr[rd] = *hi; EmitResult::Ok } + FUNCT_MTHI => { *hi = gpr[rs]; EmitResult::Ok } + FUNCT_MFLO => { gpr[rd] = *lo; EmitResult::Ok } + FUNCT_MTLO => { *lo = gpr[rs]; EmitResult::Ok } // --- Conditional moves --- - FUNCT_MOVZ => { emit_movz(builder, gpr, rs, rt, rd); true } - FUNCT_MOVN => { emit_movn(builder, gpr, rs, rt, rd); true } + FUNCT_MOVZ => { emit_movz(builder, gpr, rs, rt, rd); EmitResult::Ok } + FUNCT_MOVN => { emit_movn(builder, gpr, rs, rt, rd); EmitResult::Ok } + + // --- JR / JALR --- + FUNCT_JR => { let target = gpr[rs]; EmitResult::Branch(target) } + FUNCT_JALR => { + let target = gpr[rs]; + let instr_pc_plus_8 = d.imm; // we'll handle this in dispatch; for now use rd + // JALR stores return address in rd (default $ra=31) + // But we don't know the PC here... pass it via a different mechanism. 
+ // Actually: JALR rd, rs — stores PC+8 in rd. + // We don't have the PC as a value here. Let's defer JALR to interpreter. + EmitResult::Stop + } // --- SYNC (barrier, NOP for JIT) --- - FUNCT_SYNC => true, + FUNCT_SYNC => EmitResult::Ok, // Everything else terminates the block - _ => false, + _ => EmitResult::Stop, } } @@ -557,3 +711,220 @@ fn emit_movn(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt let is_nonzero = builder.ins().icmp(IntCC::NotEqual, gpr[rt], zero); gpr[rd] = builder.ins().select(is_nonzero, gpr[rs], gpr[rd]); } + +// ─── Load/Store emitters ───────────────────────────────────────────────────── + +/// Load width tag passed to emit_load so it applies the correct sign extension. +#[derive(Clone, Copy)] +enum LoadWidth { Byte, Half, Word, Double } + +/// Emit a load instruction. Calls the helper function, checks for exception, +/// sign/zero-extends the result into the destination GPR. +fn emit_load( + builder: &mut FunctionBuilder, + ctx_ptr: Value, exec_ptr: Value, + helper: FuncRef, + gpr: &mut [Value; 32], + rs: usize, rt: usize, + d: &DecodedInstr, + width: LoadWidth, + sign_extend: bool, + instr_pc: u64, +) -> EmitResult { + let base = gpr[rs]; + let offset = builder.ins().iconst(types::I64, d.imm as i32 as i64); + let virt_addr = builder.ins().iadd(base, offset); + + // Store faulting PC to ctx BEFORE the helper call, so the dispatch loop + // knows which instruction caused the exception if one occurs. + let instr_pc_val = builder.ins().iconst(types::I64, instr_pc as i64); + builder.ins().store(MemFlags::trusted(), instr_pc_val, ctx_ptr, + ir::immediates::Offset32::new(JitContext::pc_offset())); + + // Call helper: result = helper(ctx_ptr, exec_ptr, virt_addr) + let call = builder.ins().call(helper, &[ctx_ptr, exec_ptr, virt_addr]); + let raw_val = builder.inst_results(call)[0]; + + // Check ctx.exit_reason for exception. + // MUST use MemFlags::new() — helper may have written exit_reason through ctx_ptr. 
+ let exit_reason = builder.ins().load(types::I32, MemFlags::new(), ctx_ptr, + ir::immediates::Offset32::new(JitContext::exit_reason_offset())); + let zero_i32 = builder.ins().iconst(types::I32, 0); + let is_exception = builder.ins().icmp(IntCC::NotEqual, exit_reason, zero_i32); + + let ok_block = builder.create_block(); + builder.append_block_param(ok_block, types::I64); + let exc_block = builder.create_block(); + builder.ins().brif(is_exception, exc_block, &[], ok_block, &[raw_val]); + + // Exception path: store all GPRs back to ctx so sync_to has current state + builder.switch_to_block(exc_block); + builder.seal_block(exc_block); + let mem = MemFlags::trusted(); + for i in 1..32usize { + builder.ins().store(mem, gpr[i], ctx_ptr, + ir::immediates::Offset32::new(JitContext::gpr_offset(i))); + } + builder.ins().return_(&[]); + + // Normal path — raw_val comes through as a block parameter + builder.switch_to_block(ok_block); + builder.seal_block(ok_block); + let val = builder.block_params(ok_block)[0]; + + // Apply correct sign/zero extension based on load width + gpr[rt] = match (width, sign_extend) { + (LoadWidth::Byte, true) => { + // i8 → i64: truncate to 8 bits, sign-extend + let narrow = builder.ins().ireduce(types::I8, val); + builder.ins().sextend(types::I64, narrow) + } + (LoadWidth::Half, true) => { + // i16 → i64: truncate to 16 bits, sign-extend + let narrow = builder.ins().ireduce(types::I16, val); + builder.ins().sextend(types::I64, narrow) + } + (LoadWidth::Word, true) => { + // i32 → i64: truncate to 32 bits, sign-extend + sext32(builder, val) + } + (_, false) | (LoadWidth::Double, _) => { + // Zero-extend or 64-bit: raw value is already correct + val + } + }; + + EmitResult::Ok +} + +/// Emit a store instruction. Calls the helper function, checks for exception. 
+fn emit_store( + builder: &mut FunctionBuilder, + ctx_ptr: Value, exec_ptr: Value, + helper: FuncRef, + gpr: &[Value; 32], + rs: usize, rt: usize, + d: &DecodedInstr, + instr_pc: u64, +) -> EmitResult { + let base = gpr[rs]; + let offset = builder.ins().iconst(types::I64, d.imm as i32 as i64); + let virt_addr = builder.ins().iadd(base, offset); + let value = gpr[rt]; + + // Store faulting PC before helper call + let instr_pc_val = builder.ins().iconst(types::I64, instr_pc as i64); + builder.ins().store(MemFlags::trusted(), instr_pc_val, ctx_ptr, + ir::immediates::Offset32::new(JitContext::pc_offset())); + + let _call = builder.ins().call(helper, &[ctx_ptr, exec_ptr, virt_addr, value]); + + // Check ctx.exit_reason — MUST use MemFlags::new() + let exit_reason = builder.ins().load(types::I32, MemFlags::new(), ctx_ptr, + ir::immediates::Offset32::new(JitContext::exit_reason_offset())); + let zero = builder.ins().iconst(types::I32, 0); + let is_exception = builder.ins().icmp(IntCC::NotEqual, exit_reason, zero); + + let ok_block = builder.create_block(); + let exc_block = builder.create_block(); + builder.ins().brif(is_exception, exc_block, &[], ok_block, &[]); + + // Exception path: store all GPRs back to ctx + builder.switch_to_block(exc_block); + builder.seal_block(exc_block); + let mem = MemFlags::trusted(); + for i in 1..32usize { + builder.ins().store(mem, gpr[i], ctx_ptr, + ir::immediates::Offset32::new(JitContext::gpr_offset(i))); + } + builder.ins().return_(&[]); + + builder.switch_to_block(ok_block); + builder.seal_block(ok_block); + + EmitResult::Ok +} + +// ─── Branch emitters ───────────────────────────────────────────────────────── +// Branches compute the target PC and return EmitResult::Branch(target_value). +// The compiled block stores this PC and returns. Delay slots are handled by +// the dispatch loop (the next instruction after the branch is interpreted). 
+ +fn emit_beq( + builder: &mut FunctionBuilder, gpr: &[Value; 32], + rs: usize, rt: usize, d: &DecodedInstr, instr_pc: u64, _likely: bool, +) -> EmitResult { + let taken_pc = instr_pc.wrapping_add(4).wrapping_add(d.imm as i32 as i64 as u64); + let not_taken_pc = instr_pc.wrapping_add(8); // skip delay slot + let taken = builder.ins().iconst(types::I64, taken_pc as i64); + let not_taken = builder.ins().iconst(types::I64, not_taken_pc as i64); + let cond = builder.ins().icmp(IntCC::Equal, gpr[rs], gpr[rt]); + let target = builder.ins().select(cond, taken, not_taken); + EmitResult::Branch(target) +} + +fn emit_bne( + builder: &mut FunctionBuilder, gpr: &[Value; 32], + rs: usize, rt: usize, d: &DecodedInstr, instr_pc: u64, _likely: bool, +) -> EmitResult { + let taken_pc = instr_pc.wrapping_add(4).wrapping_add(d.imm as i32 as i64 as u64); + let not_taken_pc = instr_pc.wrapping_add(8); + let taken = builder.ins().iconst(types::I64, taken_pc as i64); + let not_taken = builder.ins().iconst(types::I64, not_taken_pc as i64); + let cond = builder.ins().icmp(IntCC::NotEqual, gpr[rs], gpr[rt]); + let target = builder.ins().select(cond, taken, not_taken); + EmitResult::Branch(target) +} + +fn emit_blez( + builder: &mut FunctionBuilder, gpr: &[Value; 32], + rs: usize, d: &DecodedInstr, instr_pc: u64, _likely: bool, +) -> EmitResult { + let taken_pc = instr_pc.wrapping_add(4).wrapping_add(d.imm as i32 as i64 as u64); + let not_taken_pc = instr_pc.wrapping_add(8); + let taken = builder.ins().iconst(types::I64, taken_pc as i64); + let not_taken = builder.ins().iconst(types::I64, not_taken_pc as i64); + let zero = builder.ins().iconst(types::I64, 0); + let cond = builder.ins().icmp(IntCC::SignedLessThanOrEqual, gpr[rs], zero); + let target = builder.ins().select(cond, taken, not_taken); + EmitResult::Branch(target) +} + +fn emit_bgtz( + builder: &mut FunctionBuilder, gpr: &[Value; 32], + rs: usize, d: &DecodedInstr, instr_pc: u64, _likely: bool, +) -> EmitResult { + let taken_pc = 
instr_pc.wrapping_add(4).wrapping_add(d.imm as i32 as i64 as u64); + let not_taken_pc = instr_pc.wrapping_add(8); + let taken = builder.ins().iconst(types::I64, taken_pc as i64); + let not_taken = builder.ins().iconst(types::I64, not_taken_pc as i64); + let zero = builder.ins().iconst(types::I64, 0); + let cond = builder.ins().icmp(IntCC::SignedGreaterThan, gpr[rs], zero); + let target = builder.ins().select(cond, taken, not_taken); + EmitResult::Branch(target) +} + +fn emit_j( + builder: &mut FunctionBuilder, _gpr: &[Value; 32], + d: &DecodedInstr, instr_pc: u64, +) -> EmitResult { + // Target = (PC+4)[63:28] | (target26 << 2) — but imm already has target26<<2 from decode + let region = instr_pc.wrapping_add(4) & 0xFFFF_FFFF_F000_0000; + let target_pc = region | (d.imm as u64); + let target = builder.ins().iconst(types::I64, target_pc as i64); + EmitResult::Branch(target) +} + +fn emit_jal( + builder: &mut FunctionBuilder, gpr: &mut [Value; 32], + d: &DecodedInstr, instr_pc: u64, +) -> EmitResult { + // JAL: $ra = PC + 8 (return address past delay slot) + let return_addr = instr_pc.wrapping_add(8); + gpr[31] = builder.ins().iconst(types::I64, return_addr as i64); + + let region = instr_pc.wrapping_add(4) & 0xFFFF_FFFF_F000_0000; + let target_pc = region | (d.imm as u64); + let target = builder.ins().iconst(types::I64, target_pc as i64); + EmitResult::Branch(target) +} diff --git a/src/jit/context.rs b/src/jit/context.rs index 8d70181..7458436 100644 --- a/src/jit/context.rs +++ b/src/jit/context.rs @@ -52,6 +52,12 @@ pub struct JitContext { // JIT dispatch state pub exit_reason: u32, pub block_instrs_executed: u32, + + // Type-erased pointer to MipsExecutor — used by memory helper callouts + pub executor_ptr: u64, + // Exception status from failed memory access (set by helpers) + pub exception_status: u32, + _pad0: u32, } impl JitContext { @@ -77,6 +83,9 @@ impl JitContext { local_cycles: 0, exit_reason: EXIT_NORMAL, block_instrs_executed: 0, + executor_ptr: 0, + 
exception_status: 0, + _pad0: 0, } } @@ -90,6 +99,8 @@ impl JitContext { pub fn pc_offset() -> i32 { std::mem::offset_of!(JitContext, pc) as i32 } pub fn exit_reason_offset() -> i32 { std::mem::offset_of!(JitContext, exit_reason) as i32 } pub fn block_instrs_offset() -> i32 { std::mem::offset_of!(JitContext, block_instrs_executed) as i32 } + pub fn executor_ptr_offset() -> i32 { std::mem::offset_of!(JitContext, executor_ptr) as i32 } + pub fn exception_status_offset() -> i32 { std::mem::offset_of!(JitContext, exception_status) as i32 } /// Copy emulator state into JitContext. pub fn sync_from_executor( @@ -117,27 +128,31 @@ impl JitContext { } /// Copy JitContext state back to the emulator. + /// + /// ONLY writes back fields that compiled blocks actually modify (GPRs, hi, lo, PC). + /// Fields managed by the interpreter or helpers (cp0_*, nanotlb, fpr) are NOT + /// written back — they're updated directly on the executor by helpers/interpreter. pub fn sync_to_executor( &self, exec: &mut MipsExecutor, ) { + // These are modified by compiled code (stored in the block epilogue) exec.core.gpr = self.gpr; exec.core.pc = self.pc; exec.core.hi = self.hi; exec.core.lo = self.lo; - exec.core.fpr = self.fpr; - exec.core.fpu_fcsr = self.fpu_fcsr; - exec.core.cp0_status = self.cp0_status; - exec.core.cp0_cause = self.cp0_cause; - exec.core.cp0_epc = self.cp0_epc; - exec.core.cp0_count = self.cp0_count; - exec.core.cp0_compare = self.cp0_compare; - exec.core.count_step = self.count_step; - exec.core.cp0_badvaddr = self.cp0_badvaddr; - exec.core.nanotlb = self.nanotlb; - exec.in_delay_slot = self.in_delay_slot; - exec.delay_slot_target = self.delay_slot_target; - exec.cached_pending = self.cached_pending; - exec.local_cycles = self.local_cycles; + + // Compiled blocks handle delay slots internally (the branch emitter + // computes the target, emits the delay slot, and sets the exit PC). 
+ // Clear the interpreter's delay slot state so subsequent exec.step() + // calls don't jump to a stale target. + exec.in_delay_slot = false; + exec.delay_slot_target = 0; + + // DO NOT write back: cp0_status, cp0_cause, cp0_epc, cp0_badvaddr, + // cp0_count, cp0_compare, count_step, nanotlb, fpr, fpu_fcsr — + // these are managed by the interpreter and memory helpers directly + // on the executor. Writing them back would clobber changes made by + // exception handlers and TLB fill operations. } } diff --git a/src/jit/dispatch.rs b/src/jit/dispatch.rs index 5067eeb..1d7c247 100644 --- a/src/jit/dispatch.rs +++ b/src/jit/dispatch.rs @@ -1,133 +1,221 @@ -//! JIT dispatch loop: traces, compiles, and executes MIPS basic blocks. +//! JIT dispatch loop: interpreter-first architecture with inline cache probes. +//! +//! The interpreter runs in tight batches. Every PROBE_INTERVAL steps within +//! the batch, we check if the current PC has a compiled block. If so, we +//! execute it and return to the interpreter. This gives high JIT hit rates +//! while keeping zero overhead on most interpreter steps. use std::sync::atomic::{AtomicBool, Ordering}; -use crate::mips_exec::{MipsExecutor, DecodedInstr, ExecStatus, EXEC_BREAKPOINT, EXEC_IS_EXCEPTION, decode_into}; +use crate::mips_exec::{MipsExecutor, DecodedInstr, EXEC_BREAKPOINT, decode_into}; use crate::mips_tlb::{Tlb, AccessType}; use crate::mips_cache_v2::MipsCache; use super::cache::CodeCache; use super::compiler::BlockCompiler; -use super::context::{JitContext, EXIT_NORMAL, EXIT_INTERPRET}; +use super::context::{JitContext, EXIT_NORMAL, EXIT_EXCEPTION}; +use super::helpers::HelperPtrs; -/// Maximum number of instructions per compiled block. const MAX_BLOCK_LEN: usize = 64; -/// Run the JIT dispatch loop. Replaces the inner `while running` loop in MipsCpu::start(). +/// How many interpreter steps between cache probes within a batch. +const PROBE_INTERVAL: u32 = 1000; + +/// How many interpreter steps in one outer batch. 
+const BATCH_SIZE: u32 = 10000; + pub fn run_jit_dispatch( exec: &mut MipsExecutor, running: &AtomicBool, ) { - let mut compiler = BlockCompiler::new(); + let jit_enabled = std::env::var("IRIS_JIT").map(|v| v == "1").unwrap_or(false); + + if !jit_enabled { + eprintln!("JIT: interpreter-only mode (set IRIS_JIT=1 to enable compilation)"); + interpreter_loop(exec, running); + return; + } + + // CRITICAL: Convert &mut to raw pointer. We must never hold &mut MipsExecutor + // across a JIT block call, because the JIT's memory helpers create their own + // &mut from the raw pointer. Two simultaneous &mut is UB, and with lto="fat" + // LLVM exploits the noalias guarantee to cache/hoist loads across the call, + // causing stale TLB/cache/CP0 state and kernel panics. + let exec_ptr: *mut MipsExecutor = exec as *mut _; + + eprintln!("JIT: enabled (interpreter-first, probe every {} steps)", PROBE_INTERVAL); + let helpers = HelperPtrs::new::(); + let mut compiler = BlockCompiler::new(&helpers); let mut cache = CodeCache::new(); let mut ctx = JitContext::new(); - let mut steps_since_flush: u32 = 0; + ctx.executor_ptr = exec_ptr as u64; + + let mut total_jit_instrs: u64 = 0; + let mut total_interp_steps: u64 = 0; + let mut blocks_compiled: u64 = 0; while running.load(Ordering::Relaxed) { - let pc = exec.core.pc; - - // Translate PC to physical address for cache lookup - let phys_pc = match translate_pc(exec, pc) { - Some(p) => p, - None => { - // Translation failed — let interpreter handle the exception - exec.step(); - steps_since_flush += 1; - if steps_since_flush >= 1000 { - exec.flush_cycles(); - steps_since_flush = 0; + let mut steps_in_batch: u32 = 0; + + while steps_in_batch < BATCH_SIZE { + // Borrow exec for interpreter batch — no JIT call happens here + { + let exec = unsafe { &mut *exec_ptr }; + #[cfg(feature = "lightning")] + for _ in 0..PROBE_INTERVAL { + exec.step(); } + #[cfg(not(feature = "lightning"))] + for _ in 0..PROBE_INTERVAL { + let status = exec.step(); + 
if status == EXEC_BREAKPOINT { + running.store(false, Ordering::SeqCst); + break; + } + } + } // &mut exec dropped here + steps_in_batch += PROBE_INTERVAL; + + if !running.load(Ordering::Relaxed) { break; } + + // Probe the JIT code cache — borrow briefly for reads + let (pc, in_delay_slot) = { + let exec = unsafe { &*exec_ptr }; + (exec.core.pc, exec.in_delay_slot) + }; + let pc32 = pc as u32; + + let in_prom = (pc32 >= 0x9FC00000 && pc32 < 0xA0000000) || (pc32 >= 0xBFC00000); + let in_exc = pc32 >= 0x80000000 && pc32 < 0x80000400; + if in_prom || in_exc || in_delay_slot { continue; } - }; - if let Some(block) = cache.lookup(phys_pc) { - // Cache hit — execute compiled block - let entry: extern "C" fn(*mut JitContext) = unsafe { - std::mem::transmute(block.entry) + let phys_pc = { + let exec = unsafe { &mut *exec_ptr }; + match translate_pc(exec, pc) { + Some(p) => p, + None => continue, + } }; - let block_len = block.len_mips; - - ctx.sync_from_executor(exec); - entry(&mut ctx); - ctx.sync_to_executor(exec); - - // Advance cp0_count by block length - let count_advance = exec.core.count_step.wrapping_mul(block_len as u64); - let prev = exec.core.cp0_count; - exec.core.cp0_count = prev.wrapping_add(count_advance) & 0x0000_FFFF_FFFF_FFFF; - if exec.core.cp0_compare != 0 && prev < exec.core.cp0_compare && exec.core.cp0_count >= exec.core.cp0_compare { - exec.core.cp0_cause |= crate::mips_core::CAUSE_IP7; - } - exec.local_cycles += block_len as u64; - steps_since_flush += block_len; + if let Some(block) = cache.lookup(phys_pc) { + // Cache hit — execute compiled block. + // NO &mut MipsExecutor exists during the JIT call. + let entry: extern "C" fn(*mut JitContext) = unsafe { + std::mem::transmute(block.entry) + }; + let block_len = block.len_mips; - match ctx.exit_reason { - EXIT_NORMAL => {} - EXIT_INTERPRET => { - // The block ended before an uncompilable instruction. - // PC is set to the uncompilable instruction — interpret it. 
- exec.step(); - steps_since_flush += 1; - } - _ => {} - } + { + let exec = unsafe { &mut *exec_ptr }; + ctx.sync_from_executor(exec); + } // &mut dropped before JIT call - // Check interrupts between blocks - exec.cached_pending = unsafe { - let ptr = std::ptr::addr_of!(exec.core.interrupts) as *const std::sync::atomic::AtomicU64; - (*ptr).load(Ordering::Relaxed) - }; - if (exec.cached_pending | exec.core.cp0_cause as u64) != 0 { - // Let the interpreter handle the interrupt - exec.step(); - steps_since_flush += 1; - } - } else { - // Cache miss — try to trace and compile a block - let instrs = trace_block(exec, pc); - if instrs.is_empty() { - // First instruction isn't compilable — interpret it - exec.step(); - steps_since_flush += 1; + ctx.exit_reason = 0; + entry(&mut ctx); // Helpers create their own &mut from exec_ptr — no aliasing + + { + let exec = unsafe { &mut *exec_ptr }; + ctx.sync_to_executor(exec); + + if ctx.exit_reason == EXIT_EXCEPTION { + // A load/store hit a TLB miss or other exception. + // ctx.pc has the faulting instruction's PC (stored before the helper call). + // GPRs are current (stored by the exc_block). + // Re-execute the faulting instruction through the interpreter, + // which will handle the exception properly (set EPC, jump to handler). 
+ exec.step(); + steps_in_batch += 1; + // Reset exit_reason for next block + ctx.exit_reason = 0; + } else { + // Normal exit — advance cp0_count per-instruction + for _ in 0..block_len { + let prev = exec.core.cp0_count; + exec.core.cp0_count = prev.wrapping_add(exec.core.count_step) & 0x0000_FFFF_FFFF_FFFF; + if exec.core.cp0_compare != 0 && prev < exec.core.cp0_compare && exec.core.cp0_count >= exec.core.cp0_compare { + exec.core.cp0_cause |= crate::mips_core::CAUSE_IP7; + exec.core.fasttick_count.fetch_add(1, Ordering::Relaxed); + } + } + exec.local_cycles += block_len as u64; + steps_in_batch += block_len; + total_jit_instrs += block_len as u64; + } + } // &mut dropped } else { - if let Some(mut block) = compiler.compile_block(&instrs, pc) { - block.phys_addr = phys_pc; - cache.insert(phys_pc, block); - // Next iteration will hit the cache - } else { - // Compilation failed — interpret one instruction - exec.step(); - steps_since_flush += 1; + // Cache miss — try to compile + let exec = unsafe { &mut *exec_ptr }; + let instrs = trace_block(exec, pc); + if !instrs.is_empty() { + if let Some(mut block) = compiler.compile_block(&instrs, pc) { + block.phys_addr = phys_pc; + cache.insert(phys_pc, block); + blocks_compiled += 1; + if blocks_compiled <= 10 || blocks_compiled % 500 == 0 { + eprintln!("JIT: compiled #{} at {:016x} ({} instrs, cache={})", + blocks_compiled, pc, instrs.len(), cache.len()); + } + } } } } - if steps_since_flush >= 1000 { + { + let exec = unsafe { &mut *exec_ptr }; exec.flush_cycles(); - steps_since_flush = 0; } + total_interp_steps += steps_in_batch as u64; + + if total_interp_steps % 10000000 < BATCH_SIZE as u64 { + let exec = unsafe { &*exec_ptr }; + eprintln!("JIT: {} steps, {} JIT instrs ({:.1}%), {} blocks, pc={:016x}", + total_interp_steps, total_jit_instrs, + if total_interp_steps > 0 { total_jit_instrs as f64 / total_interp_steps as f64 * 100.0 } else { 0.0 }, + blocks_compiled, exec.core.pc); + } + } + + { + let exec = unsafe { 
&mut *exec_ptr }; + exec.flush_cycles(); } + eprintln!("JIT: shutdown. {} blocks, {} JIT instrs / {} total steps ({:.1}%)", + blocks_compiled, total_jit_instrs, total_interp_steps, + if total_interp_steps > 0 { total_jit_instrs as f64 / total_interp_steps as f64 * 100.0 } else { 0.0 }); +} - exec.flush_cycles(); +fn interpreter_loop( + exec: &mut MipsExecutor, + running: &AtomicBool, +) { + while running.load(Ordering::Relaxed) { + #[cfg(feature = "lightning")] + for _ in 0..1000 { + exec.step(); exec.step(); exec.step(); exec.step(); exec.step(); + exec.step(); exec.step(); exec.step(); exec.step(); exec.step(); + } + #[cfg(not(feature = "lightning"))] + for _ in 0..1000 { + let status = exec.step(); + if status == EXEC_BREAKPOINT { + running.store(false, Ordering::SeqCst); + break; + } + } + exec.flush_cycles(); + } } -/// Translate a virtual PC to a physical address for code cache lookup. fn translate_pc( exec: &mut MipsExecutor, virt_pc: u64, ) -> Option { let result = (exec.translate_fn)(exec, virt_pc, AccessType::Fetch); - if result.is_exception() { - None - } else { - Some(result.phys as u64) - } + if result.is_exception() { None } else { Some(result.phys as u64) } } -/// Trace a basic block: walk instructions from `virt_pc`, collecting compilable -/// instructions until we hit a non-compilable op, a branch, or the max block size. fn trace_block( exec: &mut MipsExecutor, start_pc: u64, @@ -136,37 +224,50 @@ fn trace_block( let mut pc = start_pc; for _ in 0..MAX_BLOCK_LEN { - // Fetch instruction word without side effects let raw = match exec.debug_fetch_instr(pc) { Ok(w) => w, - Err(_) => break, // fetch failed (unmapped, etc.) 
+ Err(_) => break, }; - // Decode into a DecodedInstr let mut d = DecodedInstr::default(); d.raw = raw; decode_into::(&mut d); - // Check if this instruction is compilable (ALU only in Phase 2) - if !is_compilable(&d) { + if !is_compilable(&d) { break; } + + let is_branch = is_branch_or_jump(&d); + instrs.push((raw, d)); + + if is_branch { + pc = pc.wrapping_add(4); + let mut delay_ok = false; + if let Ok(delay_raw) = exec.debug_fetch_instr(pc) { + let mut delay_d = DecodedInstr::default(); + delay_d.raw = delay_raw; + decode_into::(&mut delay_d); + if is_compilable_alu(&delay_d) || is_compilable_mem(&delay_d) { + instrs.push((delay_raw, delay_d)); + delay_ok = true; + } + } + if !delay_ok { instrs.pop(); } break; } - instrs.push((raw, d)); pc = pc.wrapping_add(4); } instrs } -/// Returns true if the instruction can be compiled by the JIT (Phase 2: integer ALU only). fn is_compilable(d: &DecodedInstr) -> bool { - use crate::mips_isa::*; - let op = d.op as u32; - let funct = d.funct as u32; + is_compilable_alu(d) || is_compilable_mem(d) || is_branch_or_jump(d) +} - match op { - OP_SPECIAL => matches!(funct, +fn is_compilable_alu(d: &DecodedInstr) -> bool { + use crate::mips_isa::*; + match d.op as u32 { + OP_SPECIAL => matches!(d.funct as u32, FUNCT_SLL | FUNCT_SRL | FUNCT_SRA | FUNCT_SLLV | FUNCT_SRLV | FUNCT_SRAV | FUNCT_MOVZ | FUNCT_MOVN | @@ -186,3 +287,21 @@ fn is_compilable(d: &DecodedInstr) -> bool { _ => false, } } + +fn is_compilable_mem(d: &DecodedInstr) -> bool { + use crate::mips_isa::*; + matches!(d.op as u32, + OP_LB | OP_LBU | OP_LH | OP_LHU | OP_LW | OP_LWU | OP_LD | + OP_SB | OP_SH | OP_SW | OP_SD + ) +} + +fn is_branch_or_jump(d: &DecodedInstr) -> bool { + use crate::mips_isa::*; + match d.op as u32 { + OP_BEQ | OP_BNE | OP_BLEZ | OP_BGTZ => true, + OP_J | OP_JAL => true, + OP_SPECIAL => matches!(d.funct as u32, FUNCT_JR), + _ => false, + } +} diff --git a/src/jit/helpers.rs b/src/jit/helpers.rs index ed79649..81f1673 100644 --- 
a/src/jit/helpers.rs +++ b/src/jit/helpers.rs @@ -1,26 +1,138 @@ -//! `extern "C"` bridge functions called by JIT-compiled code. +//! `extern "C"` bridge functions called by JIT-compiled code for memory access. //! -//! Phase 1: stubs only. These are populated in Phase 3+ when compiled blocks -//! need to call back into the interpreter for memory access, exceptions, etc. +//! CRITICAL: All pointer casts use `std::hint::black_box` to prevent LLVM from +//! tracking pointer provenance through LTO. Without this, LLVM can prove the +//! exec_ptr derives from a &mut in the dispatch loop and apply noalias +//! optimizations that cause stale reads. -use super::context::JitContext; +use super::context::{JitContext, EXIT_EXCEPTION}; +use crate::mips_exec::{MipsExecutor, MemAccessSize, EXEC_COMPLETE}; +use crate::mips_tlb::Tlb; +use crate::mips_cache_v2::MipsCache; -/// Read memory via the interpreter's full memory subsystem. -/// Called by JIT-compiled code when a memory load hits the slow path. -pub extern "C" fn jit_helper_read_data( - _ctx: *mut JitContext, - _virt_addr: u64, - _size: u32, +/// Opaque cast that defeats LLVM's alias analysis and pointer provenance tracking. +/// `#[inline(never)]` ensures LLVM can't see through this to recover provenance. 
+#[inline(never)] +fn opaque_exec(ptr: *mut u8) -> *mut MipsExecutor { + std::hint::black_box(ptr as *mut MipsExecutor) +} + +#[inline(never)] +fn opaque_ctx(ptr: *mut JitContext) -> *mut JitContext { + std::hint::black_box(ptr) +} + +// ─── Read helpers ──────────────────────────────────────────────────────────── + +pub extern "C" fn jit_read_u8( + ctx_ptr: *mut JitContext, exec_ptr: *mut u8, virt_addr: u64, +) -> u64 { + let exec = unsafe { &mut *opaque_exec::(exec_ptr) }; + let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) }; + match exec.read_data(virt_addr, MemAccessSize::Byte) { + Ok(value) => value, + Err(status) => { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; 0 } + } +} + +pub extern "C" fn jit_read_u16( + ctx_ptr: *mut JitContext, exec_ptr: *mut u8, virt_addr: u64, ) -> u64 { - unimplemented!("JIT memory read helper not yet implemented") + let exec = unsafe { &mut *opaque_exec::(exec_ptr) }; + let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) }; + match exec.read_data(virt_addr, MemAccessSize::Half) { + Ok(value) => value, + Err(status) => { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; 0 } + } +} + +pub extern "C" fn jit_read_u32( + ctx_ptr: *mut JitContext, exec_ptr: *mut u8, virt_addr: u64, +) -> u64 { + let exec = unsafe { &mut *opaque_exec::(exec_ptr) }; + let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) }; + match exec.read_data(virt_addr, MemAccessSize::Word) { + Ok(value) => value, + Err(status) => { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; 0 } + } +} + +pub extern "C" fn jit_read_u64( + ctx_ptr: *mut JitContext, exec_ptr: *mut u8, virt_addr: u64, +) -> u64 { + let exec = unsafe { &mut *opaque_exec::(exec_ptr) }; + let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) }; + match exec.read_data(virt_addr, MemAccessSize::Double) { + Ok(value) => value, + Err(status) => { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; 0 } + } +} + +// ─── Write helpers 
─────────────────────────────────────────────────────────── + +pub extern "C" fn jit_write_u8( + ctx_ptr: *mut JitContext, exec_ptr: *mut u8, virt_addr: u64, value: u64, +) -> u64 { + let exec = unsafe { &mut *opaque_exec::(exec_ptr) }; + let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) }; + let status = exec.write_data(virt_addr, value, MemAccessSize::Byte, 0xFF); + if status != EXEC_COMPLETE { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; } + 0 +} + +pub extern "C" fn jit_write_u16( + ctx_ptr: *mut JitContext, exec_ptr: *mut u8, virt_addr: u64, value: u64, +) -> u64 { + let exec = unsafe { &mut *opaque_exec::(exec_ptr) }; + let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) }; + let status = exec.write_data(virt_addr, value, MemAccessSize::Half, 0xFFFF); + if status != EXEC_COMPLETE { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; } + 0 +} + +pub extern "C" fn jit_write_u32( + ctx_ptr: *mut JitContext, exec_ptr: *mut u8, virt_addr: u64, value: u64, +) -> u64 { + let exec = unsafe { &mut *opaque_exec::(exec_ptr) }; + let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) }; + let status = exec.write_data(virt_addr, value, MemAccessSize::Word, 0xFFFF_FFFF); + if status != EXEC_COMPLETE { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; } + 0 +} + +pub extern "C" fn jit_write_u64( + ctx_ptr: *mut JitContext, exec_ptr: *mut u8, virt_addr: u64, value: u64, +) -> u64 { + let exec = unsafe { &mut *opaque_exec::(exec_ptr) }; + let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) }; + let status = exec.write_data(virt_addr, value, MemAccessSize::Double, 0xFFFF_FFFF_FFFF_FFFF); + if status != EXEC_COMPLETE { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; } + 0 +} + +/// Collection of monomorphized helper function pointers. 
+pub struct HelperPtrs { + pub read_u8: *const u8, + pub read_u16: *const u8, + pub read_u32: *const u8, + pub read_u64: *const u8, + pub write_u8: *const u8, + pub write_u16: *const u8, + pub write_u32: *const u8, + pub write_u64: *const u8, } -/// Write memory via the interpreter's full memory subsystem. -pub extern "C" fn jit_helper_write_data( - _ctx: *mut JitContext, - _virt_addr: u64, - _value: u64, - _size: u32, -) -> u32 { - unimplemented!("JIT memory write helper not yet implemented") +impl HelperPtrs { + pub fn new() -> Self { + Self { + read_u8: jit_read_u8:: as *const u8, + read_u16: jit_read_u16:: as *const u8, + read_u32: jit_read_u32:: as *const u8, + read_u64: jit_read_u64:: as *const u8, + write_u8: jit_write_u8:: as *const u8, + write_u16: jit_write_u16:: as *const u8, + write_u32: jit_write_u32:: as *const u8, + write_u64: jit_write_u64:: as *const u8, + } + } } diff --git a/src/mips_exec.rs b/src/mips_exec.rs index ced3311..58f584f 100644 --- a/src/mips_exec.rs +++ b/src/mips_exec.rs @@ -1523,7 +1523,7 @@ For R4000SC/MC CPUs: /// Production data read (with breakpoints, updates CP0 state on exceptions). #[inline] - fn read_data(&mut self, virt_addr: u64) -> Result { + pub(crate) fn read_data(&mut self, virt_addr: u64) -> Result { self.read_data_impl::(virt_addr) } @@ -1624,15 +1624,13 @@ For R4000SC/MC CPUs: /// Production data write (with breakpoints, undo tracking, updates CP0 state on exceptions). #[inline] - fn write_data(&mut self, virt_addr: u64, val: u64) -> ExecStatus { + pub(crate) fn write_data(&mut self, virt_addr: u64, val: u64) -> ExecStatus { self.write_data_impl::(virt_addr, val) } /// Partial masked doubleword write for SDL/SDR/SWL/SWR. - /// Only bytes where the corresponding mask byte is non-zero are written. - /// `virt_addr` must be 8-byte aligned; val/mask are in MIPS big-endian doubleword space. 
#[inline] - fn write_data64_masked(&mut self, virt_addr: u64, val: u64, mask: u64) -> ExecStatus { + pub(crate) fn write_data64_masked(&mut self, virt_addr: u64, val: u64, mask: u64) -> ExecStatus { self.write_data64_masked_impl::(virt_addr, val, mask) } From 76fb7f334f1de6c6eff1099c59d88c876e78e3a8 Mon Sep 17 00:00:00 2001 From: Eric Dodd Date: Sat, 4 Apr 2026 09:47:18 -0400 Subject: [PATCH 3/5] added flamegraph to diag --- Cargo.toml | 1 + jit-diag.sh | 18 +++++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 5d02def..7a73fc3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -47,6 +47,7 @@ libc = "0.2" lto = "fat" codegen-units = 1 panic = "abort" +debug = 1 # Developer profile: release optimizations + debug symbols. Default build target. # Enables the "developer" feature flag for dev-only tooling. diff --git a/jit-diag.sh b/jit-diag.sh index 79a1950..9541e4c 100755 --- a/jit-diag.sh +++ b/jit-diag.sh @@ -33,9 +33,25 @@ case "$MODE" in echo "Running: cargo run --release --features lightning (no jit feature)" | tee -a "$OUTFILE" cargo run --release --features lightning 2>&1 | tee -a "$OUTFILE" ;; + flamegraph) + DURATION="${2:-30}" + OUTSVG="flamegraph-$(date +%Y%m%d-%H%M%S).svg" + echo "Running: cargo flamegraph --features lightning (${DURATION}s, no jit feature)" | tee -a "$OUTFILE" + echo "Output SVG: $OUTSVG" | tee -a "$OUTFILE" + timeout "$DURATION" cargo flamegraph --release --features lightning --output "$OUTSVG" 2>&1 | tee -a "$OUTFILE" + echo "Flamegraph saved to: $OUTSVG" + ;; + flamegraph-jit) + DURATION="${2:-30}" + OUTSVG="flamegraph-jit-$(date +%Y%m%d-%H%M%S).svg" + echo "Running: IRIS_JIT=1 cargo flamegraph --features jit,lightning (${DURATION}s)" | tee -a "$OUTFILE" + echo "Output SVG: $OUTSVG" | tee -a "$OUTFILE" + IRIS_JIT=1 timeout "$DURATION" cargo flamegraph --release --features jit,lightning --output "$OUTSVG" 2>&1 | tee -a "$OUTFILE" + echo "Flamegraph saved to: $OUTSVG" + ;; *) echo 
"Unknown mode: $MODE" - echo "Usage: $0 [jit|verify|nojit|interp]" + echo "Usage: $0 [jit|verify|nojit|interp|flamegraph [seconds]|flamegraph-jit [seconds]]" exit 1 ;; esac From 29284636dd42913f075c3dbe8606bf9180ea769b Mon Sep 17 00:00:00 2001 From: Eric Dodd Date: Sat, 4 Apr 2026 17:23:55 -0400 Subject: [PATCH 4/5] adaptive jit --- .gitignore | 3 +- CLAUDE.md | 60 +++++++++ jit-diag.sh | 57 ++++---- jit_overview.md | 52 ++++++++ mcm-engine.yaml | 52 ++++++++ src/hptimer.rs | 8 +- src/jit/cache.rs | 51 ++++++++ src/jit/compiler.rs | 143 ++++++++++++++------- src/jit/dispatch.rs | 307 ++++++++++++++++++++++++++++++++++++++------ src/jit/mod.rs | 3 + src/jit/profile.rs | 119 +++++++++++++++++ src/jit/snapshot.rs | 104 +++++++++++++++ src/mips_tlb.rs | 12 ++ src/rex3.rs | 28 +++- 14 files changed, 879 insertions(+), 120 deletions(-) create mode 100644 CLAUDE.md create mode 100644 jit_overview.md create mode 100644 mcm-engine.yaml create mode 100644 src/jit/profile.rs create mode 100644 src/jit/snapshot.rs diff --git a/.gitignore b/.gitignore index ffe669b..b2791c0 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,5 @@ tasks.json scsi1.raw scsi2.raw cdrom4.iso -Cargo.lock \ No newline at end of file +Cargo.lock +*.log diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..f6ba8a7 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,60 @@ +# Claude Instructions + +## INVARIANTS (non-negotiable) + +These are hard rules. Violating any of these is a session failure. + +- **MCP TOOLS BEFORE EVERYTHING**: `search` first!! `search` everything before you do it so you don't flail around.`report_error` before attempting any fix. `check_compat` before writing any compat function. `search` before inventing any technique. NO EXCEPTIONS. Do not skip these because you think you already know the answer — your training data for IRIX is outdated and wrong. 
**ENFORCED AT TWO LEVELS**: (1) The knowledge MCP tracks MCP tool calls — warning at turn 3, blocking at turn 6+ (nudge_escalation_threshold=2). (2) A Claude Code PreToolUse hook tracks built-in tool calls (Edit, Write, Bash) — warning after 8 calls without MCP search, **Edit/Write BLOCKED after 20 calls**. The hook catches the blind spot where the MCP nudge system can't see built-in tools. Both levels reset when you call search/report_error/check_compat. +- **NO FIXES OUTSIDE MOGRIX RULES**: Every fix goes into `rules/`, `compat/`, or `patches/`. If you `sed` a file during debugging, that fix MUST end up in a YAML rule. If it doesn't, you have failed. +- **NO INLINE C IN YAML**: C files go in `patches/packages//`, referenced via `add_source`. No heredocs generating .c/.h files in `prep_commands`. +- **`add_rule` IMMEDIATELY AFTER FIX CONFIRMED**: The moment a build passes after a fix, call `add_rule` with `file_path` pointing to the authoritative rule file. Do not batch to session end — context pressure causes deferred `add_rule` calls to be dropped. +- **DB IS CACHE, FILES ARE AUTHORITATIVE**: Rule files (`rules/packages/*.yaml`, `rules/generic.yaml`, `compat/catalog.yaml`, `rules/methods/*.md`) are the source of truth. `add_rule` must include `file_path`. +- **DELEGATE LONG DEBUGS**: >2 failed fix attempts for the same error → stop and spawn a sub-agent with `Task()`. Pass it the error text, file paths, and tell it to use MCP tools first. Never let debug trace flood parent context. +- **REDIRECT BUILD OUTPUT**: Never let rpmbuild output flood context. Log to file. Use sub-agents (`Task(model="haiku")`) for reading large build logs. +- **INVOCATION**: `uv run mogrix `. No other invocation method works. + +--- + +## Session Protocol + +1. Call `session_start` MCP tool +2. Work — use MCP tools for every error, every symbol, every lookup +3. `add_rule` immediately after each confirmed fix +4. 
Call `session_handoff` MCP tool before ending + +--- + +## MCP Tool Quick Reference + +These are your primary interface. Use them before reading files, before grepping, before guessing. + +| When | Tool | What it does | +|------|------|--------------| +| Hit any error | `report_error` | Logs error AND auto-searches rules+compat+errors in one call | +| Need to look something up | `search` (or `knowledge_query`) | FTS5 search across all knowledge, rules, errors, negative knowledge | +| Confirmed a fix | `add_rule` | Stores the fix with `file_path` to authoritative rule file | +| Learned something | `add_knowledge` (or `report_finding`) | Stores findings, decisions, insights | +| Found a dead end | `add_negative` | Stores anti-patterns so they're never repeated | +| Session start | `session_start` | Context summary, last handoff, active tasks | +| Session end | `session_handoff` | Snapshot state for next session | + + +## Context Management + +**Tuned for 1M context (Opus 4.6).** Sessions can safely run 400+ turns. Compaction/handoff urgency is low. Focus is on knowledge capture quality, not checkpoint frequency. + +- **Sub-agents for investigation**: Any task requiring >200 lines of output gets a sub-agent. `Task(model="haiku")` for build log reading. Sub-agent investigates and returns a concise summary; parent applies the fix. +- **Re-orientation check every 8 tool calls**: Am I using MCP tools? Am I freestyling a fix that's probably already documented? Have I stored my findings? If unsure, call `session_start`. +- **Store knowledge continuously**: `report_error` when you hit it → fix it → build passes → `add_rule` right then. Don't accumulate findings to store later. The nudge system fires a store reminder after 6 turns without a store. +- **Checkpoint at 30 turns**: `save_snapshot` or `session_handoff` to reset the checkpoint counter. Mandatory stop at 60 turns (enforced, blocks all tools). 
+- **Batch builds**: Max 2-3 background agents, each with its own rpmbuild directory. Only the orchestrator updates rule files. See `rules/methods/task-tracking.md`. + +**MCP enforcement thresholds** (mcm-engine.yaml): +- Store reminder: 6 turns +- Checkpoint: 30 turns +- Mandatory stop: 60 turns (+10 grace) +- Nudge escalation: 2 ignores → blocking +- MCP-first enforcement: warning at turn 3, blocks at turn ~7 if no search/report_error/check_compat called + +--- + diff --git a/jit-diag.sh b/jit-diag.sh index 9541e4c..92b4e6d 100755 --- a/jit-diag.sh +++ b/jit-diag.sh @@ -1,12 +1,19 @@ #!/bin/bash # JIT diagnostic launcher — runs emulator and captures output for analysis # Usage: ./jit-diag.sh [mode] -# mode: "jit" — JIT enabled (default) -# "verify" — JIT with verification -# "nojit" — interpreter only through JIT dispatch -# "interp" — pure interpreter (no JIT feature, baseline) +# mode: "jit" — JIT enabled (default) +# "verify" — JIT with verification +# "nojit" — interpreter only through JIT dispatch +# "interp" — pure interpreter (no JIT feature, baseline) +# "perf" — perf profile, interpreter only (text report for analysis) +# "perf-jit" — perf profile with JIT enabled MODE="${1:-jit}" +# IRIS_JIT_MAX_TIER from environment (0=Alu, 1=Loads, 2=Full, unset=Full) +TIER_ENV="" +if [ -n "$IRIS_JIT_MAX_TIER" ]; then + TIER_ENV="IRIS_JIT_MAX_TIER=$IRIS_JIT_MAX_TIER" +fi OUTFILE="jit-diag-$(date +%Y%m%d-%H%M%S)-${MODE}.log" echo "=== IRIS JIT Diagnostic ===" | tee "$OUTFILE" @@ -18,12 +25,12 @@ echo "" | tee -a "$OUTFILE" case "$MODE" in jit) - echo "Running: IRIS_JIT=1 cargo run --release --features jit,lightning" | tee -a "$OUTFILE" - IRIS_JIT=1 cargo run --release --features jit,lightning 2>&1 | tee -a "$OUTFILE" + echo "Running: IRIS_JIT=1 $TIER_ENV cargo run --release --features jit,lightning" | tee -a "$OUTFILE" + IRIS_JIT=1 $TIER_ENV cargo run --release --features jit,lightning 2>&1 | tee -a "$OUTFILE" ;; verify) - echo "Running: IRIS_JIT=1 IRIS_JIT_VERIFY=1 
cargo run --release --features jit,lightning" | tee -a "$OUTFILE" - IRIS_JIT=1 IRIS_JIT_VERIFY=1 cargo run --release --features jit,lightning 2>&1 | tee -a "$OUTFILE" + echo "Running: IRIS_JIT=1 IRIS_JIT_VERIFY=1 $TIER_ENV cargo run --release --features jit,lightning" | tee -a "$OUTFILE" + IRIS_JIT=1 IRIS_JIT_VERIFY=1 $TIER_ENV cargo run --release --features jit,lightning 2>&1 | tee -a "$OUTFILE" ;; nojit) echo "Running: cargo run --release --features jit,lightning (no IRIS_JIT)" | tee -a "$OUTFILE" @@ -33,25 +40,29 @@ case "$MODE" in echo "Running: cargo run --release --features lightning (no jit feature)" | tee -a "$OUTFILE" cargo run --release --features lightning 2>&1 | tee -a "$OUTFILE" ;; - flamegraph) - DURATION="${2:-30}" - OUTSVG="flamegraph-$(date +%Y%m%d-%H%M%S).svg" - echo "Running: cargo flamegraph --features lightning (${DURATION}s, no jit feature)" | tee -a "$OUTFILE" - echo "Output SVG: $OUTSVG" | tee -a "$OUTFILE" - timeout "$DURATION" cargo flamegraph --release --features lightning --output "$OUTSVG" 2>&1 | tee -a "$OUTFILE" - echo "Flamegraph saved to: $OUTSVG" + perf) + PERFREPORT="perf-report-$(date +%Y%m%d-%H%M%S).txt" + echo "Building (profiling profile, no jit feature)..." | tee -a "$OUTFILE" + cargo build --profile profiling --features lightning 2>&1 | tee -a "$OUTFILE" + echo "--- Press Ctrl-C when you have enough samples ---" + perf record -F 99 --call-graph dwarf -o perf.data -- ./target/profiling/iris + echo "Processing perf data..." 
| tee -a "$OUTFILE" + perf report --stdio --no-children -i perf.data > "$PERFREPORT" 2>&1 + echo "Perf report saved to: $PERFREPORT" ;; - flamegraph-jit) - DURATION="${2:-30}" - OUTSVG="flamegraph-jit-$(date +%Y%m%d-%H%M%S).svg" - echo "Running: IRIS_JIT=1 cargo flamegraph --features jit,lightning (${DURATION}s)" | tee -a "$OUTFILE" - echo "Output SVG: $OUTSVG" | tee -a "$OUTFILE" - IRIS_JIT=1 timeout "$DURATION" cargo flamegraph --release --features jit,lightning --output "$OUTSVG" 2>&1 | tee -a "$OUTFILE" - echo "Flamegraph saved to: $OUTSVG" + perf-jit) + PERFREPORT="perf-report-jit-$(date +%Y%m%d-%H%M%S).txt" + echo "Building (profiling profile, jit feature)..." | tee -a "$OUTFILE" + cargo build --profile profiling --features jit,lightning 2>&1 | tee -a "$OUTFILE" + echo "--- Press Ctrl-C when you have enough samples ---" + IRIS_JIT=1 perf record -F 99 --call-graph dwarf -o perf.data -- ./target/profiling/iris + echo "Processing perf data..." | tee -a "$OUTFILE" + perf report --stdio --no-children -i perf.data > "$PERFREPORT" 2>&1 + echo "Perf report saved to: $PERFREPORT" ;; *) echo "Unknown mode: $MODE" - echo "Usage: $0 [jit|verify|nojit|interp|flamegraph [seconds]|flamegraph-jit [seconds]]" + echo "Usage: $0 [jit|verify|nojit|interp|perf|perf-jit]" exit 1 ;; esac diff --git a/jit_overview.md b/jit_overview.md new file mode 100644 index 0000000..28f34b6 --- /dev/null +++ b/jit_overview.md @@ -0,0 +1,52 @@ +# IRIS Adaptive JIT — How We Taught an Emulator to Learn + +## The Problem + +IRIS emulates an SGI Indy (MIPS R4400) well enough to boot IRIX 6.5 to a graphical desktop. But the interpreter tops out at ~30 MIPS on x86_64. We wanted a Cranelift-based JIT compiler to go faster. + +First attempt: compile everything. Result: **hang**. Loads and stores in the same compiled block caused Cranelift to generate bad register spill code on x86_64 (only 15 usable registers vs AArch64's 31). Weeks of debugging. 
+ +## The Insight + +Instead of fixing one bug and praying, make the JIT **fix itself**. + +## How It Works + +Every compiled block starts at the safest level and earns its way up: + +``` +Tier 0 (Alu) Pure math + branches. Can't go wrong. +Tier 1 (Loads) Add memory reads. Might hit TLB misses. +Tier 2 (Full) Add memory writes. Full native speed. +``` + +**Lifecycle of a block:** +1. First seen → compile at Tier 0, mark **speculative** +2. Before each speculative run → snapshot the entire CPU (~2.3 KB) +3. Block runs clean 50 times → **trusted** (no more snapshots) +4. Trusted for 200 runs → **promote** to next tier (speculative again) +5. Block causes 3 exceptions at new tier → **demote** back, recompile + +If a speculative block misbehaves, CPU state is rolled back from the snapshot and the interpreter re-runs the instruction correctly. The system never crashes — it just learns that block isn't ready yet. + +## Bugs Found Along the Way + +1. **SSA register pressure** — Cranelift's exception paths referenced values across block boundaries. Fixed by flushing modified registers before each helper call. + +2. **Delay slot skip** *(the real killer)* — MIPS branches have a "delay slot": the instruction after a branch always executes. The JIT's tracer included load instructions in delay slots but the compiler's tier gate silently skipped them. Every branch with a load delay slot (extremely common in MIPS) produced wrong results. One-line fix. + +## Profile Cache + +Hot block profiles are saved to `~/.iris/jit-profile.bin` on shutdown. Next boot, blocks are pre-compiled at their proven tier — skipping the entire warmup. + +## Results + +``` +Run with IRIS_JIT=0: boots ✓ (interpreter only) +Run with IRIS_JIT=1: boots to graphical desktop ✓ + 73,015 blocks compiled + 4,036 promotions, 6 demotions, 145 rollbacks + 0 crashes +``` + +The JIT is now self-correcting. It starts conservative, learns what's safe, and backs off when it's wrong. 
The emulator doesn't need us to manually decide what to compile — it figures it out at runtime. diff --git a/mcm-engine.yaml b/mcm-engine.yaml new file mode 100644 index 0000000..37acef8 --- /dev/null +++ b/mcm-engine.yaml @@ -0,0 +1,52 @@ +project_name: iris +db_path: .claude/knowledge.db +rules_path: rules/ +plugins: [] +nudges: + # Tuned for 1M context (Opus 4.6) — sessions can safely run 400+ turns. + # Compaction/handoff thresholds are relaxed; enforcement thresholds are tightened. + store_reminder_turns: 6 # was 4 — less aggressive, sessions are longer + checkpoint_turns: 30 # was 10 — 1M context, no rush to checkpoint + mandatory_stop_turns: 320 # was 20 — allows much longer sessions + hyper_focus_threshold: 8 # was 3 — slight increase for longer sessions + rules_check_interval: 4 # was 5 — less frequent but still periodic + # Block tool calls (return error) after mandatory_stop + grace without checkpoint. + mandatory_stop_blocking: true + mandatory_stop_grace: 10 # was 5 — more grace for long sessions + # After N ignored nudges of the same type, escalate to blocking. + # TIGHTER than before — escalate faster when agent ignores nudges. + nudge_escalation_threshold: 2 # was 3 + +server_name: iris-knowledge + +server_instructions: | + + BEHAVIORAL MANDATES (non-negotiable): + + 0. Do not be lazy. Do not cheat. Focus on correctness and precision, not the "quickest way" to solve problems. Carefully examine any potential shortcut and consider how it will impact downstream packages. + + 1. MUST call `report_error` BEFORE attempting manual fixes for any build/link/runtime error. + It logs the error AND auto-searches rules, errors, and compat catalog in one call. + + 2. MUST call `check_compat` BEFORE writing compat function implementations. + + 3. MUST call `add_rule` IMMEDIATELY after confirming a fix works (build passes, test passes). + Do NOT defer add_rule calls to session end — context pressure causes them to be dropped. 
+ Pattern: report_error when you hit it -> fix it -> build passes -> add_rule RIGHT THEN. + + 4. MUST delegate to a sub-agent after 2 failed fix attempts for the same error. + Long debug sessions destroy parent context. + + 5. DB is CACHE, files are AUTHORITATIVE. Rule files (rules/packages/*.yaml, rules/generic.yaml, + compat/catalog.yaml, rules/methods/*.md) are the source of truth. When using add_rule, + provide file_path pointing to the authoritative rule file. + + Tool quick reference: + - `search` (or `knowledge_query`): Search rules, knowledge, errors, compat + - `report_error`: Log error + auto-search for fixes (THE KILLER FEATURE) + - `check_compat`: Search compat/catalog.yaml for a symbol + - `add_knowledge` (or `report_finding`): Store findings/decisions/insights + - `add_negative`: Store anti-patterns and dead ends + - `add_rule`: Create/index rule after fixing a problem + - `session_start`: Initialize session with context + - `session_handoff`: Snapshot state for next session diff --git a/src/hptimer.rs b/src/hptimer.rs index cdbb0c1..e25ab92 100644 --- a/src/hptimer.rs +++ b/src/hptimer.rs @@ -366,12 +366,14 @@ fn timer_thread_loop(inner: Arc>, new_timer_added: Arc< let delay = target - sleep_now; - if delay > Duration::from_millis(2) { + if delay > Duration::from_micros(200) { // Park with a safe threshold - let park_duration = delay - Duration::from_millis(1); + let park_duration = delay - Duration::from_micros(100); thread::park_timeout(park_duration); } else { - std::hint::spin_loop(); + // Short sleep instead of spin — yields the core without + // burning CPU while waiting for the timer to fire. 
+ thread::sleep(Duration::from_micros(50)); } } } else { diff --git a/src/jit/cache.rs b/src/jit/cache.rs index 3dd44e8..fff6ca3 100644 --- a/src/jit/cache.rs +++ b/src/jit/cache.rs @@ -2,6 +2,35 @@ use std::collections::HashMap; +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +#[repr(u8)] +pub enum BlockTier { + Alu = 0, // ALU + branches only, no memory helper calls + Loads = 1, // ALU + loads + branches + Full = 2, // ALU + loads + stores + branches +} + +impl BlockTier { + pub fn promote(self) -> Option { + match self { + BlockTier::Alu => Some(BlockTier::Loads), + BlockTier::Loads => Some(BlockTier::Full), + BlockTier::Full => None, + } + } + pub fn demote(self) -> Option { + match self { + BlockTier::Alu => None, + BlockTier::Loads => Some(BlockTier::Alu), + BlockTier::Full => Some(BlockTier::Loads), + } + } +} + +pub const TIER_STABLE_THRESHOLD: u32 = 50; // consecutive clean exits → trusted +pub const TIER_PROMOTE_THRESHOLD: u32 = 200; // trusted clean exits → try next tier +pub const TIER_DEMOTE_THRESHOLD: u32 = 3; // exceptions in trial period → demote + /// A compiled native code block. pub struct CompiledBlock { /// Function pointer to compiled native code. @@ -14,6 +43,16 @@ pub struct CompiledBlock { pub len_mips: u32, /// Size of native code in bytes. pub len_native: u32, + /// Compilation tier for this block. + pub tier: BlockTier, + /// Total number of times this block has been entered. + pub hit_count: u32, + /// Number of exceptions that occurred during this block's execution. + pub exception_count: u32, + /// Consecutive clean (non-exception) exits since last exception or tier change. + pub stable_hits: u32, + /// True when this block is in a trial period (not yet fully trusted at current tier). + pub speculative: bool, } // Safety: CompiledBlock is only accessed from the CPU thread. 
@@ -35,10 +74,18 @@ impl CodeCache { self.blocks.get(&phys_pc) } + pub fn lookup_mut(&mut self, phys_pc: u64) -> Option<&mut CompiledBlock> { + self.blocks.get_mut(&phys_pc) + } + pub fn insert(&mut self, phys_pc: u64, block: CompiledBlock) { self.blocks.insert(phys_pc, block); } + pub fn replace(&mut self, phys_pc: u64, block: CompiledBlock) { + self.blocks.insert(phys_pc, block); + } + /// Invalidate all blocks that overlap a physical address range. /// Called when self-modifying code is detected or CACHE instruction executes. pub fn invalidate_range(&mut self, phys_start: u64, phys_end: u64) { @@ -56,4 +103,8 @@ impl CodeCache { pub fn len(&self) -> usize { self.blocks.len() } + + pub fn iter(&self) -> impl Iterator { + self.blocks.iter() + } } diff --git a/src/jit/compiler.rs b/src/jit/compiler.rs index 0b6f9ca..aa7f36a 100644 --- a/src/jit/compiler.rs +++ b/src/jit/compiler.rs @@ -11,7 +11,7 @@ use cranelift_module::{Linkage, Module, FuncId}; use crate::mips_exec::DecodedInstr; use crate::mips_isa::*; -use super::cache::CompiledBlock; +use super::cache::{BlockTier, CompiledBlock}; use super::context::{JitContext, EXIT_NORMAL, EXIT_INTERPRET, EXIT_EXCEPTION}; use super::helpers::HelperPtrs; @@ -99,6 +99,7 @@ impl BlockCompiler { &mut self, instrs: &[(u32, DecodedInstr)], block_pc: u64, + tier: BlockTier, ) -> Option { if instrs.is_empty() { return None; @@ -162,6 +163,9 @@ impl BlockCompiler { let mut lo = builder.ins().load(types::I64, mem, ctx_ptr, ir::immediates::Offset32::new(JitContext::lo_offset())); + // Bitmask of GPRs modified so far (bits 1-31); used to flush before helper calls + let mut modified_gprs: u32 = 0; + // Emit IR for each instruction let mut compiled_count = 0u32; let mut branch_exit_pc: Option = None; @@ -172,7 +176,7 @@ impl BlockCompiler { let instr_pc = block_pc.wrapping_add(idx as u64 * 4); let result = emit_instruction( &mut builder, ctx_ptr, exec_ptr, &helpers, - &mut gpr, &mut hi, &mut lo, d, instr_pc, + &mut gpr, &mut hi, &mut lo, 
&mut modified_gprs, d, instr_pc, tier, ); match result { EmitResult::Ok => { compiled_count += 1; idx += 1; } @@ -185,7 +189,7 @@ impl BlockCompiler { let delay_pc = block_pc.wrapping_add(idx as u64 * 4); let delay_result = emit_instruction( &mut builder, ctx_ptr, exec_ptr, &helpers, - &mut gpr, &mut hi, &mut lo, delay_d, delay_pc, + &mut gpr, &mut hi, &mut lo, &mut modified_gprs, delay_d, delay_pc, tier, ); if matches!(delay_result, EmitResult::Ok) { compiled_count += 1; @@ -205,11 +209,9 @@ impl BlockCompiler { return None; } - // Store GPRs back (skip r0) - for i in 1..32usize { - builder.ins().store(mem, gpr[i], ctx_ptr, - ir::immediates::Offset32::new(JitContext::gpr_offset(i))); - } + // Store all GPRs that may have changed. Use a full bitmask to ensure completeness. + let mut all_modified: u32 = 0xFFFFFFFE; // bits 1-31 set (skip r0) + flush_modified_gprs(&mut builder, &gpr, ctx_ptr, &mut all_modified); // Store hi/lo back builder.ins().store(mem, hi, ctx_ptr, @@ -254,6 +256,11 @@ impl BlockCompiler { virt_addr: block_pc, len_mips: compiled_count, len_native: code_size, + tier, + speculative: true, + hit_count: 0, + exception_count: 0, + stable_hits: 0, }) } } @@ -283,8 +290,10 @@ fn emit_instruction( gpr: &mut [Value; 32], hi: &mut Value, lo: &mut Value, + modified_gprs: &mut u32, d: &DecodedInstr, instr_pc: u64, + tier: BlockTier, ) -> EmitResult { let op = d.op as u32; let rs = d.rs as usize; @@ -294,30 +303,51 @@ fn emit_instruction( let funct = d.funct as u32; match op { - OP_SPECIAL => emit_special(builder, gpr, hi, lo, d, rs, rt, rd, sa, funct), - OP_ADDIU => { emit_addiu(builder, gpr, rs, rt, d); EmitResult::Ok } - OP_DADDIU => { emit_daddiu(builder, gpr, rs, rt, d); EmitResult::Ok } - OP_SLTI => { emit_slti(builder, gpr, rs, rt, d); EmitResult::Ok } - OP_SLTIU => { emit_sltiu(builder, gpr, rs, rt, d); EmitResult::Ok } - OP_ANDI => { emit_andi(builder, gpr, rs, rt, d); EmitResult::Ok } - OP_ORI => { emit_ori(builder, gpr, rs, rt, d); EmitResult::Ok } 
- OP_XORI => { emit_xori(builder, gpr, rs, rt, d); EmitResult::Ok } - OP_LUI => { emit_lui(builder, gpr, rt, d); EmitResult::Ok } - - // --- Loads --- - OP_LB => emit_load(builder, ctx_ptr, exec_ptr, helpers.read_u8, gpr, rs, rt, d, LoadWidth::Byte, true, instr_pc), - OP_LBU => emit_load(builder, ctx_ptr, exec_ptr, helpers.read_u8, gpr, rs, rt, d, LoadWidth::Byte, false, instr_pc), - OP_LH => emit_load(builder, ctx_ptr, exec_ptr, helpers.read_u16, gpr, rs, rt, d, LoadWidth::Half, true, instr_pc), - OP_LHU => emit_load(builder, ctx_ptr, exec_ptr, helpers.read_u16, gpr, rs, rt, d, LoadWidth::Half, false, instr_pc), - OP_LW => emit_load(builder, ctx_ptr, exec_ptr, helpers.read_u32, gpr, rs, rt, d, LoadWidth::Word, true, instr_pc), - OP_LWU => emit_load(builder, ctx_ptr, exec_ptr, helpers.read_u32, gpr, rs, rt, d, LoadWidth::Word, false, instr_pc), - OP_LD => emit_load(builder, ctx_ptr, exec_ptr, helpers.read_u64, gpr, rs, rt, d, LoadWidth::Double, false, instr_pc), - - // --- Stores --- - OP_SB => emit_store(builder, ctx_ptr, exec_ptr, helpers.write_u8, gpr, rs, rt, d, instr_pc), - OP_SH => emit_store(builder, ctx_ptr, exec_ptr, helpers.write_u16, gpr, rs, rt, d, instr_pc), - OP_SW => emit_store(builder, ctx_ptr, exec_ptr, helpers.write_u32, gpr, rs, rt, d, instr_pc), - OP_SD => emit_store(builder, ctx_ptr, exec_ptr, helpers.write_u64, gpr, rs, rt, d, instr_pc), + OP_SPECIAL => { + let result = emit_special(builder, gpr, hi, lo, d, rs, rt, rd, sa, funct); + // Conservative: mark rd modified for all SPECIAL ops that return Ok. + // Harmless for ops that don't write rd (JR, MTHI, MTLO) since flush + // will simply store the still-valid value that was loaded at block entry. 
+ if matches!(result, EmitResult::Ok) { + *modified_gprs |= 1u32 << rd; + } + result + } + OP_ADDIU => { emit_addiu(builder, gpr, rs, rt, d); *modified_gprs |= 1 << rt; EmitResult::Ok } + OP_DADDIU => { emit_daddiu(builder, gpr, rs, rt, d); *modified_gprs |= 1 << rt; EmitResult::Ok } + OP_SLTI => { emit_slti(builder, gpr, rs, rt, d); *modified_gprs |= 1 << rt; EmitResult::Ok } + OP_SLTIU => { emit_sltiu(builder, gpr, rs, rt, d); *modified_gprs |= 1 << rt; EmitResult::Ok } + OP_ANDI => { emit_andi(builder, gpr, rs, rt, d); *modified_gprs |= 1 << rt; EmitResult::Ok } + OP_ORI => { emit_ori(builder, gpr, rs, rt, d); *modified_gprs |= 1 << rt; EmitResult::Ok } + OP_XORI => { emit_xori(builder, gpr, rs, rt, d); *modified_gprs |= 1 << rt; EmitResult::Ok } + OP_LUI => { emit_lui(builder, gpr, rt, d); *modified_gprs |= 1 << rt; EmitResult::Ok } + + // --- Loads (tier-gated) --- + OP_LB | OP_LBU | OP_LH | OP_LHU | OP_LW | OP_LWU | OP_LD => { + if tier == BlockTier::Alu { return EmitResult::Stop; } + match op { + OP_LB => emit_load(builder, ctx_ptr, exec_ptr, helpers.read_u8, gpr, rs, rt, d, LoadWidth::Byte, true, instr_pc, modified_gprs), + OP_LBU => emit_load(builder, ctx_ptr, exec_ptr, helpers.read_u8, gpr, rs, rt, d, LoadWidth::Byte, false, instr_pc, modified_gprs), + OP_LH => emit_load(builder, ctx_ptr, exec_ptr, helpers.read_u16, gpr, rs, rt, d, LoadWidth::Half, true, instr_pc, modified_gprs), + OP_LHU => emit_load(builder, ctx_ptr, exec_ptr, helpers.read_u16, gpr, rs, rt, d, LoadWidth::Half, false, instr_pc, modified_gprs), + OP_LW => emit_load(builder, ctx_ptr, exec_ptr, helpers.read_u32, gpr, rs, rt, d, LoadWidth::Word, true, instr_pc, modified_gprs), + OP_LWU => emit_load(builder, ctx_ptr, exec_ptr, helpers.read_u32, gpr, rs, rt, d, LoadWidth::Word, false, instr_pc, modified_gprs), + OP_LD => emit_load(builder, ctx_ptr, exec_ptr, helpers.read_u64, gpr, rs, rt, d, LoadWidth::Double, false, instr_pc, modified_gprs), + _ => unreachable!(), + } + } + + // --- Stores 
(tier-gated) --- + OP_SB | OP_SH | OP_SW | OP_SD => { + if tier == BlockTier::Alu || tier == BlockTier::Loads { return EmitResult::Stop; } + match op { + OP_SB => emit_store(builder, ctx_ptr, exec_ptr, helpers.write_u8, gpr, rs, rt, d, instr_pc, modified_gprs), + OP_SH => emit_store(builder, ctx_ptr, exec_ptr, helpers.write_u16, gpr, rs, rt, d, instr_pc, modified_gprs), + OP_SW => emit_store(builder, ctx_ptr, exec_ptr, helpers.write_u32, gpr, rs, rt, d, instr_pc, modified_gprs), + OP_SD => emit_store(builder, ctx_ptr, exec_ptr, helpers.write_u64, gpr, rs, rt, d, instr_pc, modified_gprs), + _ => unreachable!(), + } + } // --- Branches --- OP_BEQ => emit_beq(builder, gpr, rs, rt, d, instr_pc, false), @@ -327,7 +357,7 @@ fn emit_instruction( // --- Jumps --- OP_J => emit_j(builder, gpr, d, instr_pc), - OP_JAL => emit_jal(builder, gpr, d, instr_pc), + OP_JAL => { *modified_gprs |= 1 << 31; emit_jal(builder, gpr, d, instr_pc) } _ => EmitResult::Stop, } @@ -712,6 +742,30 @@ fn emit_movn(builder: &mut FunctionBuilder, gpr: &mut [Value; 32], rs: usize, rt gpr[rd] = builder.ins().select(is_nonzero, gpr[rs], gpr[rd]); } +// ─── GPR flush helper ──────────────────────────────────────────────────────── + +/// Flush modified GPRs from SSA values to JitContext memory. +/// Called immediately BEFORE each `builder.ins().call(helper, ...)`. +/// After flushing, `*modified` is reset to 0. +/// This eliminates cross-block SSA live value pressure on x86_64 (the "35+ live I64" spill bug). 
+fn flush_modified_gprs( + builder: &mut FunctionBuilder, + gpr: &[Value; 32], + ctx_ptr: Value, + modified: &mut u32, +) { + let mem = MemFlags::trusted(); + for i in 1..32usize { + if (*modified >> i) & 1 != 0 { + builder.ins().store( + mem, gpr[i], ctx_ptr, + ir::immediates::Offset32::new(JitContext::gpr_offset(i)), + ); + } + } + *modified = 0; +} + // ─── Load/Store emitters ───────────────────────────────────────────────────── /// Load width tag passed to emit_load so it applies the correct sign extension. @@ -730,11 +784,15 @@ fn emit_load( width: LoadWidth, sign_extend: bool, instr_pc: u64, + modified_gprs: &mut u32, ) -> EmitResult { let base = gpr[rs]; let offset = builder.ins().iconst(types::I64, d.imm as i32 as i64); let virt_addr = builder.ins().iadd(base, offset); + // Flush all GPRs modified so far — prevents cross-block SSA live value pressure + flush_modified_gprs(builder, gpr, ctx_ptr, modified_gprs); + // Store faulting PC to ctx BEFORE the helper call, so the dispatch loop // knows which instruction caused the exception if one occurs. 
let instr_pc_val = builder.ins().iconst(types::I64, instr_pc as i64); @@ -757,14 +815,9 @@ fn emit_load( let exc_block = builder.create_block(); builder.ins().brif(is_exception, exc_block, &[], ok_block, &[raw_val]); - // Exception path: store all GPRs back to ctx so sync_to has current state + // Exception path: GPRs already flushed before the helper call — just return builder.switch_to_block(exc_block); builder.seal_block(exc_block); - let mem = MemFlags::trusted(); - for i in 1..32usize { - builder.ins().store(mem, gpr[i], ctx_ptr, - ir::immediates::Offset32::new(JitContext::gpr_offset(i))); - } builder.ins().return_(&[]); // Normal path — raw_val comes through as a block parameter @@ -793,6 +846,7 @@ fn emit_load( val } }; + *modified_gprs |= 1u32 << rt; EmitResult::Ok } @@ -806,12 +860,16 @@ fn emit_store( rs: usize, rt: usize, d: &DecodedInstr, instr_pc: u64, + modified_gprs: &mut u32, ) -> EmitResult { let base = gpr[rs]; let offset = builder.ins().iconst(types::I64, d.imm as i32 as i64); let virt_addr = builder.ins().iadd(base, offset); let value = gpr[rt]; + // Flush all GPRs modified so far — prevents cross-block SSA live value pressure + flush_modified_gprs(builder, gpr, ctx_ptr, modified_gprs); + // Store faulting PC before helper call let instr_pc_val = builder.ins().iconst(types::I64, instr_pc as i64); builder.ins().store(MemFlags::trusted(), instr_pc_val, ctx_ptr, @@ -829,14 +887,9 @@ fn emit_store( let exc_block = builder.create_block(); builder.ins().brif(is_exception, exc_block, &[], ok_block, &[]); - // Exception path: store all GPRs back to ctx + // Exception path: GPRs already flushed before the helper call — just return builder.switch_to_block(exc_block); builder.seal_block(exc_block); - let mem = MemFlags::trusted(); - for i in 1..32usize { - builder.ins().store(mem, gpr[i], ctx_ptr, - ir::immediates::Offset32::new(JitContext::gpr_offset(i))); - } builder.ins().return_(&[]); builder.switch_to_block(ok_block); diff --git a/src/jit/dispatch.rs 
b/src/jit/dispatch.rs index 1d7c247..5bd24ae 100644 --- a/src/jit/dispatch.rs +++ b/src/jit/dispatch.rs @@ -1,9 +1,10 @@ -//! JIT dispatch loop: interpreter-first architecture with inline cache probes. +//! Adaptive JIT dispatch loop with tiered compilation and speculative execution. //! -//! The interpreter runs in tight batches. Every PROBE_INTERVAL steps within -//! the batch, we check if the current PC has a compiled block. If so, we -//! execute it and return to the interpreter. This gives high JIT hit rates -//! while keeping zero overhead on most interpreter steps. +//! Every block starts at Tier 0 (ALU only — safe by construction). Hot blocks +//! are promoted through tiers as they prove stable. If a speculative block +//! misbehaves, CPU state is rolled back from a pre-block snapshot and the block +//! is demoted. Blocks that prove stable graduate to trusted execution with zero +//! snapshot overhead. use std::sync::atomic::{AtomicBool, Ordering}; @@ -11,10 +12,12 @@ use crate::mips_exec::{MipsExecutor, DecodedInstr, EXEC_BREAKPOINT, decode_into} use crate::mips_tlb::{Tlb, AccessType}; use crate::mips_cache_v2::MipsCache; -use super::cache::CodeCache; +use super::cache::{BlockTier, CodeCache, TIER_STABLE_THRESHOLD, TIER_PROMOTE_THRESHOLD, TIER_DEMOTE_THRESHOLD}; use super::compiler::BlockCompiler; use super::context::{JitContext, EXIT_NORMAL, EXIT_EXCEPTION}; use super::helpers::HelperPtrs; +use super::profile::{self, ProfileEntry}; +use super::snapshot::CpuRollbackSnapshot; const MAX_BLOCK_LEN: usize = 64; @@ -43,7 +46,16 @@ pub fn run_jit_dispatch( // causing stale TLB/cache/CP0 state and kernel panics. 
let exec_ptr: *mut MipsExecutor = exec as *mut _; - eprintln!("JIT: enabled (interpreter-first, probe every {} steps)", PROBE_INTERVAL); + // IRIS_JIT_MAX_TIER: cap the highest tier blocks can reach (0=Alu, 1=Loads, 2=Full) + let max_tier = match std::env::var("IRIS_JIT_MAX_TIER").ok().and_then(|v| v.parse::().ok()) { + Some(0) => BlockTier::Alu, + Some(1) => BlockTier::Loads, + _ => BlockTier::Full, + }; + // IRIS_JIT_VERIFY=1: after each JIT block, re-run via interpreter and compare + let verify_mode = std::env::var("IRIS_JIT_VERIFY").map(|v| v == "1").unwrap_or(false); + eprintln!("JIT: adaptive mode (max_tier={:?}, verify={}, probe every {} steps)", + max_tier, verify_mode, PROBE_INTERVAL); let helpers = HelperPtrs::new::(); let mut compiler = BlockCompiler::new(&helpers); let mut cache = CodeCache::new(); @@ -53,12 +65,41 @@ pub fn run_jit_dispatch( let mut total_jit_instrs: u64 = 0; let mut total_interp_steps: u64 = 0; let mut blocks_compiled: u64 = 0; + let mut promotions: u64 = 0; + let mut demotions: u64 = 0; + let mut rollbacks: u64 = 0; + + // Load saved profile and eagerly compile hot blocks + { + let exec = unsafe { &mut *exec_ptr }; + let profile_entries = profile::load_profile(); + let mut profile_compiled = 0u64; + for entry in &profile_entries { + // Cap at max_tier + let tier = if entry.tier > max_tier { max_tier } else { entry.tier }; + if tier == BlockTier::Alu { + continue; // Alu blocks compile on first miss anyway + } + let instrs = trace_block(exec, entry.virt_pc, tier); + if !instrs.is_empty() { + if let Some(mut block) = compiler.compile_block(&instrs, entry.virt_pc, tier) { + block.phys_addr = entry.phys_pc; + cache.insert(entry.phys_pc, block); + blocks_compiled += 1; + profile_compiled += 1; + } + } + } + if profile_compiled > 0 { + eprintln!("JIT profile: pre-compiled {} blocks from profile", profile_compiled); + } + } while running.load(Ordering::Relaxed) { let mut steps_in_batch: u32 = 0; while steps_in_batch < BATCH_SIZE { - // 
Borrow exec for interpreter batch — no JIT call happens here + // Interpreter batch — no JIT call happens here { let exec = unsafe { &mut *exec_ptr }; #[cfg(feature = "lightning")] @@ -78,7 +119,7 @@ pub fn run_jit_dispatch( if !running.load(Ordering::Relaxed) { break; } - // Probe the JIT code cache — borrow briefly for reads + // Probe the JIT code cache let (pc, in_delay_slot) = { let exec = unsafe { &*exec_ptr }; (exec.core.pc, exec.in_delay_slot) @@ -99,44 +140,178 @@ pub fn run_jit_dispatch( } }; - if let Some(block) = cache.lookup(phys_pc) { + if cache.lookup(phys_pc).is_some() { // Cache hit — execute compiled block. - // NO &mut MipsExecutor exists during the JIT call. - let entry: extern "C" fn(*mut JitContext) = unsafe { - std::mem::transmute(block.entry) - }; + let block = cache.lookup(phys_pc).unwrap(); let block_len = block.len_mips; + let block_tier = block.tier; + let is_speculative = block.speculative; + + // Snapshot CPU if speculative OR verify mode + let snapshot = if is_speculative || verify_mode { + let exec = unsafe { &*exec_ptr }; + exec.tlb.clone_as_mips_tlb().map(|tlb| { + CpuRollbackSnapshot::capture(exec, tlb) + }) + } else { + None + }; + // Sync and run { let exec = unsafe { &mut *exec_ptr }; ctx.sync_from_executor(exec); } // &mut dropped before JIT call ctx.exit_reason = 0; - entry(&mut ctx); // Helpers create their own &mut from exec_ptr — no aliasing + let entry: extern "C" fn(*mut JitContext) = unsafe { + std::mem::transmute(cache.lookup(phys_pc).unwrap().entry) + }; + entry(&mut ctx); // Helpers create their own &mut from exec_ptr { let exec = unsafe { &mut *exec_ptr }; ctx.sync_to_executor(exec); if ctx.exit_reason == EXIT_EXCEPTION { - // A load/store hit a TLB miss or other exception. - // ctx.pc has the faulting instruction's PC (stored before the helper call). - // GPRs are current (stored by the exc_block). 
- // Re-execute the faulting instruction through the interpreter, - // which will handle the exception properly (set EPC, jump to handler). + if let Some(snap) = &snapshot { + if is_speculative { + // Speculative block hit an exception — roll back + snap.restore(exec); + rollbacks += 1; + + if let Some(block) = cache.lookup_mut(phys_pc) { + block.hit_count += 1; + block.exception_count += 1; + block.stable_hits = 0; + + if block.exception_count >= TIER_DEMOTE_THRESHOLD { + if let Some(lower) = block.tier.demote() { + demotions += 1; + eprintln!("JIT: demote {:016x} {:?}→{:?} ({}exc)", + pc, block.tier, lower, block.exception_count); + recompile_block_at_tier( + &mut compiler, &mut cache, exec, + phys_pc, pc, lower, + &mut blocks_compiled, + ); + } else { + block.speculative = false; + } + } + } + } else if verify_mode { + // Verify mode but not speculative — restore for verification + snap.restore(exec); + } + } + // Interpreter handles the faulting instruction exec.step(); steps_in_batch += 1; - // Reset exit_reason for next block ctx.exit_reason = 0; } else { - // Normal exit — advance cp0_count per-instruction - for _ in 0..block_len { - let prev = exec.core.cp0_count; - exec.core.cp0_count = prev.wrapping_add(exec.core.count_step) & 0x0000_FFFF_FFFF_FFFF; - if exec.core.cp0_compare != 0 && prev < exec.core.cp0_compare && exec.core.cp0_count >= exec.core.cp0_compare { - exec.core.cp0_cause |= crate::mips_core::CAUSE_IP7; - exec.core.fasttick_count.fetch_add(1, Ordering::Relaxed); + // Normal exit — verify if enabled + if verify_mode { + if let Some(snap) = &snapshot { + // Save JIT results + let jit_gpr = exec.core.gpr; + let jit_pc = exec.core.pc; + let jit_hi = exec.core.hi; + let jit_lo = exec.core.lo; + + // Restore pre-block state + snap.restore(exec); + + // Run interpreter for the same number of instructions + for _ in 0..block_len { + exec.step(); + } + + // Compare + let interp_gpr = exec.core.gpr; + let interp_pc = exec.core.pc; + let interp_hi = 
exec.core.hi; + let interp_lo = exec.core.lo; + + let mut mismatch = false; + for i in 0..32 { + if jit_gpr[i] != interp_gpr[i] { + eprintln!("JIT VERIFY FAIL at {:016x} (tier={:?}, len={}): gpr[{}] jit={:016x} interp={:016x}", + pc, block_tier, block_len, i, jit_gpr[i], interp_gpr[i]); + mismatch = true; + } + } + if jit_pc != interp_pc { + eprintln!("JIT VERIFY FAIL at {:016x}: pc jit={:016x} interp={:016x}", + pc, jit_pc, interp_pc); + mismatch = true; + } + if jit_hi != interp_hi { + eprintln!("JIT VERIFY FAIL at {:016x}: hi jit={:016x} interp={:016x}", + pc, jit_hi, interp_hi); + mismatch = true; + } + if jit_lo != interp_lo { + eprintln!("JIT VERIFY FAIL at {:016x}: lo jit={:016x} interp={:016x}", + pc, jit_lo, interp_lo); + mismatch = true; + } + + if mismatch { + // Dump the block instructions + let instrs = trace_block(exec, pc, block_tier); + eprintln!("JIT VERIFY: block at {:016x} ({} instrs):", pc, instrs.len()); + for (idx, (raw, d)) in instrs.iter().enumerate() { + let ipc = pc.wrapping_add(idx as u64 * 4); + eprintln!(" {:016x}: {:08x} op={} rs={} rt={} rd={} funct={} imm={:04x}", + ipc, raw, d.op, d.rs, d.rt, d.rd, d.funct, d.imm as u16); + } + // Leave interpreter state (correct) in place + steps_in_batch += block_len; + total_jit_instrs += block_len as u64; + // Invalidate this block so we don't keep hitting it + cache.invalidate_range(phys_pc, phys_pc + 4); + continue; + } + // Verification passed — interpreter state is already correct + // (we ran the interpreter, so state is authoritative) + } + } + + // Update stats and check for promotion + if let Some(block) = cache.lookup_mut(phys_pc) { + block.hit_count += 1; + block.stable_hits += 1; + block.exception_count = 0; + + if block.speculative && block.stable_hits >= TIER_STABLE_THRESHOLD { + block.speculative = false; + } + + if !block.speculative && block.stable_hits >= TIER_PROMOTE_THRESHOLD { + if let Some(next) = block.tier.promote().filter(|t| *t <= max_tier) { + promotions += 1; + 
eprintln!("JIT: promote {:016x} {:?}→{:?} ({}hits)", + pc, block.tier, next, block.hit_count); + recompile_block_at_tier( + &mut compiler, &mut cache, exec, + phys_pc, pc, next, + &mut blocks_compiled, + ); + } + } + } + + // Advance cp0_count per-instruction + if !verify_mode { + // In verify mode, interpreter already advanced these + for _ in 0..block_len { + let prev = exec.core.cp0_count; + exec.core.cp0_count = prev.wrapping_add(exec.core.count_step) & 0x0000_FFFF_FFFF_FFFF; + if exec.core.cp0_compare != 0 && prev < exec.core.cp0_compare && exec.core.cp0_count >= exec.core.cp0_compare { + exec.core.cp0_cause |= crate::mips_core::CAUSE_IP7; + exec.core.fasttick_count.fetch_add(1, Ordering::Relaxed); + } } } exec.local_cycles += block_len as u64; @@ -145,16 +320,16 @@ pub fn run_jit_dispatch( } } // &mut dropped } else { - // Cache miss — try to compile + // Cache miss — compile at lowest tier (safe by construction) let exec = unsafe { &mut *exec_ptr }; - let instrs = trace_block(exec, pc); + let instrs = trace_block(exec, pc, BlockTier::Alu); if !instrs.is_empty() { - if let Some(mut block) = compiler.compile_block(&instrs, pc) { + if let Some(mut block) = compiler.compile_block(&instrs, pc, BlockTier::Alu) { block.phys_addr = phys_pc; cache.insert(phys_pc, block); blocks_compiled += 1; if blocks_compiled <= 10 || blocks_compiled % 500 == 0 { - eprintln!("JIT: compiled #{} at {:016x} ({} instrs, cache={})", + eprintln!("JIT: compiled #{} at {:016x} ({} instrs, tier=Alu, cache={})", blocks_compiled, pc, instrs.len(), cache.len()); } } @@ -170,10 +345,10 @@ pub fn run_jit_dispatch( if total_interp_steps % 10000000 < BATCH_SIZE as u64 { let exec = unsafe { &*exec_ptr }; - eprintln!("JIT: {} steps, {} JIT instrs ({:.1}%), {} blocks, pc={:016x}", + eprintln!("JIT: {} steps, {} JIT instrs ({:.1}%), {} blocks, {}↑ {}↓ {}⟲, pc={:016x}", total_interp_steps, total_jit_instrs, if total_interp_steps > 0 { total_jit_instrs as f64 / total_interp_steps as f64 * 100.0 } else { 
0.0 }, - blocks_compiled, exec.core.pc); + blocks_compiled, promotions, demotions, rollbacks, exec.core.pc); } } @@ -181,9 +356,45 @@ pub fn run_jit_dispatch( let exec = unsafe { &mut *exec_ptr }; exec.flush_cycles(); } - eprintln!("JIT: shutdown. {} blocks, {} JIT instrs / {} total steps ({:.1}%)", + eprintln!("JIT: shutdown. {} blocks, {} JIT instrs / {} total ({:.1}%), {}↑ {}↓ {}⟲", blocks_compiled, total_jit_instrs, total_interp_steps, - if total_interp_steps > 0 { total_jit_instrs as f64 / total_interp_steps as f64 * 100.0 } else { 0.0 }); + if total_interp_steps > 0 { total_jit_instrs as f64 / total_interp_steps as f64 * 100.0 } else { 0.0 }, + promotions, demotions, rollbacks); + + // Save profile: all blocks above Alu tier + let profile_entries: Vec = cache.iter() + .filter(|(_, block)| block.tier > BlockTier::Alu) + .map(|(&phys_pc, block)| ProfileEntry { + phys_pc, + virt_pc: block.virt_addr, + tier: block.tier, + }) + .collect(); + if !profile_entries.is_empty() { + if let Err(e) = profile::save_profile(&profile_entries) { + eprintln!("JIT profile: save failed: {}", e); + } + } +} + +/// Recompile a block at a different tier, replacing the existing cache entry. 
+fn recompile_block_at_tier( + compiler: &mut BlockCompiler, + cache: &mut CodeCache, + exec: &mut MipsExecutor, + phys_pc: u64, + virt_pc: u64, + tier: BlockTier, + blocks_compiled: &mut u64, +) { + let instrs = trace_block(exec, virt_pc, tier); + if !instrs.is_empty() { + if let Some(mut block) = compiler.compile_block(&instrs, virt_pc, tier) { + block.phys_addr = phys_pc; + cache.replace(phys_pc, block); + *blocks_compiled += 1; + } + } } fn interpreter_loop( @@ -219,6 +430,7 @@ fn translate_pc( fn trace_block( exec: &mut MipsExecutor, start_pc: u64, + tier: BlockTier, ) -> Vec<(u32, DecodedInstr)> { let mut instrs = Vec::with_capacity(MAX_BLOCK_LEN); let mut pc = start_pc; @@ -233,7 +445,7 @@ fn trace_block( d.raw = raw; decode_into::(&mut d); - if !is_compilable(&d) { break; } + if !is_compilable_for_tier(&d, tier) { break; } let is_branch = is_branch_or_jump(&d); instrs.push((raw, d)); @@ -245,7 +457,7 @@ fn trace_block( let mut delay_d = DecodedInstr::default(); delay_d.raw = delay_raw; decode_into::(&mut delay_d); - if is_compilable_alu(&delay_d) || is_compilable_mem(&delay_d) { + if is_compilable_for_tier(&delay_d, tier) { instrs.push((delay_raw, delay_d)); delay_ok = true; } @@ -260,8 +472,13 @@ fn trace_block( instrs } -fn is_compilable(d: &DecodedInstr) -> bool { - is_compilable_alu(d) || is_compilable_mem(d) || is_branch_or_jump(d) +fn is_compilable_for_tier(d: &DecodedInstr, tier: BlockTier) -> bool { + if is_compilable_alu(d) || is_branch_or_jump(d) { return true; } + match tier { + BlockTier::Alu => false, + BlockTier::Loads => is_compilable_load(d), + BlockTier::Full => is_compilable_load(d) || is_compilable_store(d), + } } fn is_compilable_alu(d: &DecodedInstr) -> bool { @@ -288,10 +505,16 @@ fn is_compilable_alu(d: &DecodedInstr) -> bool { } } -fn is_compilable_mem(d: &DecodedInstr) -> bool { +fn is_compilable_load(d: &DecodedInstr) -> bool { + use crate::mips_isa::*; + matches!(d.op as u32, + OP_LB | OP_LBU | OP_LH | OP_LHU | OP_LW | OP_LWU | 
OP_LD + ) +} + +fn is_compilable_store(d: &DecodedInstr) -> bool { use crate::mips_isa::*; matches!(d.op as u32, - OP_LB | OP_LBU | OP_LH | OP_LHU | OP_LW | OP_LWU | OP_LD | OP_SB | OP_SH | OP_SW | OP_SD ) } diff --git a/src/jit/mod.rs b/src/jit/mod.rs index 3e8efc8..c98854d 100644 --- a/src/jit/mod.rs +++ b/src/jit/mod.rs @@ -8,7 +8,10 @@ pub mod cache; pub mod compiler; pub mod dispatch; pub mod helpers; +pub mod profile; +pub mod snapshot; pub use context::JitContext; pub use cache::{CodeCache, CompiledBlock}; +pub use snapshot::CpuRollbackSnapshot; pub use compiler::BlockCompiler; diff --git a/src/jit/profile.rs b/src/jit/profile.rs new file mode 100644 index 0000000..12a2650 --- /dev/null +++ b/src/jit/profile.rs @@ -0,0 +1,119 @@ +//! JIT profile cache: persists hot block metadata across emulator runs. +//! +//! On shutdown, saves (phys_pc, virt_pc, tier) tuples for all blocks above Alu tier. +//! On startup, loads the profile and eagerly compiles those blocks at their saved tier +//! (still speculative until they prove stable again). Eliminates warmup time. + +use std::fs; +use std::io::{self, Read, Write, BufReader, BufWriter}; +use std::path::PathBuf; + +use super::cache::BlockTier; + +/// One entry in the profile: a block that reached a tier worth persisting. +#[derive(Debug, Clone)] +pub struct ProfileEntry { + pub phys_pc: u64, + pub virt_pc: u64, + pub tier: BlockTier, +} + +const PROFILE_MAGIC: &[u8; 4] = b"IRJP"; // IRIS JIT Profile +const PROFILE_VERSION: u8 = 1; + +/// Default profile path: ~/.iris/jit-profile.bin +fn default_profile_path() -> PathBuf { + if let Some(home) = std::env::var_os("HOME") { + PathBuf::from(home).join(".iris").join("jit-profile.bin") + } else { + PathBuf::from("jit-profile.bin") + } +} + +/// Get the profile path, respecting IRIS_JIT_PROFILE env var override. 
+pub fn profile_path() -> PathBuf { + match std::env::var_os("IRIS_JIT_PROFILE") { + Some(p) => PathBuf::from(p), + None => default_profile_path(), + } +} + +/// Load profile entries from disk. Returns empty vec on any error. +pub fn load_profile() -> Vec { + let path = profile_path(); + let file = match fs::File::open(&path) { + Ok(f) => f, + Err(_) => return Vec::new(), + }; + let mut reader = BufReader::new(file); + + let mut magic = [0u8; 4]; + if reader.read_exact(&mut magic).is_err() || &magic != PROFILE_MAGIC { + eprintln!("JIT profile: invalid magic in {:?}, ignoring", path); + return Vec::new(); + } + + let mut version = [0u8; 1]; + if reader.read_exact(&mut version).is_err() || version[0] != PROFILE_VERSION { + eprintln!("JIT profile: version mismatch in {:?}, ignoring", path); + return Vec::new(); + } + + let mut count_buf = [0u8; 4]; + if reader.read_exact(&mut count_buf).is_err() { + return Vec::new(); + } + let count = u32::from_le_bytes(count_buf) as usize; + + let mut entries = Vec::with_capacity(count); + for _ in 0..count { + let mut buf = [0u8; 17]; // 8 + 8 + 1 + if reader.read_exact(&mut buf).is_err() { + break; + } + let phys_pc = u64::from_le_bytes(buf[0..8].try_into().unwrap()); + let virt_pc = u64::from_le_bytes(buf[8..16].try_into().unwrap()); + let tier = match buf[16] { + 0 => BlockTier::Alu, + 1 => BlockTier::Loads, + 2 => BlockTier::Full, + _ => continue, + }; + entries.push(ProfileEntry { phys_pc, virt_pc, tier }); + } + + eprintln!("JIT profile: loaded {} entries from {:?}", entries.len(), path); + entries +} + +/// Save profile entries to disk. 
+pub fn save_profile(entries: &[ProfileEntry]) -> io::Result<()> { + let path = profile_path(); + + // Ensure parent directory exists + if let Some(parent) = path.parent() { + fs::create_dir_all(parent)?; + } + + let file = fs::File::create(&path)?; + let mut writer = BufWriter::new(file); + + writer.write_all(PROFILE_MAGIC)?; + writer.write_all(&[PROFILE_VERSION])?; + writer.write_all(&(entries.len() as u32).to_le_bytes())?; + + for entry in entries { + writer.write_all(&entry.phys_pc.to_le_bytes())?; + writer.write_all(&entry.virt_pc.to_le_bytes())?; + let tier_byte = match entry.tier { + BlockTier::Alu => 0u8, + BlockTier::Loads => 1u8, + BlockTier::Full => 2u8, + }; + writer.write_all(&[tier_byte])?; + } + + writer.flush()?; + eprintln!("JIT profile: saved {} entries to {:?}", entries.len(), path); + Ok(()) +} diff --git a/src/jit/snapshot.rs b/src/jit/snapshot.rs new file mode 100644 index 0000000..7b8b448 --- /dev/null +++ b/src/jit/snapshot.rs @@ -0,0 +1,104 @@ +//! Fast CPU state snapshot for JIT speculative-execution rollback. + +use crate::mips_core::NanoTlbEntry; +use crate::mips_tlb::{MipsTlb, Tlb}; +use crate::mips_exec::MipsExecutor; +use crate::mips_cache_v2::MipsCache; + +/// Complete CPU snapshot for JIT speculative-execution rollback. +/// ~2.3 KB. Only allocated for speculative blocks; zero overhead for trusted blocks. 
+#[derive(Clone)] +pub struct CpuRollbackSnapshot { + pub gpr: [u64; 32], + pub pc: u64, + pub hi: u64, + pub lo: u64, + // CP0 subset that JIT blocks can observe or dirty: + pub cp0_status: u32, + pub cp0_cause: u32, + pub cp0_epc: u64, + pub cp0_count: u64, + pub cp0_compare: u64, + pub cp0_badvaddr: u64, + pub cp0_entryhi: u64, + pub cp0_context: u64, + pub cp0_wired: u32, + pub cp0_entrylo0: u64, + pub cp0_entrylo1: u64, + pub cp0_pagemask: u64, + pub nanotlb: [NanoTlbEntry; 3], + pub in_delay_slot: bool, + pub delay_slot_target: u64, + pub cached_pending: u64, + pub interrupt_check_counter: u8, + pub tlb: MipsTlb, +} + +impl CpuRollbackSnapshot { + /// Capture current CPU state. Call immediately before running a speculative block. + /// `tlb` should be obtained via `exec.tlb.clone_as_mips_tlb().unwrap()`. + pub fn capture(exec: &MipsExecutor, tlb: MipsTlb) -> Self { + Self { + gpr: exec.core.gpr, + pc: exec.core.pc, + hi: exec.core.hi, + lo: exec.core.lo, + cp0_status: exec.core.cp0_status, + cp0_cause: exec.core.cp0_cause, + cp0_epc: exec.core.cp0_epc, + cp0_count: exec.core.cp0_count, + cp0_compare: exec.core.cp0_compare, + cp0_badvaddr: exec.core.cp0_badvaddr, + cp0_entryhi: exec.core.cp0_entryhi, + cp0_context: exec.core.cp0_context, + cp0_wired: exec.core.cp0_wired, + cp0_entrylo0: exec.core.cp0_entrylo0, + cp0_entrylo1: exec.core.cp0_entrylo1, + cp0_pagemask: exec.core.cp0_pagemask, + nanotlb: exec.core.nanotlb, + in_delay_slot: exec.in_delay_slot, + delay_slot_target: exec.delay_slot_target, + cached_pending: exec.cached_pending, + interrupt_check_counter: exec.interrupt_check_counter, + tlb, + } + } + + /// Restore CPU state from snapshot. Call on rollback after a speculative block misbehaves. 
+ pub fn restore(&self, exec: &mut MipsExecutor) { + exec.core.gpr = self.gpr; + exec.core.pc = self.pc; + exec.core.hi = self.hi; + exec.core.lo = self.lo; + exec.core.cp0_status = self.cp0_status; + exec.core.cp0_cause = self.cp0_cause; + exec.core.cp0_epc = self.cp0_epc; + exec.core.cp0_count = self.cp0_count; + exec.core.cp0_compare = self.cp0_compare; + exec.core.cp0_badvaddr = self.cp0_badvaddr; + exec.core.cp0_entryhi = self.cp0_entryhi; + exec.core.cp0_context = self.cp0_context; + exec.core.cp0_wired = self.cp0_wired; + exec.core.cp0_entrylo0 = self.cp0_entrylo0; + exec.core.cp0_entrylo1 = self.cp0_entrylo1; + exec.core.cp0_pagemask = self.cp0_pagemask; + exec.core.nanotlb = self.nanotlb; + exec.in_delay_slot = self.in_delay_slot; + exec.delay_slot_target = self.delay_slot_target; + exec.cached_pending = self.cached_pending; + exec.interrupt_check_counter = self.interrupt_check_counter; + exec.tlb.restore_from_mips_tlb(&self.tlb); + } + + /// Compare GPRs between snapshot and current state. + /// Returns bitmask of register indices that differ (bit i set = gpr[i] changed). + pub fn compare_gprs(&self, exec: &MipsExecutor) -> u32 { + let mut mask = 0u32; + for i in 0..32 { + if self.gpr[i] != exec.core.gpr[i] { + mask |= 1u32 << i; + } + } + mask + } +} diff --git a/src/mips_tlb.rs b/src/mips_tlb.rs index a7f2653..e769ffa 100644 --- a/src/mips_tlb.rs +++ b/src/mips_tlb.rs @@ -191,6 +191,14 @@ pub trait Tlb { fn power_on(&mut self) {} fn save_state(&self) -> toml::Value { toml::Value::Table(Default::default()) } fn load_state(&mut self, _v: &toml::Value) -> Result<(), String> { Ok(()) } + + /// Attempt to clone this TLB as a concrete `MipsTlb`. + /// Returns `None` for implementations that are not `MipsTlb` (e.g. `PassthroughTlb`). + fn clone_as_mips_tlb(&self) -> Option { None } + + /// Restore TLB state from a `MipsTlb` snapshot (used by JIT rollback). + /// Default no-op for implementations that don't support rollback. 
+ fn restore_from_mips_tlb(&mut self, _src: &MipsTlb) {} } /// Sentinel: end of MRU list. @@ -499,6 +507,10 @@ impl Tlb for MipsTlb { } Ok(()) } + + fn clone_as_mips_tlb(&self) -> Option { Some(self.clone()) } + + fn restore_from_mips_tlb(&mut self, src: &MipsTlb) { *self = src.clone(); } } /// Passthrough TLB implementation for testing diff --git a/src/rex3.rs b/src/rex3.rs index f3ea517..a322150 100644 --- a/src/rex3.rs +++ b/src/rex3.rs @@ -1,5 +1,5 @@ use std::sync::Arc; -use parking_lot::Mutex; +use parking_lot::{Mutex, Condvar}; use spin::Mutex as SpinMutex; use std::sync::atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering}; use std::thread; @@ -878,6 +878,9 @@ pub struct Rex3 { pub host_count: UnsafeCell, pub gfifo_producer: SpinMutex>, pub gfifo_consumer: SpinMutex>>, + /// Condvar used to wake the processor thread when work is pushed to the GFIFO. + gfifo_ready: Mutex, + gfifo_condvar: Condvar, pub vc2: Mutex, pub xmap0: Mutex, @@ -1007,6 +1010,8 @@ impl Rex3 { host_count: UnsafeCell::new(0), gfifo_producer: SpinMutex::new(producer), gfifo_consumer: SpinMutex::new(Some(consumer)), + gfifo_ready: Mutex::new(false), + gfifo_condvar: Condvar::new(), vc2: Mutex::new(Vc2::new()), xmap0: Mutex::new(Xmap9::new()), xmap1: Mutex::new(Xmap9::new()), @@ -2701,8 +2706,15 @@ impl Rex3 { // push and break write ordering for multi-entry sequences. let mut producer = self.gfifo_producer.lock(); loop { - if producer.push(entry).is_ok() { return; } - std::hint::spin_loop(); // be little bit nice at least + if producer.push(entry).is_ok() { + *self.gfifo_ready.lock() = true; + self.gfifo_condvar.notify_one(); + return; + } + // Buffer full — wake consumer to drain it, then retry. + *self.gfifo_ready.lock() = true; + self.gfifo_condvar.notify_one(); + std::hint::spin_loop(); } } @@ -2849,9 +2861,13 @@ impl Rex3 { self.gfifo_pending.fetch_sub(1, Ordering::SeqCst); } else { - // Spin-wait for more entries. - // A condvar would be better, but this matches the user request. 
- std::hint::spin_loop(); + // Wait for the producer to push work. The condvar is notified by + // gfifo_push on every successful push and on buffer-full retries. + let mut ready = self.gfifo_ready.lock(); + while !*ready { + self.gfifo_condvar.wait(&mut ready); + } + *ready = false; } } consumer From 2beb419e60599f134d4bd76bbda028b6fd253030 Mon Sep 17 00:00:00 2001 From: Eric Dodd Date: Mon, 6 Apr 2026 09:30:43 -0400 Subject: [PATCH 5/5] fully working jit now --- iris.toml | 2 +- jit-diag.sh | 21 ++- src/jit/cache.rs | 21 +++ src/jit/compiler.rs | 51 +++++- src/jit/dispatch.rs | 427 ++++++++++++++++++++++++++++++-------------- src/jit/helpers.rs | 39 +++- src/jit/snapshot.rs | 3 - src/main.rs | 18 +- 8 files changed, 417 insertions(+), 165 deletions(-) diff --git a/iris.toml b/iris.toml index ec1de59..1305f1f 100644 --- a/iris.toml +++ b/iris.toml @@ -15,7 +15,7 @@ prom = "prom.bin" # Window scale factor: 1 = native resolution, 2 = 2× for HiDPI/4K monitors. # Can also be set with the --2x command-line flag (CLI takes precedence). -scale = 2 +scale = 1 # RAM bank sizes in MB. # Each bank must be 0 (absent), 8, 16, 32, 64, or 128. 
diff --git a/jit-diag.sh b/jit-diag.sh index 92b4e6d..6934c34 100755 --- a/jit-diag.sh +++ b/jit-diag.sh @@ -7,30 +7,33 @@ # "interp" — pure interpreter (no JIT feature, baseline) # "perf" — perf profile, interpreter only (text report for analysis) # "perf-jit" — perf profile with JIT enabled +# +# All IRIS_JIT_* env vars are passed through automatically: +# IRIS_JIT_MAX_TIER=0 ./jit-diag.sh jit +# IRIS_JIT_PROBE=500 IRIS_JIT_PROBE_MIN=100 ./jit-diag.sh jit MODE="${1:-jit}" -# IRIS_JIT_MAX_TIER from environment (0=Alu, 1=Loads, 2=Full, unset=Full) -TIER_ENV="" -if [ -n "$IRIS_JIT_MAX_TIER" ]; then - TIER_ENV="IRIS_JIT_MAX_TIER=$IRIS_JIT_MAX_TIER" -fi OUTFILE="jit-diag-$(date +%Y%m%d-%H%M%S)-${MODE}.log" +# Collect all IRIS_JIT_* env vars for display and passthrough +JIT_VARS=$(env | grep '^IRIS_JIT_' | tr '\n' ' ') + echo "=== IRIS JIT Diagnostic ===" | tee "$OUTFILE" echo "Mode: $MODE" | tee -a "$OUTFILE" echo "Date: $(date)" | tee -a "$OUTFILE" echo "Host: $(uname -m) $(uname -s) $(uname -r)" | tee -a "$OUTFILE" echo "Rust: $(rustc --version)" | tee -a "$OUTFILE" +[ -n "$JIT_VARS" ] && echo "Env: $JIT_VARS" | tee -a "$OUTFILE" echo "" | tee -a "$OUTFILE" case "$MODE" in jit) - echo "Running: IRIS_JIT=1 $TIER_ENV cargo run --release --features jit,lightning" | tee -a "$OUTFILE" - IRIS_JIT=1 $TIER_ENV cargo run --release --features jit,lightning 2>&1 | tee -a "$OUTFILE" + echo "Running: IRIS_JIT=1 ${JIT_VARS}cargo run --release --features jit,lightning" | tee -a "$OUTFILE" + IRIS_JIT=1 cargo run --release --features jit,lightning 2>&1 | tee -a "$OUTFILE" ;; verify) - echo "Running: IRIS_JIT=1 IRIS_JIT_VERIFY=1 $TIER_ENV cargo run --release --features jit,lightning" | tee -a "$OUTFILE" - IRIS_JIT=1 IRIS_JIT_VERIFY=1 $TIER_ENV cargo run --release --features jit,lightning 2>&1 | tee -a "$OUTFILE" + echo "Running: IRIS_JIT=1 IRIS_JIT_VERIFY=1 ${JIT_VARS}cargo run --release --features jit,lightning" | tee -a "$OUTFILE" + IRIS_JIT=1 IRIS_JIT_VERIFY=1 cargo run --release 
--features jit,lightning 2>&1 | tee -a "$OUTFILE" ;; nojit) echo "Running: cargo run --release --features jit,lightning (no IRIS_JIT)" | tee -a "$OUTFILE" diff --git a/src/jit/cache.rs b/src/jit/cache.rs index fff6ca3..9f7eb23 100644 --- a/src/jit/cache.rs +++ b/src/jit/cache.rs @@ -27,10 +27,31 @@ impl BlockTier { } } +// Defaults; overridden by IRIS_JIT_STABLE / IRIS_JIT_PROMOTE / IRIS_JIT_DEMOTE env vars. pub const TIER_STABLE_THRESHOLD: u32 = 50; // consecutive clean exits → trusted pub const TIER_PROMOTE_THRESHOLD: u32 = 200; // trusted clean exits → try next tier pub const TIER_DEMOTE_THRESHOLD: u32 = 3; // exceptions in trial period → demote +/// Runtime-configurable tier thresholds. Reads env vars once at init. +pub struct TierConfig { + pub stable: u32, + pub promote: u32, + pub demote: u32, +} + +impl TierConfig { + pub fn from_env() -> Self { + Self { + stable: std::env::var("IRIS_JIT_STABLE").ok() + .and_then(|v| v.parse().ok()).unwrap_or(TIER_STABLE_THRESHOLD), + promote: std::env::var("IRIS_JIT_PROMOTE").ok() + .and_then(|v| v.parse().ok()).unwrap_or(TIER_PROMOTE_THRESHOLD), + demote: std::env::var("IRIS_JIT_DEMOTE").ok() + .and_then(|v| v.parse().ok()).unwrap_or(TIER_DEMOTE_THRESHOLD), + } + } +} + /// A compiled native code block. pub struct CompiledBlock { /// Function pointer to compiled native code. 
diff --git a/src/jit/compiler.rs b/src/jit/compiler.rs index aa7f36a..cb5461b 100644 --- a/src/jit/compiler.rs +++ b/src/jit/compiler.rs @@ -29,6 +29,7 @@ pub struct BlockCompiler { fn_write_u16: FuncId, fn_write_u32: FuncId, fn_write_u64: FuncId, + fn_interp_step: FuncId, } impl BlockCompiler { @@ -51,6 +52,7 @@ impl BlockCompiler { jit_builder.symbol("jit_write_u16", helpers.write_u16); jit_builder.symbol("jit_write_u32", helpers.write_u32); jit_builder.symbol("jit_write_u64", helpers.write_u64); + jit_builder.symbol("jit_interp_step", helpers.interp_step); let mut jit_module = JITModule::new(jit_builder); @@ -81,6 +83,13 @@ impl BlockCompiler { let fn_write_u32 = jit_module.declare_function("jit_write_u32", Linkage::Import, &write_sig).unwrap(); let fn_write_u64 = jit_module.declare_function("jit_write_u64", Linkage::Import, &write_sig).unwrap(); + // interp_step(ctx_ptr, exec_ptr) -> u64 + let mut step_sig = jit_module.make_signature(); + step_sig.params.push(AbiParam::new(ptr_type)); // ctx_ptr + step_sig.params.push(AbiParam::new(ptr_type)); // exec_ptr + step_sig.returns.push(AbiParam::new(types::I64)); + let fn_interp_step = jit_module.declare_function("jit_interp_step", Linkage::Import, &step_sig).unwrap(); + Self { ctx: jit_module.make_context(), jit_module, @@ -88,6 +97,7 @@ impl BlockCompiler { func_id_counter: 0, fn_read_u8, fn_read_u16, fn_read_u32, fn_read_u64, fn_write_u8, fn_write_u16, fn_write_u32, fn_write_u64, + fn_interp_step, } } @@ -146,6 +156,7 @@ impl BlockCompiler { write_u16: self.jit_module.declare_func_in_func(self.fn_write_u16, &mut builder.func), write_u32: self.jit_module.declare_func_in_func(self.fn_write_u32, &mut builder.func), write_u64: self.jit_module.declare_func_in_func(self.fn_write_u64, &mut builder.func), + interp_step: self.jit_module.declare_func_in_func(self.fn_interp_step, &mut builder.func), }; // Load GPRs 1-31 from JitContext (gpr[0] is always 0) @@ -191,8 +202,36 @@ impl BlockCompiler { &mut builder, ctx_ptr, 
exec_ptr, &helpers, &mut gpr, &mut hi, &mut lo, &mut modified_gprs, delay_d, delay_pc, tier, ); - if matches!(delay_result, EmitResult::Ok) { - compiled_count += 1; + match delay_result { + EmitResult::Ok => { compiled_count += 1; } + EmitResult::Stop => { + // Delay slot can't be compiled at this tier — interpreter fallback. + // Flush all modified GPRs to ctx so interpreter sees current state. + flush_modified_gprs(&mut builder, &gpr, ctx_ptr, &mut modified_gprs); + builder.ins().store(mem, hi, ctx_ptr, + ir::immediates::Offset32::new(JitContext::hi_offset())); + builder.ins().store(mem, lo, ctx_ptr, + ir::immediates::Offset32::new(JitContext::lo_offset())); + // Store delay slot PC so interpreter executes the right instruction + let delay_pc_val = builder.ins().iconst(types::I64, delay_pc as i64); + builder.ins().store(mem, delay_pc_val, ctx_ptr, + ir::immediates::Offset32::new(JitContext::pc_offset())); + // Call interpreter: syncs ctx→exec, step(), syncs exec→ctx + builder.ins().call(helpers.interp_step, &[ctx_ptr, exec_ptr]); + // Reload GPRs from ctx (interpreter may have modified any register) + for i in 1..32usize { + gpr[i] = builder.ins().load( + types::I64, mem, ctx_ptr, + ir::immediates::Offset32::new(JitContext::gpr_offset(i)), + ); + } + hi = builder.ins().load(types::I64, mem, ctx_ptr, + ir::immediates::Offset32::new(JitContext::hi_offset())); + lo = builder.ins().load(types::I64, mem, ctx_ptr, + ir::immediates::Offset32::new(JitContext::lo_offset())); + compiled_count += 1; + } + _ => {} // Branch in delay slot — shouldn't happen } } branch_exit_pc = Some(target_val); @@ -257,7 +296,12 @@ impl BlockCompiler { len_mips: compiled_count, len_native: code_size, tier, - speculative: true, + // Full-tier blocks contain stores that modify memory. Speculative + // rollback restores CPU/TLB state but NOT memory, so read-modify-write + // sequences get double-applied on rollback. 
Non-speculative blocks skip + // snapshot/rollback — on exception, the store emitter's flushed GPRs and + // faulting PC (already in executor via sync_to) are used directly. + speculative: tier != BlockTier::Full, hit_count: 0, exception_count: 0, stable_hits: 0, @@ -269,6 +313,7 @@ impl BlockCompiler { struct EmitHelpers { read_u8: FuncRef, read_u16: FuncRef, read_u32: FuncRef, read_u64: FuncRef, write_u8: FuncRef, write_u16: FuncRef, write_u32: FuncRef, write_u64: FuncRef, + interp_step: FuncRef, } /// Result of emitting a single instruction. diff --git a/src/jit/dispatch.rs b/src/jit/dispatch.rs index 5bd24ae..65fd205 100644 --- a/src/jit/dispatch.rs +++ b/src/jit/dispatch.rs @@ -1,10 +1,11 @@ //! Adaptive JIT dispatch loop with tiered compilation and speculative execution. //! -//! Every block starts at Tier 0 (ALU only — safe by construction). Hot blocks -//! are promoted through tiers as they prove stable. If a speculative block -//! misbehaves, CPU state is rolled back from a pre-block snapshot and the block -//! is demoted. Blocks that prove stable graduate to trusted execution with zero -//! snapshot overhead. +//! Interpreter-first architecture: the interpreter runs in short bursts, with +//! cache probes after each burst. One JIT block per probe, then back to interpreter. +//! Blocks start at Tier 0 (ALU only) and earn promotion through stable execution. +//! +//! The probe interval adapts dynamically: frequent cache hits → shorter interval +//! (probe more often), frequent misses → longer interval (less overhead). 
use std::sync::atomic::{AtomicBool, Ordering}; @@ -12,7 +13,7 @@ use crate::mips_exec::{MipsExecutor, DecodedInstr, EXEC_BREAKPOINT, decode_into} use crate::mips_tlb::{Tlb, AccessType}; use crate::mips_cache_v2::MipsCache; -use super::cache::{BlockTier, CodeCache, TIER_STABLE_THRESHOLD, TIER_PROMOTE_THRESHOLD, TIER_DEMOTE_THRESHOLD}; +use super::cache::{BlockTier, CodeCache, TierConfig}; use super::compiler::BlockCompiler; use super::context::{JitContext, EXIT_NORMAL, EXIT_EXCEPTION}; use super::helpers::HelperPtrs; @@ -21,12 +22,97 @@ use super::snapshot::CpuRollbackSnapshot; const MAX_BLOCK_LEN: usize = 64; -/// How many interpreter steps between cache probes within a batch. -const PROBE_INTERVAL: u32 = 1000; - -/// How many interpreter steps in one outer batch. +/// How many interpreter steps in one outer batch (controls flush_cycles frequency). const BATCH_SIZE: u32 = 10000; +/// Adaptive probe interval controller. +/// +/// Asymmetric adjustment: hits pull the interval down aggressively (we want to +/// exploit hot code), misses push it up gently (don't overreact to cold regions). +/// Cache size applies steady downward pressure — more compiled blocks means shorter +/// intervals even when the instantaneous hit rate is low (min_interval is the floor). +struct ProbeController { + /// Current probe interval (interpreter steps between cache probes). + interval: u32, + /// Minimum allowed interval. + min_interval: u32, + /// Maximum allowed interval. + max_interval: u32, + /// Exponentially weighted hit rate (0..256 fixed-point, 256 = 100%). + ewma_hit_rate: u32, + /// Number of compiled blocks (updated externally). + cache_size: u32, + /// Simple LFSR for jitter (avoids lock-step with OS timers).
+ lfsr: u32, +} + +impl ProbeController { + fn new() -> Self { + let base = std::env::var("IRIS_JIT_PROBE").ok() + .and_then(|v| v.parse().ok()).unwrap_or(200u32); + let min = std::env::var("IRIS_JIT_PROBE_MIN").ok() + .and_then(|v| v.parse().ok()).unwrap_or(100u32); + let max = std::env::var("IRIS_JIT_PROBE_MAX").ok() + .and_then(|v| v.parse().ok()).unwrap_or(2000u32); + Self { + interval: base.clamp(min, max), + min_interval: min, + max_interval: max, + ewma_hit_rate: 0, + cache_size: 0, + lfsr: 0xACE1u32, + } + } + + /// Record a cache hit — aggressively pull interval down. + fn record_hit(&mut self) { + // EWMA with alpha ~1/8 for hits (fast response to hot code) + self.ewma_hit_rate = self.ewma_hit_rate - (self.ewma_hit_rate / 8) + 32; // +32 = 1/8 of 256 + + // Each hit immediately nudges interval down by ~3% + self.interval = (self.interval * 31 / 32).max(self.min_interval); + } + + /// Record a cache miss — gently push interval up. + fn record_miss(&mut self) { + // EWMA with alpha ~1/32 for misses (slow response, don't overreact) + self.ewma_hit_rate = self.ewma_hit_rate.saturating_sub(self.ewma_hit_rate / 32); + + // Misses push interval up by ~3% per event — same step size as the hit + // pull-down; the hit/miss asymmetry lives in the EWMA alphas above. + self.interval = (self.interval * 33 / 32).min(self.max_interval); + } + + /// Update cache size — applies downward pressure on the interval in next_interval(). + fn set_cache_size(&mut self, size: u32) { + self.cache_size = size; + } + + /// Get current interval with jitter, incorporating cache size pressure. + fn next_interval(&mut self) -> u32 { + // Cache size pressure: more blocks compiled → gently push interval down. + // Uses sqrt so 100 blocks change nothing and 1000 blocks cut it to ~a third, + // but never goes below min_interval.
+ // 100 blocks → factor 1.0 (no change), 1000 → 0.32, 10000 → 0.10, 50000 → 0.045 + let cache_factor = if self.cache_size > 100 { + 1.0f32 / (self.cache_size as f32 / 100.0).sqrt().max(1.0) + } else { + 1.0 + }; + let cache_adjusted = (self.interval as f32 * cache_factor) as u32; + let effective = cache_adjusted.clamp(self.min_interval, self.max_interval); + + // Galois LFSR for cheap pseudo-randomness + let bit = self.lfsr & 1; + self.lfsr >>= 1; + if bit != 0 { self.lfsr ^= 0xB400; } + + // Jitter: ~0.81x to ~1.14x using 3 bits of LFSR + let jitter_bits = (self.lfsr & 0x7) as u32; // 0-7 + let jittered = effective * (17 + jitter_bits) / 21; // range ~0.81x to ~1.14x + jittered.clamp(self.min_interval, self.max_interval) + } +} + pub fn run_jit_dispatch( exec: &mut MipsExecutor, running: &AtomicBool, @@ -39,11 +125,6 @@ pub fn run_jit_dispatch( return; } - // CRITICAL: Convert &mut to raw pointer. We must never hold &mut MipsExecutor - // across a JIT block call, because the JIT's memory helpers create their own - // &mut from the raw pointer. Two simultaneous &mut is UB, and with lto="fat" - // LLVM exploits the noalias guarantee to cache/hoist loads across the call, - // causing stale TLB/cache/CP0 state and kernel panics.
let exec_ptr: *mut MipsExecutor = exec as *mut _; // IRIS_JIT_MAX_TIER: cap the highest tier blocks can reach (0=Alu, 1=Loads, 2=Full) @@ -54,8 +135,11 @@ pub fn run_jit_dispatch( }; // IRIS_JIT_VERIFY=1: after each JIT block, re-run via interpreter and compare let verify_mode = std::env::var("IRIS_JIT_VERIFY").map(|v| v == "1").unwrap_or(false); - eprintln!("JIT: adaptive mode (max_tier={:?}, verify={}, probe every {} steps)", - max_tier, verify_mode, PROBE_INTERVAL); + let tier_cfg = TierConfig::from_env(); + let mut probe = ProbeController::new(); + eprintln!("JIT: adaptive mode (max_tier={:?}, verify={}, probe={} [{}-{}], stable={}, promote={}, demote={})", + max_tier, verify_mode, probe.interval, probe.min_interval, probe.max_interval, + tier_cfg.stable, tier_cfg.promote, tier_cfg.demote); let helpers = HelperPtrs::new::(); let mut compiler = BlockCompiler::new(&helpers); let mut cache = CodeCache::new(); @@ -75,10 +159,9 @@ pub fn run_jit_dispatch( let profile_entries = profile::load_profile(); let mut profile_compiled = 0u64; for entry in &profile_entries { - // Cap at max_tier let tier = if entry.tier > max_tier { max_tier } else { entry.tier }; if tier == BlockTier::Alu { - continue; // Alu blocks compile on first miss anyway + continue; } let instrs = trace_block(exec, entry.virt_pc, tier); if !instrs.is_empty() { @@ -99,23 +182,26 @@ pub fn run_jit_dispatch( let mut steps_in_batch: u32 = 0; while steps_in_batch < BATCH_SIZE { - // Interpreter batch — no JIT call happens here + let burst = probe.next_interval(); + + // Interpreter burst { let exec = unsafe { &mut *exec_ptr }; #[cfg(feature = "lightning")] - for _ in 0..PROBE_INTERVAL { + for _ in 0..burst { exec.step(); } #[cfg(not(feature = "lightning"))] - for _ in 0..PROBE_INTERVAL { + for _ in 0..burst { let status = exec.step(); if status == EXEC_BREAKPOINT { running.store(false, Ordering::SeqCst); break; } } - } // &mut exec dropped here - steps_in_batch += PROBE_INTERVAL; + } + steps_in_batch += 
burst; + total_interp_steps += burst as u64; if !running.load(Ordering::Relaxed) { break; } @@ -129,6 +215,7 @@ pub fn run_jit_dispatch( let in_prom = (pc32 >= 0x9FC00000 && pc32 < 0xA0000000) || (pc32 >= 0xBFC00000); let in_exc = pc32 >= 0x80000000 && pc32 < 0x80000400; if in_prom || in_exc || in_delay_slot { + probe.record_miss(); continue; } @@ -136,13 +223,12 @@ pub fn run_jit_dispatch( let exec = unsafe { &mut *exec_ptr }; match translate_pc(exec, pc) { Some(p) => p, - None => continue, + None => { probe.record_miss(); continue; } } }; - if cache.lookup(phys_pc).is_some() { - // Cache hit — execute compiled block. - let block = cache.lookup(phys_pc).unwrap(); + if let Some(block) = cache.lookup(phys_pc) { + probe.record_hit(); let block_len = block.len_mips; let block_tier = block.tier; let is_speculative = block.speculative; @@ -161,13 +247,13 @@ pub fn run_jit_dispatch( { let exec = unsafe { &mut *exec_ptr }; ctx.sync_from_executor(exec); - } // &mut dropped before JIT call + } ctx.exit_reason = 0; let entry: extern "C" fn(*mut JitContext) = unsafe { - std::mem::transmute(cache.lookup(phys_pc).unwrap().entry) + std::mem::transmute(block.entry) }; - entry(&mut ctx); // Helpers create their own &mut from exec_ptr + entry(&mut ctx); { let exec = unsafe { &mut *exec_ptr }; @@ -176,7 +262,6 @@ pub fn run_jit_dispatch( if ctx.exit_reason == EXIT_EXCEPTION { if let Some(snap) = &snapshot { if is_speculative { - // Speculative block hit an exception — roll back snap.restore(exec); rollbacks += 1; @@ -185,7 +270,7 @@ pub fn run_jit_dispatch( block.exception_count += 1; block.stable_hits = 0; - if block.exception_count >= TIER_DEMOTE_THRESHOLD { + if block.exception_count >= tier_cfg.demote { if let Some(lower) = block.tier.demote() { demotions += 1; eprintln!("JIT: demote {:016x} {:?}→{:?} ({}exc)", @@ -201,64 +286,92 @@ pub fn run_jit_dispatch( } } } else if verify_mode { - // Verify mode but not speculative — restore for verification snap.restore(exec); } } - // 
Interpreter handles the faulting instruction + // Advance cp0_count for instructions that executed before the fault. + // ctx.pc was set to the faulting instruction by the load/store emitter. + let instrs_before_fault = ctx.pc.wrapping_sub(pc) / 4; + if instrs_before_fault > 0 { + let advance = exec.core.count_step.wrapping_mul(instrs_before_fault); + let prev = exec.core.cp0_count; + exec.core.cp0_count = prev.wrapping_add(advance) & 0x0000_FFFF_FFFF_FFFF; + if exec.core.cp0_compare != 0 + && prev < exec.core.cp0_compare + && exec.core.cp0_count >= exec.core.cp0_compare + { + exec.core.cp0_cause |= crate::mips_core::CAUSE_IP7; + } + exec.local_cycles += instrs_before_fault; + } exec.step(); + total_interp_steps += 1; steps_in_batch += 1; - ctx.exit_reason = 0; - } else { - // Normal exit — verify if enabled - if verify_mode { - if let Some(snap) = &snapshot { - // Save JIT results - let jit_gpr = exec.core.gpr; - let jit_pc = exec.core.pc; - let jit_hi = exec.core.hi; - let jit_lo = exec.core.lo; - - // Restore pre-block state - snap.restore(exec); + continue; + } - // Run interpreter for the same number of instructions - for _ in 0..block_len { - exec.step(); - } + // Normal exit + if verify_mode { + if let Some(snap) = &snapshot { + let jit_gpr = exec.core.gpr; + let jit_pc = exec.core.pc; + let jit_hi = exec.core.hi; + let jit_lo = exec.core.lo; - // Compare - let interp_gpr = exec.core.gpr; - let interp_pc = exec.core.pc; - let interp_hi = exec.core.hi; - let interp_lo = exec.core.lo; - - let mut mismatch = false; - for i in 0..32 { - if jit_gpr[i] != interp_gpr[i] { - eprintln!("JIT VERIFY FAIL at {:016x} (tier={:?}, len={}): gpr[{}] jit={:016x} interp={:016x}", - pc, block_tier, block_len, i, jit_gpr[i], interp_gpr[i]); - mismatch = true; - } - } - if jit_pc != interp_pc { - eprintln!("JIT VERIFY FAIL at {:016x}: pc jit={:016x} interp={:016x}", - pc, jit_pc, interp_pc); - mismatch = true; - } - if jit_hi != interp_hi { - eprintln!("JIT VERIFY FAIL at 
{:016x}: hi jit={:016x} interp={:016x}", - pc, jit_hi, interp_hi); - mismatch = true; - } - if jit_lo != interp_lo { - eprintln!("JIT VERIFY FAIL at {:016x}: lo jit={:016x} interp={:016x}", - pc, jit_lo, interp_lo); + snap.restore(exec); + for _ in 0..block_len { + exec.step(); + } + + let interp_gpr = exec.core.gpr; + let interp_pc = exec.core.pc; + let interp_hi = exec.core.hi; + let interp_lo = exec.core.lo; + + let mut mismatch = false; + for i in 0..32 { + if jit_gpr[i] != interp_gpr[i] { + eprintln!("JIT VERIFY FAIL at {:016x} (tier={:?}, len={}): gpr[{}] jit={:016x} interp={:016x}", + pc, block_tier, block_len, i, jit_gpr[i], interp_gpr[i]); mismatch = true; } + } + if jit_pc != interp_pc { + eprintln!("JIT VERIFY FAIL at {:016x}: pc jit={:016x} interp={:016x}", + pc, jit_pc, interp_pc); + mismatch = true; + } + if jit_hi != interp_hi { + eprintln!("JIT VERIFY FAIL at {:016x}: hi jit={:016x} interp={:016x}", + pc, jit_hi, interp_hi); + mismatch = true; + } + if jit_lo != interp_lo { + eprintln!("JIT VERIFY FAIL at {:016x}: lo jit={:016x} interp={:016x}", + pc, jit_lo, interp_lo); + mismatch = true; + } - if mismatch { - // Dump the block instructions + if mismatch { + // Check if this is a timing false positive: + // interpreter took an exception (PC in exception vectors) + // while JIT didn't. This happens because the interpreter + // re-run occurs at a different wall-clock time and sees + // different external interrupt state via the atomic. + let interp_pc32 = interp_pc as u32; + let interp_in_exc = (interp_pc32 >= 0x80000000 && interp_pc32 < 0x80000400) + || interp_pc32 == 0x80000180; // general exception vector + let jit_pc32 = jit_pc as u32; + let jit_not_exc = jit_pc32 < 0x80000000 || jit_pc32 >= 0x80000400; + + if interp_in_exc && jit_not_exc { + // Timing false positive — interpreter took an interrupt + // the JIT didn't see. Don't invalidate the block. + // Use the interpreter's result (it's authoritative). 
+ eprintln!("JIT VERIFY: timing false positive at {:016x} (interp took exception to {:016x}), keeping block", + pc, interp_pc); + } else { + // Real codegen mismatch — dump and invalidate let instrs = trace_block(exec, pc, block_tier); eprintln!("JIT VERIFY: block at {:016x} ({} instrs):", pc, instrs.len()); for (idx, (raw, d)) in instrs.iter().enumerate() { @@ -266,61 +379,86 @@ pub fn run_jit_dispatch( eprintln!(" {:016x}: {:08x} op={} rs={} rt={} rd={} funct={} imm={:04x}", ipc, raw, d.op, d.rs, d.rt, d.rd, d.funct, d.imm as u16); } - // Leave interpreter state (correct) in place - steps_in_batch += block_len; - total_jit_instrs += block_len as u64; - // Invalidate this block so we don't keep hitting it cache.invalidate_range(phys_pc, phys_pc + 4); - continue; } - // Verification passed — interpreter state is already correct - // (we ran the interpreter, so state is authoritative) + total_jit_instrs += block_len as u64; + continue; } } + } - // Update stats and check for promotion - if let Some(block) = cache.lookup_mut(phys_pc) { - block.hit_count += 1; - block.stable_hits += 1; - block.exception_count = 0; - - if block.speculative && block.stable_hits >= TIER_STABLE_THRESHOLD { - block.speculative = false; - } - - if !block.speculative && block.stable_hits >= TIER_PROMOTE_THRESHOLD { - if let Some(next) = block.tier.promote().filter(|t| *t <= max_tier) { - promotions += 1; - eprintln!("JIT: promote {:016x} {:?}→{:?} ({}hits)", - pc, block.tier, next, block.hit_count); - recompile_block_at_tier( - &mut compiler, &mut cache, exec, - phys_pc, pc, next, - &mut blocks_compiled, - ); + // Advance cp0_count and check interrupts for the N instructions + // the JIT block executed. The interpreter's step() does this per- + // instruction; we must do it in bulk here or timing drifts and + // the kernel panics from missed timer interrupts. 
+ { + let n = block_len as u64; + // Advance cp0_count by block_len * count_step + let count_advance = exec.core.count_step.wrapping_mul(n); + let prev = exec.core.cp0_count; + exec.core.cp0_count = prev.wrapping_add(count_advance) & 0x0000_FFFF_FFFF_FFFF; + if exec.core.cp0_compare != 0 + && prev < exec.core.cp0_compare + && exec.core.cp0_count >= exec.core.cp0_compare + { + exec.core.cp0_cause |= crate::mips_core::CAUSE_IP7; + } + // Credit local_cycles so the stats display shows correct MHz + exec.local_cycles += n; + + // Check for pending interrupts — JIT blocks don't check per- + // instruction like the interpreter does. If an external interrupt + // arrived during the block, service it now via one interpreter step. + let pending = exec.core.interrupts.load(Ordering::Relaxed); + if (pending | exec.core.cp0_cause as u64) != 0 { + // Merge external IP bits (IP2-IP6) into Cause (same as step() does) + use crate::mips_core::{CAUSE_IP2, CAUSE_IP3, CAUSE_IP4, CAUSE_IP5, CAUSE_IP6}; + let ext_mask = CAUSE_IP2 | CAUSE_IP3 | CAUSE_IP4 | CAUSE_IP5 | CAUSE_IP6; + exec.core.cp0_cause = (exec.core.cp0_cause & !ext_mask) + | (pending as u32 & ext_mask); + if exec.core.interrupts_enabled() { + let ip = exec.core.cp0_cause & crate::mips_core::CAUSE_IP_MASK; + let im = exec.core.cp0_status & crate::mips_core::STATUS_IM_MASK; + if (ip & im) != 0 { + // Pending unmasked interrupt — let the interpreter handle it + exec.step(); + total_interp_steps += 1; + steps_in_batch += 1; } } } + } - // Advance cp0_count per-instruction - if !verify_mode { - // In verify mode, interpreter already advanced these - for _ in 0..block_len { - let prev = exec.core.cp0_count; - exec.core.cp0_count = prev.wrapping_add(exec.core.count_step) & 0x0000_FFFF_FFFF_FFFF; - if exec.core.cp0_compare != 0 && prev < exec.core.cp0_compare && exec.core.cp0_count >= exec.core.cp0_compare { - exec.core.cp0_cause |= crate::mips_core::CAUSE_IP7; - exec.core.fasttick_count.fetch_add(1, Ordering::Relaxed); - } + // 
Update stats and check for promotion + if let Some(block) = cache.lookup_mut(phys_pc) { + block.hit_count += 1; + block.stable_hits += 1; + block.exception_count = 0; + + if block.speculative && block.stable_hits >= tier_cfg.stable { + block.speculative = false; + } + + if !block.speculative && block.stable_hits >= tier_cfg.promote { + if let Some(next) = block.tier.promote().filter(|t| *t <= max_tier) { + promotions += 1; + eprintln!("JIT: promote {:016x} {:?}→{:?} ({}hits)", + pc, block.tier, next, block.hit_count); + recompile_block_at_tier( + &mut compiler, &mut cache, exec, + phys_pc, pc, next, + &mut blocks_compiled, + ); } } - exec.local_cycles += block_len as u64; - steps_in_batch += block_len; - total_jit_instrs += block_len as u64; } - } // &mut dropped + + total_jit_instrs += block_len as u64; + steps_in_batch += block_len; + } } else { - // Cache miss — compile at lowest tier (safe by construction) + probe.record_miss(); + // Cache miss — compile at Alu tier let exec = unsafe { &mut *exec_ptr }; let instrs = trace_block(exec, pc, BlockTier::Alu); if !instrs.is_empty() { @@ -328,6 +466,7 @@ pub fn run_jit_dispatch( block.phys_addr = phys_pc; cache.insert(phys_pc, block); blocks_compiled += 1; + probe.set_cache_size(cache.len() as u32); if blocks_compiled <= 10 || blocks_compiled % 500 == 0 { eprintln!("JIT: compiled #{} at {:016x} ({} instrs, tier=Alu, cache={})", blocks_compiled, pc, instrs.len(), cache.len()); @@ -341,14 +480,20 @@ pub fn run_jit_dispatch( let exec = unsafe { &mut *exec_ptr }; exec.flush_cycles(); } - total_interp_steps += steps_in_batch as u64; - if total_interp_steps % 10000000 < BATCH_SIZE as u64 { + let total = total_interp_steps + total_jit_instrs; + if total % 10000000 < BATCH_SIZE as u64 { let exec = unsafe { &*exec_ptr }; - eprintln!("JIT: {} steps, {} JIT instrs ({:.1}%), {} blocks, {}↑ {}↓ {}⟲, pc={:016x}", - total_interp_steps, total_jit_instrs, - if total_interp_steps > 0 { total_jit_instrs as f64 / total_interp_steps as 
f64 * 100.0 } else { 0.0 }, - blocks_compiled, promotions, demotions, rollbacks, exec.core.pc); + let jit_pct = if total > 0 { total_jit_instrs as f64 / total as f64 * 100.0 } else { 0.0 }; + let effective_probe = { + let cf = if probe.cache_size > 100 { + 1.0f32 / (probe.cache_size as f32 / 100.0).sqrt().max(1.0) + } else { 1.0 }; + ((probe.interval as f32 * cf) as u32).clamp(probe.min_interval, probe.max_interval) + }; + eprintln!("JIT: {} total ({:.1}% jit), {} blocks, {}↑ {}↓ {}⟲, probe={}(eff {}), pc={:016x}", + total, jit_pct, blocks_compiled, promotions, demotions, rollbacks, + probe.interval, effective_probe, exec.core.pc); } } @@ -356,10 +501,11 @@ pub fn run_jit_dispatch( let exec = unsafe { &mut *exec_ptr }; exec.flush_cycles(); } - eprintln!("JIT: shutdown. {} blocks, {} JIT instrs / {} total ({:.1}%), {}↑ {}↓ {}⟲", - blocks_compiled, total_jit_instrs, total_interp_steps, - if total_interp_steps > 0 { total_jit_instrs as f64 / total_interp_steps as f64 * 100.0 } else { 0.0 }, - promotions, demotions, rollbacks); + let total = total_interp_steps + total_jit_instrs; + let jit_pct = if total > 0 { total_jit_instrs as f64 / total as f64 * 100.0 } else { 0.0 }; + eprintln!("JIT: shutdown. {} blocks, {} jit / {} interp / {} total ({:.1}% jit), {}↑ {}↓ {}⟲, final_probe={}", + blocks_compiled, total_jit_instrs, total_interp_steps, total, + jit_pct, promotions, demotions, rollbacks, probe.interval); // Save profile: all blocks above Alu tier let profile_entries: Vec = cache.iter() @@ -448,8 +594,17 @@ fn trace_block( if !is_compilable_for_tier(&d, tier) { break; } let is_branch = is_branch_or_jump(&d); + // Terminate Full-tier blocks after each store to keep blocks short. + // Long blocks with multiple load/store helper calls create complex CFG + // (ok_block/exc_block diamonds) that triggers Cranelift regalloc2 issues + // on x86_64, causing rare but fatal codegen corruption. 
+ let is_store = tier == BlockTier::Full && is_compilable_store(&d); instrs.push((raw, d)); + if is_store { + break; + } + if is_branch { pc = pc.wrapping_add(4); let mut delay_ok = false; @@ -457,7 +612,11 @@ fn trace_block( let mut delay_d = DecodedInstr::default(); delay_d.raw = delay_raw; decode_into::(&mut delay_d); - if is_compilable_for_tier(&delay_d, tier) { + // Exclude stores from delay slots: if the delay slot faults, + // the JIT exception path loses delay-slot context (sync_to clears + // in_delay_slot), so handle_exception sets wrong cp0_epc/BD bit, + // and on ERET the branch is permanently skipped → crash. + if is_compilable_for_tier(&delay_d, tier) && !is_compilable_store(&delay_d) { instrs.push((delay_raw, delay_d)); delay_ok = true; } diff --git a/src/jit/helpers.rs b/src/jit/helpers.rs index 81f1673..2c4f9f2 100644 --- a/src/jit/helpers.rs +++ b/src/jit/helpers.rs @@ -6,7 +6,7 @@ //! optimizations that cause stale reads. use super::context::{JitContext, EXIT_EXCEPTION}; -use crate::mips_exec::{MipsExecutor, MemAccessSize, EXEC_COMPLETE}; +use crate::mips_exec::{MipsExecutor, EXEC_COMPLETE}; use crate::mips_tlb::Tlb; use crate::mips_cache_v2::MipsCache; @@ -29,7 +29,7 @@ pub extern "C" fn jit_read_u8( ) -> u64 { let exec = unsafe { &mut *opaque_exec::(exec_ptr) }; let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) }; - match exec.read_data(virt_addr, MemAccessSize::Byte) { + match exec.read_data::<1>(virt_addr) { Ok(value) => value, Err(status) => { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; 0 } } @@ -40,7 +40,7 @@ pub extern "C" fn jit_read_u16( ) -> u64 { let exec = unsafe { &mut *opaque_exec::(exec_ptr) }; let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) }; - match exec.read_data(virt_addr, MemAccessSize::Half) { + match exec.read_data::<2>(virt_addr) { Ok(value) => value, Err(status) => { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; 0 } } @@ -51,7 +51,7 @@ pub extern "C" fn jit_read_u32( ) -> u64 { let exec = 
unsafe { &mut *opaque_exec::(exec_ptr) }; let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) }; - match exec.read_data(virt_addr, MemAccessSize::Word) { + match exec.read_data::<4>(virt_addr) { Ok(value) => value, Err(status) => { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; 0 } } @@ -62,7 +62,7 @@ pub extern "C" fn jit_read_u64( ) -> u64 { let exec = unsafe { &mut *opaque_exec::(exec_ptr) }; let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) }; - match exec.read_data(virt_addr, MemAccessSize::Double) { + match exec.read_data::<8>(virt_addr) { Ok(value) => value, Err(status) => { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; 0 } } @@ -75,7 +75,7 @@ pub extern "C" fn jit_write_u8( ) -> u64 { let exec = unsafe { &mut *opaque_exec::(exec_ptr) }; let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) }; - let status = exec.write_data(virt_addr, value, MemAccessSize::Byte, 0xFF); + let status = exec.write_data::<1>(virt_addr, value); if status != EXEC_COMPLETE { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; } 0 } @@ -85,7 +85,7 @@ pub extern "C" fn jit_write_u16( ) -> u64 { let exec = unsafe { &mut *opaque_exec::(exec_ptr) }; let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) }; - let status = exec.write_data(virt_addr, value, MemAccessSize::Half, 0xFFFF); + let status = exec.write_data::<2>(virt_addr, value); if status != EXEC_COMPLETE { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; } 0 } @@ -95,7 +95,7 @@ pub extern "C" fn jit_write_u32( ) -> u64 { let exec = unsafe { &mut *opaque_exec::(exec_ptr) }; let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) }; - let status = exec.write_data(virt_addr, value, MemAccessSize::Word, 0xFFFF_FFFF); + let status = exec.write_data::<4>(virt_addr, value); if status != EXEC_COMPLETE { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; } 0 } @@ -105,11 +105,30 @@ pub extern "C" fn jit_write_u64( ) -> u64 { let exec = unsafe { &mut *opaque_exec::(exec_ptr) }; let ctx = unsafe { 
&mut *opaque_ctx(ctx_ptr) }; - let status = exec.write_data(virt_addr, value, MemAccessSize::Double, 0xFFFF_FFFF_FFFF_FFFF); + let status = exec.write_data::<8>(virt_addr, value); if status != EXEC_COMPLETE { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; } 0 } +// ─── Interpreter fallback ──────────────────────────────────────────────────── + +/// Execute one interpreter step for a delay slot that can't be compiled at +/// the current JIT tier. The caller (JIT block) has already flushed modified +/// GPRs and set ctx.pc to the delay slot PC. This function: +/// 1. Syncs JitContext → executor (so interpreter sees JIT's register state) +/// 2. Calls exec.step() (executes the instruction + full bookkeeping) +/// 3. Syncs executor → JitContext (so JIT sees the result, e.g. loaded value) +pub extern "C" fn jit_interp_one_step( + ctx_ptr: *mut JitContext, exec_ptr: *mut u8, +) -> u64 { + let exec = unsafe { &mut *opaque_exec::(exec_ptr) }; + let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) }; + ctx.sync_to_executor(exec); + exec.step(); + ctx.sync_from_executor(exec); + 0 +} + /// Collection of monomorphized helper function pointers. 
pub struct HelperPtrs { pub read_u8: *const u8, @@ -120,6 +139,7 @@ pub struct HelperPtrs { pub write_u16: *const u8, pub write_u32: *const u8, pub write_u64: *const u8, + pub interp_step: *const u8, } impl HelperPtrs { @@ -133,6 +153,7 @@ impl HelperPtrs { write_u16: jit_write_u16:: as *const u8, write_u32: jit_write_u32:: as *const u8, write_u64: jit_write_u64:: as *const u8, + interp_step: jit_interp_one_step:: as *const u8, } } } diff --git a/src/jit/snapshot.rs b/src/jit/snapshot.rs index 7b8b448..03af56c 100644 --- a/src/jit/snapshot.rs +++ b/src/jit/snapshot.rs @@ -30,7 +30,6 @@ pub struct CpuRollbackSnapshot { pub in_delay_slot: bool, pub delay_slot_target: u64, pub cached_pending: u64, - pub interrupt_check_counter: u8, pub tlb: MipsTlb, } @@ -59,7 +58,6 @@ impl CpuRollbackSnapshot { in_delay_slot: exec.in_delay_slot, delay_slot_target: exec.delay_slot_target, cached_pending: exec.cached_pending, - interrupt_check_counter: exec.interrupt_check_counter, tlb, } } @@ -86,7 +84,6 @@ impl CpuRollbackSnapshot { exec.in_delay_slot = self.in_delay_slot; exec.delay_slot_target = self.delay_slot_target; exec.cached_pending = self.cached_pending; - exec.interrupt_check_counter = self.interrupt_check_counter; exec.tlb.restore_from_mips_tlb(&self.tlb); } diff --git a/src/main.rs b/src/main.rs index 048be6f..7d437f8 100644 --- a/src/main.rs +++ b/src/main.rs @@ -6,7 +6,7 @@ fn main() { let headless = cfg.headless; // Start unfsd before the machine so NFS is ready when IRIX boots. - let nfs_proc = cfg.nfs.as_ref().map(|nfs| start_unfsd(nfs)); + let nfs_proc = cfg.nfs.as_ref().and_then(|nfs| start_unfsd(nfs)); // Machine::new() allocates >1MB on the stack (Physical device_map), which overflows // the default stack on Windows (1MB). We spawn a thread with a larger stack to create it. 
@@ -80,7 +80,7 @@ impl UnfsdProc { } } -fn start_unfsd(nfs: &NfsConfig) -> UnfsdProc { +fn start_unfsd(nfs: &NfsConfig) -> Option { use std::io::Write as _; // NFS requires an absolute path in the exports file. @@ -98,7 +98,7 @@ fn start_unfsd(nfs: &NfsConfig) -> UnfsdProc { let pid_path = std::env::temp_dir().join("iris_nfs.pid"); - let child = std::process::Command::new(&nfs.unfsd) + let child = match std::process::Command::new(&nfs.unfsd) .arg("-u") // don't require root .arg("-p") // don't register with host portmap .arg("-3") // truncate fileid/cookie to 32 bits (IRIX compat) @@ -108,7 +108,13 @@ fn start_unfsd(nfs: &NfsConfig) -> UnfsdProc { .arg("-e").arg(&exports_path) .arg("-i").arg(&pid_path) .spawn() - .unwrap_or_else(|e| panic!("failed to start unfsd '{}': {}", nfs.unfsd, e)); + { + Ok(child) => child, + Err(e) => { + eprintln!("iris: warning: failed to start unfsd '{}': {} (NFS sharing disabled)", nfs.unfsd, e); + return None; + } + }; eprintln!("iris: unfsd started (pid {}) nfs=127.0.0.1:{} mountd=127.0.0.1:{} dir={}", child.id(), nfs.nfs_host_port, nfs.mountd_host_port, abs_dir.display()); @@ -120,8 +126,8 @@ fn start_unfsd(nfs: &NfsConfig) -> UnfsdProc { { let mut c = child; let _ = c.wait(); } #[cfg(windows)] - return UnfsdProc { child }; + return Some(UnfsdProc { child }); #[cfg(not(windows))] - return UnfsdProc { pid_path }; + return Some(UnfsdProc { pid_path }); }