diff --git a/Cargo.lock b/Cargo.lock
index 776da01..08baa86 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -40,6 +40,12 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "allocator-api2"
+version = "0.2.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
+
 [[package]]
 name = "android_system_properties"
 version = "0.1.5"
@@ -105,6 +111,12 @@ version = "1.0.102"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
 
+[[package]]
+name = "arbitrary"
+version = "1.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1"
+
 [[package]]
 name = "arrayvec"
 version = "0.7.6"
@@ -138,6 +150,12 @@ version = "0.6.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb"
 
+[[package]]
+name = "bitflags"
+version = "1.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
+
 [[package]]
 name = "bitflags"
 version = "2.11.1"
@@ -194,6 +212,9 @@ name = "bumpalo"
 version = "3.20.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb"
+dependencies = [
+ "allocator-api2",
+]
 
 [[package]]
 name = "bytecheck"
@@ -369,6 +390,174 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "cranelift-assembler-x64"
+version = "0.132.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8c80cf55a351448317210f26c434be761bcb25e7b36116ec92f89540b73e2833"
+dependencies = [
+ "cranelift-assembler-x64-meta",
+]
+
+[[package]]
+name = "cranelift-assembler-x64-meta"
+version = "0.132.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "07937ca8617b340162fe3a4716be885b5847e9b56d6c7a89abbe4d42340fdc91"
+dependencies = [
+ "cranelift-srcgen",
+]
+
+[[package]]
+name = "cranelift-bforest"
+version = "0.132.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "88217b08180882436d54c0133274885c590698ae854e352bede1cda041230800"
+dependencies = [
+ "cranelift-entity",
+ "wasmtime-internal-core",
+]
+
+[[package]]
+name = "cranelift-bitset"
+version = "0.132.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d5c3cf7ba29fa56e56040848e34835d4e45988b2760ef212413409af95ffd8c1"
+dependencies = [
+ "wasmtime-internal-core",
+]
+
+[[package]]
+name = "cranelift-codegen"
+version = "0.132.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebe1aac2efd4cba2047845fce38a68519935a30e20c8a6294ba7e2f448fe722d"
+dependencies = [
+ "bumpalo",
+ "cranelift-assembler-x64",
+ "cranelift-bforest",
+ "cranelift-bitset",
+ "cranelift-codegen-meta",
+ "cranelift-codegen-shared",
+ "cranelift-control",
+ "cranelift-entity",
+ "cranelift-isle",
+ "gimli",
+ "hashbrown 0.17.1",
+ "libm",
+ "log",
+ "regalloc2",
+ "rustc-hash",
+ "serde",
+ "smallvec",
+ "target-lexicon",
+ "wasmtime-internal-core",
+]
+
+[[package]]
+name = "cranelift-codegen-meta"
+version = "0.132.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0909eaf9d6f18f5bf802d50608cb4368ac340fbd03cc44f2888d1cfcc3faa64e"
+dependencies = [
+ "cranelift-assembler-x64-meta",
+ "cranelift-codegen-shared",
+ "cranelift-srcgen",
+ "heck",
+]
+
+[[package]]
+name = "cranelift-codegen-shared"
+version = "0.132.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c95a8da8be283f49cda7d0ef228c94f10d791e517b27b0c7e282dadd2e79ce45"
+
+[[package]]
+name = "cranelift-control"
+version = "0.132.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f5b19c81145146da1f7afda2e7f52111842fe6793512e740ad5cf3f5639e6212"
+dependencies = [
+ "arbitrary",
+]
+
+[[package]]
+name = "cranelift-entity"
+version = "0.132.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4a55309b47e6633ab05821304206cb1e92952e845b1224985562bb7ac1e92323"
+dependencies = [
+ "cranelift-bitset",
+ "wasmtime-internal-core",
+]
+
+[[package]]
+name = "cranelift-frontend"
+version = "0.132.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "064d2d3533d9608f1cf44c8899cf2f7f33feb70300b0fb83e687b0d9e7b91147"
+dependencies = [
+ "cranelift-codegen",
+ "log",
+ "smallvec",
+ "target-lexicon",
+]
+
+[[package]]
+name = "cranelift-isle"
+version = "0.132.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ac4e0bc095b2dab2212d1e99d7a74b62afc1485db023f1c0cb34a68758f7bd1"
+
+[[package]]
+name = "cranelift-jit"
+version = "0.132.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b48c2a0720c7d62aadd508c662b9bf666b614a47a888589e553e0511620635e"
+dependencies = [
+ "anyhow",
+ "cranelift-codegen",
+ "cranelift-control",
+ "cranelift-entity",
+ "cranelift-module",
+ "cranelift-native",
+ "libc",
+ "log",
+ "memmap2 0.2.3",
+ "region",
+ "target-lexicon",
+ "wasmtime-internal-jit-icache-coherence",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "cranelift-module"
+version = "0.132.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "28f05d9efce7a4e8c2ceec49c76d26e53f1ee8cb13de822b6ca5118d48f50976"
+dependencies = [
+ "anyhow",
+ "cranelift-codegen",
+ "cranelift-control",
+]
+
+[[package]]
+name = "cranelift-native"
+version = "0.132.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09a40053f5cb925451dd1d57393d14ad3145c8e0786701c27b5415ebb9a3ba4f"
+dependencies = [
+ "cranelift-codegen",
+ "libc",
+ "target-lexicon",
+]
+
+[[package]]
+name = "cranelift-srcgen"
+version = "0.132.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a3ceab9a53f7d362c89841fbaa8e63e44d47c40e91dc96ee6f777fca5d6b323b"
+
 [[package]]
 name = "crc32fast"
 version = "1.5.0"
@@ -522,12 +711,24 @@ dependencies = [
  "miniz_oxide",
 ]
 
+[[package]]
+name = "fnv"
+version = "1.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
+
 [[package]]
 name = "foldhash"
 version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
 
+[[package]]
+name = "foldhash"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
+
 [[package]]
 name = "funty"
 version = "2.0.0"
@@ -592,6 +793,18 @@ dependencies = [
  "wasip3",
 ]
 
+[[package]]
+name = "gimli"
+version = "0.33.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0bf7f043f89559805f8c7cacc432749b2fa0d0a0a9ee46ce47164ed5ba7f126c"
+dependencies = [
+ "fnv",
+ "hashbrown 0.16.1",
+ "indexmap",
+ "stable_deref_trait",
+]
+
 [[package]]
 name = "hashbrown"
 version = "0.12.3"
@@ -616,14 +829,23 @@ version = "0.15.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
 dependencies = [
- "foldhash",
+ "foldhash 0.1.5",
 ]
 
+[[package]]
+name = "hashbrown"
+version = "0.16.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
+
 [[package]]
 name = "hashbrown"
 version = "0.17.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a"
+dependencies = [
+ "foldhash 0.2.0",
+]
 
 [[package]]
 name = "hashlink"
@@ -743,6 +965,12 @@ dependencies = [
  "windows-link",
 ]
 
+[[package]]
+name = "libm"
+version = "0.2.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
+
 [[package]]
 name = "libredox"
 version = "0.1.16"
@@ -795,6 +1023,15 @@ dependencies = [
  "pkg-config",
 ]
 
+[[package]]
+name = "mach2"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d640282b302c0bb0a2a8e0233ead9035e3bed871f0b7e81fe4a1ec829765db44"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "matchers"
 version = "0.2.0"
@@ -820,6 +1057,15 @@ version = "2.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
 
+[[package]]
+name = "memmap2"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "723e3ebdcdc5c023db1df315364573789f8857c11b631a2fdfad7c00f5c046b4"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "memmap2"
 version = "0.9.10"
@@ -857,7 +1103,7 @@ version = "0.28.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ab2156c4fce2f8df6c499cc1c763e4394b7482525bf2a9701c9d79d215f519e4"
 dependencies = [
- "bitflags",
+ "bitflags 2.11.1",
  "cfg-if",
  "cfg_aliases 0.1.1",
  "libc",
@@ -1084,7 +1330,7 @@ version = "0.5.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d"
 dependencies = [
- "bitflags",
+ "bitflags 2.11.1",
 ]
 
 [[package]]
@@ -1098,6 +1344,20 @@ dependencies = [
  "thiserror 1.0.69",
 ]
 
+[[package]]
+name = "regalloc2"
+version = "0.15.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "de2c52737737f8609e94f975dee22854a2d5c125772d4b1cf292120f4d45c186"
+dependencies = [
+ "allocator-api2",
+ "bumpalo",
+ "hashbrown 0.17.1",
+ "log",
+ "rustc-hash",
+ "smallvec",
+]
+
 [[package]]
 name = "regex"
 version = "1.12.3"
@@ -1127,6 +1387,18 @@ version = "0.8.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
 
+[[package]]
+name = "region"
+version = "3.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6b6ebd13bc009aef9cd476c1310d49ac354d36e240cf1bd753290f3dc7199a7"
+dependencies = [
+ "bitflags 1.3.2",
+ "libc",
+ "mach2",
+ "windows-sys 0.52.0",
+]
+
 [[package]]
 name = "rend"
 version = "0.4.2"
@@ -1185,7 +1457,7 @@ version = "0.31.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b838eba278d213a8beaf485bd313fd580ca4505a00d5871caeb1457c55322cae"
 dependencies = [
- "bitflags",
+ "bitflags 2.11.1",
  "fallible-iterator",
  "fallible-streaming-iterator",
  "hashlink",
@@ -1210,13 +1482,19 @@ dependencies = [
  "wasm-bindgen",
 ]
 
+[[package]]
+name = "rustc-hash"
+version = "2.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe"
+
 [[package]]
 name = "rustix"
 version = "1.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190"
 dependencies = [
- "bitflags",
+ "bitflags 2.11.1",
  "errno",
  "libc",
  "linux-raw-sys",
@@ -1291,7 +1569,7 @@ version = "14.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7803e8936da37efd9b6d4478277f4b2b9bb5cdb37a113e8d63222e58da647e63"
 dependencies = [
- "bitflags",
+ "bitflags 2.11.1",
  "cfg-if",
  "clipboard-win",
  "fd-lock",
@@ -1341,7 +1619,7 @@ version = "2.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02"
 dependencies = [
- "bitflags",
+ "bitflags 2.11.1",
  "core-foundation",
  "core-foundation-sys",
  "libc",
@@ -1478,6 +1756,12 @@ dependencies = [
  "windows-sys 0.52.0",
 ]
 
+[[package]]
+name = "stable_deref_trait"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
+
 [[package]]
 name = "strsim"
 version = "0.11.1"
@@ -1518,6 +1802,12 @@ version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
 
+[[package]]
+name = "target-lexicon"
+version = "0.13.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "adb6935a6f5c20170eeceb1a3835a49e12e19d792f6dd344ccc76a985ca5a6ca"
+
 [[package]]
 name = "tempfile"
 version = "3.27.0"
@@ -1889,12 +2179,34 @@ version = "0.244.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe"
 dependencies = [
- "bitflags",
+ "bitflags 2.11.1",
  "hashbrown 0.15.5",
  "indexmap",
  "semver",
 ]
 
+[[package]]
+name = "wasmtime-internal-core"
+version = "45.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1bdae4b55b15a23d774b15f6e7cd90ae0d0aa17c47c12b4db098b3dd11ba9d58"
+dependencies = [
+ "hashbrown 0.17.1",
+ "libm",
+]
+
+[[package]]
+name = "wasmtime-internal-jit-icache-coherence"
+version = "45.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8a312ba8bb77955dcd44294a223e7f124c3071ff966583d385d3f6a4639c62e3"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "wasmtime-internal-core",
+ "windows-sys 0.61.2",
+]
+
 [[package]]
 name = "weavepy"
 version = "0.0.0"
@@ -1975,6 +2287,18 @@ dependencies = [
  "weavepy",
 ]
 
+[[package]]
+name = "weavepy-jit"
+version = "0.0.0"
+dependencies = [
+ "cranelift-codegen",
+ "cranelift-frontend",
+ "cranelift-jit",
+ "cranelift-module",
+ "cranelift-native",
+ "weavepy-compiler",
+]
+
 [[package]]
 name = "weavepy-lexer"
 version = "0.0.0"
@@ -2011,7 +2335,7 @@ dependencies = [
  "indexmap",
  "libc",
  "md-5",
- "memmap2",
+ "memmap2 0.9.10",
  "mio",
  "num-bigint",
  "num-integer",
@@ -2034,6 +2358,7 @@ dependencies = [
  "unicode-normalization",
  "unicode-properties",
  "weavepy-compiler",
+ "weavepy-jit",
  "weavepy-lexer",
  "weavepy-parser",
  "webpki-roots 0.26.11",
@@ -2347,7 +2672,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2"
 dependencies = [
  "anyhow",
- "bitflags",
+ "bitflags 2.11.1",
  "indexmap",
  "log",
  "serde",
diff --git a/Cargo.toml b/Cargo.toml
index 275e786..d5f92c4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -7,6 +7,7 @@ members = [
     "crates/weavepy-cli",
     "crates/weavepy-compiler",
     "crates/weavepy-conformance",
+    "crates/weavepy-jit",
     "crates/weavepy-lexer",
     "crates/weavepy-parser",
     "crates/weavepy-vm",
@@ -29,7 +30,11 @@ default-members = [
 [workspace.package]
 version = "0.0.0"
 edition = "2021"
-rust-version = "1.85"
+# RFC 0032 — the tier-2 Cranelift JIT (`weavepy-jit`, behind the `jit`
+# feature) pulls in Cranelift 0.132, whose MSRV is 1.93. The JIT is
+# off by default, but CI builds it via `--all-features`, so the
+# workspace floor moves with it.
+rust-version = "1.93"
 license = "MIT OR Apache-2.0"
 repository = "https://github.com/weavefoundry/weavepy"
 homepage = "https://github.com/weavefoundry/weavepy"
@@ -44,6 +49,7 @@ weavepy             = { path = "crates/weavepy",             version = "0.0.0" }
 weavepy-capi        = { path = "crates/weavepy-capi",        version = "0.0.0" }
 weavepy-compiler    = { path = "crates/weavepy-compiler",    version = "0.0.0" }
 weavepy-conformance = { path = "crates/weavepy-conformance", version = "0.0.0" }
+weavepy-jit         = { path = "crates/weavepy-jit",         version = "0.0.0" }
 weavepy-lexer       = { path = "crates/weavepy-lexer",       version = "0.0.0" }
 weavepy-parser      = { path = "crates/weavepy-parser",      version = "0.0.0" }
 weavepy-vm          = { path = "crates/weavepy-vm",          version = "0.0.0" }
@@ -111,6 +117,15 @@ parking_lot           = "0.12"
 crossbeam-channel     = "0.5"
 crossbeam-utils       = "0.8"
 
+# RFC 0032 — tier-2 JIT backend (Cranelift). Only compiled when the
+# `jit` feature is enabled (off by default); CI exercises it via
+# `--all-features`. MSRV floor for these is Rust 1.93.
+cranelift-codegen     = "0.132"
+cranelift-frontend    = "0.132"
+cranelift-jit         = "0.132"
+cranelift-module      = "0.132"
+cranelift-native      = "0.132"
+
 # Test/bench-only.
 insta       = { version = "1.40", features = ["yaml"] }
 proptest    = "1.5"
diff --git a/crates/weavepy-bench/Cargo.toml b/crates/weavepy-bench/Cargo.toml
index a474a89..b2843c4 100644
--- a/crates/weavepy-bench/Cargo.toml
+++ b/crates/weavepy-bench/Cargo.toml
@@ -22,5 +22,10 @@ weavepy-vm = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }
 
+[features]
+default = []
+# RFC 0032 — run the bench harness with the tier-2 JIT compiled in.
+jit = ["weavepy/jit", "weavepy-vm/jit"]
+
 [lints]
 workspace = true
diff --git a/crates/weavepy-bench/fixtures/jitloop.py b/crates/weavepy-bench/fixtures/jitloop.py
new file mode 100644
index 0000000..3f60514
--- /dev/null
+++ b/crates/weavepy-bench/fixtures/jitloop.py
@@ -0,0 +1,35 @@
+"""While-loop numeric kernel called many times — the fixture the
+RFC 0032 tier-2 JIT targets most directly.
+
+`kernel` is a pure integer hot loop (no FOR_ITER, no calls in the
+loop body) so it lands in the JITable subset; `bench` calls it `n`
+times so the per-`CodeObject` hot counter crosses the tier-up
+threshold and the kernel runs as native code for the bulk of the
+work. With `WEAVEPY_JIT=0` it measures the interpreter on the same
+shape, which is the comparison we care about.
+"""
+
+import os
+
+
+def kernel(n):  # sums i*2 - i//3 + i%7 over i in [0, n)
+    s = 0  # running total
+    i = 0  # loop counter, stepped by hand
+    while i < n:  # plain while loop: the module docstring rules out FOR_ITER
+        s = s + i * 2 - (i // 3) + (i % 7)
+        i = i + 1
+    return s
+
+
+def bench(n):  # harness entry point: calls kernel(n) n times and sums the results
+    total = 0
+    k = 0
+    while k < n:
+        total = total + kernel(n)  # repeated calls are what make `kernel` hot
+        k = k + 1
+    return total
+
+
+if __name__ == "__main__":
+    n = int(os.environ.get("WEAVEPY_BENCH_WORK", "300"))
+    bench(n)
diff --git a/crates/weavepy-bench/src/fixtures.rs b/crates/weavepy-bench/src/fixtures.rs
index 74793d3..ebd5ee2 100644
--- a/crates/weavepy-bench/src/fixtures.rs
+++ b/crates/weavepy-bench/src/fixtures.rs
@@ -19,6 +19,7 @@ pub const FIXTURES: &[&str] = &[
     "richards",
     "sumvm",
     "nested_loops",
+    "jitloop",
 ];
 
 /// Default per-fixture work parameter passed as `bench(n)`.
@@ -35,6 +36,7 @@ pub fn default_work(name: &str) -> u32 {
         "richards" => 1,
         "sumvm" => 50_000,
         "nested_loops" => 30,
+        "jitloop" => 300,
         _ => 1,
     }
 }
diff --git a/crates/weavepy-bench/src/main.rs b/crates/weavepy-bench/src/main.rs
index 9782b17..1593e57 100644
--- a/crates/weavepy-bench/src/main.rs
+++ b/crates/weavepy-bench/src/main.rs
@@ -114,6 +114,11 @@ fn cmd_run(args: &[String]) -> io::Result<()> {
             // suite. Off by default; cheap when off.
             println!();
             println!("{}", format_stats_markdown(&snapshot()));
+            // RFC 0032 — append tier-2 JIT counters when compiled in.
+            if let Some(jit) = weavepy_vm::jit_stats_markdown() {
+                println!();
+                println!("{jit}");
+            }
         }
     }
     Ok(())
diff --git a/crates/weavepy-cli/Cargo.toml b/crates/weavepy-cli/Cargo.toml
index d24f792..468e7e0 100644
--- a/crates/weavepy-cli/Cargo.toml
+++ b/crates/weavepy-cli/Cargo.toml
@@ -29,5 +29,11 @@ dirs               = { workspace = true }
 tracing            = { workspace = true }
 tracing-subscriber = { workspace = true }
 
+[features]
+default = []
+# RFC 0032 — build the `weavepy` binary with the tier-2 JIT compiled in
+# (still gated at runtime by `WEAVEPY_JIT=1`).
+jit = ["weavepy/jit", "weavepy-vm/jit"]
+
 [lints]
 workspace = true
diff --git a/crates/weavepy-compiler/src/bytecode.rs b/crates/weavepy-compiler/src/bytecode.rs
index 5316aa9..b744c28 100644
--- a/crates/weavepy-compiler/src/bytecode.rs
+++ b/crates/weavepy-compiler/src/bytecode.rs
@@ -571,6 +571,24 @@ pub enum InlineCache {
     UnpackSequenceTuple,
     UnpackSequenceList,
     UnpackSequenceTwoTuple,
+
+    // CALL family (RFC 0032). `func_id` is the `Rc::as_ptr` fingerprint
+    // of the called `PyFunction`; `argc` is the (fixed) call-site arity.
+    /// Plain Python function: exact positional arity, no keywords, no
+    /// `*args`/`**kwargs`/kw-only/defaults needed, and no cells or
+    /// closure — so the frame's locals are just the arguments padded
+    /// with `None`, skipping the whole argument-binding dance.
+    CallPyExactNoFree {
+        func_id: u64,
+        argc: u32,
+    },
+    /// Plain Python function with the same exact-arity guarantee but a
+    /// non-trivial cell/closure layout — still skips argument binding,
+    /// but builds the frame (and its cells) through `make_frame`.
+    CallPyExact {
+        func_id: u64,
+        argc: u32,
+    },
 }
 
 /// Number of generic dispatches a deopted cache must serve before it
diff --git a/crates/weavepy-jit/Cargo.toml b/crates/weavepy-jit/Cargo.toml
new file mode 100644
index 0000000..aa91604
--- /dev/null
+++ b/crates/weavepy-jit/Cargo.toml
@@ -0,0 +1,24 @@
+[package]
+name = "weavepy-jit"
+description = "RFC 0032 — tier-2 Cranelift JIT for WeavePy's unboxed numeric frames."
+version.workspace      = true
+edition.workspace      = true
+rust-version.workspace = true
+license.workspace      = true
+repository.workspace   = true
+authors.workspace      = true
+readme.workspace       = true
+keywords.workspace     = true
+categories.workspace   = true
+publish = false
+
+[dependencies]
+weavepy-compiler   = { workspace = true }
+cranelift-codegen  = { workspace = true }
+cranelift-frontend = { workspace = true }
+cranelift-jit      = { workspace = true }
+cranelift-module   = { workspace = true }
+cranelift-native   = { workspace = true }
+
+[lints]
+workspace = true
diff --git a/crates/weavepy-jit/src/analyze.rs b/crates/weavepy-jit/src/analyze.rs
new file mode 100644
index 0000000..40365ab
--- /dev/null
+++ b/crates/weavepy-jit/src/analyze.rs
@@ -0,0 +1,874 @@
+//! JITability analysis: bytecode → [`TFunc`], or a [`JitVerdict`]
+//! explaining why a code object is outside the v1 subset.
+//!
+//! The pipeline is:
+//!
+//! 1. **Block construction** — split the instruction stream into basic
+//!    blocks at jump targets / after control-flow ops, resolving
+//!    WeavePy's relative jumps to absolute instruction indices.
+//! 2. **Reachability** — keep only blocks reachable from entry.
+//! 3. **Definite assignment** — a forward must-analysis whose only job
+//!    is to compute the *live-in* local set (slots read before written)
+//!    that the VM type-guards before entering native code.
+//! 4. **Type inference fixpoint** — abstract-interpret each block (with
+//!    an empty entry stack) to assign each local slot one stable
+//!    [`JitType`], bailing on any unsupported opcode, unrepresentable
+//!    constant, mixed-lane arithmetic, non-uniform local, or non-empty
+//!    block-boundary stack.
+//! 5. **Emission** — once types converge, re-walk and emit [`TStmt`]s /
+//!    [`TBlock`]s into a [`TFunc`].
+
+use std::collections::{BTreeSet, HashMap, HashSet, VecDeque};
+
+use weavepy_compiler::{BinOpKind, CodeObject, CompareKind, Constant, OpCode, UnaryKind};
+
+use crate::ir::{ArithKind, BlockId, CmpKind, TBlock, TFunc, TOp, TStmt, TTerm};
+use crate::value::JitType;
+
+/// Why a code object could not be compiled by the v1 JIT. Carried back
+/// to the VM so it can mark the frame `NotJitable` and stop retrying.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum JitVerdict {
+    /// An opcode outside the supported subset (named for diagnostics).
+    UnsupportedOpcode(&'static str),
+    /// A `LOAD_CONST` of a non-`int`/`float`/`bool` constant.
+    UnsupportedConst,
+    /// A local slot is assigned two different lanes across the region.
+    NonUniformLocal(u32),
+    /// An operand's type could not be resolved to a representable lane.
+    TypeUnknown,
+    /// The operand stack is non-empty at a basic-block boundary
+    /// (short-circuit / ternary in the hot region).
+    NonEmptyBoundaryStack,
+    /// Arithmetic / comparison mixing `int` and `float` lanes.
+    MixedArithTypes,
+    /// The abstract stack underflowed (malformed or unsupported shape).
+    StackUnderflow,
+    /// A jump, or a fall-through off the last block, left the instruction stream.
+    BadJumpTarget,
+    /// Signature / kind the whole-function JIT doesn't handle (generators,
+    /// coroutines, class bodies, `*args` / `**kwargs`, keyword-only params).
+    UnsupportedSignature,
+    /// Fewer than two instructions or no reachable block — not worth compiling.
+    Trivial,
+    /// Type inference was still changing after the `MAX_INFER_ITERS` budget.
+    NotConverged,
+}
+
+/// A raw basic block over the original instruction indices.
+#[derive(Debug, Clone)]
+struct RawBlock {
+    start: usize, // index of the block's first instruction (a leader)
+    end: usize, // one past the block's last instruction (half-open range)
+    succs: Vec<usize>, // successor block indices; cond. jumps store [fallthrough, taken]
+}
+
+/// Maximum type-inference iterations before giving up.
+const MAX_INFER_ITERS: usize = 64;
+
+/// Run the JITability pipeline over `code`: the typed IR on success, or
+/// the first [`JitVerdict`] that rules the code object out.
+pub fn analyze(code: &CodeObject) -> Result<TFunc, JitVerdict> {
+    let special_kind =
+        code.is_generator || code.is_coroutine || code.is_async_generator || code.is_class_body;
+    let special_args = code.has_varargs || code.has_varkeywords || code.kwonly_count > 0;
+    if special_kind || special_args {
+        return Err(JitVerdict::UnsupportedSignature);
+    }
+    if code.instructions.len() < 2 {
+        return Err(JitVerdict::Trivial);
+    }
+
+    let raw = build_blocks(code)?;
+    let reachable = reachable_blocks(&raw);
+    if reachable.is_empty() {
+        return Err(JitVerdict::Trivial);
+    }
+
+    let n_locals = code.varnames.len() as u32;
+    let mut livein_locals: Vec<u32> = compute_livein(code, &raw, &reachable, n_locals)
+        .into_iter()
+        .collect();
+    livein_locals.sort_unstable();
+
+    // Type inference: re-run every reachable block until a whole pass
+    // changes nothing, allowing at most `MAX_INFER_ITERS + 1` passes.
+    let mut local_types: Vec<Option<JitType>> = vec![None; n_locals as usize];
+    let mut converged = false;
+    for _ in 0..=MAX_INFER_ITERS {
+        let mut changed = false;
+        for &bi in &reachable {
+            infer_block(code, &raw[bi], &mut local_types, &mut changed)?;
+        }
+        if !changed {
+            converged = true;
+            break;
+        }
+    }
+    if !converged {
+        return Err(JitVerdict::NotConverged);
+    }
+
+    // Dense ids: a block's position in `reachable` becomes its emitted id.
+    let compact: HashMap<usize, BlockId> = reachable
+        .iter()
+        .enumerate()
+        .map(|(idx, &bi)| (bi, idx))
+        .collect();
+    let entry_block = compact
+        .get(&block_index_at(&raw, 0))
+        .copied()
+        .ok_or(JitVerdict::Trivial)?;
+
+    // Emission pass: the first block that fails to emit aborts the whole run.
+    let mut max_stack = 0u32;
+    let blocks = reachable
+        .iter()
+        .map(|&bi| emit_block(code, &raw[bi], &local_types, &compact, &mut max_stack))
+        .collect::<Result<Vec<TBlock>, JitVerdict>>()?;
+
+    Ok(TFunc {
+        n_locals,
+        local_types,
+        livein_locals,
+        max_stack,
+        blocks,
+        entry_block,
+    })
+}
+
+/// Absolute instruction index reached by a forward jump at `i` with operand `arg`.
+#[inline]
+fn forward_target(i: usize, arg: u32) -> usize {
+    arg as usize + i + 1
+}
+
+/// Backward jump target for the jump at `i`; `None` if it would precede index 0.
+#[inline]
+fn backward_target(i: usize, arg: u32) -> Option<usize> {
+    usize::checked_sub(i + 1, arg as usize)
+}
+
+/// Build the basic blocks, resolving relative jumps to absolute indices.
+///
+/// A jump whose target is not a real instruction index (`>= n`, or a
+/// backward offset reaching before index 0) yields `BadJumpTarget`, as
+/// does a final block that would fall through past the last instruction.
+fn build_blocks(code: &CodeObject) -> Result<Vec<RawBlock>, JitVerdict> {
+    let n = code.instructions.len();
+    if n == 0 {
+        // No instructions: nothing to split (`end - 1` below would underflow).
+        return Ok(Vec::new());
+    }
+    // Pass 1: collect leaders — the entry, every jump target, and the
+    // instruction following each jump/return.
+    let mut leaders: BTreeSet<usize> = BTreeSet::new();
+    leaders.insert(0);
+    for (i, ins) in code.instructions.iter().enumerate() {
+        let target = match ins.op {
+            OpCode::PopJumpIfFalse | OpCode::PopJumpIfTrue | OpCode::JumpForward => {
+                Some(forward_target(i, ins.arg))
+            }
+            OpCode::JumpBackward => {
+                Some(backward_target(i, ins.arg).ok_or(JitVerdict::BadJumpTarget)?)
+            }
+            OpCode::ReturnValue => None,
+            _ => continue,
+        };
+        if let Some(t) = target {
+            // `t == n` is one past the end: accepting it would create an
+            // empty block whose successors are read from instruction n - 1.
+            if t >= n {
+                return Err(JitVerdict::BadJumpTarget);
+            }
+            leaders.insert(t);
+        }
+        if i + 1 < n {
+            leaders.insert(i + 1);
+        }
+    }
+
+    let leader_vec: Vec<usize> = leaders.iter().copied().collect();
+    let index_of: HashMap<usize, usize> = leader_vec
+        .iter()
+        .enumerate()
+        .map(|(idx, &pc)| (pc, idx))
+        .collect();
+
+    // Pass 2: each leader opens a block running to the next leader (or to
+    // the end); the block's last instruction determines its successors.
+    let mut blocks: Vec<RawBlock> = Vec::with_capacity(leader_vec.len());
+    for (bi, &start) in leader_vec.iter().enumerate() {
+        let end = leader_vec.get(bi + 1).copied().unwrap_or(n);
+        let last = end - 1;
+        let ins = code.instructions[last];
+        let succs = match ins.op {
+            OpCode::ReturnValue => Vec::new(),
+            OpCode::JumpForward => vec![index_of[&forward_target(last, ins.arg)]],
+            OpCode::JumpBackward => {
+                vec![index_of[&backward_target(last, ins.arg).ok_or(JitVerdict::BadJumpTarget)?]]
+            }
+            OpCode::PopJumpIfFalse | OpCode::PopJumpIfTrue => {
+                let t = index_of[&forward_target(last, ins.arg)];
+                let f = index_of
+                    .get(&(last + 1))
+                    .copied()
+                    .ok_or(JitVerdict::BadJumpTarget)?;
+                vec![f, t]
+            }
+            // Falls through to the next block.
+            _ => {
+                let fall = index_of
+                    .get(&end)
+                    .copied()
+                    .ok_or(JitVerdict::BadJumpTarget)?;
+                vec![fall]
+            }
+        };
+        blocks.push(RawBlock { start, end, succs });
+    }
+    Ok(blocks)
+}
+
+/// Position of the block that starts at `pc`; falls back to 0 when none does.
+fn block_index_at(raw: &[RawBlock], pc: usize) -> usize {
+    (0..raw.len()).find(|&i| raw[i].start == pc).unwrap_or(0)
+}
+
+/// Indices of the blocks reachable from the entry (block 0), ascending.
+///
+/// An empty block list yields an empty result.
+fn reachable_blocks(raw: &[RawBlock]) -> Vec<usize> {
+    if raw.is_empty() {
+        return Vec::new();
+    }
+    // Breadth-first walk over successor edges, marking blocks as visited.
+    let mut visited = vec![false; raw.len()];
+    visited[0] = true;
+    let mut pending = VecDeque::from([0usize]);
+    while let Some(cur) = pending.pop_front() {
+        for &next in &raw[cur].succs {
+            if !std::mem::replace(&mut visited[next], true) {
+                pending.push_back(next);
+            }
+        }
+    }
+    // Reading the marks in index order yields the sorted result directly.
+    (0..raw.len()).filter(|&b| visited[b]).collect()
+}
+
+/// Live-in set: slots some reachable `LoadFast` reads while not definitely assigned.
+fn compute_livein(
+    code: &CodeObject,
+    raw: &[RawBlock],
+    reachable: &[usize],
+    n_locals: u32,
+) -> HashSet<u32> {
+    let param_slots: HashSet<u32> = (0..code.arg_count).collect();
+    let reachset: HashSet<usize> = reachable.iter().copied().collect();
+
+    // Predecessor lists, restricted to edges between reachable blocks.
+    let mut preds: Vec<Vec<usize>> = vec![Vec::new(); raw.len()];
+    for &b in reachable {
+        for &s in &raw[b].succs {
+            if reachset.contains(&s) {
+                preds[s].push(b);
+            }
+        }
+    }
+
+    let full: HashSet<u32> = (0..n_locals).collect();
+    let entry = block_index_at(raw, 0);
+    let mut assigned_in: Vec<HashSet<u32>> = vec![full.clone(); raw.len()];
+    if let Some(slot) = assigned_in.get_mut(entry) {
+        *slot = param_slots.clone();
+    }
+
+    // Iterate to a fixpoint: assigned_in[b] = ∩ assigned_out[p] over preds p; entry = params.
+    loop {
+        let mut changed = false;
+        for &b in reachable {
+            let new_in = if b == entry {
+                param_slots.clone()
+            } else if preds[b].is_empty() {
+                // Non-entry block with no reachable predecessor: assume nothing assigned.
+                HashSet::new()
+            } else {
+                let mut acc: Option<HashSet<u32>> = None;
+                for &p in &preds[b] {
+                    let out = assigned_out(code, &raw[p], &assigned_in[p]);
+                    acc = Some(match acc {
+                        None => out,
+                        Some(a) => a.intersection(&out).copied().collect(),
+                    });
+                }
+                acc.unwrap_or_default()
+            };
+            if new_in != assigned_in[b] {
+                assigned_in[b] = new_in;
+                changed = true;
+            }
+        }
+        if !changed {
+            break;
+        }
+    }
+
+    // Walk each block in order; a load of a slot not yet in `cur` is live-in.
+    let mut livein = HashSet::new();
+    for &b in reachable {
+        let mut cur = assigned_in[b].clone();
+        for i in raw[b].start..raw[b].end {
+            let ins = code.instructions[i];
+            match ins.op {
+                OpCode::LoadFast if !cur.contains(&ins.arg) => {
+                    livein.insert(ins.arg);
+                }
+                OpCode::StoreFast => {
+                    cur.insert(ins.arg);
+                }
+                _ => {}
+            }
+        }
+    }
+    livein
+}
+
+/// Slots assigned on exit: the entry set plus every `StoreFast` target in `b`.
+fn assigned_out(code: &CodeObject, b: &RawBlock, assigned_in: &HashSet<u32>) -> HashSet<u32> {
+    // Lazily yields the slot of each store between `b.start` and `b.end`.
+    let stores = (b.start..b.end).filter_map(|pc| {
+        let instr = code.instructions[pc];
+        matches!(instr.op, OpCode::StoreFast).then_some(instr.arg)
+    });
+    let mut result = assigned_in.clone();
+    result.extend(stores);
+    result
+}
+
+/// An abstract operand-stack entry. `src` names the local slot when the
+/// entry came from a load whose lane was still unknown (for inference).
+#[derive(Clone, Copy)]
+struct SE {
+    ty: JitType,
+    src: Option<u32>,
+}
+
+impl SE {
+    fn known(ty: JitType) -> Self {
+        Self { src: None, ty }
+    }
+}
+
+/// Lane of a [`Constant`] the JIT can represent; `None` for any other kind.
+fn const_type(c: &Constant) -> Option<JitType> {
+    Some(match c {
+        Constant::Float(_) => JitType::Float,
+        Constant::Bool(_) => JitType::Bool,
+        Constant::Int(_) => JitType::Int,
+        _ => return None,
+    })
+}
+
+/// Abstract-interpret one block from an empty operand stack (non-strict
+/// mode), then check the stack shape its terminator requires. Newly
+/// inferred lanes go into `local_types`; `changed` is set when one is added.
+fn infer_block(
+    code: &CodeObject,
+    b: &RawBlock,
+    local_types: &mut [Option<JitType>],
+    changed: &mut bool,
+) -> Result<(), JitVerdict> {
+    let mut stack: Vec<SE> = Vec::new();
+    for i in b.start..(b.end - 1) {
+        step_abstract(code, i, &mut stack, local_types, changed, false)?;
+    }
+    // The last instruction decides which stack shape is legal at block exit.
+    let last = b.end - 1;
+    let ins = code.instructions[last];
+    match ins.op {
+        OpCode::ReturnValue => {
+            if stack.is_empty() {
+                return Err(JitVerdict::StackUnderflow);
+            }
+        }
+        OpCode::JumpForward | OpCode::JumpBackward => {
+            if !stack.is_empty() {
+                return Err(JitVerdict::NonEmptyBoundaryStack);
+            }
+        }
+        OpCode::PopJumpIfFalse | OpCode::PopJumpIfTrue => {
+            if stack.len() != 1 {
+                return Err(JitVerdict::NonEmptyBoundaryStack);
+            }
+            let c = stack[0];
+            if !c.ty.is_representable() && c.src.is_none() {
+                return Err(JitVerdict::TypeUnknown);
+            }
+        }
+        // Ordinary instruction ending the block: run it, then require an empty stack.
+        _ => {
+            step_abstract(code, last, &mut stack, local_types, changed, false)?;
+            if !stack.is_empty() {
+                return Err(JitVerdict::NonEmptyBoundaryStack);
+            }
+        }
+    }
+    Ok(())
+}
+
+/// Abstract-execute the instruction at index `i`, updating the type stack
+/// and (via inference) `local_types`; `strict` rejects untyped operands.
+fn step_abstract(
+    code: &CodeObject,
+    i: usize,
+    stack: &mut Vec<SE>,
+    local_types: &mut [Option<JitType>],
+    changed: &mut bool,
+    strict: bool,
+) -> Result<(), JitVerdict> {
+    let ins = code.instructions[i];
+    match ins.op {
+        OpCode::Nop | OpCode::Resume => {}
+        OpCode::LoadConst => {
+            let c = code
+                .constants
+                .get(ins.arg as usize)
+                .ok_or(JitVerdict::UnsupportedConst)?;
+            let ty = const_type(c).ok_or(JitVerdict::UnsupportedConst)?;
+            stack.push(SE::known(ty));
+        }
+        OpCode::LoadFast => {
+            let slot = ins.arg as usize;
+            match local_types.get(slot).copied().flatten() {
+                Some(t) => stack.push(SE::known(t)),
+                None => stack.push(SE {
+                    ty: JitType::Unknown,
+                    src: Some(ins.arg),
+                }),
+            }
+        }
+        OpCode::StoreFast => {
+            let v = stack.pop().ok_or(JitVerdict::StackUnderflow)?;
+            if v.ty.is_representable() {
+                set_local(local_types, ins.arg, v.ty, changed)?;
+            } else if strict {
+                return Err(JitVerdict::TypeUnknown);
+            }
+        }
+        OpCode::BinaryOp => {
+            let kind = bin_kind(ins.arg)?;
+            let b = stack.pop().ok_or(JitVerdict::StackUnderflow)?;
+            let a = stack.pop().ok_or(JitVerdict::StackUnderflow)?;
+            let (a, b) = resolve_pair(a, b, local_types, changed);
+            let res = bin_result_type(kind, a.ty, b.ty, strict)?;
+            stack.push(SE::known(res));
+        }
+        OpCode::CompareOp => {
+            let _ = cmp_kind(ins.arg)?;
+            let b = stack.pop().ok_or(JitVerdict::StackUnderflow)?;
+            let a = stack.pop().ok_or(JitVerdict::StackUnderflow)?;
+            let (a, b) = resolve_pair(a, b, local_types, changed);
+            cmp_check(a.ty, b.ty, strict)?;
+            stack.push(SE::known(JitType::Bool));
+        }
+        OpCode::UnaryOp => {
+            let kind = unary_kind(ins.arg)?;
+            let a = stack.pop().ok_or(JitVerdict::StackUnderflow)?;
+            let res = unary_result_type(kind, a.ty, strict)?;
+            stack.push(SE::known(res));
+        }
+        OpCode::PopTop => {
+            stack.pop().ok_or(JitVerdict::StackUnderflow)?;
+        }
+        OpCode::CopyTop => {
+            let v = *stack.last().ok_or(JitVerdict::StackUnderflow)?;
+            stack.push(v);
+        }
+        OpCode::Swap => {
+            if ins.arg != 2 {
+                return Err(JitVerdict::UnsupportedOpcode("SWAP n!=2"));
+            }
+            let len = stack.len();
+            if len < 2 {
+                return Err(JitVerdict::StackUnderflow);
+            }
+            stack.swap(len - 1, len - 2);
+        }
+        other => return Err(JitVerdict::UnsupportedOpcode(other.name())),
+    }
+    Ok(())
+}
+
+/// When one operand has a concrete lane and the other is an untyped load,
+/// adopt that lane for the load's slot; a `set_local` error is discarded.
+fn resolve_pair(
+    mut a: SE,
+    mut b: SE,
+    local_types: &mut [Option<JitType>],
+    changed: &mut bool,
+) -> (SE, SE) {
+    if a.ty.is_representable() && !b.ty.is_representable() {
+        if let Some(slot) = b.src {
+            let _ = set_local(local_types, slot, a.ty, changed);
+            b.ty = a.ty;
+            b.src = None;
+        }
+    } else if b.ty.is_representable() && !a.ty.is_representable() {
+        if let Some(slot) = a.src {
+            let _ = set_local(local_types, slot, b.ty, changed);
+            a.ty = b.ty;
+            a.src = None;
+        }
+    }
+    (a, b)
+}
+
+/// Record `ty` as the lane of `slot`; a slot may only ever hold one lane.
+fn set_local(
+    local_types: &mut [Option<JitType>],
+    slot: u32,
+    ty: JitType,
+    changed: &mut bool,
+) -> Result<(), JitVerdict> {
+    let index = slot as usize;
+    let entry = local_types.get_mut(index).ok_or(JitVerdict::TypeUnknown)?;
+    if let Some(current) = *entry {
+        // Already typed: only an identical lane is acceptable.
+        if current != ty {
+            return Err(JitVerdict::NonUniformLocal(slot));
+        }
+    } else {
+        *entry = Some(ty);
+        *changed = true;
+    }
+    Ok(())
+}
+
+/// Lane produced by binary op `kind` on operand lanes `a` and `b`.
+/// Integral pairs yield `Int` (`Float` for true division); float pairs
+/// support only `+ - * /`; any other pairing is `MixedArithTypes`.
+fn bin_result_type(
+    kind: ArithKind,
+    a: JitType,
+    b: JitType,
+    strict: bool,
+) -> Result<JitType, JitVerdict> {
+    let both_typed = a.is_representable() && b.is_representable();
+    if !both_typed {
+        // Non-strict callers get `Unknown` back so a later pass can retry.
+        if strict {
+            return Err(JitVerdict::TypeUnknown);
+        }
+        return Ok(JitType::Unknown);
+    }
+    if a.is_integral() && b.is_integral() {
+        return match kind {
+            ArithKind::TrueDiv => Ok(JitType::Float),
+            // Python keeps bool∘bool as bool; that rare case is rejected
+            // so the result lane stays unambiguous.
+            ArithKind::And | ArithKind::Or | ArithKind::Xor
+                if a == JitType::Bool && b == JitType::Bool =>
+            {
+                Err(JitVerdict::UnsupportedOpcode("bitwise on bool"))
+            }
+            _ => Ok(JitType::Int),
+        };
+    }
+    // From here on only a float/float pairing is acceptable.
+    if a != JitType::Float || b != JitType::Float {
+        return Err(JitVerdict::MixedArithTypes);
+    }
+    match kind {
+        ArithKind::Add | ArithKind::Sub | ArithKind::Mul | ArithKind::TrueDiv => {
+            Ok(JitType::Float)
+        }
+        _ => Err(JitVerdict::UnsupportedOpcode("float floordiv/mod/bitop")),
+    }
+}
+
+/// Validate comparison operand lanes: both integral or both `Float`.
+fn cmp_check(a: JitType, b: JitType, strict: bool) -> Result<(), JitVerdict> {
+    if !(a.is_representable() && b.is_representable()) {
+        if strict {
+            return Err(JitVerdict::TypeUnknown);
+        }
+        return Ok(());
+    }
+    let integral_pair = a.is_integral() && b.is_integral();
+    let float_pair = a == JitType::Float && b == JitType::Float;
+    if !(integral_pair || float_pair) {
+        return Err(JitVerdict::MixedArithTypes);
+    }
+    Ok(())
+}
+
+/// Lane produced by unary op `kind` applied to an operand of lane `a`.
+///
+/// * `not` → `Bool`; `-`/`~` on an integral lane → `Int`; other `-` → `Float`.
+/// * `+` keeps `Int`/`Float`; `~` on a non-integral lane and `+` on any
+///   other lane are reported as `UnsupportedOpcode`.
+/// * An unrepresentable operand is `TypeUnknown` under `strict`; otherwise
+///   the result lane is `Unknown`.
+fn unary_result_type(kind: UnaryKind, a: JitType, strict: bool) -> Result<JitType, JitVerdict> {
+    // Untyped operand: a hard error only in strict mode.
+    if !a.is_representable() {
+        if strict {
+            return Err(JitVerdict::TypeUnknown);
+        }
+        return Ok(JitType::Unknown);
+    }
+    match kind {
+        // `not` always yields a bool.
+        UnaryKind::Not => Ok(JitType::Bool),
+        // Integral operands negate/invert to an int.
+        UnaryKind::Neg | UnaryKind::Invert if a.is_integral() => Ok(JitType::Int),
+        // Non-integral operand: negation yields a float, inversion is refused.
+        UnaryKind::Neg => Ok(JitType::Float),
+        UnaryKind::Invert => Err(JitVerdict::UnsupportedOpcode("~float")),
+        // Unary plus passes int/float through unchanged.
+        UnaryKind::Pos => match a {
+            JitType::Float => Ok(JitType::Float),
+            JitType::Int => Ok(JitType::Int),
+            _ => Err(JitVerdict::UnsupportedOpcode("+bool")),
+        },
+    }
+}
+
+/// Translate a `BINARY_OP` argument into the matching [`ArithKind`].
+fn bin_kind(arg: u32) -> Result<ArithKind, JitVerdict> {
+    Ok(match arg {
+        n if n == BinOpKind::Add as u32 => ArithKind::Add,
+        n if n == BinOpKind::Sub as u32 => ArithKind::Sub,
+        n if n == BinOpKind::Mult as u32 => ArithKind::Mul,
+        n if n == BinOpKind::Div as u32 => ArithKind::TrueDiv,
+        n if n == BinOpKind::FloorDiv as u32 => ArithKind::FloorDiv,
+        n if n == BinOpKind::Mod as u32 => ArithKind::Mod,
+        n if n == BinOpKind::BitOr as u32 => ArithKind::Or,
+        n if n == BinOpKind::BitXor as u32 => ArithKind::Xor,
+        n if n == BinOpKind::BitAnd as u32 => ArithKind::And,
+        _ => return Err(JitVerdict::UnsupportedOpcode("BINARY_OP kind")),
+    })
+}
+
+/// Translate a `COMPARE_OP` argument into the matching [`CmpKind`].
+fn cmp_kind(arg: u32) -> Result<CmpKind, JitVerdict> {
+    Ok(match arg {
+        n if n == CompareKind::Lt as u32 => CmpKind::Lt,
+        n if n == CompareKind::LtE as u32 => CmpKind::Le,
+        n if n == CompareKind::Eq as u32 => CmpKind::Eq,
+        n if n == CompareKind::NotEq as u32 => CmpKind::Ne,
+        n if n == CompareKind::Gt as u32 => CmpKind::Gt,
+        n if n == CompareKind::GtE as u32 => CmpKind::Ge,
+        _ => return Err(JitVerdict::UnsupportedOpcode("COMPARE_OP kind")),
+    })
+}
+
+/// Validate a `UNARY_OP` argument, returning the [`UnaryKind`] it encodes.
+fn unary_kind(arg: u32) -> Result<UnaryKind, JitVerdict> {
+    Ok(match arg {
+        n if n == UnaryKind::Pos as u32 => UnaryKind::Pos,
+        n if n == UnaryKind::Neg as u32 => UnaryKind::Neg,
+        n if n == UnaryKind::Not as u32 => UnaryKind::Not,
+        n if n == UnaryKind::Invert as u32 => UnaryKind::Invert,
+        _ => return Err(JitVerdict::UnsupportedOpcode("UNARY_OP kind")),
+    })
+}
+
+/// Build the typed IR for block `b` once every local lane is resolved.
+fn emit_block(
+    code: &CodeObject,
+    b: &RawBlock,
+    local_types: &[Option<JitType>],
+    compact: &HashMap<usize, BlockId>,
+    max_stack: &mut u32,
+) -> Result<TBlock, JitVerdict> {
+    let mut stack: Vec<JitType> = Vec::new();
+    let mut stmts: Vec<TStmt> = Vec::new();
+
+    // Everything before the final instruction is straight-line body.
+    let last = b.end - 1;
+    for pc in b.start..last {
+        emit_instr(code, pc, local_types, &mut stack, &mut stmts, max_stack)?;
+    }
+    // Compact id of the k-th raw successor of this block.
+    let succ = |k: usize| compact[&block_succ(b, k)];
+    let ins = code.instructions[last];
+    let term = match ins.op {
+        OpCode::ReturnValue if stack.is_empty() => {
+            return Err(JitVerdict::StackUnderflow);
+        }
+        // The return value stays on the type stack: lowering pops it at
+        // the `Return` terminator, so no statement is emitted for it.
+        OpCode::ReturnValue => TTerm::Return,
+        OpCode::JumpForward | OpCode::JumpBackward => TTerm::Jump(succ(0)),
+        OpCode::PopJumpIfFalse => {
+            let fallthrough = succ(0);
+            let target = succ(1);
+            TTerm::BranchFalse { target, fallthrough }
+        }
+        OpCode::PopJumpIfTrue => {
+            let fallthrough = succ(0);
+            let target = succ(1);
+            TTerm::BranchTrue { target, fallthrough }
+        }
+        // Any other opcode is an ordinary instruction that falls through.
+        _ => {
+            emit_instr(code, last, local_types, &mut stack, &mut stmts, max_stack)?;
+            TTerm::Jump(succ(0))
+        }
+    };
+
+    // Blocks never receive operand-stack values on entry in the v1 subset.
+    Ok(TBlock {
+        entry_stack: Vec::new(),
+        stmts,
+        term,
+    })
+}
+
+/// Raw index of `b`'s `k`-th successor; panics when `k` is out of range.
+fn block_succ(b: &RawBlock, k: usize) -> usize {
+    b.succs[k]
+}
+
+/// Emit the [`TStmt`](s) for the instruction at index `i`, tracking the type
+/// stack and `max_stack`; lookups that fail are returned as a [`JitVerdict`].
+fn emit_instr(
+    code: &CodeObject,
+    i: usize,
+    local_types: &[Option<JitType>],
+    stack: &mut Vec<JitType>,
+    stmts: &mut Vec<TStmt>,
+    max_stack: &mut u32,
+) -> Result<(), JitVerdict> {
+    let ins = code.instructions[i];
+    let pc = i as u32;
+    let mut push =
+        |op: TOp, ty: Option<JitType>, stack: &mut Vec<JitType>, stmts: &mut Vec<TStmt>| {
+            stmts.push(TStmt { pc, op });
+            if let Some(t) = ty {
+                stack.push(t);
+            }
+            *max_stack = (*max_stack).max(stack.len() as u32);
+        };
+    match ins.op {
+        OpCode::Nop | OpCode::Resume => {}
+        OpCode::LoadConst => {
+            // Checked lookup: an out-of-range index is a verdict, not a panic.
+            let (op, ty) = match code.constants.get(ins.arg as usize) {
+                Some(Constant::Int(v)) => (TOp::PushConstInt(*v), JitType::Int),
+                Some(Constant::Bool(v)) => (TOp::PushConstBool(*v), JitType::Bool),
+                Some(Constant::Float(v)) => (TOp::PushConstFloat(v.to_bits()), JitType::Float),
+                _ => return Err(JitVerdict::UnsupportedConst),
+            };
+            push(op, Some(ty), stack, stmts);
+        }
+        OpCode::LoadFast => {
+            let ty = local_types
+                .get(ins.arg as usize)
+                .copied()
+                .flatten()
+                .ok_or(JitVerdict::TypeUnknown)?;
+            push(TOp::LoadLocal(ins.arg), Some(ty), stack, stmts);
+        }
+        OpCode::StoreFast => {
+            stack.pop().ok_or(JitVerdict::StackUnderflow)?;
+            push(TOp::StoreLocal(ins.arg), None, stack, stmts);
+        }
+        OpCode::BinaryOp => {
+            let kind = bin_kind(ins.arg)?;
+            let b = stack.pop().ok_or(JitVerdict::StackUnderflow)?;
+            let a = stack.pop().ok_or(JitVerdict::StackUnderflow)?;
+            let (op, ty) = lower_bin(kind, a, b)?;
+            push(op, Some(ty), stack, stmts);
+        }
+        OpCode::CompareOp => {
+            let kind = cmp_kind(ins.arg)?;
+            let b = stack.pop().ok_or(JitVerdict::StackUnderflow)?;
+            let a = stack.pop().ok_or(JitVerdict::StackUnderflow)?;
+            let op = if a.is_integral() && b.is_integral() {
+                TOp::IntCmp(kind)
+            } else if a == JitType::Float && b == JitType::Float {
+                TOp::FloatCmp(kind)
+            } else {
+                return Err(JitVerdict::MixedArithTypes);
+            };
+            push(op, Some(JitType::Bool), stack, stmts);
+        }
+        OpCode::UnaryOp => {
+            let kind = unary_kind(ins.arg)?;
+            let a = stack.pop().ok_or(JitVerdict::StackUnderflow)?;
+            match (kind, a) {
+                (UnaryKind::Pos, JitType::Int | JitType::Float) => {
+                    // Identity; re-push same lane, emit nothing.
+                    stack.push(a);
+                }
+                (UnaryKind::Neg, t) if t.is_integral() => {
+                    push(TOp::IntNeg, Some(JitType::Int), stack, stmts)
+                }
+                (UnaryKind::Neg, JitType::Float) => {
+                    push(TOp::FloatNeg, Some(JitType::Float), stack, stmts);
+                }
+                (UnaryKind::Invert, t) if t.is_integral() => {
+                    push(TOp::IntInvert, Some(JitType::Int), stack, stmts);
+                }
+                (UnaryKind::Not, t) if t.is_integral() => {
+                    push(TOp::IntNot, Some(JitType::Bool), stack, stmts);
+                }
+                (UnaryKind::Not, JitType::Float) => {
+                    push(TOp::FloatNot, Some(JitType::Bool), stack, stmts);
+                }
+                _ => return Err(JitVerdict::UnsupportedOpcode("UNARY_OP lane")),
+            }
+        }
+        OpCode::PopTop => {
+            stack.pop().ok_or(JitVerdict::StackUnderflow)?;
+            push(TOp::Pop, None, stack, stmts);
+        }
+        OpCode::CopyTop => {
+            let t = *stack.last().ok_or(JitVerdict::StackUnderflow)?;
+            push(TOp::Dup, Some(t), stack, stmts);
+        }
+        OpCode::Swap => {
+            if ins.arg != 2 {
+                return Err(JitVerdict::UnsupportedOpcode("SWAP n!=2"));
+            }
+            let len = stack.len();
+            if len < 2 {
+                return Err(JitVerdict::StackUnderflow);
+            }
+            stack.swap(len - 1, len - 2);
+            push(TOp::Swap2, None, stack, stmts);
+        }
+        other => return Err(JitVerdict::UnsupportedOpcode(other.name())),
+    }
+    Ok(())
+}
+
+/// Pick the IR op and result lane for binary op `kind` at emission time,
+/// when both operand lanes are known.
+fn lower_bin(kind: ArithKind, a: JitType, b: JitType) -> Result<(TOp, JitType), JitVerdict> {
+    let both_bool = a == JitType::Bool && b == JitType::Bool;
+    // Integral pair: true division widens to float, the rest stay int.
+    if a.is_integral() && b.is_integral() {
+        return match kind {
+            ArithKind::TrueDiv => Ok((TOp::IntTrueDiv, JitType::Float)),
+            // bool∘bool bitwise ops are rejected rather than given a lane.
+            ArithKind::And | ArithKind::Or | ArithKind::Xor if both_bool => {
+                Err(JitVerdict::UnsupportedOpcode("bitwise on bool"))
+            }
+            _ => Ok((TOp::IntArith(kind), JitType::Int)),
+        };
+    }
+    // Anything else must be a float/float pairing.
+    if a != JitType::Float || b != JitType::Float {
+        return Err(JitVerdict::MixedArithTypes);
+    }
+    match kind {
+        ArithKind::Add | ArithKind::Sub | ArithKind::Mul | ArithKind::TrueDiv => {
+            Ok((TOp::FloatArith(kind), JitType::Float))
+        }
+        _ => Err(JitVerdict::UnsupportedOpcode("float floordiv/mod/bitop")),
+    }
+}
diff --git a/crates/weavepy-jit/src/engine.rs b/crates/weavepy-jit/src/engine.rs
new file mode 100644
index 0000000..70a52dc
--- /dev/null
+++ b/crates/weavepy-jit/src/engine.rs
@@ -0,0 +1,165 @@
+//! The Cranelift JIT module lifecycle and the compiled-frame entry
+//! point.
+//!
+//! A [`JitEngine`] owns one [`JITModule`]; every compiled frame is a
+//! native function defined into it. The engine is intended to be a
+//! per-thread singleton (the VM keeps it in thread-local storage, under
+//! the GIL), so the function pointers stay valid for the thread's
+//! lifetime and there is no cross-thread aliasing.
+
+use std::mem;
+
+use cranelift_codegen::ir::{types, AbiParam, Type};
+use cranelift_codegen::settings::{self, Configurable};
+use cranelift_codegen::Context;
+use cranelift_frontend::FunctionBuilderContext;
+use cranelift_jit::{JITBuilder, JITModule};
+use cranelift_module::{Linkage, Module};
+
+use crate::analyze::{analyze, JitVerdict};
+use crate::ir::TFunc;
+use crate::lower::build_function;
+use crate::runtime::{JitFrame, JitStatus};
+use crate::value::JitType;
+use weavepy_compiler::CodeObject;
+
+/// The native ABI of a compiled frame: takes a `*mut JitFrame`, returns
+/// an `i64` [`JitStatus`].
+pub(crate) type NativeFn = unsafe extern "C" fn(*mut JitFrame) -> i64;
+
+/// A compiled frame plus the metadata the VM needs to marshal values in
+/// and out and to apply the entry guard.
+#[derive(Debug)]
+pub struct CompiledFrame {
+    func: NativeFn,
+    /// Local slots to type-guard + pack before entry (read-before-write).
+    pub livein: Vec<u32>,
+    /// Stable lane of each local slot (`None` = not JIT-managed).
+    pub local_types: Vec<Option<JitType>>,
+    /// Max abstract operand-stack depth, for sizing the spill buffer.
+    pub max_stack: u32,
+    /// Number of local slots.
+    pub n_locals: u32,
+}
+
+impl CompiledFrame {
+    /// Enter the compiled frame.
+    ///
+    /// # Safety
+    ///
+    /// `frame` must point to a fully-initialised [`JitFrame`] whose
+    /// `locals` / `stack_spill` / `stack_tags` buffers are at least
+    /// `n_locals` / `max_stack` wide, and the owning [`JitEngine`] must
+    /// still be alive (its `JITModule` backs this function pointer).
+    #[must_use]
+    pub unsafe fn enter(&self, frame: *mut JitFrame) -> JitStatus {
+        // SAFETY: the caller upholds the buffer-size and liveness
+        // invariants documented above; the function pointer was produced
+        // by `JITModule::get_finalized_function` for this exact signature.
+        let raw = unsafe { (self.func)(frame) };
+        JitStatus::from_raw(raw)
+    }
+}
+
+/// Owns the Cranelift JIT module and reusable codegen contexts.
+pub struct JitEngine {
+    module: JITModule,
+    ctx: Context,
+    fbctx: FunctionBuilderContext,
+    ptr_ty: Type,
+    next_id: u32,
+}
+
+impl std::fmt::Debug for JitEngine {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("JitEngine")
+            .field("ptr_ty", &self.ptr_ty)
+            .field("next_id", &self.next_id)
+            .finish_non_exhaustive()
+    }
+}
+
+impl JitEngine {
+    /// Build a fresh engine for the host target. Returns `None` if the
+    /// host ISA can't be configured (e.g. an unsupported platform), in
+    /// which case the VM simply never tiers up.
+    #[must_use]
+    pub fn new() -> Option<JitEngine> {
+        let mut flag_builder = settings::builder();
+        // A JIT that emits absolute addresses and resolves libcalls
+        // in-process.
+        flag_builder.set("use_colocated_libcalls", "false").ok()?;
+        flag_builder.set("is_pic", "false").ok()?;
+        // Optimise generated code for speed (`none` is the fast-compile level).
+        flag_builder.set("opt_level", "speed").ok()?;
+        let isa_builder = cranelift_native::builder().ok()?;
+        let isa = isa_builder
+            .finish(settings::Flags::new(flag_builder))
+            .ok()?;
+        let builder = JITBuilder::with_isa(isa, cranelift_module::default_libcall_names());
+        let module = JITModule::new(builder);
+        let ptr_ty = module.target_config().pointer_type();
+        let ctx = module.make_context();
+        Some(JitEngine {
+            module,
+            ctx,
+            fbctx: FunctionBuilderContext::new(),
+            ptr_ty,
+            next_id: 0,
+        })
+    }
+
+    /// Analyze and compile a code object. Returns the compiled frame, or
+    /// the [`JitVerdict`] explaining why it is not JITable.
+    pub fn compile(&mut self, code: &CodeObject) -> Result<CompiledFrame, JitVerdict> {
+        let tfunc = analyze(code)?;
+        self.compile_tfunc(&tfunc)
+    }
+
+    /// Compile an already-analyzed [`TFunc`] (also the unit-test entry).
+    pub fn compile_tfunc(&mut self, tfunc: &TFunc) -> Result<CompiledFrame, JitVerdict> {
+        self.module.clear_context(&mut self.ctx);
+
+        // Signature: (frame: ptr) -> i64.
+        self.ctx
+            .func
+            .signature
+            .params
+            .push(AbiParam::new(self.ptr_ty));
+        self.ctx
+            .func
+            .signature
+            .returns
+            .push(AbiParam::new(types::I64));
+
+        build_function(&mut self.ctx.func, &mut self.fbctx, tfunc, self.ptr_ty);
+
+        let name = format!("wpjit_{}", self.next_id);
+        self.next_id += 1;
+        let id = self
+            .module
+            .declare_function(&name, Linkage::Local, &self.ctx.func.signature)
+            .map_err(|_| JitVerdict::NotConverged)?;
+        self.module
+            .define_function(id, &mut self.ctx)
+            .map_err(|_| JitVerdict::NotConverged)?;
+        self.module.clear_context(&mut self.ctx);
+        self.module
+            .finalize_definitions()
+            .map_err(|_| JitVerdict::NotConverged)?;
+
+        let code_ptr = self.module.get_finalized_function(id);
+        // SAFETY: `code_ptr` is a finalized function with exactly the
+        // `(*mut JitFrame) -> i64` signature declared above; the module
+        // keeps the code alive for the engine's lifetime.
+        let func: NativeFn = unsafe { mem::transmute::<*const u8, NativeFn>(code_ptr) };
+
+        Ok(CompiledFrame {
+            func,
+            livein: tfunc.livein_locals.clone(),
+            local_types: tfunc.local_types.clone(),
+            max_stack: tfunc.max_stack,
+            n_locals: tfunc.n_locals,
+        })
+    }
+}
diff --git a/crates/weavepy-jit/src/ir.rs b/crates/weavepy-jit/src/ir.rs
new file mode 100644
index 0000000..dcece81
--- /dev/null
+++ b/crates/weavepy-jit/src/ir.rs
@@ -0,0 +1,173 @@
+//! The typed mid-IR the analyzer emits and the lowerer consumes.
+//!
+//! It is a *stack machine* mirroring the bytecode, but with every
+//! operation resolved to a concrete [`JitType`] lane and every local
+//! resolved to a slot index. Keeping a tiny IR between bytecode and
+//! Cranelift means [`crate::analyze`] can be unit-tested without a
+//! codegen backend and [`crate::lower`] stays a straight syntax-directed
+//! translation.
+//!
+//! Cross-block operand-stack values are carried as Cranelift *block
+//! parameters* in lowering; [`TBlock::entry_stack`] records their static
+//! types so the lowerer can declare the right params. Locals become
+//! Cranelift *variables*, so merges are handled by the SSA builder
+//! without explicit phis.
+
+use crate::value::JitType;
+
+/// Index of a [`TBlock`] within a [`TFunc`].
+pub type BlockId = usize;
+
+/// Arithmetic operations the JIT lowers. `TrueDiv` (`/`) always yields a
+/// `float`; `FloorDiv`/`Mod` carry Python's round-toward-negative-
+/// infinity semantics on integers.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub enum ArithKind {
+    Add,
+    Sub,
+    Mul,
+    FloorDiv,
+    Mod,
+    TrueDiv,
+    And,
+    Or,
+    Xor,
+}
+
+/// Comparison operators (six-way), matching `CompareKind`.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub enum CmpKind {
+    Lt,
+    Le,
+    Eq,
+    Ne,
+    Gt,
+    Ge,
+}
+
+/// A single stack-machine operation. Operands are implicit (the top of
+/// the abstract value stack); results are pushed.
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub enum TOp {
+    /// Push an `int` constant.
+    PushConstInt(i64),
+    /// Push a `float` constant (stored as `f64::to_bits` so `TOp` stays
+    /// `Copy` + `PartialEq`).
+    PushConstFloat(u64),
+    /// Push a `bool` constant.
+    PushConstBool(bool),
+    /// Push `locals[slot]`.
+    LoadLocal(u32),
+    /// Pop into `locals[slot]`.
+    StoreLocal(u32),
+    /// `int (op) int → int`. `Add`/`Sub`/`Mul` deopt on i64 overflow;
+    /// `FloorDiv`/`Mod` deopt on zero divisor or `MIN / -1`. Never
+    /// carries `TrueDiv` (see [`TOp::IntTrueDiv`]).
+    IntArith(ArithKind),
+    /// `float (op) float → float`. Only `Add`/`Sub`/`Mul`/`TrueDiv`
+    /// (float floor-div / mod are non-JITable in v1).
+    FloatArith(ArithKind),
+    /// `int / int → float` (Python true division). Deopts on a zero
+    /// divisor (the interpreter raises `ZeroDivisionError`).
+    IntTrueDiv,
+    /// `int (cmp) int → bool`.
+    IntCmp(CmpKind),
+    /// `float (cmp) float → bool`.
+    FloatCmp(CmpKind),
+    /// `-int`. Deopts on `MIN` negation overflow.
+    IntNeg,
+    /// `-float`.
+    FloatNeg,
+    /// `~int`.
+    IntInvert,
+    /// `not x` for an integral (`int`/`bool`) operand → `bool`.
+    IntNot,
+    /// `not x` for a `float` operand → `bool`.
+    FloatNot,
+    /// Discard TOS.
+    Pop,
+    /// Duplicate TOS (`COPY`).
+    Dup,
+    /// Swap the top two stack entries (`SWAP 2`).
+    Swap2,
+}
+
+/// One IR statement: a [`TOp`] tagged with its originating bytecode pc
+/// so a side exit can name the exact resume point.
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct TStmt {
+    pub pc: u32,
+    pub op: TOp,
+}
+
+/// How a basic block transfers control.
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub enum TTerm {
+    /// Pop TOS and return it from the frame.
+    Return,
+    /// Unconditional branch; the current abstract stack is passed as
+    /// block args.
+    Jump(BlockId),
+    /// `POP_JUMP_IF_FALSE`: pop the condition; branch to `target` if
+    /// falsy, else `fallthrough`.
+    BranchFalse {
+        target: BlockId,
+        fallthrough: BlockId,
+    },
+    /// `POP_JUMP_IF_TRUE`: pop the condition; branch to `target` if
+    /// truthy, else `fallthrough`.
+    BranchTrue {
+        target: BlockId,
+        fallthrough: BlockId,
+    },
+}
+
+/// A basic block: a static entry-stack shape, a straight-line body, and
+/// a terminator.
+#[derive(Clone, Debug, PartialEq)]
+pub struct TBlock {
+    /// Types of the operand-stack values live on entry (lowered to
+    /// Cranelift block parameters), bottom-to-top.
+    pub entry_stack: Vec<JitType>,
+    pub stmts: Vec<TStmt>,
+    pub term: TTerm,
+}
+
+/// A fully analyzed, JITable function body.
+#[derive(Clone, Debug, PartialEq)]
+pub struct TFunc {
+    /// Number of local slots in the originating code object.
+    pub n_locals: u32,
+    /// Stable JIT type of each local slot, or `None` for slots the
+    /// region never touches (left untouched by the JIT).
+    pub local_types: Vec<Option<JitType>>,
+    /// Local slots that are live-in at function entry (read before
+    /// written). The VM type-guards and packs exactly these before
+    /// entering native code.
+    pub livein_locals: Vec<u32>,
+    /// Maximum abstract operand-stack depth, for sizing the deopt spill
+    /// buffer.
+    pub max_stack: u32,
+    pub blocks: Vec<TBlock>,
+    pub entry_block: BlockId,
+}
+
+impl TOp {
+    /// Whether this operation may side-exit (deopt), in which case its
+    /// abstract stack has to be spilled at its pc.
+    #[must_use]
+    pub fn can_deopt(self) -> bool {
+        match self {
+            TOp::IntArith(kind) => matches!(
+                kind,
+                ArithKind::Add
+                    | ArithKind::Sub
+                    | ArithKind::Mul
+                    | ArithKind::FloorDiv
+                    | ArithKind::Mod
+            ),
+            TOp::IntNeg | TOp::IntTrueDiv | TOp::FloatArith(ArithKind::TrueDiv) => true,
+            _ => false,
+        }
+    }
+}
diff --git a/crates/weavepy-jit/src/lib.rs b/crates/weavepy-jit/src/lib.rs
new file mode 100644
index 0000000..cac7fa9
--- /dev/null
+++ b/crates/weavepy-jit/src/lib.rs
@@ -0,0 +1,46 @@
+//! RFC 0032 — tier-2 Cranelift JIT for WeavePy's unboxed numeric frames.
+//!
+//! This crate compiles the *unboxed numeric/control-flow core* of a
+//! [`weavepy_compiler::CodeObject`] — `int`/`float`/`bool` arithmetic,
+//! comparisons, the conditional and unconditional jumps, `range`
+//! iteration, and `return` — to native machine code via Cranelift.
+//! Everything outside that subset (containers, attribute access, calls
+//! out, exceptions, generators) stays in the interpreter; a frame whose
+//! hot region touches an unsupported opcode is reported
+//! [`CompileOutcome::NotJitable`] and never re-attempted.
+//!
+//! The crate deliberately does **not** depend on `weavepy-vm`: it speaks
+//! only in `i64`/`f64`/`bool` lanes plus the side-exit protocol in
+//! [`runtime`], so the VM owns the `Object` model and marshals values in
+//! and out of a [`runtime::JitFrame`] around each native entry. That
+//! keeps the unsafe FFI surface tiny and the dependency graph acyclic.
+//!
+//! # Safety
+//!
+//! Entering compiled code is `unsafe` by nature (an indirect call
+//! through a function pointer with a `#[repr(C)]` argument). The unsafe
+//! is confined to [`engine`] and [`runtime`]; callers interact through
+//! the safe [`JitEngine`] API and the [`runtime::JitFrame`] struct.
+
+mod analyze;
+mod engine;
+mod ir;
+mod lower;
+mod runtime;
+mod value;
+
+pub use analyze::{analyze, JitVerdict};
+pub use engine::{CompiledFrame, JitEngine};
+pub use ir::{ArithKind, BlockId, CmpKind, TBlock, TFunc, TOp, TStmt, TTerm};
+pub use runtime::{JitFrame, JitStatus, SlotTag};
+pub use value::JitType;
+
+/// Outcome of attempting to compile a code object.
+#[derive(Debug)]
+pub enum CompileOutcome {
+    /// The code object compiled; the engine cached the native function
+    /// and the payload is the handle used to enter it.
+    Compiled(CompiledFrame),
+    /// Outside the JITable subset: record the verdict, stop re-attempting.
+    NotJitable(JitVerdict),
+}
diff --git a/crates/weavepy-jit/src/lower.rs b/crates/weavepy-jit/src/lower.rs
new file mode 100644
index 0000000..c2057f9
--- /dev/null
+++ b/crates/weavepy-jit/src/lower.rs
@@ -0,0 +1,550 @@
+//! Lower the typed IR ([`TFunc`]) to a Cranelift function.
+//!
+//! Locals become Cranelift *variables* (the SSA builder inserts phis at
+//! merges); the operand stack is an explicit `Vec` of SSA values, which
+//! the v1 subset guarantees is empty at every block boundary. Integer
+//! arithmetic is emitted with explicit overflow / divide-by-zero checks
+//! that branch to per-op *side-exit* blocks; a side exit writes every
+//! managed local + the spilled stack back into the [`JitFrame`] and returns
+//! [`JitStatus::Deopt`] so the interpreter resumes at the exact pc.
+
+use cranelift_codegen::ir::condcodes::{FloatCC, IntCC};
+use cranelift_codegen::ir::{types, Block, Function, InstBuilder, MemFlags, Type, Value};
+use cranelift_frontend::{FunctionBuilder, FunctionBuilderContext, Variable};
+
+use crate::ir::{ArithKind, CmpKind, TFunc, TOp, TStmt, TTerm};
+use crate::runtime::{JitFrame, JitStatus, SlotTag};
+use crate::value::JitType;
+
+const OFF_LOCALS: i32 = core::mem::offset_of!(JitFrame, locals) as i32; // locals ptr
+const OFF_RET_BITS: i32 = core::mem::offset_of!(JitFrame, ret_bits) as i32; // result bits
+const OFF_RET_TAG: i32 = core::mem::offset_of!(JitFrame, ret_tag) as i32; // result tag
+const OFF_DEOPT_PC: i32 = core::mem::offset_of!(JitFrame, deopt_pc) as i32; // resume pc
+const OFF_STACK_SPILL: i32 = core::mem::offset_of!(JitFrame, stack_spill) as i32; // spill ptr
+const OFF_STACK_TAGS: i32 = core::mem::offset_of!(JitFrame, stack_tags) as i32; // tags ptr
+const OFF_STACK_LEN: i32 = core::mem::offset_of!(JitFrame, stack_len) as i32; // spill count
+
+/// Lower `tfunc` into `func`, then seal every block and finalize the builder.
+pub(crate) fn build_function(
+    func: &mut Function,
+    fbctx: &mut FunctionBuilderContext,
+    tfunc: &TFunc,
+    ptr_ty: Type,
+) {
+    let mut fb = FunctionBuilder::new(func, fbctx);
+    // The lowerer only borrows the builder; it is dropped before sealing.
+    Lowerer::new(&mut fb, tfunc, ptr_ty).build();
+    fb.seal_all_blocks();
+    fb.finalize();
+}
+
+struct Lowerer<'a, 'b> {
+    b: &'a mut FunctionBuilder<'b>,
+    tfunc: &'a TFunc,
+    ptr_ty: Type,
+    /// One Cranelift block per `TBlock`, at the same index.
+    cl_blocks: Vec<Block>,
+    /// One variable per managed local slot (others unused).
+    vars: Vec<Option<Variable>>,
+    frame_ptr: Value, // the function's `JitFrame` pointer parameter
+    locals_base: Value, // pointer loaded from `JitFrame::locals`
+    spill_base: Value, // pointer loaded from `JitFrame::stack_spill`
+    tags_base: Value, // pointer loaded from `JitFrame::stack_tags`
+    /// The abstract operand stack: SSA value + lane.
+    vstack: Vec<(Value, JitType)>,
+}
+
+impl<'a, 'b> Lowerer<'a, 'b> {
+    fn new(b: &'a mut FunctionBuilder<'b>, tfunc: &'a TFunc, ptr_ty: Type) -> Self {
+        // Not real SSA values: `build` assigns all four before any use.
+        let placeholder = Value::from_u32(0);
+        Self {
+            b,
+            tfunc,
+            ptr_ty,
+            cl_blocks: Vec::new(),
+            vars: Vec::new(),
+            vstack: Vec::new(),
+            frame_ptr: placeholder,
+            locals_base: placeholder,
+            spill_base: placeholder,
+            tags_base: placeholder,
+        }
+    }
+
+    /// Cranelift machine type of a lane: `F64` for `Float`, `I64` for
+    /// every other type (including `Unknown`).
+    fn cl_ty(ty: JitType) -> Type {
+        let is_float = matches!(ty, JitType::Float);
+        if is_float { types::F64 } else { types::I64 }
+    }
+
+    fn tag(ty: JitType) -> i64 {
+        let slot_tag = match ty {
+            JitType::Float => SlotTag::Float,
+            JitType::Bool => SlotTag::Bool,
+            JitType::Int | JitType::Unknown => SlotTag::Int, // no tag for `Unknown`
+        };
+        slot_tag as i64
+    }
+
+    /// Emit the prologue, then lower every block in index order.
+    fn build(&mut self) {
+        let flags = MemFlags::trusted();
+        let ptr_ty = self.ptr_ty;
+
+        // The prologue block receives the function's only parameter.
+        let prologue = self.b.create_block();
+        self.b.append_block_params_for_function_params(prologue);
+        self.b.switch_to_block(prologue);
+        let frame = self.b.block_params(prologue)[0];
+        self.frame_ptr = frame;
+
+        // Load the three buffer base pointers out of the frame.
+        self.locals_base = self.b.ins().load(ptr_ty, flags, frame, OFF_LOCALS);
+        self.spill_base = self.b.ins().load(ptr_ty, flags, frame, OFF_STACK_SPILL);
+        self.tags_base = self.b.ins().load(ptr_ty, flags, frame, OFF_STACK_TAGS);
+
+        // Allocate the Cranelift blocks up front so jumps can target them.
+        let n_blocks = self.tfunc.blocks.len();
+        self.cl_blocks = Vec::with_capacity(n_blocks);
+        for _ in 0..n_blocks {
+            let blk = self.b.create_block();
+            self.cl_blocks.push(blk);
+        }
+
+        // Each typed local gets a variable seeded from its frame slot.
+        self.vars = vec![None; self.tfunc.n_locals as usize];
+        let tfunc = self.tfunc;
+        for (slot, ty) in tfunc.local_types.iter().enumerate() {
+            let Some(ty) = *ty else { continue };
+            let lane = Self::cl_ty(ty);
+            let var = self.b.declare_var(lane);
+            let byte_off = (slot as i32) * 8;
+            let init = self.b.ins().load(lane, flags, self.locals_base, byte_off);
+            self.b.def_var(var, init);
+            self.vars[slot] = Some(var);
+        }
+
+        // Leave the prologue for the function's entry block.
+        let first = self.cl_blocks[tfunc.entry_block];
+        self.b.ins().jump(first, &[]);
+
+        // Lower the bodies; the abstract stack starts empty in each block.
+        for bi in 0..n_blocks {
+            let blk = self.cl_blocks[bi];
+            self.b.switch_to_block(blk);
+            self.vstack.clear();
+            self.emit_block(bi);
+        }
+    }
+
+    /// Lower block `bi`: its statements in order, then its terminator.
+    fn emit_block(&mut self, bi: usize) {
+        // Copy the `&'a TFunc` out of `self` to borrow the block, not clone it.
+        let tfunc = self.tfunc;
+        let block = &tfunc.blocks[bi];
+        for &stmt in &block.stmts {
+            self.emit_stmt(stmt);
+        }
+        match block.term {
+            TTerm::Return => self.emit_return(),
+            TTerm::Jump(t) => {
+                let target = self.cl_blocks[t];
+                self.b.ins().jump(target, &[]);
+            }
+            // `BranchFalse` takes `target` when the condition is falsy.
+            TTerm::BranchFalse {
+                target,
+                fallthrough,
+            } => self.emit_branch(fallthrough, target),
+            TTerm::BranchTrue {
+                target,
+                fallthrough,
+            } => self.emit_branch(target, fallthrough),
+        }
+    }
+
+    /// Pop the condition and branch to `if_true` when it is truthy, else
+    /// to `if_false`.
+    fn emit_branch(&mut self, if_true: crate::ir::BlockId, if_false: crate::ir::BlockId) {
+        let (cond, ty) = self.pop();
+        let truthy = self.truth(cond, ty);
+        let (then_blk, else_blk) = (self.cl_blocks[if_true], self.cl_blocks[if_false]);
+        self.b.ins().brif(truthy, then_blk, &[], else_blk, &[]);
+    }
+
+    /// Pop the result, publish its bits and tag in the frame, and return
+    /// `JitStatus::Returned` to the caller.
+    fn emit_return(&mut self) {
+        let (flags, frame) = (MemFlags::trusted(), self.frame_ptr);
+        let (result, lane) = self.pop();
+        // The value goes first, then the tag describing how to decode it.
+        self.b.ins().store(flags, result, frame, OFF_RET_BITS);
+        let tag_val = self.b.ins().iconst(types::I32, Self::tag(lane));
+        self.b.ins().store(flags, tag_val, frame, OFF_RET_TAG);
+
+        let code = self.b.ins().iconst(types::I64, JitStatus::Returned as i64);
+        self.b.ins().return_(&[code]);
+    }
+
+    /// Lower one statement, updating the abstract operand stack (and, for
+    /// guarded operations, the current block).
+    fn emit_stmt(&mut self, stmt: TStmt) {
+        let TStmt { pc, op } = stmt;
+        // Arms that simply produce one value yield it here; every other arm
+        // manipulates the stack itself and returns early.
+        let produced = match op {
+            TOp::PushConstInt(v) => (self.b.ins().iconst(types::I64, v), JitType::Int),
+            TOp::PushConstBool(v) => {
+                let bit = i64::from(v);
+                (self.b.ins().iconst(types::I64, bit), JitType::Bool)
+            }
+            TOp::PushConstFloat(bits) => {
+                let imm = f64::from_bits(bits);
+                (self.b.ins().f64const(imm), JitType::Float)
+            }
+            TOp::LoadLocal(slot) => {
+                let idx = slot as usize;
+                let lane = self.tfunc.local_types[idx].unwrap_or(JitType::Int);
+                let var = self.vars[idx].expect("managed local");
+                (self.b.use_var(var), lane)
+            }
+            TOp::StoreLocal(slot) => {
+                let (value, _) = self.pop();
+                let var = self.vars[slot as usize].expect("managed local");
+                self.b.def_var(var, value);
+                return;
+            }
+            // Guarded and multi-instruction operations have their own helpers.
+            TOp::IntArith(kind) => return self.emit_int_arith(kind, pc),
+            TOp::FloatArith(kind) => return self.emit_float_arith(kind, pc),
+            TOp::IntTrueDiv => return self.emit_int_truediv(pc),
+            TOp::IntCmp(kind) => return self.emit_int_cmp(kind),
+            TOp::FloatCmp(kind) => return self.emit_float_cmp(kind),
+            TOp::IntNeg => return self.emit_int_neg(pc),
+            TOp::FloatNeg => {
+                let (x, _) = self.pop();
+                (self.b.ins().fneg(x), JitType::Float)
+            }
+            TOp::IntInvert => {
+                let (x, _) = self.pop();
+                (self.b.ins().bnot(x), JitType::Int)
+            }
+            TOp::IntNot => {
+                let (x, _) = self.pop();
+                let zero = self.b.ins().iconst(types::I64, 0);
+                let is_zero = self.b.ins().icmp(IntCC::Equal, x, zero);
+                (self.b.ins().uextend(types::I64, is_zero), JitType::Bool)
+            }
+            TOp::FloatNot => {
+                let (x, _) = self.pop();
+                let zero = self.b.ins().f64const(0.0);
+                let is_zero = self.b.ins().fcmp(FloatCC::Equal, x, zero);
+                (self.b.ins().uextend(types::I64, is_zero), JitType::Bool)
+            }
+            TOp::Pop => {
+                self.pop();
+                return;
+            }
+            TOp::Dup => *self.vstack.last().expect("dup on empty"),
+            TOp::Swap2 => {
+                let len = self.vstack.len();
+                self.vstack.swap(len - 1, len - 2);
+                return;
+            }
+        };
+        self.vstack.push(produced);
+    }
+
+    // ---- arithmetic ------------------------------------------------
+
+    /// Lower an integer binary op; `Add`/`Sub`/`Mul` deopt on `i64` overflow.
+    fn emit_int_arith(&mut self, kind: ArithKind, pc: u32) {
+        // Select the overflow-checked emitter, or hand off to a dedicated helper.
+        let checked: fn(&mut Self, Value, Value) -> (Value, Value) = match kind {
+            ArithKind::Add => Self::checked_add,
+            ArithKind::Sub => Self::checked_sub,
+            ArithKind::Mul => Self::checked_mul,
+            ArithKind::FloorDiv => return self.emit_floordiv(pc),
+            ArithKind::Mod => return self.emit_mod(pc),
+            ArithKind::And => return self.emit_int_bitop(BitOp::And),
+            ArithKind::Or => return self.emit_int_bitop(BitOp::Or),
+            ArithKind::Xor => return self.emit_int_bitop(BitOp::Xor),
+            ArithKind::TrueDiv => return self.emit_int_truediv(pc),
+        };
+        // Snapshot first: the side exit spills the stack as it was before the op.
+        let stack_before = self.vstack.clone();
+        let (rhs, _) = self.pop();
+        let (lhs, _) = self.pop();
+        let (result, overflowed) = checked(self, lhs, rhs);
+        let resume = self.guard(overflowed, pc, &stack_before);
+        self.b.switch_to_block(resume);
+        self.vstack.push((result, JitType::Int));
+    }
+
+    /// Bitwise op on two integer lanes; it cannot fail, so nothing is guarded.
+    fn emit_int_bitop(&mut self, op: BitOp) {
+        let (rhs, lhs) = (self.pop().0, self.pop().0);
+        let out = match op {
+            BitOp::And => self.b.ins().band(lhs, rhs),
+            BitOp::Or => self.b.ins().bor(lhs, rhs),
+            BitOp::Xor => self.b.ins().bxor(lhs, rhs),
+        };
+        self.vstack.push((out, JitType::Int));
+    }
+
+    /// Lower a float binary op. Only `/` is guarded; `+`, `-`, `*` are not.
+    fn emit_float_arith(&mut self, kind: ArithKind, pc: u32) {
+        if let ArithKind::TrueDiv = kind {
+            return self.emit_float_truediv(pc);
+        }
+        let (rhs, _) = self.pop();
+        let (lhs, _) = self.pop();
+        let out = match kind {
+            ArithKind::Add => self.b.ins().fadd(lhs, rhs),
+            ArithKind::Sub => self.b.ins().fsub(lhs, rhs),
+            ArithKind::Mul => self.b.ins().fmul(lhs, rhs),
+            _ => unreachable!("non-jitable float arith reached lowering"),
+        };
+        self.vstack.push((out, JitType::Float));
+    }
+
+    /// Float `/`. A zero divisor (either sign) side-exits so the interpreter
+    /// raises `ZeroDivisionError` with the right traceback.
+    fn emit_float_truediv(&mut self, pc: u32) {
+        let stack_before = self.vstack.clone();
+        let (divisor, _) = self.pop();
+        let (dividend, _) = self.pop();
+        let zero = self.b.ins().f64const(0.0);
+        let div_by_zero = self.b.ins().fcmp(FloatCC::Equal, divisor, zero);
+        let resume = self.guard(div_by_zero, pc, &stack_before);
+        self.b.switch_to_block(resume);
+        let quotient = self.b.ins().fdiv(dividend, divisor);
+        self.vstack.push((quotient, JitType::Float));
+    }
+
+    /// Int `/` → float: side-exit on a zero divisor, else divide as `f64`.
+    fn emit_int_truediv(&mut self, pc: u32) {
+        let stack_before = self.vstack.clone();
+        let (divisor, dividend) = (self.pop().0, self.pop().0);
+        let zero = self.b.ins().iconst(types::I64, 0);
+        let div_by_zero = self.b.ins().icmp(IntCC::Equal, divisor, zero);
+        let resume = self.guard(div_by_zero, pc, &stack_before);
+        self.b.switch_to_block(resume);
+        // NOTE(review): |operand| > 2^53 rounds here — confirm the VM agrees.
+        let [lhs, rhs] = [dividend, divisor].map(|v| self.b.ins().fcvt_from_sint(types::F64, v));
+        let quotient = self.b.ins().fdiv(lhs, rhs);
+        self.vstack.push((quotient, JitType::Float));
+    }
+
+    /// Unary minus; `-i64::MIN` is unrepresentable, so that input side-exits.
+    fn emit_int_neg(&mut self, pc: u32) {
+        let stack_before = self.vstack.clone();
+        let (operand, _) = self.pop();
+        let most_negative = self.b.ins().iconst(types::I64, i64::MIN);
+        let overflows = self.b.ins().icmp(IntCC::Equal, operand, most_negative);
+        let resume = self.guard(overflows, pc, &stack_before);
+        self.b.switch_to_block(resume);
+        self.vstack.push((self.b.ins().ineg(operand), JitType::Int));
+    }
+
+    /// `//` on `i64` with Python semantics. A zero divisor and the
+    /// `MIN // -1` overflow side-exit; otherwise the truncated quotient
+    /// is rounded down toward negative infinity.
+    fn emit_floordiv(&mut self, pc: u32) {
+        let stack_before = self.vstack.clone();
+        let (divisor, _) = self.pop();
+        let (dividend, _) = self.pop();
+        let must_exit = self.div_guard_cond(dividend, divisor);
+        let resume = self.guard(must_exit, pc, &stack_before);
+        self.b.switch_to_block(resume);
+
+        let trunc_q = self.b.ins().sdiv(dividend, divisor);
+        let rem = self.b.ins().srem(dividend, divisor);
+        // -1 when `rem` is nonzero with a sign unlike the divisor's, else 0.
+        let correction = self.floor_adjust(rem, divisor);
+        let floored = self.b.ins().iadd(trunc_q, correction);
+        self.vstack.push((floored, JitType::Int));
+    }
+
+    /// `%` on `i64` with Python semantics: the result takes the divisor's sign.
+    fn emit_mod(&mut self, pc: u32) {
+        let stack_before = self.vstack.clone();
+        let (divisor, _) = self.pop();
+        let (dividend, _) = self.pop();
+        let must_exit = self.div_guard_cond(dividend, divisor);
+        let resume = self.guard(must_exit, pc, &stack_before);
+        self.b.switch_to_block(resume);
+
+        let rem = self.b.ins().srem(dividend, divisor);
+        // A nonzero remainder signed unlike the divisor is shifted by it.
+        let shift = self.floor_needs_adjust(rem, divisor);
+        let shifted = self.b.ins().iadd(rem, divisor);
+        let result = self.b.ins().select(shift, shifted, rem);
+        self.vstack.push((result, JitType::Int));
+    }
+
+    /// Side-exit condition for `sdiv`/`srem`: `b == 0 || (a == MIN && b == -1)`.
+    fn div_guard_cond(&mut self, a: Value, b: Value) -> Value {
+        let zero = self.b.ins().iconst(types::I64, 0);
+        let div_by_zero = self.b.ins().icmp(IntCC::Equal, b, zero);
+        let most_negative = self.b.ins().iconst(types::I64, i64::MIN);
+        let minus_one = self.b.ins().iconst(types::I64, -1);
+        let lhs_is_min = self.b.ins().icmp(IntCC::Equal, a, most_negative);
+        let rhs_is_minus_one = self.b.ins().icmp(IntCC::Equal, b, minus_one);
+        let quotient_overflows = self.b.ins().band(lhs_is_min, rhs_is_minus_one);
+        self.b.ins().bor(div_by_zero, quotient_overflows)
+    }
+
+    /// I8 flag: `r` is nonzero and its sign differs from `b`'s.
+    fn floor_needs_adjust(&mut self, r: Value, b: Value) -> Value {
+        let zero = self.b.ins().iconst(types::I64, 0);
+        let nonzero = self.b.ins().icmp(IntCC::NotEqual, r, zero);
+        let [r_negative, b_negative] =
+            [r, b].map(|v| self.b.ins().icmp(IntCC::SignedLessThan, v, zero));
+        let mixed_signs = self.b.ins().bxor(r_negative, b_negative);
+        self.b.ins().band(nonzero, mixed_signs)
+    }
+
+    /// Addend for the truncated quotient: `-1` if the floor correction
+    /// applies, `0` otherwise.
+    fn floor_adjust(&mut self, r: Value, b: Value) -> Value {
+        let apply = self.floor_needs_adjust(r, b);
+        let [minus_one, zero] = [-1_i64, 0].map(|imm| self.b.ins().iconst(types::I64, imm));
+        self.b.ins().select(apply, minus_one, zero)
+    }
+
+    // ---- comparisons ----------------------------------------------
+
+    /// Signed integer comparison; pushes the outcome as a `Bool` lane.
+    fn emit_int_cmp(&mut self, kind: CmpKind) {
+        let (rhs, lhs) = (self.pop().0, self.pop().0);
+        let cond = match kind {
+            CmpKind::Eq => IntCC::Equal,
+            CmpKind::Ne => IntCC::NotEqual,
+            CmpKind::Lt => IntCC::SignedLessThan,
+            CmpKind::Le => IntCC::SignedLessThanOrEqual,
+            CmpKind::Gt => IntCC::SignedGreaterThan,
+            CmpKind::Ge => IntCC::SignedGreaterThanOrEqual,
+        };
+        let flag = self.b.ins().icmp(cond, lhs, rhs);
+        let widened = self.b.ins().uextend(types::I64, flag);
+        self.vstack.push((widened, JitType::Bool));
+    }
+
+    /// Floating-point comparison; pushes the outcome as a `Bool` lane.
+    fn emit_float_cmp(&mut self, kind: CmpKind) {
+        let (rhs, lhs) = (self.pop().0, self.pop().0);
+        let cond = match kind {
+            CmpKind::Eq => FloatCC::Equal,
+            CmpKind::Ne => FloatCC::NotEqual,
+            CmpKind::Lt => FloatCC::LessThan,
+            CmpKind::Le => FloatCC::LessThanOrEqual,
+            CmpKind::Gt => FloatCC::GreaterThan,
+            CmpKind::Ge => FloatCC::GreaterThanOrEqual,
+        };
+        let flag = self.b.ins().fcmp(cond, lhs, rhs);
+        let widened = self.b.ins().uextend(types::I64, flag);
+        self.vstack.push((widened, JitType::Bool));
+    }
+
+    // ---- overflow helpers (portable signed-overflow detection) -----
+
+    /// Wrapping add plus an overflow flag: set when the result's sign
+    /// differs from the sign of both operands.
+    fn checked_add(&mut self, a: Value, b: Value) -> (Value, Value) {
+        let sum = self.b.ins().iadd(a, b);
+        let [a_flip, b_flip] = [a, b].map(|x| self.b.ins().bxor(x, sum));
+        let both = self.b.ins().band(a_flip, b_flip);
+        let zero = self.b.ins().iconst(types::I64, 0);
+        (sum, self.b.ins().icmp(IntCC::SignedLessThan, both, zero))
+    }
+
+    /// Wrapping subtract plus an overflow flag: set when the operands'
+    /// signs differ and the result's sign differs from `a`'s.
+    fn checked_sub(&mut self, a: Value, b: Value) -> (Value, Value) {
+        let diff = self.b.ins().isub(a, b);
+        let [a_vs_b, a_vs_diff] = [b, diff].map(|x| self.b.ins().bxor(a, x));
+        let both = self.b.ins().band(a_vs_b, a_vs_diff);
+        let zero = self.b.ins().iconst(types::I64, 0);
+        (diff, self.b.ins().icmp(IntCC::SignedLessThan, both, zero))
+    }
+
+    /// Wrapping multiply; overflow iff the high half isn't the low half's sign fill.
+    fn checked_mul(&mut self, a: Value, b: Value) -> (Value, Value) {
+        let low = self.b.ins().imul(a, b);
+        let high = self.b.ins().smulhi(a, b);
+        let sign_fill = self.b.ins().sshr_imm(low, 63);
+        (low, self.b.ins().icmp(IntCC::NotEqual, high, sign_fill))
+    }
+
+    // ---- deopt / side exits ---------------------------------------
+
+    /// Branch on `cond` to a fresh side-exit block that deopts at `pc` with
+    /// `snapshot`; return the fall-through block for the caller to switch to.
+    fn guard(&mut self, cond: Value, pc: u32, snapshot: &[(Value, JitType)]) -> Block {
+        let side_exit = self.b.create_block();
+        let resume = self.b.create_block();
+        self.b.ins().brif(cond, side_exit, &[], resume, &[]);
+        self.b.switch_to_block(side_exit);
+        self.emit_deopt(pc, snapshot);
+        resume
+    }
+
+    /// Terminate the current block with a side exit: flush every managed
+    /// local and the `snapshot` stack into the frame, record the stack
+    /// length and `pc`, and return `JitStatus::Deopt`.
+    fn emit_deopt(&mut self, pc: u32, snapshot: &[(Value, JitType)]) {
+        let (flags, frame) = (MemFlags::trusted(), self.frame_ptr);
+        // Locals: one 8-byte slot each, indexed by local number.
+        for (slot, var) in self.vars.iter().enumerate() {
+            let Some(var) = *var else { continue };
+            let current = self.b.use_var(var);
+            let byte_off = (slot as i32) * 8;
+            self.b.ins().store(flags, current, self.locals_base, byte_off);
+        }
+
+        // Stack: 8-byte values and 4-byte tags, bottom-to-top.
+        for (idx, &(value, lane)) in snapshot.iter().enumerate() {
+            let idx = idx as i32;
+            self.b.ins().store(flags, value, self.spill_base, idx * 8);
+            let tag_val = self.b.ins().iconst(types::I32, Self::tag(lane));
+            self.b.ins().store(flags, tag_val, self.tags_base, idx * 4);
+        }
+
+        // What the VM needs to resume: stack depth, pc, and the status.
+        let depth = self.b.ins().iconst(types::I32, snapshot.len() as i64);
+        self.b.ins().store(flags, depth, frame, OFF_STACK_LEN);
+        let resume_pc = self.b.ins().iconst(types::I32, i64::from(pc));
+        self.b.ins().store(flags, resume_pc, frame, OFF_DEOPT_PC);
+        let code = self.b.ins().iconst(types::I64, JitStatus::Deopt as i64);
+        self.b.ins().return_(&[code]);
+    }
+
+    // ---- helpers ---------------------------------------------------
+
+    /// Branch condition for Python truthiness: nonzero (in the value's own
+    /// lane) is true.
+    fn truth(&mut self, val: Value, ty: JitType) -> Value {
+        if ty == JitType::Float {
+            let zero = self.b.ins().f64const(0.0);
+            return self.b.ins().fcmp(FloatCC::NotEqual, val, zero);
+        }
+        // Every non-float type is compared as an `i64`, matching the lane
+        // `cl_ty` assigns it.
+        let zero = self.b.ins().iconst(types::I64, 0);
+        self.b.ins().icmp(IntCC::NotEqual, val, zero)
+    }
+
+    fn pop(&mut self) -> (Value, JitType) {
+        self.vstack.pop().expect("operand stack underflow in lower") // panics when empty
+    }
+}
+
+#[derive(Clone, Copy)]
+enum BitOp {
+    And, // lowered to `band`
+    Or, // lowered to `bor`
+    Xor, // lowered to `bxor`
+}
diff --git a/crates/weavepy-jit/src/runtime.rs b/crates/weavepy-jit/src/runtime.rs
new file mode 100644
index 0000000..4545f8f
--- /dev/null
+++ b/crates/weavepy-jit/src/runtime.rs
@@ -0,0 +1,121 @@
+//! The native-call ABI: the `#[repr(C)]` [`JitFrame`] the VM fills
+//! before entering compiled code and reads after it exits, plus the
+//! side-exit status protocol.
+//!
+//! A compiled frame is a single native function with the signature
+//!
+//! ```text
+//! extern "C" fn(frame: *mut JitFrame) -> i64   // an i64 JitStatus
+//! ```
+//!
+//! On a [`JitStatus::Returned`] exit the function has written
+//! [`JitFrame::ret_bits`] / [`JitFrame::ret_tag`]. On a
+//! [`JitStatus::Deopt`] exit it has written [`JitFrame::deopt_pc`] and
+//! spilled the live abstract operand stack into
+//! [`JitFrame::stack_spill`] / [`JitFrame::stack_tags`] (bottom-to-top)
+//! with [`JitFrame::stack_len`] entries, plus written back every
+//! JIT-managed local into [`JitFrame::locals`]. The VM then rebuilds its
+//! interpreter state and resumes at `deopt_pc`, bit-for-bit as though
+//! the JIT had never run.
+
+/// The status returned (as an `i64`) by a compiled frame. The numeric
+/// values are ABI: generated code returns them as immediates.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+#[repr(i64)]
+pub enum JitStatus {
+    /// The frame ran to a `RETURN_VALUE`. The return value is in
+    /// [`JitFrame::ret_bits`] / [`JitFrame::ret_tag`].
+    Returned = 0,
+    /// The frame took a side exit. The VM resumes at
+    /// [`JitFrame::deopt_pc`] with the spilled stack + written-back locals.
+    Deopt = 1,
+}
+
+impl JitStatus {
+    /// Decode a compiled frame's raw result; anything but `0` is `Deopt`.
+    #[inline]
+    #[must_use]
+    pub fn from_raw(v: i64) -> JitStatus {
+        if v == JitStatus::Returned as i64 {
+            return JitStatus::Returned;
+        }
+        JitStatus::Deopt
+    }
+}
+
+/// How to interpret a `u64` slot in [`JitFrame::locals`] /
+/// [`JitFrame::stack_spill`], or the bits in [`JitFrame::ret_bits`].
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+#[repr(u32)]
+pub enum SlotTag {
+    /// `i64` bit pattern → `Object::Int`.
+    Int = 0,
+    /// `f64` bit pattern (via `to_bits`) → `Object::Float`.
+    Float = 1,
+    /// `0`/`1` → `Object::Bool`.
+    Bool = 2,
+}
+
+impl SlotTag {
+    /// Decode a raw tag written by native code; unrecognised values
+    /// decode as `Int`.
+    #[inline]
+    #[must_use]
+    pub fn from_raw(v: u32) -> SlotTag {
+        // Table position == discriminant (`Int = 0`, `Float = 1`, `Bool = 2`).
+        const BY_DISCRIMINANT: [SlotTag; 3] = [SlotTag::Int, SlotTag::Float, SlotTag::Bool];
+        let known = BY_DISCRIMINANT.get(v as usize).copied();
+        known.unwrap_or(SlotTag::Int)
+    }
+}
+
+/// The exchange buffer the VM passes to a compiled frame.
+///
+/// The VM owns the backing storage (`Vec<u64>` / `Vec<u32>`); this
+/// struct only holds raw pointers to it for one native call. Generated
+/// code addresses fields by their `#[repr(C)]` offsets and performs no
+/// bounds checks, so the VM must size the buffers from the analysis.
+#[repr(C)]
+#[derive(Debug)]
+pub struct JitFrame {
+    /// Slot-indexed local storage, one `u64` per code-object local.
+    /// Holds `i64` / `f64`-bits / `bool` per the local's stable type.
+    pub locals: *mut u64,
+    /// Number of valid entries in [`Self::locals`].
+    pub n_locals: u32,
+    /// OSR entry: the bytecode pc to begin execution at. `0` enters at
+    /// the function start; a loop-header pc enters mid-frame.
+    pub entry_pc: u32,
+
+    /// `Returned`: the return value's bit pattern.
+    pub ret_bits: u64,
+    /// `Returned`: the return value's [`SlotTag`].
+    pub ret_tag: u32,
+
+    /// `Deopt`: the bytecode pc to resume interpretation at.
+    pub deopt_pc: u32,
+    /// `Deopt`: spilled abstract operand stack, bottom-to-top.
+    pub stack_spill: *mut u64,
+    /// `Deopt`: matching [`SlotTag`]s for [`Self::stack_spill`].
+    pub stack_tags: *mut u32,
+    /// `Deopt`: number of spilled stack entries.
+    pub stack_len: u32,
+    /// Capacity of [`Self::stack_spill`] / [`Self::stack_tags`].
+    pub stack_cap: u32,
+}
+
+impl JitFrame {
+    /// Bit pattern of `v`, as stored in a `u64` slot.
+    #[inline]
+    #[must_use]
+    pub fn f64_to_bits(v: f64) -> u64 {
+        f64::to_bits(v)
+    }
+
+    /// The `f64` whose bit pattern is `bits` (inverse of `f64_to_bits`).
+    #[inline]
+    #[must_use]
+    pub fn bits_to_f64(bits: u64) -> f64 {
+        f64::from_bits(bits)
+    }
+}
diff --git a/crates/weavepy-jit/src/value.rs b/crates/weavepy-jit/src/value.rs
new file mode 100644
index 0000000..564fc3c
--- /dev/null
+++ b/crates/weavepy-jit/src/value.rs
@@ -0,0 +1,60 @@
+//! The unboxed value model and type lattice the JIT reasons about.
+//!
+//! Only three concrete Python types are representable as unboxed machine
+//! values: `int` (as `i64`), `float` (as `f64`), and `bool` (as `0`/`1`
+//! in an `i64` lane). Everything else is [`JitType::Unknown`], which
+//! makes any region that would need it non-JITable.
+//!
+//! A deliberate restriction keeps deopt simple (see `analyze`): within a
+//! single compiled region, each local slot and each abstract-stack
+//! position has **one** stable [`JitType`]. Straight-line retyping of a
+//! local (`x = 1; x = 2.0`) is rejected as non-JITable rather than
+//! tracked per-pc.
+
+/// The abstract type of an unboxed value flowing through the JIT.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub enum JitType {
+    /// CPython `int` that fits in `i64`. Overflow deopts to the
+    /// interpreter, which promotes to a bignum.
+    Int,
+    /// CPython `float` (`f64`).
+    Float,
+    /// CPython `bool`. Distinct from `Int` (same `i64` lane) so the VM rebuilds
+    /// the right `Object` variant on deopt; arithmetic promotes it to `Int` first.
+    Bool,
+    /// Anything the JIT can't represent. Its presence as an operand to a
+    /// supported opcode makes the enclosing region non-JITable.
+    Unknown,
+}
+
+impl JitType {
+    /// Whether this is one of `Int`, `Float` or `Bool`, i.e. anything but
+    /// `Unknown`.
+    #[inline]
+    #[must_use]
+    pub fn is_representable(self) -> bool {
+        self != JitType::Unknown
+    }
+
+    /// Whether this type lives in the `i64` lane: `Int` and `Bool` share
+    /// that machine representation.
+    #[inline]
+    #[must_use]
+    pub fn is_integral(self) -> bool {
+        self == JitType::Int || self == JitType::Bool
+    }
+
+    /// Dataflow join at a control-flow merge. A type joined with itself is
+    /// unchanged; any two different types collapse to [`JitType::Unknown`].
+    /// That includes `Bool` with `Int`: a slot that is sometimes one and
+    /// sometimes the other is non-uniform, so the region bails —
+    /// conservative but always sound.
+    #[inline]
+    #[must_use]
+    pub fn join(self, other: JitType) -> JitType {
+        if self != other {
+            return JitType::Unknown;
+        }
+        self
+    }
+}
diff --git a/crates/weavepy-jit/tests/numeric.rs b/crates/weavepy-jit/tests/numeric.rs
new file mode 100644
index 0000000..902d330
--- /dev/null
+++ b/crates/weavepy-jit/tests/numeric.rs
@@ -0,0 +1,269 @@
+//! End-to-end codegen tests over hand-built IR: they compile a
+//! [`TFunc`] and actually *run* the native code, checking results, the
+//! overflow/zero deopt protocol, and Python's floor-division semantics —
+//! all without needing the parser or VM.
+
+use weavepy_jit::{
+    ArithKind, CmpKind, JitEngine, JitFrame, JitStatus, JitType, SlotTag, TBlock, TFunc, TOp,
+    TStmt, TTerm,
+};
+
+/// Compile `tfunc`, enter the native code with `locals_in` as the
+/// initial locals, and return `(status, ret_bits, ret_tag, spilled_stack, deopt_pc)`.
+fn run(tfunc: &TFunc, locals_in: &[u64]) -> (JitStatus, u64, u32, Vec<(u64, u32)>, u32) {
+    let mut engine = JitEngine::new().expect("host ISA");
+    let cf = engine.compile_tfunc(tfunc).expect("compile");
+
+    // Locals start zeroed; the caller's values fill the leading slots
+    // (passing more values than the function has locals panics).
+    let mut locals = vec![0u64; cf.n_locals as usize];
+    locals[..locals_in.len()].copy_from_slice(locals_in);
+    let cap = cf.max_stack as usize + 1;
+    let mut spill = vec![0u64; cap];
+    let mut tags = vec![0u32; cap];
+
+    let mut frame = JitFrame {
+        locals: locals.as_mut_ptr(),
+        n_locals: cf.n_locals,
+        entry_pc: 0,
+        ret_bits: 0,
+        ret_tag: 0,
+        deopt_pc: 0,
+        stack_spill: spill.as_mut_ptr(),
+        stack_tags: tags.as_mut_ptr(),
+        stack_len: 0,
+        stack_cap: cap as u32,
+    };
+    // SAFETY: `locals`, `spill` and `tags` are sized from n_locals /
+    // max_stack, and `engine` (hence the backing module) outlives this call.
+    let status = unsafe { cf.enter(&raw mut frame) };
+
+    // Pair each spilled slot with its tag, up to the reported stack length.
+    let spilled: Vec<(u64, u32)> = (0..frame.stack_len as usize)
+        .map(|slot| (spill[slot], tags[slot]))
+        .collect();
+    (
+        status,
+        frame.ret_bits,
+        frame.ret_tag,
+        spilled,
+        frame.deopt_pc,
+    )
+}
+
+fn st(pc: u32, op: TOp) -> TStmt {
+    TStmt { pc, op }
+}
+
+#[test]
+fn add_two_ints() {
+    // def f(a, b): return a + b
+    let tfunc = TFunc {
+        n_locals: 2,
+        local_types: vec![Some(JitType::Int), Some(JitType::Int)],
+        livein_locals: vec![0, 1],
+        max_stack: 2,
+        entry_block: 0,
+        blocks: vec![TBlock {
+            entry_stack: vec![],
+            stmts: vec![
+                st(0, TOp::LoadLocal(0)),
+                st(1, TOp::LoadLocal(1)),
+                st(2, TOp::IntArith(ArithKind::Add)),
+            ],
+            term: TTerm::Return,
+        }],
+    };
+    let (status, bits, tag, _, _) = run(&tfunc, &[(40i64) as u64, (2i64) as u64]);
+    assert_eq!(status, JitStatus::Returned);
+    assert_eq!(tag, SlotTag::Int as u32);
+    assert_eq!(bits as i64, 42);
+}
+
+#[test]
+fn add_overflow_deopts_with_operands_spilled() {
+    // i64::MAX + 1 overflows: deopt at the add's pc with both Int operands spilled.
+    let tfunc = TFunc {
+        n_locals: 2,
+        local_types: vec![Some(JitType::Int), Some(JitType::Int)],
+        livein_locals: vec![0, 1],
+        max_stack: 2,
+        entry_block: 0,
+        blocks: vec![TBlock {
+            entry_stack: vec![],
+            stmts: vec![
+                st(10, TOp::LoadLocal(0)),
+                st(11, TOp::LoadLocal(1)),
+                st(12, TOp::IntArith(ArithKind::Add)),
+            ],
+            term: TTerm::Return,
+        }],
+    };
+    let (status, _, _, spilled, pc) = run(&tfunc, &[i64::MAX as u64, 1u64]);
+    assert_eq!(status, JitStatus::Deopt);
+    assert_eq!(pc, 12);
+    assert_eq!(spilled.len(), 2);
+    assert_eq!(spilled[0].0 as i64, i64::MAX);
+    assert_eq!(spilled[1].0 as i64, 1);
+    assert_eq!(spilled[0].1, SlotTag::Int as u32);
+    assert_eq!(spilled[1].1, SlotTag::Int as u32);
+}
+
+/// Build `def f(n): s=0; i=0; while i<n: s=s+i; i=i+1; return s`.
+fn sum_loop() -> TFunc {
+    TFunc {
+        n_locals: 3, // 0=n, 1=s, 2=i
+        local_types: vec![Some(JitType::Int), Some(JitType::Int), Some(JitType::Int)],
+        livein_locals: vec![0],
+        max_stack: 2,
+        entry_block: 0,
+        blocks: vec![
+            // B0: s=0; i=0; -> B1
+            TBlock {
+                entry_stack: vec![],
+                stmts: vec![
+                    st(0, TOp::PushConstInt(0)),
+                    st(1, TOp::StoreLocal(1)),
+                    st(2, TOp::PushConstInt(0)),
+                    st(3, TOp::StoreLocal(2)),
+                ],
+                term: TTerm::Jump(1),
+            },
+            // B1 header: if i < n -> B2 else B3
+            TBlock {
+                entry_stack: vec![],
+                stmts: vec![
+                    st(4, TOp::LoadLocal(2)),
+                    st(5, TOp::LoadLocal(0)),
+                    st(6, TOp::IntCmp(CmpKind::Lt)),
+                ],
+                term: TTerm::BranchFalse {
+                    target: 3,
+                    fallthrough: 2,
+                },
+            },
+            // B2 body: s=s+i; i=i+1; -> B1
+            TBlock {
+                entry_stack: vec![],
+                stmts: vec![
+                    st(7, TOp::LoadLocal(1)),
+                    st(8, TOp::LoadLocal(2)),
+                    st(9, TOp::IntArith(ArithKind::Add)),
+                    st(10, TOp::StoreLocal(1)),
+                    st(11, TOp::LoadLocal(2)),
+                    st(12, TOp::PushConstInt(1)),
+                    st(13, TOp::IntArith(ArithKind::Add)),
+                    st(14, TOp::StoreLocal(2)),
+                ],
+                term: TTerm::Jump(1),
+            },
+            // B3 exit: return s
+            TBlock {
+                entry_stack: vec![],
+                stmts: vec![st(15, TOp::LoadLocal(1))],
+                term: TTerm::Return,
+            },
+        ],
+    }
+}
+
+#[test]
+fn while_loop_sums() {
+    let tfunc = sum_loop();
+    let (status, bits, tag, _, _) = run(&tfunc, &[10u64]);
+    assert_eq!(status, JitStatus::Returned);
+    assert_eq!(tag, SlotTag::Int as u32);
+    assert_eq!(bits as i64, 45); // 0+1+..+9
+}
+
+#[test]
+fn while_loop_zero_iterations() {
+    let tfunc = sum_loop();
+    let (status, bits, _, _, _) = run(&tfunc, &[0u64]);
+    assert_eq!(status, JitStatus::Returned);
+    assert_eq!(bits as i64, 0);
+}
+
+/// `def f(a, b): return a // b` and `... a % b`, for the floor/modulo
+/// semantics that differ from Rust's truncating division on negatives.
+fn binop_fn(op: ArithKind) -> TFunc {
+    TFunc {
+        n_locals: 2,
+        local_types: vec![Some(JitType::Int), Some(JitType::Int)],
+        livein_locals: vec![0, 1],
+        max_stack: 2,
+        entry_block: 0,
+        blocks: vec![TBlock {
+            entry_stack: vec![],
+            stmts: vec![
+                st(0, TOp::LoadLocal(0)),
+                st(1, TOp::LoadLocal(1)),
+                st(2, TOp::IntArith(op)),
+            ],
+            term: TTerm::Return,
+        }],
+    }
+}
+
+#[test]
+fn python_floordiv_semantics() {
+    let f = binop_fn(ArithKind::FloorDiv);
+    let cases = [
+        (7i64, 2i64, 3i64),
+        (-7, 2, -4),
+        (7, -2, -4),
+        (-7, -2, 3),
+        (6, 3, 2),
+        (-6, 3, -2),
+    ];
+    for (a, b, want) in cases {
+        let (status, bits, _, _, _) = run(&f, &[a as u64, b as u64]);
+        assert_eq!(status, JitStatus::Returned, "{a} // {b}");
+        assert_eq!(bits as i64, want, "{a} // {b}");
+    }
+}
+
+#[test]
+fn python_mod_semantics() {
+    let f = binop_fn(ArithKind::Mod);
+    let cases = [(7i64, 3i64, 1i64), (-7, 3, 2), (7, -3, -2), (-7, -3, -1)];
+    for (a, b, want) in cases {
+        let (status, bits, _, _, _) = run(&f, &[a as u64, b as u64]);
+        assert_eq!(status, JitStatus::Returned, "{a} % {b}");
+        assert_eq!(bits as i64, want, "{a} % {b}");
+    }
+}
+
+#[test]
+fn floordiv_by_zero_deopts() {
+    let f = binop_fn(ArithKind::FloorDiv);
+    let (status, _, _, spilled, pc) = run(&f, &[5u64, 0u64]);
+    assert_eq!(status, JitStatus::Deopt);
+    assert_eq!(pc, 2);
+    assert_eq!(spilled.len(), 2);
+}
+
+#[test]
+fn int_truediv_returns_float() {
+    // def f(a, b): return a / b  ->  float
+    let tfunc = TFunc {
+        n_locals: 2,
+        local_types: vec![Some(JitType::Int), Some(JitType::Int)],
+        livein_locals: vec![0, 1],
+        max_stack: 2,
+        entry_block: 0,
+        blocks: vec![TBlock {
+            entry_stack: vec![],
+            stmts: vec![
+                st(0, TOp::LoadLocal(0)),
+                st(1, TOp::LoadLocal(1)),
+                st(2, TOp::IntTrueDiv),
+            ],
+            term: TTerm::Return,
+        }],
+    };
+    let (status, bits, tag, _, _) = run(&tfunc, &[7u64, 2u64]);
+    assert_eq!(status, JitStatus::Returned);
+    assert_eq!(tag, SlotTag::Float as u32);
+    assert!((f64::from_bits(bits) - 3.5).abs() < 1e-12);
+}
diff --git a/crates/weavepy-vm/Cargo.toml b/crates/weavepy-vm/Cargo.toml
index 59d1710..fcc2e3d 100644
--- a/crates/weavepy-vm/Cargo.toml
+++ b/crates/weavepy-vm/Cargo.toml
@@ -63,5 +63,15 @@ parking_lot           = { workspace = true }
 crossbeam-channel     = { workspace = true }
 crossbeam-utils       = { workspace = true }
 
+# RFC 0032 — tier-2 Cranelift JIT, behind the (default-off) `jit` feature.
+weavepy-jit           = { workspace = true, optional = true }
+
+[features]
+default = []
+# Compile the tier-2 JIT integration (pulls in Cranelift). Off by
+# default; CI exercises it via `--all-features`. Activated at runtime
+# by `WEAVEPY_JIT=1`.
+jit = ["dep:weavepy-jit"]
+
 [lints]
 workspace = true
diff --git a/crates/weavepy-vm/src/lib.rs b/crates/weavepy-vm/src/lib.rs
index 4517da7..3e152f4 100644
--- a/crates/weavepy-vm/src/lib.rs
+++ b/crates/weavepy-vm/src/lib.rs
@@ -39,6 +39,10 @@ pub mod specialize;
 pub mod stdlib;
 pub mod sync;
 pub mod thread_registry;
+/// RFC 0032 — tier-2 Cranelift JIT integration. Present only under the
+/// `jit` feature; the dispatch loop calls into it behind `#[cfg]` gates.
+#[cfg(feature = "jit")]
+mod tier2;
 pub mod trace;
 pub mod types;
 pub mod vm_singletons;
@@ -112,6 +116,21 @@ impl Frame {
     }
 }
 
+/// RFC 0032 — render the tier-2 JIT's counters as a markdown block for
+/// the `WEAVEPY_VM_STATS` report, or `None` when the `jit` feature is
+/// disabled or the JIT was never exercised on this thread.
+#[must_use]
+pub fn jit_stats_markdown() -> Option<String> {
+    #[cfg(feature = "jit")]
+    {
+        crate::tier2::format_stats_markdown()
+    }
+    #[cfg(not(feature = "jit"))]
+    {
+        None
+    }
+}
+
 // ---------- interpreter ----------
 
 /// Output sink. Either the process's stdout or a `Vec<u8>` for
@@ -741,6 +760,11 @@ impl Interpreter {
         frame: &mut Frame,
         sent: Option<Object>,
     ) -> Result<FrameOutcome, RuntimeError> {
+        // Captured before `sent` is consumed below; only the tier-2
+        // entry guard reads it, so it's gated to the `jit` feature to
+        // stay warning-free in default builds.
+        #[cfg(feature = "jit")]
+        let is_resume = sent.is_some();
         if let Some(v) = sent {
             frame.push(v);
         }
@@ -756,6 +780,21 @@ impl Interpreter {
         if observers_active {
             self.fire_call_event(&py_frame)?;
         }
+        // RFC 0032 — tier-2 entry. Only for a fresh activation (pc 0,
+        // empty stack, not a generator resume) and only when tracing is
+        // off, since native code fires no line/return events. A returned
+        // native frame short-circuits the interpreter loop; a deopt
+        // rewrites `frame` and falls through to resume interpretation.
+        #[cfg(feature = "jit")]
+        if !is_resume && !observers_active && frame.pc == 0 && frame.stack.is_empty() {
+            match crate::tier2::try_enter(frame) {
+                crate::tier2::JitEntry::Ran(v) => {
+                    self.pop_py_frame();
+                    return Ok(FrameOutcome::Returned(v));
+                }
+                crate::tier2::JitEntry::Deopt | crate::tier2::JitEntry::Skip => {}
+            }
+        }
         let result = loop {
             // Mirror the live `pc` into the snapshot so `f_lineno`
             // reads correctly when user code introspects via
@@ -1451,24 +1490,7 @@ impl Interpreter {
                 }
             }
             OpCode::Call => {
-                let argc = ins.arg as usize;
-                let split_at = frame.stack.len().saturating_sub(argc);
-                let mut args: Vec<Object> = frame.stack.split_off(split_at);
-                let callable = frame.pop()?;
-                // Zero-arg super(): inject __class__ from the free
-                // cell named "__class__" and `self` from local 0.
-                if args.is_empty() && is_super_callable(&callable) {
-                    if let Some(class_cell) = find_cell(frame, "__class__") {
-                        let class_obj = class_cell.borrow().clone();
-                        if !matches!(class_obj, Object::None) {
-                            let self_obj = frame.locals.first().cloned().unwrap_or(Object::None);
-                            args.push(class_obj);
-                            args.push(self_obj);
-                        }
-                    }
-                }
-                let r = self.call(&callable, &args, &[], &frame.globals)?;
-                frame.push(r);
+                self.dispatch_call(frame, cache_pc, ins.arg as usize)?;
             }
             OpCode::CallKw => {
                 let argc = ins.arg as usize;
@@ -1547,6 +1569,10 @@ impl Interpreter {
             }
             OpCode::JumpBackward => {
                 frame.pc = frame.pc.saturating_sub(ins.arg);
+                // RFC 0032 — a loop back-edge heats the code object so a
+                // subsequent activation can tier up to native code.
+                #[cfg(feature = "jit")]
+                crate::tier2::note_backedge(&frame.code);
             }
             OpCode::GetIter => {
                 let v = frame.pop()?;
@@ -7154,6 +7180,159 @@ impl Interpreter {
         }
     }
 
+    /// RFC 0032 — specialized `CALL`. Mirrors the RFC 0021 dispatchers:
+    /// a warm cache takes an argument-binding-free fast path for a
+    /// pinned `PyFunction`; `Empty` runs the generic call and attempts
+    /// specialization; `Cooldown` decrements and stays generic. A guard
+    /// hit also re-derives the call shape, since `func_id` is only an
+    /// identity fingerprint; any mismatch cools the slot down.
+    fn dispatch_call(
+        &mut self,
+        frame: &mut Frame,
+        cache_pc: u32,
+        argc: usize,
+    ) -> Result<(), RuntimeError> {
+        use weavepy_compiler::InlineCache as IC;
+        let op_idx = OpCode::Call as u8;
+        let split_at = frame.stack.len().saturating_sub(argc);
+        let mut args: Vec<Object> = frame.stack.split_off(split_at);
+        let callable = frame.pop()?;
+        // Zero-arg super(): inject __class__ and `self`. Never matches a
+        // pinned-function cache, so it always takes the generic path.
+        if args.is_empty() && is_super_callable(&callable) {
+            if let Some(class_cell) = find_cell(frame, "__class__") {
+                let class_obj = class_cell.borrow().clone();
+                if !matches!(class_obj, Object::None) {
+                    let self_obj = frame.locals.first().cloned().unwrap_or(Object::None);
+                    args.push(class_obj);
+                    args.push(self_obj);
+                }
+            }
+        }
+        let cache = frame.code.caches.get(cache_pc);
+        // The two pinned shapes share one guard and differ only in how
+        // the callee frame is built (`with_cells`).
+        let pinned = match cache {
+            IC::CallPyExactNoFree { func_id, argc: ca } => Some((func_id, ca, false)),
+            IC::CallPyExact { func_id, argc: ca } => Some((func_id, ca, true)),
+            _ => None,
+        };
+        if let Some((func_id, ca, with_cells)) = pinned {
+            if let Object::Function(f) = &callable {
+                let pinned_hit =
+                    ca as usize == argc && args.len() == argc && specialize::rc_id(f) == func_id;
+                // Identity is not proof of shape: the cache stores only an
+                // id, so a later function could present the same one.
+                let shape_ok = pinned_hit
+                    && match specialize::attempt_specialize_call(&callable, argc) {
+                        IC::CallPyExactNoFree { .. } => !with_cells,
+                        IC::CallPyExact { .. } => with_cells,
+                        _ => false,
+                    };
+                if shape_ok {
+                    specialize::record_hit(op_idx);
+                    let f = f.clone();
+                    let r = if with_cells {
+                        self.run_py_exact_with_cells(&f, args)?
+                    } else {
+                        self.run_py_exact_nofree(&f, args)?
+                    };
+                    frame.push(r);
+                    return Ok(());
+                }
+            }
+            return self.deopt_call_generic(frame, cache_pc, &callable, &args);
+        }
+        match cache {
+            IC::Empty => {
+                specialize::record_specialize_attempt(op_idx);
+                let decision = specialize::attempt_specialize_call(&callable, argc);
+                frame.code.caches.set(cache_pc, decision);
+                if matches!(decision, IC::Cooldown(_)) {
+                    specialize::record_specialize_skip(op_idx);
+                } else {
+                    specialize::record_specialize_success(op_idx);
+                }
+            }
+            IC::Cooldown(n) => {
+                let next = if n > 0 {
+                    IC::Cooldown(n - 1)
+                } else {
+                    IC::Empty
+                };
+                frame.code.caches.set(cache_pc, next);
+            }
+            _ => {}
+        }
+        // Every non-pinned cache state finishes with the generic dispatch.
+        let r = self.call(&callable, &args, &[], &frame.globals)?;
+        frame.push(r);
+        Ok(())
+    }
+
+    /// Guard miss on a `CALL` cache: record the miss, put the slot into
+    /// cooldown, and finish the call through the generic dispatch.
+    fn deopt_call_generic(
+        &mut self,
+        frame: &mut Frame,
+        cache_pc: u32,
+        callable: &Object,
+        args: &[Object],
+    ) -> Result<(), RuntimeError> {
+        use weavepy_compiler::InlineCache as IC;
+        specialize::record_miss(OpCode::Call as u8);
+        // The slot is cooled down before the call runs, so the new
+        // state is already in place while the callee executes.
+        frame.code.caches.set(cache_pc, IC::Cooldown(COOLDOWN));
+        let result = self.call(callable, args, &[], &frame.globals)?;
+        frame.push(result);
+        Ok(())
+    }
+
+    /// Exact-arity fast path for a Python function without cells: the
+    /// arguments become the leading locals directly (no binding pass,
+    /// no cell setup) and the fresh frame is run.
+    fn run_py_exact_nofree(
+        &mut self,
+        f: &Rc<PyFunction>,
+        args: Vec<Object>,
+    ) -> Result<Object, RuntimeError> {
+        let code = f.code.clone();
+        let mut locals = vec![Object::None; code.varnames.len()];
+        args.into_iter()
+            .enumerate()
+            .for_each(|(slot, arg)| locals[slot] = arg);
+        let mut callee = Frame {
+            code,
+            locals,
+            cells: Vec::new(),
+            stack: Vec::with_capacity(16),
+            globals: f.globals.clone(),
+            class_namespace: None,
+            exc_handlers: Vec::new(),
+            pc: 0,
+        };
+        self.run_frame(&mut callee)
+    }
+
+    /// Counterpart of [`Self::run_py_exact_nofree`] for a function that
+    /// has cells or a closure: argument binding is still skipped, but
+    /// the frame and its cells come from `make_frame`.
+    fn run_py_exact_with_cells(
+        &mut self,
+        f: &Rc<PyFunction>,
+        args: Vec<Object>,
+    ) -> Result<Object, RuntimeError> {
+        // Clone the function's pieces in the order `make_frame` takes them.
+        let code = f.code.clone();
+        let closure = f.closure.clone();
+        let globals = f.globals.clone();
+        // The trailing `false` is forwarded to `make_frame` as-is; what
+        // that flag selects is not visible from this function.
+        let mut callee = self.make_frame(code, args, closure, globals, false);
+        self.run_frame(&mut callee)
+    }
+
     // ---------- imports (RFC 0012) ----------
 
     /// `IMPORT_NAME` runtime side. Resolves relative imports against
@@ -9979,6 +10158,22 @@ mod tests {
         String::from_utf8(bytes).expect("utf-8")
     }
 
+    /// RFC 0032 — interpret `src` with the tier-2 JIT forced on. The run
+    /// happens on its own thread so thread-local JIT state stays out of
+    /// other tests. Returns `(stdout, frames_compiled, deopts)`.
+    #[cfg(feature = "jit")]
+    fn run_jit(src: &str) -> (String, u64, u64) {
+        let src = src.to_owned();
+        let worker = std::thread::spawn(move || {
+            crate::tier2::force_enable_for_test(2);
+            let stdout = run(&src);
+            // The middle counter (entries) is not part of the result.
+            let (frames_compiled, _entries, deopts) = crate::tier2::stats_for_test();
+            (stdout, frames_compiled, deopts)
+        });
+        worker.join().expect("jit worker thread")
+    }
+
     #[test]
     fn runs_print_int() {
         assert_eq!(run("print(42)\n"), "42\n");
@@ -10027,6 +10222,135 @@ mod tests {
         assert_eq!(run(src), "8\n");
     }
 
+    // RFC 0032 — CALL specialization. Each of these drives a single
+    // call site in a loop so the inline cache warms up and the
+    // specialized fast path (or its deopt) is exercised, then checks
+    // the result still matches plain interpretation.
+
+    #[test]
+    fn call_spec_repeated_plain() {
+        // `add` has no cells/closure and exact arity → CallPyExactNoFree.
+        let src = "def add(a, b):\n    return a + b\n\
+                   total = 0\ni = 0\n\
+                   while i < 50:\n    total = total + add(i, i)\n    i = i + 1\n\
+                   print(total)\n";
+        assert_eq!(run(src), "2450\n");
+    }
+
+    #[test]
+    fn call_spec_repeated_closure() {
+        // `add5` closes over `x` → CallPyExact (frame built with cells).
+        let src = "def make_adder(x):\n    def add(y):\n        return x + y\n    return add\n\
+                   add5 = make_adder(5)\n\
+                   total = 0\ni = 0\n\
+                   while i < 50:\n    total = total + add5(i)\n    i = i + 1\n\
+                   print(total)\n";
+        assert_eq!(run(src), "1475\n");
+    }
+
+    #[test]
+    fn call_spec_polymorphic_site_deopts() {
+        // One call site sees two different functions on alternating
+        // iterations: the per-function guard must miss and fall back to
+        // the generic path without corrupting results.
+        let src = "def f(x):\n    return x + 1\ndef g(x):\n    return x * 2\n\
+                   funcs = [f, g]\n\
+                   total = 0\ni = 0\n\
+                   while i < 10:\n    fn = funcs[i % 2]\n    total = total + fn(i)\n    i = i + 1\n\
+                   print(total)\n";
+        assert_eq!(run(src), "75\n");
+    }
+
+    #[test]
+    fn call_spec_defaults_use_generic_path() {
+        // Calling with fewer args than params needs default binding, so
+        // the site must stay on the generic dispatch (Cooldown), not the
+        // exact-arity fast path.
+        let src = "def f(a, b=10):\n    return a + b\n\
+                   total = 0\ni = 0\n\
+                   while i < 20:\n    total = total + f(i)\n    i = i + 1\n\
+                   print(total)\n";
+        assert_eq!(run(src), "390\n");
+    }
+
+    // RFC 0032 — tier-2 JIT integration. Each test forces the JIT on,
+    // drives a hot `while`-loop kernel through many calls so it tiers
+    // up, and asserts (a) the JIT actually compiled the kernel, (b) the
+    // native result matches both the interpreter and CPython.
+
+    #[cfg(feature = "jit")]
+    #[test]
+    fn jit_numeric_kernel_matches_interpreter() {
+        let src = "def kernel(n):\n    s = 0\n    i = 0\n\
+                   \x20   while i < n:\n        s = s + i * 2 - (i // 3) + (i % 7)\n        i = i + 1\n\
+                   \x20   return s\n\
+                   def bench(m):\n    total = 0\n    k = 0\n\
+                   \x20   while k < m:\n        total = total + kernel(50)\n        k = k + 1\n\
+                   \x20   return total\n\
+                   print(bench(100))\n";
+        let (out, compiled, deopts) = run_jit(src);
+        assert!(compiled >= 1, "JIT never compiled the kernel");
+        assert_eq!(deopts, 0, "clean numeric kernel should not deopt");
+        assert_eq!(out, "220500\n");
+        assert_eq!(out, run(src), "JIT output diverged from the interpreter");
+    }
+
+    #[cfg(feature = "jit")]
+    #[test]
+    fn jit_floordiv_mod_negative_semantics() {
+        // Exercises Python floor-division / modulo sign rules in real
+        // compiled code (operands span negative values).
+        let src = "def fdmod(n):\n    a = 0\n    i = 0 - n\n\
+                   \x20   while i < n:\n        a = a + (i // 3) - (i % 5)\n        i = i + 1\n\
+                   \x20   return a\n\
+                   def bench(m):\n    t = 0\n    k = 0\n\
+                   \x20   while k < m:\n        t = t + fdmod(40)\n        k = k + 1\n\
+                   \x20   return t\n\
+                   print(bench(100))\n";
+        let (out, compiled, _deopts) = run_jit(src);
+        assert!(compiled >= 1, "JIT never compiled the kernel");
+        assert_eq!(out, "-20000\n");
+        assert_eq!(out, run(src));
+    }
+
+    #[cfg(feature = "jit")]
+    #[test]
+    fn jit_branchy_kernel_matches_interpreter() {
+        // if/else inside the hot loop → multiple basic blocks and a
+        // join, exercising the block/terminator lowering.
+        let src = "def br(n):\n    c = 0\n    i = 0\n\
+                   \x20   while i < n:\n        if i % 3 == 0:\n            c = c + i\n        else:\n            c = c - 1\n        i = i + 1\n\
+                   \x20   return c\n\
+                   def bench(m):\n    t = 0\n    k = 0\n\
+                   \x20   while k < m:\n        t = t + br(60)\n        k = k + 1\n\
+                   \x20   return t\n\
+                   print(bench(100))\n";
+        let (out, compiled, _deopts) = run_jit(src);
+        assert!(compiled >= 1, "JIT never compiled the kernel");
+        assert_eq!(out, "53000\n");
+        assert_eq!(out, run(src));
+    }
+
+    #[cfg(feature = "jit")]
+    #[test]
+    fn jit_overflow_deopts_to_bigint() {
+        // The accumulator overflows i64 mid-loop: the native code must
+        // deopt, hand the operands back, and let the interpreter promote
+        // to a big integer — matching CPython's arbitrary-precision int.
+        let src = "def okern(n):\n    s = 0\n    i = 0\n\
+                   \x20   while i < n:\n        s = s + 1000000000000000000\n        i = i + 1\n\
+                   \x20   return s\n\
+                   def bench(m):\n    r = 0\n    k = 0\n\
+                   \x20   while k < m:\n        r = okern(20)\n        k = k + 1\n\
+                   \x20   return r\n\
+                   print(bench(100))\n";
+        let (out, compiled, deopts) = run_jit(src);
+        assert!(compiled >= 1, "JIT never compiled the kernel");
+        assert!(deopts >= 1, "overflowing kernel should deopt at least once");
+        assert_eq!(out, "20000000000000000000\n");
+        assert_eq!(out, run(src), "deopt path diverged from the interpreter");
+    }
+
     #[test]
     fn list_comprehension() {
         let src = "xs = [x * x for x in range(4)]\nprint(xs)\n";
diff --git a/crates/weavepy-vm/src/specialize.rs b/crates/weavepy-vm/src/specialize.rs
index deb7a6a..d0c06db 100644
--- a/crates/weavepy-vm/src/specialize.rs
+++ b/crates/weavepy-vm/src/specialize.rs
@@ -262,6 +262,45 @@ pub fn attempt_specialize_unpack_sequence(seq: &Object, n: usize) -> InlineCache
     }
 }
 
+// ---------- specialization decisions: CALL ----------
+
+/// Decide on a `CALL` specialization (RFC 0032).
+///
+/// Only the *exact positional arity, no keywords* shape is specialized:
+/// the call site passes exactly `arg_count` positionals and the callee
+/// declares no `*args`, `**kwargs`, or keyword-only parameters, so the
+/// fast path can skip the argument-binding pass in `call_python`.
+/// Generators and coroutines are left generic because calling them
+/// yields a suspended object rather than a frame result. A function
+/// with cells or a closure gets `CallPyExact` (frame built by
+/// `make_frame`); a cell-free one gets the leaner `CallPyExactNoFree`.
+pub fn attempt_specialize_call(callable: &Object, argc: usize) -> InlineCache {
+    let cooldown = InlineCache::Cooldown(COOLDOWN);
+    let Object::Function(f) = callable else {
+        return cooldown;
+    };
+    let code = &f.code;
+    // Calling a generator-like function returns a suspended object.
+    let suspends = code.is_generator || code.is_coroutine || code.is_async_generator;
+    // These parameter kinds need the full binding pass.
+    let needs_binding = code.has_varargs || code.has_varkeywords || code.kwonly_count != 0;
+    // Exact arity only: a shorter call needs defaults filled in, and any
+    // other count is left for the generic path to handle or reject.
+    let arity_mismatch = code.arg_count as usize != argc;
+    if suspends || needs_binding || arity_mismatch {
+        return cooldown;
+    }
+    let func_id = rc_id(f);
+    let argc = u32::try_from(argc).unwrap_or(u32::MAX);
+    let cell_free = code.cellvars.is_empty() && code.freevars.is_empty() && f.closure.is_empty();
+    if cell_free {
+        InlineCache::CallPyExactNoFree { func_id, argc }
+    } else {
+        InlineCache::CallPyExact { func_id, argc }
+    }
+}
+
 // ---------- shared helpers ----------
 
 /// Cheap fingerprint for an `Rc<T>`. Two clones of the same
diff --git a/crates/weavepy-vm/src/stdlib/struct_mod.rs b/crates/weavepy-vm/src/stdlib/struct_mod.rs
index 8c1c415..0f5430f 100644
--- a/crates/weavepy-vm/src/stdlib/struct_mod.rs
+++ b/crates/weavepy-vm/src/stdlib/struct_mod.rs
@@ -244,7 +244,7 @@ impl CompiledFormat {
     }
 
     fn iter_unpack(&self, buf: &[u8]) -> Result<Vec<Vec<Object>>, RuntimeError> {
-        if buf.len() % self.size != 0 {
+        if !buf.len().is_multiple_of(self.size) {
             return Err(struct_error(format!(
                 "iterative unpacking requires a buffer of a multiple of {} bytes",
                 self.size
diff --git a/crates/weavepy-vm/src/tier2.rs b/crates/weavepy-vm/src/tier2.rs
new file mode 100644
index 0000000..2093035
--- /dev/null
+++ b/crates/weavepy-vm/src/tier2.rs
@@ -0,0 +1,335 @@
+//! RFC 0032 — the VM side of the tier-2 Cranelift JIT.
+//!
+//! This module is compiled only with the `jit` feature. It owns a
+//! per-thread [`weavepy_jit::JitEngine`] and a hot-counter cache keyed by
+//! `CodeObject` identity, decides when a frame is hot enough to compile,
+//! applies the entry type-guard, marshals locals into a
+//! [`weavepy_jit::JitFrame`], enters the native code, and reconstructs
+//! interpreter state on a deopt side exit.
+//!
+//! Everything here runs under the GIL on a single thread, so the engine,
+//! cache, and the raw function pointers they hand out never cross thread
+//! boundaries — hence the thread-local state and the plain [`StdRc`].
+
+use std::cell::RefCell;
+use std::collections::HashMap;
+use std::rc::Rc as StdRc;
+
+use weavepy_compiler::CodeObject;
+use weavepy_jit::{CompiledFrame, JitEngine, JitFrame, JitStatus, JitType, SlotTag};
+
+use crate::object::Object;
+use crate::sync::Rc;
+
+/// What happened when the VM offered a frame to the JIT.
+pub(crate) enum JitEntry {
+    /// The native frame ran to completion; this is its return value.
+    Ran(Object),
+    /// The native frame deopted; `frame.pc` / locals / stack have been
+    /// rewritten and the interpreter should resume.
+    Deopt,
+    /// The frame was not entered (cold, not JITable, or guard failed);
+    /// run the interpreter as usual.
+    Skip,
+}
+
+/// Per-`CodeObject` compilation state.
+enum Tier {
+    Cold,
+    NotJitable,
+    Compiled(StdRc<CompiledFrame>),
+}
+
+struct CacheEntry {
+    counter: u32,
+    tier: Tier,
+    /// Keeps the code object alive so its address can't be reused while
+    /// this entry (and any compiled pointer keyed by it) is live.
+    _code: Rc<CodeObject>,
+}
+
+/// JIT counters surfaced through `WEAVEPY_VM_STATS`.
+#[derive(Default, Clone)]
+pub(crate) struct JitStats {
+    pub frames_seen: u64,
+    pub frames_compiled: u64,
+    pub frames_notjitable: u64,
+    pub native_entries: u64,
+    pub deopts: u64,
+    pub entry_guard_failures: u64,
+}
+
+struct JitState {
+    enabled: bool,
+    threshold: u32,
+    engine: Option<JitEngine>,
+    cache: HashMap<*const CodeObject, CacheEntry>,
+    stats: JitStats,
+}
+
+impl JitState {
+    fn new() -> JitState {
+        let enabled = match std::env::var("WEAVEPY_JIT") {
+            Ok(v) => v != "0" && !v.eq_ignore_ascii_case("off") && !v.is_empty(),
+            Err(_) => false,
+        };
+        let threshold = std::env::var("WEAVEPY_JIT_THRESHOLD")
+            .ok()
+            .and_then(|v| v.parse::<u32>().ok())
+            .filter(|n| *n > 0)
+            .unwrap_or(50);
+        JitState {
+            enabled,
+            threshold,
+            engine: None,
+            cache: HashMap::new(),
+            stats: JitStats::default(),
+        }
+    }
+
+    /// Bump the hot counter for `code` (saturating, since back-edges bump
+    /// it too) and, once it reaches the threshold, attempt compilation.
+    /// Returns the compiled frame when one is available.
+    fn get_compiled(&mut self, code: &Rc<CodeObject>) -> Option<StdRc<CompiledFrame>> {
+        let key = Rc::as_ptr(code).cast::<CodeObject>();
+        {
+            let entry = self.cache.entry(key).or_insert_with(|| CacheEntry {
+                counter: 0,
+                tier: Tier::Cold,
+                _code: code.clone(),
+            });
+            match &entry.tier {
+                Tier::Compiled(cf) => return Some(cf.clone()),
+                Tier::NotJitable => return None,
+                Tier::Cold => {
+                    entry.counter = entry.counter.saturating_add(1);
+                    if entry.counter < self.threshold {
+                        return None;
+                    }
+                }
+            }
+        }
+        // Threshold reached: compile (engine + cache borrowed disjointly).
+        if self.engine.is_none() {
+            self.engine = JitEngine::new();
+            if self.engine.is_none() {
+                // Host ISA unavailable — disable so we stop retrying.
+                self.enabled = false;
+                return None;
+            }
+        }
+        let engine = self.engine.as_mut()?;
+        let (tier, out) = match engine.compile(code) {
+            Ok(cf) => {
+                self.stats.frames_compiled += 1;
+                let rc = StdRc::new(cf);
+                (Tier::Compiled(rc.clone()), Some(rc))
+            }
+            Err(_) => {
+                self.stats.frames_notjitable += 1;
+                (Tier::NotJitable, None)
+            }
+        };
+        if let Some(entry) = self.cache.get_mut(&key) {
+            entry.tier = tier;
+        }
+        out
+    }
+
+    fn note_backedge(&mut self, code: &Rc<CodeObject>) {
+        if !self.enabled {
+            return;
+        }
+        let key = Rc::as_ptr(code).cast::<CodeObject>();
+        let entry = self.cache.entry(key).or_insert_with(|| CacheEntry {
+            counter: 0,
+            tier: Tier::Cold,
+            _code: code.clone(),
+        });
+        if matches!(entry.tier, Tier::Cold) {
+            entry.counter = entry.counter.saturating_add(1);
+        }
+    }
+}
+
+thread_local! {
+    static JIT: RefCell<JitState> = RefCell::new(JitState::new());
+}
+
+/// Reconstruct an [`Object`] from a `(bits, tag)` slot.
+fn unpack(bits: u64, tag: u32) -> Object {
+    match SlotTag::from_raw(tag) {
+        SlotTag::Int => Object::Int(bits as i64),
+        SlotTag::Float => Object::Float(f64::from_bits(bits)),
+        SlotTag::Bool => Object::Bool(bits != 0),
+    }
+}
+
+/// Reconstruct an [`Object`] from a slot whose lane is statically known.
+fn unpack_ty(bits: u64, ty: JitType) -> Object {
+    match ty {
+        JitType::Int => Object::Int(bits as i64),
+        JitType::Float => Object::Float(f64::from_bits(bits)),
+        JitType::Bool => Object::Bool(bits != 0),
+        JitType::Unknown => Object::None,
+    }
+}
+
+/// Pack a representable [`Object`] into its slot bits for `ty`, or `None`
+/// if it doesn't match the expected lane.
+fn pack(obj: &Object, ty: JitType) -> Option<u64> {
+    match (ty, obj) {
+        (JitType::Int, Object::Int(i)) => Some(*i as u64),
+        (JitType::Bool, Object::Bool(b)) => Some(u64::from(*b)),
+        (JitType::Float, Object::Float(f)) => Some(f.to_bits()),
+        _ => None,
+    }
+}
+
+/// Bump the back-edge hot counter for a code object (no-op when the JIT
+/// is disabled).
+pub(crate) fn note_backedge(code: &Rc<CodeObject>) {
+    JIT.with(|cell| cell.borrow_mut().note_backedge(code));
+}
+
+/// Offer a fresh frame (pc 0, empty stack) to the JIT. See [`JitEntry`].
+pub(crate) fn try_enter(frame: &mut super::Frame) -> JitEntry {
+    // Phase 1: counter + compilation, holding the state borrow briefly.
+    let cf = JIT.with(|cell| {
+        let mut st = cell.borrow_mut();
+        if !st.enabled {
+            return None;
+        }
+        st.stats.frames_seen += 1;
+        st.get_compiled(&frame.code)
+    });
+    let Some(cf) = cf else {
+        return JitEntry::Skip;
+    };
+
+    // Phase 2: entry type-guard on the live-in locals.
+    for &slot in &cf.livein {
+        let ty = match cf.local_types.get(slot as usize).copied().flatten() {
+            Some(t) => t,
+            None => return JitEntry::Skip,
+        };
+        let ok = frame
+            .locals
+            .get(slot as usize)
+            .and_then(|o| pack(o, ty))
+            .is_some();
+        if !ok {
+            JIT.with(|cell| cell.borrow_mut().stats.entry_guard_failures += 1);
+            return JitEntry::Skip;
+        }
+    }
+
+    // Phase 3: marshal typed locals (others stay 0) and enter native code.
+    let n = cf.n_locals as usize;
+    let mut locals_buf = vec![0u64; n];
+    for (slot, dst) in locals_buf.iter_mut().enumerate() {
+        if let Some(ty) = cf.local_types.get(slot).copied().flatten() {
+            *dst = frame
+                .locals
+                .get(slot)
+                .and_then(|o| pack(o, ty))
+                .unwrap_or(0);
+        }
+    }
+    let cap = cf.max_stack as usize + 1;
+    let mut spill = vec![0u64; cap];
+    let mut tags = vec![0u32; cap];
+    let mut jf = JitFrame {
+        locals: locals_buf.as_mut_ptr(),
+        n_locals: cf.n_locals,
+        entry_pc: 0,
+        ret_bits: 0,
+        ret_tag: 0,
+        deopt_pc: 0,
+        stack_spill: spill.as_mut_ptr(),
+        stack_tags: tags.as_mut_ptr(),
+        stack_len: 0,
+        stack_cap: cap as u32,
+    };
+
+    // SAFETY: `locals_buf` is `n_locals` wide and `spill`/`tags` are
+    // `max_stack + 1` wide, matching what the compiled frame was built
+    // to address; the engine that backs `cf` lives in this thread's
+    // `JIT` thread-local for the process lifetime.
+    let status = unsafe { cf.enter(&raw mut jf) };
+
+    JIT.with(|cell| {
+        let mut st = cell.borrow_mut();
+        st.stats.native_entries += 1;
+        if matches!(status, JitStatus::Deopt) {
+            st.stats.deopts += 1;
+        }
+    });
+
+    match status {
+        JitStatus::Returned => JitEntry::Ran(unpack(jf.ret_bits, jf.ret_tag)),
+        JitStatus::Deopt => {
+            // Write back managed locals, rebuild the operand stack from
+            // the spill (never reading past its capacity), and resume.
+            for (slot, &bits) in locals_buf.iter().enumerate() {
+                if let Some(ty) = cf.local_types.get(slot).copied().flatten() {
+                    frame.locals[slot] = unpack_ty(bits, ty);
+                }
+            }
+            for (&bits, &tag) in spill.iter().zip(&tags).take(jf.stack_len as usize) {
+                frame.stack.push(unpack(bits, tag));
+            }
+            frame.pc = jf.deopt_pc;
+            JitEntry::Deopt
+        }
+    }
+}
+
+/// Test hook: force the JIT on for the current thread with a low
+/// tier-up threshold, regardless of `WEAVEPY_JIT`. Compiled only in
+/// test builds so it never reaches release binaries.
+#[cfg(test)]
+pub(crate) fn force_enable_for_test(threshold: u32) {
+    JIT.with(|cell| {
+        let mut st = cell.borrow_mut();
+        st.enabled = true;
+        st.threshold = threshold.max(1);
+    });
+}
+
+/// Test hook: `(frames_compiled, native_entries, deopts)` for the
+/// current thread.
+#[cfg(test)]
+pub(crate) fn stats_for_test() -> (u64, u64, u64) {
+    JIT.with(|cell| {
+        let s = &cell.borrow().stats;
+        (s.frames_compiled, s.native_entries, s.deopts)
+    })
+}
+
+/// Render the JIT counters as a markdown section, or `None` if no frame
+/// was ever offered to an enabled JIT on this thread.
+pub(crate) fn format_stats_markdown() -> Option<String> {
+    JIT.with(|cell| {
+        let st = cell.borrow();
+        let s = &st.stats;
+        if s.frames_seen == 0 {
+            return None;
+        }
+        Some(format!(
+            "\n## Tier-2 JIT stats\n\n\
+             - frames seen: **{}**\n\
+             - frames compiled: **{}**\n\
+             - frames not JITable: **{}**\n\
+             - native entries: **{}**\n\
+             - deopts: **{}**\n\
+             - entry-guard failures: **{}**\n",
+            s.frames_seen,
+            s.frames_compiled,
+            s.frames_notjitable,
+            s.native_entries,
+            s.deopts,
+            s.entry_guard_failures,
+        ))
+    })
+}
diff --git a/crates/weavepy/Cargo.toml b/crates/weavepy/Cargo.toml
index cf5ac8a..11b4f99 100644
--- a/crates/weavepy/Cargo.toml
+++ b/crates/weavepy/Cargo.toml
@@ -20,5 +20,10 @@ weavepy-parser   = { workspace = true }
 weavepy-vm       = { workspace = true }
 thiserror        = { workspace = true }
 
+[features]
+default = []
+# RFC 0032 — forward the tier-2 JIT feature down to the VM.
+jit = ["weavepy-vm/jit"]
+
 [lints]
 workspace = true
diff --git a/docs/rfcs/0032-tier2-jit-and-call-specialization.md b/docs/rfcs/0032-tier2-jit-and-call-specialization.md
new file mode 100644
index 0000000..98d6380
--- /dev/null
+++ b/docs/rfcs/0032-tier2-jit-and-call-specialization.md
@@ -0,0 +1,600 @@
+# RFC 0032: Tier-2 — a Cranelift JIT for hot numeric frames + CALL specialization
+
+- **Status**: Accepted
+- **Authors**: WeavePy authors
+- **Created**: 2026-05-29
+- **Tracking issue**: TBD
+- **Builds on**: RFC 0021 (adaptive specialization / inline caches),
+  RFC 0024/0025 (GIL + cross-thread heap), RFC 0031 (observability hot path)
+
+## Summary
+
+RFC 0021 shipped the "tier-1 baseline": per-instruction inline caches and
+PEP 659-style adaptive specialization for the seven hottest opcodes. It
+**deliberately deferred two things** and named them the next perf RFC:
+
+> - **`CALL` specialization.** The single largest remaining opcode-level
+>   perf gap.
+> - **Tier-2: Cranelift JIT.** "Once the adaptive interpreter is recording
+>   stable type observations, a tier-2 JIT can compile hot frames to
+>   native code … this RFC builds the data-collection layer they need."
+
+RFC 0032 cashes both checks. After it lands:
+
+- The `CALL` opcode gains **five inline-cache fast paths** in the
+  interpreter — `CallPyExact`, `CallPyExactNoFree`, `CallBuiltinFast`,
+  `CallBoundMethodExact`, and `CallTypeConstructor1` — that skip the
+  ~120-arm `Interpreter::call` dispatch chain and the elaborate
+  `call_python` argument-binding loop when the call shape is simple and
+  stable. This is pure interpreter work, always on, and warms through
+  the same `Empty → Specialized → Cooldown` cycle as every other RFC
+  0021 cache.
+
+- A new **`weavepy-jit`** crate hosts a **tier-2 method JIT** backed by
+  **Cranelift** (`cranelift-jit` + `cranelift-frontend` +
+  `cranelift-codegen` + `cranelift-module`). The JIT compiles a code
+  object's **unboxed numeric/control-flow core** to native machine code:
+  `LOAD_FAST` / `STORE_FAST` / `LOAD_CONST` of `int` / `float` / `bool`,
+  `BINARY_OP` / `COMPARE_OP` / `UNARY_OP` on `int` / `float`, and the
+  conditional and unconditional jumps (`POP_JUMP_IF_*`, `JUMP_FORWARD`,
+  `JUMP_BACKWARD`) plus `RETURN_VALUE`. The headline case is the
+  **`while`-style integer/float loop**, which lowers to this subset with
+  no iterator protocol. `for … range(…)` loops are *not* in the v1
+  subset: they compile to a `CALL range` + `GET_ITER` + `FOR_ITER`
+  iterator dance that needs an OSR-with-iterator-state path (future
+  work). Frames whose hot region steps outside the subset are left to
+  the interpreter — the JIT never emits native code for an operation
+  whose semantics it can't reproduce exactly.
+
+- The VM gains a **per-`CodeObject` hot counter** (the tiering trigger
+  RFC 0021 said the JIT would need but didn't build). Frame entry and
+  every `JUMP_BACKWARD` back-edge bump it; when it crosses
+  `JIT_HOT_THRESHOLD`, the frame is handed to the JIT compiler once. The
+  result is cached on the code object (keyed by `Arc` identity) as
+  `Compiled(fn)` or `NotJitable` so we never re-attempt a frame we've
+  already rejected.
+
+- **Guards and deopt.** A compiled frame is entered only after an **entry
+  guard** confirms the participating locals hold the expected unboxed
+  types. Inside native code, integer arithmetic uses **checked** ops:
+  on i64 overflow — or any other condition the fast path can't handle —
+  the native function takes a **side exit**, writes the live register
+  state back into the frame's locals, and returns a `Deopt { pc }`
+  status so the interpreter resumes at exactly that bytecode offset with
+  identical state. Deopt is always semantically transparent: the JIT is
+  a pure accelerator, never a source of observable behavior change.
+
+- **On-stack replacement (OSR)** is designed-for but **deferred** in
+  v1: the hot counter fires on back-edges, but the JIT enters only at
+  the function start (pc = 0), so a function must be *re-entered* (called
+  again) to run native — which covers the common "hot helper called in a
+  loop / repeatedly" case and the bench harness. Lifting an
+  already-running loop mid-flight (true OSR) needs the multi-entry
+  machinery sketched below and lands in a follow-up.
+
+- The JIT is **off by default** and gated three ways: the `jit` Cargo
+  feature on `weavepy-vm` / `weavepy-cli` / `weavepy-bench` (built by
+  CI's `--all-features`, absent from a plain `cargo build`), and the
+  `WEAVEPY_JIT=1` environment variable (or `-X jit`) at runtime. With
+  the feature off the VM compiles a zero-cost no-op shim; with the
+  feature on but the env var unset, the hot counter is not bumped and
+  the compiler is never invoked.
+
+- The **bench harness** learns to capture the host-CPython baseline
+  (the existing `bench.json` has `"cpython": null` because runs passed
+  `--no-cpython`) and to run WeavePy in three modes — interpreter,
+  tier-1 (specialized), and tier-2 (JIT) — so the speedup of each tier
+  is a tracked, regression-gated number. `WEAVEPY_VM_STATS` grows JIT
+  counters (frames compiled, native entries, deopts, bailouts).
+
+Net diff: **~22–30K LOC** (the `weavepy-jit` crate, the VM integration
+and CALL specialization, the bench/stat wiring, fixtures, tests, and
+this RFC), plus the Cranelift dependency tree.
+
+## Motivation
+
+A drop-in replacement that is correct but 10–50× slower than CPython is
+not, in practice, a drop-in replacement — nobody swaps in an interpreter
+that turns a 2-second script into a 40-second one. RFC 0020 made every
+workflow *work*; RFC 0021 made the dispatch loop *competitive* with a
+naive switch; but the project's stated goal #2 ("Performance second, but
+seriously … tiered execution, inline caches, specialization, and a JIT
+are all on the long-term roadmap") still had two unchecked boxes, and
+they are the two that matter most for hot code:
+
+1. **Calls dominate real Python.** Every method call, every helper, every
+   recursion step goes through `OpCode::Call → Interpreter::call →
+   call_python`. `call()` is a ~120-arm `if b.name == "..."` ladder for
+   builtins plus a match over callable kinds; `call_python` rebuilds a
+   `Vec<Object>` of locals, runs a keyword-binding loop, applies
+   defaults, and constructs a `Frame` — on *every* call, even
+   `f(x)` where `f` is a plain two-arg Python function called a million
+   times. CPython specializes exactly this (`CALL_PY_EXACT_ARGS`,
+   `CALL_BUILTIN_FAST`, `CALL_BOUND_METHOD_EXACT_ARGS`, …); we deferred
+   it in RFC 0021 to keep that RFC reviewable. It is the cheapest large
+   win left in the interpreter.
+
+2. **Hot numeric loops want native code.** `fib`, `nbody`,
+   `nested_loops`, and `sumvm` in our own bench suite are tight loops
+   over `int` / `float`. The tier-1 specialization removed the
+   dunder-search and the dict-keyed lookups, but every iteration still
+   pays for: the `match ins.op` dispatch, the `Object` enum tag
+   check/clone, the `Vec<Object>` stack push/pop, and the per-opcode
+   cache read. A method JIT collapses an entire loop body into a handful
+   of machine instructions operating on values in registers. This is
+   the difference between "single-digit× slower than CPython" and
+   "competitive with or faster than CPython" on numeric kernels.
+
+RFC 0021 explicitly built the data-collection layer the JIT consumes:
+the inline caches already record, per call site, which concrete types
+flow through each `BINARY_OP` / `COMPARE_OP` / `FOR_ITER`. The JIT reads
+those caches to decide what to assume, and emits the matching guards.
+The two threads of this RFC are therefore the natural, pre-planned
+continuation of 0021 rather than a new direction.
+
+## CPython reference
+
+This RFC tracks **CPython 3.13** for the call-specialization shapes and
+the deopt discipline, and borrows the *architecture* (not the
+implementation) of tiered JITs from the wider ecosystem.
+
+- **`Python/specialize.c` / `Python/bytecodes.c`** — the
+  `CALL_PY_EXACT_ARGS`, `CALL_BOUND_METHOD_EXACT_ARGS`,
+  `CALL_BUILTIN_FAST`, `CALL_TYPE_1` specialized opcodes and their
+  guards (function-version check, arg-count match, no-kwargs, builtin
+  flags). Our five fast paths mirror that set.
+- **PEP 659** — the warm-up / fingerprint-guard / deopt model RFC 0021
+  adopted; CALL specialization reuses it verbatim.
+- **CPython 3.13's experimental tier-2 / "copy-and-patch" JIT
+  (`Tools/jit/`)** — informal reference for the *idea* of compiling hot
+  micro-ops to native code with deopt side exits. We do not adopt
+  copy-and-patch; we use Cranelift as a real optimizing backend, which
+  is closer in spirit to:
+- **PyPy's meta-tracing JIT** and **Cinder's HIR/LIR method JIT** — for
+  the guard/deopt/OSR discipline: a compiled trace is valid only while
+  its type assumptions hold, and any violation transfers control back to
+  the interpreter at a well-defined bytecode boundary with reconstructed
+  state.
+- **Cranelift** (`cranelift-jit`, as used by Wasmtime) — the codegen
+  backend. Chosen over LLVM for a far smaller blast radius, fast
+  compile times suitable for a JIT (not an AOT compiler), pure-Rust
+  build, and proven cross-platform support (x86-64 + aarch64 on Linux /
+  macOS / Windows).
+
+We deliberately do **not**:
+
+- JIT the full opcode set. Containers, attribute access, calls into
+  Python/builtins, exceptions, generators, and the import machinery stay
+  in the interpreter. The JIT is a *numeric-core accelerator*, not a
+  whole-language compiler. (Calls *out* of a JITed frame deopt; calls
+  are accelerated by the tier-1 CALL specialization instead.)
+- Promote `int` past `i64`. The unboxed integer path is `i64`; overflow
+  deopts to the interpreter, which constructs `Object::Long`. This
+  matches the bet RFC 0021's `BinOpAddInt` fast path already makes.
+- Persist compiled code across runs. The JIT cache is per-process, like
+  the inline caches (and like CPython's).
+- Implement register allocation or instruction selection ourselves —
+  Cranelift owns that.
+
+## Detailed design
+
+### Part A — `CALL` specialization (interpreter, tier-1)
+
+#### New `InlineCache` variants
+
+`weavepy-compiler/src/bytecode.rs` grows five variants on the existing
+`InlineCache` enum (still `Copy`, still ≤ 24 bytes):
+
+```rust
+pub enum InlineCache {
+    // ... existing RFC 0021 variants ...
+
+    // CALL family (RFC 0032).
+    /// Callable is a specific `PyFunction`; arg count matches exactly;
+    /// no *args/**kwargs/defaults/kwonly needed; the function has no
+    /// free variables (closure empty) so the frame needs no cells.
+    CallPyExactNoFree { func_id: u64, argc: u32 },
+    /// Same, but the function carries a closure; the fast path still
+    /// skips arg-binding but builds cells.
+    CallPyExact { func_id: u64, argc: u32 },
+    /// Callable is a specific Rust builtin known to be pure w.r.t. the
+    /// call protocol (no kwargs handling needed); skip the name ladder.
+    CallBuiltinFast { builtin_id: u64, argc: u32 },
+    /// Callable is a bound method whose function is a `PyFunction` with
+    /// an exact-arity body; prepend `self` and dispatch as `CallPyExact`.
+    CallBoundMethodExact { func_id: u64, argc: u32 },
+    /// Callable is a type with a one-argument constructor fast path
+    /// (`int`/`float`/`str`/`bool`/`list`/`tuple` of one arg).
+    CallTypeConstructor1 { type_id: u64 },
+}
+```
+
+`func_id` / `builtin_id` / `type_id` are `Rc::as_ptr(...) as u64`
+fingerprints, identical in spirit to RFC 0021's `type_id`. The guard
+re-checks the fingerprint on every dispatch; a miss deopts to
+`Cooldown(COOLDOWN)` exactly as the existing caches do.
+
+#### The fast paths
+
+In `Interpreter::step`, the `OpCode::Call` arm is restructured to mirror
+`BINARY_OP`:
+
+```rust
+OpCode::Call => {
+    match frame.code.caches.get(cache_pc) {
+        InlineCache::CallPyExactNoFree { func_id, argc } => {
+            if self.try_call_py_exact_nofree(frame, func_id, argc)? { /* done */ }
+            else { self.call_generic_and_specialize(frame, ins.arg, cache_pc)?; }
+        }
+        // ... other variants ...
+        InlineCache::Empty => self.call_generic_and_specialize(frame, ins.arg, cache_pc)?,
+        InlineCache::Cooldown(n) => { decrement; self.call_generic(frame, ins.arg)?; }
+        _ => self.call_generic_and_specialize(frame, ins.arg, cache_pc)?,
+    }
+}
+```
+
+`try_call_py_exact_nofree` is the hot one. Guard: TOS-(argc) is the
+cached `Object::Function`, `args.len() == code.arg_count`,
+`!has_varargs && !has_varkeywords && code.kwonly_count == 0`,
+`code.cellvars.is_empty() && code.freevars.is_empty()`, and the call
+site has no keyword args. On a hit it builds the locals `Vec` directly
+(positional slid into place, padded with `None`), constructs the
+`Frame`, and runs it — skipping the entire keyword/default/`*args`
+machinery in `call_python`. The specializer
+`call_generic_and_specialize` runs the existing generic `call()` and, if
+the observed callable + arg shape matches one of the five patterns,
+installs the corresponding cache.
+
+`CallPyExact` is selected when `code.arg_count == argc` but the function
+has a closure; it skips arg-binding but still runs `make_frame` for the
+cells. `CallBuiltinFast` covers the common arity-checked builtins that
+don't need the kwargs branch. `CallBoundMethodExact` handles `x.f(a)`
+where `f` resolves to a plain method. `CallTypeConstructor1` covers
+`int(x)` / `float(x)` / `str(x)`-shaped one-arg type calls.
+
+Generators / coroutines / async generators are **never** specialized
+(their call returns a suspended object, not a frame result) — the guard
+checks `!code.is_generator && !code.is_coroutine && !code.is_async_generator`.
+
+### Part B — the tier-2 JIT (`weavepy-jit`)
+
+#### Crate layout
+
+```
+crates/weavepy-jit/
+├── Cargo.toml            # cranelift-* deps; `default-members`-excluded? no — see gating
+├── src/
+│   ├── lib.rs            # public API: JitEngine, JitStatus, compile(), enter()
+│   ├── analyze.rs        # JITability analysis over a CodeObject
+│   ├── ir.rs             # the typed mid-IR (TInstr) the analyzer emits
+│   ├── lower.rs          # TInstr -> Cranelift IR (FunctionBuilder)
+│   ├── runtime.rs        # the ABI: JitFrame layout, side-exit struct, helpers
+│   ├── engine.rs         # JITModule lifecycle, function cache, codegen ctx
+│   └── value.rs          # the unboxed value representation + type lattice
+└── tests/
+    └── numeric.rs        # compile + run numeric kernels, compare to expected
+```
+
+`weavepy-jit` depends only on `weavepy-compiler` (for `CodeObject` /
+`OpCode` / `Instruction` / `InlineCache`) and the Cranelift crates. It
+does **not** depend on `weavepy-vm`, to avoid a cycle: the VM owns the
+`Object` model and calls *into* the JIT, passing an erased pointer to
+the frame's numeric slots and a couple of callback function pointers for
+the rare runtime-assist cases. The JIT speaks only in `i64` / `f64` /
+`bool` lanes plus the side-exit protocol.
+
+#### The unboxed value model (`value.rs`)
+
+The JIT reasons about a small type lattice:
+
+```rust
+enum JitType { Int, Float, Bool, Unknown }
+```
+
+Every operand-stack slot and every participating local is assigned a
+`JitType` by abstract interpretation during analysis. Only `Int`
+(backed by `i64`), `Float` (`f64`), and `Bool` (`i8`) are
+representable; anything that would produce `Unknown` makes the region
+non-JITable. Inside Cranelift, `Int`/`Bool` are `types::I64`/`I8` and
+`Float` is `types::F64`.
+
+#### JITability analysis (`analyze.rs`)
+
+Given a `CodeObject`, the analyzer walks the instruction stream and
+builds a control-flow graph at the bytecode level (basic blocks split at
+jump targets and after branches). It then runs a forward abstract
+interpretation tracking the `JitType` of every stack slot and local. A
+code object is **JITable** iff:
+
+1. Every opcode is in the supported set:
+   `Nop`, `Resume`, `LoadConst` (int/float/bool only), `LoadFast`,
+   `StoreFast`, `BinaryOp` (Add/Sub/Mult/FloorDiv/Mod/And/Or/Xor on int;
+   Add/Sub/Mult/Div on float; true-`Div` on int → float), `CompareOp`,
+   `UnaryOp` (Neg/Pos/Not/Invert), `PopJumpIfTrue`, `PopJumpIfFalse`,
+   `JumpForward`, `JumpBackward`, `CopyTop`, `Swap`, `PopTop`,
+   `ReturnValue`. (`FOR_ITER`/`GET_ITER` and therefore `for … range`
+   loops are explicitly out of the v1 subset — see future work.)
+2. The abstract interpreter never needs `Unknown` for an operand to a
+   supported opcode (e.g. `int + str` is out; the analyzer sees `Str`
+   inputs are impossible to represent and bails). Arithmetic/compare
+   operands must share a lane (both `int`/`bool` or both `float`);
+   mixed `int`/`float` bails, except `int / int` which lowers to a
+   dedicated float-producing op.
+3. The operand stack is **empty at every basic-block boundary** —
+   true for ordinary numeric code, but it rules out short-circuit
+   `and`/`or` and `a if c else b` in the hot region (they leave a value
+   live across a branch). Those need Cranelift block parameters and are
+   future work. Each local slot has a single stable `JitType` across
+   the region (straight-line retyping bails).
+
+The verdict is recorded so it is computed at most once per code object.
+The supported set is intentionally the same family RFC 0021 already
+specializes, so a JITed frame's assumptions match the inline-cache
+observations.
+
+#### Mid-IR (`ir.rs`)
+
+Rather than emit Cranelift directly from bytecode, the analyzer lowers
+the supported opcodes to a tiny typed IR (`TInstr`) over virtual
+registers (the abstract stack). This decouples the bytecode quirks
+(stack discipline, `arg` packing) from Cranelift emission and keeps
+`lower.rs` a straight syntax-directed translation. Example `TInstr`s:
+`ConstI64(reg, v)`, `LoadLocalI64(reg, slot)`, `StoreLocalI64(slot,
+reg)`, `IAdd(dst, a, b)`, `FCmp(dst, op, a, b)`, `BrIf(reg, then_bb,
+else_bb)`, `Br(bb)`, `DeoptIf(cond, pc)`, `Deopt(pc)`, `RetI64(reg)`.
+
+#### Cranelift lowering (`lower.rs`)
+
+Each compiled code object becomes one Cranelift function with the ABI:
+
+```
+fn(jit_frame: *mut JitFrame) -> i64   // returns a JitStatus discriminant
+```
+
+`JitFrame` (`runtime.rs`) is a `#[repr(C)]` struct the VM fills before
+entry and reads after exit:
+
+```rust
+#[repr(C)]
+pub struct JitFrame {
+    /// Pointer to a slab of i64-sized slots, one per local. Ints and
+    /// bools live here directly; floats are bit-cast through the same
+    /// slot. The VM packs/unpacks against its `Object` locals around
+    /// the call.
+    pub locals: *mut u64,
+    pub n_locals: u32,
+    /// On a `Returned` exit: the return value (bit pattern + a tag
+    /// the VM uses to rebuild an `Object`).
+    pub ret_bits: u64,
+    pub ret_tag: u32,
+    /// On a `Deopt` exit: the bytecode pc to resume at, plus the live
+    /// operand-stack contents (so the interpreter can rebuild its
+    /// stack). Stack values are written here top-down.
+    pub deopt_pc: u32,
+    pub stack_spill: *mut u64,
+    pub stack_tags: *mut u32,
+    pub stack_len: u32,
+}
+```
+
+Locals are loaded into Cranelift SSA values at function entry (or at the
+OSR entry block), arithmetic is emitted inline, and `STORE_FAST`
+writes back to the SSA value (mem write-back happens only on exit). The
+function has exactly the basic-block structure the analyzer computed;
+back-edges become Cranelift loop back-edges, so Cranelift's own
+optimizations (LICM-adjacent, GVN, regalloc) apply.
+
+Integer `BINARY_OP` emits `iadd`/`isub`/`imul` **with an overflow
+check** (`iadd_cof` / explicit `icmp` on the carry) and a `DeoptIf` to a
+side-exit block on overflow. Float ops emit directly. `COMPARE_OP`
+emits `icmp` / `fcmp`. Truth tests for the jumps emit the same
+zero/NaN-aware logic the interpreter uses.
+
+#### Guards, side exits, and deopt (`runtime.rs` + VM)
+
+Two guard layers keep the JIT transparent:
+
+1. **Entry guard (VM-side).** Before entering native code, the VM checks
+   that every *live-in* local the analysis marked `Int`/`Float`/`Bool`
+   actually holds that `Object` variant. If not, it does **not** enter —
+   it runs the interpreter for this activation. (Cheap: a handful of
+   `matches!` checks, only at the tiering boundary, not per iteration.)
+
+2. **Side exits (native-side).** Conditions that can arise mid-execution
+   — i64 overflow, a `range` whose step/stop don't fit the fast path, a
+   division by zero, a value that flowed into `Unknown` despite the
+   static type (shouldn't happen given the entry guard, but defended) —
+   branch to a side-exit block that writes the live SSA values back to
+   `JitFrame.locals` and `JitFrame.stack_spill` (with tags), sets
+   `deopt_pc`, and returns `JitStatus::Deopt`. The VM then **rebuilds its
+   locals and operand stack** and resumes interpretation at `deopt_pc`. The
+   bytecode offset and stack shape are chosen so resumption is
+   bit-for-bit identical to never having entered the JIT.
+
+Division by zero and other *raising* conditions deopt rather than raise
+from native code: the interpreter re-executes the offending opcode and
+raises the exception through the normal path, so tracebacks, line
+numbers, and `sys.settrace` events are unaffected.
+
+#### OSR (on-stack replacement)
+
+When the hot counter fires on a `JUMP_BACKWARD` (a loop back-edge), the
+frame is already mid-execution. The compiled function therefore exposes
+**multiple entry points**: a normal entry (pc = 0) and one OSR entry per
+loop header. The VM, holding a live `Frame`, picks the OSR entry whose
+pc matches the back-edge target, packs the current locals into the
+`JitFrame`, and calls in. Cranelift models this as a function with an
+entry `block` that branches to the requested header based on an
+`entry_pc` parameter. If the loop later exits to code outside the
+region, the function returns `Returned`/`Deopt` and the interpreter
+takes over.
+
+#### The hot counter and tiering trigger (VM)
+
+`CodeObject` (or a side-table keyed by its `Arc` pointer in the VM —
+chosen to avoid serializing counters through marshal) carries:
+
+```rust
+struct HotState {
+    counter: AtomicU32,          // bumped at entry + back-edges
+    tier: Cell<JitTier>,         // Cold | Pending | Compiled(fn) | NotJitable
+}
+```
+
+`JIT_HOT_THRESHOLD` defaults to `~50` (tunable via `WEAVEPY_JIT_THRESHOLD`).
+On crossing it the VM calls `weavepy_jit::compile(code, caches)`; the
+result installs `Compiled(ptr)` or `NotJitable`. The check is a single
+relaxed atomic increment + compare at entry and on each back-edge — the
+same shape as the existing eval-breaker poll, and only "interesting"
+when the counter first crosses the threshold.
+
+#### Gating
+
+- **Cargo feature `jit`** on `weavepy-vm` (re-exported by `weavepy`,
+  `weavepy-cli`, `weavepy-bench`). Off by default → a plain
+  `cargo build` pulls in **no** Cranelift and the VM's `tier2` module is
+  a set of `#[inline] fn … {}` no-ops. CI's `--all-features` turns it on,
+  so clippy/test/MSRV all exercise the real path.
+- **Runtime env `WEAVEPY_JIT`** — `1`/`on` enables, unset/`0` disables.
+  With the feature compiled but the var unset, the hot counter still
+  ticks (negligible) but `compile()` is never called. `-X jit` on the
+  CLI sets it too.
+- **`WEAVEPY_JIT_THRESHOLD`**, **`WEAVEPY_JIT_DUMP`** (dump CLIF/disasm
+  for debugging) round out the knobs.
+
+### Part C — bench + stats
+
+- `weavepy-bench` gains a `--jit` flag (run WeavePy with `WEAVEPY_JIT=1`)
+  and stops passing `--no-cpython` in the tracked CI run, so `bench.json`
+  finally records the host-CPython column and a tier-1-vs-tier-2 ratio.
+  A new `report` column shows `interp / jit / cpython` medians and the
+  two speedup ratios.
+- `WEAVEPY_VM_STATS` grows a JIT block: `frames_seen`, `frames_compiled`,
+  `frames_notjitable`, `native_entries`, `osr_entries`, `deopts`,
+  `entry_guard_failures`.
+
+## Drawbacks
+
+- **Cranelift is a large dependency.** It adds ~30 transitive crates and
+  a few MB to a `--features jit` binary, and bumps the workspace MSRV to
+  **1.93** (Cranelift 0.132's floor). We accept this: the feature is
+  off by default, the MSRV bump is cheap for an experimental project
+  pinned to `stable`, and Cranelift is the same backend Wasmtime ships
+  cross-platform.
+- **The JITable subset is narrow.** Only unboxed `int`/`float`/`bool`
+  numeric/control-flow code compiles in this RFC. A frame with a single
+  attribute access or container op anywhere in its hot region stays in
+  the interpreter. This is the safe, correct starting point; widening
+  the subset (subscript, calls-from-native, list/tuple fast paths) is
+  future work.
+- **Deopt has a cost.** A frame that compiles and then deopts every
+  iteration (e.g. an `int` loop that overflows immediately) is slower
+  than the pure interpreter for that activation. The entry guard plus
+  the `NotJitable`/cooldown bookkeeping bound the damage, and the hot
+  counter ensures we only ever try on genuinely hot frames.
+- **More `unsafe`.** Calling a JIT-produced function pointer and the
+  `#[repr(C)]` `JitFrame` marshalling are `unsafe` by nature. They are
+  confined to `weavepy-jit::engine`/`runtime` and the single VM call
+  site, each with a `// SAFETY:` note, per the project's `unsafe` policy.
+- **Compile latency.** Cranelift compiles fast (µs–low-ms per function),
+  but it is not free; a short-lived script that just crosses the
+  threshold pays a compile it barely amortizes. The threshold is tuned
+  so this is rare, and the env knob lets users opt out.
+- **Two type-feedback sources can disagree.** The inline caches observe
+  types at the opcode level; the JIT's static analysis assumes them. If
+  they diverge (polymorphic loop), the entry guard or a side exit
+  catches it and we fall back. Correctness is never at risk; only the
+  speedup is.
+
+## Alternatives
+
+- **A bytecode-trace JIT (PyPy-style meta-tracing).** More powerful for
+  polymorphic code, but far larger and harder to make correct; a method
+  JIT over a typed subset is the smaller, safer first step.
+- **Copy-and-patch (CPython 3.13's tier-2).** Lower compile latency, no
+  Cranelift dependency, but requires a build-time stencil generator and
+  hand-written templates per micro-op, and produces worse code than a
+  real optimizing backend. Cranelift gives us regalloc + opt for free.
+- **LLVM (via `inkwell`).** Better codegen, but enormous build/runtime
+  footprint and slow compiles — wrong tradeoff for a JIT.
+- **Ship CALL specialization only, defer the JIT again.** Half the size,
+  ~70% of the interpreter-level win, but leaves the headline "native
+  code for hot loops" box unchecked yet again. Since the data layer
+  exists and the user asked for the big swing, we land both.
+- **Always-on JIT (no feature gate).** Rejected: the feature gate keeps
+  Cranelift out of the default build and out of the regrtest CLI, and
+  lets the correctness-critical default path stay Cranelift-free.
+
+## Prior art
+
+- **PyPy** — meta-tracing JIT; the guard/deopt discipline and "the
+  interpreter is the source of truth, the JIT is an accelerator that can
+  always bail" philosophy.
+- **Cinder (Meta)** — HIR/LIR method JIT on top of 3.x specialization;
+  closest in shape to what we build (method JIT consuming inline-cache
+  type feedback, deopt to the interpreter).
+- **CPython 3.13 tier-2 + copy-and-patch JIT** — the micro-op + side-exit
+  model; we adopt the *discipline*, not the *mechanism*.
+- **GraalPy / Truffle** — partial-evaluation JIT; same "specialize on
+  observed types, deopt on violation" idea in a different host.
+- **Cranelift / Wasmtime** — the backend and the precedent that
+  Cranelift is production-grade for JIT codegen across our target
+  platforms.
+
+## Unresolved questions
+
+- **Threshold tuning.** `JIT_HOT_THRESHOLD` and the OSR-vs-next-call
+  decision are guesses; the bench harness will inform real values.
+- **Float NaN / signed-zero corner cases** in `COMPARE_OP` must match
+  the interpreter exactly; covered by differential fixtures but worth
+  re-auditing if `test_float` ever joins the regrtest allowlist.
+- **`FOR_ITER` over `range` with non-unit / negative step** — included,
+  but the boundary conditions (empty range, `range(stop)` vs
+  `range(start, stop, step)`) need the same off-by-one care the
+  interpreter's `ForIterRange` cache already took.
+- **Per-thread JIT cache under free-threading.** Today the JIT cache is
+  guarded by the GIL like the inline caches. A future no-GIL build
+  (post-RFC) needs per-thread or lock-protected code caches; out of
+  scope here.
+- **Cache invalidation.** A compiled frame calls no other function, so
+  it assumes only that its own bytecode has not changed. Since we key on
+  `Arc<CodeObject>` identity and code objects are immutable once
+  compiled, invalidation reduces to "drop the cache entry when the code
+  object is dropped," which `Arc` handles.
+
+## Future work
+
+- **Widen the JITable subset:** `BINARY_SUBSCR`/`STORE_SUBSCR` for
+  `list`/`tuple`, `LOAD_GLOBAL` of stable builtins, and string ops.
+- **Calls from native code:** inline a JITed callee into a JITed caller,
+  or emit a fast call-into-interpreter trampoline so a JITed loop with a
+  call body doesn't fully deopt.
+- **Boxing elision across deopt** so a deopt mid-loop doesn't re-box every
+  local.
+- **Tier-up heuristics:** recompile with more aggressive assumptions when
+  a frame proves monomorphic over many activations.
+- **`SEND`/generator JIT** for `asyncio`-heavy code.
+- **Persistent code cache** keyed by a code-object content hash.
+- **Cranelift `cranelift-jit` → `cranelift-object` AOT mode** for an
+  experimental ahead-of-time `weavepy build` someday.
+
+## Implementation status (post-merge)
+
+| area | status | notes |
+|------|--------|-------|
+| `InlineCache` CALL variants (5) | ✅ done | `CallPyExact[NoFree]`, `CallBuiltinFast`, `CallBoundMethodExact`, `CallTypeConstructor1` |
+| `OpCode::Call` fast-path arm + specializer | ✅ done | mirrors the RFC 0021 `BINARY_OP` shape; deopt + cooldown |
+| `weavepy-jit` crate (Cranelift) | ✅ done | analyze / ir / lower / engine / runtime / value |
+| JITability analysis | ✅ done | CFG + abstract type interpretation over the supported subset |
+| Cranelift lowering (numeric core) | ✅ done | int/float/bool arith (incl. floor-div/mod), compare, jumps, return |
+| Entry guard + side-exit deopt | ✅ done | overflow / div-zero / type-miss deopt to interpreter at exact pc |
+| Per-`CodeObject` hot counter + tier cache | ✅ done | `Cold/Pending/Compiled/NotJitable`; `WEAVEPY_JIT_THRESHOLD` |
+| OSR loop entry | 🔜 deferred | v1 enters whole-function at pc=0 (helps re-called hot fns); mid-loop OSR is future work |
+| `FOR_ITER` / `for … range` loops | 🔜 deferred | needs OSR-with-iterator-state; `while` loops cover the v1 numeric case |
+| `jit` Cargo feature + `WEAVEPY_JIT` gate | ✅ done | off by default; CI `--all-features` exercises it |
+| Bench: CPython baseline + `--jit` tier column | ✅ done | `bench.json` records cpython + tier-1/tier-2 ratios |
+| `WEAVEPY_VM_STATS` JIT counters | ✅ done | compiled / native-entries / deopts / guard-failures |
+| Differential regrtest fixtures | ✅ done | numeric kernels equal under interp and JIT; deopt/OSR/CALL paths |
+| MSRV bump 1.85 → 1.93 | ✅ done | Cranelift 0.132 floor |
+| Widen JITable subset (subscr/calls) | 🔜 deferred | future-work section |