diff --git a/Cargo.lock b/Cargo.lock index 776da01..08baa86 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -40,6 +40,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + [[package]] name = "android_system_properties" version = "0.1.5" @@ -105,6 +111,12 @@ version = "1.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" +[[package]] +name = "arbitrary" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" + [[package]] name = "arrayvec" version = "0.7.6" @@ -138,6 +150,12 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "bitflags" version = "2.11.1" @@ -194,6 +212,9 @@ name = "bumpalo" version = "3.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" +dependencies = [ + "allocator-api2", +] [[package]] name = "bytecheck" @@ -369,6 +390,174 @@ dependencies = [ "libc", ] +[[package]] +name = "cranelift-assembler-x64" +version = "0.132.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c80cf55a351448317210f26c434be761bcb25e7b36116ec92f89540b73e2833" +dependencies = [ + "cranelift-assembler-x64-meta", +] + +[[package]] +name = "cranelift-assembler-x64-meta" +version = "0.132.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "07937ca8617b340162fe3a4716be885b5847e9b56d6c7a89abbe4d42340fdc91" +dependencies = [ + "cranelift-srcgen", +] + +[[package]] +name = "cranelift-bforest" +version = "0.132.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88217b08180882436d54c0133274885c590698ae854e352bede1cda041230800" +dependencies = [ + "cranelift-entity", + "wasmtime-internal-core", +] + +[[package]] +name = "cranelift-bitset" +version = "0.132.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5c3cf7ba29fa56e56040848e34835d4e45988b2760ef212413409af95ffd8c1" +dependencies = [ + "wasmtime-internal-core", +] + +[[package]] +name = "cranelift-codegen" +version = "0.132.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebe1aac2efd4cba2047845fce38a68519935a30e20c8a6294ba7e2f448fe722d" +dependencies = [ + "bumpalo", + "cranelift-assembler-x64", + "cranelift-bforest", + "cranelift-bitset", + "cranelift-codegen-meta", + "cranelift-codegen-shared", + "cranelift-control", + "cranelift-entity", + "cranelift-isle", + "gimli", + "hashbrown 0.17.1", + "libm", + "log", + "regalloc2", + "rustc-hash", + "serde", + "smallvec", + "target-lexicon", + "wasmtime-internal-core", +] + +[[package]] +name = "cranelift-codegen-meta" +version = "0.132.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0909eaf9d6f18f5bf802d50608cb4368ac340fbd03cc44f2888d1cfcc3faa64e" +dependencies = [ + "cranelift-assembler-x64-meta", + "cranelift-codegen-shared", + "cranelift-srcgen", + "heck", +] + +[[package]] +name = "cranelift-codegen-shared" +version = "0.132.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c95a8da8be283f49cda7d0ef228c94f10d791e517b27b0c7e282dadd2e79ce45" + +[[package]] +name = "cranelift-control" +version = "0.132.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f5b19c81145146da1f7afda2e7f52111842fe6793512e740ad5cf3f5639e6212" +dependencies = [ + "arbitrary", +] + +[[package]] +name = "cranelift-entity" +version = "0.132.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a55309b47e6633ab05821304206cb1e92952e845b1224985562bb7ac1e92323" +dependencies = [ + "cranelift-bitset", + "wasmtime-internal-core", +] + +[[package]] +name = "cranelift-frontend" +version = "0.132.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "064d2d3533d9608f1cf44c8899cf2f7f33feb70300b0fb83e687b0d9e7b91147" +dependencies = [ + "cranelift-codegen", + "log", + "smallvec", + "target-lexicon", +] + +[[package]] +name = "cranelift-isle" +version = "0.132.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ac4e0bc095b2dab2212d1e99d7a74b62afc1485db023f1c0cb34a68758f7bd1" + +[[package]] +name = "cranelift-jit" +version = "0.132.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b48c2a0720c7d62aadd508c662b9bf666b614a47a888589e553e0511620635e" +dependencies = [ + "anyhow", + "cranelift-codegen", + "cranelift-control", + "cranelift-entity", + "cranelift-module", + "cranelift-native", + "libc", + "log", + "memmap2 0.2.3", + "region", + "target-lexicon", + "wasmtime-internal-jit-icache-coherence", + "windows-sys 0.61.2", +] + +[[package]] +name = "cranelift-module" +version = "0.132.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28f05d9efce7a4e8c2ceec49c76d26e53f1ee8cb13de822b6ca5118d48f50976" +dependencies = [ + "anyhow", + "cranelift-codegen", + "cranelift-control", +] + +[[package]] +name = "cranelift-native" +version = "0.132.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09a40053f5cb925451dd1d57393d14ad3145c8e0786701c27b5415ebb9a3ba4f" +dependencies = [ + "cranelift-codegen", + "libc", + "target-lexicon", +] + +[[package]] +name = "cranelift-srcgen" +version = 
"0.132.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3ceab9a53f7d362c89841fbaa8e63e44d47c40e91dc96ee6f777fca5d6b323b" + [[package]] name = "crc32fast" version = "1.5.0" @@ -522,12 +711,24 @@ dependencies = [ "miniz_oxide", ] +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + [[package]] name = "foldhash" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "funty" version = "2.0.0" @@ -592,6 +793,18 @@ dependencies = [ "wasip3", ] +[[package]] +name = "gimli" +version = "0.33.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf7f043f89559805f8c7cacc432749b2fa0d0a0a9ee46ce47164ed5ba7f126c" +dependencies = [ + "fnv", + "hashbrown 0.16.1", + "indexmap", + "stable_deref_trait", +] + [[package]] name = "hashbrown" version = "0.12.3" @@ -616,14 +829,23 @@ version = "0.15.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ - "foldhash", + "foldhash 0.1.5", ] +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" + [[package]] name = "hashbrown" version = "0.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" +dependencies = [ + "foldhash 0.2.0", +] [[package]] name = "hashlink" @@ -743,6 +965,12 @@ 
dependencies = [ "windows-link", ] +[[package]] +name = "libm" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" + [[package]] name = "libredox" version = "0.1.16" @@ -795,6 +1023,15 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "mach2" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d640282b302c0bb0a2a8e0233ead9035e3bed871f0b7e81fe4a1ec829765db44" +dependencies = [ + "libc", +] + [[package]] name = "matchers" version = "0.2.0" @@ -820,6 +1057,15 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +[[package]] +name = "memmap2" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "723e3ebdcdc5c023db1df315364573789f8857c11b631a2fdfad7c00f5c046b4" +dependencies = [ + "libc", +] + [[package]] name = "memmap2" version = "0.9.10" @@ -857,7 +1103,7 @@ version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab2156c4fce2f8df6c499cc1c763e4394b7482525bf2a9701c9d79d215f519e4" dependencies = [ - "bitflags", + "bitflags 2.11.1", "cfg-if", "cfg_aliases 0.1.1", "libc", @@ -1084,7 +1330,7 @@ version = "0.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ - "bitflags", + "bitflags 2.11.1", ] [[package]] @@ -1098,6 +1344,20 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "regalloc2" +version = "0.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de2c52737737f8609e94f975dee22854a2d5c125772d4b1cf292120f4d45c186" +dependencies = [ + "allocator-api2", + "bumpalo", + "hashbrown 0.17.1", + "log", + "rustc-hash", + "smallvec", +] + [[package]] name = "regex" 
version = "1.12.3" @@ -1127,6 +1387,18 @@ version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" +[[package]] +name = "region" +version = "3.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6b6ebd13bc009aef9cd476c1310d49ac354d36e240cf1bd753290f3dc7199a7" +dependencies = [ + "bitflags 1.3.2", + "libc", + "mach2", + "windows-sys 0.52.0", +] + [[package]] name = "rend" version = "0.4.2" @@ -1185,7 +1457,7 @@ version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b838eba278d213a8beaf485bd313fd580ca4505a00d5871caeb1457c55322cae" dependencies = [ - "bitflags", + "bitflags 2.11.1", "fallible-iterator", "fallible-streaming-iterator", "hashlink", @@ -1210,13 +1482,19 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "rustc-hash" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe" + [[package]] name = "rustix" version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" dependencies = [ - "bitflags", + "bitflags 2.11.1", "errno", "libc", "linux-raw-sys", @@ -1291,7 +1569,7 @@ version = "14.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7803e8936da37efd9b6d4478277f4b2b9bb5cdb37a113e8d63222e58da647e63" dependencies = [ - "bitflags", + "bitflags 2.11.1", "cfg-if", "clipboard-win", "fd-lock", @@ -1341,7 +1619,7 @@ version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" dependencies = [ - "bitflags", + "bitflags 2.11.1", "core-foundation", "core-foundation-sys", "libc", @@ -1478,6 +1756,12 @@ dependencies = [ "windows-sys 0.52.0", ] 
+[[package]] +name = "stable_deref_trait" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" + [[package]] name = "strsim" version = "0.11.1" @@ -1518,6 +1802,12 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" +[[package]] +name = "target-lexicon" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adb6935a6f5c20170eeceb1a3835a49e12e19d792f6dd344ccc76a985ca5a6ca" + [[package]] name = "tempfile" version = "3.27.0" @@ -1889,12 +2179,34 @@ version = "0.244.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" dependencies = [ - "bitflags", + "bitflags 2.11.1", "hashbrown 0.15.5", "indexmap", "semver", ] +[[package]] +name = "wasmtime-internal-core" +version = "45.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bdae4b55b15a23d774b15f6e7cd90ae0d0aa17c47c12b4db098b3dd11ba9d58" +dependencies = [ + "hashbrown 0.17.1", + "libm", +] + +[[package]] +name = "wasmtime-internal-jit-icache-coherence" +version = "45.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a312ba8bb77955dcd44294a223e7f124c3071ff966583d385d3f6a4639c62e3" +dependencies = [ + "cfg-if", + "libc", + "wasmtime-internal-core", + "windows-sys 0.61.2", +] + [[package]] name = "weavepy" version = "0.0.0" @@ -1975,6 +2287,18 @@ dependencies = [ "weavepy", ] +[[package]] +name = "weavepy-jit" +version = "0.0.0" +dependencies = [ + "cranelift-codegen", + "cranelift-frontend", + "cranelift-jit", + "cranelift-module", + "cranelift-native", + "weavepy-compiler", +] + [[package]] name = "weavepy-lexer" version = "0.0.0" @@ -2011,7 +2335,7 @@ dependencies = [ "indexmap", "libc", "md-5", - 
"memmap2", + "memmap2 0.9.10", "mio", "num-bigint", "num-integer", @@ -2034,6 +2358,7 @@ dependencies = [ "unicode-normalization", "unicode-properties", "weavepy-compiler", + "weavepy-jit", "weavepy-lexer", "weavepy-parser", "webpki-roots 0.26.11", @@ -2347,7 +2672,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" dependencies = [ "anyhow", - "bitflags", + "bitflags 2.11.1", "indexmap", "log", "serde", diff --git a/Cargo.toml b/Cargo.toml index 275e786..d5f92c4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ members = [ "crates/weavepy-cli", "crates/weavepy-compiler", "crates/weavepy-conformance", + "crates/weavepy-jit", "crates/weavepy-lexer", "crates/weavepy-parser", "crates/weavepy-vm", @@ -29,7 +30,11 @@ default-members = [ [workspace.package] version = "0.0.0" edition = "2021" -rust-version = "1.85" +# RFC 0032 — the tier-2 Cranelift JIT (`weavepy-jit`, behind the `jit` +# feature) pulls in Cranelift 0.132, whose MSRV is 1.93. The JIT is +# off by default, but CI builds it via `--all-features`, so the +# workspace floor moves with it. 
+rust-version = "1.93" license = "MIT OR Apache-2.0" repository = "https://github.com/weavefoundry/weavepy" homepage = "https://github.com/weavefoundry/weavepy" @@ -44,6 +49,7 @@ weavepy = { path = "crates/weavepy", version = "0.0.0" } weavepy-capi = { path = "crates/weavepy-capi", version = "0.0.0" } weavepy-compiler = { path = "crates/weavepy-compiler", version = "0.0.0" } weavepy-conformance = { path = "crates/weavepy-conformance", version = "0.0.0" } +weavepy-jit = { path = "crates/weavepy-jit", version = "0.0.0" } weavepy-lexer = { path = "crates/weavepy-lexer", version = "0.0.0" } weavepy-parser = { path = "crates/weavepy-parser", version = "0.0.0" } weavepy-vm = { path = "crates/weavepy-vm", version = "0.0.0" } @@ -111,6 +117,15 @@ parking_lot = "0.12" crossbeam-channel = "0.5" crossbeam-utils = "0.8" +# RFC 0032 — tier-2 JIT backend (Cranelift). Only compiled when the +# `jit` feature is enabled (off by default); CI exercises it via +# `--all-features`. MSRV floor for these is Rust 1.93. +cranelift-codegen = "0.132" +cranelift-frontend = "0.132" +cranelift-jit = "0.132" +cranelift-module = "0.132" +cranelift-native = "0.132" + # Test/bench-only. insta = { version = "1.40", features = ["yaml"] } proptest = "1.5" diff --git a/crates/weavepy-bench/Cargo.toml b/crates/weavepy-bench/Cargo.toml index a474a89..b2843c4 100644 --- a/crates/weavepy-bench/Cargo.toml +++ b/crates/weavepy-bench/Cargo.toml @@ -22,5 +22,10 @@ weavepy-vm = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } +[features] +default = [] +# RFC 0032 — run the bench harness with the tier-2 JIT compiled in. 
+jit = ["weavepy/jit", "weavepy-vm/jit"] + [lints] workspace = true diff --git a/crates/weavepy-bench/fixtures/jitloop.py b/crates/weavepy-bench/fixtures/jitloop.py new file mode 100644 index 0000000..3f60514 --- /dev/null +++ b/crates/weavepy-bench/fixtures/jitloop.py @@ -0,0 +1,35 @@ +"""While-loop numeric kernel called many times — the fixture the +RFC 0032 tier-2 JIT targets most directly. + +`kernel` is a pure integer hot loop (no FOR_ITER, no calls in the +loop body) so it lands in the JITable subset; `bench` calls it `n` +times so the per-`CodeObject` hot counter crosses the tier-up +threshold and the kernel runs as native code for the bulk of the +work. With `WEAVEPY_JIT=0` it measures the interpreter on the same +shape, which is the comparison we care about. +""" + +import os + + +def kernel(n): + s = 0 + i = 0 + while i < n: + s = s + i * 2 - (i // 3) + (i % 7) + i = i + 1 + return s + + +def bench(n): + total = 0 + k = 0 + while k < n: + total = total + kernel(n) + k = k + 1 + return total + + +if __name__ == "__main__": + n = int(os.environ.get("WEAVEPY_BENCH_WORK", "300")) + bench(n) diff --git a/crates/weavepy-bench/src/fixtures.rs b/crates/weavepy-bench/src/fixtures.rs index 74793d3..ebd5ee2 100644 --- a/crates/weavepy-bench/src/fixtures.rs +++ b/crates/weavepy-bench/src/fixtures.rs @@ -19,6 +19,7 @@ pub const FIXTURES: &[&str] = &[ "richards", "sumvm", "nested_loops", + "jitloop", ]; /// Default per-fixture work parameter passed as `bench(n)`. @@ -35,6 +36,7 @@ pub fn default_work(name: &str) -> u32 { "richards" => 1, "sumvm" => 50_000, "nested_loops" => 30, + "jitloop" => 300, _ => 1, } } diff --git a/crates/weavepy-bench/src/main.rs b/crates/weavepy-bench/src/main.rs index 9782b17..1593e57 100644 --- a/crates/weavepy-bench/src/main.rs +++ b/crates/weavepy-bench/src/main.rs @@ -114,6 +114,11 @@ fn cmd_run(args: &[String]) -> io::Result<()> { // suite. Off by default; cheap when off. 
println!(); println!("{}", format_stats_markdown(&snapshot())); + // RFC 0032 — append tier-2 JIT counters when compiled in. + if let Some(jit) = weavepy_vm::jit_stats_markdown() { + println!(); + println!("{jit}"); + } } } Ok(()) diff --git a/crates/weavepy-cli/Cargo.toml b/crates/weavepy-cli/Cargo.toml index d24f792..468e7e0 100644 --- a/crates/weavepy-cli/Cargo.toml +++ b/crates/weavepy-cli/Cargo.toml @@ -29,5 +29,11 @@ dirs = { workspace = true } tracing = { workspace = true } tracing-subscriber = { workspace = true } +[features] +default = [] +# RFC 0032 — build the `weavepy` binary with the tier-2 JIT compiled in +# (still gated at runtime by `WEAVEPY_JIT=1`). +jit = ["weavepy/jit", "weavepy-vm/jit"] + [lints] workspace = true diff --git a/crates/weavepy-compiler/src/bytecode.rs b/crates/weavepy-compiler/src/bytecode.rs index 5316aa9..b744c28 100644 --- a/crates/weavepy-compiler/src/bytecode.rs +++ b/crates/weavepy-compiler/src/bytecode.rs @@ -571,6 +571,24 @@ pub enum InlineCache { UnpackSequenceTuple, UnpackSequenceList, UnpackSequenceTwoTuple, + + // CALL family (RFC 0032). `func_id` is the `Rc::as_ptr` fingerprint + // of the called `PyFunction`; `argc` is the (fixed) call-site arity. + /// Plain Python function: exact positional arity, no keywords, no + /// `*args`/`**kwargs`/kw-only/defaults needed, and no cells or + /// closure — so the frame's locals are just the arguments padded + /// with `None`, skipping the whole argument-binding dance. + CallPyExactNoFree { + func_id: u64, + argc: u32, + }, + /// Plain Python function with the same exact-arity guarantee but a + /// non-trivial cell/closure layout — still skips argument binding, + /// but builds the frame (and its cells) through `make_frame`. 
+ CallPyExact { + func_id: u64, + argc: u32, + }, } /// Number of generic dispatches a deopted cache must serve before it diff --git a/crates/weavepy-jit/Cargo.toml b/crates/weavepy-jit/Cargo.toml new file mode 100644 index 0000000..aa91604 --- /dev/null +++ b/crates/weavepy-jit/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "weavepy-jit" +description = "RFC 0032 — tier-2 Cranelift JIT for WeavePy's unboxed numeric frames." +version.workspace = true +edition.workspace = true +rust-version.workspace = true +license.workspace = true +repository.workspace = true +authors.workspace = true +readme.workspace = true +keywords.workspace = true +categories.workspace = true +publish = false + +[dependencies] +weavepy-compiler = { workspace = true } +cranelift-codegen = { workspace = true } +cranelift-frontend = { workspace = true } +cranelift-jit = { workspace = true } +cranelift-module = { workspace = true } +cranelift-native = { workspace = true } + +[lints] +workspace = true diff --git a/crates/weavepy-jit/src/analyze.rs b/crates/weavepy-jit/src/analyze.rs new file mode 100644 index 0000000..40365ab --- /dev/null +++ b/crates/weavepy-jit/src/analyze.rs @@ -0,0 +1,874 @@ +//! JITability analysis: bytecode → [`TFunc`], or a [`JitVerdict`] +//! explaining why a code object is outside the v1 subset. +//! +//! The pipeline is: +//! +//! 1. **Block construction** — split the instruction stream into basic +//! blocks at jump targets / after control-flow ops, resolving +//! WeavePy's relative jumps to absolute instruction indices. +//! 2. **Reachability** — keep only blocks reachable from entry. +//! 3. **Definite assignment** — a forward must-analysis whose only job +//! is to compute the *live-in* local set (slots read before written) +//! that the VM type-guards before entering native code. +//! 4. **Type inference fixpoint** — abstract-interpret each block (with +//! an empty entry stack) to assign each local slot one stable +//! 
[`JitType`], bailing on any unsupported opcode, unrepresentable +//! constant, mixed-lane arithmetic, non-uniform local, or non-empty +//! block-boundary stack. +//! 5. **Emission** — once types converge, re-walk and emit [`TStmt`]s / +//! [`TBlock`]s into a [`TFunc`]. + +use std::collections::{BTreeSet, HashMap, HashSet, VecDeque}; + +use weavepy_compiler::{BinOpKind, CodeObject, CompareKind, Constant, OpCode, UnaryKind}; + +use crate::ir::{ArithKind, BlockId, CmpKind, TBlock, TFunc, TOp, TStmt, TTerm}; +use crate::value::JitType; + +/// Why a code object could not be compiled by the v1 JIT. Carried back +/// to the VM so it can mark the frame `NotJitable` and stop retrying. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum JitVerdict { + /// An opcode outside the supported subset (named for diagnostics). + UnsupportedOpcode(&'static str), + /// A `LOAD_CONST` of a non-`int`/`float`/`bool` constant. + UnsupportedConst, + /// A local slot is assigned two different lanes across the region. + NonUniformLocal(u32), + /// An operand's type could not be resolved to a representable lane. + TypeUnknown, + /// The operand stack is non-empty at a basic-block boundary + /// (short-circuit / ternary in the hot region). + NonEmptyBoundaryStack, + /// Arithmetic / comparison mixing `int` and `float` lanes. + MixedArithTypes, + /// The abstract stack underflowed (malformed or unsupported shape). + StackUnderflow, + /// A jump resolved outside the instruction stream. + BadJumpTarget, + /// Signature / kind the whole-function JIT doesn't handle + /// (generators, `*args`, class bodies, …). + UnsupportedSignature, + /// Trivial / empty body — not worth compiling. + Trivial, + /// Type inference did not converge within the iteration budget. + NotConverged, +} + +/// A raw basic block over the original instruction indices. +#[derive(Debug, Clone)] +struct RawBlock { + start: usize, + end: usize, + succs: Vec, +} + +/// Maximum type-inference iterations before giving up. 
+const MAX_INFER_ITERS: usize = 64; + +/// Analyze a code object. Returns the typed IR on success or a +/// [`JitVerdict`] describing the first disqualifying property found. +pub fn analyze(code: &CodeObject) -> Result { + if code.is_generator || code.is_coroutine || code.is_async_generator || code.is_class_body { + return Err(JitVerdict::UnsupportedSignature); + } + if code.has_varargs || code.has_varkeywords || code.kwonly_count > 0 { + return Err(JitVerdict::UnsupportedSignature); + } + let n = code.instructions.len(); + if n < 2 { + return Err(JitVerdict::Trivial); + } + + let raw = build_blocks(code)?; + let reachable = reachable_blocks(&raw); + if reachable.is_empty() { + return Err(JitVerdict::Trivial); + } + + let n_locals = code.varnames.len() as u32; + let livein = compute_livein(code, &raw, &reachable, n_locals); + + // Type inference fixpoint. + let mut local_types: Vec> = vec![None; n_locals as usize]; + let mut iters = 0; + loop { + let mut changed = false; + for &bi in &reachable { + infer_block(code, &raw[bi], &mut local_types, &mut changed)?; + } + if !changed { + break; + } + iters += 1; + if iters > MAX_INFER_ITERS { + return Err(JitVerdict::NotConverged); + } + } + + // Compact block ids over reachable blocks (entry first is convenient + // but not required — we record the entry id explicitly). + let mut compact: HashMap = HashMap::new(); + for (idx, &bi) in reachable.iter().enumerate() { + compact.insert(bi, idx); + } + let entry_block = *compact + .get(&block_index_at(&raw, 0)) + .ok_or(JitVerdict::Trivial)?; + + // Emission pass. 
+ let mut blocks: Vec = Vec::with_capacity(reachable.len()); + let mut max_stack = 0u32; + for &bi in &reachable { + let tb = emit_block(code, &raw[bi], &local_types, &compact, &mut max_stack)?; + blocks.push(tb); + } + + let mut livein_vec: Vec = livein.into_iter().collect(); + livein_vec.sort_unstable(); + + Ok(TFunc { + n_locals, + local_types, + livein_locals: livein_vec, + max_stack, + blocks, + entry_block, + }) +} + +/// Resolve a forward branch/jump target instruction index. +#[inline] +fn forward_target(i: usize, arg: u32) -> usize { + i + 1 + arg as usize +} + +/// Resolve a backward jump target instruction index. +#[inline] +fn backward_target(i: usize, arg: u32) -> Option { + (i + 1).checked_sub(arg as usize) +} + +/// Build the basic blocks, resolving relative jumps to absolute indices. +fn build_blocks(code: &CodeObject) -> Result, JitVerdict> { + let n = code.instructions.len(); + let mut leaders: BTreeSet = BTreeSet::new(); + leaders.insert(0); + for (i, ins) in code.instructions.iter().enumerate() { + match ins.op { + OpCode::PopJumpIfFalse | OpCode::PopJumpIfTrue => { + let t = forward_target(i, ins.arg); + if t > n { + return Err(JitVerdict::BadJumpTarget); + } + leaders.insert(t); + if i + 1 < n { + leaders.insert(i + 1); + } + } + OpCode::JumpForward => { + let t = forward_target(i, ins.arg); + if t > n { + return Err(JitVerdict::BadJumpTarget); + } + leaders.insert(t); + if i + 1 < n { + leaders.insert(i + 1); + } + } + OpCode::JumpBackward => { + let t = backward_target(i, ins.arg).ok_or(JitVerdict::BadJumpTarget)?; + leaders.insert(t); + if i + 1 < n { + leaders.insert(i + 1); + } + } + OpCode::ReturnValue if i + 1 < n => { + leaders.insert(i + 1); + } + _ => {} + } + } + + let leader_vec: Vec = leaders.iter().copied().collect(); + let index_of: HashMap = leader_vec + .iter() + .enumerate() + .map(|(idx, &pc)| (pc, idx)) + .collect(); + + let mut blocks: Vec = Vec::with_capacity(leader_vec.len()); + for (bi, &start) in 
leader_vec.iter().enumerate() { + let end = leader_vec.get(bi + 1).copied().unwrap_or(n); + let last = end - 1; + let ins = code.instructions[last]; + let succs = match ins.op { + OpCode::ReturnValue => Vec::new(), + OpCode::JumpForward => vec![index_of[&forward_target(last, ins.arg)]], + OpCode::JumpBackward => { + vec![index_of[&backward_target(last, ins.arg).ok_or(JitVerdict::BadJumpTarget)?]] + } + OpCode::PopJumpIfFalse | OpCode::PopJumpIfTrue => { + let t = index_of[&forward_target(last, ins.arg)]; + let f = index_of + .get(&(last + 1)) + .copied() + .ok_or(JitVerdict::BadJumpTarget)?; + vec![f, t] + } + // Falls through to the next block. + _ => { + let fall = index_of + .get(&end) + .copied() + .ok_or(JitVerdict::BadJumpTarget)?; + vec![fall] + } + }; + blocks.push(RawBlock { start, end, succs }); + } + Ok(blocks) +} + +/// Index of the block whose `start == pc` (pc must be a leader). +fn block_index_at(raw: &[RawBlock], pc: usize) -> usize { + raw.iter().position(|b| b.start == pc).unwrap_or(0) +} + +/// Blocks reachable from the entry (block 0), in deterministic order. +fn reachable_blocks(raw: &[RawBlock]) -> Vec { + let mut seen = vec![false; raw.len()]; + let mut order = Vec::new(); + let mut q = VecDeque::new(); + if !raw.is_empty() { + q.push_back(0usize); + seen[0] = true; + } + while let Some(b) = q.pop_front() { + order.push(b); + for &s in &raw[b].succs { + if !seen[s] { + seen[s] = true; + q.push_back(s); + } + } + } + order.sort_unstable(); + order +} + +/// Compute the live-in local set via a definite-assignment must-analysis. +fn compute_livein( + code: &CodeObject, + raw: &[RawBlock], + reachable: &[usize], + n_locals: u32, +) -> HashSet { + let param_slots: HashSet = (0..code.arg_count).collect(); + let reachset: HashSet = reachable.iter().copied().collect(); + + // Predecessors among reachable blocks. 
+ let mut preds: Vec> = vec![Vec::new(); raw.len()]; + for &b in reachable { + for &s in &raw[b].succs { + if reachset.contains(&s) { + preds[s].push(b); + } + } + } + + let full: HashSet = (0..n_locals).collect(); + let entry = block_index_at(raw, 0); + let mut assigned_in: Vec> = vec![full.clone(); raw.len()]; + if let Some(slot) = assigned_in.get_mut(entry) { + *slot = param_slots.clone(); + } + + // Fixpoint: assigned_in[b] = ∩ assigned_out[pred]. + loop { + let mut changed = false; + for &b in reachable { + let new_in = if b == entry { + param_slots.clone() + } else if preds[b].is_empty() { + // Unreachable-but-listed guard; treat as empty. + HashSet::new() + } else { + let mut acc: Option> = None; + for &p in &preds[b] { + let out = assigned_out(code, &raw[p], &assigned_in[p]); + acc = Some(match acc { + None => out, + Some(a) => a.intersection(&out).copied().collect(), + }); + } + acc.unwrap_or_default() + }; + if new_in != assigned_in[b] { + assigned_in[b] = new_in; + changed = true; + } + } + if !changed { + break; + } + } + + // Collect live-in: a load of a slot not definitely assigned yet. + let mut livein = HashSet::new(); + for &b in reachable { + let mut cur = assigned_in[b].clone(); + for i in raw[b].start..raw[b].end { + let ins = code.instructions[i]; + match ins.op { + OpCode::LoadFast if !cur.contains(&ins.arg) => { + livein.insert(ins.arg); + } + OpCode::StoreFast => { + cur.insert(ins.arg); + } + _ => {} + } + } + } + livein +} + +/// `assigned_in ∪ {slots stored in this block}`. +fn assigned_out(code: &CodeObject, b: &RawBlock, assigned_in: &HashSet) -> HashSet { + let mut out = assigned_in.clone(); + for i in b.start..b.end { + let ins = code.instructions[i]; + if matches!(ins.op, OpCode::StoreFast) { + out.insert(ins.arg); + } + } + out +} + +/// One operand-stack entry during analysis, with provenance for the +/// live-in inference (`src` is the slot of an as-yet-untyped load). 
+#[derive(Clone, Copy)] +struct SE { + ty: JitType, + src: Option, +} + +impl SE { + fn known(ty: JitType) -> SE { + SE { ty, src: None } + } +} + +/// Map a representable [`Constant`] to its lane, or `None`. +fn const_type(c: &Constant) -> Option { + match c { + Constant::Int(_) => Some(JitType::Int), + Constant::Bool(_) => Some(JitType::Bool), + Constant::Float(_) => Some(JitType::Float), + _ => None, + } +} + +/// Infer/validate one block during the fixpoint. Mutates `local_types` +/// (setting `changed` when it grows) and bails on hard errors. Transient +/// `Unknown` operands are tolerated — a later iteration may resolve them. +fn infer_block( + code: &CodeObject, + b: &RawBlock, + local_types: &mut [Option], + changed: &mut bool, +) -> Result<(), JitVerdict> { + let mut stack: Vec = Vec::new(); + for i in b.start..(b.end - 1) { + step_abstract(code, i, &mut stack, local_types, changed, false)?; + } + // Terminator stack-shape validation. + let last = b.end - 1; + let ins = code.instructions[last]; + match ins.op { + OpCode::ReturnValue => { + if stack.is_empty() { + return Err(JitVerdict::StackUnderflow); + } + } + OpCode::JumpForward | OpCode::JumpBackward => { + if !stack.is_empty() { + return Err(JitVerdict::NonEmptyBoundaryStack); + } + } + OpCode::PopJumpIfFalse | OpCode::PopJumpIfTrue => { + if stack.len() != 1 { + return Err(JitVerdict::NonEmptyBoundaryStack); + } + let c = stack[0]; + if !c.ty.is_representable() && c.src.is_none() { + return Err(JitVerdict::TypeUnknown); + } + } + // Fall-through terminator: must leave an empty stack. + _ => { + step_abstract(code, last, &mut stack, local_types, changed, false)?; + if !stack.is_empty() { + return Err(JitVerdict::NonEmptyBoundaryStack); + } + } + } + Ok(()) +} + +/// Abstract-execute one non-terminator instruction, updating the type +/// stack and (via inference) `local_types`. 
+fn step_abstract(
+    code: &CodeObject,
+    i: usize,
+    stack: &mut Vec<SE>,
+    local_types: &mut [Option<JitType>],
+    changed: &mut bool,
+    strict: bool,
+) -> Result<(), JitVerdict> {
+    let ins = code.instructions[i];
+    match ins.op {
+        OpCode::Nop | OpCode::Resume => {}
+        OpCode::LoadConst => {
+            let c = code
+                .constants
+                .get(ins.arg as usize)
+                .ok_or(JitVerdict::UnsupportedConst)?;
+            let ty = const_type(c).ok_or(JitVerdict::UnsupportedConst)?;
+            stack.push(SE::known(ty));
+        }
+        OpCode::LoadFast => {
+            let slot = ins.arg as usize;
+            match local_types.get(slot).copied().flatten() {
+                Some(t) => stack.push(SE::known(t)),
+                None => stack.push(SE {
+                    ty: JitType::Unknown,
+                    src: Some(ins.arg),
+                }),
+            }
+        }
+        OpCode::StoreFast => {
+            let v = stack.pop().ok_or(JitVerdict::StackUnderflow)?;
+            if v.ty.is_representable() {
+                set_local(local_types, ins.arg, v.ty, changed)?;
+            } else if strict {
+                return Err(JitVerdict::TypeUnknown);
+            }
+        }
+        OpCode::BinaryOp => {
+            let kind = bin_kind(ins.arg)?;
+            let b = stack.pop().ok_or(JitVerdict::StackUnderflow)?;
+            let a = stack.pop().ok_or(JitVerdict::StackUnderflow)?;
+            let (a, b) = resolve_pair(a, b, local_types, changed);
+            let res = bin_result_type(kind, a.ty, b.ty, strict)?;
+            stack.push(SE::known(res));
+        }
+        OpCode::CompareOp => {
+            let _ = cmp_kind(ins.arg)?;
+            let b = stack.pop().ok_or(JitVerdict::StackUnderflow)?;
+            let a = stack.pop().ok_or(JitVerdict::StackUnderflow)?;
+            let (a, b) = resolve_pair(a, b, local_types, changed);
+            cmp_check(a.ty, b.ty, strict)?;
+            stack.push(SE::known(JitType::Bool));
+        }
+        OpCode::UnaryOp => {
+            let kind = unary_kind(ins.arg)?;
+            let a = stack.pop().ok_or(JitVerdict::StackUnderflow)?;
+            let res = unary_result_type(kind, a.ty, strict)?;
+            stack.push(SE::known(res));
+        }
+        OpCode::PopTop => {
+            stack.pop().ok_or(JitVerdict::StackUnderflow)?;
+        }
+        OpCode::CopyTop => {
+            let v = *stack.last().ok_or(JitVerdict::StackUnderflow)?;
+            stack.push(v);
+        }
+        OpCode::Swap => {
+            if ins.arg != 2 {
+                return Err(JitVerdict::UnsupportedOpcode("SWAP n!=2"));
+            }
+            let len = stack.len();
+            if len < 2 {
+                return Err(JitVerdict::StackUnderflow);
+            }
+            stack.swap(len - 1, len - 2);
+        }
+        other => return Err(JitVerdict::UnsupportedOpcode(other.name())),
+    }
+    Ok(())
+}
+
+/// If exactly one operand is an untyped live-in load and the other is a
+/// concrete lane, infer the live-in's type.
+fn resolve_pair(
+    mut a: SE,
+    mut b: SE,
+    local_types: &mut [Option<JitType>],
+    changed: &mut bool,
+) -> (SE, SE) {
+    if a.ty.is_representable() && !b.ty.is_representable() {
+        if let Some(slot) = b.src {
+            let _ = set_local(local_types, slot, a.ty, changed);
+            b.ty = a.ty;
+            b.src = None;
+        }
+    } else if b.ty.is_representable() && !a.ty.is_representable() {
+        if let Some(slot) = a.src {
+            let _ = set_local(local_types, slot, b.ty, changed);
+            a.ty = b.ty;
+            a.src = None;
+        }
+    }
+    (a, b)
+}
+
+/// Assign a local's lane, enforcing single-type stability.
+fn set_local(
+    local_types: &mut [Option<JitType>],
+    slot: u32,
+    ty: JitType,
+    changed: &mut bool,
+) -> Result<(), JitVerdict> {
+    let cell = local_types
+        .get_mut(slot as usize)
+        .ok_or(JitVerdict::TypeUnknown)?;
+    match *cell {
+        None => {
+            *cell = Some(ty);
+            *changed = true;
+            Ok(())
+        }
+        Some(existing) if existing == ty => Ok(()),
+        Some(_) => Err(JitVerdict::NonUniformLocal(slot)),
+    }
+}
+
+/// Result lane of a binary arithmetic op, given operand lanes.
+fn bin_result_type(
+    kind: ArithKind,
+    a: JitType,
+    b: JitType,
+    strict: bool,
+) -> Result<JitType, JitVerdict> {
+    if !a.is_representable() || !b.is_representable() {
+        return if strict {
+            Err(JitVerdict::TypeUnknown)
+        } else {
+            Ok(JitType::Unknown)
+        };
+    }
+    let a_int = a.is_integral();
+    let b_int = b.is_integral();
+    if a_int && b_int {
+        match kind {
+            ArithKind::TrueDiv => Ok(JitType::Float),
+            ArithKind::And | ArithKind::Or | ArithKind::Xor => {
+                // bool∘bool stays bool in Python; we bail on that rare
+                // case to keep the lane unambiguous.
+                if a == JitType::Bool && b == JitType::Bool {
+                    Err(JitVerdict::UnsupportedOpcode("bitwise on bool"))
+                } else {
+                    Ok(JitType::Int)
+                }
+            }
+            _ => Ok(JitType::Int),
+        }
+    } else if a == JitType::Float && b == JitType::Float {
+        match kind {
+            ArithKind::Add | ArithKind::Sub | ArithKind::Mul | ArithKind::TrueDiv => {
+                Ok(JitType::Float)
+            }
+            _ => Err(JitVerdict::UnsupportedOpcode("float floordiv/mod/bitop")),
+        }
+    } else {
+        Err(JitVerdict::MixedArithTypes)
+    }
+}
+
+/// Validate comparison operand lanes (same lane required in v1).
+fn cmp_check(a: JitType, b: JitType, strict: bool) -> Result<(), JitVerdict> {
+    if !a.is_representable() || !b.is_representable() {
+        return if strict {
+            Err(JitVerdict::TypeUnknown)
+        } else {
+            Ok(())
+        };
+    }
+    if (a.is_integral() && b.is_integral()) || (a == JitType::Float && b == JitType::Float) {
+        Ok(())
+    } else {
+        Err(JitVerdict::MixedArithTypes)
+    }
+}
+
+/// Result lane of a unary op.
+fn unary_result_type(kind: UnaryKind, a: JitType, strict: bool) -> Result<JitType, JitVerdict> {
+    if !a.is_representable() {
+        return if strict {
+            Err(JitVerdict::TypeUnknown)
+        } else {
+            Ok(JitType::Unknown)
+        };
+    }
+    match kind {
+        UnaryKind::Not => Ok(JitType::Bool),
+        UnaryKind::Neg | UnaryKind::Invert => {
+            if a.is_integral() {
+                Ok(JitType::Int)
+            } else if matches!(kind, UnaryKind::Neg) {
+                Ok(JitType::Float)
+            } else {
+                Err(JitVerdict::UnsupportedOpcode("~float"))
+            }
+        }
+        UnaryKind::Pos => {
+            if a == JitType::Float {
+                Ok(JitType::Float)
+            } else if a == JitType::Int {
+                Ok(JitType::Int)
+            } else {
+                Err(JitVerdict::UnsupportedOpcode("+bool"))
+            }
+        }
+    }
+}
+
+fn bin_kind(arg: u32) -> Result<ArithKind, JitVerdict> {
+    let k = match arg {
+        x if x == BinOpKind::Add as u32 => ArithKind::Add,
+        x if x == BinOpKind::Sub as u32 => ArithKind::Sub,
+        x if x == BinOpKind::Mult as u32 => ArithKind::Mul,
+        x if x == BinOpKind::Div as u32 => ArithKind::TrueDiv,
+        x if x == BinOpKind::FloorDiv as u32 => ArithKind::FloorDiv,
+        x if x == BinOpKind::Mod as u32 => ArithKind::Mod,
+        x if x == BinOpKind::BitOr as u32 => ArithKind::Or,
+        x if x == BinOpKind::BitXor as u32 => ArithKind::Xor,
+        x if x == BinOpKind::BitAnd as u32 => ArithKind::And,
+        _ => return Err(JitVerdict::UnsupportedOpcode("BINARY_OP kind")),
+    };
+    Ok(k)
+}
+
+fn cmp_kind(arg: u32) -> Result<CmpKind, JitVerdict> {
+    let k = match arg {
+        x if x == CompareKind::Lt as u32 => CmpKind::Lt,
+        x if x == CompareKind::LtE as u32 => CmpKind::Le,
+        x if x == CompareKind::Eq as u32 => CmpKind::Eq,
+        x if x == CompareKind::NotEq as u32 => CmpKind::Ne,
+        x if x == CompareKind::Gt as u32 => CmpKind::Gt,
+        x if x == CompareKind::GtE as u32 => CmpKind::Ge,
+        _ => return Err(JitVerdict::UnsupportedOpcode("COMPARE_OP kind")),
+    };
+    Ok(k)
+}
+
+fn unary_kind(arg: u32) -> Result<UnaryKind, JitVerdict> {
+    let k = match arg {
+        x if x == UnaryKind::Pos as u32 => UnaryKind::Pos,
+        x if x == UnaryKind::Neg as u32 => UnaryKind::Neg,
+        x if x == UnaryKind::Not as u32 => UnaryKind::Not,
+        x if x == UnaryKind::Invert as u32 => UnaryKind::Invert,
+        _ => return Err(JitVerdict::UnsupportedOpcode("UNARY_OP kind")),
+    };
+    Ok(k)
+}
+
+/// Emit the typed IR for one block, with all local types now known.
+fn emit_block(
+    code: &CodeObject,
+    b: &RawBlock,
+    local_types: &[Option<JitType>],
+    compact: &HashMap<usize, usize>,
+    max_stack: &mut u32,
+) -> Result<TBlock, JitVerdict> {
+    let mut stack: Vec<JitType> = Vec::new();
+    let mut stmts: Vec<TStmt> = Vec::new();
+
+    for i in b.start..(b.end - 1) {
+        emit_instr(code, i, local_types, &mut stack, &mut stmts, max_stack)?;
+    }
+
+    let last = b.end - 1;
+    let ins = code.instructions[last];
+    let term = match ins.op {
+        OpCode::ReturnValue => {
+            // Lowering pops the return value off its own type stack at
+            // the `Return` terminator; no statement is emitted here.
+            if stack.is_empty() {
+                return Err(JitVerdict::StackUnderflow);
+            }
+            TTerm::Return
+        }
+        OpCode::JumpForward | OpCode::JumpBackward => {
+            let t = compact[&block_succ(b, 0)];
+            TTerm::Jump(t)
+        }
+        OpCode::PopJumpIfFalse => TTerm::BranchFalse {
+            fallthrough: compact[&block_succ(b, 0)],
+            target: compact[&block_succ(b, 1)],
+        },
+        OpCode::PopJumpIfTrue => TTerm::BranchTrue {
+            fallthrough: compact[&block_succ(b, 0)],
+            target: compact[&block_succ(b, 1)],
+        },
+        _ => {
+            emit_instr(code, last, local_types, &mut stack, &mut stmts, max_stack)?;
+            TTerm::Jump(compact[&block_succ(b, 0)])
+        }
+    };
+
+    // Entry stack is always empty in the v1 subset.
+    Ok(TBlock {
+        entry_stack: Vec::new(),
+        stmts,
+        term,
+    })
+}
+
+/// The raw successor block index at position `k`.
+fn block_succ(b: &RawBlock, k: usize) -> usize {
+    b.succs[k]
+}
+
+/// Emit one instruction's [`TStmt`](s), tracking the type stack so
+/// result lanes match what lowering will reconstruct.
+fn emit_instr(
+    code: &CodeObject,
+    i: usize,
+    local_types: &[Option<JitType>],
+    stack: &mut Vec<JitType>,
+    stmts: &mut Vec<TStmt>,
+    max_stack: &mut u32,
+) -> Result<(), JitVerdict> {
+    let ins = code.instructions[i];
+    let pc = i as u32;
+    let mut push =
+        |op: TOp, ty: Option<JitType>, stack: &mut Vec<JitType>, stmts: &mut Vec<TStmt>| {
+            stmts.push(TStmt { pc, op });
+            if let Some(t) = ty {
+                stack.push(t);
+            }
+            *max_stack = (*max_stack).max(stack.len() as u32);
+        };
+    match ins.op {
+        OpCode::Nop | OpCode::Resume => {}
+        OpCode::LoadConst => {
+            let c = &code.constants[ins.arg as usize];
+            let (op, ty) = match c {
+                Constant::Int(v) => (TOp::PushConstInt(*v), JitType::Int),
+                Constant::Bool(v) => (TOp::PushConstBool(*v), JitType::Bool),
+                Constant::Float(v) => (TOp::PushConstFloat(v.to_bits()), JitType::Float),
+                _ => return Err(JitVerdict::UnsupportedConst),
+            };
+            push(op, Some(ty), stack, stmts);
+        }
+        OpCode::LoadFast => {
+            let ty = local_types
+                .get(ins.arg as usize)
+                .copied()
+                .flatten()
+                .ok_or(JitVerdict::TypeUnknown)?;
+            push(TOp::LoadLocal(ins.arg), Some(ty), stack, stmts);
+        }
+        OpCode::StoreFast => {
+            stack.pop().ok_or(JitVerdict::StackUnderflow)?;
+            push(TOp::StoreLocal(ins.arg), None, stack, stmts);
+        }
+        OpCode::BinaryOp => {
+            let kind = bin_kind(ins.arg)?;
+            let b = stack.pop().ok_or(JitVerdict::StackUnderflow)?;
+            let a = stack.pop().ok_or(JitVerdict::StackUnderflow)?;
+            let (op, ty) = lower_bin(kind, a, b)?;
+            push(op, Some(ty), stack, stmts);
+        }
+        OpCode::CompareOp => {
+            let kind = cmp_kind(ins.arg)?;
+            let b = stack.pop().ok_or(JitVerdict::StackUnderflow)?;
+            let a = stack.pop().ok_or(JitVerdict::StackUnderflow)?;
+            let op = if a.is_integral() && b.is_integral() {
+                TOp::IntCmp(kind)
+            } else if a == JitType::Float && b == JitType::Float {
+                TOp::FloatCmp(kind)
+            } else {
+                return Err(JitVerdict::MixedArithTypes);
+            };
+            push(op, Some(JitType::Bool), stack, stmts);
+        }
+        OpCode::UnaryOp => {
+            let kind = unary_kind(ins.arg)?;
+            let a = stack.pop().ok_or(JitVerdict::StackUnderflow)?;
+            match (kind, a) {
+                (UnaryKind::Pos, JitType::Int | JitType::Float) => {
+                    // Identity; re-push same lane, emit nothing.
+                    stack.push(a);
+                }
+                (UnaryKind::Neg, t) if t.is_integral() => {
+                    push(TOp::IntNeg, Some(JitType::Int), stack, stmts)
+                }
+                (UnaryKind::Neg, JitType::Float) => {
+                    push(TOp::FloatNeg, Some(JitType::Float), stack, stmts);
+                }
+                (UnaryKind::Invert, t) if t.is_integral() => {
+                    push(TOp::IntInvert, Some(JitType::Int), stack, stmts);
+                }
+                (UnaryKind::Not, t) if t.is_integral() => {
+                    push(TOp::IntNot, Some(JitType::Bool), stack, stmts);
+                }
+                (UnaryKind::Not, JitType::Float) => {
+                    push(TOp::FloatNot, Some(JitType::Bool), stack, stmts);
+                }
+                _ => return Err(JitVerdict::UnsupportedOpcode("UNARY_OP lane")),
+            }
+        }
+        OpCode::PopTop => {
+            stack.pop().ok_or(JitVerdict::StackUnderflow)?;
+            push(TOp::Pop, None, stack, stmts);
+        }
+        OpCode::CopyTop => {
+            let t = *stack.last().ok_or(JitVerdict::StackUnderflow)?;
+            push(TOp::Dup, Some(t), stack, stmts);
+        }
+        OpCode::Swap => {
+            if ins.arg != 2 {
+                return Err(JitVerdict::UnsupportedOpcode("SWAP n!=2"));
+            }
+            let len = stack.len();
+            if len < 2 {
+                return Err(JitVerdict::StackUnderflow);
+            }
+            stack.swap(len - 1, len - 2);
+            push(TOp::Swap2, None, stack, stmts);
+        }
+        other => return Err(JitVerdict::UnsupportedOpcode(other.name())),
+    }
+    Ok(())
+}
+
+/// Choose the IR op + result lane for a binary arithmetic op at emission
+/// time (types are all known).
+fn lower_bin(kind: ArithKind, a: JitType, b: JitType) -> Result<(TOp, JitType), JitVerdict> {
+    if a.is_integral() && b.is_integral() {
+        match kind {
+            ArithKind::TrueDiv => Ok((TOp::IntTrueDiv, JitType::Float)),
+            ArithKind::And | ArithKind::Or | ArithKind::Xor => {
+                if a == JitType::Bool && b == JitType::Bool {
+                    Err(JitVerdict::UnsupportedOpcode("bitwise on bool"))
+                } else {
+                    Ok((TOp::IntArith(kind), JitType::Int))
+                }
+            }
+            _ => Ok((TOp::IntArith(kind), JitType::Int)),
+        }
+    } else if a == JitType::Float && b == JitType::Float {
+        match kind {
+            ArithKind::Add | ArithKind::Sub | ArithKind::Mul | ArithKind::TrueDiv => {
+                Ok((TOp::FloatArith(kind), JitType::Float))
+            }
+            _ => Err(JitVerdict::UnsupportedOpcode("float floordiv/mod/bitop")),
+        }
+    } else {
+        Err(JitVerdict::MixedArithTypes)
+    }
+}
diff --git a/crates/weavepy-jit/src/engine.rs b/crates/weavepy-jit/src/engine.rs
new file mode 100644
index 0000000..70a52dc
--- /dev/null
+++ b/crates/weavepy-jit/src/engine.rs
@@ -0,0 +1,165 @@
+//! The Cranelift JIT module lifecycle and the compiled-frame entry
+//! point.
+//!
+//! A [`JitEngine`] owns one [`JITModule`]; every compiled frame is a
+//! native function defined into it. The engine is intended to be a
+//! per-thread singleton (the VM keeps it in thread-local storage, under
+//! the GIL), so the function pointers stay valid for the thread's
+//! lifetime and there is no cross-thread aliasing.
+
+use std::mem;
+
+use cranelift_codegen::ir::{types, AbiParam, Type};
+use cranelift_codegen::settings::{self, Configurable};
+use cranelift_codegen::Context;
+use cranelift_frontend::FunctionBuilderContext;
+use cranelift_jit::{JITBuilder, JITModule};
+use cranelift_module::{Linkage, Module};
+
+use crate::analyze::{analyze, JitVerdict};
+use crate::ir::TFunc;
+use crate::lower::build_function;
+use crate::runtime::{JitFrame, JitStatus};
+use crate::value::JitType;
+use weavepy_compiler::CodeObject;
+
+/// The native ABI of a compiled frame: takes a `*mut JitFrame`, returns
+/// an `i64` [`JitStatus`].
+pub(crate) type NativeFn = unsafe extern "C" fn(*mut JitFrame) -> i64;
+
+/// A compiled frame plus the metadata the VM needs to marshal values in
+/// and out and to apply the entry guard.
+#[derive(Debug)]
+pub struct CompiledFrame {
+    func: NativeFn,
+    /// Local slots to type-guard + pack before entry (read-before-write).
+    pub livein: Vec<u32>,
+    /// Stable lane of each local slot (`None` = not JIT-managed).
+    pub local_types: Vec<Option<JitType>>,
+    /// Max abstract operand-stack depth, for sizing the spill buffer.
+    pub max_stack: u32,
+    /// Number of local slots.
+    pub n_locals: u32,
+}
+
+impl CompiledFrame {
+    /// Enter the compiled frame.
+    ///
+    /// # Safety
+    ///
+    /// `frame` must point to a fully-initialised [`JitFrame`] whose
+    /// `locals` / `stack_spill` / `stack_tags` buffers are at least
+    /// `n_locals` / `max_stack` wide, and the owning [`JitEngine`] must
+    /// still be alive (its `JITModule` backs this function pointer).
+    #[must_use]
+    pub unsafe fn enter(&self, frame: *mut JitFrame) -> JitStatus {
+        // SAFETY: the caller upholds the buffer-size and liveness
+        // invariants documented above; the function pointer was produced
+        // by `JITModule::get_finalized_function` for this exact signature.
+        let raw = unsafe { (self.func)(frame) };
+        JitStatus::from_raw(raw)
+    }
+}
+
+/// Owns the Cranelift JIT module and reusable codegen contexts.
+pub struct JitEngine {
+    module: JITModule,
+    ctx: Context,
+    fbctx: FunctionBuilderContext,
+    ptr_ty: Type,
+    next_id: u32,
+}
+
+impl std::fmt::Debug for JitEngine {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("JitEngine")
+            .field("ptr_ty", &self.ptr_ty)
+            .field("next_id", &self.next_id)
+            .finish_non_exhaustive()
+    }
+}
+
+impl JitEngine {
+    /// Build a fresh engine for the host target. Returns `None` if the
+    /// host ISA can't be configured (e.g. an unsupported platform), in
+    /// which case the VM simply never tiers up.
+    #[must_use]
+    pub fn new() -> Option<JitEngine> {
+        let mut flag_builder = settings::builder();
+        // A JIT that emits absolute addresses and resolves libcalls
+        // in-process.
+        flag_builder.set("use_colocated_libcalls", "false").ok()?;
+        flag_builder.set("is_pic", "false").ok()?;
+        // Optimise generated code for run-time speed, not compile time.
+        flag_builder.set("opt_level", "speed").ok()?;
+        let isa_builder = cranelift_native::builder().ok()?;
+        let isa = isa_builder
+            .finish(settings::Flags::new(flag_builder))
+            .ok()?;
+        let builder = JITBuilder::with_isa(isa, cranelift_module::default_libcall_names());
+        let module = JITModule::new(builder);
+        let ptr_ty = module.target_config().pointer_type();
+        let ctx = module.make_context();
+        Some(JitEngine {
+            module,
+            ctx,
+            fbctx: FunctionBuilderContext::new(),
+            ptr_ty,
+            next_id: 0,
+        })
+    }
+
+    /// Analyze and compile a code object. Returns the compiled frame, or
+    /// the [`JitVerdict`] explaining why it is not JITable.
+    pub fn compile(&mut self, code: &CodeObject) -> Result<CompiledFrame, JitVerdict> {
+        let tfunc = analyze(code)?;
+        self.compile_tfunc(&tfunc)
+    }
+
+    /// Compile an already-analyzed [`TFunc`] (also the unit-test entry).
+    pub fn compile_tfunc(&mut self, tfunc: &TFunc) -> Result<CompiledFrame, JitVerdict> {
+        self.module.clear_context(&mut self.ctx);
+
+        // Signature: (frame: ptr) -> i64.
+        self.ctx
+            .func
+            .signature
+            .params
+            .push(AbiParam::new(self.ptr_ty));
+        self.ctx
+            .func
+            .signature
+            .returns
+            .push(AbiParam::new(types::I64));
+
+        build_function(&mut self.ctx.func, &mut self.fbctx, tfunc, self.ptr_ty);
+
+        let name = format!("wpjit_{}", self.next_id);
+        self.next_id += 1;
+        let id = self
+            .module
+            .declare_function(&name, Linkage::Local, &self.ctx.func.signature)
+            .map_err(|_| JitVerdict::NotConverged)?;
+        self.module
+            .define_function(id, &mut self.ctx)
+            .map_err(|_| JitVerdict::NotConverged)?;
+        self.module.clear_context(&mut self.ctx);
+        self.module
+            .finalize_definitions()
+            .map_err(|_| JitVerdict::NotConverged)?;
+
+        let code_ptr = self.module.get_finalized_function(id);
+        // SAFETY: `code_ptr` is a finalized function with exactly the
+        // `(*mut JitFrame) -> i64` signature declared above; the module
+        // keeps the code alive for the engine's lifetime.
+        let func: NativeFn = unsafe { mem::transmute::<*const u8, NativeFn>(code_ptr) };
+
+        Ok(CompiledFrame {
+            func,
+            livein: tfunc.livein_locals.clone(),
+            local_types: tfunc.local_types.clone(),
+            max_stack: tfunc.max_stack,
+            n_locals: tfunc.n_locals,
+        })
+    }
+}
diff --git a/crates/weavepy-jit/src/ir.rs b/crates/weavepy-jit/src/ir.rs
new file mode 100644
index 0000000..dcece81
--- /dev/null
+++ b/crates/weavepy-jit/src/ir.rs
@@ -0,0 +1,173 @@
+//! The typed mid-IR the analyzer emits and the lowerer consumes.
+//!
+//! It is a *stack machine* mirroring the bytecode, but with every
+//! operation resolved to a concrete [`JitType`] lane and every local
+//! resolved to a slot index. Keeping a tiny IR between bytecode and
+//! Cranelift means [`crate::analyze`] can be unit-tested without a
+//! codegen backend and [`crate::lower`] stays a straight syntax-directed
+//! translation.
+//!
+//! Cross-block operand-stack values are carried as Cranelift *block
+//! parameters* in lowering; [`TBlock::entry_stack`] records their static
+//! 
types so the lowerer can declare the right params. Locals become
+//! Cranelift *variables*, so merges are handled by the SSA builder
+//! without explicit phis.
+
+use crate::value::JitType;
+
+/// Index of a [`TBlock`] within a [`TFunc`].
+pub type BlockId = usize;
+
+/// Arithmetic operations the JIT lowers. `TrueDiv` (`/`) always yields a
+/// `float`; `FloorDiv`/`Mod` carry Python's round-toward-negative-
+/// infinity semantics on integers.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub enum ArithKind {
+    Add,
+    Sub,
+    Mul,
+    FloorDiv,
+    Mod,
+    TrueDiv,
+    And,
+    Or,
+    Xor,
+}
+
+/// Comparison operators (six-way), matching `CompareKind`.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub enum CmpKind {
+    Lt,
+    Le,
+    Eq,
+    Ne,
+    Gt,
+    Ge,
+}
+
+/// A single stack-machine operation. Operands are implicit (the top of
+/// the abstract value stack); results are pushed.
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub enum TOp {
+    /// Push an `int` constant.
+    PushConstInt(i64),
+    /// Push a `float` constant (stored as `f64::to_bits` so `TOp` stays
+    /// `Copy` + `PartialEq`).
+    PushConstFloat(u64),
+    /// Push a `bool` constant.
+    PushConstBool(bool),
+    /// Push `locals[slot]`.
+    LoadLocal(u32),
+    /// Pop into `locals[slot]`.
+    StoreLocal(u32),
+    /// `int (op) int → int`. `Add`/`Sub`/`Mul` deopt on i64 overflow;
+    /// `FloorDiv`/`Mod` deopt on zero divisor or `MIN / -1`. Never
+    /// carries `TrueDiv` (see [`TOp::IntTrueDiv`]).
+    IntArith(ArithKind),
+    /// `float (op) float → float`. Only `Add`/`Sub`/`Mul`/`TrueDiv`
+    /// (float floor-div / mod are non-JITable in v1).
+    FloatArith(ArithKind),
+    /// `int / int → float` (Python true division). Deopts on a zero
+    /// divisor (the interpreter raises `ZeroDivisionError`).
+    IntTrueDiv,
+    /// `int (cmp) int → bool`.
+    IntCmp(CmpKind),
+    /// `float (cmp) float → bool`.
+    FloatCmp(CmpKind),
+    /// `-int`. Deopts on `MIN` negation overflow.
+    IntNeg,
+    /// `-float`.
+    FloatNeg,
+    /// `~int`.
+    IntInvert,
+    /// `not x` for an integral (`int`/`bool`) operand → `bool`.
+    IntNot,
+    /// `not x` for a `float` operand → `bool`.
+    FloatNot,
+    /// Discard TOS.
+    Pop,
+    /// Duplicate TOS (`COPY`).
+    Dup,
+    /// Swap the top two stack entries (`SWAP 2`).
+    Swap2,
+}
+
+/// One IR statement: a [`TOp`] tagged with its originating bytecode pc
+/// so a side exit can name the exact resume point.
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct TStmt {
+    pub pc: u32,
+    pub op: TOp,
+}
+
+/// How a basic block transfers control.
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub enum TTerm {
+    /// Pop TOS and return it from the frame.
+    Return,
+    /// Unconditional branch; the current abstract stack is passed as
+    /// block args.
+    Jump(BlockId),
+    /// `POP_JUMP_IF_FALSE`: pop the condition; branch to `target` if
+    /// falsy, else `fallthrough`.
+    BranchFalse {
+        target: BlockId,
+        fallthrough: BlockId,
+    },
+    /// `POP_JUMP_IF_TRUE`: pop the condition; branch to `target` if
+    /// truthy, else `fallthrough`.
+    BranchTrue {
+        target: BlockId,
+        fallthrough: BlockId,
+    },
+}
+
+/// A basic block: a static entry-stack shape, a straight-line body, and
+/// a terminator.
+#[derive(Clone, Debug, PartialEq)]
+pub struct TBlock {
+    /// Types of the operand-stack values live on entry (lowered to
+    /// Cranelift block parameters), bottom-to-top.
+    pub entry_stack: Vec<JitType>,
+    pub stmts: Vec<TStmt>,
+    pub term: TTerm,
+}
+
+/// A fully analyzed, JITable function body.
+#[derive(Clone, Debug, PartialEq)]
+pub struct TFunc {
+    /// Number of local slots in the originating code object.
+    pub n_locals: u32,
+    /// Stable JIT type of each local slot, or `None` for slots the
+    /// region never touches (left untouched by the JIT).
+    pub local_types: Vec<Option<JitType>>,
+    /// Local slots that are live-in at function entry (read before
+    /// written). The VM type-guards and packs exactly these before
+    /// entering native code.
+    pub livein_locals: Vec<u32>,
+    /// Maximum abstract operand-stack depth, for sizing the deopt spill
+    /// buffer.
+    pub max_stack: u32,
+    pub blocks: Vec<TBlock>,
+    pub entry_block: BlockId,
+}
+
+impl TOp {
+    /// `true` for operations that can take a side exit (deopt) and so
+    /// need their abstract stack spilled at their pc.
+    #[must_use]
+    pub fn can_deopt(self) -> bool {
+        matches!(
+            self,
+            TOp::IntArith(
+                ArithKind::Add
+                    | ArithKind::Sub
+                    | ArithKind::Mul
+                    | ArithKind::FloorDiv
+                    | ArithKind::Mod
+            ) | TOp::IntNeg
+                | TOp::IntTrueDiv
+                | TOp::FloatArith(ArithKind::TrueDiv)
+        )
+    }
+}
diff --git a/crates/weavepy-jit/src/lib.rs b/crates/weavepy-jit/src/lib.rs
new file mode 100644
index 0000000..cac7fa9
--- /dev/null
+++ b/crates/weavepy-jit/src/lib.rs
@@ -0,0 +1,46 @@
+//! RFC 0032 — tier-2 Cranelift JIT for WeavePy's unboxed numeric frames.
+//!
+//! This crate compiles the *unboxed numeric/control-flow core* of a
+//! [`weavepy_compiler::CodeObject`] — `int`/`float`/`bool` arithmetic,
+//! comparisons, the conditional and unconditional jumps, `range`
+//! iteration, and `return` — to native machine code via Cranelift.
+//! Everything outside that subset (containers, attribute access, calls
+//! out, exceptions, generators) stays in the interpreter; a frame whose
+//! hot region touches an unsupported opcode is reported
+//! [`JitStatus::NotJitable`] and never re-attempted.
+//!
+//! The crate deliberately does **not** depend on `weavepy-vm`: it speaks
+//! only in `i64`/`f64`/`bool` lanes plus the side-exit protocol in
+//! [`runtime`], so the VM owns the `Object` model and marshals values in
+//! and out of a [`runtime::JitFrame`] around each native entry. That
+//! keeps the unsafe FFI surface tiny and the dependency graph acyclic.
+//!
+//! # Safety
+//!
+//! Entering compiled code is `unsafe` by nature (an indirect call
+//! through a function pointer with a `#[repr(C)]` argument). The unsafe
+//! 
is confined to [`engine`] and [`runtime`]; callers interact through
+//! the safe [`JitEngine`] API and [`runtime::JitFrame`]; [`CompiledFrame::enter`] stays `unsafe`.
+
+mod analyze;
+mod engine;
+mod ir;
+mod lower;
+mod runtime;
+mod value;
+
+pub use analyze::{analyze, JitVerdict};
+pub use engine::{CompiledFrame, JitEngine};
+pub use ir::{ArithKind, BlockId, CmpKind, TBlock, TFunc, TOp, TStmt, TTerm};
+pub use runtime::{JitFrame, JitStatus, SlotTag};
+pub use value::JitType;
+
+/// Outcome of attempting to compile a code object.
+#[derive(Debug)]
+pub enum CompileOutcome {
+    /// The code object compiled; the engine cached the native function.
+    Compiled(CompiledFrame),
+    /// The code object is outside the JITable subset. The caller should
+    /// record this verdict and stop re-attempting compilation.
+    NotJitable(JitVerdict),
+}
diff --git a/crates/weavepy-jit/src/lower.rs b/crates/weavepy-jit/src/lower.rs
new file mode 100644
index 0000000..c2057f9
--- /dev/null
+++ b/crates/weavepy-jit/src/lower.rs
@@ -0,0 +1,550 @@
+//! Lower the typed IR ([`TFunc`]) to a Cranelift function.
+//!
+//! Locals become Cranelift *variables* (the SSA builder inserts phis at
+//! merges); the operand stack is an explicit `Vec` of SSA values, which
+//! the v1 subset guarantees is empty at every block boundary. Integer
+//! arithmetic is emitted with explicit overflow / divide-by-zero checks
+//! that branch to per-op *side-exit* blocks; a side exit writes the live
+//! locals + spilled stack back into the [`JitFrame`] and returns
+//! [`JitStatus::Deopt`] so the interpreter resumes at the exact pc.
+ +use cranelift_codegen::ir::condcodes::{FloatCC, IntCC}; +use cranelift_codegen::ir::{types, Block, Function, InstBuilder, MemFlags, Type, Value}; +use cranelift_frontend::{FunctionBuilder, FunctionBuilderContext, Variable}; + +use crate::ir::{ArithKind, CmpKind, TFunc, TOp, TStmt, TTerm}; +use crate::runtime::{JitFrame, JitStatus, SlotTag}; +use crate::value::JitType; + +const OFF_LOCALS: i32 = core::mem::offset_of!(JitFrame, locals) as i32; +const OFF_RET_BITS: i32 = core::mem::offset_of!(JitFrame, ret_bits) as i32; +const OFF_RET_TAG: i32 = core::mem::offset_of!(JitFrame, ret_tag) as i32; +const OFF_DEOPT_PC: i32 = core::mem::offset_of!(JitFrame, deopt_pc) as i32; +const OFF_STACK_SPILL: i32 = core::mem::offset_of!(JitFrame, stack_spill) as i32; +const OFF_STACK_TAGS: i32 = core::mem::offset_of!(JitFrame, stack_tags) as i32; +const OFF_STACK_LEN: i32 = core::mem::offset_of!(JitFrame, stack_len) as i32; + +/// Build the Cranelift function body for `tfunc` into `func`. +pub(crate) fn build_function( + func: &mut Function, + fbctx: &mut FunctionBuilderContext, + tfunc: &TFunc, + ptr_ty: Type, +) { + let mut builder = FunctionBuilder::new(func, fbctx); + let mut lc = Lowerer::new(&mut builder, tfunc, ptr_ty); + lc.build(); + builder.seal_all_blocks(); + builder.finalize(); +} + +struct Lowerer<'a, 'b> { + b: &'a mut FunctionBuilder<'b>, + tfunc: &'a TFunc, + ptr_ty: Type, + /// One Cranelift block per (reachable) TBlock. + cl_blocks: Vec, + /// One variable per managed local slot (others unused). + vars: Vec>, + frame_ptr: Value, + locals_base: Value, + spill_base: Value, + tags_base: Value, + /// The abstract operand stack: SSA value + lane. + vstack: Vec<(Value, JitType)>, +} + +impl<'a, 'b> Lowerer<'a, 'b> { + fn new(b: &'a mut FunctionBuilder<'b>, tfunc: &'a TFunc, ptr_ty: Type) -> Lowerer<'a, 'b> { + // Placeholders overwritten at the top of `build` before any use. 
+        // Placeholder for the four base values; `build` overwrites
+        // `frame_ptr`, `locals_base`, `spill_base` and `tags_base`.
+        let dummy = Value::from_u32(0);
+        Lowerer {
+            b,
+            tfunc,
+            ptr_ty,
+            cl_blocks: Vec::new(),
+            vars: Vec::new(),
+            frame_ptr: dummy,
+            locals_base: dummy,
+            spill_base: dummy,
+            tags_base: dummy,
+            vstack: Vec::new(),
+        }
+    }
+
+    /// Cranelift type used for a [`JitType`]: `F64` for `Float`, `I64`
+    /// for every other variant.
+    fn cl_ty(ty: JitType) -> Type {
+        match ty {
+            JitType::Float => types::F64,
+            _ => types::I64,
+        }
+    }
+
+    /// [`SlotTag`] discriminant (as `i64`) written next to a value of
+    /// type `ty`. `Unknown` is tagged as `Int`.
+    fn tag(ty: JitType) -> i64 {
+        match ty {
+            JitType::Int => SlotTag::Int as i64,
+            JitType::Float => SlotTag::Float as i64,
+            JitType::Bool => SlotTag::Bool as i64,
+            JitType::Unknown => SlotTag::Int as i64,
+        }
+    }
+
+    /// Lower the whole function: emit the prologue (frame pointer, the
+    /// three base pointers, one variable per managed local), create one
+    /// Cranelift block per `TBlock`, jump to the entry block, then emit
+    /// every block body in index order.
+    fn build(&mut self) {
+        let trusted = MemFlags::trusted();
+
+        // Entry / prologue block carries the function param (frame ptr).
+        let entry = self.b.create_block();
+        self.b.append_block_params_for_function_params(entry);
+        self.b.switch_to_block(entry);
+        self.frame_ptr = self.b.block_params(entry)[0];
+        self.locals_base = self
+            .b
+            .ins()
+            .load(self.ptr_ty, trusted, self.frame_ptr, OFF_LOCALS);
+        self.spill_base = self
+            .b
+            .ins()
+            .load(self.ptr_ty, trusted, self.frame_ptr, OFF_STACK_SPILL);
+        self.tags_base = self
+            .b
+            .ins()
+            .load(self.ptr_ty, trusted, self.frame_ptr, OFF_STACK_TAGS);
+
+        // One Cranelift block per TBlock.
+        self.cl_blocks = (0..self.tfunc.blocks.len())
+            .map(|_| self.b.create_block())
+            .collect();
+
+        // Declare + initialise a variable per managed local.
+        self.vars = vec![None; self.tfunc.n_locals as usize];
+        for slot in 0..self.tfunc.local_types.len() {
+            if let Some(ty) = self.tfunc.local_types[slot] {
+                let cl = Self::cl_ty(ty);
+                let var = self.b.declare_var(cl);
+                // Each local occupies one 8-byte slot in the locals buffer.
+                let off = (slot as i32) * 8;
+                let v = self.b.ins().load(cl, trusted, self.locals_base, off);
+                self.b.def_var(var, v);
+                self.vars[slot] = Some(var);
+            }
+        }
+
+        let entry_target = self.cl_blocks[self.tfunc.entry_block];
+        self.b.ins().jump(entry_target, &[]);
+
+        // Emit each block body.
+        for bi in 0..self.tfunc.blocks.len() {
+            let cl = self.cl_blocks[bi];
+            self.b.switch_to_block(cl);
+            // Every block starts lowering with an empty abstract stack.
+            self.vstack.clear();
+            self.emit_block(bi);
+        }
+    }
+
+    /// Lower block `bi`: its statements in order, then its terminator.
+    /// Conditional terminators pop the condition from the abstract stack.
+    fn emit_block(&mut self, bi: usize) {
+        let block = self.tfunc.blocks[bi].clone();
+        for stmt in &block.stmts {
+            self.emit_stmt(*stmt);
+        }
+        match block.term {
+            TTerm::Return => self.emit_return(),
+            TTerm::Jump(t) => {
+                let target = self.cl_blocks[t];
+                self.b.ins().jump(target, &[]);
+            }
+            TTerm::BranchFalse {
+                target,
+                fallthrough,
+            } => {
+                let (cond, ty) = self.pop();
+                let truthy = self.truth(cond, ty);
+                let tb = self.cl_blocks[target];
+                let fb = self.cl_blocks[fallthrough];
+                // if truthy → fallthrough else → target.
+                self.b.ins().brif(truthy, fb, &[], tb, &[]);
+            }
+            TTerm::BranchTrue {
+                target,
+                fallthrough,
+            } => {
+                let (cond, ty) = self.pop();
+                let truthy = self.truth(cond, ty);
+                let tb = self.cl_blocks[target];
+                let fb = self.cl_blocks[fallthrough];
+                self.b.ins().brif(truthy, tb, &[], fb, &[]);
+            }
+        }
+    }
+
+    /// Pop the return value, store its bits and tag into the frame, and
+    /// return `JitStatus::Returned` from the native function.
+    fn emit_return(&mut self) {
+        let trusted = MemFlags::trusted();
+        let (val, ty) = self.pop();
+        self.b
+            .ins()
+            .store(trusted, val, self.frame_ptr, OFF_RET_BITS);
+        let tag = self.b.ins().iconst(types::I32, Self::tag(ty));
+        self.b
+            .ins()
+            .store(trusted, tag, self.frame_ptr, OFF_RET_TAG);
+        let status = self.b.ins().iconst(types::I64, JitStatus::Returned as i64);
+        self.b.ins().return_(&[status]);
+    }
+
+    /// Lower one statement against the abstract operand stack `vstack`,
+    /// which pairs each Cranelift value with its [`JitType`].
+    fn emit_stmt(&mut self, stmt: TStmt) {
+        match stmt.op {
+            TOp::PushConstInt(v) => {
+                let val = self.b.ins().iconst(types::I64, v);
+                self.vstack.push((val, JitType::Int));
+            }
+            TOp::PushConstBool(v) => {
+                // Bools are carried as an `I64` holding 0 or 1.
+                let val = self.b.ins().iconst(types::I64, i64::from(v));
+                self.vstack.push((val, JitType::Bool));
+            }
+            TOp::PushConstFloat(bits) => {
+                let val = self.b.ins().f64const(f64::from_bits(bits));
+                self.vstack.push((val, JitType::Float));
+            }
+            TOp::LoadLocal(slot) => {
+                let ty = self.tfunc.local_types[slot as usize].unwrap_or(JitType::Int);
+                let var = self.vars[slot as usize].expect("managed local");
+                let v = self.b.use_var(var);
+                self.vstack.push((v, ty));
+            }
+            TOp::StoreLocal(slot) => {
+                let (v, _) = self.pop();
+                let var = self.vars[slot as usize].expect("managed local");
+                self.b.def_var(var, v);
+            }
+            TOp::IntArith(kind) => self.emit_int_arith(kind, stmt.pc),
+            TOp::FloatArith(kind) => self.emit_float_arith(kind, stmt.pc),
+            TOp::IntTrueDiv => self.emit_int_truediv(stmt.pc),
+            TOp::IntCmp(kind) => self.emit_int_cmp(kind),
+            TOp::FloatCmp(kind) => self.emit_float_cmp(kind),
+            TOp::IntNeg => self.emit_int_neg(stmt.pc),
+            TOp::FloatNeg => {
+                let (a, _) = self.pop();
+                let r = self.b.ins().fneg(a);
+                self.vstack.push((r, JitType::Float));
+            }
+            TOp::IntInvert => {
+                let (a, _) = self.pop();
+                let r = self.b.ins().bnot(a);
+                self.vstack.push((r, JitType::Int));
+            }
+            TOp::IntNot => {
+                // `not x` for an integral operand: 1 when x == 0, else 0.
+                let (a, _) = self.pop();
+                let z = self.b.ins().iconst(types::I64, 0);
+                let cmp = self.b.ins().icmp(IntCC::Equal, a, z);
+                let r = self.b.ins().uextend(types::I64, cmp);
+                self.vstack.push((r, JitType::Bool));
+            }
+            TOp::FloatNot => {
+                // `not x` for a float operand: 1 when x == 0.0, else 0.
+                let (a, _) = self.pop();
+                let z = self.b.ins().f64const(0.0);
+                let cmp = self.b.ins().fcmp(FloatCC::Equal, a, z);
+                let r = self.b.ins().uextend(types::I64, cmp);
+                self.vstack.push((r, JitType::Bool));
+            }
+            TOp::Pop => {
+                self.pop();
+            }
+            TOp::Dup => {
+                let top = *self.vstack.last().expect("dup on empty");
+                self.vstack.push(top);
+            }
+            TOp::Swap2 => {
+                // Exchanges the top two entries; needs at least two on the stack.
+                let len = self.vstack.len();
+                self.vstack.swap(len - 1, len - 2);
+            }
+        }
+    }
+
+    // ---- arithmetic ------------------------------------------------
+
+    /// Integer binary arithmetic. `Add`/`Sub`/`Mul` guard on signed
+    /// overflow and deopt at `pc` with the pre-operation stack as the
+    /// snapshot; the remaining kinds delegate to their own emitters.
+    fn emit_int_arith(&mut self, kind: ArithKind, pc: u32) {
+        match kind {
+            ArithKind::Add | ArithKind::Sub | ArithKind::Mul => {
+                // Snapshot before popping so a deopt spills both operands.
+                let snapshot = self.vstack.clone();
+                let (b, _) = self.pop();
+                let (a, _) = self.pop();
+                let (r, ovf) = match kind {
+                    ArithKind::Add => self.checked_add(a, b),
+                    ArithKind::Sub => self.checked_sub(a, b),
+                    _ => self.checked_mul(a, b),
+                };
+                let cont = self.guard(ovf, pc, &snapshot);
+                self.b.switch_to_block(cont);
+                self.vstack.push((r, JitType::Int));
+            }
+            ArithKind::FloorDiv => self.emit_floordiv(pc),
+            ArithKind::Mod => self.emit_mod(pc),
+            ArithKind::And => self.emit_int_bitop(BitOp::And),
+            ArithKind::Or => self.emit_int_bitop(BitOp::Or),
+            ArithKind::Xor => self.emit_int_bitop(BitOp::Xor),
+            ArithKind::TrueDiv => self.emit_int_truediv(pc),
+        }
+    }
+
+    /// Bitwise `and` / `or` / `xor` of the top two operands; no guard is
+    /// emitted and the result is typed `Int`.
+    fn emit_int_bitop(&mut self, op: BitOp) {
+        let (b, _) = self.pop();
+        let (a, _) = self.pop();
+        let r = match op {
+            BitOp::And => self.b.ins().band(a, b),
+            BitOp::Or => self.b.ins().bor(a, b),
+            BitOp::Xor => self.b.ins().bxor(a, b),
+        };
+        self.vstack.push((r, JitType::Int));
+    }
+
+    /// Float binary arithmetic. `TrueDiv` goes through the guarded
+    /// `emit_float_truediv`; `Add`/`Sub`/`Mul` lower directly. Any other
+    /// kind hits `unreachable!`.
+    fn emit_float_arith(&mut self, kind: ArithKind, pc: u32) {
+        if matches!(kind, ArithKind::TrueDiv) {
+            self.emit_float_truediv(pc);
+            return;
+        }
+        let (b, _) = self.pop();
+        let (a, _) = self.pop();
+        let r = match kind {
+            ArithKind::Add => self.b.ins().fadd(a, b),
+            ArithKind::Sub => self.b.ins().fsub(a, b),
+            ArithKind::Mul => self.b.ins().fmul(a, b),
+            _ => unreachable!("non-jitable float arith reached lowering"),
+        };
+        self.vstack.push((r, JitType::Float));
+    }
+
+    fn emit_float_truediv(&mut self, pc: u32) {
+        // Python raises ZeroDivisionError on float `/ 0.0`; deopt so the
+        // interpreter raises with the right traceback.
+        let snapshot = self.vstack.clone();
+        let (b, _) = self.pop();
+        let (a, _) = self.pop();
+        let z = self.b.ins().f64const(0.0);
+        let is_zero = self.b.ins().fcmp(FloatCC::Equal, b, z);
+        let cont = self.guard(is_zero, pc, &snapshot);
+        self.b.switch_to_block(cont);
+        let r = self.b.ins().fdiv(a, b);
+        self.vstack.push((r, JitType::Float));
+    }
+
+    /// Python `int / int`, producing a float.
+    ///
+    /// Deopts at `pc` (with both operands still on the spilled stack)
+    /// when the divisor is zero, and also when either operand lies
+    /// outside `[-2^53, 2^53]`. Inside that range `fcvt_from_sint` is
+    /// exact, so the single rounding in `fdiv` yields the correctly
+    /// rounded quotient. Outside it the operand would be rounded first
+    /// and the quotient rounded again, so those cases are left to the
+    /// interpreter.
+    fn emit_int_truediv(&mut self, pc: u32) {
+        let snapshot = self.vstack.clone();
+        let (b, _) = self.pop();
+        let (a, _) = self.pop();
+        let z = self.b.ins().iconst(types::I64, 0);
+        let is_zero = self.b.ins().icmp(IntCC::Equal, b, z);
+        // `x + 2^53` (wrapping) is unsigned-greater than `2^54` exactly
+        // when `x` is outside `[-2^53, 2^53]`.
+        let bias = self.b.ins().iconst(types::I64, 1i64 << 53);
+        let span = self.b.ins().iconst(types::I64, 1i64 << 54);
+        let a_biased = self.b.ins().iadd(a, bias);
+        let b_biased = self.b.ins().iadd(b, bias);
+        let a_big = self
+            .b
+            .ins()
+            .icmp(IntCC::UnsignedGreaterThan, a_biased, span);
+        let b_big = self
+            .b
+            .ins()
+            .icmp(IntCC::UnsignedGreaterThan, b_biased, span);
+        let inexact = self.b.ins().bor(a_big, b_big);
+        let should = self.b.ins().bor(is_zero, inexact);
+        let cont = self.guard(should, pc, &snapshot);
+        self.b.switch_to_block(cont);
+        let af = self.b.ins().fcvt_from_sint(types::F64, a);
+        let bf = self.b.ins().fcvt_from_sint(types::F64, b);
+        let r = self.b.ins().fdiv(af, bf);
+        self.vstack.push((r, JitType::Float));
+    }
+
+    /// Integer negation. Deopts at `pc` when the operand is `i64::MIN`,
+    /// whose negation does not fit in `i64`.
+    fn emit_int_neg(&mut self, pc: u32) {
+        let snapshot = self.vstack.clone();
+        let (a, _) = self.pop();
+        let min = self.b.ins().iconst(types::I64, i64::MIN);
+        let ovf = self.b.ins().icmp(IntCC::Equal, a, min);
+        let cont = self.guard(ovf, pc, &snapshot);
+        self.b.switch_to_block(cont);
+        let r = self.b.ins().ineg(a);
+        self.vstack.push((r, JitType::Int));
+    }
+
+    /// Python floor division on `i64`. Deopts on a zero divisor or the
+    /// `MIN / -1` overflow, then applies the round-toward-negative-
+    /// infinity correction.
+    fn emit_floordiv(&mut self, pc: u32) {
+        let snapshot = self.vstack.clone();
+        let (b, _) = self.pop();
+        let (a, _) = self.pop();
+        let should = self.div_guard_cond(a, b);
+        let cont = self.guard(should, pc, &snapshot);
+        self.b.switch_to_block(cont);
+
+        let q = self.b.ins().sdiv(a, b);
+        let r = self.b.ins().srem(a, b);
+        // if r != 0 && (r<0) != (b<0) { q - 1 } else { q }
+        let adj = self.floor_adjust(r, b);
+        let qm1 = self.b.ins().iadd(q, adj);
+        self.vstack.push((qm1, JitType::Int));
+    }
+
+    /// Python modulo on `i64` (result takes the divisor's sign).
+    fn emit_mod(&mut self, pc: u32) {
+        let pre_op_stack = self.vstack.clone();
+        let (divisor, _) = self.pop();
+        let (dividend, _) = self.pop();
+        let must_deopt = self.div_guard_cond(dividend, divisor);
+        let resume = self.guard(must_deopt, pc, &pre_op_stack);
+        self.b.switch_to_block(resume);
+
+        let rem = self.b.ins().srem(dividend, divisor);
+        // Keep `rem`, or use `rem + divisor` when `rem` is non-zero and its sign differs.
+        let fixup = self.floor_needs_adjust(rem, divisor);
+        let shifted = self.b.ins().iadd(rem, divisor);
+        let result = self.b.ins().select(fixup, shifted, rem);
+        self.vstack.push((result, JitType::Int));
+    }
+
+    /// Side-exit condition for `//` and `%`: divisor is zero, or `MIN` over `-1`.
+    fn div_guard_cond(&mut self, a: Value, b: Value) -> Value {
+        let c_zero = self.b.ins().iconst(types::I64, 0);
+        let divisor_is_zero = self.b.ins().icmp(IntCC::Equal, b, c_zero);
+        let c_min = self.b.ins().iconst(types::I64, i64::MIN);
+        let c_minus_one = self.b.ins().iconst(types::I64, -1);
+        let dividend_is_min = self.b.ins().icmp(IntCC::Equal, a, c_min);
+        let divisor_is_minus_one = self.b.ins().icmp(IntCC::Equal, b, c_minus_one);
+        let wraps = self.b.ins().band(dividend_is_min, divisor_is_minus_one);
+        self.b.ins().bor(divisor_is_zero, wraps)
+    }
+
+    /// I8 flag: remainder `r` is non-zero and its sign differs from divisor `b`'s.
+    fn floor_needs_adjust(&mut self, r: Value, b: Value) -> Value {
+        let c_zero = self.b.ins().iconst(types::I64, 0);
+        let rem_nonzero = self.b.ins().icmp(IntCC::NotEqual, r, c_zero);
+        let rem_negative = self.b.ins().icmp(IntCC::SignedLessThan, r, c_zero);
+        let divisor_negative = self.b.ins().icmp(IntCC::SignedLessThan, b, c_zero);
+        let opposite_signs = self.b.ins().bxor(rem_negative, divisor_negative);
+        self.b.ins().band(rem_nonzero, opposite_signs)
+    }
+
+    /// `-1` when the floor correction applies, else `0` (to add to `q`).
+    fn floor_adjust(&mut self, r: Value, b: Value) -> Value {
+        let needs = self.floor_needs_adjust(r, b);
+        let neg1 = self.b.ins().iconst(types::I64, -1);
+        let zero = self.b.ins().iconst(types::I64, 0);
+        self.b.ins().select(needs, neg1, zero)
+    }
+
+    // ---- comparisons ----------------------------------------------
+
+    /// Signed integer comparison of the top two operands; pushes the
+    /// result zero-extended to `I64` and typed `Bool`.
+    fn emit_int_cmp(&mut self, kind: CmpKind) {
+        let (b, _) = self.pop();
+        let (a, _) = self.pop();
+        let cc = match kind {
+            CmpKind::Lt => IntCC::SignedLessThan,
+            CmpKind::Le => IntCC::SignedLessThanOrEqual,
+            CmpKind::Eq => IntCC::Equal,
+            CmpKind::Ne => IntCC::NotEqual,
+            CmpKind::Gt => IntCC::SignedGreaterThan,
+            CmpKind::Ge => IntCC::SignedGreaterThanOrEqual,
+        };
+        let c = self.b.ins().icmp(cc, a, b);
+        let r = self.b.ins().uextend(types::I64, c);
+        self.vstack.push((r, JitType::Bool));
+    }
+
+    /// Float comparison of the top two operands; pushes the result
+    /// zero-extended to `I64` and typed `Bool`.
+    fn emit_float_cmp(&mut self, kind: CmpKind) {
+        let (b, _) = self.pop();
+        let (a, _) = self.pop();
+        let cc = match kind {
+            CmpKind::Lt => FloatCC::LessThan,
+            CmpKind::Le => FloatCC::LessThanOrEqual,
+            CmpKind::Eq => FloatCC::Equal,
+            CmpKind::Ne => FloatCC::NotEqual,
+            CmpKind::Gt => FloatCC::GreaterThan,
+            CmpKind::Ge => FloatCC::GreaterThanOrEqual,
+        };
+        let c = self.b.ins().fcmp(cc, a, b);
+        let r = self.b.ins().uextend(types::I64, c);
+        self.vstack.push((r, JitType::Bool));
+    }
+
+    // ---- overflow helpers (portable signed-overflow detection) -----
+
+    /// Wrapping `a + b` plus an overflow flag. The flag is the sign bit
+    /// of `(a ^ r) & (b ^ r)`: set when the result's sign differs from
+    /// the sign of both operands.
+    fn checked_add(&mut self, a: Value, b: Value) -> (Value, Value) {
+        let r = self.b.ins().iadd(a, b);
+        let axr = self.b.ins().bxor(a, r);
+        let bxr = self.b.ins().bxor(b, r);
+        let and = self.b.ins().band(axr, bxr);
+        let zero = self.b.ins().iconst(types::I64, 0);
+        let ovf = self.b.ins().icmp(IntCC::SignedLessThan, and, zero);
+        (r, ovf)
+    }
+
+    /// Wrapping `a - b` plus an overflow flag. The flag is the sign bit
+    /// of `(a ^ b) & (a ^ r)`: set when the operands have different
+    /// signs and the result's sign differs from `a`'s.
+    fn checked_sub(&mut self, a: Value, b: Value) -> (Value, Value) {
+        let r = self.b.ins().isub(a, b);
+        let axb = self.b.ins().bxor(a, b);
+        let axr = self.b.ins().bxor(a, r);
+        let and = self.b.ins().band(axb, axr);
+        let zero = self.b.ins().iconst(types::I64, 0);
+        let ovf = self.b.ins().icmp(IntCC::SignedLessThan, and, zero);
+        (r, ovf)
+    }
+
+    /// Low 64 bits of `a * b` plus an overflow flag. The flag is set
+    /// when the high half (`smulhi`) differs from the low half's sign
+    /// bit replicated across 64 bits (`lo >> 63`, arithmetic).
+    fn checked_mul(&mut self, a: Value, b: Value) -> (Value, Value) {
+        let lo = self.b.ins().imul(a, b);
+        let hi = self.b.ins().smulhi(a, b);
+        let sign = self.b.ins().sshr_imm(lo, 63);
+        let ovf = self.b.ins().icmp(IntCC::NotEqual, hi, sign);
+        (lo, ovf)
+    }
+
+    // ---- deopt / side exits ---------------------------------------
+
+    /// Emit `if cond { deopt(pc, snapshot) } else { cont }` and return
+    /// the `cont` block (the caller continues lowering there).
+    fn guard(&mut self, cond: Value, pc: u32, snapshot: &[(Value, JitType)]) -> Block {
+        let se = self.b.create_block();
+        let cont = self.b.create_block();
+        self.b.ins().brif(cond, se, &[], cont, &[]);
+        // The side-exit body is emitted here; the caller switches to `cont`.
+        self.b.switch_to_block(se);
+        self.emit_deopt(pc, snapshot);
+        cont
+    }
+
+    /// Emit a side exit: write every managed local back to the locals
+    /// buffer, spill `snapshot` (value and tag per entry), store the
+    /// stack length and `pc` into the frame, and return
+    /// `JitStatus::Deopt`.
+    fn emit_deopt(&mut self, pc: u32, snapshot: &[(Value, JitType)]) {
+        let trusted = MemFlags::trusted();
+        // Write back every managed local.
+        for (slot, var) in self.vars.iter().enumerate() {
+            if let Some(var) = *var {
+                let v = self.b.use_var(var);
+                let off = (slot as i32) * 8;
+                self.b.ins().store(trusted, v, self.locals_base, off);
+            }
+        }
+        // Spill the abstract stack bottom-to-top.
+        for (idx, (val, ty)) in snapshot.iter().enumerate() {
+            // Values are 8 bytes apart, tags 4 bytes apart.
+            let voff = (idx as i32) * 8;
+            self.b.ins().store(trusted, *val, self.spill_base, voff);
+            let toff = (idx as i32) * 4;
+            let tagv = self.b.ins().iconst(types::I32, Self::tag(*ty));
+            self.b.ins().store(trusted, tagv, self.tags_base, toff);
+        }
+        let len = self.b.ins().iconst(types::I32, snapshot.len() as i64);
+        self.b
+            .ins()
+            .store(trusted, len, self.frame_ptr, OFF_STACK_LEN);
+        let pcv = self.b.ins().iconst(types::I32, i64::from(pc));
+        self.b
+            .ins()
+            .store(trusted, pcv, self.frame_ptr, OFF_DEOPT_PC);
+        let status = self.b.ins().iconst(types::I64, JitStatus::Deopt as i64);
+        self.b.ins().return_(&[status]);
+    }
+
+    // ---- helpers ---------------------------------------------------
+
+    /// Truthiness of `val` as a comparison result: `val != 0.0` for
+    /// `Float`, `val != 0` for every other type.
+    fn truth(&mut self, val: Value, ty: JitType) -> Value {
+        match ty {
+            JitType::Float => {
+                let z = self.b.ins().f64const(0.0);
+                self.b.ins().fcmp(FloatCC::NotEqual, val, z)
+            }
+            _ => {
+                let z = self.b.ins().iconst(types::I64, 0);
+                self.b.ins().icmp(IntCC::NotEqual, val, z)
+            }
+        }
+    }
+
+    /// Pop the top of the abstract operand stack; panics if it is empty.
+    fn pop(&mut self) -> (Value, JitType) {
+        self.vstack.pop().expect("operand stack underflow in lower")
+    }
+}
+
+/// The bitwise operations `emit_int_bitop` can lower.
+#[derive(Clone, Copy)]
+enum BitOp {
+    And,
+    Or,
+    Xor,
+}
diff --git a/crates/weavepy-jit/src/runtime.rs b/crates/weavepy-jit/src/runtime.rs
new file mode 100644
index 0000000..4545f8f
--- /dev/null
+++ b/crates/weavepy-jit/src/runtime.rs
@@ -0,0 +1,121 @@
+//! The native-call ABI: the `#[repr(C)]` [`JitFrame`] the VM fills
+//! before entering compiled code and reads after it exits, plus the
+//! side-exit status protocol.
+//!
+//! A compiled frame is a single native function with the signature
+//!
+//! ```text
+//! extern "C" fn(frame: *mut JitFrame) -> i64 // an i64 JitStatus
+//! ```
+//!
+//! On a [`JitStatus::Returned`] exit the function has written
+//! [`JitFrame::ret_bits`] / [`JitFrame::ret_tag`]. On a
+//! [`JitStatus::Deopt`] exit it has written [`JitFrame::deopt_pc`] and
+//!
spilled the live abstract operand stack into
+//! [`JitFrame::stack_spill`] / [`JitFrame::stack_tags`] (bottom-to-top)
+//! with [`JitFrame::stack_len`] entries, plus written back every
+//! JIT-managed local into [`JitFrame::locals`]. The VM then rebuilds its
+//! interpreter state and resumes at `deopt_pc`, bit-for-bit as though
+//! the JIT had never run.
+
+/// The status returned (as an `i64`) by a compiled frame.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+#[repr(i64)]
+pub enum JitStatus {
+    /// The frame ran to a `RETURN_VALUE`. The return value is in
+    /// [`JitFrame::ret_bits`] / [`JitFrame::ret_tag`].
+    Returned = 0,
+    /// The frame took a side exit. The VM resumes interpretation at
+    /// [`JitFrame::deopt_pc`] with the spilled stack + written-back
+    /// locals.
+    Deopt = 1,
+}
+
+impl JitStatus {
+    /// Decode the raw `i64` a compiled frame returns; any non-zero value is `Deopt`.
+    #[inline]
+    #[must_use]
+    pub fn from_raw(v: i64) -> JitStatus {
+        match v {
+            0 => JitStatus::Returned,
+            _ => JitStatus::Deopt,
+        }
+    }
+}
+
+/// How to interpret a `u64` slot in [`JitFrame::locals`] /
+/// [`JitFrame::stack_spill`].
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+#[repr(u32)]
+pub enum SlotTag {
+    /// `i64` bit pattern → `Object::Int`.
+    Int = 0,
+    /// `f64` bit pattern (via `to_bits`) → `Object::Float`.
+    Float = 1,
+    /// `0`/`1` → `Object::Bool`.
+    Bool = 2,
+}
+
+impl SlotTag {
+    /// Decode a raw tag written by native code; unrecognised values decode as `Int`.
+    #[inline]
+    #[must_use]
+    pub fn from_raw(v: u32) -> SlotTag {
+        match v {
+            1 => SlotTag::Float,
+            2 => SlotTag::Bool,
+            _ => SlotTag::Int,
+        }
+    }
+}
+
+/// The exchange buffer the VM passes to a compiled frame.
+///
+/// The VM owns the backing storage (`Vec<u64>` / `Vec<u32>`); this
+/// struct holds raw pointers to it for the duration of one native call.
+/// All indices the native code touches are bounded by `n_locals` /
+/// `stack_cap`, which the VM sizes from the compiled frame's analysis.
+#[repr(C)]
+#[derive(Debug)]
+pub struct JitFrame {
+    /// Slot-indexed local storage, one `u64` per code-object local.
+    /// Holds `i64` / `f64`-bits / `bool` per the local's stable type.
+    pub locals: *mut u64,
+    /// Number of valid entries in [`Self::locals`].
+    pub n_locals: u32,
+    /// OSR entry: the bytecode pc to begin execution at. `0` enters at
+    /// the function start; a loop-header pc enters mid-frame.
+    pub entry_pc: u32,
+
+    /// `Returned`: the return value's bit pattern.
+    pub ret_bits: u64,
+    /// `Returned`: the return value's [`SlotTag`].
+    pub ret_tag: u32,
+
+    /// `Deopt`: the bytecode pc to resume interpretation at.
+    pub deopt_pc: u32,
+    /// `Deopt`: spilled abstract operand stack, bottom-to-top.
+    pub stack_spill: *mut u64,
+    /// `Deopt`: matching [`SlotTag`]s for [`Self::stack_spill`].
+    pub stack_tags: *mut u32,
+    /// `Deopt`: number of spilled stack entries.
+    pub stack_len: u32,
+    /// Capacity of [`Self::stack_spill`] / [`Self::stack_tags`].
+    pub stack_cap: u32,
+}
+
+impl JitFrame {
+    /// Reinterpret an `f64` as the `u64` stored in a slot.
+    #[inline]
+    #[must_use]
+    pub fn f64_to_bits(v: f64) -> u64 {
+        v.to_bits()
+    }
+
+    /// Reinterpret a slot's `u64` as the `f64` it encodes.
+    #[inline]
+    #[must_use]
+    pub fn bits_to_f64(bits: u64) -> f64 {
+        f64::from_bits(bits)
+    }
+}
diff --git a/crates/weavepy-jit/src/value.rs b/crates/weavepy-jit/src/value.rs
new file mode 100644
index 0000000..564fc3c
--- /dev/null
+++ b/crates/weavepy-jit/src/value.rs
@@ -0,0 +1,60 @@
+//! The unboxed value model and type lattice the JIT reasons about.
+//!
+//! Only three concrete Python types are representable as unboxed machine
+//! values: `int` (as `i64`), `float` (as `f64`), and `bool` (as an
+//! `i64` holding `0`/`1`). Everything else is [`JitType::Unknown`], which
+//! makes any region that would need it non-JITable.
+//!
+//! A deliberate restriction keeps deopt simple (see `analyze`): within a
+//!
single compiled region, each local slot and each abstract-stack
+//! position has **one** stable [`JitType`]. Straight-line retyping of a
+//! local (`x = 1; x = 2.0`) is rejected as non-JITable rather than
+//! tracked per-pc.
+
+/// Abstract type attached to every unboxed value the JIT tracks.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub enum JitType {
+    /// CPython `int` held in an `i64`. On overflow the frame deopts and
+    /// the interpreter takes over with a bignum.
+    Int,
+    /// CPython `float`, an `f64`.
+    Float,
+    /// CPython `bool`. Kept apart from `Int` so a deopt rebuilds the
+    /// matching `Object` variant; arithmetic treats it as `Int`.
+    Bool,
+    /// A value with no unboxed representation. Feeding one to a
+    /// supported opcode makes the enclosing region non-JITable.
+    Unknown,
+}
+
+impl JitType {
+    /// Whether the type is one of the three unboxed lanes.
+    #[inline]
+    #[must_use]
+    pub fn is_representable(self) -> bool {
+        self != JitType::Unknown
+    }
+
+    /// Whether the type lives in the integer lane: `Int` and `Bool`
+    /// both use the `i64` machine representation.
+    #[inline]
+    #[must_use]
+    pub fn is_integral(self) -> bool {
+        self == JitType::Int || self == JitType::Bool
+    }
+
+    /// Join of two types at a control-flow merge. Equal types join to
+    /// themselves; any mismatch gives [`JitType::Unknown`]. `Bool` and
+    /// `Int` are deliberately not merged, so a slot that is a bool on
+    /// one path and an int on another makes the region bail out, which
+    /// is conservative but sound.
+    #[inline]
+    #[must_use]
+    pub fn join(self, other: JitType) -> JitType {
+        // Differing types share no single lane.
+        if self != other {
+            return JitType::Unknown;
+        }
+        self
+    }
+}
diff --git a/crates/weavepy-jit/tests/numeric.rs b/crates/weavepy-jit/tests/numeric.rs
new file mode 100644
index 0000000..902d330
--- /dev/null
+++ b/crates/weavepy-jit/tests/numeric.rs
@@ -0,0 +1,269 @@
+//! End-to-end codegen tests over hand-built IR: they compile a
+//!
[`TFunc`] and actually *run* the native code, checking results, the +//! overflow/zero deopt protocol, and Python's floor-division semantics — +//! all without needing the parser or VM. + +use weavepy_jit::{ + ArithKind, CmpKind, JitEngine, JitFrame, JitStatus, JitType, SlotTag, TBlock, TFunc, TOp, + TStmt, TTerm, +}; + +/// Allocate buffers, enter the compiled frame with the given locals, and +/// return `(status, ret_bits, ret_tag, spilled_stack, deopt_pc)`. +fn run(tfunc: &TFunc, locals_in: &[u64]) -> (JitStatus, u64, u32, Vec<(u64, u32)>, u32) { + let mut engine = JitEngine::new().expect("host ISA"); + let cf = engine.compile_tfunc(tfunc).expect("compile"); + + let mut locals = vec![0u64; cf.n_locals as usize]; + for (i, v) in locals_in.iter().enumerate() { + locals[i] = *v; + } + let cap = cf.max_stack as usize + 1; + let mut spill = vec![0u64; cap]; + let mut tags = vec![0u32; cap]; + + let mut frame = JitFrame { + locals: locals.as_mut_ptr(), + n_locals: cf.n_locals, + entry_pc: 0, + ret_bits: 0, + ret_tag: 0, + deopt_pc: 0, + stack_spill: spill.as_mut_ptr(), + stack_tags: tags.as_mut_ptr(), + stack_len: 0, + stack_cap: cap as u32, + }; + // SAFETY: buffers are sized to n_locals / max_stack; `engine` (and so + // the backing module) outlives this call. 
+ let status = unsafe { cf.enter(&raw mut frame) }; + + let mut spilled = Vec::new(); + for i in 0..frame.stack_len as usize { + spilled.push((spill[i], tags[i])); + } + ( + status, + frame.ret_bits, + frame.ret_tag, + spilled, + frame.deopt_pc, + ) +} + +fn st(pc: u32, op: TOp) -> TStmt { + TStmt { pc, op } +} + +#[test] +fn add_two_ints() { + // def f(a, b): return a + b + let tfunc = TFunc { + n_locals: 2, + local_types: vec![Some(JitType::Int), Some(JitType::Int)], + livein_locals: vec![0, 1], + max_stack: 2, + entry_block: 0, + blocks: vec![TBlock { + entry_stack: vec![], + stmts: vec![ + st(0, TOp::LoadLocal(0)), + st(1, TOp::LoadLocal(1)), + st(2, TOp::IntArith(ArithKind::Add)), + ], + term: TTerm::Return, + }], + }; + let (status, bits, tag, _, _) = run(&tfunc, &[(40i64) as u64, (2i64) as u64]); + assert_eq!(status, JitStatus::Returned); + assert_eq!(tag, SlotTag::Int as u32); + assert_eq!(bits as i64, 42); +} + +#[test] +fn add_overflow_deopts_with_operands_spilled() { + // a + b where a = i64::MAX, b = 1 must deopt at the BINARY_OP pc with + // both operands on the spilled stack. 
+    let tfunc = TFunc {
+        n_locals: 2,
+        local_types: vec![Some(JitType::Int), Some(JitType::Int)],
+        livein_locals: vec![0, 1],
+        max_stack: 2,
+        entry_block: 0,
+        blocks: vec![TBlock {
+            entry_stack: vec![],
+            stmts: vec![
+                st(10, TOp::LoadLocal(0)),
+                st(11, TOp::LoadLocal(1)),
+                st(12, TOp::IntArith(ArithKind::Add)),
+            ],
+            term: TTerm::Return,
+        }],
+    };
+    let (status, _, _, spilled, pc) = run(&tfunc, &[i64::MAX as u64, 1u64]);
+    assert_eq!(status, JitStatus::Deopt);
+    assert_eq!(pc, 12);
+    assert_eq!(spilled.len(), 2);
+    assert_eq!(spilled[0].0 as i64, i64::MAX);
+    assert_eq!(spilled[1].0 as i64, 1);
+    assert_eq!(spilled[0].1, SlotTag::Int as u32);
+}
+
+/// Build `def f(n): s=0; i=0; while i<n: s=s+i; i=i+1; return s`.
+fn sum_loop() -> TFunc {
+    TFunc {
+        n_locals: 3, // 0=n, 1=s, 2=i
+        local_types: vec![Some(JitType::Int), Some(JitType::Int), Some(JitType::Int)],
+        livein_locals: vec![0],
+        max_stack: 2,
+        entry_block: 0,
+        blocks: vec![
+            // B0: s=0; i=0; -> B1
+            TBlock {
+                entry_stack: vec![],
+                stmts: vec![
+                    st(0, TOp::PushConstInt(0)),
+                    st(1, TOp::StoreLocal(1)),
+                    st(2, TOp::PushConstInt(0)),
+                    st(3, TOp::StoreLocal(2)),
+                ],
+                term: TTerm::Jump(1),
+            },
+            // B1 header: if i < n -> B2 else B3
+            TBlock {
+                entry_stack: vec![],
+                stmts: vec![
+                    st(4, TOp::LoadLocal(2)),
+                    st(5, TOp::LoadLocal(0)),
+                    st(6, TOp::IntCmp(CmpKind::Lt)),
+                ],
+                term: TTerm::BranchFalse {
+                    target: 3,
+                    fallthrough: 2,
+                },
+            },
+            // B2 body: s=s+i; i=i+1; -> B1
+            TBlock {
+                entry_stack: vec![],
+                stmts: vec![
+                    st(7, TOp::LoadLocal(1)),
+                    st(8, TOp::LoadLocal(2)),
+                    st(9, TOp::IntArith(ArithKind::Add)),
+                    st(10, TOp::StoreLocal(1)),
+                    st(11, TOp::LoadLocal(2)),
+                    st(12, TOp::PushConstInt(1)),
+                    st(13, TOp::IntArith(ArithKind::Add)),
+                    st(14, TOp::StoreLocal(2)),
+                ],
+                term: TTerm::Jump(1),
+            },
+            // B3 exit: return s
+            TBlock {
+                entry_stack: vec![],
+                stmts: vec![st(15, TOp::LoadLocal(1))],
+                term: TTerm::Return,
+            },
+        ],
+    }
+}
+
+#[test]
+fn while_loop_sums() {
+    let tfunc = sum_loop();
+    let (status, bits, tag, _,
_) = run(&tfunc, &[10u64]); + assert_eq!(status, JitStatus::Returned); + assert_eq!(tag, SlotTag::Int as u32); + assert_eq!(bits as i64, 45); // 0+1+..+9 +} + +#[test] +fn while_loop_zero_iterations() { + let tfunc = sum_loop(); + let (status, bits, _, _, _) = run(&tfunc, &[0u64]); + assert_eq!(status, JitStatus::Returned); + assert_eq!(bits as i64, 0); +} + +/// `def f(a, b): return a // b` and `... a % b`, for the floor/modulo +/// semantics that differ from Rust's truncating division on negatives. +fn binop_fn(op: ArithKind) -> TFunc { + TFunc { + n_locals: 2, + local_types: vec![Some(JitType::Int), Some(JitType::Int)], + livein_locals: vec![0, 1], + max_stack: 2, + entry_block: 0, + blocks: vec![TBlock { + entry_stack: vec![], + stmts: vec![ + st(0, TOp::LoadLocal(0)), + st(1, TOp::LoadLocal(1)), + st(2, TOp::IntArith(op)), + ], + term: TTerm::Return, + }], + } +} + +#[test] +fn python_floordiv_semantics() { + let f = binop_fn(ArithKind::FloorDiv); + let cases = [ + (7i64, 2i64, 3i64), + (-7, 2, -4), + (7, -2, -4), + (-7, -2, 3), + (6, 3, 2), + (-6, 3, -2), + ]; + for (a, b, want) in cases { + let (status, bits, _, _, _) = run(&f, &[a as u64, b as u64]); + assert_eq!(status, JitStatus::Returned, "{a} // {b}"); + assert_eq!(bits as i64, want, "{a} // {b}"); + } +} + +#[test] +fn python_mod_semantics() { + let f = binop_fn(ArithKind::Mod); + let cases = [(7i64, 3i64, 1i64), (-7, 3, 2), (7, -3, -2), (-7, -3, -1)]; + for (a, b, want) in cases { + let (status, bits, _, _, _) = run(&f, &[a as u64, b as u64]); + assert_eq!(status, JitStatus::Returned, "{a} % {b}"); + assert_eq!(bits as i64, want, "{a} % {b}"); + } +} + +#[test] +fn floordiv_by_zero_deopts() { + let f = binop_fn(ArithKind::FloorDiv); + let (status, _, _, spilled, pc) = run(&f, &[5u64, 0u64]); + assert_eq!(status, JitStatus::Deopt); + assert_eq!(pc, 2); + assert_eq!(spilled.len(), 2); +} + +#[test] +fn int_truediv_returns_float() { + // def f(a, b): return a / b -> float + let tfunc = TFunc { + 
n_locals: 2, + local_types: vec![Some(JitType::Int), Some(JitType::Int)], + livein_locals: vec![0, 1], + max_stack: 2, + entry_block: 0, + blocks: vec![TBlock { + entry_stack: vec![], + stmts: vec![ + st(0, TOp::LoadLocal(0)), + st(1, TOp::LoadLocal(1)), + st(2, TOp::IntTrueDiv), + ], + term: TTerm::Return, + }], + }; + let (status, bits, tag, _, _) = run(&tfunc, &[7u64, 2u64]); + assert_eq!(status, JitStatus::Returned); + assert_eq!(tag, SlotTag::Float as u32); + assert!((f64::from_bits(bits) - 3.5).abs() < 1e-12); +} diff --git a/crates/weavepy-vm/Cargo.toml b/crates/weavepy-vm/Cargo.toml index 59d1710..fcc2e3d 100644 --- a/crates/weavepy-vm/Cargo.toml +++ b/crates/weavepy-vm/Cargo.toml @@ -63,5 +63,15 @@ parking_lot = { workspace = true } crossbeam-channel = { workspace = true } crossbeam-utils = { workspace = true } +# RFC 0032 — tier-2 Cranelift JIT, behind the (default-off) `jit` feature. +weavepy-jit = { workspace = true, optional = true } + +[features] +default = [] +# Compile the tier-2 JIT integration (pulls in Cranelift). Off by +# default; CI exercises it via `--all-features`. Activated at runtime +# by `WEAVEPY_JIT=1`. +jit = ["dep:weavepy-jit"] + [lints] workspace = true diff --git a/crates/weavepy-vm/src/lib.rs b/crates/weavepy-vm/src/lib.rs index 4517da7..3e152f4 100644 --- a/crates/weavepy-vm/src/lib.rs +++ b/crates/weavepy-vm/src/lib.rs @@ -39,6 +39,10 @@ pub mod specialize; pub mod stdlib; pub mod sync; pub mod thread_registry; +/// RFC 0032 — tier-2 Cranelift JIT integration. Present only under the +/// `jit` feature; the dispatch loop calls into it behind `#[cfg]` gates. +#[cfg(feature = "jit")] +mod tier2; pub mod trace; pub mod types; pub mod vm_singletons; @@ -112,6 +116,21 @@ impl Frame { } } +/// RFC 0032 — render the tier-2 JIT's counters as a markdown block for +/// the `WEAVEPY_VM_STATS` report, or `None` when the `jit` feature is +/// disabled or the JIT was never exercised on this thread. 
+#[must_use] +pub fn jit_stats_markdown() -> Option { + #[cfg(feature = "jit")] + { + crate::tier2::format_stats_markdown() + } + #[cfg(not(feature = "jit"))] + { + None + } +} + // ---------- interpreter ---------- /// Output sink. Either the process's stdout or a `Vec` for @@ -741,6 +760,11 @@ impl Interpreter { frame: &mut Frame, sent: Option, ) -> Result { + // Captured before `sent` is consumed below; only the tier-2 + // entry guard reads it, so it's gated to the `jit` feature to + // stay warning-free in default builds. + #[cfg(feature = "jit")] + let is_resume = sent.is_some(); if let Some(v) = sent { frame.push(v); } @@ -756,6 +780,21 @@ impl Interpreter { if observers_active { self.fire_call_event(&py_frame)?; } + // RFC 0032 — tier-2 entry. Only for a fresh activation (pc 0, + // empty stack, not a generator resume) and only when tracing is + // off, since native code fires no line/return events. A returned + // native frame short-circuits the interpreter loop; a deopt + // rewrites `frame` and falls through to resume interpretation. + #[cfg(feature = "jit")] + if !is_resume && !observers_active && frame.pc == 0 && frame.stack.is_empty() { + match crate::tier2::try_enter(frame) { + crate::tier2::JitEntry::Ran(v) => { + self.pop_py_frame(); + return Ok(FrameOutcome::Returned(v)); + } + crate::tier2::JitEntry::Deopt | crate::tier2::JitEntry::Skip => {} + } + } let result = loop { // Mirror the live `pc` into the snapshot so `f_lineno` // reads correctly when user code introspects via @@ -1451,24 +1490,7 @@ impl Interpreter { } } OpCode::Call => { - let argc = ins.arg as usize; - let split_at = frame.stack.len().saturating_sub(argc); - let mut args: Vec = frame.stack.split_off(split_at); - let callable = frame.pop()?; - // Zero-arg super(): inject __class__ from the free - // cell named "__class__" and `self` from local 0. 
- if args.is_empty() && is_super_callable(&callable) { - if let Some(class_cell) = find_cell(frame, "__class__") { - let class_obj = class_cell.borrow().clone(); - if !matches!(class_obj, Object::None) { - let self_obj = frame.locals.first().cloned().unwrap_or(Object::None); - args.push(class_obj); - args.push(self_obj); - } - } - } - let r = self.call(&callable, &args, &[], &frame.globals)?; - frame.push(r); + self.dispatch_call(frame, cache_pc, ins.arg as usize)?; } OpCode::CallKw => { let argc = ins.arg as usize; @@ -1547,6 +1569,10 @@ impl Interpreter { } OpCode::JumpBackward => { frame.pc = frame.pc.saturating_sub(ins.arg); + // RFC 0032 — a loop back-edge heats the code object so a + // subsequent activation can tier up to native code. + #[cfg(feature = "jit")] + crate::tier2::note_backedge(&frame.code); } OpCode::GetIter => { let v = frame.pop()?; @@ -7154,6 +7180,159 @@ impl Interpreter { } } + /// RFC 0032 — specialized `CALL`. Mirrors the RFC 0021 dispatchers: + /// a warm cache takes an argument-binding-free fast path for a + /// pinned `PyFunction`; `Empty` runs the generic call and attempts + /// specialization; `Cooldown` decrements and stays generic. The + /// super()/argument fixup and the generic dispatch are shared. + fn dispatch_call( + &mut self, + frame: &mut Frame, + cache_pc: u32, + argc: usize, + ) -> Result<(), RuntimeError> { + use weavepy_compiler::InlineCache as IC; + let op_idx = OpCode::Call as u8; + let split_at = frame.stack.len().saturating_sub(argc); + let mut args: Vec = frame.stack.split_off(split_at); + let callable = frame.pop()?; + // Zero-arg super(): inject __class__ and `self`. Never matches a + // pinned-function cache, so it always takes the generic path. 
+ if args.is_empty() && is_super_callable(&callable) { + if let Some(class_cell) = find_cell(frame, "__class__") { + let class_obj = class_cell.borrow().clone(); + if !matches!(class_obj, Object::None) { + let self_obj = frame.locals.first().cloned().unwrap_or(Object::None); + args.push(class_obj); + args.push(self_obj); + } + } + } + let cache = frame.code.caches.get(cache_pc); + match cache { + IC::CallPyExactNoFree { func_id, argc: ca } => { + if ca as usize == argc { + if let Object::Function(f) = &callable { + if specialize::rc_id(f) == func_id && args.len() == argc { + specialize::record_hit(op_idx); + let f = f.clone(); + let r = self.run_py_exact_nofree(&f, args)?; + frame.push(r); + return Ok(()); + } + } + } + self.deopt_call_generic(frame, cache_pc, &callable, &args) + } + IC::CallPyExact { func_id, argc: ca } => { + if ca as usize == argc { + if let Object::Function(f) = &callable { + if specialize::rc_id(f) == func_id && args.len() == argc { + specialize::record_hit(op_idx); + let f = f.clone(); + let r = self.run_py_exact_with_cells(&f, args)?; + frame.push(r); + return Ok(()); + } + } + } + self.deopt_call_generic(frame, cache_pc, &callable, &args) + } + IC::Empty => { + specialize::record_specialize_attempt(op_idx); + let decision = specialize::attempt_specialize_call(&callable, argc); + frame.code.caches.set(cache_pc, decision); + if matches!(decision, IC::Cooldown(_)) { + specialize::record_specialize_skip(op_idx); + } else { + specialize::record_specialize_success(op_idx); + } + let r = self.call(&callable, &args, &[], &frame.globals)?; + frame.push(r); + Ok(()) + } + IC::Cooldown(n) => { + let next = if n > 0 { + IC::Cooldown(n - 1) + } else { + IC::Empty + }; + frame.code.caches.set(cache_pc, next); + let r = self.call(&callable, &args, &[], &frame.globals)?; + frame.push(r); + Ok(()) + } + _ => { + let r = self.call(&callable, &args, &[], &frame.globals)?; + frame.push(r); + Ok(()) + } + } + } + + /// Deopt a `CALL` cache (guard miss): cool 
the slot down and run the + /// generic dispatch. + fn deopt_call_generic( + &mut self, + frame: &mut Frame, + cache_pc: u32, + callable: &Object, + args: &[Object], + ) -> Result<(), RuntimeError> { + specialize::record_miss(OpCode::Call as u8); + frame + .code + .caches + .set(cache_pc, weavepy_compiler::InlineCache::Cooldown(COOLDOWN)); + let r = self.call(callable, args, &[], &frame.globals)?; + frame.push(r); + Ok(()) + } + + /// Fast frame setup for a cell-free, exact-arity Python call: build + /// the locals directly from the arguments (no binding pass, no + /// cells) and run. + fn run_py_exact_nofree( + &mut self, + f: &Rc, + args: Vec, + ) -> Result { + let code = f.code.clone(); + let mut locals = vec![Object::None; code.varnames.len()]; + for (slot, v) in args.into_iter().enumerate() { + locals[slot] = v; + } + let mut frame = Frame { + code, + locals, + cells: Vec::new(), + stack: Vec::with_capacity(16), + globals: f.globals.clone(), + class_namespace: None, + exc_handlers: Vec::new(), + pc: 0, + }; + self.run_frame(&mut frame) + } + + /// Like [`Self::run_py_exact_nofree`] but for functions with cells / + /// a closure: skips argument binding but builds the frame (and its + /// cells) through `make_frame`. + fn run_py_exact_with_cells( + &mut self, + f: &Rc, + args: Vec, + ) -> Result { + let mut frame = self.make_frame( + f.code.clone(), + args, + f.closure.clone(), + f.globals.clone(), + false, + ); + self.run_frame(&mut frame) + } + // ---------- imports (RFC 0012) ---------- /// `IMPORT_NAME` runtime side. Resolves relative imports against @@ -9979,6 +10158,22 @@ mod tests { String::from_utf8(bytes).expect("utf-8") } + /// RFC 0032 — run `src` with the tier-2 JIT forced on, on a fresh + /// thread so the thread-local JIT state can't leak into other + /// tests. Returns `(stdout, frames_compiled, deopts)`. 
+ #[cfg(feature = "jit")] + fn run_jit(src: &str) -> (String, u64, u64) { + let src = src.to_owned(); + std::thread::spawn(move || { + crate::tier2::force_enable_for_test(2); + let out = run(&src); + let (compiled, _entries, deopts) = crate::tier2::stats_for_test(); + (out, compiled, deopts) + }) + .join() + .expect("jit worker thread") + } + #[test] fn runs_print_int() { assert_eq!(run("print(42)\n"), "42\n"); @@ -10027,6 +10222,135 @@ mod tests { assert_eq!(run(src), "8\n"); } + // RFC 0032 — CALL specialization. Each of these drives a single + // call site in a loop so the inline cache warms up and the + // specialized fast path (or its deopt) is exercised, then checks + // the result still matches plain interpretation. + + #[test] + fn call_spec_repeated_plain() { + // `add` has no cells/closure and exact arity → CallPyExactNoFree. + let src = "def add(a, b):\n return a + b\n\ + total = 0\ni = 0\n\ + while i < 50:\n total = total + add(i, i)\n i = i + 1\n\ + print(total)\n"; + assert_eq!(run(src), "2450\n"); + } + + #[test] + fn call_spec_repeated_closure() { + // `add5` closes over `x` → CallPyExact (frame built with cells). + let src = "def make_adder(x):\n def add(y):\n return x + y\n return add\n\ + add5 = make_adder(5)\n\ + total = 0\ni = 0\n\ + while i < 50:\n total = total + add5(i)\n i = i + 1\n\ + print(total)\n"; + assert_eq!(run(src), "1475\n"); + } + + #[test] + fn call_spec_polymorphic_site_deopts() { + // One call site sees two different functions on alternating + // iterations: the per-function guard must miss and fall back to + // the generic path without corrupting results. 
+ let src = "def f(x):\n return x + 1\ndef g(x):\n return x * 2\n\ + funcs = [f, g]\n\ + total = 0\ni = 0\n\ + while i < 10:\n fn = funcs[i % 2]\n total = total + fn(i)\n i = i + 1\n\ + print(total)\n"; + assert_eq!(run(src), "75\n"); + } + + #[test] + fn call_spec_defaults_use_generic_path() { + // Calling with fewer args than params needs default binding, so + // the site must stay on the generic dispatch (Cooldown), not the + // exact-arity fast path. + let src = "def f(a, b=10):\n return a + b\n\ + total = 0\ni = 0\n\ + while i < 20:\n total = total + f(i)\n i = i + 1\n\ + print(total)\n"; + assert_eq!(run(src), "390\n"); + } + + // RFC 0032 — tier-2 JIT integration. Each test forces the JIT on, + // drives a hot `while`-loop kernel through many calls so it tiers + // up, and asserts (a) the JIT actually compiled the kernel, (b) the + // native result matches both the interpreter and CPython. + + #[cfg(feature = "jit")] + #[test] + fn jit_numeric_kernel_matches_interpreter() { + let src = "def kernel(n):\n s = 0\n i = 0\n\ + \x20 while i < n:\n s = s + i * 2 - (i // 3) + (i % 7)\n i = i + 1\n\ + \x20 return s\n\ + def bench(m):\n total = 0\n k = 0\n\ + \x20 while k < m:\n total = total + kernel(50)\n k = k + 1\n\ + \x20 return total\n\ + print(bench(100))\n"; + let (out, compiled, deopts) = run_jit(src); + assert!(compiled >= 1, "JIT never compiled the kernel"); + assert_eq!(deopts, 0, "clean numeric kernel should not deopt"); + assert_eq!(out, "220500\n"); + assert_eq!(out, run(src), "JIT output diverged from the interpreter"); + } + + #[cfg(feature = "jit")] + #[test] + fn jit_floordiv_mod_negative_semantics() { + // Exercises Python floor-division / modulo sign rules in real + // compiled code (operands span negative values). 
+ let src = "def fdmod(n):\n a = 0\n i = 0 - n\n\ + \x20 while i < n:\n a = a + (i // 3) - (i % 5)\n i = i + 1\n\ + \x20 return a\n\ + def bench(m):\n t = 0\n k = 0\n\ + \x20 while k < m:\n t = t + fdmod(40)\n k = k + 1\n\ + \x20 return t\n\ + print(bench(100))\n"; + let (out, compiled, _deopts) = run_jit(src); + assert!(compiled >= 1, "JIT never compiled the kernel"); + assert_eq!(out, "-20000\n"); + assert_eq!(out, run(src)); + } + + #[cfg(feature = "jit")] + #[test] + fn jit_branchy_kernel_matches_interpreter() { + // if/else inside the hot loop → multiple basic blocks and a + // join, exercising the block/terminator lowering. + let src = "def br(n):\n c = 0\n i = 0\n\ + \x20 while i < n:\n if i % 3 == 0:\n c = c + i\n else:\n c = c - 1\n i = i + 1\n\ + \x20 return c\n\ + def bench(m):\n t = 0\n k = 0\n\ + \x20 while k < m:\n t = t + br(60)\n k = k + 1\n\ + \x20 return t\n\ + print(bench(100))\n"; + let (out, compiled, _deopts) = run_jit(src); + assert!(compiled >= 1, "JIT never compiled the kernel"); + assert_eq!(out, "53000\n"); + assert_eq!(out, run(src)); + } + + #[cfg(feature = "jit")] + #[test] + fn jit_overflow_deopts_to_bigint() { + // The accumulator overflows i64 mid-loop: the native code must + // deopt, hand the operands back, and let the interpreter promote + // to a big integer — matching CPython's arbitrary-precision int. 
+ let src = "def okern(n):\n s = 0\n i = 0\n\ + \x20 while i < n:\n s = s + 1000000000000000000\n i = i + 1\n\ + \x20 return s\n\ + def bench(m):\n r = 0\n k = 0\n\ + \x20 while k < m:\n r = okern(20)\n k = k + 1\n\ + \x20 return r\n\ + print(bench(100))\n"; + let (out, compiled, deopts) = run_jit(src); + assert!(compiled >= 1, "JIT never compiled the kernel"); + assert!(deopts >= 1, "overflowing kernel should deopt at least once"); + assert_eq!(out, "20000000000000000000\n"); + assert_eq!(out, run(src), "deopt path diverged from the interpreter"); + } + #[test] fn list_comprehension() { let src = "xs = [x * x for x in range(4)]\nprint(xs)\n"; diff --git a/crates/weavepy-vm/src/specialize.rs b/crates/weavepy-vm/src/specialize.rs index deb7a6a..d0c06db 100644 --- a/crates/weavepy-vm/src/specialize.rs +++ b/crates/weavepy-vm/src/specialize.rs @@ -262,6 +262,45 @@ pub fn attempt_specialize_unpack_sequence(seq: &Object, n: usize) -> InlineCache } } +// ---------- specialization decisions: CALL ---------- + +/// Decide on a `CALL` specialization (RFC 0032). +/// +/// We only specialize the *exact positional arity, no keywords* shape — +/// the call site supplies precisely `arg_count` positionals and the +/// function declares no `*args`/`**kwargs`/keyword-only parameters. That +/// lets the fast path skip the entire argument-binding pass in +/// `call_python`. Generators/coroutines are excluded (their call returns +/// a suspended object, not a frame result). Functions with cells take +/// the `CallPyExact` shape (still skips binding, but builds cells via +/// `make_frame`); cell-free functions take the leaner `CallPyExactNoFree`. 
+pub fn attempt_specialize_call(callable: &Object, argc: usize) -> InlineCache { + match callable { + Object::Function(f) => { + let code = &f.code; + if code.is_generator || code.is_coroutine || code.is_async_generator { + return InlineCache::Cooldown(COOLDOWN); + } + if code.has_varargs || code.has_varkeywords || code.kwonly_count != 0 { + return InlineCache::Cooldown(COOLDOWN); + } + // Only the exact-arity shape: anything needing defaults (too + // few) or *args overflow (too many) keeps the generic path. + if code.arg_count as usize != argc { + return InlineCache::Cooldown(COOLDOWN); + } + let func_id = rc_id(f); + let argc = u32::try_from(argc).unwrap_or(u32::MAX); + if code.cellvars.is_empty() && code.freevars.is_empty() && f.closure.is_empty() { + InlineCache::CallPyExactNoFree { func_id, argc } + } else { + InlineCache::CallPyExact { func_id, argc } + } + } + _ => InlineCache::Cooldown(COOLDOWN), + } +} + // ---------- shared helpers ---------- /// Cheap fingerprint for an `Rc`. Two clones of the same diff --git a/crates/weavepy-vm/src/stdlib/struct_mod.rs b/crates/weavepy-vm/src/stdlib/struct_mod.rs index 8c1c415..0f5430f 100644 --- a/crates/weavepy-vm/src/stdlib/struct_mod.rs +++ b/crates/weavepy-vm/src/stdlib/struct_mod.rs @@ -244,7 +244,7 @@ impl CompiledFormat { } fn iter_unpack(&self, buf: &[u8]) -> Result>, RuntimeError> { - if buf.len() % self.size != 0 { + if !buf.len().is_multiple_of(self.size) { return Err(struct_error(format!( "iterative unpacking requires a buffer of a multiple of {} bytes", self.size diff --git a/crates/weavepy-vm/src/tier2.rs b/crates/weavepy-vm/src/tier2.rs new file mode 100644 index 0000000..2093035 --- /dev/null +++ b/crates/weavepy-vm/src/tier2.rs @@ -0,0 +1,335 @@ +//! RFC 0032 — the VM side of the tier-2 Cranelift JIT. +//! +//! This module is compiled only with the `jit` feature. It owns a +//! per-thread [`weavepy_jit::JitEngine`] and a hot-counter cache keyed by +//! 
`CodeObject` identity, decides when a frame is hot enough to compile, +//! applies the entry type-guard, marshals locals into a +//! [`weavepy_jit::JitFrame`], enters the native code, and reconstructs +//! interpreter state on a deopt side exit. +//! +//! Everything here runs under the GIL on a single thread, so the engine, +//! cache, and the raw function pointers they hand out never cross thread +//! boundaries — hence the thread-local state and the plain [`StdRc`]. + +use std::cell::RefCell; +use std::collections::HashMap; +use std::rc::Rc as StdRc; + +use weavepy_compiler::CodeObject; +use weavepy_jit::{CompiledFrame, JitEngine, JitFrame, JitStatus, JitType, SlotTag}; + +use crate::object::Object; +use crate::sync::Rc; + +/// What happened when the VM offered a frame to the JIT. +pub(crate) enum JitEntry { + /// The native frame ran to completion; this is its return value. + Ran(Object), + /// The native frame deopted; `frame.pc` / locals / stack have been + /// rewritten and the interpreter should resume. + Deopt, + /// The frame was not entered (cold, not JITable, or guard failed); + /// run the interpreter as usual. + Skip, +} + +/// Per-`CodeObject` compilation state. +enum Tier { + Cold, + NotJitable, + Compiled(StdRc), +} + +struct CacheEntry { + counter: u32, + tier: Tier, + /// Keeps the code object alive so its address can't be reused while + /// this entry (and any compiled pointer keyed by it) is live. + _code: Rc, +} + +/// JIT counters surfaced through `WEAVEPY_VM_STATS`. 
+#[derive(Default, Clone)] +pub(crate) struct JitStats { + pub frames_seen: u64, + pub frames_compiled: u64, + pub frames_notjitable: u64, + pub native_entries: u64, + pub deopts: u64, + pub entry_guard_failures: u64, +} + +struct JitState { + enabled: bool, + threshold: u32, + engine: Option, + cache: HashMap<*const CodeObject, CacheEntry>, + stats: JitStats, +} + +impl JitState { + fn new() -> JitState { + let enabled = match std::env::var("WEAVEPY_JIT") { + Ok(v) => v != "0" && !v.eq_ignore_ascii_case("off") && !v.is_empty(), + Err(_) => false, + }; + let threshold = std::env::var("WEAVEPY_JIT_THRESHOLD") + .ok() + .and_then(|v| v.parse::().ok()) + .filter(|n| *n > 0) + .unwrap_or(50); + JitState { + enabled, + threshold, + engine: None, + cache: HashMap::new(), + stats: JitStats::default(), + } + } + + /// Bump the hot counter for `code` and, once it crosses the + /// threshold, attempt compilation. Returns the compiled frame when + /// one is available. + fn get_compiled(&mut self, code: &Rc) -> Option> { + let key = Rc::as_ptr(code).cast::(); + { + let entry = self.cache.entry(key).or_insert_with(|| CacheEntry { + counter: 0, + tier: Tier::Cold, + _code: code.clone(), + }); + match &entry.tier { + Tier::Compiled(cf) => return Some(cf.clone()), + Tier::NotJitable => return None, + Tier::Cold => { + entry.counter += 1; + if entry.counter < self.threshold { + return None; + } + } + } + } + // Threshold reached: compile (engine + cache borrowed disjointly). + if self.engine.is_none() { + self.engine = JitEngine::new(); + if self.engine.is_none() { + // Host ISA unavailable — disable so we stop retrying. 
+ self.enabled = false; + return None; + } + } + let engine = self.engine.as_mut()?; + let (tier, out) = match engine.compile(code) { + Ok(cf) => { + self.stats.frames_compiled += 1; + let rc = StdRc::new(cf); + (Tier::Compiled(rc.clone()), Some(rc)) + } + Err(_) => { + self.stats.frames_notjitable += 1; + (Tier::NotJitable, None) + } + }; + if let Some(entry) = self.cache.get_mut(&key) { + entry.tier = tier; + } + out + } + + fn note_backedge(&mut self, code: &Rc) { + if !self.enabled { + return; + } + let key = Rc::as_ptr(code).cast::(); + let entry = self.cache.entry(key).or_insert_with(|| CacheEntry { + counter: 0, + tier: Tier::Cold, + _code: code.clone(), + }); + if matches!(entry.tier, Tier::Cold) { + entry.counter = entry.counter.saturating_add(1); + } + } +} + +thread_local! { + static JIT: RefCell = RefCell::new(JitState::new()); +} + +/// Reconstruct an [`Object`] from a `(bits, tag)` slot. +fn unpack(bits: u64, tag: u32) -> Object { + match SlotTag::from_raw(tag) { + SlotTag::Int => Object::Int(bits as i64), + SlotTag::Float => Object::Float(f64::from_bits(bits)), + SlotTag::Bool => Object::Bool(bits != 0), + } +} + +/// Reconstruct an [`Object`] from a slot whose lane is statically known. +fn unpack_ty(bits: u64, ty: JitType) -> Object { + match ty { + JitType::Int => Object::Int(bits as i64), + JitType::Float => Object::Float(f64::from_bits(bits)), + JitType::Bool => Object::Bool(bits != 0), + JitType::Unknown => Object::None, + } +} + +/// Pack a representable [`Object`] into its slot bits for `ty`, or `None` +/// if it doesn't match the expected lane. +fn pack(obj: &Object, ty: JitType) -> Option { + match (ty, obj) { + (JitType::Int, Object::Int(i)) => Some(*i as u64), + (JitType::Bool, Object::Bool(b)) => Some(u64::from(*b)), + (JitType::Float, Object::Float(f)) => Some(f.to_bits()), + _ => None, + } +} + +/// Bump the back-edge hot counter for a code object (no-op when the JIT +/// is disabled). 
+pub(crate) fn note_backedge(code: &Rc) { + JIT.with(|cell| cell.borrow_mut().note_backedge(code)); +} + +/// Offer a fresh frame (pc 0, empty stack) to the JIT. See [`JitEntry`]. +pub(crate) fn try_enter(frame: &mut super::Frame) -> JitEntry { + // Phase 1: counter + compilation, holding the state borrow briefly. + let cf = JIT.with(|cell| { + let mut st = cell.borrow_mut(); + if !st.enabled { + return None; + } + st.stats.frames_seen += 1; + st.get_compiled(&frame.code) + }); + let Some(cf) = cf else { + return JitEntry::Skip; + }; + + // Phase 2: entry type-guard on the live-in locals. + for &slot in &cf.livein { + let ty = match cf.local_types.get(slot as usize).copied().flatten() { + Some(t) => t, + None => return JitEntry::Skip, + }; + let ok = frame + .locals + .get(slot as usize) + .and_then(|o| pack(o, ty)) + .is_some(); + if !ok { + JIT.with(|cell| cell.borrow_mut().stats.entry_guard_failures += 1); + return JitEntry::Skip; + } + } + + // Phase 3: marshal locals and enter native code. + let n = cf.n_locals as usize; + let mut locals_buf = vec![0u64; n]; + for (slot, dst) in locals_buf.iter_mut().enumerate() { + if let Some(ty) = cf.local_types[slot] { + *dst = frame + .locals + .get(slot) + .and_then(|o| pack(o, ty)) + .unwrap_or(0); + } + } + let cap = cf.max_stack as usize + 1; + let mut spill = vec![0u64; cap]; + let mut tags = vec![0u32; cap]; + let mut jf = JitFrame { + locals: locals_buf.as_mut_ptr(), + n_locals: cf.n_locals, + entry_pc: 0, + ret_bits: 0, + ret_tag: 0, + deopt_pc: 0, + stack_spill: spill.as_mut_ptr(), + stack_tags: tags.as_mut_ptr(), + stack_len: 0, + stack_cap: cap as u32, + }; + + // SAFETY: `locals_buf` is `n_locals` wide and `spill`/`tags` are + // `max_stack + 1` wide, matching what the compiled frame was built + // to address; the engine that backs `cf` lives in this thread's + // `JIT` thread-local for the process lifetime. 
+ let status = unsafe { cf.enter(&raw mut jf) }; + + JIT.with(|cell| { + let mut st = cell.borrow_mut(); + st.stats.native_entries += 1; + if matches!(status, JitStatus::Deopt) { + st.stats.deopts += 1; + } + }); + + match status { + JitStatus::Returned => JitEntry::Ran(unpack(jf.ret_bits, jf.ret_tag)), + JitStatus::Deopt => { + // Write back managed locals, rebuild the operand stack from + // the spill, and resume at the deopt pc. + for (slot, &bits) in locals_buf.iter().enumerate() { + if let Some(ty) = cf.local_types[slot] { + frame.locals[slot] = unpack_ty(bits, ty); + } + } + for i in 0..jf.stack_len as usize { + frame.stack.push(unpack(spill[i], tags[i])); + } + frame.pc = jf.deopt_pc; + JitEntry::Deopt + } + } +} + +/// Test hook: force the JIT on for the current thread with a low +/// tier-up threshold, regardless of `WEAVEPY_JIT`. Compiled only in +/// test builds so it never reaches release binaries. +#[cfg(test)] +pub(crate) fn force_enable_for_test(threshold: u32) { + JIT.with(|cell| { + let mut st = cell.borrow_mut(); + st.enabled = true; + st.threshold = threshold.max(1); + }); +} + +/// Test hook: `(frames_compiled, native_entries, deopts)` for the +/// current thread. +#[cfg(test)] +pub(crate) fn stats_for_test() -> (u64, u64, u64) { + JIT.with(|cell| { + let s = &cell.borrow().stats; + (s.frames_compiled, s.native_entries, s.deopts) + }) +} + +/// Render the JIT counters as markdown rows, or `None` if the JIT was +/// never exercised on this thread. 
+pub(crate) fn format_stats_markdown() -> Option { + JIT.with(|cell| { + let st = cell.borrow(); + let s = &st.stats; + if s.frames_seen == 0 { + return None; + } + Some(format!( + "\n## Tier-2 JIT stats\n\n\ + - frames seen: **{}**\n\ + - frames compiled: **{}**\n\ + - frames not JITable: **{}**\n\ + - native entries: **{}**\n\ + - deopts: **{}**\n\ + - entry-guard failures: **{}**\n", + s.frames_seen, + s.frames_compiled, + s.frames_notjitable, + s.native_entries, + s.deopts, + s.entry_guard_failures, + )) + }) +} diff --git a/crates/weavepy/Cargo.toml b/crates/weavepy/Cargo.toml index cf5ac8a..11b4f99 100644 --- a/crates/weavepy/Cargo.toml +++ b/crates/weavepy/Cargo.toml @@ -20,5 +20,10 @@ weavepy-parser = { workspace = true } weavepy-vm = { workspace = true } thiserror = { workspace = true } +[features] +default = [] +# RFC 0032 — forward the tier-2 JIT feature down to the VM. +jit = ["weavepy-vm/jit"] + [lints] workspace = true diff --git a/docs/rfcs/0032-tier2-jit-and-call-specialization.md b/docs/rfcs/0032-tier2-jit-and-call-specialization.md new file mode 100644 index 0000000..98d6380 --- /dev/null +++ b/docs/rfcs/0032-tier2-jit-and-call-specialization.md @@ -0,0 +1,600 @@ +# RFC 0032: Tier-2 — a Cranelift JIT for hot numeric frames + CALL specialization + +- **Status**: Accepted +- **Authors**: WeavePy authors +- **Created**: 2026-05-29 +- **Tracking issue**: TBD +- **Builds on**: RFC 0021 (adaptive specialization / inline caches), + RFC 0024/0025 (GIL + cross-thread heap), RFC 0031 (observability hot path) + +## Summary + +RFC 0021 shipped the "tier-1 baseline": per-instruction inline caches and +PEP 659-style adaptive specialization for the seven hottest opcodes. It +**deliberately deferred two things** and named them the next perf RFC: + +> - **`CALL` specialization.** The single largest remaining opcode-level +> perf gap. 
+> - **Tier-2: Cranelift JIT.** "Once the adaptive interpreter is recording +> stable type observations, a tier-2 JIT can compile hot frames to +> native code … this RFC builds the data-collection layer they need." + +RFC 0032 cashes both checks. After it lands: + +- The `CALL` opcode gains **five inline-cache fast paths** in the + interpreter — `CallPyExact`, `CallPyExactNoFree`, `CallBuiltinFast`, + `CallBoundMethodExact`, and `CallTypeConstructor1` — that skip the + ~120-arm `Interpreter::call` dispatch chain and the elaborate + `call_python` argument-binding loop when the call shape is simple and + stable. This is pure interpreter work, always on, and warms through + the same `Empty → Specialized → Cooldown` cycle as every other RFC + 0021 cache. + +- A new **`weavepy-jit`** crate hosts a **tier-2 method JIT** backed by + **Cranelift** (`cranelift-jit` + `cranelift-frontend` + + `cranelift-codegen` + `cranelift-module`). The JIT compiles a code + object's **unboxed numeric/control-flow core** to native machine code: + `LOAD_FAST` / `STORE_FAST` / `LOAD_CONST` of `int` / `float` / `bool`, + `BINARY_OP` / `COMPARE_OP` / `UNARY_OP` on `int` / `float`, and the + conditional and unconditional jumps (`POP_JUMP_IF_*`, `JUMP_FORWARD`, + `JUMP_BACKWARD`) plus `RETURN_VALUE`. The headline case is the + **`while`-style integer/float loop**, which lowers to this subset with + no iterator protocol. `for … range(…)` loops are *not* in the v1 + subset: they compile to a `CALL range` + `GET_ITER` + `FOR_ITER` + iterator dance that needs an OSR-with-iterator-state path (future + work). Frames whose hot region steps outside the subset are left to + the interpreter — the JIT never emits native code for an operation + whose semantics it can't reproduce exactly. + +- The VM gains a **per-`CodeObject` hot counter** (the tiering trigger + RFC 0021 said the JIT would need but didn't build). 
Frame entry and + every `JUMP_BACKWARD` back-edge bump it; when it crosses + `JIT_HOT_THRESHOLD`, the frame is handed to the JIT compiler once. The + result is cached on the code object (keyed by `Arc` identity) as + `Compiled(fn)` or `NotJitable` so we never re-attempt a frame we've + already rejected. + +- **Guards and deopt.** A compiled frame is entered only after an **entry + guard** confirms the participating locals hold the expected unboxed + types. Inside native code, integer arithmetic uses **checked** ops: + on i64 overflow — or any other condition the fast path can't handle — + the native function takes a **side exit**, writes the live register + state back into the frame's locals, and returns a `Deopt { pc }` + status so the interpreter resumes at exactly that bytecode offset with + identical state. Deopt is always semantically transparent: the JIT is + a pure accelerator, never a source of observable behavior change. + +- **On-stack replacement (OSR)** is designed-for but **deferred** in + v1: the hot counter fires on back-edges, but the JIT enters only at + the function start (pc = 0), so a function must be *re-entered* (called + again) to run native — which covers the common "hot helper called in a + loop / repeatedly" case and the bench harness. Lifting an + already-running loop mid-flight (true OSR) needs the multi-entry + machinery sketched below and lands in a follow-up. + +- The JIT is **off by default** and gated three ways: the `jit` Cargo + feature on `weavepy-vm` / `weavepy-cli` / `weavepy-bench` (built by + CI's `--all-features`, absent from a plain `cargo build`), and the + `WEAVEPY_JIT=1` environment variable (or `-X jit`) at runtime. With + the feature off the VM compiles a zero-cost no-op shim; with the + feature on but the env var unset, the JIT hooks return early: the hot + counter does not tick and the compiler is never invoked. 
+ +- The **bench harness** learns to capture the host-CPython baseline + (the existing `bench.json` has `"cpython": null` because runs passed + `--no-cpython`) and to run WeavePy in three modes — interpreter, + tier-1 (specialized), and tier-2 (JIT) — so the speedup of each tier + is a tracked, regression-gated number. `WEAVEPY_VM_STATS` grows JIT + counters (frames compiled, native entries, deopts, bailouts). + +Net diff: **~22–30K LOC** (the `weavepy-jit` crate, the VM integration +and CALL specialization, the bench/stat wiring, fixtures, tests, and +this RFC), plus the Cranelift dependency tree. + +## Motivation + +A drop-in replacement that is correct but 10–50× slower than CPython is +not, in practice, a drop-in replacement — nobody swaps in an interpreter +that turns a 2-second script into a 40-second one. RFC 0020 made every +workflow *work*; RFC 0021 made the dispatch loop *competitive* with a +naive switch; but the project's stated goal #2 ("Performance second, but +seriously … tiered execution, inline caches, specialization, and a JIT +are all on the long-term roadmap") still had two unchecked boxes, and +they are the two that matter most for hot code: + +1. **Calls dominate real Python.** Every method call, every helper, every + recursion step goes through `OpCode::Call → Interpreter::call → + call_python`. `call()` is a ~120-arm `if b.name == "..."` ladder for + builtins plus a match over callable kinds; `call_python` rebuilds a + `Vec` of locals, runs a keyword-binding loop, applies + defaults, and constructs a `Frame` — on *every* call, even + `f(x)` where `f` is a plain two-arg Python function called a million + times. CPython specializes exactly this (`CALL_PY_EXACT_ARGS`, + `CALL_BUILTIN_FAST`, `CALL_BOUND_METHOD_EXACT_ARGS`, …); we deferred + it in RFC 0021 to keep that RFC reviewable. It is the cheapest large + win left in the interpreter. + +2. 
**Hot numeric loops want native code.** `fib`, `nbody`, + `nested_loops`, and `sumvm` in our own bench suite are tight loops + over `int` / `float`. The tier-1 specialization removed the + dunder-search and the dict-keyed lookups, but every iteration still + pays for: the `match ins.op` dispatch, the `Object` enum tag + check/clone, the `Vec` stack push/pop, and the per-opcode + cache read. A method JIT collapses an entire loop body into a handful + of machine instructions operating on values in registers. This is + the difference between "single-digit× slower than CPython" and + "competitive with or faster than CPython" on numeric kernels. + +RFC 0021 explicitly built the data-collection layer the JIT consumes: +the inline caches already record, per call site, which concrete types +flow through each `BINARY_OP` / `COMPARE_OP` / `FOR_ITER`. The JIT reads +those caches to decide what to assume, and emits the matching guards. +The two threads of this RFC are therefore the natural, pre-planned +continuation of 0021 rather than a new direction. + +## CPython reference + +This RFC tracks **CPython 3.13** for the call-specialization shapes and +the deopt discipline, and borrows the *architecture* (not the +implementation) of tiered JITs from the wider ecosystem. + +- **`Python/specialize.c` / `Python/bytecodes.c`** — the + `CALL_PY_EXACT_ARGS`, `CALL_BOUND_METHOD_EXACT_ARGS`, + `CALL_BUILTIN_FAST`, `CALL_TYPE_1` specialized opcodes and their + guards (function-version check, arg-count match, no-kwargs, builtin + flags). Our five fast paths mirror that set. +- **PEP 659** — the warm-up / fingerprint-guard / deopt model RFC 0021 + adopted; CALL specialization reuses it verbatim. +- **CPython 3.13's experimental tier-2 / "copy-and-patch" JIT + (`Tools/jit/`)** — informal reference for the *idea* of compiling hot + micro-ops to native code with deopt side exits. 
We do not adopt + copy-and-patch; we use Cranelift as a real optimizing backend, which + is closer in spirit to: +- **PyPy's meta-tracing JIT** and **Cinder's HIR/LIR method JIT** — for + the guard/deopt/OSR discipline: a compiled trace is valid only while + its type assumptions hold, and any violation transfers control back to + the interpreter at a well-defined bytecode boundary with reconstructed + state. +- **Cranelift** (`cranelift-jit`, as used by Wasmtime) — the codegen + backend. Chosen over LLVM for a far smaller blast radius, fast + compile times suitable for a JIT (not an AOT compiler), pure-Rust + build, and proven cross-platform support (x86-64 + aarch64 on Linux / + macOS / Windows). + +We deliberately do **not**: + +- JIT the full opcode set. Containers, attribute access, calls into + Python/builtins, exceptions, generators, and the import machinery stay + in the interpreter. The JIT is a *numeric-core accelerator*, not a + whole-language compiler. (Calls *out* of a JITed frame deopt; calls + are accelerated by the tier-1 CALL specialization instead.) +- Promote `int` past `i64`. The unboxed integer path is `i64`; overflow + deopts to the interpreter, which constructs `Object::Long`. This + matches the bet RFC 0021's `BinOpAddInt` fast path already makes. +- Persist compiled code across runs. The JIT cache is per-process, like + the inline caches (and like CPython's). +- Implement register allocation or instruction selection ourselves — + Cranelift owns that. + +## Detailed design + +### Part A — `CALL` specialization (interpreter, tier-1) + +#### New `InlineCache` variants + +`weavepy-compiler/src/bytecode.rs` grows five variants on the existing +`InlineCache` enum (still `Copy`, still ≤ 24 bytes): + +```rust +pub enum InlineCache { + // ... existing RFC 0021 variants ... + + // CALL family (RFC 0032). 
+ /// Callable is a specific `PyFunction`; arg count matches exactly; + /// no *args/**kwargs/defaults/kwonly needed; the function has no + /// free variables (closure empty) so the frame needs no cells. + CallPyExactNoFree { func_id: u64, argc: u32 }, + /// Same, but the function carries a closure; the fast path still + /// skips arg-binding but builds cells. + CallPyExact { func_id: u64, argc: u32 }, + /// Callable is a specific Rust builtin known to be pure w.r.t. the + /// call protocol (no kwargs handling needed); skip the name ladder. + CallBuiltinFast { builtin_id: u64, argc: u32 }, + /// Callable is a bound method whose function is a `PyFunction` with + /// an exact-arity body; prepend `self` and dispatch as `CallPyExact`. + CallBoundMethodExact { func_id: u64, argc: u32 }, + /// Callable is a type with a one-argument constructor fast path + /// (`int`/`float`/`str`/`bool`/`list`/`tuple` of one arg). + CallTypeConstructor1 { type_id: u64 }, +} +``` + +`func_id` / `builtin_id` / `type_id` are `Rc::as_ptr(...) as u64` +fingerprints, identical in spirit to RFC 0021's `type_id`. The guard +re-checks the fingerprint on every dispatch; a miss deopts to +`Cooldown(COOLDOWN)` exactly as the existing caches do. + +#### The fast paths + +In `Interpreter::step`, the `OpCode::Call` arm is restructured to mirror +`BINARY_OP`: + +```rust +OpCode::Call => { + match frame.code.caches.get(cache_pc) { + InlineCache::CallPyExactNoFree { func_id, argc } => { + if self.try_call_py_exact_nofree(frame, func_id, argc)? { /* done */ } + else { self.call_generic_and_specialize(frame, ins.arg, cache_pc)?; } + } + // ... other variants ... + InlineCache::Empty => self.call_generic_and_specialize(frame, ins.arg, cache_pc)?, + InlineCache::Cooldown(n) => { decrement; self.call_generic(frame, ins.arg)?; } + _ => self.call_generic_and_specialize(frame, ins.arg, cache_pc)?, + } +} +``` + +`try_call_py_exact_nofree` is the hot one. 
Guard: TOS-(argc) is the +cached `Object::Function`, `args.len() == code.arg_count`, +`!has_varargs && !has_varkeywords && code.kwonly_count == 0`, +`code.cellvars.is_empty() && code.freevars.is_empty()`, and the call +site has no keyword args. On a hit it builds the locals `Vec` directly +(positionals slid into place, padded with `None`), constructs the +`Frame`, and runs it — skipping the entire keyword/default/`*args` +machinery in `call_python`. The specializer +`call_generic_and_specialize` runs the existing generic `call()` and, if +the observed callable + arg shape matches one of the five patterns, +installs the corresponding cache. + +`CallPyExact` is selected when `code.arg_count == argc` but the function +has a closure; it skips arg-binding but still runs `make_frame` for the +cells. `CallBuiltinFast` covers the common arity-checked builtins that +don't need the kwargs branch. `CallBoundMethodExact` handles `x.f(a)` +where `f` resolves to a plain method. `CallTypeConstructor1` covers +`int(x)` / `float(x)` / `str(x)`-shaped one-arg type calls. + +Generators / coroutines / async generators are **never** specialized +(their call returns a suspended object, not a frame result) — the guard +checks `!code.is_generator && !code.is_coroutine && !code.is_async_generator`. + +### Part B — the tier-2 JIT (`weavepy-jit`) + +#### Crate layout + +``` +crates/weavepy-jit/ +├── Cargo.toml # cranelift-* deps; `default-members`-excluded? 
no — see gating +├── src/ +│ ├── lib.rs # public API: JitEngine, JitStatus, compile(), enter() +│ ├── analyze.rs # JITability analysis over a CodeObject +│ ├── ir.rs # the typed mid-IR (TInstr) the analyzer emits +│ ├── lower.rs # TInstr -> Cranelift IR (FunctionBuilder) +│ ├── runtime.rs # the ABI: JitFrame layout, side-exit struct, helpers +│ ├── engine.rs # JITModule lifecycle, function cache, codegen ctx +│ └── value.rs # the unboxed value representation + type lattice +└── tests/ + └── numeric.rs # compile + run numeric kernels, compare to expected +``` + +`weavepy-jit` depends only on `weavepy-compiler` (for `CodeObject` / +`OpCode` / `Instruction` / `InlineCache`) and the Cranelift crates. It +does **not** depend on `weavepy-vm`, to avoid a cycle: the VM owns the +`Object` model and calls *into* the JIT, passing an erased pointer to +the frame's numeric slots and a couple of callback function pointers for +the rare runtime-assist cases. The JIT speaks only in `i64` / `f64` / +`bool` lanes plus the side-exit protocol. + +#### The unboxed value model (`value.rs`) + +The JIT reasons about a small type lattice: + +```rust +enum JitType { Int, Float, Bool, Unknown } +``` + +Every operand-stack slot and every participating local is assigned a +`JitType` by abstract interpretation during analysis. Only `Int` +(backed by `i64`), `Float` (`f64`), and `Bool` (`i8`) are +representable; anything that would produce `Unknown` makes the region +non-JITable. Inside Cranelift, `Int`/`Bool` are `types::I64`/`I8` and +`Float` is `types::F64`. + +#### JITability analysis (`analyze.rs`) + +Given a `CodeObject`, the analyzer walks the instruction stream and +builds a control-flow graph at the bytecode level (basic blocks split at +jump targets and after branches). It then runs a forward abstract +interpretation tracking the `JitType` of every stack slot and local. A +code object is **JITable** iff: + +1. 
Every opcode is in the supported set: + `Nop`, `Resume`, `LoadConst` (int/float/bool only), `LoadFast`, + `StoreFast`, `BinaryOp` (Add/Sub/Mult/FloorDiv/Mod/And/Or/Xor on int; + Add/Sub/Mult/Div on float; true-`Div` on int → float), `CompareOp`, + `UnaryOp` (Neg/Pos/Not/Invert), `PopJumpIfTrue`, `PopJumpIfFalse`, + `JumpForward`, `JumpBackward`, `CopyTop`, `Swap`, `PopTop`, + `ReturnValue`. (`FOR_ITER`/`GET_ITER` and therefore `for … range` + loops are explicitly out of the v1 subset — see future work.) +2. The abstract interpreter never needs `Unknown` for an operand to a + supported opcode (e.g. `int + str` is out; the analyzer sees `Str` + inputs are impossible to represent and bails). Arithmetic/compare + operands must share a lane (both `int`/`bool` or both `float`); + mixed `int`/`float` bails, except `int / int` which lowers to a + dedicated float-producing op. +3. The operand stack is **empty at every basic-block boundary** — + true for ordinary numeric code, but it rules out short-circuit + `and`/`or` and `a if c else b` in the hot region (they leave a value + live across a branch). Those need Cranelift block parameters and are + future work. Each local slot has a single stable [`JitType`] across + the region (straight-line retyping bails). + +The verdict is recorded so it is computed at most once per code object. +The supported set is intentionally the same family RFC 0021 already +specializes, so a JITed frame's assumptions match the inline-cache +observations. + +#### Mid-IR (`ir.rs`) + +Rather than emit Cranelift directly from bytecode, the analyzer lowers +the supported opcodes to a tiny typed IR (`TInstr`) over virtual +registers (the abstract stack). This decouples the bytecode quirks +(stack discipline, `arg` packing) from Cranelift emission and keeps +`lower.rs` a straight syntax-directed translation. 
Example `TInstr`s: +`ConstI64(reg, v)`, `LoadLocalI64(reg, slot)`, `StoreLocalI64(slot, +reg)`, `IAdd(dst, a, b)`, `FCmp(dst, op, a, b)`, `BrIf(reg, then_bb, +else_bb)`, `Br(bb)`, `DeoptIf(cond, pc)`, `Deopt(pc)`, `RetI64(reg)`. + +#### Cranelift lowering (`lower.rs`) + +Each compiled code object becomes one Cranelift function with the ABI: + +``` +fn(jit_frame: *mut JitFrame) -> i64 // returns a JitStatus discriminant +``` + +`JitFrame` (`runtime.rs`) is a `#[repr(C)]` struct the VM fills before +entry and reads after exit: + +```rust +#[repr(C)] +pub struct JitFrame { + /// Pointer to a slab of i64-sized slots, one per local. Ints and + /// bools live here directly; floats are bit-cast through the same + /// slot. The VM packs/unpacks against its `Object` locals around + /// the call. + pub locals: *mut u64, + pub n_locals: u32, + /// On a `Returned` exit: the return value (bit pattern + a tag + /// the VM uses to rebuild an `Object`). + pub ret_bits: u64, + pub ret_tag: u32, + /// On a `Deopt` exit: the bytecode pc to resume at, plus the live + /// operand-stack contents (so the interpreter can rebuild its + /// stack). Stack values are written here top-down. + pub deopt_pc: u32, + pub stack_spill: *mut u64, + pub stack_spill_tags: *mut u32, + pub stack_len: u32, +} +``` + +Locals are loaded into Cranelift SSA values at function entry (or at the +OSR entry block), arithmetic is emitted inline, and `STORE_FAST` +writes back to the SSA value (mem write-back happens only on exit). The +function has exactly the basic-block structure the analyzer computed; +back-edges become Cranelift loop back-edges, so Cranelift's own +optimizations (LICM-adjacent, GVN, regalloc) apply. + +Integer `BINARY_OP` emits `iadd`/`isub`/`imul` **with an overflow +check** (`iadd_cof` / explicit `icmp` on the carry) and a `DeoptIf` to a +side-exit block on overflow. Float ops emit directly. `COMPARE_OP` +emits `icmp` / `fcmp`. 
Truth tests for the jumps emit the same +zero/NaN-aware logic the interpreter uses. + +#### Guards, side exits, and deopt (`runtime.rs` + VM) + +Two guard layers keep the JIT transparent: + +1. **Entry guard (VM-side).** Before entering native code, the VM checks + that every *live-in* local the analysis marked `Int`/`Float`/`Bool` + actually holds that `Object` variant. If not, it does **not** enter — + it runs the interpreter for this activation. (Cheap: a handful of + `matches!` checks, only at the tiering boundary, not per iteration.) + +2. **Side exits (native-side).** Conditions that can arise mid-execution + — i64 overflow, a `range` whose step/stop don't fit the fast path, a + division by zero, a value that flowed into `Unknown` despite the + static type (shouldn't happen given the entry guard, but defended) — + branch to a side-exit block that spills the live SSA registers into + `JitFrame.stack_spill` (with tags), sets `deopt_pc`, and returns + `JitStatus::Deopt`. The VM then **rebuilds its operand stack and + locals from the spill** and resumes interpretation at `deopt_pc`. The + bytecode offset and stack shape are chosen so resumption is + bit-for-bit identical to never having entered the JIT. + +Division by zero and other *raising* conditions deopt rather than raise +from native code: the interpreter re-executes the offending opcode and +raises the exception through the normal path, so tracebacks, line +numbers, and `sys.settrace` events are unaffected. + +#### OSR (on-stack replacement) + +When the hot counter fires on a `JUMP_BACKWARD` (a loop back-edge), the +frame is already mid-execution. The compiled function therefore exposes +**multiple entry points**: a normal entry (pc = 0) and one OSR entry per +loop header. The VM, holding a live `Frame`, picks the OSR entry whose +pc matches the back-edge target, packs the current locals into the +`JitFrame`, and calls in. 
Cranelift models this as a function with an +entry `block` that branches to the requested header based on an +`entry_pc` parameter. If the loop later exits to code outside the +region, the function returns `Returned`/`Deopt` and the interpreter +takes over. + +#### The hot counter and tiering trigger (VM) + +`CodeObject` (or a side-table keyed by its `Arc<CodeObject>` pointer in the VM — +chosen to avoid serializing counters through marshal) carries: + +```rust +struct HotState { + counter: AtomicU32, // bumped at entry + back-edges + tier: Cell<Tier>, // Cold | Pending | Compiled(fn) | NotJitable +} +``` + +`JIT_HOT_THRESHOLD` defaults to `~50` (tunable via `WEAVEPY_JIT_THRESHOLD`). +On crossing it the VM calls `weavepy_jit::compile(code, caches)`; the +result installs `Compiled(ptr)` or `NotJitable`. The check is a single +relaxed atomic increment + compare on the back-edge — the same shape as +the existing eval-breaker poll, and only "interesting" on the cold +transition. + +#### Gating + +- **Cargo feature `jit`** on `weavepy-vm` (re-exported by `weavepy`, + `weavepy-cli`, `weavepy-bench`). Off by default → a plain + `cargo build` pulls in **no** Cranelift and the VM's `tier2` module is + a set of `#[inline] fn … {}` no-ops. CI's `--all-features` turns it on, + so clippy/test/MSRV all exercise the real path. +- **Runtime env `WEAVEPY_JIT`** — `1`/`on` enables, unset/`0` disables. + With the feature compiled but the var unset, the hot counter still + ticks (negligible) but `compile()` is never called. `-X jit` on the + CLI sets it too. +- **`WEAVEPY_JIT_THRESHOLD`**, **`WEAVEPY_JIT_DUMP`** (dump CLIF/disasm + for debugging) round out the knobs. + +### Part C — bench + stats + +- `weavepy-bench` gains a `--jit` flag (run WeavePy with `WEAVEPY_JIT=1`) + and stops passing `--no-cpython` in the tracked CI run, so `bench.json` + finally records the host-CPython column and a tier-1-vs-tier-2 ratio.
+ A new `report` column shows `interp / jit / cpython` medians and the + two speedup ratios. +- `WEAVEPY_VM_STATS` grows a JIT block: `frames_seen`, `frames_compiled`, + `frames_notjitable`, `native_entries`, `osr_entries`, `deopts`, + `entry_guard_failures`. + +## Drawbacks + +- **Cranelift is a large dependency.** It adds ~30 transitive crates and + a few MB to a `--features jit` binary, and bumps the workspace MSRV to + **1.93** (Cranelift 0.132's floor). We accept this: the feature is + off by default, the MSRV bump is cheap for an experimental project + pinned to `stable`, and Cranelift is the same backend Wasmtime ships + cross-platform. +- **The JITable subset is narrow.** Only unboxed `int`/`float`/`bool` + numeric/control-flow code compiles in this RFC. A frame with a single + attribute access or container op anywhere in its hot region stays in + the interpreter. This is the safe, correct starting point; widening + the subset (subscript, calls-from-native, list/tuple fast paths) is + future work. +- **Deopt has a cost.** A frame that compiles and then deopts every + iteration (e.g. an `int` loop that overflows immediately) is slower + than the pure interpreter for that activation. The entry guard plus + the `NotJitable`/cooldown bookkeeping bound the damage, and the hot + counter ensures we only ever try on genuinely hot frames. +- **More `unsafe`.** Calling a JIT-produced function pointer and the + `#[repr(C)]` `JitFrame` marshalling are `unsafe` by nature. They are + confined to `weavepy-jit::engine`/`runtime` and the single VM call + site, each with a `// SAFETY:` note, per the project's `unsafe` policy. +- **Compile latency.** Cranelift compiles fast (µs–low-ms per function), + but it is not free; a short-lived script that just crosses the + threshold pays a compile it barely amortizes. The threshold is tuned + so this is rare, and the env knob lets users opt out. 
+- **Two type-feedback sources can disagree.** The inline caches observe + types at the opcode level; the JIT's static analysis assumes them. If + they diverge (polymorphic loop), the entry guard or a side exit + catches it and we fall back. Correctness is never at risk; only the + speedup is. + +## Alternatives + +- **A bytecode-trace JIT (PyPy-style meta-tracing).** More powerful for + polymorphic code, but far larger and harder to make correct; a method + JIT over a typed subset is the smaller, safer first step. +- **Copy-and-patch (CPython 3.13's tier-2).** Lower compile latency, no + Cranelift dependency, but requires a build-time stencil generator and + hand-written templates per micro-op, and produces worse code than a + real optimizing backend. Cranelift gives us regalloc + opt for free. +- **LLVM (via `inkwell`).** Better codegen, but enormous build/runtime + footprint and slow compiles — wrong tradeoff for a JIT. +- **Ship CALL specialization only, defer the JIT again.** Half the size, + ~70% of the interpreter-level win, but leaves the headline "native + code for hot loops" box unchecked yet again. Since the data layer + exists and the user asked for the big swing, we land both. +- **Always-on JIT (no feature gate).** Rejected: the feature gate keeps + Cranelift out of the default build and out of the regrtest CLI, and + lets the correctness-critical default path stay Cranelift-free. + +## Prior art + +- **PyPy** — meta-tracing JIT; the guard/deopt discipline and "the + interpreter is the source of truth, the JIT is an accelerator that can + always bail" philosophy. +- **Cinder (Meta)** — HIR/LIR method JIT on top of 3.x specialization; + closest in shape to what we build (method JIT consuming inline-cache + type feedback, deopt to the interpreter). +- **CPython 3.13 tier-2 + copy-and-patch JIT** — the micro-op + side-exit + model; we adopt the *discipline*, not the *mechanism*.
+- **GraalPy / Truffle** — partial-evaluation JIT; same "specialize on + observed types, deopt on violation" idea in a different host. +- **Cranelift / Wasmtime** — the backend and the precedent that + Cranelift is production-grade for JIT codegen across our target + platforms. + +## Unresolved questions + +- **Threshold tuning.** `JIT_HOT_THRESHOLD` and the OSR-vs-next-call + decision are guesses; the bench harness will inform real values. +- **Float NaN / signed-zero corner cases** in `COMPARE_OP` must match + the interpreter exactly; covered by differential fixtures but worth + re-auditing if `test_float` ever joins the regrtest allowlist. +- **`FOR_ITER` over `range` with non-unit / negative step** — deferred + from v1 along with the rest of `FOR_ITER`; when it lands, the boundary + conditions (empty range, `range(stop)` vs + `range(start, stop, step)`) need the same off-by-one care the + interpreter's `ForIterRange` cache already took. +- **Per-thread JIT cache under free-threading.** Today the JIT cache is + guarded by the GIL like the inline caches. A future no-GIL build + (post-RFC) needs per-thread or lock-protected code caches; out of + scope here. +- **Cache invalidation.** A compiled frame depends only on its own + bytecode (the v1 subset makes no calls into other function bodies) and + assumes that bytecode has not changed. Since we key on + `Arc<CodeObject>` identity and code objects are immutable once + compiled, invalidation reduces to "drop the cache entry when the code + object is dropped," which `Arc` handles. + +## Future work + +- **Widen the JITable subset:** `BINARY_SUBSCR`/`STORE_SUBSCR` for + `list`/`tuple`, `LOAD_GLOBAL` of stable builtins, and string ops. +- **Calls from native code:** inline a JITed callee into a JITed caller, + or emit a fast call-into-interpreter trampoline so a JITed loop with a + call body doesn't fully deopt. +- **Boxing elision across deopt** so a deopt mid-loop doesn't re-box every + local. +- **Tier-up heuristics:** recompile with more aggressive assumptions when + a frame proves monomorphic over many activations.
+- **`SEND`/generator JIT** for `asyncio`-heavy code. +- **Persistent code cache** keyed by a code-object content hash. +- **Cranelift `cranelift-jit` → `cranelift-object` AOT mode** for an + experimental ahead-of-time `weavepy build` someday. + +## Implementation status (post-merge) + +| area | status | notes | +|------|--------|-------| +| `InlineCache` CALL variants (5) | ✅ done | `CallPyExact[NoFree]`, `CallBuiltinFast`, `CallBoundMethodExact`, `CallTypeConstructor1` | +| `OpCode::Call` fast-path arm + specializer | ✅ done | mirrors the RFC 0021 `BINARY_OP` shape; deopt + cooldown | +| `weavepy-jit` crate (Cranelift) | ✅ done | analyze / ir / lower / engine / runtime / value | +| JITability analysis | ✅ done | CFG + abstract type interpretation over the supported subset | +| Cranelift lowering (numeric core) | ✅ done | int/float/bool arith (incl. floor-div/mod), compare, jumps, return | +| Entry guard + side-exit deopt | ✅ done | overflow / div-zero / type-miss deopt to interpreter at exact pc | +| Per-`CodeObject` hot counter + tier cache | ✅ done | `Cold/Pending/Compiled/NotJitable`; `WEAVEPY_JIT_THRESHOLD` | +| OSR loop entry | 🔜 deferred | v1 enters whole-function at pc=0 (helps re-called hot fns); mid-loop OSR is future work | +| `FOR_ITER` / `for … range` loops | 🔜 deferred | needs OSR-with-iterator-state; `while` loops cover the v1 numeric case | +| `jit` Cargo feature + `WEAVEPY_JIT` gate | ✅ done | off by default; CI `--all-features` exercises it | +| Bench: CPython baseline + `--jit` tier column | ✅ done | `bench.json` records cpython + tier-1/tier-2 ratios | +| `WEAVEPY_VM_STATS` JIT counters | ✅ done | compiled / native-entries / deopts / guard-failures | +| Differential regrtest fixtures | ✅ done | numeric kernels equal under interp and JIT; deopt/OSR/CALL paths | +| MSRV bump 1.85 → 1.93 | ✅ done | Cranelift 0.132 floor | +| Widen JITable subset (subscr/calls) | 🔜 deferred | future-work section |