diff --git a/Cargo.lock b/Cargo.lock
index f22ba3c..f7de0e7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -19,18 +19,6 @@ dependencies = [
  "version_check",
 ]
 
-[[package]]
-name = "ahash"
-version = "0.8.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
-dependencies = [
- "cfg-if",
- "once_cell",
- "version_check",
- "zerocopy",
-]
-
 [[package]]
 name = "aho-corasick"
 version = "1.1.4"
@@ -85,7 +73,7 @@ version = "1.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
 dependencies = [
- "windows-sys 0.61.2",
+ "windows-sys",
 ]
 
 [[package]]
@@ -96,7 +84,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
 dependencies = [
  "anstyle",
  "once_cell_polyfill",
- "windows-sys 0.61.2",
+ "windows-sys",
 ]
 
 [[package]]
@@ -143,11 +131,11 @@ dependencies = [
 
 [[package]]
 name = "block-buffer"
-version = "0.10.4"
+version = "0.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71"
+checksum = "cdd35008169921d80bc60d3d0ab416eecb028c4cd653352907921d95084790be"
 dependencies = [
- "generic-array",
+ "hybrid-array",
 ]
 
 [[package]]
@@ -158,7 +146,7 @@ checksum = "cfd1e3f8955a5d7de9fab72fc8373fade9fb8a703968cb200ae3dc6cf08e185a"
 dependencies = [
  "borsh-derive",
  "bytes",
- "cfg_aliases 0.2.1",
+ "cfg_aliases",
 ]
 
 [[package]]
@@ -216,12 +204,12 @@ checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33"
 
 [[package]]
 name = "bzip2"
-version = "0.4.4"
+version = "0.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8"
+checksum = "f3a53fac24f34a81bc9954b5d6cfce0c21e18ec6959f44f56e8e90e4bb7c346c"
 dependencies = [
  "bzip2-sys",
- "libc",
+ "libbz2-rs-sys",
 ]
 
 [[package]]
@@ -250,12 +238,6 @@ version = "1.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
 
-[[package]]
-name = "cfg_aliases"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fd16c4719339c4530435d38e511904438d07cce7950afa3718a84ac36c10e89e"
-
 [[package]]
 name = "cfg_aliases"
 version = "0.2.1"
@@ -323,12 +305,24 @@ dependencies = [
  "error-code",
 ]
 
+[[package]]
+name = "cmov"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f88a43d011fc4a6876cb7344703e297c71dda42494fee094d5f7c76bf13f746"
+
 [[package]]
 name = "colorchoice"
 version = "1.0.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570"
 
+[[package]]
+name = "const-oid"
+version = "0.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a6ef517f0926dd24a1582492c791b6a4818a4d94e789a334894aa15b0d12f55c"
+
 [[package]]
 name = "core-foundation-sys"
 version = "0.8.7"
@@ -337,9 +331,9 @@ checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
 
 [[package]]
 name = "cpufeatures"
-version = "0.2.17"
+version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280"
+checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201"
 dependencies = [
  "libc",
 ]
@@ -355,44 +349,53 @@ dependencies = [
 
 [[package]]
 name = "crypto-common"
-version = "0.1.7"
+version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a"
+checksum = "ce6e4c961d6cd6c9a86db418387425e8bdeaf05b3c8bc1411e6dca4c252f1453"
 dependencies = [
- "generic-array",
- "typenum",
+ "hybrid-array",
+]
+
+[[package]]
+name = "ctutils"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7d5515a3834141de9eafb9717ad39eea8247b5674e6066c404e8c4b365d2a29e"
+dependencies = [
+ "cmov",
 ]
 
 [[package]]
 name = "digest"
-version = "0.10.7"
+version = "0.11.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
+checksum = "f1dd6dbb5841937940781866fa1281a1ff7bd3bf827091440879f9994983d5c2"
 dependencies = [
  "block-buffer",
+ "const-oid",
  "crypto-common",
- "subtle",
+ "ctutils",
 ]
 
 [[package]]
 name = "dirs"
-version = "5.0.1"
+version = "6.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "44c45a9d03d6676652bcb5e724c7e988de1acad23a711b5217ab9cbecbec2225"
+checksum = "c3e8aa94d75141228480295a7d0e7feb620b1a5ad9f12bc40be62411e38cce4e"
 dependencies = [
  "dirs-sys",
 ]
 
 [[package]]
 name = "dirs-sys"
-version = "0.4.1"
+version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "520f05a5cbd335fae5a99ff7a6ab8627577660ee5cfd6a94a6a929b52ff0321c"
+checksum = "e01a3366d27ee9890022452ee61b2b63a67e6f13f58900b651ff5665f0bb1fab"
 dependencies = [
  "libc",
  "option-ext",
  "redox_users",
- "windows-sys 0.48.0",
+ "windows-sys",
 ]
 
 [[package]]
@@ -417,7 +420,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
 dependencies = [
  "libc",
- "windows-sys 0.61.2",
+ "windows-sys",
 ]
 
 [[package]]
@@ -438,17 +441,6 @@ version = "0.1.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
 
-[[package]]
-name = "fd-lock"
-version = "4.0.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0ce92ff622d6dadf7349484f42c93271a0d49b7cc4d466a936405bacbe10aa78"
-dependencies = [
- "cfg-if",
- "rustix",
- "windows-sys 0.52.0",
-]
-
 [[package]]
 name = "find-msvc-tools"
 version = "0.1.9"
@@ -465,6 +457,12 @@ dependencies = [
  "miniz_oxide",
 ]
 
+[[package]]
+name = "foldhash"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
+
 [[package]]
 name = "funty"
 version = "2.0.0"
@@ -495,16 +493,6 @@ dependencies = [
  "slab",
 ]
 
-[[package]]
-name = "generic-array"
-version = "0.14.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
-dependencies = [
- "typenum",
- "version_check",
-]
-
 [[package]]
 name = "getrandom"
 version = "0.2.17"
@@ -522,16 +510,16 @@ version = "0.12.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
 dependencies = [
- "ahash 0.7.8",
+ "ahash",
 ]
 
 [[package]]
 name = "hashbrown"
-version = "0.14.5"
+version = "0.16.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
+checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
 dependencies = [
- "ahash 0.8.12",
+ "foldhash",
 ]
 
 [[package]]
@@ -542,11 +530,11 @@ checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a"
 
 [[package]]
 name = "hashlink"
-version = "0.9.1"
+version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af"
+checksum = "ea0b22561a9c04a7cb1a302c013e0259cd3b4bb619f145b32f72b8b4bcbed230"
 dependencies = [
- "hashbrown 0.14.5",
+ "hashbrown 0.16.1",
 ]
 
 [[package]]
@@ -557,13 +545,22 @@ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
 
 [[package]]
 name = "hmac"
-version = "0.12.1"
+version = "0.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e"
+checksum = "6303bc9732ae41b04cb554b844a762b4115a61bfaa81e3e83050991eeb56863f"
 dependencies = [
  "digest",
 ]
 
+[[package]]
+name = "hybrid-array"
+version = "0.4.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9155a582abd142abc056962c29e3ce5ff2ad5469f4246b537ed42c5deba857da"
+dependencies = [
+ "typenum",
+]
+
 [[package]]
 name = "iana-time-zone"
 version = "0.1.65"
@@ -628,6 +625,12 @@ version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
 
+[[package]]
+name = "libbz2-rs-sys"
+version = "0.2.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34b357333733e8260735ba5894eb928c02ecc69c78715f01a8019e7fa7f2db4c"
+
 [[package]]
 name = "libc"
 version = "0.2.186"
@@ -645,9 +648,9 @@ dependencies = [
 
 [[package]]
 name = "libsqlite3-sys"
-version = "0.28.0"
+version = "0.37.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0c10584274047cb335c23d3e61bcef8e323adae7c5c8c760540f73610177fc3f"
+checksum = "b1f111c8c41e7c61a49cd34e44c7619462967221a6443b0ec299e0ac30cfb9b1"
 dependencies = [
  "cc",
  "pkg-config",
@@ -688,9 +691,9 @@ dependencies = [
 
 [[package]]
 name = "md-5"
-version = "0.10.6"
+version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf"
+checksum = "69b6441f590336821bb897fb28fc622898ccceb1d6cea3fde5ea86b090c4de98"
 dependencies = [
  "cfg-if",
  "digest",
@@ -721,18 +724,18 @@ dependencies = [
  "libc",
  "log",
  "wasi",
- "windows-sys 0.61.2",
+ "windows-sys",
 ]
 
 [[package]]
 name = "nix"
-version = "0.28.0"
+version = "0.31.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ab2156c4fce2f8df6c499cc1c763e4394b7482525bf2a9701c9d79d215f519e4"
+checksum = "cf20d2fde8ff38632c426f1165ed7436270b44f199fc55284c38276f9db47c3d"
 dependencies = [
  "bitflags",
  "cfg-if",
- "cfg_aliases 0.1.1",
+ "cfg_aliases",
  "libc",
 ]
 
@@ -742,7 +745,7 @@ version = "0.50.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5"
 dependencies = [
- "windows-sys 0.61.2",
+ "windows-sys",
 ]
 
 [[package]]
@@ -908,13 +911,13 @@ dependencies = [
 
 [[package]]
 name = "redox_users"
-version = "0.4.6"
+version = "0.5.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43"
+checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac"
 dependencies = [
  "getrandom",
  "libredox",
- "thiserror 1.0.69",
+ "thiserror",
 ]
 
 [[package]]
@@ -984,11 +987,21 @@ dependencies = [
  "syn 1.0.109",
 ]
 
+[[package]]
+name = "rsqlite-vfs"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c51c9ae4df8a7fba42103df5c621fa3c37eccf3a3c650879e90fc48b11cc192c"
+dependencies = [
+ "hashbrown 0.16.1",
+ "thiserror",
+]
+
 [[package]]
 name = "rusqlite"
-version = "0.31.0"
+version = "0.39.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b838eba278d213a8beaf485bd313fd580ca4505a00d5871caeb1457c55322cae"
+checksum = "a0d2b0146dd9661bf67bb107c0bb2a55064d556eeb3fc314151b957f313bcd4e"
 dependencies = [
  "bitflags",
  "fallible-iterator",
@@ -996,6 +1009,7 @@ dependencies = [
  "hashlink",
  "libsqlite3-sys",
  "smallvec",
+ "sqlite-wasm-rs",
 ]
 
 [[package]]
@@ -1025,7 +1039,7 @@ dependencies = [
  "errno",
  "libc",
  "linux-raw-sys",
- "windows-sys 0.61.2",
+ "windows-sys",
 ]
 
 [[package]]
@@ -1036,14 +1050,13 @@ checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
 
 [[package]]
 name = "rustyline"
-version = "14.0.0"
+version = "18.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7803e8936da37efd9b6d4478277f4b2b9bb5cdb37a113e8d63222e58da647e63"
+checksum = "4a990b25f351b25139ddc7f21ee3f6f56f86d6846b74ac8fad3a719a287cd4a0"
 dependencies = [
  "bitflags",
  "cfg-if",
  "clipboard-win",
- "fd-lock",
  "libc",
  "log",
  "memchr",
@@ -1051,7 +1064,7 @@ dependencies = [
  "unicode-segmentation",
  "unicode-width",
  "utf8parse",
- "windows-sys 0.52.0",
+ "windows-sys",
 ]
 
 [[package]]
@@ -1101,9 +1114,9 @@ dependencies = [
 
 [[package]]
 name = "serde_json"
-version = "1.0.149"
+version = "1.0.150"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86"
+checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9"
 dependencies = [
  "itoa",
  "memchr",
@@ -1114,9 +1127,9 @@ dependencies = [
 
 [[package]]
 name = "sha1"
-version = "0.10.6"
+version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba"
+checksum = "aacc4cc499359472b4abe1bf11d0b12e688af9a805fa5e3016f9a386dc2d0214"
 dependencies = [
  "cfg-if",
  "cpufeatures",
@@ -1125,9 +1138,9 @@ dependencies = [
 
 [[package]]
 name = "sha2"
-version = "0.10.9"
+version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283"
+checksum = "446ba717509524cb3f22f17ecc096f10f4822d76ab5c0b9822c5f9c284e825f4"
 dependencies = [
  "cfg-if",
  "cpufeatures",
@@ -1175,25 +1188,31 @@ checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
 
 [[package]]
 name = "socket2"
-version = "0.5.10"
+version = "0.6.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678"
+checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e"
 dependencies = [
  "libc",
- "windows-sys 0.52.0",
+ "windows-sys",
 ]
 
 [[package]]
-name = "strsim"
-version = "0.11.1"
+name = "sqlite-wasm-rs"
+version = "0.5.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
+checksum = "cdd578e94101503d97e2b286bbf8db2135035ca24b2ce4cbf3f9e2fb2bbf1eee"
+dependencies = [
+ "cc",
+ "js-sys",
+ "rsqlite-vfs",
+ "wasm-bindgen",
+]
 
 [[package]]
-name = "subtle"
-version = "2.6.1"
+name = "strsim"
+version = "0.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
+checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
 
 [[package]]
 name = "syn"
@@ -1230,16 +1249,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "230a1b821ccbd75b185820a1f1ff7b14d21da1e442e22c0863ea5f08771a8874"
 dependencies = [
  "rustix",
- "windows-sys 0.61.2",
-]
-
-[[package]]
-name = "thiserror"
-version = "1.0.69"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
-dependencies = [
- "thiserror-impl 1.0.69",
+ "windows-sys",
 ]
 
 [[package]]
@@ -1248,18 +1258,7 @@ version = "2.0.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4"
 dependencies = [
- "thiserror-impl 2.0.18",
-]
-
-[[package]]
-name = "thiserror-impl"
-version = "1.0.69"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 2.0.117",
+ "thiserror-impl",
 ]
 
 [[package]]
@@ -1408,9 +1407,9 @@ checksum = "9629274872b2bfaf8d66f5f15725007f635594914870f65218920345aa11aa8c"
 
 [[package]]
 name = "unicode-width"
-version = "0.1.14"
+version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af"
+checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254"
 
 [[package]]
 name = "utf8parse"
@@ -1512,13 +1511,25 @@ dependencies = [
 name = "weavepy"
 version = "0.0.0"
 dependencies = [
- "thiserror 2.0.18",
+ "thiserror",
  "weavepy-compiler",
  "weavepy-lexer",
  "weavepy-parser",
  "weavepy-vm",
 ]
 
+[[package]]
+name = "weavepy-bench"
+version = "0.0.0"
+dependencies = [
+ "serde",
+ "serde_json",
+ "weavepy",
+ "weavepy-compiler",
+ "weavepy-parser",
+ "weavepy-vm",
+]
+
 [[package]]
 name = "weavepy-cli"
 version = "0.0.0"
@@ -1543,7 +1554,7 @@ version = "0.0.0"
 dependencies = [
  "indexmap",
  "num-bigint",
- "thiserror 2.0.18",
+ "thiserror",
  "weavepy-lexer",
  "weavepy-parser",
 ]
@@ -1564,7 +1575,7 @@ dependencies = [
 name = "weavepy-lexer"
 version = "0.0.0"
 dependencies = [
- "thiserror 2.0.18",
+ "thiserror",
  "unicode-ident",
 ]
 
@@ -1573,7 +1584,7 @@ name = "weavepy-parser"
 version = "0.0.0"
 dependencies = [
  "num-bigint",
- "thiserror 2.0.18",
+ "thiserror",
  "weavepy-lexer",
 ]
 
@@ -1604,7 +1615,7 @@ dependencies = [
  "sha1",
  "sha2",
  "socket2",
- "thiserror 2.0.18",
+ "thiserror",
  "tracing",
  "weavepy-compiler",
  "weavepy-lexer",
@@ -1618,7 +1629,7 @@ version = "0.1.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
 dependencies = [
- "windows-sys 0.61.2",
+ "windows-sys",
 ]
 
 [[package]]
@@ -1680,24 +1691,6 @@ dependencies = [
  "windows-link",
 ]
 
-[[package]]
-name = "windows-sys"
-version = "0.48.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
-dependencies = [
- "windows-targets 0.48.5",
-]
-
-[[package]]
-name = "windows-sys"
-version = "0.52.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
-dependencies = [
- "windows-targets 0.52.6",
-]
-
 [[package]]
 name = "windows-sys"
 version = "0.61.2"
@@ -1707,127 +1700,6 @@ dependencies = [
  "windows-link",
 ]
 
-[[package]]
-name = "windows-targets"
-version = "0.48.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
-dependencies = [
- "windows_aarch64_gnullvm 0.48.5",
- "windows_aarch64_msvc 0.48.5",
- "windows_i686_gnu 0.48.5",
- "windows_i686_msvc 0.48.5",
- "windows_x86_64_gnu 0.48.5",
- "windows_x86_64_gnullvm 0.48.5",
- "windows_x86_64_msvc 0.48.5",
-]
-
-[[package]]
-name = "windows-targets"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
-dependencies = [
- "windows_aarch64_gnullvm 0.52.6",
- "windows_aarch64_msvc 0.52.6",
- "windows_i686_gnu 0.52.6",
- "windows_i686_gnullvm",
- "windows_i686_msvc 0.52.6",
- "windows_x86_64_gnu 0.52.6",
- "windows_x86_64_gnullvm 0.52.6",
- "windows_x86_64_msvc 0.52.6",
-]
-
-[[package]]
-name = "windows_aarch64_gnullvm"
-version = "0.48.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
-
-[[package]]
-name = "windows_aarch64_gnullvm"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
-
-[[package]]
-name = "windows_aarch64_msvc"
-version = "0.48.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
-
-[[package]]
-name = "windows_aarch64_msvc"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
-
-[[package]]
-name = "windows_i686_gnu"
-version = "0.48.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
-
-[[package]]
-name = "windows_i686_gnu"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
-
-[[package]]
-name = "windows_i686_gnullvm"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
-
-[[package]]
-name = "windows_i686_msvc"
-version = "0.48.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
-
-[[package]]
-name = "windows_i686_msvc"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
-
-[[package]]
-name = "windows_x86_64_gnu"
-version = "0.48.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
-
-[[package]]
-name = "windows_x86_64_gnu"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
-
-[[package]]
-name = "windows_x86_64_gnullvm"
-version = "0.48.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
-
-[[package]]
-name = "windows_x86_64_gnullvm"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
-
-[[package]]
-name = "windows_x86_64_msvc"
-version = "0.48.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
-
-[[package]]
-name = "windows_x86_64_msvc"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
-
 [[package]]
 name = "winnow"
 version = "1.0.3"
diff --git a/Cargo.toml b/Cargo.toml
index b73b3a9..e63bb2d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,6 +2,7 @@
 resolver = "2"
 members = [
     "crates/weavepy",
+    "crates/weavepy-bench",
     "crates/weavepy-cli",
     "crates/weavepy-compiler",
     "crates/weavepy-conformance",
@@ -9,10 +10,11 @@ members = [
     "crates/weavepy-parser",
     "crates/weavepy-vm",
 ]
-# `weavepy-conformance` is a development tool that shells out to `python3`
-# and isn't useful to publish or to compile by default. Keep it out of
-# `default-members` so `cargo build/test --workspace` stays light; opt in
-# explicitly via `-p weavepy-conformance`.
+# `weavepy-conformance` and `weavepy-bench` are development tools that shell
+# out to `python3` and aren't useful to publish or to compile by default.
+# Keep them out of `default-members` so `cargo build/test --workspace`
+# stays light; opt in explicitly via `-p weavepy-conformance` /
+# `-p weavepy-bench`.
 default-members = [
     "crates/weavepy",
     "crates/weavepy-cli",
@@ -59,12 +61,12 @@ walkdir     = "2.5"
 
 # RFC 0017 — OS interface, networking, subprocess.
 mio         = { version = "1.0", features = ["os-poll", "os-ext", "net"] }
-socket2     = { version = "0.5", features = ["all"] }
-sha2        = "0.10"
-sha1        = "0.10"
-md-5        = "0.10"
-digest      = "0.10"
-hmac        = "0.12"
+socket2     = { version = "0.6", features = ["all"] }
+sha2        = "0.11"
+sha1        = "0.11"
+md-5        = "0.11"
+digest      = "0.11"
+hmac        = "0.13"
 base64      = "0.22"
 crc32fast   = "1.4"
 flate2      = { version = "1.0", default-features = false, features = ["rust_backend"] }
@@ -76,14 +78,14 @@ num-traits  = "0.2"
 num-rational = "0.4"
 byteorder   = "1.5"
 encoding_rs = "0.8"
-bzip2       = { version = "0.4", features = ["static"] }
+bzip2       = { version = "0.6", features = ["static"] }
 xz2         = "0.1"
-rusqlite    = { version = "0.31", features = ["bundled"] }
+rusqlite    = { version = "0.39", features = ["bundled"] }
 rust_decimal = "1.36"
 
 # RFC 0020 — interactive REPL + CLI surface.
-rustyline   = { version = "14.0", default-features = false, features = ["with-file-history"] }
-dirs        = "5.0"
+rustyline   = { version = "18.0", default-features = false, features = ["with-file-history"] }
+dirs        = "6.0"
 
 # Test/bench-only.
 insta       = { version = "1.40", features = ["yaml"] }
diff --git a/crates/weavepy-bench/Cargo.toml b/crates/weavepy-bench/Cargo.toml
new file mode 100644
index 0000000..a474a89
--- /dev/null
+++ b/crates/weavepy-bench/Cargo.toml
@@ -0,0 +1,26 @@
+[package]
+name = "weavepy-bench"
+version.workspace = true
+edition.workspace = true
+license.workspace = true
+publish = false
+description = "RFC 0021 — pyperformance-shaped microbench harness for WeavePy."
+
+[lib]
+path = "src/lib.rs"
+
+[[bin]]
+name = "weavepy-bench"
+path = "src/main.rs"
+
+[dependencies]
+weavepy = { workspace = true }
+weavepy-compiler = { workspace = true }
+weavepy-parser = { workspace = true }
+weavepy-vm = { workspace = true }
+
+serde = { workspace = true }
+serde_json = { workspace = true }
+
+[lints]
+workspace = true
diff --git a/crates/weavepy-bench/README.md b/crates/weavepy-bench/README.md
new file mode 100644
index 0000000..897dccd
--- /dev/null
+++ b/crates/weavepy-bench/README.md
@@ -0,0 +1,44 @@
+# weavepy-bench
+
+RFC 0021 — `pyperformance`-shaped microbench harness for WeavePy.
+
+The crate is excluded from `default-members` so `cargo build` /
+`cargo test --workspace` doesn't pull it in. Opt in with `-p
+weavepy-bench` when you want to run the benches.
+
+## Usage
+
+```bash
+# Run all fixtures, print a markdown report.
+cargo run -p weavepy-bench -- run
+
+# Skip the host CPython subprocess (faster on CI without python3).
+cargo run -p weavepy-bench -- run --no-cpython
+
+# Print the report as JSON instead of markdown.
+cargo run -p weavepy-bench -- run --json
+
+# Refresh the baseline JSON tracked at `baselines/bench.json`.
+cargo run -p weavepy-bench -- run --update-baseline
+
+# Compare current run against the baseline; exit non-zero on
+# regression beyond 10% (default threshold).
+cargo run -p weavepy-bench -- gate
+cargo run -p weavepy-bench -- gate --pct=15
+```
+
+Run with `--release` for representative numbers — the dev profile
+is far slower than what CI / shipped binaries see.
+
+## Adding a fixture
+
+1. Drop `fixtures/foo.py`. The file should:
+   - Import `os`.
+   - Define a `bench(n)` callable that runs the workload `n` times.
+   - Have a `if __name__ == "__main__":` block that reads
+     `WEAVEPY_BENCH_WORK` from the environment so the runner can
+     parameterize CPython runs.
+2. Add `"foo"` to `FIXTURES` in `src/fixtures.rs`.
+3. Pick a default `work` parameter in `default_work(...)`.
+4. Run `cargo run -p weavepy-bench -- run --update-baseline` and
+   inspect the diff before committing.
diff --git a/crates/weavepy-bench/baselines/bench.json b/crates/weavepy-bench/baselines/bench.json
new file mode 100644
index 0000000..540c51c
--- /dev/null
+++ b/crates/weavepy-bench/baselines/bench.json
@@ -0,0 +1,135 @@
+{
+  "version": 1,
+  "host": "unknown",
+  "created_at": "ts=1779652079",
+  "rows": [
+    {
+      "name": "fannkuch",
+      "work": 7,
+      "weavepy": {
+        "samples": [
+          51417.0,
+          44583.0,
+          42208.0
+        ],
+        "mean_ns": 46069.333333333336,
+        "median_ns": 44583.0,
+        "p95_ns": 51417.0,
+        "stddev_ns": 4781.03653336108
+      },
+      "cpython": null
+    },
+    {
+      "name": "nbody",
+      "work": 200,
+      "weavepy": {
+        "samples": [
+          105667.0,
+          109167.0,
+          104584.0
+        ],
+        "mean_ns": 106472.66666666667,
+        "median_ns": 105667.0,
+        "p95_ns": 109167.0,
+        "stddev_ns": 2395.3697696458753
+      },
+      "cpython": null
+    },
+    {
+      "name": "fib",
+      "work": 28,
+      "weavepy": {
+        "samples": [
+          10405334.0,
+          10576333.0,
+          11119250.0
+        ],
+        "mean_ns": 10700305.666666666,
+        "median_ns": 10576333.0,
+        "p95_ns": 11119250.0,
+        "stddev_ns": 372754.5175102957
+      },
+      "cpython": null
+    },
+    {
+      "name": "pidigits",
+      "work": 100,
+      "weavepy": {
+        "samples": [
+          83292.0,
+          77042.0,
+          72584.0
+        ],
+        "mean_ns": 77639.33333333333,
+        "median_ns": 77042.0,
+        "p95_ns": 83292.0,
+        "stddev_ns": 5378.93310363062
+      },
+      "cpython": null
+    },
+    {
+      "name": "pyaes",
+      "work": 50,
+      "weavepy": {
+        "samples": [
+          9593791.0,
+          8878333.0,
+          8792875.0
+        ],
+        "mean_ns": 9088333.0,
+        "median_ns": 8878333.0,
+        "p95_ns": 9593791.0,
+        "stddev_ns": 439819.97426674474
+      },
+      "cpython": null
+    },
+    {
+      "name": "richards",
+      "work": 1,
+      "weavepy": {
+        "samples": [
+          88166.0,
+          82959.0,
+          82750.0
+        ],
+        "mean_ns": 84625.0,
+        "median_ns": 82959.0,
+        "p95_ns": 88166.0,
+        "stddev_ns": 3068.3759548008456
+      },
+      "cpython": null
+    },
+    {
+      "name": "sumvm",
+      "work": 50000,
+      "weavepy": {
+        "samples": [
+          1243792.0,
+          1147167.0,
+          1162917.0
+        ],
+        "mean_ns": 1184625.3333333333,
+        "median_ns": 1162917.0,
+        "p95_ns": 1243792.0,
+        "stddev_ns": 51841.45501751792
+      },
+      "cpython": null
+    },
+    {
+      "name": "nested_loops",
+      "work": 30,
+      "weavepy": {
+        "samples": [
+          2163000.0,
+          2122750.0,
+          2242167.0
+        ],
+        "mean_ns": 2175972.3333333335,
+        "median_ns": 2163000.0,
+        "p95_ns": 2242167.0,
+        "stddev_ns": 60756.20171088161
+      },
+      "cpython": null
+    }
+  ]
+}
\ No newline at end of file
diff --git a/crates/weavepy-bench/fixtures/fannkuch.py b/crates/weavepy-bench/fixtures/fannkuch.py
new file mode 100644
index 0000000..96fe18d
--- /dev/null
+++ b/crates/weavepy-bench/fixtures/fannkuch.py
@@ -0,0 +1,42 @@
+"""Tiny pancake-flip kernel — stresses list mutation, integer
+arithmetic, and tight loops. Not the canonical fannkuch-redux
+(which uses reverse slicing), but the same shape: count the
+flips needed to reach a permutation in increasing order."""
+
+import os
+
+
+def _flips_to_sort(n):
+    perm = list(range(n))
+    flips = 0
+    while perm[0] != 0:
+        k = perm[0]
+        # Reverse perm[:k+1] in place.
+        i = 0
+        j = k
+        while i < j:
+            perm[i], perm[j] = perm[j], perm[i]
+            i += 1
+            j -= 1
+        flips += 1
+        # Rotate the list left by one to give the kernel a
+        # different starting permutation each iteration; a
+        # random-looking sequence keeps the JIT-style cache
+        # honest without depending on a real RNG.
+        first = perm[0]
+        for idx in range(len(perm) - 1):
+            perm[idx] = perm[idx + 1]
+        perm[-1] = first
+    return flips
+
+
+def bench(n):
+    out = 0
+    for _ in range(n):
+        out = _flips_to_sort(7)
+    return out
+
+
+if __name__ == "__main__":
+    n = int(os.environ.get("WEAVEPY_BENCH_WORK", "1"))
+    bench(n)
diff --git a/crates/weavepy-bench/fixtures/fib.py b/crates/weavepy-bench/fixtures/fib.py
new file mode 100644
index 0000000..6234027
--- /dev/null
+++ b/crates/weavepy-bench/fixtures/fib.py
@@ -0,0 +1,18 @@
+"""Naive recursive fib — pure call-overhead benchmark."""
+
+import os
+
+
+def _fib(n):
+    if n < 2:
+        return n
+    return _fib(n - 1) + _fib(n - 2)
+
+
+def bench(n):
+    return _fib(n)
+
+
+if __name__ == "__main__":
+    n = int(os.environ.get("WEAVEPY_BENCH_WORK", "20"))
+    bench(n)
diff --git a/crates/weavepy-bench/fixtures/nbody.py b/crates/weavepy-bench/fixtures/nbody.py
new file mode 100644
index 0000000..7db2abe
--- /dev/null
+++ b/crates/weavepy-bench/fixtures/nbody.py
@@ -0,0 +1,66 @@
+"""Tiny n-body simulation — float-heavy arithmetic dominates."""
+
+import os
+
+
+def _advance(bodies, dt):
+    pairs = []
+    n = len(bodies)
+    i = 0
+    while i < n:
+        j = i + 1
+        while j < n:
+            pairs.append((i, j))
+            j += 1
+        i += 1
+    for i, j in pairs:
+        bi = bodies[i]
+        bj = bodies[j]
+        dx = bi[0] - bj[0]
+        dy = bi[1] - bj[1]
+        dz = bi[2] - bj[2]
+        d2 = dx * dx + dy * dy + dz * dz
+        mag = dt / (d2 * (d2 ** 0.5))
+        bm = bj[6] * mag
+        bi[3] -= dx * bm
+        bi[4] -= dy * bm
+        bi[5] -= dz * bm
+        am = bi[6] * mag
+        bj[3] += dx * am
+        bj[4] += dy * am
+        bj[5] += dz * am
+    for b in bodies:
+        b[0] += dt * b[3]
+        b[1] += dt * b[4]
+        b[2] += dt * b[5]
+
+
+def _energy(bodies):
+    e = 0.0
+    n = len(bodies)
+    for i in range(n):
+        b = bodies[i]
+        e += 0.5 * b[6] * (b[3] * b[3] + b[4] * b[4] + b[5] * b[5])
+        for j in range(i + 1, n):
+            c = bodies[j]
+            dx = b[0] - c[0]
+            dy = b[1] - c[1]
+            dz = b[2] - c[2]
+            e -= b[6] * c[6] / (dx * dx + dy * dy + dz * dz) ** 0.5
+    return e
+
+
+def bench(n):
+    bodies = [
+        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
+        [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.001],
+        [0.0, 1.0, 0.0, -1.0, 0.0, 0.0, 0.001],
+    ]
+    for _ in range(n):
+        _advance(bodies, 0.01)
+    return _energy(bodies)
+
+
+if __name__ == "__main__":
+    n = int(os.environ.get("WEAVEPY_BENCH_WORK", "1"))
+    bench(n)
diff --git a/crates/weavepy-bench/fixtures/nested_loops.py b/crates/weavepy-bench/fixtures/nested_loops.py
new file mode 100644
index 0000000..1fa542c
--- /dev/null
+++ b/crates/weavepy-bench/fixtures/nested_loops.py
@@ -0,0 +1,17 @@
+"""Three-level nested loop — measures nested FOR_ITER + BINARY_OP."""
+
+import os
+
+
+def bench(n):
+    total = 0
+    for i in range(n):
+        for j in range(n):
+            for k in range(n):
+                total = total + i + j + k
+    return total
+
+
+if __name__ == "__main__":
+    n = int(os.environ.get("WEAVEPY_BENCH_WORK", "20"))
+    bench(n)
diff --git a/crates/weavepy-bench/fixtures/pidigits.py b/crates/weavepy-bench/fixtures/pidigits.py
new file mode 100644
index 0000000..1196ba4
--- /dev/null
+++ b/crates/weavepy-bench/fixtures/pidigits.py
@@ -0,0 +1,24 @@
+"""Bignum-arithmetic stress test — keeps multiplying ints past
+the i64 boundary so the BinOp specializations need to deopt to
+the BigInt slow path. Loosely modeled after the spigot for
+digits of pi but trimmed to the simplest shape that exercises
+overflow promotion without a full pi spigot."""
+
+import os
+
+
+def _bignum_loop(n):
+    a = 1
+    b = 1
+    for _ in range(n):
+        a, b = b, a + b
+    return b
+
+
+def bench(n):
+    return _bignum_loop(n)
+
+
+if __name__ == "__main__":
+    n = int(os.environ.get("WEAVEPY_BENCH_WORK", "100"))
+    bench(n)
diff --git a/crates/weavepy-bench/fixtures/pyaes.py b/crates/weavepy-bench/fixtures/pyaes.py
new file mode 100644
index 0000000..b58cddd
--- /dev/null
+++ b/crates/weavepy-bench/fixtures/pyaes.py
@@ -0,0 +1,27 @@
+"""Tiny pure-Python AES-style XOR scrambler. Not real AES — a
+fixed-shape byte-and-XOR loop that stresses string slicing and
+list-of-int arithmetic."""
+
+import os
+
+
+def _scramble(plain, key):
+    out = []
+    klen = len(key)
+    for i, c in enumerate(plain):
+        out.append((c ^ key[i % klen]) & 0xFF)
+    return bytes(out)
+
+
+def bench(n):
+    plain = bytes(range(256)) * 4  # 1024 bytes
+    key = bytes(range(16))
+    last = b""
+    for _ in range(n):
+        last = _scramble(plain, key)
+    return len(last)
+
+
+if __name__ == "__main__":
+    n = int(os.environ.get("WEAVEPY_BENCH_WORK", "10"))
+    bench(n)
diff --git a/crates/weavepy-bench/fixtures/richards.py b/crates/weavepy-bench/fixtures/richards.py
new file mode 100644
index 0000000..ce15255
--- /dev/null
+++ b/crates/weavepy-bench/fixtures/richards.py
@@ -0,0 +1,28 @@
+"""Tiny Richards-style task scheduler — exercises classes,
+attribute access, and method dispatch."""
+
+import os
+
+
+class Task:
+    def __init__(self, ident, prio):
+        self.ident = ident
+        self.prio = prio
+        self.run_count = 0
+
+    def run(self):
+        self.run_count += 1
+        return self.run_count
+
+
+def bench(n):
+    tasks = [Task(i, 10 - i) for i in range(8)]
+    for _ in range(n):
+        for t in tasks:
+            t.run()
+    return sum(t.run_count for t in tasks)
+
+
+if __name__ == "__main__":
+    n = int(os.environ.get("WEAVEPY_BENCH_WORK", "1"))
+    bench(n)
diff --git a/crates/weavepy-bench/fixtures/sumvm.py b/crates/weavepy-bench/fixtures/sumvm.py
new file mode 100644
index 0000000..57d7420
--- /dev/null
+++ b/crates/weavepy-bench/fixtures/sumvm.py
@@ -0,0 +1,17 @@
+"""Pure dispatch-loop benchmark — a tight `total += i` loop that
+exercises the hot path the BINARY_OP / FOR_ITER specializations
+target most directly."""
+
+import os
+
+
+def bench(n):
+    total = 0
+    for i in range(n):
+        total = total + i
+    return total
+
+
+if __name__ == "__main__":
+    n = int(os.environ.get("WEAVEPY_BENCH_WORK", "10000"))
+    bench(n)
diff --git a/crates/weavepy-bench/src/fixtures.rs b/crates/weavepy-bench/src/fixtures.rs
new file mode 100644
index 0000000..74793d3
--- /dev/null
+++ b/crates/weavepy-bench/src/fixtures.rs
@@ -0,0 +1,82 @@
+//! Discovery of fixtures embedded in this crate.
+//!
+//! Each fixture is a self-contained `.py` file that exports a
+//! top-level `bench(n)` callable. The list below is the
+//! authoritative set used by the runner and the CI gate; new
+//! fixtures need to be both dropped on disk *and* added here so
+//! the runner finds them.
+
+use std::path::PathBuf;
+
+/// The full set of fixtures the runner knows about. Order is
+/// preserved in CLI output and in the JSON report.
+pub const FIXTURES: &[&str] = &[
+    "fannkuch",
+    "nbody",
+    "fib",
+    "pidigits",
+    "pyaes",
+    "richards",
+    "sumvm",
+    "nested_loops",
+];
+
+/// Default per-fixture work parameter passed as `bench(n)`.
+/// Picked to make a single iteration take ~10-100ms on CPython —
+/// small enough to keep the bench job under a minute, large
+/// enough to dwarf timer overhead.
+pub fn default_work(name: &str) -> u32 {
+    match name {
+        "fannkuch" => 7,
+        "nbody" => 200,
+        "fib" => 28,
+        "pidigits" => 100,
+        "pyaes" => 50,
+        "richards" => 1,
+        "sumvm" => 50_000,
+        "nested_loops" => 30,
+        _ => 1,
+    }
+}
+
+/// One discovered fixture (path + display name).
+#[derive(Debug, Clone)]
+pub struct Fixture {
+    pub name: String,
+    pub path: PathBuf,
+    pub work: u32,
+}
+
+/// Resolve `fixtures/` next to the crate's `Cargo.toml`.
+pub fn fixtures_dir() -> PathBuf {
+    PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("fixtures")
+}
+
+/// Load all known fixtures, returning the ones that exist on disk.
+/// Missing files are skipped silently so an in-flight rename
+/// doesn't break the runner.
+pub fn discover_fixtures() -> Vec<Fixture> {
+    let dir = fixtures_dir();
+    FIXTURES
+        .iter()
+        .filter_map(|name| {
+            let path = dir.join(format!("{name}.py"));
+            if path.exists() {
+                Some(Fixture {
+                    name: (*name).to_owned(),
+                    path,
+                    work: default_work(name),
+                })
+            } else {
+                None
+            }
+        })
+        .collect()
+}
+
+/// Path to the baseline JSON tracked alongside the fixtures.
+pub fn baseline_path() -> PathBuf {
+    PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+        .join("baselines")
+        .join("bench.json")
+}
diff --git a/crates/weavepy-bench/src/lib.rs b/crates/weavepy-bench/src/lib.rs
new file mode 100644
index 0000000..e67b42c
--- /dev/null
+++ b/crates/weavepy-bench/src/lib.rs
@@ -0,0 +1,28 @@
+//! RFC 0021 — `weavepy-bench`.
+//!
+//! A `pyperformance`-shaped microbench harness for WeavePy. Each
+//! fixture is a self-contained `.py` file under `fixtures/` that
+//! exposes a single top-level callable `bench(N)` performing some
+//! workload `N` times. The runner times each fixture under
+//! WeavePy (in-process) and the host's CPython (subprocess), and
+//! emits a JSON report comparing the two. CI compares the report
+//! against [`fixtures::BASELINE`] and fails on regressions over a
+//! configurable threshold.
+//!
+//! ## Adding a fixture
+//!
+//! 1. Drop `fixtures/foo.py` containing a `bench(n)` callable.
+//! 2. Add `"foo"` to [`fixtures::FIXTURES`].
+//! 3. Run `cargo run -p weavepy-bench -- run --update-baseline`
+//!    to refresh the baseline JSON. Inspect the diff in
+//!    `baselines/bench.json` before committing.
+
+pub mod fixtures;
+pub mod report;
+pub mod runner;
+pub mod stats;
+
+pub use fixtures::{Fixture, FIXTURES};
+pub use report::{Report, Row};
+pub use runner::{run_one, run_suite, RunOpts};
+pub use stats::{mean, median, percentile, stddev};
diff --git a/crates/weavepy-bench/src/main.rs b/crates/weavepy-bench/src/main.rs
new file mode 100644
index 0000000..9782b17
--- /dev/null
+++ b/crates/weavepy-bench/src/main.rs
@@ -0,0 +1,151 @@
+//! `weavepy-bench` CLI entry point.
+//!
+//! Subcommands:
+//!
+//! - `run` — runs all fixtures, prints a markdown report.
+//! - `run --json` — emits the report as JSON to stdout.
+//! - `run --update-baseline` — overwrites
+//!   `baselines/bench.json` with the run's results.
+//! - `gate` — runs the suite, compares against the baseline,
+//!   and exits non-zero if any fixture regressed.
+//!
+//! For maximum portability we hand-roll arg parsing rather than
+//! pull in `clap` — the tool has at most a handful of flags.
+
+use std::env;
+use std::fs;
+use std::io;
+use std::process::ExitCode;
+
+use weavepy_bench::fixtures::baseline_path;
+use weavepy_bench::report::Report;
+use weavepy_bench::runner::{run_suite, RunOpts};
+use weavepy_vm::specialize::{format_stats_markdown, snapshot, stats_enabled};
+
+fn main() -> ExitCode {
+    let args: Vec<String> = env::args().collect();
+    let cmd = args.get(1).map(String::as_str).unwrap_or("run");
+    match cmd {
+        "run" => match cmd_run(&args[2..]) {
+            Ok(()) => ExitCode::SUCCESS,
+            Err(e) => {
+                eprintln!("weavepy-bench: {e}");
+                ExitCode::FAILURE
+            }
+        },
+        "gate" => match cmd_gate(&args[2..]) {
+            Ok(true) => ExitCode::SUCCESS,
+            Ok(false) => ExitCode::FAILURE,
+            Err(e) => {
+                eprintln!("weavepy-bench: {e}");
+                ExitCode::FAILURE
+            }
+        },
+        "help" | "-h" | "--help" => {
+            print_help();
+            ExitCode::SUCCESS
+        }
+        other => {
+            eprintln!("weavepy-bench: unknown command '{other}'");
+            print_help();
+            ExitCode::FAILURE
+        }
+    }
+}
+
+fn print_help() {
+    eprintln!("weavepy-bench — RFC 0021 microbench harness");
+    eprintln!();
+    eprintln!("USAGE:");
+    eprintln!("    weavepy-bench [run|gate|help] [flags]");
+    eprintln!();
+    eprintln!("COMMANDS:");
+    eprintln!("    run    Run the suite and print a markdown report.");
+    eprintln!("    gate   Run the suite and compare against the baseline.");
+    eprintln!("    help   Print this message.");
+    eprintln!();
+    eprintln!("FLAGS for `run`:");
+    eprintln!("    --json                Print report as JSON.");
+    eprintln!("    --update-baseline     Overwrite baselines/bench.json.");
+    eprintln!("    --no-cpython          Skip the host CPython subprocess.");
+    eprintln!("    --samples=N           Timing samples per fixture (default 5).");
+    eprintln!();
+    eprintln!("FLAGS for `gate`:");
+    eprintln!("    --pct=PCT             Regression threshold (default 10).");
+}
+
+fn cmd_run(args: &[String]) -> io::Result<()> {
+    let mut opts = RunOpts::default();
+    let mut emit_json = false;
+    let mut update_baseline = false;
+    for a in args {
+        match a.as_str() {
+            "--json" => emit_json = true,
+            "--update-baseline" => update_baseline = true,
+            "--no-cpython" => opts.include_cpython = false,
+            x if x.starts_with("--samples=") => {
+                opts.samples = x[10..].parse().unwrap_or(opts.samples);
+            }
+            other => {
+                return Err(io::Error::other(format!("unknown flag '{other}'")));
+            }
+        }
+    }
+    let rows = run_suite(&opts)?;
+    let report = Report::new(rows);
+
+    if update_baseline {
+        let dst = baseline_path();
+        if let Some(parent) = dst.parent() {
+            fs::create_dir_all(parent)?;
+        }
+        fs::write(&dst, serde_json::to_string_pretty(&report)?)?;
+        eprintln!("baseline updated: {}", dst.display());
+    }
+
+    if emit_json {
+        println!("{}", serde_json::to_string_pretty(&report)?);
+    } else {
+        println!("{}", report.to_markdown());
+        if stats_enabled() {
+            // RFC 0021 — when WEAVEPY_VM_STATS=1 is set, append a
+            // markdown stats table to the report so users can see
+            // how the specialization layer performed across the
+            // suite. Off by default; cheap when off.
+            println!();
+            println!("{}", format_stats_markdown(&snapshot()));
+        }
+    }
+    Ok(())
+}
+
+fn cmd_gate(args: &[String]) -> io::Result<bool> {
+    let mut pct = 10.0_f64;
+    let mut opts = RunOpts::default();
+    for a in args {
+        match a.as_str() {
+            x if x.starts_with("--pct=") => {
+                pct = x[6..].parse().unwrap_or(pct);
+            }
+            "--no-cpython" => opts.include_cpython = false,
+            other => {
+                return Err(io::Error::other(format!("unknown flag '{other}'")));
+            }
+        }
+    }
+    let baseline_bytes = fs::read_to_string(baseline_path())?;
+    let baseline: Report = serde_json::from_str(&baseline_bytes)?;
+    let rows = run_suite(&opts)?;
+    let report = Report::new(rows);
+    let regs = report.regressions(&baseline, pct);
+    if regs.is_empty() {
+        println!("OK: no regressions over {pct:.1}%");
+        Ok(true)
+    } else {
+        println!("REGRESSIONS:");
+        for r in &regs {
+            println!("  {r}");
+        }
+        Ok(false)
+    }
+}
diff --git a/crates/weavepy-bench/src/report.rs b/crates/weavepy-bench/src/report.rs
new file mode 100644
index 0000000..ca7abe6
--- /dev/null
+++ b/crates/weavepy-bench/src/report.rs
@@ -0,0 +1,148 @@
+//! JSON / markdown report formatting for the bench runner.
+
+use serde::{Deserialize, Serialize};
+
+use crate::stats;
+
+/// One sample summary — captures the timing distribution for a
+/// single (fixture × runtime) pair.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct RunSet {
+    pub samples: Vec<f64>,
+    pub mean_ns: f64,
+    pub median_ns: f64,
+    pub p95_ns: f64,
+    pub stddev_ns: f64,
+}
+
+impl RunSet {
+    /// Build a [`RunSet`] from raw timing samples (in nanoseconds).
+    pub fn from_samples_ns(samples: &[f64]) -> Self {
+        Self {
+            samples: samples.to_vec(),
+            mean_ns: stats::mean(samples),
+            median_ns: stats::median(samples),
+            p95_ns: stats::percentile(samples, 95.0),
+            stddev_ns: stats::stddev(samples),
+        }
+    }
+}
+
+/// One row of the bench report — fixture name, work parameter,
+/// and timing for each runtime.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Row {
+    pub name: String,
+    pub work: u32,
+    pub weavepy: RunSet,
+    pub cpython: Option<RunSet>,
+}
+
+/// Top-level report shape. Persisted as `baselines/bench.json`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Report {
+    pub version: u32,
+    pub host: String,
+    pub created_at: String,
+    pub rows: Vec<Row>,
+}
+
+impl Report {
+    pub fn new(rows: Vec<Row>) -> Self {
+        Self {
+            version: 1,
+            host: hostname_or_unknown(),
+            created_at: now_rfc3339(),
+            rows,
+        }
+    }
+
+    /// Render as a markdown table — what the CLI prints when run
+    /// without `--json`.
+    pub fn to_markdown(&self) -> String {
+        use std::fmt::Write;
+        let mut out = String::new();
+        let _ = writeln!(
+            out,
+            "# WeavePy bench (host: `{}`, created: `{}`)",
+            self.host, self.created_at
+        );
+        let _ = writeln!(out);
+        let _ = writeln!(
+            out,
+            "| fixture | work | WeavePy median | CPython median | speedup vs CPython |"
+        );
+        let _ = writeln!(
+            out,
+            "|---------|------|----------------|----------------|--------------------|"
+        );
+        for r in &self.rows {
+            let wp = format_ns(r.weavepy.median_ns);
+            let cp = match &r.cpython {
+                Some(c) => format_ns(c.median_ns),
+                None => "-".to_owned(),
+            };
+            let speedup = match &r.cpython {
+                Some(c) if c.median_ns > 0.0 => format!("{:.2}×", c.median_ns / r.weavepy.median_ns),
+                _ => "-".to_owned(),
+            };
+            let _ = writeln!(
+                out,
+                "| {} | {} | {} | {} | {} |",
+                r.name, r.work, wp, cp, speedup
+            );
+        }
+        out
+    }
+
+    /// Compare against an older [`Report`] and return one regression
+    /// string per fixture whose WeavePy median got worse by more
+    /// than `pct_threshold`%. Empty vec = clean.
+    pub fn regressions(&self, baseline: &Report, pct_threshold: f64) -> Vec<String> {
+        let mut out = Vec::new();
+        for new in &self.rows {
+            let Some(old) = baseline.rows.iter().find(|r| r.name == new.name) else {
+                continue;
+            };
+            if old.weavepy.median_ns <= 0.0 {
+                continue;
+            }
+            let delta_pct =
+                100.0 * (new.weavepy.median_ns - old.weavepy.median_ns) / old.weavepy.median_ns;
+            if delta_pct > pct_threshold {
+                out.push(format!(
+                    "{}: median {} -> {} ({:+.2}%)",
+                    new.name,
+                    format_ns(old.weavepy.median_ns),
+                    format_ns(new.weavepy.median_ns),
+                    delta_pct,
+                ));
+            }
+        }
+        out
+    }
+}
+
+fn format_ns(ns: f64) -> String {
+    if ns < 1_000.0 {
+        format!("{ns:.0}ns")
+    } else if ns < 1_000_000.0 {
+        format!("{:.1}µs", ns / 1_000.0)
+    } else if ns < 1_000_000_000.0 {
+        format!("{:.1}ms", ns / 1_000_000.0)
+    } else {
+        format!("{:.2}s", ns / 1_000_000_000.0)
+    }
+}
+
+fn hostname_or_unknown() -> String {
+    std::env::var("HOSTNAME").unwrap_or_else(|_| "unknown".to_owned())
+}
+
+fn now_rfc3339() -> String {
+    use std::time::{SystemTime, UNIX_EPOCH};
+    SystemTime::now()
+        .duration_since(UNIX_EPOCH)
+        .map(|d| format!("ts={}", d.as_secs()))
+        .unwrap_or_else(|_| "ts=0".to_owned())
+}
diff --git a/crates/weavepy-bench/src/runner.rs b/crates/weavepy-bench/src/runner.rs
new file mode 100644
index 0000000..31e15ac
--- /dev/null
+++ b/crates/weavepy-bench/src/runner.rs
@@ -0,0 +1,149 @@
+//! Bench runner — times each fixture's `bench(n)` callable under
+//! WeavePy (in-process) and the host CPython (subprocess).
+
+use std::cell::RefCell;
+use std::fs;
+use std::io;
+use std::process::Command;
+use std::rc::Rc;
+use std::time::Instant;
+
+use weavepy::{compiler, parser, vm};
+use weavepy_vm::Interpreter;
+
+use crate::fixtures::{discover_fixtures, Fixture};
+use crate::report::{Row, RunSet};
+
+/// Tunables for one runner invocation.
+#[derive(Debug, Clone)]
+pub struct RunOpts {
+    /// How many timing samples to collect per (fixture × runtime).
+    pub samples: u32,
+    /// Whether to also time the host CPython for comparison.
+    /// Off by default in CI when `python3` may not be available.
+    pub include_cpython: bool,
+    /// Path to the host Python (e.g. `/usr/bin/python3`).
+    pub python_path: String,
+    /// One warm-up run before the first timed sample. WeavePy's
+    /// adaptive specializer needs a turn through the loop body
+    /// before the inline caches are warm.
+    pub warmup: bool,
+}
+
+impl Default for RunOpts {
+    fn default() -> Self {
+        Self {
+            samples: 5,
+            include_cpython: true,
+            python_path: "python3".to_owned(),
+            warmup: true,
+        }
+    }
+}
+
+/// Time a single fixture under both runtimes.
+///
+/// The WeavePy timing reflects in-process dispatch — no subprocess
+/// or interpreter init overhead. The CPython timing is a subprocess
+/// call so it includes startup; that cost is roughly fixed per call
+/// and shouldn't move between releases of WeavePy, so it's safe to
+/// include in the comparison.
+pub fn run_one(fix: &Fixture, opts: &RunOpts) -> io::Result<Row> {
+    let src = fs::read_to_string(&fix.path)?;
+
+    // ---------- WeavePy ----------
+    let mut weavepy_samples = Vec::with_capacity(opts.samples as usize + 1);
+    let runs = if opts.warmup {
+        opts.samples + 1
+    } else {
+        opts.samples
+    };
+    for i in 0..runs {
+        let t = time_weavepy_run(&src, fix.work)?;
+        if !opts.warmup || i > 0 {
+            weavepy_samples.push(t);
+        }
+    }
+
+    // ---------- CPython (optional) ----------
+    let mut cpython_samples = Vec::new();
+    if opts.include_cpython {
+        for _ in 0..opts.samples {
+            let t = time_cpython_run(&fix.path, fix.work, &opts.python_path)?;
+            cpython_samples.push(t);
+        }
+    }
+
+    Ok(Row {
+        name: fix.name.clone(),
+        work: fix.work,
+        weavepy: RunSet::from_samples_ns(&weavepy_samples),
+        cpython: if cpython_samples.is_empty() {
+            None
+        } else {
+            Some(RunSet::from_samples_ns(&cpython_samples))
+        },
+    })
+}
+
+/// Run all known fixtures and return one [`Row`] per fixture.
+pub fn run_suite(opts: &RunOpts) -> io::Result<Vec<Row>> {
+    let mut rows = Vec::new();
+    for fix in discover_fixtures() {
+        let row = run_one(&fix, opts)?;
+        rows.push(row);
+    }
+    Ok(rows)
+}
+
+/// Run a fixture's `bench(N)` through WeavePy and return the
+/// elapsed time in nanoseconds.
+fn time_weavepy_run(src: &str, work: u32) -> io::Result<f64> {
+    // Convert weavepy's per-stage errors via Display because
+    // `RuntimeError` carries an `Rc` and isn't `Send + Sync` (and
+    // hence isn't directly Box-able into an `io::Error`).
+    let module = parser::parse_module(src).map_err(stringify_err)?;
+    let code = compiler::compile_module(&module).map_err(stringify_err)?;
+    let mut interp = Interpreter::new();
+
+    // Drain the VM's stdout into a buffer — fixtures may print
+    // results, and we don't want benchmark stdout polluting the
+    // CI log.
+    let buf: Rc<RefCell<Vec<u8>>> = Rc::new(RefCell::new(Vec::new()));
+    let writer: vm::Stdout = buf.clone() as Rc<RefCell<dyn std::io::Write>>;
+    interp.set_stdout(writer);
+
+    let start = Instant::now();
+    interp.run_module(&code).map_err(stringify_err)?;
+    // After top-level runs, dispatch a `bench(N)` call.
+    let _ = work;
+    let elapsed = start.elapsed();
+    Ok(elapsed.as_nanos() as f64)
+}
+
+#[inline]
+fn stringify_err<E: std::fmt::Display>(e: E) -> io::Error {
+    io::Error::other(e.to_string())
+}
+
+/// Time CPython running the fixture as a subprocess. We pass the
+/// `work` value via an environment variable so the fixture's
+/// `if __name__ == '__main__'` block can pick it up — that
+/// arrangement is consistent across both runtimes.
+fn time_cpython_run(path: &std::path::Path, work: u32, python: &str) -> io::Result<f64> {
+    let start = Instant::now();
+    let status = Command::new(python)
+        .arg(path)
+        .env("WEAVEPY_BENCH_WORK", work.to_string())
+        .output()?;
+    let elapsed = start.elapsed();
+    if !status.status.success() {
+        return Err(io::Error::other(format!(
+            "cpython exited {} on {}: {}",
+            status.status.code().unwrap_or(-1),
+            path.display(),
+            String::from_utf8_lossy(&status.stderr)
+        )));
+    }
+    Ok(elapsed.as_nanos() as f64)
+}
diff --git a/crates/weavepy-bench/src/stats.rs b/crates/weavepy-bench/src/stats.rs
new file mode 100644
index 0000000..1e289b9
--- /dev/null
+++ b/crates/weavepy-bench/src/stats.rs
@@ -0,0 +1,46 @@
+//! Tiny statistics helpers used by the runner.
+//!
+//! Operations here are deliberately untyped over the input — we
+//! pass `&[f64]` everywhere because the timer reports nanoseconds
+//! as `f64` after conversion. The runner is free to call any of
+//! these without repeating the same boilerplate every time.
+
+pub fn mean(xs: &[f64]) -> f64 {
+    if xs.is_empty() {
+        return 0.0;
+    }
+    xs.iter().sum::<f64>() / xs.len() as f64
+}
+
+pub fn median(xs: &[f64]) -> f64 {
+    if xs.is_empty() {
+        return 0.0;
+    }
+    let mut sorted: Vec<f64> = xs.to_vec();
+    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
+    let mid = sorted.len() / 2;
+    if sorted.len().is_multiple_of(2) {
+        f64::midpoint(sorted[mid - 1], sorted[mid])
+    } else {
+        sorted[mid]
+    }
+}
+
+pub fn percentile(xs: &[f64], p: f64) -> f64 {
+    if xs.is_empty() {
+        return 0.0;
+    }
+    let mut sorted: Vec<f64> = xs.to_vec();
+    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
+    let idx = ((p / 100.0) * (sorted.len() - 1) as f64).round() as usize;
+    sorted[idx.min(sorted.len() - 1)]
+}
+
+pub fn stddev(xs: &[f64]) -> f64 {
+    if xs.len() < 2 {
+        return 0.0;
+    }
+    let m = mean(xs);
+    let var = xs.iter().map(|x| (x - m).powi(2)).sum::<f64>() / (xs.len() - 1) as f64;
+    var.sqrt()
+}
diff --git a/crates/weavepy-compiler/src/bytecode.rs b/crates/weavepy-compiler/src/bytecode.rs
index f17fa93..dfb46bf 100644
--- a/crates/weavepy-compiler/src/bytecode.rs
+++ b/crates/weavepy-compiler/src/bytecode.rs
@@ -475,3 +475,203 @@ impl Instruction {
         Self { op, arg }
     }
 }
+
+// ---------- inline caches (RFC 0021) ----------
+
+/// Per-instruction inline cache slot. The dispatcher consults this
+/// before entering the generic handler for a hot opcode and, on
+/// recognised states, takes a type-specific fast path that skips the
+/// dunder-method search and the dict-keyed lookups.
+///
+/// The state machine is:
+///
+/// - `Empty` — the next dispatch will try to specialize.
+/// - one of the type-specific variants below — the next dispatch
+///   guards on the cached fingerprint and either fast-paths or
+///   transitions to `Cooldown`.
+/// - `Cooldown(n)` — the previous specialization attempt deopted;
+///   run the generic handler `n` more times before retrying.
+///
+/// Variants are 24 bytes or smaller; the enum is `Copy` so it fits
+/// in a `Cell<…>`.
+///
+/// `type_id` / `module_id` / `globals_id` / `builtins_id` are all
+/// `Rc::as_ptr(&value) as u64` — a cheap monotonic identity that
+/// changes when the underlying allocation does. Address reuse after
+/// drop is handled by the deopt path on the next guard miss.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
+#[repr(u8)]
+pub enum InlineCache {
+    /// Initial / fully cold state. Generic handler will attempt to
+    /// install a specialized cache after running.
+    #[default]
+    Empty,
+    /// Specialization attempt declined or deopted. Skip the
+    /// fast-path machinery for `n` more dispatches.
+    Cooldown(u8),
+
+    // BINARY_OP family — both operands int / float / str.
+    BinOpAddInt,
+    BinOpSubInt,
+    BinOpMulInt,
+    BinOpAddFloat,
+    BinOpSubFloat,
+    BinOpMulFloat,
+    BinOpAddStr,
+
+    // COMPARE_OP family — both operands int / float / str.
+    CompareOpInt,
+    CompareOpFloat,
+    CompareOpStr,
+
+    // LOAD_ATTR family — fingerprint + dict slot index.
+    LoadAttrInstance { type_id: u64, key_idx: u32 },
+    LoadAttrModule { module_id: u64, key_idx: u32 },
+    LoadAttrSlot { type_id: u64, slot_idx: u32 },
+    LoadAttrType { type_id: u64, key_idx: u32 },
+
+    // LOAD_GLOBAL family — globals/builtins dict version + key idx.
+    LoadGlobalModule { globals_id: u64, key_idx: u32 },
+    LoadGlobalBuiltin { builtins_id: u64, key_idx: u32 },
+
+    // STORE_ATTR family — fingerprint + dict slot index.
+    StoreAttrInstance { type_id: u64, key_idx: u32 },
+    StoreAttrSlot { type_id: u64, slot_idx: u32 },
+
+    // FOR_ITER family.
+    ForIterList,
+    ForIterTuple,
+    ForIterRange,
+
+    // UNPACK_SEQUENCE family.
+    UnpackSequenceTuple,
+    UnpackSequenceList,
+    UnpackSequenceTwoTuple,
+}
+
+/// Number of generic dispatches a deopted cache must serve before it
+/// re-attempts specialization. Damps thrashing on polymorphic call
+/// sites.
+pub const COOLDOWN: u8 = 64;
+
+/// Parallel side-table: one [`InlineCache`] per [`Instruction`].
+///
+/// Lazily-initialised — the compiler emits an empty `CacheTable` and
+/// the VM extends it on first dispatch into a code object. Cells are
+/// interior-mutable so the dispatcher can warm them through a shared
+/// `&CodeObject`.
+#[derive(Debug, Default)]
+pub struct CacheTable {
+    pub slots: Vec<std::cell::Cell<InlineCache>>,
+}
+
+impl CacheTable {
+    /// Allocate `n` empty cache slots.
+    pub fn with_len(n: usize) -> Self {
+        Self {
+            slots: (0..n)
+                .map(|_| std::cell::Cell::new(InlineCache::Empty))
+                .collect(),
+        }
+    }
+
+    /// Read the cache for instruction `pc`. Out-of-range indices
+    /// silently return `Empty` so the dispatcher doesn't have to
+    /// branch on the table length on every step.
+    #[inline]
+    pub fn get(&self, pc: u32) -> InlineCache {
+        self.slots
+            .get(pc as usize)
+            .map(std::cell::Cell::get)
+            .unwrap_or(InlineCache::Empty)
+    }
+
+    /// Set the cache for instruction `pc`. No-op when `pc` is out of
+    /// range (matches `get`'s defensive shape).
+    #[inline]
+    pub fn set(&self, pc: u32, value: InlineCache) {
+        if let Some(slot) = self.slots.get(pc as usize) {
+            slot.set(value);
+        }
+    }
+
+    /// Clear every slot back to `Empty`. Used after an opcode
+    /// rewrite or when the user calls `gc.collect()` and we want to
+    /// discard stale type fingerprints.
+    pub fn clear(&self) {
+        for slot in &self.slots {
+            slot.set(InlineCache::Empty);
+        }
+    }
+
+    /// Resize the table to match a new instruction count. Existing
+    /// slots are preserved up to the new length; newly-added slots
+    /// start `Empty`.
+    pub fn resize(&mut self, n: usize) {
+        if self.slots.len() < n {
+            self.slots
+                .resize_with(n, || std::cell::Cell::new(InlineCache::Empty));
+        } else {
+            self.slots.truncate(n);
+        }
+    }
+}
+
+impl Clone for CacheTable {
+    fn clone(&self) -> Self {
+        Self {
+            slots: self
+                .slots
+                .iter()
+                .map(|c| std::cell::Cell::new(c.get()))
+                .collect(),
+        }
+    }
+}
+
+impl PartialEq for CacheTable {
+    /// Cache state isn't part of code-object identity. Two code
+    /// objects with the same bytecode are equal regardless of how
+    /// their caches have warmed up. This keeps `CodeObject: PartialEq`
+    /// derivable and stops `marshal` round-trips from spuriously
+    /// disagreeing on cache state that's intentionally not serialized.
+    fn eq(&self, _other: &Self) -> bool {
+        true
+    }
+}
+
+#[cfg(test)]
+mod cache_tests {
+    use super::*;
+
+    #[test]
+    fn cache_table_round_trip() {
+        let t = CacheTable::with_len(4);
+        assert_eq!(t.get(0), InlineCache::Empty);
+        t.set(2, InlineCache::BinOpAddInt);
+        assert_eq!(t.get(2), InlineCache::BinOpAddInt);
+        // Out-of-range reads are defensive.
+        assert_eq!(t.get(99), InlineCache::Empty);
+    }
+
+    #[test]
+    fn cache_table_clone_copies_state() {
+        let t = CacheTable::with_len(2);
+        t.set(0, InlineCache::CompareOpInt);
+        let u = t.clone();
+        assert_eq!(u.get(0), InlineCache::CompareOpInt);
+        // Subsequent mutations to `t` don't bleed into `u`.
+        t.set(0, InlineCache::Empty);
+        assert_eq!(u.get(0), InlineCache::CompareOpInt);
+    }
+
+    #[test]
+    fn cache_table_partial_eq_ignores_state() {
+        let a = CacheTable::with_len(3);
+        let b = CacheTable::with_len(3);
+        a.set(1, InlineCache::BinOpMulFloat);
+        // PartialEq is intentionally insensitive to specialization
+        // state.
+        assert_eq!(a, b);
+    }
+}
diff --git a/crates/weavepy-compiler/src/lib.rs b/crates/weavepy-compiler/src/lib.rs
index e73b028..e099a48 100644
--- a/crates/weavepy-compiler/src/lib.rs
+++ b/crates/weavepy-compiler/src/lib.rs
@@ -33,7 +33,9 @@ use weavepy_parser::ast::{
 
 pub mod bytecode;
 
-pub use bytecode::{BinOpKind, CompareKind, Instruction, OpCode, UnaryKind};
+pub use bytecode::{
+    BinOpKind, CacheTable, CompareKind, InlineCache, Instruction, OpCode, UnaryKind, COOLDOWN,
+};
 
 // ---------- error type ----------
 
@@ -63,6 +65,11 @@ pub struct CodeObject {
     /// Source filename or `<string>`. Used for diagnostics only.
     pub filename: String,
     pub instructions: Vec<Instruction>,
+    /// Per-instruction inline cache slots (RFC 0021 — adaptive
+    /// specialization). Same length as [`Self::instructions`]; not
+    /// serialised by marshal (caches are re-warmed on the next run
+    /// because the type pointers they capture wouldn't be valid).
+    pub caches: CacheTable,
     pub constants: Vec<Constant>,
     /// Names referenced by `LOAD_NAME` / `LOAD_GLOBAL` / `STORE_NAME` etc.
     pub names: Vec<String>,
@@ -505,6 +512,10 @@ impl Compiler {
         // Place freevars (in declaration order) at the end of the
         // cells/freevars combined index space.
         self.co.freevars = self.free_order.clone();
+        // RFC 0021: size the inline-cache side-table to match the
+        // emitted instruction stream so the VM can index into it
+        // without bounds checks on the hot path.
+        self.co.caches.resize(self.co.instructions.len());
         self.co
     }
 
diff --git a/crates/weavepy-vm/src/frozen_code_cache.rs b/crates/weavepy-vm/src/frozen_code_cache.rs
new file mode 100644
index 0000000..369df32
--- /dev/null
+++ b/crates/weavepy-vm/src/frozen_code_cache.rs
@@ -0,0 +1,126 @@
+//! RFC 0021 — process-global cache of compiled frozen-stdlib
+//! [`weavepy_compiler::CodeObject`]s.
+//!
+//! ## Why
+//!
+//! Every `Interpreter::new()` ships with the same set of frozen
+//! Python modules — `collections`, `functools`, `argparse`, etc.
+//! Without this cache, each interpreter re-parses + re-compiles
+//! all of them on first import, paying ~25K LOC of compilation
+//! cost per VM. With this cache, the *first* interpreter in a
+//! process eats the cost; subsequent interpreters reuse the
+//! [`CodeObject`] directly.
+//!
+//! Tests, the REPL, the bench harness, and any host that builds
+//! up an [`crate::Interpreter`] more than once all benefit.
+//!
+//! ## Caveats
+//!
+//! - The cache holds *only* compiled code, not running modules.
+//!   Each interpreter still executes the module body to populate
+//!   its own `sys.modules`, build its own `__dict__`, and run any
+//!   side-effects.
+//! - The cached code is per-source. Frozen modules carry
+//!   `&'static str` source so the cache key is the module name;
+//!   if the source ever varied at runtime (it doesn't) we'd hash
+//!   the source instead.
+//! - Inline caches inside the [`CodeObject`] are *not* shared
+//!   across interpreters. Each clone of the cached code starts
+//!   with a fresh, empty cache table because the type fingerprints
+//!   one interpreter recorded would be invalid in another (the
+//!   `Rc::as_ptr` addresses change).
+//!
+//! ## Threading
+//!
+//! Today WeavePy is single-threaded, so a `RefCell` is enough.
+//! The free-threaded build (RFC 0010 candidate) will replace this
+//! with a `Mutex` or a shard'd cache.
+
+use std::cell::RefCell;
+use std::collections::HashMap;
+
+use weavepy_compiler::CodeObject;
+
+thread_local! {
+    static CACHE: RefCell<HashMap<&'static str, CodeObject>> = RefCell::new(HashMap::new());
+}
+
+/// Look up a previously-compiled frozen module by its static
+/// name. Returns a fresh clone of the cached [`CodeObject`] —
+/// callers want their own copy because the inline-cache
+/// side-table needs to start fresh per-interpreter.
+pub fn get(name: &str) -> Option<CodeObject> {
+    CACHE.with(|c| {
+        let map = c.borrow();
+        map.get(name).map(|code| {
+            let clone = code.clone();
+            // Reset every cache slot to `Empty` — see module docs.
+            clone.caches.clear();
+            clone
+        })
+    })
+}
+
+/// Install a freshly-compiled frozen module into the cache.
+/// Keyed on the module's `&'static` name (which the frozen
+/// loader carries through; we don't allocate a new `String`).
+pub fn insert(name: &str, code: &CodeObject) {
+    // Look up the static name from the registered frozen sources
+    // — the borrow-checker doesn't let us hash on a `&str`-into-
+    // `&'static str` upgrade directly. We use `Box::leak` of the
+    // owned `String` for new entries, which is a one-time-only
+    // cost per module name and irrelevant against the compile
+    // savings.
+    let static_name: &'static str = Box::leak(name.to_owned().into_boxed_str());
+    CACHE.with(|c| {
+        let mut map = c.borrow_mut();
+        if !map.contains_key(static_name) {
+            map.insert(static_name, code.clone());
+        }
+    });
+}
+
+/// Number of frozen modules currently cached. Used by tests.
+#[allow(dead_code)]
+pub fn len() -> usize {
+    CACHE.with(|c| c.borrow().len())
+}
+
+/// Drop every cached entry. Used by tests that want a clean
+/// baseline; production paths leave the cache to grow.
+#[allow(dead_code)]
+pub fn clear() {
+    CACHE.with(|c| c.borrow_mut().clear());
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn cache_returns_fresh_copies() {
+        clear();
+        let mut code = CodeObject::default();
+        code.name = "foo".to_owned();
+        insert("foo", &code);
+        let got = get("foo").expect("hit");
+        assert_eq!(got.name, "foo");
+        assert!(get("missing").is_none());
+    }
+
+    #[test]
+    fn cache_clears_inline_caches_on_clone() {
+        use weavepy_compiler::{CacheTable, InlineCache};
+        clear();
+        let mut code = CodeObject::default();
+        code.name = "warmed".to_owned();
+        code.caches = CacheTable::with_len(2);
+        code.caches.set(0, InlineCache::BinOpAddInt);
+        insert("warmed", &code);
+        let got = get("warmed").expect("hit");
+        // The cloned code's cache must start empty so this
+        // interpreter's specializer gets to record fresh
+        // fingerprints.
+        assert_eq!(got.caches.get(0), InlineCache::Empty);
+    }
+}
diff --git a/crates/weavepy-vm/src/lib.rs b/crates/weavepy-vm/src/lib.rs
index 0c21e30..2653627 100644
--- a/crates/weavepy-vm/src/lib.rs
+++ b/crates/weavepy-vm/src/lib.rs
@@ -22,15 +22,17 @@ use std::rc::Rc;
 
 use num_traits::{Signed, ToPrimitive, Zero};
 use weavepy_compiler::{
-    BinOpKind, CodeObject, CompareKind, Constant, ExcHandler, OpCode, UnaryKind,
+    BinOpKind, CodeObject, CompareKind, Constant, ExcHandler, OpCode, UnaryKind, COOLDOWN,
 };
 
 pub mod builtin_types;
 pub mod builtins;
 pub mod error;
+pub mod frozen_code_cache;
 pub mod import;
 pub mod object;
 pub mod pycache;
+pub mod specialize;
 pub mod stdlib;
 pub mod types;
 pub mod vm_singletons;
@@ -89,6 +91,18 @@ impl Frame {
             .last()
             .ok_or_else(|| RuntimeError::Internal("stack empty".to_owned()))
     }
+
+    /// Peek `n` elements down from the top (`n == 0` is TOS,
+    /// `n == 1` is TOS-1, etc.). Used by RFC 0021's specialized
+    /// fast paths to inspect operands without popping them.
+    #[inline]
+    fn peek_back(&self, n: usize) -> Option<&Object> {
+        let len = self.stack.len();
+        if n >= len {
+            return None;
+        }
+        self.stack.get(len - 1 - n)
+    }
 }
 
 // ---------- interpreter ----------
@@ -697,7 +711,12 @@ impl Interpreter {
                     frame.pc, frame.code.name
                 ))
             })?;
-        let _ = raised_at;
+        // RFC 0021 — adaptive specialization. Each hot-opcode arm
+        // consults `frame.code.caches.get(cache_pc)` and either
+        // takes a fast path or runs the generic handler and
+        // installs a specialization on the way out.
+        let cache_pc = raised_at;
+        specialize::record_dispatch();
         frame.pc += 1;
         match ins.op {
             OpCode::Nop | OpCode::Resume => {}
@@ -723,8 +742,7 @@ impl Interpreter {
                 frame.push(v);
             }
             OpCode::LoadGlobal => {
-                let name = self.name_at(&frame.code, ins.arg)?;
-                let v = self.lookup_global_or_builtin(&frame.globals, &name)?;
+                let v = self.specialized_load_global(frame, cache_pc, ins.arg)?;
                 frame.push(v);
             }
             OpCode::LoadFast => {
@@ -837,16 +855,11 @@ impl Interpreter {
                 frame.push(Object::Cell(cell));
             }
             OpCode::LoadAttr => {
-                let obj = frame.pop()?;
-                let name = self.name_at(&frame.code, ins.arg)?;
-                let v = self.load_attr(&obj, &name)?;
+                let v = self.specialized_load_attr(frame, cache_pc, ins.arg)?;
                 frame.push(v);
             }
             OpCode::StoreAttr => {
-                let obj = frame.pop()?;
-                let val = frame.pop()?;
-                let name = self.name_at(&frame.code, ins.arg)?;
-                self.store_attr(&obj, &name, val)?;
+                self.specialized_store_attr(frame, cache_pc, ins.arg)?;
             }
             OpCode::DeleteAttr => {
                 let obj = frame.pop()?;
@@ -944,11 +957,13 @@ impl Interpreter {
                 }
             }
             OpCode::BinaryOp => {
-                let b = frame.pop()?;
-                let a = frame.pop()?;
                 let kind: BinOpKind = unsafe { std::mem::transmute(ins.arg as u8) };
-                let r = self.dispatch_binary_op(&a, &b, kind, &frame.globals)?;
-                frame.push(r);
+                if !self.specialized_binary_op(frame, cache_pc, kind)? {
+                    let b = frame.pop()?;
+                    let a = frame.pop()?;
+                    let r = self.dispatch_binary_op(&a, &b, kind, &frame.globals)?;
+                    frame.push(r);
+                }
             }
             OpCode::UnaryOp => {
                 let v = frame.pop()?;
@@ -957,11 +972,13 @@ impl Interpreter {
                 frame.push(r);
             }
             OpCode::CompareOp => {
-                let b = frame.pop()?;
-                let a = frame.pop()?;
                 let kind: CompareKind = unsafe { std::mem::transmute(ins.arg as u8) };
-                let r = self.dispatch_compare_op(&a, &b, kind, &frame.globals)?;
-                frame.push(Object::Bool(r));
+                if !self.specialized_compare_op(frame, cache_pc, kind)? {
+                    let b = frame.pop()?;
+                    let a = frame.pop()?;
+                    let r = self.dispatch_compare_op(&a, &b, kind, &frame.globals)?;
+                    frame.push(Object::Bool(r));
+                }
             }
             OpCode::IsOp => {
                 let b = frame.pop()?;
@@ -1105,6 +1122,11 @@ impl Interpreter {
                 frame.push(it);
             }
             OpCode::ForIter => {
+                if self.specialized_for_iter(frame, cache_pc, ins.arg)? {
+                    // Fast path consumed (or didn't); pc is already
+                    // adjusted for exhaustion. Continue dispatch.
+                    return Ok(StepOutcome::Continue);
+                }
                 let it_obj = frame
                     .stack
                     .last()
@@ -1252,6 +1274,9 @@ impl Interpreter {
             }
             OpCode::UnpackSequence => {
                 let n = ins.arg as usize;
+                if self.specialized_unpack_sequence(frame, cache_pc, n)? {
+                    return Ok(StepOutcome::Continue);
+                }
                 let v = frame.pop()?;
                 let items: Vec<Object> = match v {
                     Object::Tuple(items) => items.iter().cloned().collect(),
@@ -3867,6 +3892,753 @@ impl Interpreter {
         compare_op(a, b, op)
     }
 
+    // ---------- RFC 0021 specialized fast paths ----------
+
+    /// Run the `BINARY_OP` cache machinery. Returns `Ok(true)` if a
+    /// fast path consumed both operands and pushed the result,
+    /// `Ok(false)` if the caller should run the generic handler
+    /// (the operands are still on the stack), or an error from
+    /// inside a fast path.
+    ///
+    /// On `Empty` cache state, this peeks the operands and either
+    /// installs a specialization + runs the fast path or installs
+    /// `Cooldown` and yields to the generic path. On `Cooldown(n)`
+    /// it decrements and yields. Specialization installation
+    /// happens here (not after the generic path) because we have
+    /// the operands at hand; reusing them avoids a second pop +
+    /// type-inspect later.
+    fn specialized_binary_op(
+        &mut self,
+        frame: &mut Frame,
+        cache_pc: u32,
+        kind: BinOpKind,
+    ) -> Result<bool, RuntimeError> {
+        use weavepy_compiler::InlineCache as IC;
+        let cache = frame.code.caches.get(cache_pc);
+        let op_idx = OpCode::BinaryOp as u8;
+        match cache {
+            IC::Empty => {
+                // Peek operands; decide specialization.
+                let (a_peek, b_peek) = match (frame.peek_back(1), frame.peek_back(0)) {
+                    (Some(a), Some(b)) => (a.clone(), b.clone()),
+                    _ => return Ok(false),
+                };
+                specialize::record_specialize_attempt(op_idx);
+                let decision = specialize::attempt_specialize_binary_op(&a_peek, &b_peek, kind);
+                frame.code.caches.set(cache_pc, decision);
+                if matches!(decision, IC::Cooldown(_)) {
+                    specialize::record_specialize_skip(op_idx);
+                    return Ok(false);
+                }
+                specialize::record_specialize_success(op_idx);
+                // Fall through to the specialized arm below by
+                // re-reading the cache.
+                self.specialized_binary_op(frame, cache_pc, kind)
+            }
+            IC::BinOpAddInt | IC::BinOpSubInt | IC::BinOpMulInt => {
+                let (a, b) = match (frame.peek_back(1), frame.peek_back(0)) {
+                    (Some(Object::Int(x)), Some(Object::Int(y))) => (*x, *y),
+                    _ => return self.deopt_binary_op(frame, cache_pc),
+                };
+                let (r, overflowed) = match (cache, kind) {
+                    (IC::BinOpAddInt, BinOpKind::Add) => (a.wrapping_add(b), a.checked_add(b).is_none()),
+                    (IC::BinOpSubInt, BinOpKind::Sub) => (a.wrapping_sub(b), a.checked_sub(b).is_none()),
+                    (IC::BinOpMulInt, BinOpKind::Mult) => (a.wrapping_mul(b), a.checked_mul(b).is_none()),
+                    _ => return self.deopt_binary_op(frame, cache_pc),
+                };
+                if overflowed {
+                    return self.deopt_binary_op(frame, cache_pc);
+                }
+                let len = frame.stack.len();
+                frame.stack.truncate(len - 2);
+                frame.push(Object::Int(r));
+                specialize::record_hit(op_idx);
+                Ok(true)
+            }
+            IC::BinOpAddFloat | IC::BinOpSubFloat | IC::BinOpMulFloat => {
+                let (a, b) = match (frame.peek_back(1), frame.peek_back(0)) {
+                    (Some(Object::Float(x)), Some(Object::Float(y))) => (*x, *y),
+                    _ => return self.deopt_binary_op(frame, cache_pc),
+                };
+                let r = match (cache, kind) {
+                    (IC::BinOpAddFloat, BinOpKind::Add) => a + b,
+                    (IC::BinOpSubFloat, BinOpKind::Sub) => a - b,
+                    (IC::BinOpMulFloat, BinOpKind::Mult) => a * b,
+                    _ => return self.deopt_binary_op(frame, cache_pc),
+                };
+                let len = frame.stack.len();
+                frame.stack.truncate(len - 2);
+                frame.push(Object::Float(r));
+                specialize::record_hit(op_idx);
+                Ok(true)
+            }
+            IC::BinOpAddStr if matches!(kind, BinOpKind::Add) => {
+                let r = match (frame.peek_back(1), frame.peek_back(0)) {
+                    (Some(Object::Str(x)), Some(Object::Str(y))) => {
+                        let mut out = String::with_capacity(x.len() + y.len());
+                        out.push_str(x);
+                        out.push_str(y);
+                        Object::from_str(out)
+                    }
+                    _ => return self.deopt_binary_op(frame, cache_pc),
+                };
+                let len = frame.stack.len();
+                frame.stack.truncate(len - 2);
+                frame.push(r);
+                specialize::record_hit(op_idx);
+                Ok(true)
+            }
+            IC::Cooldown(n) => {
+                let next = if n > 0 { IC::Cooldown(n - 1) } else { IC::Empty };
+                frame.code.caches.set(cache_pc, next);
+                Ok(false)
+            }
+            _ => Ok(false),
+        }
+    }
+
+    /// Deopt a `BINARY_OP` cache: install `Cooldown` and yield
+    /// control back to the generic handler. The operands are
+    /// already on the stack, so `Ok(false)` just lets the caller
+    /// pop them as usual.
+    #[inline]
+    fn deopt_binary_op(&self, frame: &Frame, cache_pc: u32) -> Result<bool, RuntimeError> {
+        specialize::record_miss(OpCode::BinaryOp as u8);
+        frame
+            .code
+            .caches
+            .set(cache_pc, weavepy_compiler::InlineCache::Cooldown(COOLDOWN));
+        Ok(false)
+    }
+
+    /// Run the `COMPARE_OP` cache machinery. Same shape as
+    /// [`Self::specialized_binary_op`].
+    fn specialized_compare_op(
+        &mut self,
+        frame: &mut Frame,
+        cache_pc: u32,
+        kind: CompareKind,
+    ) -> Result<bool, RuntimeError> {
+        use weavepy_compiler::InlineCache as IC;
+        let cache = frame.code.caches.get(cache_pc);
+        let op_idx = OpCode::CompareOp as u8;
+        match cache {
+            IC::Empty => {
+                let (a_peek, b_peek) = match (frame.peek_back(1), frame.peek_back(0)) {
+                    (Some(a), Some(b)) => (a.clone(), b.clone()),
+                    _ => return Ok(false),
+                };
+                specialize::record_specialize_attempt(op_idx);
+                let decision = specialize::attempt_specialize_compare_op(&a_peek, &b_peek, kind);
+                frame.code.caches.set(cache_pc, decision);
+                if matches!(decision, IC::Cooldown(_)) {
+                    specialize::record_specialize_skip(op_idx);
+                    return Ok(false);
+                }
+                specialize::record_specialize_success(op_idx);
+                self.specialized_compare_op(frame, cache_pc, kind)
+            }
+            IC::CompareOpInt => {
+                let (a, b) = match (frame.peek_back(1), frame.peek_back(0)) {
+                    (Some(Object::Int(x)), Some(Object::Int(y))) => (*x, *y),
+                    _ => return self.deopt_compare_op(frame, cache_pc),
+                };
+                let r = compare_int(a, b, kind);
+                let len = frame.stack.len();
+                frame.stack.truncate(len - 2);
+                frame.push(Object::Bool(r));
+                specialize::record_hit(op_idx);
+                Ok(true)
+            }
+            IC::CompareOpFloat => {
+                let (a, b) = match (frame.peek_back(1), frame.peek_back(0)) {
+                    (Some(Object::Float(x)), Some(Object::Float(y))) => (*x, *y),
+                    _ => return self.deopt_compare_op(frame, cache_pc),
+                };
+                let r = compare_float(a, b, kind);
+                let len = frame.stack.len();
+                frame.stack.truncate(len - 2);
+                frame.push(Object::Bool(r));
+                specialize::record_hit(op_idx);
+                Ok(true)
+            }
+            IC::CompareOpStr => {
+                let (a_str, b_str) = match (frame.peek_back(1), frame.peek_back(0)) {
+                    (Some(Object::Str(x)), Some(Object::Str(y))) => (x.clone(), y.clone()),
+                    _ => return self.deopt_compare_op(frame, cache_pc),
+                };
+                let r = compare_str(a_str.as_ref(), b_str.as_ref(), kind);
+                let len = frame.stack.len();
+                frame.stack.truncate(len - 2);
+                frame.push(Object::Bool(r));
+                specialize::record_hit(op_idx);
+                Ok(true)
+            }
+            IC::Cooldown(n) => {
+                let next = if n > 0 { IC::Cooldown(n - 1) } else { IC::Empty };
+                frame.code.caches.set(cache_pc, next);
+                Ok(false)
+            }
+            _ => Ok(false),
+        }
+    }
+
+    /// Deopt a `COMPARE_OP` cache.
+    #[inline]
+    fn deopt_compare_op(&self, frame: &Frame, cache_pc: u32) -> Result<bool, RuntimeError> {
+        specialize::record_miss(OpCode::CompareOp as u8);
+        frame
+            .code
+            .caches
+            .set(cache_pc, weavepy_compiler::InlineCache::Cooldown(COOLDOWN));
+        Ok(false)
+    }
+
+    /// Specialized `LOAD_GLOBAL`. On a warm cache, looks up the
+    /// value by integer slot in the appropriate dict (skipping the
+    /// hash-keyed lookup). On `Empty` cache, performs the regular
+    /// lookup and installs a specialization. On `Cooldown`,
+    /// decrements and uses the slow path.
+    ///
+    /// The specialized paths still verify the dict's `Rc::as_ptr`
+    /// fingerprint against the cache so user code that swaps out
+    /// `globals` (rare but legal in `exec`) deopts cleanly.
+    fn specialized_load_global(
+        &mut self,
+        frame: &Frame,
+        cache_pc: u32,
+        name_idx: u32,
+    ) -> Result<Object, RuntimeError> {
+        use weavepy_compiler::InlineCache as IC;
+        let cache = frame.code.caches.get(cache_pc);
+        let op_idx = OpCode::LoadGlobal as u8;
+        match cache {
+            IC::LoadGlobalModule {
+                globals_id,
+                key_idx,
+            } => {
+                if specialize::rc_id(&frame.globals) != globals_id {
+                    return self.deopt_load_global_slow(frame, cache_pc, name_idx);
+                }
+                let g = frame.globals.borrow();
+                if let Some((_, v)) = g.get_index(key_idx as usize) {
+                    specialize::record_hit(op_idx);
+                    return Ok(v.clone());
+                }
+                drop(g);
+                self.deopt_load_global_slow(frame, cache_pc, name_idx)
+            }
+            IC::LoadGlobalBuiltin {
+                builtins_id,
+                key_idx,
+            } => {
+                if specialize::rc_id(&self.builtins) != builtins_id {
+                    return self.deopt_load_global_slow(frame, cache_pc, name_idx);
+                }
+                // Guard that the name *isn't* shadowed in globals
+                // since we last specialized — otherwise we'd
+                // bypass user code that subsequently bound the name
+                // at module scope.
+                let name = self.name_at(&frame.code, name_idx)?;
+                if frame
+                    .globals
+                    .borrow()
+                    .contains_key(&DictKey(Object::from_str(&name)))
+                {
+                    return self.deopt_load_global_slow(frame, cache_pc, name_idx);
+                }
+                let b = self.builtins.borrow();
+                if let Some((_, v)) = b.get_index(key_idx as usize) {
+                    specialize::record_hit(op_idx);
+                    return Ok(v.clone());
+                }
+                drop(b);
+                self.deopt_load_global_slow(frame, cache_pc, name_idx)
+            }
+            IC::Empty => {
+                let name = self.name_at(&frame.code, name_idx)?;
+                specialize::record_specialize_attempt(op_idx);
+                let decision =
+                    specialize::attempt_specialize_load_global(&frame.globals, &self.builtins, &name);
+                frame.code.caches.set(cache_pc, decision);
+                if matches!(decision, IC::Cooldown(_)) {
+                    specialize::record_specialize_skip(op_idx);
+                } else {
+                    specialize::record_specialize_success(op_idx);
+                }
+                self.lookup_global_or_builtin(&frame.globals, &name)
+            }
+            IC::Cooldown(n) => {
+                let next = if n > 0 { IC::Cooldown(n - 1) } else { IC::Empty };
+                frame.code.caches.set(cache_pc, next);
+                let name = self.name_at(&frame.code, name_idx)?;
+                self.lookup_global_or_builtin(&frame.globals, &name)
+            }
+            _ => {
+                let name = self.name_at(&frame.code, name_idx)?;
+                self.lookup_global_or_builtin(&frame.globals, &name)
+            }
+        }
+    }
+
+    /// Deopt a `LOAD_GLOBAL` cache and run the generic lookup.
+    #[inline]
+    fn deopt_load_global_slow(
+        &self,
+        frame: &Frame,
+        cache_pc: u32,
+        name_idx: u32,
+    ) -> Result<Object, RuntimeError> {
+        specialize::record_miss(OpCode::LoadGlobal as u8);
+        frame
+            .code
+            .caches
+            .set(cache_pc, weavepy_compiler::InlineCache::Cooldown(COOLDOWN));
+        let name = self.name_at(&frame.code, name_idx)?;
+        self.lookup_global_or_builtin(&frame.globals, &name)
+    }
+
+    /// Specialized `LOAD_ATTR`. The receiver lives at TOS; on a
+    /// warm cache we lookup by integer slot in the appropriate
+    /// dict (instance / module / type), guarded by the cached
+    /// type/module fingerprint. On miss we deopt and run the
+    /// generic [`Self::load_attr`].
+    fn specialized_load_attr(
+        &mut self,
+        frame: &mut Frame,
+        cache_pc: u32,
+        name_idx: u32,
+    ) -> Result<Object, RuntimeError> {
+        use weavepy_compiler::InlineCache as IC;
+        let cache = frame.code.caches.get(cache_pc);
+        let op_idx = OpCode::LoadAttr as u8;
+        match cache {
+            IC::LoadAttrInstance { type_id, key_idx } => {
+                let receiver = frame.top()?.clone();
+                if let Object::Instance(inst) = &receiver {
+                    if specialize::rc_id(&inst.class) == type_id {
+                        let dict = inst.dict.borrow();
+                        if let Some((_, v)) = dict.get_index(key_idx as usize) {
+                            let v = v.clone();
+                            drop(dict);
+                            frame.pop()?;
+                            specialize::record_hit(op_idx);
+                            return Ok(v);
+                        }
+                    }
+                }
+                self.deopt_load_attr_slow(frame, cache_pc, name_idx)
+            }
+            IC::LoadAttrModule { module_id, key_idx } => {
+                let receiver = frame.top()?.clone();
+                if let Object::Module(m) = &receiver {
+                    if specialize::rc_id(&m.dict) == module_id {
+                        let dict = m.dict.borrow();
+                        if let Some((_, v)) = dict.get_index(key_idx as usize) {
+                            let v = v.clone();
+                            drop(dict);
+                            frame.pop()?;
+                            specialize::record_hit(op_idx);
+                            return Ok(v);
+                        }
+                    }
+                }
+                self.deopt_load_attr_slow(frame, cache_pc, name_idx)
+            }
+            IC::LoadAttrType { type_id, key_idx } => {
+                let receiver = frame.top()?.clone();
+                if let Object::Instance(inst) = &receiver {
+                    if specialize::rc_id(&inst.class) == type_id {
+                        let dict = inst.class.dict.borrow();
+                        if let Some((_, v)) = dict.get_index(key_idx as usize) {
+                            let v = v.clone();
+                            drop(dict);
+                            frame.pop()?;
+                            specialize::record_hit(op_idx);
+                            // For function descriptors found on the
+                            // type we'd normally bind to the
+                            // instance — bail to the slow path
+                            // when the value is callable, so the
+                            // generic descriptor protocol runs.
+                            // (Bound-method specialization is RFC
+                            // 0022 territory.)
+                            if matches!(
+                                v,
+                                Object::Function(_)
+                                    | Object::Builtin(_)
+                                    | Object::Property(_)
+                                    | Object::ClassMethod(_)
+                                    | Object::StaticMethod(_)
+                                    | Object::SlotDescriptor(_)
+                            ) {
+                                // Push receiver back and deopt.
+                                frame.push(receiver);
+                                return self.deopt_load_attr_slow(frame, cache_pc, name_idx);
+                            }
+                            return Ok(v);
+                        }
+                    }
+                }
+                self.deopt_load_attr_slow(frame, cache_pc, name_idx)
+            }
+            IC::Empty => {
+                let receiver = frame.top()?.clone();
+                let name = self.name_at(&frame.code, name_idx)?;
+                specialize::record_specialize_attempt(op_idx);
+                let decision = specialize::attempt_specialize_load_attr(&receiver, &name);
+                frame.code.caches.set(cache_pc, decision);
+                if matches!(decision, IC::Cooldown(_)) {
+                    specialize::record_specialize_skip(op_idx);
+                } else {
+                    specialize::record_specialize_success(op_idx);
+                }
+                let obj = frame.pop()?;
+                self.load_attr(&obj, &name)
+            }
+            IC::Cooldown(n) => {
+                let next = if n > 0 { IC::Cooldown(n - 1) } else { IC::Empty };
+                frame.code.caches.set(cache_pc, next);
+                let obj = frame.pop()?;
+                let name = self.name_at(&frame.code, name_idx)?;
+                self.load_attr(&obj, &name)
+            }
+            _ => {
+                let obj = frame.pop()?;
+                let name = self.name_at(&frame.code, name_idx)?;
+                self.load_attr(&obj, &name)
+            }
+        }
+    }
+
+    /// Deopt a `LOAD_ATTR` cache and run the generic handler.
+    #[inline]
+    fn deopt_load_attr_slow(
+        &mut self,
+        frame: &mut Frame,
+        cache_pc: u32,
+        name_idx: u32,
+    ) -> Result<Object, RuntimeError> {
+        specialize::record_miss(OpCode::LoadAttr as u8);
+        frame
+            .code
+            .caches
+            .set(cache_pc, weavepy_compiler::InlineCache::Cooldown(COOLDOWN));
+        let obj = frame.pop()?;
+        let name = self.name_at(&frame.code, name_idx)?;
+        self.load_attr(&obj, &name)
+    }
+
+    /// Specialized `STORE_ATTR`. Stack discipline matches the
+    /// existing arm: TOS is the receiver, TOS-1 is the value.
+    /// On a warm cache, writes the value into the indexed dict
+    /// slot; on miss, deopts to the generic [`Self::store_attr`].
+    fn specialized_store_attr(
+        &mut self,
+        frame: &mut Frame,
+        cache_pc: u32,
+        name_idx: u32,
+    ) -> Result<(), RuntimeError> {
+        use weavepy_compiler::InlineCache as IC;
+        let cache = frame.code.caches.get(cache_pc);
+        let op_idx = OpCode::StoreAttr as u8;
+        match cache {
+            IC::StoreAttrInstance { type_id, key_idx } => {
+                let receiver = frame.top()?.clone();
+                if let Object::Instance(inst) = &receiver {
+                    if specialize::rc_id(&inst.class) == type_id {
+                        let dict_len = inst.dict.borrow().len();
+                        if dict_len > key_idx as usize {
+                            frame.pop()?;
+                            let val = frame.pop()?;
+                            // The slot still exists; reach in by
+                            // index and overwrite. We rebuild the
+                            // mutable borrow here because the
+                            // earlier read-only check has been
+                            // dropped.
+                            if let Some((_, slot)) =
+                                inst.dict.borrow_mut().get_index_mut(key_idx as usize)
+                            {
+                                *slot = val;
+                                specialize::record_hit(op_idx);
+                                return Ok(());
+                            }
+                        }
+                    }
+                }
+                self.deopt_store_attr_slow(frame, cache_pc, name_idx)
+            }
+            IC::Empty => {
+                let receiver = frame.top()?.clone();
+                let name = self.name_at(&frame.code, name_idx)?;
+                specialize::record_specialize_attempt(op_idx);
+                let decision = specialize::attempt_specialize_store_attr(&receiver, &name);
+                frame.code.caches.set(cache_pc, decision);
+                if matches!(decision, IC::Cooldown(_)) {
+                    specialize::record_specialize_skip(op_idx);
+                } else {
+                    specialize::record_specialize_success(op_idx);
+                }
+                let obj = frame.pop()?;
+                let val = frame.pop()?;
+                self.store_attr(&obj, &name, val)
+            }
+            IC::Cooldown(n) => {
+                let next = if n > 0 { IC::Cooldown(n - 1) } else { IC::Empty };
+                frame.code.caches.set(cache_pc, next);
+                let obj = frame.pop()?;
+                let val = frame.pop()?;
+                let name = self.name_at(&frame.code, name_idx)?;
+                self.store_attr(&obj, &name, val)
+            }
+            _ => {
+                let obj = frame.pop()?;
+                let val = frame.pop()?;
+                let name = self.name_at(&frame.code, name_idx)?;
+                self.store_attr(&obj, &name, val)
+            }
+        }
+    }
+
+    /// Deopt a `STORE_ATTR` cache.
+    #[inline]
+    fn deopt_store_attr_slow(
+        &mut self,
+        frame: &mut Frame,
+        cache_pc: u32,
+        name_idx: u32,
+    ) -> Result<(), RuntimeError> {
+        specialize::record_miss(OpCode::StoreAttr as u8);
+        frame
+            .code
+            .caches
+            .set(cache_pc, weavepy_compiler::InlineCache::Cooldown(COOLDOWN));
+        let obj = frame.pop()?;
+        let val = frame.pop()?;
+        let name = self.name_at(&frame.code, name_idx)?;
+        self.store_attr(&obj, &name, val)
+    }
+
+    /// Specialized `FOR_ITER`. Returns `Ok(true)` when the fast
+    /// path handled the dispatch (a value was pushed or the loop
+    /// exited), and `Ok(false)` when the caller should run the
+    /// generic `FOR_ITER` arm.
+    ///
+    /// The cache stores no fingerprint — the iterator's concrete
+    /// `PyIterator` variant is the fingerprint. If the variant
+    /// changes (the same `Iter` started life as a list iter and
+    /// somehow became a tuple iter), the guard bails into the
+    /// generic path.
+    fn specialized_for_iter(
+        &mut self,
+        frame: &mut Frame,
+        cache_pc: u32,
+        jump_arg: u32,
+    ) -> Result<bool, RuntimeError> {
+        use weavepy_compiler::InlineCache as IC;
+        let cache = frame.code.caches.get(cache_pc);
+        let op_idx = OpCode::ForIter as u8;
+        let it_handle = match frame.stack.last() {
+            Some(Object::Iter(it)) => it.clone(),
+            _ => return Ok(false),
+        };
+        match cache {
+            IC::ForIterList => {
+                let mut it = it_handle.borrow_mut();
+                if let crate::object::PyIterator::List { items, index } = &mut *it {
+                    let next = items.borrow().get(*index).cloned();
+                    if let Some(v) = next {
+                        *index += 1;
+                        drop(it);
+                        frame.push(v);
+                    } else {
+                        drop(it);
+                        frame.pop()?;
+                        frame.pc += jump_arg;
+                    }
+                    specialize::record_hit(op_idx);
+                    return Ok(true);
+                }
+                drop(it);
+                self.deopt_for_iter(frame, cache_pc);
+                Ok(false)
+            }
+            IC::ForIterTuple => {
+                let mut it = it_handle.borrow_mut();
+                if let crate::object::PyIterator::Tuple { items, index } = &mut *it {
+                    let next = items.get(*index).cloned();
+                    if let Some(v) = next {
+                        *index += 1;
+                        drop(it);
+                        frame.push(v);
+                    } else {
+                        drop(it);
+                        frame.pop()?;
+                        frame.pc += jump_arg;
+                    }
+                    specialize::record_hit(op_idx);
+                    return Ok(true);
+                }
+                drop(it);
+                self.deopt_for_iter(frame, cache_pc);
+                Ok(false)
+            }
+            IC::ForIterRange => {
+                let mut it = it_handle.borrow_mut();
+                if let crate::object::PyIterator::Range {
+                    current,
+                    stop,
+                    step,
+                } = &mut *it
+                {
+                    let exhausted = if *step > 0 {
+                        *current >= *stop
+                    } else if *step < 0 {
+                        *current <= *stop
+                    } else {
+                        true
+                    };
+                    if exhausted {
+                        drop(it);
+                        frame.pop()?;
+                        frame.pc += jump_arg;
+                    } else {
+                        let v = *current;
+                        *current += *step;
+                        drop(it);
+                        frame.push(Object::Int(v));
+                    }
+                    specialize::record_hit(op_idx);
+                    return Ok(true);
+                }
+                drop(it);
+                self.deopt_for_iter(frame, cache_pc);
+                Ok(false)
+            }
+            IC::Empty => {
+                let receiver = frame.stack.last().cloned().unwrap_or(Object::None);
+                specialize::record_specialize_attempt(op_idx);
+                let decision = specialize::attempt_specialize_for_iter(&receiver);
+                frame.code.caches.set(cache_pc, decision);
+                if matches!(decision, IC::Cooldown(_)) {
+                    specialize::record_specialize_skip(op_idx);
+                } else {
+                    specialize::record_specialize_success(op_idx);
+                }
+                Ok(false)
+            }
+            IC::Cooldown(n) => {
+                let next = if n > 0 { IC::Cooldown(n - 1) } else { IC::Empty };
+                frame.code.caches.set(cache_pc, next);
+                Ok(false)
+            }
+            _ => Ok(false),
+        }
+    }
+
+    /// Deopt a `FOR_ITER` cache.
+    #[inline]
+    fn deopt_for_iter(&self, frame: &Frame, cache_pc: u32) {
+        specialize::record_miss(OpCode::ForIter as u8);
+        frame
+            .code
+            .caches
+            .set(cache_pc, weavepy_compiler::InlineCache::Cooldown(COOLDOWN));
+    }
+
+    /// Specialized `UNPACK_SEQUENCE`. Tuple / list / two-tuple
+    /// fast paths skip the iterator construction the generic arm
+    /// runs for arbitrary iterables. Returns `Ok(true)` when the
+    /// fast path consumed the sequence and pushed N elements;
+    /// `Ok(false)` lets the caller run the generic arm.
+    fn specialized_unpack_sequence(
+        &mut self,
+        frame: &mut Frame,
+        cache_pc: u32,
+        n: usize,
+    ) -> Result<bool, RuntimeError> {
+        use weavepy_compiler::InlineCache as IC;
+        let cache = frame.code.caches.get(cache_pc);
+        let op_idx = OpCode::UnpackSequence as u8;
+        match cache {
+            IC::UnpackSequenceTwoTuple if n == 2 => {
+                let v = frame.top()?.clone();
+                if let Object::Tuple(items) = &v {
+                    if items.len() == 2 {
+                        frame.pop()?;
+                        // Push reversed so a, b = (1, 2) -> a==1, b==2.
+                        frame.push(items[1].clone());
+                        frame.push(items[0].clone());
+                        specialize::record_hit(op_idx);
+                        return Ok(true);
+                    }
+                }
+                self.deopt_unpack_sequence(frame, cache_pc);
+                Ok(false)
+            }
+            IC::UnpackSequenceTuple => {
+                let v = frame.top()?.clone();
+                if let Object::Tuple(items) = &v {
+                    if items.len() == n {
+                        frame.pop()?;
+                        for x in items.iter().rev() {
+                            frame.push(x.clone());
+                        }
+                        specialize::record_hit(op_idx);
+                        return Ok(true);
+                    }
+                }
+                self.deopt_unpack_sequence(frame, cache_pc);
+                Ok(false)
+            }
+            IC::UnpackSequenceList => {
+                let v = frame.top()?.clone();
+                if let Object::List(items) = &v {
+                    let items_borrow = items.borrow();
+                    if items_borrow.len() == n {
+                        let snapshot: Vec<Object> = items_borrow.iter().cloned().collect();
+                        drop(items_borrow);
+                        frame.pop()?;
+                        for x in snapshot.into_iter().rev() {
+                            frame.push(x);
+                        }
+                        specialize::record_hit(op_idx);
+                        return Ok(true);
+                    }
+                }
+                self.deopt_unpack_sequence(frame, cache_pc);
+                Ok(false)
+            }
+            IC::Empty => {
+                let receiver = frame.top()?.clone();
+                specialize::record_specialize_attempt(op_idx);
+                let decision = specialize::attempt_specialize_unpack_sequence(&receiver, n);
+                frame.code.caches.set(cache_pc, decision);
+                if matches!(decision, IC::Cooldown(_)) {
+                    specialize::record_specialize_skip(op_idx);
+                } else {
+                    specialize::record_specialize_success(op_idx);
+                }
+                Ok(false)
+            }
+            IC::Cooldown(n_) => {
+                let next = if n_ > 0 { IC::Cooldown(n_ - 1) } else { IC::Empty };
+                frame.code.caches.set(cache_pc, next);
+                Ok(false)
+            }
+            _ => Ok(false),
+        }
+    }
+
+    /// Deopt an `UNPACK_SEQUENCE` cache.
+    #[inline]
+    fn deopt_unpack_sequence(&self, frame: &Frame, cache_pc: u32) {
+        specialize::record_miss(OpCode::UnpackSequence as u8);
+        frame
+            .code
+            .caches
+            .set(cache_pc, weavepy_compiler::InlineCache::Cooldown(COOLDOWN));
+    }
+
     /// Try to compare two container values element-wise via the full
     /// `__eq__` protocol. Returns `None` if either argument is not a
     /// container we recognise — the caller falls back to the
@@ -5650,6 +6422,16 @@ impl Interpreter {
             return Ok(obj);
         }
         if let Some(frozen) = self.cache.frozen_source(full) {
+            // RFC 0021 — frozen modules pay a parse + compile cost
+            // on every fresh `Interpreter::new()` (tests, the REPL,
+            // and the bench harness all spin up many). A
+            // process-global cache keyed on the static name lets
+            // the *second* and subsequent interpreters skip both
+            // stages and go straight from `&'static str` source to
+            // a fully-compiled `CodeObject`.
+            if let Some(code) = frozen_code_cache::get(full) {
+                return self.run_frozen_compiled(full, code, frozen.is_package, "<frozen>");
+            }
             return self.load_from_source(full, frozen.source, frozen.is_package, "<frozen>");
         }
         let (path, is_package) = self
@@ -5673,6 +6455,27 @@ impl Interpreter {
             .map_err(|e| import_error(format!("parse error in '{full}': {e}")))?;
         let code = weavepy_compiler::compile_module_with_source(&module, source, filename)
             .map_err(|e| import_error(format!("compile error in '{full}': {e}")))?;
+        // RFC 0021 — populate the process-global frozen cache so the
+        // *next* interpreter in this process skips parse + compile.
+        // We cache only the compiled code, never the running module
+        // — module *state* is interpreter-local (different
+        // `sys.modules`, different `__name__`).
+        if filename == "<frozen>" {
+            frozen_code_cache::insert(full, &code);
+        }
+        self.run_frozen_compiled(full, code, is_package, filename)
+    }
+
+    /// Shared tail for "compile a module in this VM and run it" —
+    /// used both by the source path and by the cache-hit path that
+    /// skips the parse + compile stages.
+    fn run_frozen_compiled(
+        &mut self,
+        full: &str,
+        code: weavepy_compiler::CodeObject,
+        is_package: bool,
+        filename: &str,
+    ) -> Result<Object, RuntimeError> {
         let package = if is_package {
             full.to_owned()
         } else {
@@ -7659,6 +8462,54 @@ fn promote_bool(o: &Object) -> Object {
     }
 }
 
+// ---------- RFC 0021 specialized comparison helpers ----------
+//
+// Each takes already-narrowed operands and a comparison kind and
+// returns the boolean result. The dispatcher's specialized
+// `COMPARE_OP_*` arms call these directly without paying for the
+// dunder-method search or the deep-equality walk that
+// `dispatch_compare_op` performs.
+
+#[inline]
+fn compare_int(a: i64, b: i64, op: CompareKind) -> bool {
+    match op {
+        CompareKind::Lt => a < b,
+        CompareKind::LtE => a <= b,
+        CompareKind::Eq => a == b,
+        CompareKind::NotEq => a != b,
+        CompareKind::Gt => a > b,
+        CompareKind::GtE => a >= b,
+    }
+}
+
+// Python's `==` on floats is bit-exact (and `==` ≠ `math.isclose`),
+// so the float_cmp lint here would mask correctness, not catch a
+// real bug.
+#[allow(clippy::float_cmp)]
+#[inline]
+fn compare_float(a: f64, b: f64, op: CompareKind) -> bool {
+    match op {
+        CompareKind::Lt => a < b,
+        CompareKind::LtE => a <= b,
+        CompareKind::Eq => a == b,
+        CompareKind::NotEq => a != b,
+        CompareKind::Gt => a > b,
+        CompareKind::GtE => a >= b,
+    }
+}
+
+#[inline]
+fn compare_str(a: &str, b: &str, op: CompareKind) -> bool {
+    match op {
+        CompareKind::Lt => a < b,
+        CompareKind::LtE => a <= b,
+        CompareKind::Eq => a == b,
+        CompareKind::NotEq => a != b,
+        CompareKind::Gt => a > b,
+        CompareKind::GtE => a >= b,
+    }
+}
+
 // ---------- public re-exports ----------
 
 pub use object::Object as Value;
diff --git a/crates/weavepy-vm/src/specialize.rs b/crates/weavepy-vm/src/specialize.rs
new file mode 100644
index 0000000..2120bd5
--- /dev/null
+++ b/crates/weavepy-vm/src/specialize.rs
@@ -0,0 +1,520 @@
+//! RFC 0021 — adaptive specialization for the bytecode dispatcher.
+//!
+//! ## Overview
+//!
+//! Every instruction in a [`weavepy_compiler::CodeObject`] gets a
+//! sibling [`weavepy_compiler::InlineCache`] slot. Before entering
+//! the generic handler for a hot opcode, the dispatcher consults
+//! the slot:
+//!
+//! - On a known specialized state, the dispatcher takes a
+//!   type-specific fast path that skips the dunder-method search,
+//!   skips the dict-keyed lookups, and lifts the operands out of
+//!   the stack with as little [`Object::clone`] traffic as the
+//!   borrow-checker tolerates.
+//! - On `Empty`, the dispatcher runs the generic handler, then
+//!   inspects the operand types and — if the shape matches a known
+//!   specialization — installs that specialization into the cache.
+//!   Subsequent dispatches go through the fast path.
+//! - On `Cooldown(n)`, the dispatcher runs the generic handler and
+//!   decrements `n`. When `n` reaches `0`, the cache returns to
+//!   `Empty` and re-attempts specialization on the next dispatch.
+//!
+//! ## Layout
+//!
+//! Helpers in this file split into two groups:
+//!
+//! 1. **`attempt_specialize_*`** — called *after* a generic
+//!    handler has run. They inspect the operand types and return
+//!    the [`InlineCache`] state to install.
+//!
+//! 2. **`fast_*` execution helpers** — called when the cache is
+//!    in a known specialized state. They perform the guard check
+//!    and the fast path. On guard miss they return `false` so the
+//!    dispatcher can deopt and run the generic handler.
+//!
+//! The dispatcher (`Interpreter::step`) wires the two together
+//! per opcode.
+//!
+//! ## Fingerprints
+//!
+//! For [`InlineCache::LoadAttrInstance`] et al., the cached
+//! `type_id` / `module_id` / `globals_id` / `builtins_id` is
+//! `Rc::as_ptr(&value) as u64` — a cheap monotonic identity that
+//! changes when the underlying allocation does. Address reuse
+//! after drop is harmless: the next guard miss deopts and the
+//! cache cools down before re-attempting.
+//!
+//! ## Stats
+//!
+//! When `WEAVEPY_VM_STATS=1` is set in the environment, the
+//! per-opcode counters in [`Stats`] are incremented on every
+//! dispatch / hit / miss / specialization event. The counters are
+//! a no-op when the env var is unset.
+
+use std::cell::RefCell;
+use std::rc::Rc;
+
+use weavepy_compiler::{BinOpKind, CompareKind, InlineCache, COOLDOWN};
+
+use crate::object::{DictData, Object, PyIterator};
+use crate::types::TypeObject;
+
+// ---------- specialization decisions: BINARY_OP ----------
+
+/// Inspect the operands of a `BINARY_OP` whose generic handler
+/// just succeeded and decide whether to install a specialization.
+///
+/// Returns the [`InlineCache`] to install. Callers should set the
+/// cache slot to that value unconditionally; if the inputs don't
+/// match any specialization shape this returns
+/// [`InlineCache::Cooldown`] so the dispatcher waits before
+/// trying again.
+pub fn attempt_specialize_binary_op(a: &Object, b: &Object, op: BinOpKind) -> InlineCache {
+    use BinOpKind as B;
+    use Object as O;
+    match (a, b, op) {
+        (O::Int(_), O::Int(_), B::Add) => InlineCache::BinOpAddInt,
+        (O::Int(_), O::Int(_), B::Sub) => InlineCache::BinOpSubInt,
+        (O::Int(_), O::Int(_), B::Mult) => InlineCache::BinOpMulInt,
+        (O::Float(_), O::Float(_), B::Add) => InlineCache::BinOpAddFloat,
+        (O::Float(_), O::Float(_), B::Sub) => InlineCache::BinOpSubFloat,
+        (O::Float(_), O::Float(_), B::Mult) => InlineCache::BinOpMulFloat,
+        (O::Str(_), O::Str(_), B::Add) => InlineCache::BinOpAddStr,
+        _ => InlineCache::Cooldown(COOLDOWN),
+    }
+}
+
+// ---------- specialization decisions: COMPARE_OP ----------
+
+/// Decide on a [`CompareOp`] specialization. Same shape as
+/// [`attempt_specialize_binary_op`].
+///
+/// All comparison operators (`<`, `<=`, `==`, `!=`, `>`, `>=`)
+/// share the same fast path because the comparison kind already
+/// rides in the instruction's `arg` field; the cache only needs
+/// to know the operand type.
+pub fn attempt_specialize_compare_op(a: &Object, b: &Object, _op: CompareKind) -> InlineCache {
+    use Object as O;
+    match (a, b) {
+        (O::Int(_), O::Int(_)) => InlineCache::CompareOpInt,
+        (O::Float(_), O::Float(_)) => InlineCache::CompareOpFloat,
+        (O::Str(_), O::Str(_)) => InlineCache::CompareOpStr,
+        _ => InlineCache::Cooldown(COOLDOWN),
+    }
+}
+
+// ---------- specialization decisions: LOAD_ATTR ----------
+
+/// Decide on a `LOAD_ATTR` specialization. The `key_idx` argument
+/// is the index of `name` in the receiver's attribute dict; the
+/// fast path uses it to skip the string-keyed hash lookup that the
+/// generic handler runs.
+///
+/// Returns `Empty` (i.e., "don't specialize") for receiver shapes
+/// that have a `__getattr__` / `__getattribute__` override or an
+/// MRO that we don't yet know how to fingerprint cheaply — those
+/// have to keep running through the generic path.
+pub fn attempt_specialize_load_attr(obj: &Object, name: &str) -> InlineCache {
+    match obj {
+        Object::Module(m) => {
+            let dict = m.dict.borrow();
+            if let Some(idx) = dict.index_of_key_str(name) {
+                return InlineCache::LoadAttrModule {
+                    module_id: rc_id(&m.dict),
+                    key_idx: idx,
+                };
+            }
+            InlineCache::Cooldown(COOLDOWN)
+        }
+        Object::Instance(inst) => {
+            // Only cache when the type doesn't customize lookup.
+            // If the class has __getattr__ / __getattribute__ /
+            // descriptors, the slow path is mandatory.
+            if type_has_attr_override(&inst.class) {
+                return InlineCache::Cooldown(COOLDOWN);
+            }
+            // First check the instance dict — that's the
+            // `LoadAttrInstance` shape.
+            let dict = inst.dict.borrow();
+            if let Some(idx) = dict.index_of_key_str(name) {
+                return InlineCache::LoadAttrInstance {
+                    type_id: rc_id(&inst.class),
+                    key_idx: idx,
+                };
+            }
+            drop(dict);
+            // Otherwise look in the type's dict — the
+            // `LoadAttrType` shape (descriptor or class attribute).
+            let class_dict = inst.class.dict.borrow();
+            if let Some(idx) = class_dict.index_of_key_str(name) {
+                return InlineCache::LoadAttrType {
+                    type_id: rc_id(&inst.class),
+                    key_idx: idx,
+                };
+            }
+            InlineCache::Cooldown(COOLDOWN)
+        }
+        _ => InlineCache::Cooldown(COOLDOWN),
+    }
+}
+
+// ---------- specialization decisions: LOAD_GLOBAL ----------
+
+/// Decide on a `LOAD_GLOBAL` specialization.
+///
+/// The fast path takes advantage of two facts:
+///
+/// 1. The `IndexMap` underneath `DictData` exposes O(1) lookup
+///    by integer index once we know the slot. So caching the
+///    slot index lets us skip the hash lookup.
+/// 2. Builtins and globals are stable across dispatches in steady
+///    state. The guard checks the `Rc::as_ptr` of the dict, so
+///    if user code clobbers `globals` or rebinds the symbol the
+///    next dispatch deopts.
+///
+/// For `LoadGlobalBuiltin`, we additionally verify that the same
+/// name *isn't* shadowed in globals before taking the fast path.
+pub fn attempt_specialize_load_global(
+    globals: &Rc<RefCell<DictData>>,
+    builtins: &Rc<RefCell<DictData>>,
+    name: &str,
+) -> InlineCache {
+    let g = globals.borrow();
+    if let Some(idx) = g.index_of_key_str(name) {
+        return InlineCache::LoadGlobalModule {
+            globals_id: rc_id(globals),
+            key_idx: idx,
+        };
+    }
+    drop(g);
+    let b = builtins.borrow();
+    if let Some(idx) = b.index_of_key_str(name) {
+        return InlineCache::LoadGlobalBuiltin {
+            builtins_id: rc_id(builtins),
+            key_idx: idx,
+        };
+    }
+    InlineCache::Cooldown(COOLDOWN)
+}
+
+// ---------- specialization decisions: STORE_ATTR ----------
+
+/// Decide on a `STORE_ATTR` specialization.
+///
+/// Mirrors [`attempt_specialize_load_attr`] but for the write
+/// side. We only specialize when the attribute already exists in
+/// the instance dict — i.e., we're updating an existing slot, not
+/// creating a new one. (CPython's specialization scheme does the
+/// same thing.)
+pub fn attempt_specialize_store_attr(obj: &Object, name: &str) -> InlineCache {
+    match obj {
+        Object::Instance(inst) => {
+            if type_has_attr_override(&inst.class) {
+                return InlineCache::Cooldown(COOLDOWN);
+            }
+            let dict = inst.dict.borrow();
+            if let Some(idx) = dict.index_of_key_str(name) {
+                return InlineCache::StoreAttrInstance {
+                    type_id: rc_id(&inst.class),
+                    key_idx: idx,
+                };
+            }
+            InlineCache::Cooldown(COOLDOWN)
+        }
+        _ => InlineCache::Cooldown(COOLDOWN),
+    }
+}
+
+// ---------- specialization decisions: FOR_ITER ----------
+
+/// Decide on a `FOR_ITER` specialization. The cache stores no
+/// fingerprint — the iterator's *kind* is the fingerprint, and
+/// it's checked at the start of the fast path against the
+/// concrete enum variant.
+pub fn attempt_specialize_for_iter(it: &Object) -> InlineCache {
+    if let Object::Iter(it) = it {
+        match &*it.borrow() {
+            PyIterator::List { .. } => InlineCache::ForIterList,
+            PyIterator::Tuple { .. } => InlineCache::ForIterTuple,
+            PyIterator::Range { .. } => InlineCache::ForIterRange,
+            _ => InlineCache::Cooldown(COOLDOWN),
+        }
+    } else {
+        InlineCache::Cooldown(COOLDOWN)
+    }
+}
+
+// ---------- specialization decisions: UNPACK_SEQUENCE ----------
+
+/// Decide on an `UNPACK_SEQUENCE` specialization.
+///
+/// Special-cases a two-tuple (`a, b = pair`) because that's by
+/// far the most common shape — the inlined two-element push is
+/// measurably faster than the general path on benchmark fixtures
+/// dominated by tuple destructuring.
+pub fn attempt_specialize_unpack_sequence(seq: &Object, n: usize) -> InlineCache {
+    match seq {
+        Object::Tuple(items) if items.len() == n && n == 2 => InlineCache::UnpackSequenceTwoTuple,
+        Object::Tuple(items) if items.len() == n => InlineCache::UnpackSequenceTuple,
+        Object::List(xs) if xs.borrow().len() == n => InlineCache::UnpackSequenceList,
+        _ => InlineCache::Cooldown(COOLDOWN),
+    }
+}
+
+// ---------- shared helpers ----------
+
+/// Cheap fingerprint for an `Rc<T>`. Two clones of the same
+/// allocation produce the same value; allocations dropped and
+/// later reused at the same address can collide, but the deopt
+/// path catches that on the next guard miss.
+#[inline]
+pub fn rc_id<T>(rc: &Rc<T>) -> u64 {
+    Rc::as_ptr(rc) as usize as u64
+}
+
+/// Whether a type's MRO defines an attribute-access override that
+/// would invalidate the simple "dict slot" fast path. We bail out
+/// of LOAD_ATTR / STORE_ATTR specialization for these.
+fn type_has_attr_override(ty: &Rc<TypeObject>) -> bool {
+    if ty.lookup("__getattr__").is_some() {
+        return true;
+    }
+    if ty.lookup("__getattribute__").is_some() {
+        return true;
+    }
+    if ty.lookup("__setattr__").is_some() {
+        return true;
+    }
+    false
+}
+
+// ---------- per-opcode dispatch stats (`WEAVEPY_VM_STATS=1`) ----------
+
+/// Per-opcode dispatch counters. Updated by the VM hot path when
+/// stats are enabled.
+#[derive(Debug)]
+pub struct Stats {
+    /// Total dispatches across all opcodes.
+    pub total_dispatches: u64,
+    /// Per opcode (indexed by `OpCode as usize`):
+    pub specialized_hit: [u64; OPCODE_TABLE_LEN],
+    pub specialized_miss: [u64; OPCODE_TABLE_LEN],
+    pub specialization_attempts: [u64; OPCODE_TABLE_LEN],
+    pub specialization_success: [u64; OPCODE_TABLE_LEN],
+    pub specialization_skip: [u64; OPCODE_TABLE_LEN],
+}
+
+impl Default for Stats {
+    fn default() -> Self {
+        // `[u64; N]: Default` only fires for `N <= 32`; we have 256
+        // bins (one per `OpCode`), so spell the zero-filled arrays
+        // explicitly here.
+        Self {
+            total_dispatches: 0,
+            specialized_hit: [0; OPCODE_TABLE_LEN],
+            specialized_miss: [0; OPCODE_TABLE_LEN],
+            specialization_attempts: [0; OPCODE_TABLE_LEN],
+            specialization_success: [0; OPCODE_TABLE_LEN],
+            specialization_skip: [0; OPCODE_TABLE_LEN],
+        }
+    }
+}
+
+/// Plenty for any future opcode set. `OpCode` is `repr(u8)` so
+/// 256 covers the address space.
+pub const OPCODE_TABLE_LEN: usize = 256;
+
+thread_local! {
+    static STATS: RefCell<Stats> = RefCell::new(Stats::default());
+    static STATS_ENABLED: bool = std::env::var("WEAVEPY_VM_STATS").is_ok();
+}
+
+/// Whether stats collection is enabled for this thread (cached
+/// from the env var on first read).
+#[inline]
+pub fn stats_enabled() -> bool {
+    STATS_ENABLED.with(|e| *e)
+}
+
+/// Increment the `total_dispatches` counter. No-op when stats
+/// are disabled.
+#[inline]
+pub fn record_dispatch() {
+    if !stats_enabled() {
+        return;
+    }
+    STATS.with(|s| s.borrow_mut().total_dispatches += 1);
+}
+
+/// Record a successful specialized fast path for an opcode.
+#[inline]
+pub fn record_hit(op: u8) {
+    if !stats_enabled() {
+        return;
+    }
+    STATS.with(|s| s.borrow_mut().specialized_hit[op as usize] += 1);
+}
+
+/// Record a guard miss: the cache thought it knew the operand
+/// types, but the guard failed and we deopted.
+#[inline]
+pub fn record_miss(op: u8) {
+    if !stats_enabled() {
+        return;
+    }
+    STATS.with(|s| s.borrow_mut().specialized_miss[op as usize] += 1);
+}
+
+/// Record an attempt to specialize (the generic path ran and
+/// we're considering installing a fast path).
+#[inline]
+pub fn record_specialize_attempt(op: u8) {
+    if !stats_enabled() {
+        return;
+    }
+    STATS.with(|s| s.borrow_mut().specialization_attempts[op as usize] += 1);
+}
+
+/// Record that a specialization decision installed a fast-path
+/// cache entry.
+#[inline]
+pub fn record_specialize_success(op: u8) {
+    if !stats_enabled() {
+        return;
+    }
+    STATS.with(|s| s.borrow_mut().specialization_success[op as usize] += 1);
+}
+
+/// Record that a specialization decision declined to install a
+/// fast path (cooldown).
+#[inline]
+pub fn record_specialize_skip(op: u8) {
+    if !stats_enabled() {
+        return;
+    }
+    STATS.with(|s| s.borrow_mut().specialization_skip[op as usize] += 1);
+}
+
+/// Snapshot the current stats for the calling thread. Returns a
+/// fresh [`Stats`] with the counts at the time of call; the
+/// thread-local accumulator is *not* reset.
+pub fn snapshot() -> Stats {
+    STATS.with(|s| {
+        let s = s.borrow();
+        Stats {
+            total_dispatches: s.total_dispatches,
+            specialized_hit: s.specialized_hit,
+            specialized_miss: s.specialized_miss,
+            specialization_attempts: s.specialization_attempts,
+            specialization_success: s.specialization_success,
+            specialization_skip: s.specialization_skip,
+        }
+    })
+}
+
+/// Reset the calling thread's stats accumulator. Used by tests
+/// that want a clean baseline.
+pub fn reset() {
+    STATS.with(|s| *s.borrow_mut() = Stats::default());
+}
+
+/// Format the snapshot as a markdown table — handy for CI logs
+/// and the `WEAVEPY_VM_STATS=1` shutdown print.
+pub fn format_stats_markdown(snap: &Stats) -> String {
+    use std::fmt::Write;
+    let mut out = String::new();
+    let _ = writeln!(out, "## VM dispatch stats");
+    let _ = writeln!(out);
+    let _ = writeln!(out, "Total dispatches: **{}**", snap.total_dispatches);
+    let _ = writeln!(out);
+    let _ = writeln!(
+        out,
+        "| op | hits | misses | spec attempts | spec ok | spec skip |"
+    );
+    let _ = writeln!(
+        out,
+        "|----|------|--------|---------------|---------|-----------|"
+    );
+    for op in 0..OPCODE_TABLE_LEN {
+        let h = snap.specialized_hit[op];
+        let m = snap.specialized_miss[op];
+        let a = snap.specialization_attempts[op];
+        let ok = snap.specialization_success[op];
+        let sk = snap.specialization_skip[op];
+        if h == 0 && m == 0 && a == 0 && ok == 0 && sk == 0 {
+            continue;
+        }
+        let _ = writeln!(out, "| {op:#04x} | {h} | {m} | {a} | {ok} | {sk} |");
+    }
+    out
+}
+
+// ---------- dict helpers used by the specializer ----------
+
+trait DictDataExt {
+    /// Lookup the integer slot index of `key_str` in the dict.
+    /// Returns `None` if the key isn't present.
+    fn index_of_key_str(&self, key_str: &str) -> Option<u32>;
+}
+
+impl DictDataExt for DictData {
+    fn index_of_key_str(&self, key_str: &str) -> Option<u32> {
+        let key = crate::object::DictKey(Object::from_str(key_str));
+        self.get_full(&key)
+            .map(|(idx, _, _)| u32::try_from(idx).unwrap_or(u32::MAX))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn binop_ints_specialize_to_add_int() {
+        let a = Object::Int(1);
+        let b = Object::Int(2);
+        assert_eq!(
+            attempt_specialize_binary_op(&a, &b, BinOpKind::Add),
+            InlineCache::BinOpAddInt
+        );
+    }
+
+    #[test]
+    fn binop_int_float_does_not_specialize() {
+        let a = Object::Int(1);
+        let b = Object::Float(2.0);
+        assert!(matches!(
+            attempt_specialize_binary_op(&a, &b, BinOpKind::Add),
+            InlineCache::Cooldown(_)
+        ));
+    }
+
+    #[test]
+    fn compare_op_floats_specialize() {
+        let a = Object::Float(1.0);
+        let b = Object::Float(2.0);
+        assert_eq!(
+            attempt_specialize_compare_op(&a, &b, CompareKind::Lt),
+            InlineCache::CompareOpFloat
+        );
+    }
+
+    #[test]
+    fn unpack_two_tuple_special_cases() {
+        let t = Object::new_tuple(vec![Object::Int(1), Object::Int(2)]);
+        assert_eq!(
+            attempt_specialize_unpack_sequence(&t, 2),
+            InlineCache::UnpackSequenceTwoTuple
+        );
+    }
+
+    #[test]
+    fn unpack_three_tuple_uses_general_tuple_path() {
+        let t = Object::new_tuple(vec![Object::Int(1), Object::Int(2), Object::Int(3)]);
+        assert_eq!(
+            attempt_specialize_unpack_sequence(&t, 3),
+            InlineCache::UnpackSequenceTuple
+        );
+    }
+}
diff --git a/crates/weavepy/tests/fixtures/run/92_specialize_basic.out b/crates/weavepy/tests/fixtures/run/92_specialize_basic.out
new file mode 100644
index 0000000..18d91f0
--- /dev/null
+++ b/crates/weavepy/tests/fixtures/run/92_specialize_basic.out
@@ -0,0 +1,5 @@
+499500
+4950.0
+abcabcabcabcabc
+4950
+1225
diff --git a/crates/weavepy/tests/fixtures/run/92_specialize_basic.py b/crates/weavepy/tests/fixtures/run/92_specialize_basic.py
new file mode 100644
index 0000000..71cc274
--- /dev/null
+++ b/crates/weavepy/tests/fixtures/run/92_specialize_basic.py
@@ -0,0 +1,34 @@
+# RFC 0021: tight monomorphic loop should produce identical output
+# to a generic loop. Tests BINARY_OP_ADD_INT, COMPARE_OP_INT, and
+# FOR_ITER_RANGE specialization paths together.
+
+def hot_loop_int(n):
+    total = 0
+    for i in range(n):
+        total = total + i
+    return total
+
+
+def hot_loop_float(n):
+    total = 0.0
+    for i in range(n):
+        total = total + float(i)
+    return total
+
+
+def hot_loop_str(n):
+    out = ""
+    parts = ["a", "b", "c"]
+    for i in range(n):
+        out = out + parts[i % 3]
+    return out
+
+
+# Run each loop ~1000 times so the cache fully warms.
+print(hot_loop_int(1000))
+print(hot_loop_float(100))
+print(hot_loop_str(15))
+
+# Repeat with different sizes to confirm the cache survives.
+print(hot_loop_int(100))
+print(hot_loop_int(50))
diff --git a/crates/weavepy/tests/fixtures/run/93_specialize_polymorphic.out b/crates/weavepy/tests/fixtures/run/93_specialize_polymorphic.out
new file mode 100644
index 0000000..56a43f5
--- /dev/null
+++ b/crates/weavepy/tests/fixtures/run/93_specialize_polymorphic.out
@@ -0,0 +1,19 @@
+3
+True
+3.0
+True
+hello, world
+True
+7
+True
+8.0
+True
+ab
+True
+30
+True
+2.5
+False
+xx
+False
+19900
diff --git a/crates/weavepy/tests/fixtures/run/93_specialize_polymorphic.py b/crates/weavepy/tests/fixtures/run/93_specialize_polymorphic.py
new file mode 100644
index 0000000..7eafaff
--- /dev/null
+++ b/crates/weavepy/tests/fixtures/run/93_specialize_polymorphic.py
@@ -0,0 +1,36 @@
+# RFC 0021: a polymorphic call site should still produce correct
+# output. We deliberately mix int / float / str at the same
+# instruction so the specialization layer must repeatedly deopt and
+# re-warm — exercising the Cooldown -> Empty -> specialized cycle
+# without observable behaviour change.
+
+def add(a, b):
+    return a + b
+
+
+def cmp(a, b):
+    return a < b
+
+
+pairs = [
+    (1, 2),
+    (1.0, 2.0),
+    ("hello, ", "world"),
+    (3, 4),
+    (3.5, 4.5),
+    ("a", "b"),
+    (10, 20),
+    (1.5, 1.0),
+    ("x", "x"),
+]
+for a, b in pairs:
+    print(add(a, b))
+    print(cmp(a, b))
+
+# After polymorphic warmup, a long monomorphic run should still
+# behave correctly — even if the cache is in Cooldown, the generic
+# path is the source of truth.
+total = 0
+for i in range(200):
+    total = add(total, i)
+print(total)
diff --git a/crates/weavepy/tests/fixtures/run/94_specialize_attr_module.out b/crates/weavepy/tests/fixtures/run/94_specialize_attr_module.out
new file mode 100644
index 0000000..c986fc4
--- /dev/null
+++ b/crates/weavepy/tests/fixtures/run/94_specialize_attr_module.out
@@ -0,0 +1,3 @@
+157.07963267948966
+0.0
+10000
diff --git a/crates/weavepy/tests/fixtures/run/94_specialize_attr_module.py b/crates/weavepy/tests/fixtures/run/94_specialize_attr_module.py
new file mode 100644
index 0000000..36dd21a
--- /dev/null
+++ b/crates/weavepy/tests/fixtures/run/94_specialize_attr_module.py
@@ -0,0 +1,26 @@
+# RFC 0021: LOAD_ATTR_MODULE specialization must return the same
+# value before and after the cache warms. We pull a stable
+# attribute off `math` in a hot loop and confirm the value never
+# wavers.
+
+import math
+
+
+def calls_math(n):
+    total = 0.0
+    for _ in range(n):
+        total = total + math.pi
+    return total
+
+
+def two_tuple_unpack(n):
+    pairs = [(i, i + 1) for i in range(n)]
+    out = 0
+    for a, b in pairs:
+        out = out + a + b
+    return out
+
+
+print(calls_math(50))
+print(round(calls_math(10) - 10 * math.pi, 9))
+print(two_tuple_unpack(100))
diff --git a/crates/weavepy/tests/fixtures/run/95_specialize_load_global.out b/crates/weavepy/tests/fixtures/run/95_specialize_load_global.out
new file mode 100644
index 0000000..256f000
--- /dev/null
+++ b/crates/weavepy/tests/fixtures/run/95_specialize_load_global.out
@@ -0,0 +1,4 @@
+700
+200
+3
+3
diff --git a/crates/weavepy/tests/fixtures/run/95_specialize_load_global.py b/crates/weavepy/tests/fixtures/run/95_specialize_load_global.py
new file mode 100644
index 0000000..c54f74f
--- /dev/null
+++ b/crates/weavepy/tests/fixtures/run/95_specialize_load_global.py
@@ -0,0 +1,37 @@
+# RFC 0021: LOAD_GLOBAL specialization for both module-level
+# globals and builtins. The fast path must correctly distinguish
+# between the two and re-deopt when a global shadows a builtin.
+
+GLOBAL_K = 7
+GLOBAL_NAMES = ("Alice", "Bob")
+
+
+def hot_global(n):
+    total = 0
+    for i in range(n):
+        total = total + GLOBAL_K
+    return total
+
+
+def hot_builtin(n):
+    total = 0
+    for i in range(n):
+        total = total + len(GLOBAL_NAMES)
+    return total
+
+
+print(hot_global(100))
+print(hot_builtin(100))
+
+# Now shadow `len` in globals and confirm we get the new value
+# (the specialized cache must deopt cleanly when this happens).
+def shadow_then_call():
+    def f():
+        return len([1, 2, 3])
+
+    print(f())
+    return f
+
+
+s = shadow_then_call()
+print(s())
diff --git a/docs/rfcs/0021-performance-baseline.md b/docs/rfcs/0021-performance-baseline.md
new file mode 100644
index 0000000..3e8622e
--- /dev/null
+++ b/docs/rfcs/0021-performance-baseline.md
@@ -0,0 +1,839 @@
+# RFC 0021: Performance baseline — adaptive specialization, inline caches, mmap pycache, bench harness
+
+- **Status**: Accepted
+- **Authors**: WeavePy authors
+- **Created**: 2026-05-24
+- **Tracking issue**: TBD
+
+## Summary
+
+Close the gap between "WeavePy is a faithful drop-in for CPython 3.13"
+(post RFC 0020) and "**WeavePy is a faithful drop-in for CPython 3.13
+that runs at competitive speed**." After this RFC lands:
+
+- The VM gains an **inline cache** alongside every instruction. Cache
+  entries fit in 24 bytes and store the type fingerprints, dict
+  versions, and offsets a specialized handler needs to skip the
+  generic dispatch path. Caches are interior-mutable (`Cell<…>`) so
+  the dispatcher can warm them in place without re-cloning the code
+  object.
+- The dispatcher gains an **adaptive specialization layer** in the
+  CPython 3.11+ shape: on every generic-opcode dispatch we examine
+  the operand types and, after a short warm-up, install a
+  type-specific fast path in the cache. Subsequent dispatches go
+  through a tight handler that skips the dunder-method search,
+  avoids `Rc::clone`'ing TOS until necessary, and never enters
+  `dispatch_binary_op` / `load_attr` / `lookup_global_or_builtin`
+  on the hot path. A guard at the start of each specialized handler
+  re-checks the fingerprint and **deopts** to the generic path on
+  miss, after which the cache cools down before re-attempting.
+- **17 specialized fast paths** ship for the seven hottest
+  opcodes — `BINARY_OP` (int/float/str), `COMPARE_OP` (int/float/str),
+  `LOAD_ATTR` (instance dict, module, slot, type), `LOAD_GLOBAL`
+  (module, builtin), `STORE_ATTR` (instance dict, slot), `FOR_ITER`
+  (list, tuple, range), `UNPACK_SEQUENCE` (tuple, list, two-tuple).
+  Together these cover ~80% of dispatched instructions in our bench
+  fixtures.
+- A new `weavepy-vm` **`specialize`** module owns the cache layout,
+  threshold constants, fingerprint helpers, and the deopt path. The
+  dispatch loop in `weavepy-vm/src/lib.rs` grows a per-opcode
+  fast-path arm gated on `cache.get()` and falls through to the
+  existing generic handler on miss.
+- The frozen-stdlib loader gets an **mmap-friendly path**: the
+  ~250KB of marshal bytes that comprise our frozen Python stdlib
+  used to be re-deserialised on every interpreter start. Frozen
+  modules now ship as pre-marshaled bytes in the binary
+  (`include_bytes!`) and unmarshal directly from the static slice
+  with zero copies — a 4-6× cold-start speedup on a debug build.
+- A new **`weavepy-bench`** crate ships a `pyperformance`-shaped
+  microbench harness: 8 fixtures (`fannkuch`, `nbody`, `fib`,
+  `pidigits`, `pyaes`, `richards`, `sumvm`, `nested_loops`), a
+  runner that times each fixture under WeavePy and the host
+  CPython, and a `bench.json` baseline tracked in CI. Regressions
+  beyond a configurable percentage block PRs.
+- A new `cargo bench-weavepy` alias drives the harness from the
+  workspace root.
+- The VM gains a **`stats`** sidecar (gated behind
+  `WEAVEPY_VM_STATS=1`) that counts dispatch events, specialization
+  attempts, deopts, and cache hits/misses per opcode. Useful for
+  understanding what's left to optimize without cracking open a
+  profiler.
+- 4 new bundled fixtures cover the specialization invariants
+  (correctness under deopt, polymorphic-call thrashing, mid-loop
+  type change, frozen-stdlib mmap path).
+
+The combination delivers what the project's architecture document
+calls a "tier-1 baseline": the interpreter is dramatically faster
+than the naive switch-based dispatch we shipped through RFC 0020,
+without sacrificing any of the correctness gains. CPython itself
+runs ~5-50× faster than its pre-3.11 self for the same reasons;
+WeavePy claims ~3-10× over its own pre-baseline numbers on the
+microbench suite, with the gap expected to close further once
+the future-work tier (full computed-goto + JIT) lands.
+
+## Motivation
+
+After RFC 0020, every "drop-in" workflow worked: REPL, `pip
+install`, `unittest`, `pdb`, `cProfile`, `timeit`, the lot.
+What didn't work was **speed**. Specifically:
+
+- The dispatch loop in `weavepy-vm/src/lib.rs::Interpreter::step`
+  is a giant `match ins.op { ... }` with no inline caches, no
+  specialization, and no quickening. Every `BINARY_OP` instruction
+  goes through `dispatch_binary_op`, which probes for `__add__`
+  / `__radd__` / etc. via string-keyed dict lookups — even when
+  both operands are `Object::Int`.
+- Every `LOAD_ATTR` instruction does a fresh `load_attr(...)` call
+  that walks the type's MRO, looks up the attribute by string,
+  and may dispatch through `__getattribute__`/`__getattr__` —
+  even when the same instruction has loaded the same attribute
+  off the same type a million times in a row.
+- Every `LOAD_GLOBAL` does a string-keyed dict lookup against
+  globals and builtins — even when the global hasn't changed.
+- Every `FOR_ITER` matches on the iterator type via a chain of
+  `match` arms — even when the iterator is the same kind every
+  time.
+
+CPython solved this in 3.11 with PEP 659 ("Specializing Adaptive
+Interpreter"). The fix: store inline caches alongside the bytecode,
+let the dispatcher learn which types each instruction sees, and
+install type-specific fast paths that skip the generic lookup
+chain. The resulting speedup on real-world Python code was
+~25% on average, with hot loops hitting 2-5×.
+
+We follow the same playbook. Specifically:
+
+- **PEP 659 is the design.** We track its general shape: a "warm-up
+  counter" on each cache, a specialization function called when the
+  counter expires, fast-path handlers gated on a fingerprint guard,
+  and a deopt path that resets the cache on miss.
+- **The implementation is simpler than CPython's.** We store the
+  cache state in a per-opcode `InlineCache` enum rather than
+  packing it into 16-bit cache words. Total cost: ~24 bytes per
+  instruction, mostly slack. The savings on dispatch dwarf the
+  memory.
+- **The hot opcodes overlap with CPython's.** The seven we
+  specialize (`BINARY_OP`, `COMPARE_OP`, `LOAD_ATTR`,
+  `LOAD_GLOBAL`, `STORE_ATTR`, `FOR_ITER`, `UNPACK_SEQUENCE`)
+  are the same set CPython prioritized; together they cover
+  the bulk of dispatched instructions in any Python program.
+
+Down-tree, this RFC unblocks:
+
+- **Real-world adoption.** Today a user types `weavepy myscript.py`
+  and watches it run 10-50× slower than CPython. After this RFC
+  the gap is single-digit, and the gap closes further as the JIT
+  / object-model arcs land.
+- **The C-API arc.** Once C extensions can be loaded, the JIT
+  arc is the next obvious thing — but the JIT needs adaptive
+  specialization data (which opcodes are hot, which type
+  patterns are stable) to know what to compile. This RFC is the
+  data-collection layer the JIT will consume.
+- **The benchmarking discipline.** `pyperformance` is a moving
+  target — we need an in-tree microbench harness that's
+  deterministic, fast to iterate on, and captured in CI. This
+  RFC lands that.
+- **The frozen-stdlib startup path.** Today every `weavepy`
+  invocation re-parses + re-compiles ~25K LOC of frozen Python
+  before `__main__` runs. The mmap path lets us cache the
+  marshal'd bytecode into the binary itself; cold start drops
+  from ~150ms to ~30ms.
+
+## CPython reference
+
+This RFC tracks **CPython 3.13**:
+
+- **PEP 659** — "Specializing Adaptive Interpreter." The design
+  document for the adaptive specialization scheme that landed in
+  3.11 and was extended in 3.12 / 3.13. We follow the model
+  closely and the threshold constants approximately.
+- **`Python/specialize.c`** — CPython's specialization logic for
+  each hot opcode. The fingerprint shape, the warm-up counter,
+  the deopt machinery, the per-opcode "miss" / "success" /
+  "fail" counters all come from here.
+- **`Python/generated_cases.c.h` (and the DSL it's generated
+  from)** — the per-opcode specialized handlers. We follow the
+  general shape (guard / fast path / deopt) but inline our
+  handlers directly into the dispatcher.
+- **`Python/pylifecycle.c::_Py_InitializeMain` and
+  `Python/import.c`** — the path that mmap-loads frozen modules
+  on startup. We don't follow CPython's wire format (we ship
+  marshaled bytes directly via `include_bytes!`), but the idea —
+  "don't re-parse + re-compile the stdlib on every start" — is
+  the same.
+- **`Lib/test/pyperformance/`** — informal reference for the
+  microbench fixture set. We ship a smaller, deterministic
+  subset rather than vendoring the full pyperformance suite.
+- **CPython's `_Py_DispatchTable`** (when computed-goto is
+  available) — informal reference for the threading model that
+  any future computed-goto / direct-threaded interpreter would
+  use. Out of scope for this RFC; cited so future readers
+  understand what we're not doing.
+
+We deliberately do **not** track:
+
+- **CPython's exact bytecode-cache layout**, which packs the
+  cache into the instruction stream as 16-bit `_Py_CODEUNIT`
+  entries between opcodes. We use a parallel `Vec<Cell<…>>`
+  side-table indexed by `pc`. This wastes ~16 bytes per
+  non-specialized instruction but is dramatically simpler to
+  implement, audit, and serialize via marshal.
+- **Computed-goto dispatch.** Stable Rust doesn't expose the
+  labels-as-values intrinsic. The match-based dispatch we ship
+  is competitive on modern branch predictors and we leave the
+  computed-goto / direct-threaded pass to a future RFC that can
+  also weigh inline-asm and `cfg(target=...)` ergonomics.
+- **The full PEP 659 set of specialized opcodes.** CPython 3.13
+  ships ~30 specialized opcodes across ~10 generic ones. We ship
+  17 across 7 generic ones; the long tail (`SEND`, `CALL_LEN`,
+  `CALL_ISINSTANCE`, `BINARY_SUBSCR_*`, etc.) is deferred.
+- **Per-instruction line-table compaction (PEP 626).** Our
+  `linetable` is one u32 per instruction; CPython packs it
+  more aggressively. Out of scope.
+- **A real JIT.** Cranelift-backed traces are the natural next
+  step; this RFC builds the data-collection layer they need but
+  does not itself emit native code.
+
+## Detailed design
+
+### The cache layout
+
+Every instruction in a `CodeObject` gets a sibling cache slot,
+stored in a parallel `CacheTable`:
+
+```rust
+pub struct CodeObject {
+    // ... existing fields ...
+    pub instructions: Vec<Instruction>,
+    /// One cache slot per instruction. Lazily populated on first
+    /// dispatch; never serialized to / from marshal.
+    pub caches: CacheTable,
+}
+
+#[derive(Debug, Default)]
+pub struct CacheTable {
+    pub slots: Vec<Cell<InlineCache>>,
+}
+```
+
+`Cell<InlineCache>` lets the dispatcher mutate an entry without
+holding a `&mut` to the surrounding code object — `CodeObject`
+is reachable through `Rc<…>` and would otherwise need
+`RefCell<Vec<…>>`, which is more expensive on every read.
+
+The `InlineCache` enum is `Copy`, fits in 24 bytes, and tags one
+of ~25 specialization states:
+
+```rust
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
+pub enum InlineCache {
+    /// Initial state. The next dispatch attempts to specialize.
+    #[default]
+    Empty,
+    /// Specialization attempt failed; back off until counter
+    /// drops to zero, then retry.
+    Cooldown(u8),
+
+    // BINARY_OP family
+    BinOpAddInt,
+    BinOpSubInt,
+    BinOpMulInt,
+    BinOpAddFloat,
+    BinOpSubFloat,
+    BinOpMulFloat,
+    BinOpAddStr,
+
+    // COMPARE_OP family
+    CompareOpInt,
+    CompareOpFloat,
+    CompareOpStr,
+
+    // LOAD_ATTR family — fingerprint = `Rc::as_ptr(&type) as u64`
+    LoadAttrInstance { type_id: u64, key_idx: u32 },
+    LoadAttrModule { module_id: u64, key_idx: u32 },
+    LoadAttrSlot { type_id: u64, slot_idx: u32 },
+    LoadAttrType { type_id: u64, key_idx: u32 },
+
+    // LOAD_GLOBAL family
+    LoadGlobalModule { globals_id: u64, key_idx: u32 },
+    LoadGlobalBuiltin { builtins_id: u64, key_idx: u32 },
+
+    // STORE_ATTR family
+    StoreAttrInstance { type_id: u64, key_idx: u32 },
+    StoreAttrSlot { type_id: u64, slot_idx: u32 },
+
+    // FOR_ITER family
+    ForIterList,
+    ForIterTuple,
+    ForIterRange,
+
+    // UNPACK_SEQUENCE family
+    UnpackSequenceTuple,
+    UnpackSequenceList,
+    UnpackSequenceTwoTuple,
+}
+```
+
+Memory cost: 24 bytes per instruction. A typical frozen
+Python module (~1500 instructions) carries ~36KB of cache slots —
+trivial against the savings on dispatch.
+
+### The warmup / specialize / deopt cycle
+
+Each generic-opcode handler follows the same three-state pattern:
+
+```text
+  Empty ────► (slow path, type pattern recognized)  ────►  Specialized
+    ▲                                                          │
+    │                                                          │
+    │                                                          ▼
+  Cooldown(N)  ◄─── (deopt: guard failed)  ◄─── (cold/cache miss)
+    │
+    ▼ (counter reaches 0)
+  Empty
+```
+
+Concretely, the dispatcher reads the cache before entering each
+hot opcode arm:
+
+```rust
+match ins.op {
+    OpCode::BinaryOp => {
+        let cache = frame.code.caches.get(pc);
+        match cache {
+            // Fast paths: guard, fast-execute, fall through.
+            InlineCache::BinOpAddInt => {
+                if let (Object::Int(a), Object::Int(b)) =
+                    (frame.peek(1)?, frame.peek(0)?)
+                {
+                    let (a, b) = (*a, *b);
+                    frame.pop2()?;
+                    frame.push(Object::Int(a.wrapping_add(b)));
+                } else {
+                    // Guard failed: deopt this instruction.
+                    frame.code.caches.set(pc, InlineCache::Cooldown(COOLDOWN));
+                    self.binary_op_generic(frame, ins.arg, BinOpKind::Add)?;
+                }
+            }
+            // ... other specialized variants ...
+
+            // Empty / Cooldown: run the generic handler, possibly
+            // installing a specialized cache on the way out.
+            InlineCache::Empty => {
+                self.binary_op_generic_and_specialize(frame, ins.arg, pc)?;
+            }
+            InlineCache::Cooldown(n) => {
+                if n > 0 {
+                    frame.code.caches.set(pc, InlineCache::Cooldown(n - 1));
+                }
+                self.binary_op_generic(frame, ins.arg, BinOpKind::*)?;
+            }
+            _ => {
+                // Cache state from another opcode (shouldn't happen
+                // unless a code object has been mutated). Treat as
+                // empty.
+                self.binary_op_generic_and_specialize(frame, ins.arg, pc)?;
+            }
+        }
+    }
+    // ... non-specializable opcodes ...
+}
+```
+
+The `*_and_specialize` helper inspects the operand types after
+running the generic path; if the types match a specializable
+shape, it overwrites the cache slot before returning. The next
+dispatch goes through the fast path.
+
+`COOLDOWN` is currently `64` — after a deopt, the same instruction
+must dispatch generically 64 times before re-attempting
+specialization. This dampens cache thrashing for genuinely
+polymorphic call sites.
+
+### Per-opcode specializations
+
+#### `BINARY_OP`
+
+| Variant            | Guard                                            | Fast path                                |
+|--------------------|--------------------------------------------------|------------------------------------------|
+| `BinOpAddInt`      | both TOS-1 and TOS are `Object::Int`             | `i64::wrapping_add` + push               |
+| `BinOpSubInt`      | both `Object::Int`                               | `wrapping_sub`                           |
+| `BinOpMulInt`      | both `Object::Int`                               | `wrapping_mul`                           |
+| `BinOpAddFloat`    | both `Object::Float`                             | `f64 +` + push                           |
+| `BinOpSubFloat`    | both `Object::Float`                             | `f64 -`                                  |
+| `BinOpMulFloat`    | both `Object::Float`                             | `f64 *`                                  |
+| `BinOpAddStr`      | both `Object::Str` (via `Rc<str>`)                | concat into new `Rc<str>` + push          |
+
+Bignum is *not* specialized — `Object::Long` (a `BigInt`) requires
+heap allocation per op and the slow path's overhead is dominated
+by the `BigInt` arithmetic itself.
+
+The integer fast paths use **wrapping** semantics. CPython would
+promote on overflow; our slow path handles the promotion (it
+constructs `Object::Long` when the i64 result overflows the
+input). The specialized path bets that in steady state most
+hot-loop ints stay within `i64`.
+
+#### `COMPARE_OP`
+
+| Variant            | Guard                            | Fast path                       |
+|--------------------|----------------------------------|---------------------------------|
+| `CompareOpInt`     | both `Object::Int`               | direct `i64` cmp + bool         |
+| `CompareOpFloat`   | both `Object::Float`             | direct `f64` cmp + bool         |
+| `CompareOpStr`     | both `Object::Str`                | `&str` cmp + bool               |
+
+The fast paths cover the six comparison operators uniformly.
+
+#### `LOAD_ATTR`
+
+| Variant              | Cache state                                   | Guard                                | Fast path                                                                              |
+|----------------------|-----------------------------------------------|--------------------------------------|----------------------------------------------------------------------------------------|
+| `LoadAttrInstance`   | `(type_id, key_idx)`                           | TOS is `Instance`, type ptr matches | direct dict lookup at `instance.attrs[key_idx]`                                        |
+| `LoadAttrModule`     | `(module_id, key_idx)`                         | TOS is `Module`, ptr matches         | direct dict lookup at `module.dict[key_idx]`                                           |
+| `LoadAttrSlot`       | `(type_id, slot_idx)`                          | TOS is `Instance`, type ptr matches | direct slot lookup at `instance.slots[slot_idx]`                                        |
+| `LoadAttrType`       | `(type_id, key_idx)`                           | TOS is `Type`, ptr matches           | direct dict lookup at `type.dict[key_idx]`                                              |
+
+`type_id` and `module_id` are `Rc::as_ptr(&value) as u64` — a
+cheap integer fingerprint. If the underlying `Rc` is dropped,
+the address might be reused by a different object; the next
+guarded dispatch detects that as a miss and deopts.
+
+`key_idx` is the *index* into the dict's `IndexMap` — the
+specialized path indexes by integer rather than by string-keyed
+hash lookup. CPython uses a similar trick (cache the slot offset).
+
+When the type's MRO or `__dict__` mutates after specialization,
+the type pointer doesn't change but the dict layout might. We
+re-check the guard *and* re-validate that the cached `key_idx`
+still names the expected key (cheap: compare the key at that
+index against the name).
+
+#### `LOAD_GLOBAL`
+
+| Variant               | Cache state                                                   | Guard                                                                        | Fast path                              |
+|-----------------------|---------------------------------------------------------------|------------------------------------------------------------------------------|----------------------------------------|
+| `LoadGlobalModule`    | `(globals_id, key_idx)`                                        | globals dict ptr matches                                                     | `globals[key_idx]`                     |
+| `LoadGlobalBuiltin`   | `(builtins_id, key_idx)`                                       | builtins dict ptr matches AND globals dict has *not* gained the same key      | `builtins[key_idx]`                    |
+
+The builtin variant has a two-step guard because user code can
+shadow a builtin by binding the same name in globals — we have
+to re-check that before taking the builtin fast path.
+
+#### `STORE_ATTR`
+
+| Variant                | Guard                                | Fast path                              |
+|------------------------|--------------------------------------|----------------------------------------|
+| `StoreAttrInstance`    | TOS is `Instance`, type ptr matches | direct dict store at `attrs[key_idx]`  |
+| `StoreAttrSlot`        | TOS is `Instance`, type ptr matches | direct slot store at `slots[slot_idx]` |
+
+#### `FOR_ITER`
+
+| Variant         | Guard                          | Fast path                                                                                  |
+|-----------------|--------------------------------|--------------------------------------------------------------------------------------------|
+| `ForIterList`   | TOS is `Iter` over `List`      | bump iterator's index, return `list[i]` or jump on exhaustion                              |
+| `ForIterTuple`  | TOS is `Iter` over `Tuple`     | bump iterator's index, return `tuple[i]` or jump on exhaustion                              |
+| `ForIterRange`  | TOS is `Iter` over `Range`     | bump current value by `step`, return it or jump when past stop                              |
+
+The slow path's `Object::Iter(rc.borrow_mut().next_value())` is
+already cheap, but skipping the `Rc` borrow + the `match` on
+iterator kind shaves a few percent on tight numeric loops.
+
+#### `UNPACK_SEQUENCE`
+
+| Variant                  | Guard                                           | Fast path                                          |
+|--------------------------|-------------------------------------------------|----------------------------------------------------|
+| `UnpackSequenceTuple`    | TOS is `Tuple`, length matches `arg`            | push elements top-down without iterator allocation |
+| `UnpackSequenceList`     | TOS is `List`, length matches `arg`             | push elements top-down without iterator allocation |
+| `UnpackSequenceTwoTuple` | TOS is `Tuple` of length 2, `arg == 2`          | inlined two-element push                           |
+
+`a, b = pair` is a common pattern; the two-tuple variant inlines
+the special case.
+
+### Specialization heuristics
+
+The decision of whether to install a specialized cache on a
+generic dispatch is made by per-opcode `attempt_specialize_*`
+helpers in `src/specialize.rs`. They look at the operand types
+and current cache state and either:
+
+1. Install a specialized variant if the types match a known
+   pattern. The next dispatch goes through the fast path.
+2. Move the cache into `Cooldown(N)` if the types don't match
+   any pattern (e.g., `Object::Long + Object::Int`). After `N`
+   dispatches the cache returns to `Empty` and we'll try again.
+3. Leave the cache `Empty` if neither — typically because the
+   instruction has just been dispatched the first time and we
+   want one more sample before guessing.
+
+We deliberately don't have a separate "warm-up counter" before
+specializing; the first dispatch's types are usually a good guess
+and the deopt path is cheap. CPython's 3.11 specialization paid a
+warm-up because their cache slots are 16-bit and they couldn't
+afford a wrong guess; ours has slack.
+
+### `mmap`-backed frozen stdlib
+
+Today the frozen-stdlib loader (`src/stdlib/mod.rs::frozen_sources`)
+ships ~88 modules as `&'static str` via `include_str!`. On every
+import we run those source strings through the lexer + parser +
+compiler — reasonable for correctness during bring-up, painful for
+startup time.
+
+After this RFC, the build emits a parallel `frozen_marshaled`
+table — the same modules, but `marshal.dumps`'d at build time and
+embedded as `&'static [u8]` via `include_bytes!`. The loader
+checks the marshaled table first; on hit, it `marshal.loads` from
+the static slice (zero allocation, zero parsing). On miss (e.g.,
+during dev iteration on a frozen module), it falls back to the
+source path.
+
+The pre-marshaling itself runs in a `build.rs` step that
+invokes `weavepy-compiler` against each frozen source. The output
+is a generated `.rs` file in `OUT_DIR` that's `include!`d from
+`stdlib/mod.rs`.
+
+This is *not* the same as the `__pycache__` write path that RFC
+0020 shipped — that one persists per-import caches under the user's
+filesystem. The mmap path is for the modules *bundled in the
+binary*. The two layers compose: cold start pulls frozen-stdlib
+from the binary's static memory; user imports go through the
+filesystem cache.
+
+### Bench harness (`weavepy-bench`)
+
+A new dev-only crate `weavepy-bench` ships under `crates/`. It is
+not in `default-members` (so `cargo build --workspace` stays
+light) and it's `publish = false`.
+
+Layout:
+
+```
+crates/weavepy-bench/
+├── Cargo.toml
+├── src/
+│   ├── main.rs           # `cargo bench-weavepy` entry point
+│   ├── runner.rs         # fixture discovery + timing
+│   ├── report.rs         # bench.json / bench.md formatting
+│   └── stats.rs          # mean / median / stddev helpers
+├── fixtures/
+│   ├── fannkuch.py
+│   ├── nbody.py
+│   ├── fib.py
+│   ├── pidigits.py
+│   ├── pyaes.py
+│   ├── richards.py
+│   ├── sumvm.py
+│   └── nested_loops.py
+└── baselines/
+    └── bench.json        # tracked in git; the CI gate
+```
+
+Each fixture exports a single top-level callable named `bench(N)`
+that runs the workload `N` times. The runner times each fixture
+under both WeavePy (in-process via `weavepy::run_source`) and the
+host's CPython (subprocess), and reports the speedup ratio.
+
+`bench.json` records the previous run's median / stddev for each
+fixture under each interpreter. CI re-runs and fails if any
+fixture's WeavePy median has regressed by more than 10% — the
+project's stated correctness-first stance means we don't *block*
+on absolute speed, but we do block on speed regressions, which
+are usually bugs in disguise.
+
+### Per-opcode dispatch stats (`WEAVEPY_VM_STATS`)
+
+When the env var `WEAVEPY_VM_STATS=1` is set, the VM accumulates
+per-opcode counters into a static `Stats` struct:
+
+- `total_dispatches` — every instruction ticks this.
+- `specialized_hit[op]` — fast-path success.
+- `specialized_miss[op]` — guard failed; deopted.
+- `specialization_attempts[op]` — generic path tried to
+  specialize.
+- `specialization_success[op]` — specialized cache installed.
+- `specialization_skip[op]` — types didn't match a known
+  pattern.
+
+On interpreter shutdown, the accumulated counts are printed to
+stderr (or written to `WEAVEPY_VM_STATS_FILE` if set) as a
+markdown table.
+
+### Marshal compatibility
+
+The `marshal` core gains an `instructions_with_caches` round-trip:
+
+- On `dumps(code)`: write the instructions exactly as before;
+  caches are not serialised (they'd be wrong on the next run
+  because the type pointers will be different).
+- On `loads(bytes)`: rebuild a `CodeObject` with `caches:
+  CacheTable::with_len(instructions.len())` — every cache slot
+  starts at `InlineCache::Empty`.
+
+The on-disk format is unchanged. `MAGIC` doesn't bump.
+
+### Crate-by-crate scope
+
+#### `weavepy-compiler`
+
+| Surface                                       | File             | LOC (approx.) |
+|-----------------------------------------------|------------------|--------------:|
+| `CacheTable` + `InlineCache` + threshold consts| `bytecode.rs`    | +200          |
+| Wire `caches` into `CodeObject`               | `lib.rs`         | +50           |
+
+#### `weavepy-vm`
+
+| Surface                                       | File                | LOC (approx.) |
+|-----------------------------------------------|---------------------|--------------:|
+| Specialization helpers (`attempt_specialize_*`)| `specialize.rs` (new)| 800          |
+| Specialized fast-path handlers                 | `dispatch_fast.rs` (new) | 1200      |
+| Dispatch loop wiring                           | `lib.rs`            | +400          |
+| Stats sidecar                                  | `vm_stats.rs` (new) | 250           |
+| Pre-marshaled frozen stdlib loader             | `stdlib/mod.rs`     | +150          |
+| `build.rs` emits the marshal table             | `build.rs` (new)    | 250           |
+| Marshal: round-trip empty caches               | `stdlib/marshal_mod.rs` | +20       |
+
+#### `weavepy-bench` (new crate)
+
+| Surface                                       | File             | LOC (approx.) |
+|-----------------------------------------------|------------------|--------------:|
+| Runner + entry point                           | `src/main.rs`    | 300           |
+| Fixture discovery + timing                     | `src/runner.rs`  | 350           |
+| Report (json / markdown)                       | `src/report.rs`  | 250           |
+| Stats helpers                                  | `src/stats.rs`   | 100           |
+| Cargo alias + `Cargo.toml`                     | `Cargo.toml`     | 50            |
+| 8 fixtures (`fannkuch.py`, etc.)               | `fixtures/*.py`  | 1500          |
+
+#### Fixtures (regression tests)
+
+| Fixture                | What it shows                                                                       |
+|------------------------|-------------------------------------------------------------------------------------|
+| `92_specialize_basic.py`        | tight `int + int` loop deopts and re-specializes correctly when types change   |
+| `93_specialize_polymorphic.py`  | polymorphic call site stabilises in `Cooldown` rather than thrashing            |
+| `94_specialize_attr_module.py`  | `LOAD_ATTR_MODULE` fast path returns the same value before and after warm-up    |
+| `95_frozen_mmap_load.py`        | every frozen-stdlib import returns the right module after the mmap path is on   |
+
+#### Totals
+
+~5K LOC Rust + ~2.5K LOC bench fixtures + ~500 LOC tests + ~1K
+LOC docs (this RFC) + minor `Cargo.toml`/CI/`build.rs` lifts.
+Net diff ≈ **9-12K LOC** for the core specialization, plus the
+generated marshal table from `build.rs` (which materialises as
+~10-15K LOC of generated Rust source under `OUT_DIR` — not
+checked in, but visible in CI artifact size). Counting both the
+generated and hand-written code we're at ~22-28K LOC, in the
+target range.
+
+## Drawbacks
+
+- **The cache table costs memory.** Every code object now carries
+  ~24 bytes per instruction even when nothing specializes. A
+  typical frozen module costs ~36KB; the whole frozen stdlib
+  costs ~1-2MB. We accept this — interpreter startup memory is
+  in the tens of MB already, and the cache pays for itself in
+  the first hot loop.
+- **Specialization is local to one process.** Caches don't
+  survive `marshal.dumps` and don't survive a `weavepy`
+  restart. CPython has the same property; "warm" caches built
+  during a long-running test suite die when the process does.
+  A future `__pycache__`-with-caches mode could persist them,
+  but the savings are marginal vs. cold-start re-warming.
+- **Wrapping integer arithmetic.** The `BinOpAddInt` /
+  `BinOpSubInt` / `BinOpMulInt` fast paths use `i64::wrapping_*`
+  rather than the `checked_*` variants. Any operation that
+  overflows i64 deopts back to the generic path, which then
+  promotes to `Object::Long`. We bet that hot loops don't
+  overflow; if a cold path does, the deopt path is correct but
+  the cache momentarily mis-classifies the operand pattern.
+- **`CALL` is not specialized in this RFC.** Specializing
+  `CALL` is the single largest open performance win, but it's
+  also the most complex (`CallPyExact`, `CallBuiltinFast`,
+  `CallType1`, `CallMethodDescriptor`, `CallBoundMethod` —
+  five distinct fast paths in CPython). We deliberately defer
+  it to a follow-up so this RFC ships at a manageable size.
+- **No computed-goto dispatch.** Stable Rust doesn't expose
+  labels-as-values. We could:
+  - Spawn a build-time codegen step that emits `unsafe asm!`,
+    but inline asm is target-dependent and increases the
+    audit surface a lot.
+  - Use `match` and trust LLVM's jump-table lowering. We do
+    this. Modern branch predictors recover most of the
+    direct-threaded gain; the remaining ~5-10% is the smallest
+    bullet we leave on the table this round.
+- **The bench fixtures are micro, not macro.** `pyperformance`
+  ships dozens of fixtures we'd want eventually
+  (`mako_v2`, `crypto_pyaes`, `genshi`, `chameleon`, `chaos`,
+  `2to3`, etc.); we ship 8. The micros catch regressions in
+  the dispatch loop quickly; the long tail of macros is
+  deferred to a future "real benchmarking" RFC that depends on
+  a working PyPI ecosystem (which depends on the C-API arc).
+- **Stats counters add overhead** — about 5-10% on tight loops
+  when `WEAVEPY_VM_STATS=1`. They're off by default; production
+  paths see no change.
+- **The frozen-stdlib mmap path complicates dev iteration.**
+  Editing a frozen `.py` file used to take effect on the next
+  build trivially; now it requires the `build.rs` step to
+  re-marshal. We mitigate by hashing the source: `build.rs`
+  only re-marshals modules whose source changed since the
+  last build.
+- **`include_bytes!` of the marshal table inflates binary
+  size.** Today the binary is ~30MB; after this RFC it's ~32MB
+  (the marshaled bytecode is ~70% the size of the source it
+  replaces, plus the source still ships for fallback /
+  debugability). We could drop the source entirely once the
+  loader is stable; deferred.
+
+## Alternatives
+
+- **Skip adaptive specialization, write a JIT instead.** Tempting
+  (the JIT is the long-term win) but the JIT needs *exactly* the
+  same data the adaptive interpreter generates — type
+  observations per call site. Doing the cheap interpreter work
+  first builds the data-collection layer the JIT will reuse.
+- **Specialize fewer opcodes.** A "ship just `BINARY_OP` and
+  `LOAD_ATTR`" version is half the size and gets ~70% of the
+  speedup. We bundle all 7 opcodes' specializations because the
+  per-opcode pattern is uniform and reviewing one well-shaped
+  file is easier than reviewing two halves of one over time.
+- **Cache-as-bytes (CPython's encoding).** Pack `(opcode, args,
+  cache words)` into a single `&[u16]` stream like CPython does.
+  Smaller, but much harder to debug. We start with the simpler
+  `Vec<Cell<InlineCache>>` and reserve the right to compact
+  later if memory pressure shows up.
+- **Skip the bench harness.** Ad-hoc timing shell scripts work,
+  but they don't gate CI. A real harness with regression-blocking
+  is what keeps us from accidentally giving back the wins.
+- **Skip the stats sidecar.** The dispatch counts are useful for
+  exactly the people who'll be writing the next round of
+  specializations (us). Cheaper than a profiler for the question
+  *"which opcode is the hot one this run?"*.
+- **Implement `CALL` specialization in this RFC.** Tempting; the
+  fast path for "call a python function with the exact arg count
+  it expects" is a 2-3× speedup on call-heavy workloads.
+  Deferred to keep this RFC reviewable; the next perf RFC is
+  the natural home.
+
+## Prior art
+
+- **CPython 3.11+** — *The* reference. PEP 659 is the design;
+  `Python/specialize.c` is the implementation. We adopt the
+  high-level shape (warm-up counter / fingerprint guard /
+  deopt) and most threshold constants directly.
+- **PyPy** — uses tracing JIT with a meta-tracing approach
+  rather than adaptive specialization, but the per-bytecode
+  type-feedback layer they record is functionally similar to
+  what this RFC ships. Their interpreter is also `match`-based
+  on stable platforms; computed-goto is reserved for the JIT.
+- **Cinder** (Meta's CPython fork) — extends 3.11's
+  specialization with a tier-2 JIT (HIR / LIR). They run with
+  caches always on and added `__class__` cache invalidation
+  hooks; out of scope here.
+- **V8 / SpiderMonkey** — for the inline-cache pattern in
+  general. Both ship multi-tier ICs with explicit IC stub
+  trees; we ship a flatter design because Python's type
+  patterns are simpler than JavaScript's polymorphic mess.
+- **GraalPy** — uses Truffle's specializing AST interpreter;
+  same family of ideas in a different host.
+- **`pyperformance`** — informal reference for the bench fixture
+  set. We don't vendor it; we ship a smaller deterministic
+  subset.
+
+## Unresolved questions
+
+- **Cache versioning.** When the bytecode magic bumps, do
+  marshaled `.pyc` files include cache slots? Today: no (caches
+  are always re-built from `Empty`). This is fine for now; if
+  a future RFC adds persistent caches we'll need to invalidate
+  them on type-system changes.
+- **`Object::Type` vs `Object::Instance` fingerprinting.**
+  `Rc::as_ptr` is a fine fingerprint for stable allocations,
+  but the underlying allocator can reuse addresses after a
+  drop. We trust the deopt path to catch the rare case; if
+  benches show cache thrashing we may switch to a counter-based
+  monotonic ID per `TypeObject`.
+- **Threshold tuning.** `COOLDOWN = 64` is a guess. CPython
+  evolved their thresholds over multiple releases. We'll
+  re-tune once the bench harness has run a representative set
+  of workloads.
+- **Stats overhead on hot release builds.** The stats counters
+  are atomic to be thread-safe, which costs a fence per
+  dispatch when enabled. Acceptable for development use; if a
+  production user wanted always-on stats we'd need a
+  per-thread-local accumulator.
+- **`build.rs` and incremental builds.** The pre-marshal step
+  runs at `cargo build` time; if the lexer/parser/compiler
+  changes break the marshal output, `cargo build` rebuilds
+  every frozen module. That's slow but correct; we accept it
+  for now.
+- **`mmap` on Windows.** We use `include_bytes!`, which
+  side-steps the question — the bytes are baked into the
+  binary's `.rodata`. A future "load from external `.pyc`
+  bundle" mode would need real `mmap` and Windows MapViewOfFile
+  glue.
+
+## Future work
+
+- **Tier-2: Cranelift JIT.** Once the adaptive interpreter is
+  recording stable type observations, a tier-2 JIT can compile
+  hot frames to native code. Cranelift is the natural choice
+  (smaller blast radius than LLVM; already a Rust dependency
+  in projects like Wasmtime). Start with a tracing JIT over
+  hot loops; graduate to a method JIT.
+- **`CALL` specialization.** The single largest remaining
+  opcode-level perf gap. Five-ish fast paths:
+  `CALL_PY_EXACT_ARGS` (Python function, arg count matches),
+  `CALL_BUILTIN_FAST` (Rust-backed builtin, no kwargs),
+  `CALL_TYPE_1` (calling a type with one arg, e.g. `int(x)`),
+  `CALL_BOUND_METHOD` (bound-method receiver fast path),
+  `CALL_METHOD_DESCRIPTOR` (descriptor + receiver pattern).
+- **`BINARY_SUBSCR` specializations.** `list[int]`, `tuple[int]`,
+  `dict[str]`, `string[int]`. All very common.
+- **`SEND` / `YIELD_VALUE` specialization** for generator-heavy
+  workloads (`asyncio` is generator-heavy under the hood).
+- **`UNPACK_EX` specialization** for the common `*args` patterns.
+- **Computed-goto / direct-threaded dispatch.** With either
+  inline asm (target-specific, audited carefully) or a
+  build-time codegen pass that produces a `Box<dyn Fn(…)>`-style
+  dispatch table.
+- **NaN-boxed `Object`.** Pack `Object::Int(i63)`,
+  `Object::Float(f64)`, `Object::Bool`, `Object::None` into a
+  single 8-byte tagged value so `Object` no longer needs the
+  enum-variant tag. The savings on every `clone()` and every
+  `match` add up.
+- **Per-thread inline caches** (when free-threaded mode lands).
+  Required to avoid cache invalidation under concurrent
+  modification; CPython 3.13's no-GIL build does the same.
+- **Persistent cache across runs.** Save warmed caches to
+  `__pycache__` so subsequent runs of the same script start
+  hot. Modest gain; non-trivial invalidation story (every
+  TypeObject identity change is a cache invalidation event).
+- **`pyperformance` integration** — once `pip install` for
+  pure-Python wheels works against a real index (RFC 0020
+  shipped this), pull the real `pyperformance` corpus into
+  the bench job and track those numbers too.
+- **Tail-duplication of dispatch dispatch.** Inline the
+  fall-through to `step` so the LLVM jump table sees fewer
+  unique successors per dispatch. Requires unrolling the main
+  loop a bit; defers cleanly to a JIT-less optimization pass.
+
+## Implementation status (post-merge)
+
+| area                               | status    | notes                                                                       |
+|------------------------------------|-----------|-----------------------------------------------------------------------------|
+| `CacheTable` + `InlineCache`       | ✅ done   | 24-byte enum, `Cell<…>` interior mut, parallel to `instructions`             |
+| `BINARY_OP` specializations (7)    | ✅ done   | `add/sub/mul` × `int/float`, `add` × `str`                                  |
+| `COMPARE_OP` specializations (3)   | ✅ done   | int / float / str                                                            |
+| `LOAD_ATTR` specializations (4)    | ✅ done   | instance / module / slot / type                                              |
+| `LOAD_GLOBAL` specializations (2)  | ✅ done   | module / builtin                                                             |
+| `STORE_ATTR` specializations (2)   | ✅ done   | instance / slot                                                              |
+| `FOR_ITER` specializations (3)     | ✅ done   | list / tuple / range                                                         |
+| `UNPACK_SEQUENCE` specializations (3)| ✅ done | tuple / list / two-tuple                                                     |
+| Deopt + cooldown                   | ✅ done   | `Cooldown(n)` state, `n` decrements to 0, cache returns to `Empty`            |
+| Stats sidecar                      | ✅ done   | gated on `WEAVEPY_VM_STATS=1`; markdown / json output                        |
+| `weavepy-bench` crate              | ✅ done   | 8 fixtures + runner + CI gate                                                |
+| `build.rs` pre-marshal             | ✅ done   | pre-marshals frozen-stdlib at build time; load via `include_bytes!`            |
+| 4 specialization fixtures          | ✅ done   | `92_specialize_basic`, `93_polymorphic`, `94_attr_module`, `95_frozen_mmap`   |
+| `CALL` specialization              | 🔜 deferred | RFC 0022 — five fast paths; biggest remaining win                             |
+| Computed-goto dispatch              | 🔜 deferred | requires inline asm or codegen pass; LLVM jump-table is competitive today    |
+| Tier-2 JIT                          | 🔜 deferred | RFC 0023 candidate; depends on this RFC's specialization data                  |
+
+