diff --git a/Cargo.lock b/Cargo.lock index 997e604..0d17758 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -89,9 +89,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.102" +version = "1.0.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" +checksum = "2a4385e2e34eb35d6b3efe798b9eb88096925d87726c0798709bf56d9ed84af3" [[package]] name = "arc-swap" @@ -694,6 +694,12 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "bitflags" version = "2.13.0" @@ -729,13 +735,13 @@ dependencies = [ [[package]] name = "bstr" -version = "1.12.1" +version = "1.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab" +checksum = "5cee35f73844aa3014bb606320a6c1f010249dbdf43342fe54b5a4f6a8ed4b79" dependencies = [ "memchr", "regex-automata", - "serde", + "serde_core", ] [[package]] @@ -830,9 +836,9 @@ checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" [[package]] name = "chacha20" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601" +checksum = "d524456ba66e72eb8b115ff89e01e497f8e6d11d78b70b1aa13c0fbd97540a81" dependencies = [ "cfg-if", "cpufeatures 0.3.0", @@ -1138,6 +1144,38 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7eed2c4702fa172d1ce21078faa7c5203e69f5394d48cc436d25928394a867a2" +[[package]] +name = "defmt" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6e524506490a1953d237cb87b1cfc1e46f88c18f10a22dfe0f507dc6bfc7f7f" +dependencies = [ + "bitflags 1.3.2", + "defmt-macros", +] + +[[package]] +name = "defmt-macros" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0a27770e9c8f719a79d8b638281f4d828f77d8fd61e0bd94451b9b85e576a0b" +dependencies = [ + "defmt-parser", + "proc-macro-error2", + "proc-macro2", + "quote", + "syn 2.0.118", +] + +[[package]] +name = "defmt-parser" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10d60334b3b2e7c9d91ef8150abfb6fa4c1c39ebbcf4a81c2e346aad939fee3e" +dependencies = [ + "thiserror 2.0.18", +] + [[package]] name = "delegate" version = "0.10.0" @@ -1543,10 +1581,11 @@ dependencies = [ [[package]] name = "graphify" -version = "0.8.44" +version = "0.8.49" dependencies = [ "anyhow", "assert_cmd", + "chrono", "clap", "graphify-affected", "graphify-analyze", @@ -1565,6 +1604,7 @@ dependencies = [ "graphify-ingest", "graphify-llm", "graphify-prs", + "graphify-reflect", "graphify-report", "graphify-security", "graphify-serve", @@ -1581,7 +1621,7 @@ dependencies = [ [[package]] name = "graphify-affected" -version = "0.8.44" +version = "0.8.49" dependencies = [ "graphify-build", "graphify-security", @@ -1594,7 +1634,7 @@ dependencies = [ [[package]] name = "graphify-analyze" -version = "0.8.44" +version = "0.8.49" dependencies = [ "graphify-build", "graphify-cluster", @@ -1606,7 +1646,7 @@ dependencies = [ [[package]] name = "graphify-benchmark" -version = "0.8.44" +version = "0.8.49" dependencies = [ "graphify-build", "graphify-security", @@ -1619,7 +1659,7 @@ dependencies = [ [[package]] name = "graphify-build" -version = "0.8.44" +version = "0.8.49" dependencies = [ "caseless", "graphify-security", @@ -1636,8 +1676,9 @@ dependencies = [ [[package]] name = "graphify-cache" -version = "0.8.44" +version = "0.8.49" dependencies = [ + "graphify-security", "hex", "indexmap", "serde", @@ -1651,7 +1692,7 @@ dependencies = [ [[package]] name = "graphify-cluster" -version = "0.8.44" +version = "0.8.49" dependencies = [ "graphify-build", "indexmap", @@ -1664,7 +1705,7 @@ dependencies = [ [[package]] name = "graphify-dedup" -version = "0.8.44" +version = "0.8.49" dependencies = [ "caseless", "indexmap", @@ -1679,10 +1720,11 @@ dependencies = [ [[package]] name = "graphify-detect" -version = "0.8.44" +version = "0.8.49" dependencies = [ "calamine", "graphify-google", + "graphify-security", "hex", "ignore", "indexmap", @@ -1693,6 +1735,7 @@ dependencies = [ "regex", "serde", "serde_json", + "serial_test", "sha2 0.11.0", "shlex", "tempfile", @@ -1703,7 +1746,7 @@ dependencies = [ [[package]] name = "graphify-diagnostics" -version = "0.8.44" +version = "0.8.49" dependencies = [ "graphify-build", "graphify-security", @@ -1717,7 +1760,7 @@ dependencies = [ [[package]] name = "graphify-export" -version = "0.8.44" +version = "0.8.49" dependencies = [ "chrono", "graphify-build", @@ -1741,7 +1784,7 @@ dependencies = [ [[package]] name = "graphify-extract" -version = "0.8.44" +version = "0.8.49" dependencies = [ "flate2", "glob", @@ -1796,7 +1839,7 @@ dependencies = [ [[package]] name = "graphify-global" -version = "0.8.44" +version = "0.8.49" dependencies = [ "chrono", "graphify-build", @@ -1812,7 +1855,7 @@ dependencies = [ [[package]] name = "graphify-google" -version = "0.8.44" +version = "0.8.49" dependencies = [ "hex", "regex", @@ -1825,7 +1868,7 @@ dependencies = [ [[package]] name = "graphify-hooks" -version = "0.8.44" +version = "0.8.49" dependencies = [ "regex", "serde_json", @@ -1838,7 +1881,7 @@ dependencies = [ [[package]] name = "graphify-html" -version = "0.8.44" +version = "0.8.49" dependencies = [ "chrono", "graphify-build", @@ -1855,7 +1898,7 @@ dependencies = [ [[package]] name = "graphify-ingest" -version = "0.8.44" +version = "0.8.49" dependencies = [ "chrono", "graphify-security", @@ -1873,7 +1916,7 @@ dependencies = [ [[package]] name = "graphify-llm" -version = "0.8.44" +version = "0.8.49" dependencies = [ "aws-config", "aws-sdk-bedrockruntime", @@ -1900,14 +1943,14 @@ dependencies = [ [[package]] name = "graphify-manifest" -version = "0.8.44" +version = "0.8.49" dependencies = [ "graphify-detect", ] [[package]] name = "graphify-multigraph-compat" -version = "0.8.44" +version = "0.8.49" dependencies = [ "graphify-build", "indexmap", @@ -1918,7 +1961,7 @@ dependencies = [ [[package]] name = "graphify-prs" -version = "0.8.44" +version = "0.8.49" dependencies = [ "chrono", "graphify-security", @@ -1929,9 +1972,22 @@ dependencies = [ "thiserror 2.0.18", ] +[[package]] +name = "graphify-reflect" +version = "0.8.49" +dependencies = [ + "chrono", + "graphify-ingest", + "indexmap", + "regex", + "serde_json", + "tempfile", + "thiserror 2.0.18", +] + [[package]] name = "graphify-report" -version = "0.8.44" +version = "0.8.49" dependencies = [ "chrono", "graphify-analyze", @@ -1945,7 +2001,7 @@ dependencies = [ [[package]] name = "graphify-scip" -version = "0.8.44" +version = "0.8.49" dependencies = [ "graphify-security", "hex", @@ -1960,12 +2016,13 @@ dependencies = [ [[package]] name = "graphify-security" -version = "0.8.44" +version = "0.8.49" dependencies = [ "ipnet", "mockito", "regex", "serde_json", + "serial_test", "tempfile", "thiserror 2.0.18", "ureq", @@ -1974,7 +2031,7 @@ dependencies = [ [[package]] name = "graphify-semantic" -version = "0.8.44" +version = "0.8.49" dependencies = [ "indexmap", "regex", @@ -1986,7 +2043,7 @@ dependencies = [ [[package]] name = "graphify-serve" -version = "0.8.44" +version = "0.8.49" dependencies = [ "axum", "chrono", @@ -2007,7 +2064,7 @@ dependencies = [ [[package]] name = "graphify-transcribe" -version = "0.8.44" +version = "0.8.49" dependencies = [ "graphify-security", "hex", @@ -2020,7 +2077,7 @@ dependencies = [ [[package]] name = "graphify-validate" -version = "0.8.44" +version = "0.8.49" dependencies = [ "serde_json", "thiserror 2.0.18", @@ -2028,7 +2085,7 @@ dependencies = [ [[package]] name = "graphify-watch" -version = "0.8.44" +version = "0.8.49" dependencies = [ "graphify-analyze", "graphify-build", @@ -2050,7 +2107,7 @@ dependencies = [ [[package]] name = "graphify-wiki" -version = "0.8.44" +version = "0.8.49" dependencies = [ "graphify-build", "indexmap", @@ -2443,7 +2500,7 @@ version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "533e68a5842e734946fe159fb03fc9bbbb254f590dd0d8ad321ae5ff7beca2c1" dependencies = [ - "bitflags", + "bitflags 2.13.0", "inotify-sys", "libc", ] @@ -2496,10 +2553,11 @@ checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" [[package]] name = "jiff" -version = "0.2.28" +version = "0.2.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4603d3033e49e2b0e31229fcab20a5d40089c607d975cd9c80551dc69eed9102" +checksum = "34f877a98676d2fb664698d74cc6a51ce6c484ce8c770f05d0108ec9090aeb46" dependencies = [ + "defmt", "jiff-static", "jiff-tzdb-platform", "log", @@ -2511,9 +2569,9 @@ dependencies = [ [[package]] name = "jiff-static" -version = "0.2.28" +version = "0.2.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "782d32378dddf207193ac91cefb848ad41abb58195c95168e1291227a0832b47" +checksum = "0666b5ab5ecaca213fc2a85b8c0083d9004e84ee2d5f9a7e0017aaf50986f25f" dependencies = [ "proc-macro2", "quote", @@ -2589,9 +2647,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.102" +version = "0.3.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03d04c30968dffe80775bd4d7fb676131cd04a1fb46d2686dbffbaec2d9dfd31" +checksum = "53b44bfcdb3f8d5837a46dae1ca9660a837176eee74a28b229bc626816589102" dependencies = [ "cfg-if", "futures-util", @@ -2614,7 +2672,7 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07293a4e297ac234359b510362495713f75ea345d5307140414f20c69ffeb087" dependencies = [ - "bitflags", + "bitflags 2.13.0", "libc", ] @@ -2680,18 +2738,18 @@ dependencies = [ [[package]] name = "log" -version = "0.4.32" +version = "0.4.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "953f07c43838f8e6f9758cab68bf5bed85465e7587ebe0b823f1bcd81978ad3a" +checksum = "0ceec5bc11778974d1bcb055b18002eba7f4b3518b6a0081b3af5f21666da9ad" [[package]] name = "lopdf" -version = "0.41.0" +version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67513274c50a2b51e5f75d9e682fcf4ab064a8a9c9ae2c3c59309084882bb24d" +checksum = "25aab26d99567469098e64a02f42679f8965c6401263eefa31d8f2dcc37a221c" dependencies = [ "aes", - "bitflags", + "bitflags 2.13.0", "cbc", "chrono", "ecb", @@ -2929,7 +2987,7 @@ version = "8.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4d3d07927151ff8575b7087f245456e549fea62edf0ec4e565a5ee50c8402bc3" dependencies = [ - "bitflags", + "bitflags 2.13.0", "fsevent-sys", "inotify", "kqueue", @@ -2960,7 +3018,7 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42b8cfee0e339a0337359f3c88165702ac6e600dc01c0cc9579a92d62b08477a" dependencies = [ - "bitflags", + "bitflags 2.13.0", ] [[package]] @@ -3022,7 +3080,7 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a180dd8642fa45cdb7dd721cd4c11b1cadd4929ce112ebd8b9f5803cc79d536" dependencies = [ - "bitflags", + "bitflags 2.13.0", ] [[package]] @@ -3052,7 +3110,7 @@ version = "0.10.81" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77823a27f0babb03091cb9ed9ef80af3b39dbc82f97e8fa530374b7dafd87a45" dependencies = [ - "bitflags", + "bitflags 2.13.0", "cfg-if", "foreign-types", "libc", @@ -3356,6 +3414,28 @@ dependencies = [ "termtree", ] +[[package]] +name = "proc-macro-error-attr2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96de42df36bb9bba5542fe9f1a054b8cc87e172759a1868aa05c1f3acc89dfc5" +dependencies = [ + "proc-macro2", + "quote", +] + +[[package]] +name = "proc-macro-error2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11ec05c52be0a07b08061f7dd003e7d7092e0472bc731b4af7bb1ef876109802" +dependencies = [ + "proc-macro-error-attr2", + "proc-macro2", + "quote", + "syn 2.0.118", +] + [[package]] name = "proc-macro2" version = "1.0.106" @@ -3386,9 +3466,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.45" +version = "1.0.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +checksum = "dfbc457d0c7a0759a614551b11a6409e5951f6c7537be1f1b7682b9ae9230368" dependencies = [ "proc-macro2", ] @@ -3531,7 +3611,7 @@ version = "0.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ - "bitflags", + "bitflags 2.13.0", ] [[package]] @@ -3621,7 +3701,7 @@ version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" dependencies = [ - "bitflags", + "bitflags 2.13.0", "errno", "libc", "linux-raw-sys", @@ -3630,9 +3710,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.40" +version = "0.23.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef86cd5876211988985292b91c96a8f2d298df24e75989a43a3c73f2d4d8168b" +checksum = "6b92b125634d9b795e7beca796cc790df15a7fb38323bf3196fda83292d06b1f" dependencies = [ "aws-lc-rs", "log", @@ -3741,7 +3821,7 @@ version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" dependencies = [ - "bitflags", + "bitflags 2.13.0", "core-foundation 0.9.4", "core-foundation-sys", "libc", @@ -3754,7 +3834,7 @@ version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" dependencies = [ - "bitflags", + "bitflags 2.13.0", "core-foundation 0.10.1", "core-foundation-sys", "libc", @@ -4197,9 +4277,9 @@ dependencies = [ [[package]] name = "time" -version = "0.3.49" +version = "0.3.51" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "711a53c2d47bbd818258c498c8dbfe186a2526c631495cfe7e078567f86b8469" +checksum = "85c17d80feb7334b40c484e45ed1a5273dfd8bfda537c3be2e74a06a6686f327" dependencies = [ "deranged", "num-conv", @@ -4217,9 +4297,9 @@ checksum = "9e1c906769ad99c88eaa54e728060edef082f8e358ff32030cb7c7d315e81109" [[package]] name = "time-macros" -version = "0.2.29" +version = "0.2.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71c652a3727a9cbb9a02f707f530b618ce00d0ccd762009c8c23bd191df3c17d" +checksum = "dcef1a61bdb119096e153208ec5cbec23944ce8bca13be5c7f60c634f7403935" dependencies = [ "num-conv", "time-core", @@ -4907,9 +4987,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.23.3" +version = "1.23.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "144d6b123cef80b301b8f72a9e2ca4370ddec21950d0a103dd22c437006d2db7" +checksum = "bf80a72845275afea99e7f2b434723d3bc7e38470fcd1c7ed39a599c73319a53" dependencies = [ "js-sys", "wasm-bindgen", @@ -5002,9 +5082,9 @@ dependencies = [ [[package]] name = "wasm-bindgen" -version = "0.2.125" +version = "0.2.126" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ddb3f79143bced6de84270411622a2699cee572fc0875aeaf1e7867cf9fca1a" +checksum = "4b067c0c11094aef6b7a801c1e34a26affafdf3d051dba08456b868789aaf9a4" dependencies = [ "cfg-if", "once_cell", @@ -5015,9 +5095,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.125" +version = "0.2.126" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e21a184b13fb19e157296e2c46056aec9092264fab83e4ba59e68c61b323c3d" +checksum = "167ce5e579f6bcf889c4f7175a8a5a585de84e8ff93976ce393efa5f2837aab1" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -5025,9 +5105,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.125" +version = "0.2.126" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fecefd9c35bd935a20fc3fc344b5f29138961e4f47fb03297d88f2587afb5ebd" +checksum = "f3997c7839262f4ef12cf90b818d6340c18e80f263f1a94bf157d0ec4420380e" dependencies = [ "bumpalo", "proc-macro2", @@ -5038,18 +5118,18 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.125" +version = "0.2.126" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23939e44bb9a5d7576fa2b563dc2e136628f1224e88a8deed09e04858b77871f" +checksum = "dc1b4cb0cc549fcf58d7dfc081778139b3d283a081644e833e84682ad71cea24" dependencies = [ "unicode-ident", ] [[package]] name = "web-sys" -version = "0.3.102" +version = "0.3.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6430a72df5eb332242960fe84b3002a241163998241eb596d4f739b9757061d" +checksum = "8622dcb61c0bcc9fffa6938bed81210af2da9a7e4a1a834b2e37a59b6dfb6141" dependencies = [ "js-sys", "wasm-bindgen", @@ -5490,9 +5570,9 @@ dependencies = [ [[package]] name = "zlib-rs" -version = "0.6.3" +version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513" +checksum = "977347db8caa080403f6b6b7c1cda9479a8e869316f7e13a59b19076a40f94e3" [[package]] name = "zmij" diff --git a/Cargo.toml b/Cargo.toml index d8d0b74..3e77608 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,6 +25,7 @@ members = [ "crates/graphify-manifest", "crates/graphify-multigraph-compat", "crates/graphify-prs", + "crates/graphify-reflect", "crates/graphify-report", "crates/graphify-scip", "crates/graphify-security", @@ -43,7 +44,7 @@ license = "Apache-2.0" publish = false repository = "https://github.com/bunkerlab-net/graphify" rust-version = "1.95" -version = "0.8.44" +version = "0.8.49" [workspace.dependencies] anyhow = "1" @@ -55,13 +56,10 @@ chrono = { version = "0.4", default-features = false, features = [ clap = { version = "4", features = ["derive", "env"] } hex = "0.4" htmlescape = "0.3" -idna = "1" ignore = "0.4" indexmap = { version = "2", features = ["serde"] } mockito = "1" -once_cell = "1" percent-encoding = "2" -petgraph = { version = "0.8", features = ["serde-1"] } rayon = "1" regex = "1" serde = { version = "1", features = ["derive"] } @@ -100,6 +98,7 @@ graphify-llm = { path = "crates/graphify-llm" } graphify-manifest = { path = "crates/graphify-manifest" } graphify-multigraph-compat = { path = "crates/graphify-multigraph-compat" } graphify-prs = { path = "crates/graphify-prs" } +graphify-reflect = { path = "crates/graphify-reflect" } graphify-report = { path = "crates/graphify-report" } graphify-scip = { path = "crates/graphify-scip" } graphify-security = { path = "crates/graphify-security" } @@ -137,6 +136,7 @@ path = "src/main.rs" [dependencies] anyhow = { workspace = true } +chrono = { workspace = true } clap = { workspace = true } graphify-affected = { workspace = true } graphify-analyze = { workspace = true } @@ -155,6 +155,7 @@ graphify-html = { workspace = true } graphify-ingest = { workspace = true } graphify-llm = { workspace = true } graphify-prs = { workspace = true } +graphify-reflect = { workspace = true } graphify-report = { workspace = true } graphify-security = { workspace = true } graphify-serve = { workspace = true } diff --git a/README.md b/README.md index 570b545..46f075c 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,7 @@ a Rust equivalent, and outputs are byte-identical where the test suite asserts i (`.razor`, `.cshtml`) for package, project-reference, target-framework, and `@code` extraction, Verilog/SystemVerilog (`.v`, `.sv`, `.svh`), BYOND DreamMaker (`.dm`, `.dme` source plus `.dmi` icon sheets, `.dmm` maps, and `.dmf` interface forms), + CUDA (`.cu`, `.cuh`) routed through the C++ extractor, and MCP config files (`.mcp.json`, `claude_desktop_config.json`, `mcp.json`, `mcp_servers.json`) — servers, commands, packages, and env-var _names_ (values are never read). @@ -75,7 +76,8 @@ a Rust equivalent, and outputs are byte-identical where the test suite asserts i - **Structural introspection** — `graphify extract --cargo` adds `crate -> crate` dependency edges from `Cargo.toml` manifests; `--postgres ` adds a live PostgreSQL schema (requires the `postgres` build feature). - **LLM community naming** — `graphify label` (or `cluster-only`) auto-names graph communities with the configured - backend; degrades to `Community N` placeholders when no backend is available. + backend, fanning out batches in parallel (`--max-concurrency`, `--batch-size`); degrades to `Community N` + placeholders when no backend is available. - **AI-assistant integration** — drop-in installers for Claude Code, CodeBuddy, Codex, Amp, Cursor, Gemini CLI, GitHub Copilot, VS Code, OpenCode, Aider, Factory Droid, Trae, Hermes, Kiro, Kilo Code, Pi, Devin CLI, Google Antigravity, and more. @@ -83,6 +85,9 @@ a Rust equivalent, and outputs are byte-identical where the test suite asserts i (`--transport http`, requires the `http` build feature) so one shared process can host the graph for a team. - **Git hooks + merge driver** so two branches editing the same `graph.json` produce a union-merged result. - **Cross-repo global graph** — aggregate every project you care about into one `~/.graphify/global-graph.json`. +- **Work memory** — `graphify save-result` records Q&A outcomes under `graphify-out/memory/`, and `graphify reflect` + aggregates them into a deterministic `reflections/LESSONS.md` lessons doc (refreshed automatically by the + post-commit/post-checkout hooks). - **Deterministic outputs** — same inputs on the same machine produce byte-identical JSON. ## Install @@ -146,7 +151,7 @@ For development conventions (lint policy, porting rules, test layout, definition ```text graphify/ ├── src/ # graphify CLI binary -├── crates/ # 29 focused workspace crates +├── crates/ # 30 focused workspace crates │ ├── graphify-detect/ # filesystem walking + file-type detection │ ├── graphify-extract/ # tree-sitter / document / media extractors │ ├── graphify-build/ # graph construction @@ -167,6 +172,7 @@ graphify/ │ ├── graphify-multigraph-compat/ # runtime keyed-edge capability probe │ ├── graphify-scip/ # SCIP-style JSON ingest │ ├── graphify-semantic/ # LLM extraction fragment validator +│ ├── graphify-reflect/ # work-memory reflection (LESSONS.md aggregator) │ └── ... # benchmark, cache, dedup, ingest, manifest, transcribe, validate, watch, google └── graphify-py/ # read-only git submodule — Python reference ``` diff --git a/USAGE.md b/USAGE.md index 300028b..107e4de 100644 --- a/USAGE.md +++ b/USAGE.md @@ -26,7 +26,8 @@ graphify-out/ Optional output lands under `graphify-out/` only when you opt in: `wiki/` (`graphify export wiki`), `GRAPH_TREE.html` (`graphify tree`), `cypher.txt` (`graphify export neo4j`), `/` backups (created automatically when -`graph.json` is overwritten), and `memory/` (Q&A saved by `graphify save-result`). +`graph.json` is overwritten), `memory/` (Q&A saved by `graphify save-result`), +and `reflections/LESSONS.md` (aggregated work-memory lessons from `graphify reflect`). This is the Rust reimplementation of the Python `graphify` reference; the CLI surface is 1:1 with `python -m graphify`. @@ -117,7 +118,9 @@ and **Markdown links** (inline, reference-style, and `[[wikilinks]]`) as `references` edges, so a hub doc (`index.md`, a table of contents) connects to the documents it links instead of being an orphan (#1376). Swift `import` targets become shared `type=module` anchor nodes and cross-file member calls -(`recv.method()`) resolve through the file's local type table (#1327, #1356). +(`recv.method()`) resolve through the file's local type table (#1327, #1356); +Python `ClassName.method()` calls resolve to the class-qualified method node +across files (#1446). CUDA sources (`.cu`, `.cuh`) are extracted through the C++ pass. Optional LLM-driven semantic extraction is wired through `--backend`/`--model`/`--mode`/`--token-budget`/ `--max-concurrency`/`--api-timeout`/`--max-workers` (see `graphify extract --help` and the @@ -164,7 +167,8 @@ Rerun clustering on an existing `graph.json` and regenerate the report and HTML parameters or when you only want to refresh `GRAPH_REPORT.md`. When no `.graphify_labels.json` exists yet, `cluster-only` auto-names communities with the configured LLM backend -in a single batched call, falling back to `Community N` placeholders if no backend is configured or the call fails. +in batched calls (fanned out in parallel — tune with `--max-concurrency` / `--batch-size`), falling back to +`Community N` placeholders if no backend is configured or the call fails. An existing labels file is preserved (re-run `graphify label` to force a refresh). ```bash @@ -173,6 +177,8 @@ graphify cluster-only . --no-viz # skip graph.html (saves time graphify cluster-only . --graph other/graph.json # use a non-default graph location graphify cluster-only . --no-label # keep "Community N" placeholders (skip LLM naming) graphify cluster-only . --backend openai # backend to use for naming (default: auto-detect) +graphify cluster-only . --max-concurrency 8 # parallel LLM naming batches (default 4) +graphify cluster-only . --batch-size 50 # communities per LLM call (default 100) ``` ### `label ` @@ -184,6 +190,8 @@ graphify cluster-only . --backend openai # backend to use for naming ( graphify label . # re-name with the auto-detected backend graphify label . --backend gemini # force a specific backend graphify label . --no-viz # skip graph.html regeneration +graphify label . --max-concurrency 8 # parallel LLM naming batches (default 4) +graphify label . --batch-size 50 # communities per LLM call (default 100) ``` If no backend is configured (no API key), `label` degrades to `Community N` placeholders and prints a hint. @@ -304,15 +312,38 @@ graphify explain "AuthMiddleware" Save a Q&A result back into `graphify-out/memory/` so it gets re-extracted into the graph on the next `update` (the feedback loop). Files under `graphify-out/memory/` are always detected: they bypass `.gitignore` / `.graphifyignore` filtering, so a broad ignore pattern (e.g. `*.md`) can't silently erase generated memory notes. +Pass `--outcome useful|dead_end|corrected` (and `--correction ""` for the `corrected` case) to record +a work-memory signal that `graphify reflect` later aggregates into `LESSONS.md`. An out-of-set `--outcome` is rejected. ```bash graphify save-result \ --question "how is auth scoped" \ --answer "AuthMiddleware checks tenant_id from JWT and binds it to the request context" \ --type query \ - --nodes AuthMiddleware request_context + --nodes AuthMiddleware request_context \ + --outcome useful ``` +### `reflect` + +Aggregate the work-memory outcomes saved under `graphify-out/memory/` into a single deterministic +`graphify-out/reflections/LESSONS.md`. Each `save-result --outcome` signal is time-decayed (a signal's weight +halves every 30 days by default), and nodes are bucketed into **preferred** (corroborated by ≥2 useful sessions), +**tentative** (seen once), and **contested** (mixed signals — the most recent verdict wins). Dead ends and +corrections are listed so the next session avoids re-deriving them. When a `graph.json` is present, lessons are +grouped by community. + +```bash +graphify reflect # writes graphify-out/reflections/LESSONS.md +graphify reflect --if-stale # skip when LESSONS.md is already newer than every input +graphify reflect --half-life-days 14 # signals decay twice as fast +graphify reflect --min-corroboration 3 # require 3 useful sessions to promote a node to "preferred" +graphify reflect --out custom/LESSONS.md # write the lessons doc elsewhere +``` + +The post-commit / post-checkout hooks refresh `LESSONS.md` automatically after each rebuild when saved outcomes +exist, so the lessons doc stays current without a manual run. + ### `affected ""` Reverse-traversal impact analysis: given a node label / ID / source-file substring, enumerate every node that @@ -564,12 +595,17 @@ available and is untouched by the project flag. graphify install --platform claude # same as `graphify claude install` graphify install claude # positional shorthand also accepted graphify install --platform kimi # Kimi CLI → ~/.kimi/skills/graphify/SKILL.md (no dedicated subcommand) +graphify install --platform agents # cross-framework Agent-Skills → ~/.agents/skills/graphify/SKILL.md +graphify agents install # same target; `graphify skills install` is an accepted alias +graphify agents install --project # ./.agents/skills/graphify/SKILL.md + AGENTS.md section graphify uninstall # removes graphify from every detected platform graphify uninstall --purge # also deletes graphify-out/ ``` The aggregate `install` is a convenience dispatcher to the per-platform installer; the aggregate `uninstall` scans -every supported platform and removes the integration wherever it finds one. +every supported platform and removes the integration wherever it finds one. The `agents` platform (aliased as +`skills`) targets the cross-framework Agent-Skills location — `~/.agents/skills/graphify/SKILL.md` globally, or +`./.agents/skills/graphify/SKILL.md` plus an `AGENTS.md` section under `--project`. ### `hook-check` @@ -629,7 +665,7 @@ completes the feature.) | Variable | Effect | | -------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------- | -| `GRAPHIFY_OUT` | Override the output directory name (default `graphify-out`). | +| `GRAPHIFY_OUT` | Override the output directory (default `graphify-out`); a relative name or an absolute path, honoured everywhere. | | `GRAPHIFY_FORCE` | Same effect as `--force` on `update`. | | `GRAPHIFY_VIZ_NODE_LIMIT` | Cap nodes before HTML export is skipped (default 5000). | | `GRAPHIFY_GOOGLE_WORKSPACE` | Truthy value enables `.gdoc/.gsheet/.gslides` export by default. | diff --git a/crates/graphify-build/src/build_fn.rs b/crates/graphify-build/src/build_fn.rs index ced61e5..fd96490 100644 --- a/crates/graphify-build/src/build_fn.rs +++ b/crates/graphify-build/src/build_fn.rs @@ -170,16 +170,31 @@ pub fn build_from_json( ); } - if let Some(hyperedges) = extraction - .as_object() - .and_then(|o| o.get("hyperedges")) - .cloned() - && let Some(arr) = hyperedges.as_array() + if let Some(arr) = extraction + .as_object_mut() + .and_then(|o| o.get_mut("hyperedges")) + .and_then(Value::as_array_mut) && !arr.is_empty() { + // Relativize hyperedge source_file the same way nodes and edges are, so + // `to_json` — which writes `graph.hyperedges` verbatim and has no root — + // never leaks an absolute path from a semantic subagent (#1418). + for he in arr.iter_mut() { + let Some(map) = he.as_object_mut() else { + continue; + }; + let Some(sf) = map.get("source_file").and_then(Value::as_str) else { + continue; + }; + if sf.is_empty() { + continue; + } + let normalized = norm_source_file(sf, root_str.as_deref()); + map.insert("source_file".to_string(), Value::String(normalized)); + } graph .graph_attrs - .insert("hyperedges".to_string(), hyperedges); + .insert("hyperedges".to_string(), Value::Array(arr.clone())); } Ok(graph) diff --git a/crates/graphify-build/tests/parity.rs b/crates/graphify-build/tests/parity.rs index ac64bd5..ecd1caa 100644 --- a/crates/graphify-build/tests/parity.rs +++ b/crates/graphify-build/tests/parity.rs @@ -359,6 +359,84 @@ fn build_from_json_relative_source_file_unchanged() { ); } +#[test] +fn build_from_json_relativizes_hyperedge_source_file() { + // #1418: hyperedge source_file must be relativized like nodes and edges, so + // `to_json` — which writes `graph.hyperedges` verbatim and has no root — + // never leaks an absolute path from a semantic subagent. + let tmp = tempfile::tempdir().expect("tempdir"); + let base = tmp.path().canonicalize().expect("canonicalize"); + let abs_doc = base.join("docs").join("CLAUDE.md"); + let abs_str = abs_doc.to_string_lossy().into_owned(); + let ext = json!({ + "nodes": [ + {"id": "a", "label": "A", "file_type": "document", "source_file": abs_str.clone()}, + ], + "edges": [], + "hyperedges": [ + {"id": "arch", "label": "Architecture", "nodes": ["a"], + "relation": "participate_in", "confidence": "INFERRED", + "confidence_score": 0.75, "source_file": abs_str}, + ], + }); + let g = build_from_json(ext, false, Some(&base)).expect("build"); + let he = g + .graph_attrs + .get("hyperedges") + .and_then(Value::as_array) + .and_then(|a| a.first()) + .expect("hyperedge present"); + assert_eq!( + he.get("source_file").and_then(Value::as_str), + Some("docs/CLAUDE.md") + ); + // Anchor: the node path is relativized the same way (the contract this mirrors). + assert_eq!( + g.node_data("a") + .and_then(|a| a.get("source_file")) + .and_then(Value::as_str), + Some("docs/CLAUDE.md") + ); +} + +#[test] +fn build_from_json_skips_non_hashable_node_id() { + // A malformed LLM extraction can emit a list-valued id; build_from_json must + // skip it and still build the graph from the well-formed nodes. + let ext = json!({ + "nodes": [ + {"id": "a", "label": "A", "file_type": "code", "source_file": "a.py"}, + {"id": ["x", "y"], "label": "B", "file_type": "code", "source_file": "b.py"}, + {"label": "C", "file_type": "code", "source_file": "c.py"}, + ], + "edges": [], + }); + let g = build_from_json(ext, false, None).expect("build"); + let ids: std::collections::BTreeSet = g.nodes().map(|(id, _)| id.clone()).collect(); + assert_eq!(ids, ["a".to_string()].into_iter().collect()); +} + +#[test] +fn build_from_json_skips_edge_with_non_hashable_endpoint() { + // A list-valued edge endpoint must be skipped; the well-formed edge survives. + let ext = json!({ + "nodes": [ + {"id": "a", "label": "A", "file_type": "code", "source_file": "a.py"}, + {"id": "b", "label": "B", "file_type": "code", "source_file": "b.py"}, + ], + "edges": [ + {"source": "a", "target": ["b", "c"], "relation": "calls", + "confidence": "INFERRED", "source_file": "a.py"}, + {"source": "a", "target": "b", "relation": "imports", + "confidence": "EXTRACTED", "source_file": "a.py"}, + ], + }); + let g = build_from_json(ext, false, None).expect("build"); + assert_eq!(g.node_count(), 2); + assert_eq!(g.edge_count(), 1); + assert!(g.edge_data("a", "b").is_some()); +} + #[test] fn build_merge_preserves_call_edge_direction() { // #760: build_merge must read source/target verbatim, not re-derive edge diff --git a/crates/graphify-cache/Cargo.toml b/crates/graphify-cache/Cargo.toml index 6038ace..01da476 100644 --- a/crates/graphify-cache/Cargo.toml +++ b/crates/graphify-cache/Cargo.toml @@ -9,6 +9,7 @@ rust-version.workspace = true version.workspace = true [dependencies] +graphify-security = { workspace = true } hex = { workspace = true } indexmap = { workspace = true } serde = { workspace = true } diff --git a/crates/graphify-cache/src/paths.rs b/crates/graphify-cache/src/paths.rs index bedad3f..e0e001c 100644 --- a/crates/graphify-cache/src/paths.rs +++ b/crates/graphify-cache/src/paths.rs @@ -18,12 +18,6 @@ pub const EXTRACTOR_VERSION: &str = env!("CARGO_PKG_VERSION"); static CLEANED_AST_DIRS: LazyLock>> = LazyLock::new(|| Mutex::new(HashSet::new())); -/// Output directory name; defaults to `"graphify-out"` and respects the -/// `GRAPHIFY_OUT` environment variable override. -pub(crate) fn graphify_out() -> String { - std::env::var("GRAPHIFY_OUT").unwrap_or_else(|_| "graphify-out".to_string()) -} - /// Resolve the absolute path to the graphify output directory relative to /// `root`. /// @@ -34,7 +28,7 @@ pub(crate) fn graphify_out() -> String { /// downstream `fs::create_dir_all` call in [`cache_dir`] will surface /// the underlying I/O error if the path is unusable. pub(crate) fn out_base(root: &Path) -> PathBuf { - let out = PathBuf::from(graphify_out()); + let out = graphify_security::graphify_out(); if out.is_absolute() { out } else { diff --git a/crates/graphify-detect/Cargo.toml b/crates/graphify-detect/Cargo.toml index 8aea484..812ea3b 100644 --- a/crates/graphify-detect/Cargo.toml +++ b/crates/graphify-detect/Cargo.toml @@ -11,10 +11,11 @@ version.workspace = true [dependencies] calamine = "0.35" graphify-google = { workspace = true } +graphify-security = { workspace = true } hex = { workspace = true } ignore_walk = { version = "0.4", package = "ignore" } indexmap = { workspace = true } -lopdf = "0.41" +lopdf = "0.42" md5 = "0.8" quick-xml = "0.40" rayon = { workspace = true } @@ -29,7 +30,8 @@ unicode-normalization = { workspace = true } zip = { version = "8", default-features = false, features = ["deflate"] } [dev-dependencies] -lopdf = "0.41" +lopdf = "0.42" +serial_test = { workspace = true } tempfile = { workspace = true } [lints] diff --git a/crates/graphify-detect/src/extensions.rs b/crates/graphify-detect/src/extensions.rs index 0db8bc9..cef38cb 100644 --- a/crates/graphify-detect/src/extensions.rs +++ b/crates/graphify-detect/src/extensions.rs @@ -46,10 +46,10 @@ impl FileType { /// middle of an unrelated group. pub const CODE_EXTENSIONS: &[&str] = &[ "py", "ts", "tsx", "js", "jsx", "mjs", "ejs", "ets", "go", "rs", "java", "groovy", "gradle", - "cpp", "cc", "cxx", "c", "h", "hpp", "rb", "swift", "kt", "kts", "cs", "scala", "php", "lua", - "luau", "toc", "zig", "ps1", "psm1", "psd1", "ex", "exs", "m", "mm", "jl", "vue", "svelte", - "astro", "dart", "v", "sv", "svh", "sql", "r", "f", "F", "f90", "F90", "f95", "F95", "f03", - "F03", "f08", "F08", "pas", "pp", "dpr", "dpk", "lpr", "inc", "dfm", "lfm", "lpk", "sh", + "cpp", "cc", "cxx", "c", "h", "hpp", "cu", "cuh", "rb", "swift", "kt", "kts", "cs", "scala", + "php", "lua", "luau", "toc", "zig", "ps1", "psm1", "psd1", "ex", "exs", "m", "mm", "jl", "vue", + "svelte", "astro", "dart", "v", "sv", "svh", "sql", "r", "f", "F", "f90", "F90", "f95", "F95", + "f03", "F03", "f08", "F08", "pas", "pp", "dpr", "dpk", "lpr", "inc", "dfm", "lfm", "lpk", "sh", "bash", "json", "tf", "tfvars", "hcl", "dm", "dme", "dmi", "dmm", "dmf", "sln", "slnx", "csproj", "fsproj", "vbproj", "razor", "cshtml", "cls", "trigger", ]; diff --git a/crates/graphify-detect/src/sensitive.rs b/crates/graphify-detect/src/sensitive.rs index 36d745c..39efe25 100644 --- a/crates/graphify-detect/src/sensitive.rs +++ b/crates/graphify-detect/src/sensitive.rs @@ -260,7 +260,9 @@ pub static SKIP_FILES: std::sync::LazyLock) -> bool { - if SKIP_DIRS.contains(name) { + // `SKIP_DIRS` already includes the literal "graphify-out"; also skip a + // custom output dir so `GRAPHIFY_OUT` is never re-ingested as source (#1423). + if SKIP_DIRS.contains(name) || name == graphify_security::graphify_out_name() { return true; } if name.ends_with("_venv") || name.ends_with("_env") { diff --git a/crates/graphify-detect/src/walk.rs b/crates/graphify-detect/src/walk.rs index 04d9760..40cbc14 100644 --- a/crates/graphify-detect/src/walk.rs +++ b/crates/graphify-detect/src/walk.rs @@ -429,8 +429,9 @@ pub fn detect( let include_patterns = load_graphifyinclude(&root); let graphifyignore_patterns = ignore_patterns.len(); - let memory_dir = root.join("graphify-out").join("memory"); - let converted_dir = root.join("graphify-out").join("converted"); + let out_dir = root.join(graphify_security::graphify_out()); + let memory_dir = out_dir.join("memory"); + let converted_dir = out_dir.join("converted"); let google_workspace = graphify_google::google_workspace_enabled(None); let ctx = WalkCtx { diff --git a/crates/graphify-detect/tests/parity_classify.rs b/crates/graphify-detect/tests/parity_classify.rs index af41270..a6958ef 100644 --- a/crates/graphify-detect/tests/parity_classify.rs +++ b/crates/graphify-detect/tests/parity_classify.rs @@ -33,6 +33,18 @@ fn classify_powershell_manifest() { ); } +/// CUDA sources classify as code so `.cu`/`.cuh` route through the C++ +/// extractor (graphify-py: `.cu`/`.cuh` added to `CODE_EXTENSIONS`). +#[test] +fn classify_cuda_cu() { + assert_eq!(classify_file(Path::new("kernel.cu")), Some(FileType::Code)); +} + +#[test] +fn classify_cuda_cuh() { + assert_eq!(classify_file(Path::new("kernel.cuh")), Some(FileType::Code)); +} + /// #1377: package manifests route to the deterministic AST/code path, not the /// LLM document path — even when their extension (`.yml`/`.toml`/`.xml`) would /// otherwise classify as a document. A generic yaml stays a document. Mirrors diff --git a/crates/graphify-detect/tests/parity_sensitive.rs b/crates/graphify-detect/tests/parity_sensitive.rs index 947065d..c0afbc2 100644 --- a/crates/graphify-detect/tests/parity_sensitive.rs +++ b/crates/graphify-detect/tests/parity_sensitive.rs @@ -2,6 +2,8 @@ //! //! Mirrors `graphify-py/tests/test_detect.py` — `_is_sensitive` tests. #![allow(clippy::expect_used)] +// `std::env::set_var` is unsafe in edition 2024 — test-only, serialised below. +#![allow(unsafe_code)] use graphify_detect::is_sensitive; use std::path::Path; @@ -109,3 +111,44 @@ fn sensitive_flags_dotfile_token() { fn sensitive_flags_plural_tokens_txt() { assert!(is_sensitive(Path::new("tokens.txt"))); } + +/// RAII guard that sets an env var and restores it on drop. +struct EnvGuard { + key: &'static str, + prev: Option, +} + +impl EnvGuard { + fn set(key: &'static str, value: &str) -> Self { + let prev = std::env::var(key).ok(); + // SAFETY: test-only, serialised via `#[serial_test::serial]`. + unsafe { std::env::set_var(key, value) }; + Self { key, prev } + } +} + +impl Drop for EnvGuard { + fn drop(&mut self) { + match &self.prev { + // SAFETY: test-only cleanup. + Some(v) => unsafe { std::env::set_var(self.key, v) }, + None => unsafe { std::env::remove_var(self.key) }, + } + } +} + +#[test] +#[serial_test::serial(graphify_out_env)] +fn noise_dir_flags_default_graphify_out() { + assert!(graphify_detect::is_noise_dir("graphify-out", None)); +} + +#[test] +#[serial_test::serial(graphify_out_env)] +fn noise_dir_honours_graphify_out_override() { + // A custom GRAPHIFY_OUT dir must be skipped so it is never re-ingested as + // source (#1423); a normal dir name is still walked. + let _guard = EnvGuard::set("GRAPHIFY_OUT", "custom-out"); + assert!(graphify_detect::is_noise_dir("custom-out", None)); + assert!(!graphify_detect::is_noise_dir("src", None)); +} diff --git a/crates/graphify-export/src/canvas.rs b/crates/graphify-export/src/canvas.rs index 8d1268c..d389c82 100644 --- a/crates/graphify-export/src/canvas.rs +++ b/crates/graphify-export/src/canvas.rs @@ -36,10 +36,14 @@ fn safe_name(label: &str) -> String { let cleaned = UNSAFE_CHARS_RE.replace_all(&cleaned, ""); let cleaned = cleaned.trim().to_string(); let cleaned = MD_EXT_RE.replace(&cleaned, "").into_owned(); - if cleaned.is_empty() { - "unnamed".to_string() - } else { + // A stem of only punctuation (e.g. "@", "*", "#") survives the unsafe-char + // strip but is empty once a downstream tool re-slugs on word chars (qmd's + // handelize() reduces "@" -> "" and aborts `qmd update`). Require at least + // one word char; else fall back so we never emit a "@.md" name (#1409). + if cleaned.chars().any(|c| c.is_alphanumeric() || c == '_') { crate::util::cap_filename(&cleaned) + } else { + "unnamed".to_string() } } diff --git a/crates/graphify-export/src/obsidian.rs b/crates/graphify-export/src/obsidian.rs index 61557a2..2be60d8 100644 --- a/crates/graphify-export/src/obsidian.rs +++ b/crates/graphify-export/src/obsidian.rs @@ -36,10 +36,14 @@ fn safe_name(label: &str) -> String { let cleaned = UNSAFE_CHARS_RE.replace_all(&cleaned, ""); let cleaned = cleaned.trim().to_string(); let cleaned = MD_EXT_RE.replace(&cleaned, "").into_owned(); - if cleaned.is_empty() { - "unnamed".to_string() - } else { + // A stem of only punctuation (e.g. "@", "*", "#") survives the unsafe-char + // strip but is empty once a downstream tool re-slugs on word chars (qmd's + // handelize() reduces "@" -> "" and aborts `qmd update`). Require at least + // one word char; else fall back so we never emit a "@.md" name (#1409). + if cleaned.chars().any(|c| c.is_alphanumeric() || c == '_') { crate::util::cap_filename(&cleaned) + } else { + "unnamed".to_string() } } diff --git a/crates/graphify-export/tests/parity.rs b/crates/graphify-export/tests/parity.rs index 007fb0b..a9a9ae5 100644 --- a/crates/graphify-export/tests/parity.rs +++ b/crates/graphify-export/tests/parity.rs @@ -2,17 +2,21 @@ //! //! 1:1 ports of `graphify-py/tests/test_export.py`. +// `.expect("...")` is the sanctioned style for `tests/parity.rs` (AGENTS.md +// permits the file-top `expect_used` allow); a setup/build failure surfaces as +// a clear test panic. Kept consistent with every other crate's `parity.rs`. #![allow(clippy::expect_used, unsafe_code)] use graphify_build::build_from_json; use graphify_cluster::cluster; use graphify_export::{ attach_hyperedges, backup_if_protected, prune_dangling_edges, to_canvas, to_cypher, to_graphml, - to_html, to_json, to_svg, + to_html, to_json, to_obsidian, to_svg, }; use indexmap::IndexMap; use serde_json::{Value, json}; use serial_test::serial; +use std::path::Path; use tempfile::tempdir; // ── Fixture helpers ─────────────────────────────────────────────────────────── @@ -594,3 +598,97 @@ fn test_to_html_aggregated_remaps_hyperedges_to_communities() { "single-community hyperedge should be dropped from the aggregated view" ); } + +// ── #1409: punctuation-only Obsidian/Canvas filenames ───────────────────────── + +/// A 2-node graph where one node's label is all-punctuation (e.g. a `@/*` +/// tsconfig paths key) and the other is a normal symbol. +fn punct_graph(label: &str) -> graphify_build::Graph { + let val = json!({ + "nodes": [ + {"id": "n1", "label": label, "file_type": "code", "source_file": "tsconfig.json"}, + {"id": "n2", "label": "AuthHandler", "file_type": "code", "source_file": "auth.ts"}, + ], + "edges": [], + }); + build_from_json(val, false, None).expect("build_from_json") +} + +/// Recursively collect the file stems of every `*.md` under `dir`. +fn collect_md_stems(dir: &Path) -> Vec { + let mut out = Vec::new(); + let Ok(entries) = std::fs::read_dir(dir) else { + return out; + }; + for entry in entries.flatten() { + let path = entry.path(); + if path.is_dir() { + out.extend(collect_md_stems(&path)); + } else if path.extension().and_then(|e| e.to_str()) == Some("md") + && let Some(stem) = path.file_stem().and_then(|s| s.to_str()) + { + out.push(stem.to_string()); + } + } + out +} + +fn has_word_char(s: &str) -> bool { + s.chars().any(|c| c.is_alphanumeric() || c == '_') +} + +#[test] +fn to_obsidian_never_emits_punctuation_only_filenames() { + // An all-punctuation label (e.g. `@/*`) must not produce a `@.md`-style + // filename; it falls back to `unnamed` (#1409). + let g = punct_graph("@/*"); + let communities = cluster(&g, 1.0, None); + let tmp = tempdir().expect("tempdir"); + let written = to_obsidian(&g, &communities, tmp.path(), None, None).expect("to_obsidian"); + assert!(written > 0, "to_obsidian wrote no notes"); + let stems = collect_md_stems(tmp.path()); + assert!(!stems.is_empty(), "to_obsidian wrote no notes"); + let bad: Vec<&String> = stems.iter().filter(|s| !has_word_char(s)).collect(); + assert!( + bad.is_empty(), + "punctuation-only filenames emitted: {bad:?}" + ); + assert!( + stems + .iter() + .any(|s| s == "unnamed" || s.starts_with("unnamed")), + "{stems:?}" + ); +} + +#[test] +fn to_canvas_never_emits_punctuation_only_filenames() { + // Same guard on the canvas exporter's file-node names (#1409). + let g = punct_graph("@"); + let communities = cluster(&g, 1.0, None); + let tmp = tempdir().expect("tempdir"); + let out = tmp.path().join("graph.canvas"); + to_canvas(&g, &communities, &out, None, None).expect("to_canvas"); + let data: Value = + serde_json::from_str(&std::fs::read_to_string(&out).expect("read canvas")).expect("json"); + let nodes = data + .get("nodes") + .and_then(Value::as_array) + .expect("nodes array"); + let file_nodes: Vec<&Value> = nodes + .iter() + .filter(|n| n.get("type").and_then(Value::as_str) == Some("file")) + .collect(); + assert!(!file_nodes.is_empty(), "canvas has no file nodes"); + for n in &file_nodes { + let file = n.get("file").and_then(Value::as_str).expect("file field"); + let stem = Path::new(file) + .file_stem() + .and_then(|s| s.to_str()) + .unwrap_or_default(); + assert!( + has_word_char(stem), + "punctuation-only canvas filename: {file}" + ); + } +} diff --git a/crates/graphify-extract/src/extractors/multi/mod.rs b/crates/graphify-extract/src/extractors/multi/mod.rs index ec9493c..f051599 100644 --- a/crates/graphify-extract/src/extractors/multi/mod.rs +++ b/crates/graphify-extract/src/extractors/multi/mod.rs @@ -34,7 +34,9 @@ use crate::types::{Edge, ExtractOutput, FileResult, Node, RawCall}; use cache::extract_single_file; use java::{resolve_cross_file_java_imports, resolve_java_type_references}; use js::{resolve_js_default_imports, resolve_js_reexport_imports}; -use python::{resolve_cross_file_python_imports, resolve_python_reexport_imports}; +use python::{ + resolve_cross_file_python_imports, resolve_python_member_calls, resolve_python_reexport_imports, +}; use rayon::prelude::*; use serde_json::Value; use std::collections::HashMap; @@ -79,7 +81,7 @@ fn get_extractor(path: &Path) -> Option { "java" => Some(extract_java), "groovy" | "gradle" => Some(extract_groovy), "c" | "h" => Some(extract_c), - "cpp" | "cc" | "cxx" | "hpp" => Some(extract_cpp), + "cpp" | "cc" | "cxx" | "hpp" | "cu" | "cuh" => Some(extract_cpp), "rb" => Some(extract_ruby), "cs" => Some(extract_csharp), "kt" | "kts" => Some(extract_kotlin), @@ -584,6 +586,15 @@ pub fn extract(paths: &[PathBuf], cache_root: Option<&Path>) -> ExtractOutput { resolve_swift_member_calls(&swift_paths, &all_nodes, &mut all_edges, &all_raw_calls); } + // Cross-file Python qualified class-method calls (#1446): same timing as the + // Swift pass — after the shared call pass (ids final), before relativisation. + let has_python = paths + .iter() + .any(|p| p.extension().is_some_and(|e| e == "py" || e == "pyi")); + if has_python { + resolve_python_member_calls(&all_nodes, &mut all_edges, &all_raw_calls); + } + // Relativise source_file fields for n in &mut all_nodes { let sf_path = PathBuf::from(&n.source_file); diff --git a/crates/graphify-extract/src/extractors/multi/python.rs b/crates/graphify-extract/src/extractors/multi/python.rs index 11c5c2b..adacd96 100644 --- a/crates/graphify-extract/src/extractors/multi/python.rs +++ b/crates/graphify-extract/src/extractors/multi/python.rs @@ -5,7 +5,7 @@ use super::js::js_node_text; use super::{JsDefaultResolution, PARALLEL_THRESHOLD, relativise_under_root}; use crate::ids::make_id1; use crate::import_handlers::make_edge; -use crate::types::{Edge, FileResult, Node}; +use crate::types::{Edge, FileResult, Node, RawCall}; use rayon::prelude::*; use std::collections::{HashMap, HashSet}; use std::path::{Path, PathBuf}; @@ -526,3 +526,112 @@ pub(super) fn resolve_python_reexport_imports( edges.extend(import_edges); JsDefaultResolution { edges, aliases } } + +/// Resolve cross-file Python qualified class-method calls (`ClassName.method()`) +/// to the class-qualified method node (#1446). +/// +/// The shared cross-file call pass drops every `is_member_call` because a bare +/// method name collides across the corpus and inflates god-nodes. That guard is +/// right for *instance* calls (`obj.method()`) but misses *class-qualified* +/// calls (`ClassName.method()`), where the receiver is an explicitly-named class +/// — an exact, unambiguous reference. Using the receiver captured by the +/// extractor, when it is a capitalized name resolving to exactly one class node +/// that owns the called method, this emits an EXTRACTED `calls` edge. Purely +/// additive, with a single-definition god-node guard. Mirrors Python +/// `_resolve_python_member_calls`; runs after id-disambiguation. +pub(super) fn resolve_python_member_calls( + all_nodes: &[Node], + all_edges: &mut Vec, + all_raw_calls: &[RawCall], +) { + let key = |s: &str| -> String { + s.chars() + .filter(char::is_ascii_alphanumeric) + .collect::() + .to_lowercase() + }; + + let node_by_id: HashMap<&str, &Node> = all_nodes.iter().map(|n| (n.id.as_str(), n)).collect(); + + // A class owns methods: it is the source of one or more `method` edges. Index + // class label -> owning class node ids (len != 1 is the god-node guard), and + // (class_node_id, method_key) -> method_node_id. + let mut class_def_nids: HashMap> = HashMap::new(); + let mut method_index: HashMap<(String, String), String> = HashMap::new(); + for e in all_edges.iter() { + if e.relation != "method" { + continue; + } + if let Some(cnode) = node_by_id.get(e.source.as_str()) { + class_def_nids + .entry(key(cnode.label.as_str())) + .or_default() + .push(e.source.clone()); + } + if let Some(tnode) = node_by_id.get(e.target.as_str()) { + method_index.insert( + (e.source.clone(), key(tnode.label.as_str())), + e.target.clone(), + ); + } + } + if class_def_nids.is_empty() { + return; + } + // A class with N methods produced N entries; collapse to a unique set. + for nids in class_def_nids.values_mut() { + nids.sort(); + nids.dedup(); + } + + let mut existing_pairs: HashSet<(String, String)> = all_edges + .iter() + .map(|e| (e.source.clone(), e.target.clone())) + .collect(); + + let mut new_edges: Vec = Vec::new(); + for rc in all_raw_calls { + if !rc.is_member_call || rc.callee.is_empty() || rc.caller_nid.is_empty() { + continue; + } + // Only a capitalized receiver is treated as a class reference, so an + // instance/module (`self`, `obj`, `config`) never collides with a + // same-spelled class via the case-folding key. + let Some(receiver) = rc + .receiver + .as_deref() + .filter(|r| r.chars().next().is_some_and(char::is_uppercase)) + else { + continue; + }; + let class_nids = match class_def_nids.get(&key(receiver)) { + Some(nids) if nids.len() == 1 => nids, + _ => continue, // absent or ambiguous -> god-node guard + }; + let Some(method_nid) = method_index.get(&(class_nids[0].clone(), key(&rc.callee))) else { + continue; + }; + if *method_nid == rc.caller_nid + || existing_pairs.contains(&(rc.caller_nid.clone(), method_nid.clone())) + { + continue; + } + existing_pairs.insert((rc.caller_nid.clone(), method_nid.clone())); + // EXTRACTED: a qualified `ClassName.method()` is an explicit, unambiguous + // static reference, and the class resolved to exactly one definition that + // owns the method. + new_edges.push(Edge { + external: false, + source: rc.caller_nid.clone(), + target: method_nid.clone(), + relation: "calls".to_string(), + confidence: "EXTRACTED".to_string(), + source_file: rc.source_file.clone(), + source_location: Some(rc.source_location.clone()), + weight: 1.0, + context: Some("call".to_string()), + confidence_score: Some(1.0), + }); + } + all_edges.extend(new_edges); +} diff --git a/crates/graphify-extract/src/generic/calls.rs b/crates/graphify-extract/src/generic/calls.rs index baaeecf..cb2b1d1 100644 --- a/crates/graphify-extract/src/generic/calls.rs +++ b/crates/graphify-extract/src/generic/calls.rs @@ -87,7 +87,22 @@ pub(super) fn walk_calls( // real local symbol is a genuine call and must be kept. Only drop // built-ins when they DON'T resolve, so they can't become cross-file // god-nodes via the raw-call pass (#726). - let tgt_nid = ctx.label_to_nid.get(&callee.to_lowercase()).cloned(); + // A capitalized-receiver Python member call (`ClassName.method()`) + // defers to receiver-based cross-file resolution: the bare method + // name can collide with an in-file node — even the calling method + // itself — which would match `tgt == caller` and silently drop the + // call. `resolve_python_member_calls` resolves it via the receiver + // (#1446). Gated to Python so Swift's own resolver is unaffected. + let defer_member = is_member_call + && ctx.config.lang_id == LangId::Python + && receiver + .as_deref() + .is_some_and(|r| r.chars().next().is_some_and(char::is_uppercase)); + let tgt_nid = if defer_member { + None + } else { + ctx.label_to_nid.get(&callee.to_lowercase()).cloned() + }; if let Some(tgt) = tgt_nid { if tgt != caller_nid { let pair = (caller_nid.to_string(), tgt.clone()); @@ -371,6 +386,16 @@ fn extract_callee( { callee_name = Some(read_text_owned(attr, source)); } + // #1446: capture a simple-identifier receiver (`ClassName` in + // `ClassName.method()`) so cross-file resolution can resolve + // qualified Python class-method calls. Chained receivers + // (`a.b.method()`) are skipped. + if config.lang_id == LangId::Python + && let Some(obj) = func_node.child_by_field_name("object") + && obj.kind() == "identifier" + { + receiver = Some(read_text_owned(obj, source)); + } } else { callee_name = Some(read_text_owned(func_node, source)); } diff --git a/crates/graphify-extract/tests/cross_file_multi.rs b/crates/graphify-extract/tests/cross_file_multi.rs index f714073..dbccc0b 100644 --- a/crates/graphify-extract/tests/cross_file_multi.rs +++ b/crates/graphify-extract/tests/cross_file_multi.rs @@ -521,3 +521,181 @@ fn default_import_call_resolves_to_default_exported_function() { "calls" )); } + +// ── #1446: qualified ClassName.method() call resolution ────────────────────── + +type NodeMap<'a> = + std::collections::HashMap<&'a str, &'a indexmap::IndexMap>; + +fn index_nodes(nodes: &[indexmap::IndexMap]) -> NodeMap<'_> { + nodes + .iter() + .filter_map(|n| n.get("id").and_then(|v| v.as_str()).map(|id| (id, n))) + .collect() +} + +/// `field` of the node referenced by `edge[endpoint]` (a source/target id), or "". +fn endpoint_field( + idx: &NodeMap, + edge: &indexmap::IndexMap, + endpoint: &str, + field: &str, +) -> String { + edge.get(endpoint) + .and_then(|v| v.as_str()) + .and_then(|id| idx.get(id)) + .and_then(|n| n.get(field)) + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string() +} + +#[test] +fn python_qualified_class_method_call_resolves_extracted() { + // `ClassName.method()` across files resolves to the class-qualified method + // node with an EXTRACTED `calls` edge (#1446). + let tmp = tempfile::tempdir().expect("tempdir"); + let actions = tmp.path().join("actions.py"); + let viewset = tmp.path().join("viewset.py"); + fs::write( + &actions, + "class TaskActions:\n @staticmethod\n def approve(pk):\n return pk\n", + ) + .expect("write actions"); + fs::write( + &viewset, + "from actions import TaskActions\n\nclass TaskViewSet:\n def handle(self, request):\n return TaskActions.approve(request)\n", + ) + .expect("write viewset"); + let result = extract(&[viewset, actions], Some(tmp.path())); + let idx = index_nodes(&result.nodes); + let call_edges: Vec<_> = result + .edges + .iter() + .filter(|e| { + e.get("relation").and_then(|v| v.as_str()) == Some("calls") + && endpoint_field(&idx, e, "source", "label").contains("handle") + && endpoint_field(&idx, e, "target", "label").contains("approve") + && endpoint_field(&idx, e, "target", "source_file").contains("actions.py") + }) + .collect(); + assert_eq!( + call_edges.len(), + 1, + "expected one handle->approve edge, got {call_edges:?}" + ); + assert_eq!( + call_edges[0].get("confidence").and_then(|v| v.as_str()), + Some("EXTRACTED") + ); +} + +#[test] +fn python_qualified_call_resolves_when_method_name_collides_with_caller() { + // A viewset action `approve()` delegates to a SERVICE action of the SAME + // name via `TaskActions.approve()`. The bare-name in-file lookup would match + // the caller's own node and silently drop the call; the qualified receiver + // must still resolve it cross-file (#1446). + let tmp = tempfile::tempdir().expect("tempdir"); + let actions = tmp.path().join("actions.py"); + let viewset = tmp.path().join("viewset.py"); + fs::write( + &actions, + "class TaskActions:\n @staticmethod\n def approve(pk):\n return pk\n", + ) + .expect("write actions"); + fs::write( + &viewset, + "from actions import TaskActions\n\nclass TaskViewSet:\n def approve(self, request):\n return TaskActions.approve(request)\n", + ) + .expect("write viewset"); + let result = extract(&[viewset, actions], Some(tmp.path())); + let idx = index_nodes(&result.nodes); + let cross: Vec<_> = result + .edges + .iter() + .filter(|e| { + e.get("relation").and_then(|v| v.as_str()) == Some("calls") + && endpoint_field(&idx, e, "source", "source_file").contains("viewset.py") + && endpoint_field(&idx, e, "target", "source_file").contains("actions.py") + && endpoint_field(&idx, e, "target", "label").contains("approve") + }) + .collect(); + assert_eq!( + cross.len(), + 1, + "expected viewset->service approve edge, got {cross:?}" + ); + assert_eq!( + cross[0].get("confidence").and_then(|v| v.as_str()), + Some("EXTRACTED") + ); +} + +#[test] +fn python_instance_member_call_not_overconnected() { + // A lowercase-receiver member call (`obj.run()`) must NOT resolve cross-file + // — the god-node guard stays intact (#1446). + let tmp = tempfile::tempdir().expect("tempdir"); + let svc = tmp.path().join("svc.py"); + let worker = tmp.path().join("worker.py"); + fs::write( + &svc, + "class Service:\n def run(self):\n return 1\n", + ) + .expect("write svc"); + fs::write( + &worker, + "class Worker:\n def go(self, obj):\n return obj.run()\n", + ) + .expect("write worker"); + let result = extract(&[worker, svc], Some(tmp.path())); + let idx = index_nodes(&result.nodes); + let bad: Vec<_> = result + .edges + .iter() + .filter(|e| { + e.get("relation").and_then(|v| v.as_str()) == Some("calls") + && endpoint_field(&idx, e, "source", "label").contains("go") + && endpoint_field(&idx, e, "target", "label").contains("run") + }) + .collect(); + assert!( + bad.is_empty(), + "instance member call must not connect cross-file: {bad:?}" + ); +} + +#[test] +fn python_qualified_call_ambiguous_class_bails() { + // When the class name is defined in 2+ files, the qualified call must not + // resolve — single-definition god-node guard (#1446). + let tmp = tempfile::tempdir().expect("tempdir"); + let a = tmp.path().join("a.py"); + let b = tmp.path().join("b.py"); + let caller = tmp.path().join("caller.py"); + fs::write(&a, "class Helper:\n def do(self):\n return 1\n").expect("write a"); + fs::write(&b, "class Helper:\n def do(self):\n return 2\n").expect("write b"); + fs::write( + &caller, + "from a import Helper\n\nclass C:\n def f(self):\n return Helper.do(self)\n", + ) + .expect("write caller"); + let result = extract(&[caller, a, b], Some(tmp.path())); + let idx = index_nodes(&result.nodes); + let resolved: Vec<_> = result + .edges + .iter() + .filter(|e| { + e.get("relation").and_then(|v| v.as_str()) == Some("calls") + && endpoint_field(&idx, e, "source", "label") + .trim_matches(|c| matches!(c, '(' | ')' | '.')) + == "f" + && endpoint_field(&idx, e, "target", "label").contains("do") + }) + .collect(); + assert!( + resolved.is_empty(), + "ambiguous class name must not resolve: {resolved:?}" + ); +} diff --git a/crates/graphify-extract/tests/fixtures/sample.cu b/crates/graphify-extract/tests/fixtures/sample.cu new file mode 100644 index 0000000..29515c0 --- /dev/null +++ b/crates/graphify-extract/tests/fixtures/sample.cu @@ -0,0 +1,30 @@ +#include +#include + +struct Vec3 { + float x; + float y; + float z; +}; + +__device__ float dot(const Vec3& a, const Vec3& b) { + return a.x * b.x + a.y * b.y + a.z * b.z; +} + +__global__ void saxpy(int n, float a, const float* x, float* y) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) { + y[i] = a * x[i] + y[i]; + } +} + +float host_norm(const Vec3& v) { + return dot(v, v); +} + +int main() { + Vec3 v{1.0f, 2.0f, 3.0f}; + float n = host_norm(v); + saxpy<<<1, 256>>>(256, 2.0f, nullptr, nullptr); + return 0; +} diff --git a/crates/graphify-extract/tests/parity_languages.rs b/crates/graphify-extract/tests/parity_languages.rs index b689829..98ad194 100644 --- a/crates/graphify-extract/tests/parity_languages.rs +++ b/crates/graphify-extract/tests/parity_languages.rs @@ -332,6 +332,48 @@ fn cpp_extractor_produces_nodes() { assert_no_dangling_edges(&result); } +// ── CUDA (.cu/.cuh route through the C++ extractor) ────────────────────────── + +#[test] +fn cuda_no_error() { + let r = extract_cpp(&fixtures().join("sample.cu")); + assert!(r.error.is_none(), "{:?}", r.error); +} + +#[test] +fn cuda_finds_kernel_and_device_functions() { + let r = extract_cpp(&fixtures().join("sample.cu")); + let labels = labels(&r); + assert!(labels.iter().any(|l| l.contains("saxpy")), "{labels:?}"); // __global__ kernel + assert!(labels.iter().any(|l| l.contains("dot")), "{labels:?}"); // __device__ function +} + +#[test] +fn cuda_finds_struct() { + let r = extract_cpp(&fixtures().join("sample.cu")); + assert!(labels(&r).iter().any(|l| l.contains("Vec3"))); +} + +#[test] +fn cuda_finds_includes() { + let r = extract_cpp(&fixtures().join("sample.cu")); + assert!(relations(&r).contains("imports")); +} + +#[test] +fn cuda_host_call_edges() { + let r = extract_cpp(&fixtures().join("sample.cu")); + let calls = calls(&r); + assert!( + calls.contains(&("host_norm()".to_string(), "dot()".to_string())), + "{calls:?}" + ); + assert!( + calls.contains(&("main()".to_string(), "host_norm()".to_string())), + "{calls:?}" + ); +} + #[test] fn csharp_extractor_produces_nodes() { let result = extract_csharp(&fixtures().join("sample.cs")); diff --git a/crates/graphify-hooks/src/constants.rs b/crates/graphify-hooks/src/constants.rs index e1491b4..c3dabd3 100644 --- a/crates/graphify-hooks/src/constants.rs +++ b/crates/graphify-hooks/src/constants.rs @@ -176,12 +176,24 @@ try: signal.alarm(_timeout) _force = os.environ.get('GRAPHIFY_FORCE', '').lower() in ('1', 'true', 'yes') _root = Path('.') - _saved = Path('graphify-out/.graphify_root') + _out = os.environ.get('GRAPHIFY_OUT', 'graphify-out') + _saved = Path(_out) / '.graphify_root' if _saved.exists(): _txt = _saved.read_text(encoding='utf-8').strip() if _txt: _root = Path(_txt) _rebuild_code(_root, changed_paths=changed, force=_force) + # Refresh the work-memory lessons doc when saved Q&A outcomes exist + # (best-effort; never fails the hook). + try: + _md = (_root / _out) / 'memory' + if _md.is_dir() and any(_md.glob('*.md')): + from graphify.reflect import reflect as _reflect + _gj = (_root / _out) / 'graph.json' + _reflect(memory_dir=_md, out_path=(_root / _out) / 'reflections' / 'LESSONS.md', + graph_path=_gj if _gj.exists() else None) + except Exception: + pass except TimeoutError as exc: print(f'[graphify hook] {exc}') sys.exit(1) @@ -268,12 +280,24 @@ try: signal.alarm(_timeout) _force = os.environ.get('GRAPHIFY_FORCE', '').lower() in ('1', 'true', 'yes') _root = Path('.') - _saved = Path('graphify-out/.graphify_root') + _out = os.environ.get('GRAPHIFY_OUT', 'graphify-out') + _saved = Path(_out) / '.graphify_root' if _saved.exists(): _txt = _saved.read_text(encoding='utf-8').strip() if _txt: _root = Path(_txt) _rebuild_code(_root, force=_force) + # Refresh the work-memory lessons doc when saved Q&A outcomes exist + # (best-effort; never fails the hook). + try: + _md = (_root / _out) / 'memory' + if _md.is_dir() and any(_md.glob('*.md')): + from graphify.reflect import reflect as _reflect + _gj = (_root / _out) / 'graph.json' + _reflect(memory_dir=_md, out_path=(_root / _out) / 'reflections' / 'LESSONS.md', + graph_path=_gj if _gj.exists() else None) + except Exception: + pass except TimeoutError as exc: print(f'[graphify] {exc}') sys.exit(1) diff --git a/crates/graphify-hooks/src/platform/agents.rs b/crates/graphify-hooks/src/platform/agents.rs index 2146092..2c297fd 100644 --- a/crates/graphify-hooks/src/platform/agents.rs +++ b/crates/graphify-hooks/src/platform/agents.rs @@ -8,8 +8,10 @@ use std::fs; use std::path::Path; use super::codex::{install_codex_hook, uninstall_codex_hook}; +use super::common::fs::{dirs_home, remove_skill}; use super::common::{ - AGENTS_MD_SECTION, CLAUDE_MD_MARKER, remove_graphify_section, replace_or_append_section, + AGENTS_MD_SECTION, CLAUDE_MD_MARKER, install_platform_skill, remove_graphify_section, + replace_or_append_section, }; use super::kilo::{install_kilo_plugin, uninstall_kilo_plugin}; use super::opencode::{install_opencode_plugin, uninstall_opencode_plugin}; @@ -149,3 +151,40 @@ fn push_platform_extra_uninstall( } Ok(()) } + +/// `graphify agents install`: skill into `~/.agents/skills` PLUS the always-on +/// `AGENTS.md` section. The amp-twin of the generic Agent-Skills target; the +/// bare `graphify install --platform agents` path stays skill-only (#1432). +/// +/// # Errors +/// +/// Returns `HooksError::Io` on filesystem failures or `HooksError::Json` on +/// JSON serialisation failure. +pub fn agents_platform_install(project_dir: &Path) -> Result { + let skill = install_platform_skill("agents")?; + let agents = agents_install(project_dir, "agents")?; + Ok(format!("{skill}\n{agents}")) +} + +/// `graphify agents uninstall`: remove the `~/.agents/skills` skill and the +/// project `AGENTS.md` section (#1432). +/// +/// # Errors +/// +/// Returns `HooksError::Io` on filesystem failures. +pub fn agents_platform_uninstall(project_dir: &Path) -> Result { + let skill_dst = dirs_home() + .join(".agents") + .join("skills") + .join("graphify") + .join("SKILL.md"); + let removed = skill_dst.exists(); + remove_skill(&skill_dst); + let agents = agents_uninstall(project_dir, "agents")?; + let mut msgs = Vec::new(); + if removed { + msgs.push("skill removed".to_string()); + } + msgs.push(agents); + Ok(msgs.join("\n")) +} diff --git a/crates/graphify-hooks/src/platform/common/install_skill.rs b/crates/graphify-hooks/src/platform/common/install_skill.rs index 855b727..7fb0696 100644 --- a/crates/graphify-hooks/src/platform/common/install_skill.rs +++ b/crates/graphify-hooks/src/platform/common/install_skill.rs @@ -2,6 +2,7 @@ //! a home-directory skill file (no project-local files). use std::fs; +use std::path::{Path, PathBuf}; use crate::HooksError; @@ -12,6 +13,28 @@ use super::skills::{ SKILL_TRAE_MD, SKILL_WINDOWS_MD, }; +/// Global-scope hermes skill destination. On Windows, Hermes scans +/// `%LOCALAPPDATA%\hermes\skills`, not `~/.hermes`, falling back to +/// `/AppData/Local` when `LOCALAPPDATA` is unset (#1403). Off Windows it +/// stays `/.hermes/skills/graphify/SKILL.md`. Mirrors the hermes branch of +/// Python `_platform_skill_destination`. +#[must_use] +pub fn hermes_skill_dst(home: &Path, localappdata: Option<&Path>, is_windows: bool) -> PathBuf { + if is_windows { + let base = + localappdata.map_or_else(|| home.join("AppData").join("Local"), Path::to_path_buf); + base.join("hermes") + .join("skills") + .join("graphify") + .join("SKILL.md") + } else { + home.join(".hermes") + .join("skills") + .join("graphify") + .join("SKILL.md") + } +} + /// Install a skill-only platform integration. /// /// Maps `platform` to the correct skill content + destination path and @@ -38,6 +61,14 @@ pub fn install_platform_skill(platform: &str) -> Result { Some(cfg_dir) => cfg_dir.join("skills").join("graphify").join("SKILL.md"), None => dirs_home().join(home_rel), } + } else if platform == "hermes" { + // Hermes scans %LOCALAPPDATA% on Windows rather than ~/.hermes (#1403). + let localappdata = std::env::var_os("LOCALAPPDATA").map(PathBuf::from); + hermes_skill_dst( + &dirs_home(), + localappdata.as_deref(), + cfg!(target_os = "windows"), + ) } else { dirs_home().join(home_rel) }; @@ -164,7 +195,9 @@ fn skill_for(platform: &str, project: bool) -> Result<(&'static str, &'static st // handled by `devin_project_install`, which also writes // `.windsurf/rules/graphify.md`. "devin" => (SKILL_MD, ".config/devin/skills/graphify/SKILL.md"), - "antigravity" => (SKILL_MD, ".agents/skills/graphify/SKILL.md"), + // antigravity (global) and the generic `agents` platform (#1432) share the + // user-global ~/.agents/skills dir; project scope -> ./.agents/skills. + "antigravity" | "agents" => (SKILL_MD, ".agents/skills/graphify/SKILL.md"), "antigravity-windows" => (SKILL_WINDOWS_MD, ".agents/skills/graphify/SKILL.md"), other => return Err(HooksError::UnknownPlatform(other.to_string())), }) diff --git a/crates/graphify-hooks/src/platform/common/markdown.rs b/crates/graphify-hooks/src/platform/common/markdown.rs index 5027a5d..9704967 100644 --- a/crates/graphify-hooks/src/platform/common/markdown.rs +++ b/crates/graphify-hooks/src/platform/common/markdown.rs @@ -154,7 +154,7 @@ export const GraphifyPlugin = async ({ directory }) => { if (input.tool === \"bash\") { output.args.command = - 'echo \"[graphify] knowledge graph at graphify-out/. For focused questions, run \\`graphify query \\\"\\\"\\` (scoped subgraph, usually much smaller than GRAPH_REPORT.md) instead of grepping raw files. Read GRAPH_REPORT.md only for broad architecture context.\" && ' + + 'echo \"[graphify] knowledge graph at graphify-out/. For focused questions, run graphify query with your question (scoped subgraph, usually much smaller than GRAPH_REPORT.md) instead of grepping raw files. Read GRAPH_REPORT.md only for broad architecture context.\" && ' + output.args.command; reminded = true; } diff --git a/crates/graphify-hooks/src/platform/common/mod.rs b/crates/graphify-hooks/src/platform/common/mod.rs index f3ae274..04e198c 100644 --- a/crates/graphify-hooks/src/platform/common/mod.rs +++ b/crates/graphify-hooks/src/platform/common/mod.rs @@ -24,8 +24,8 @@ pub(super) mod uninstall_all; // imports inside the per-platform files continue to resolve. pub use fs::{replace_or_append_section, resolve_graphify_exe}; pub use install_skill::{ - amp_install, amp_uninstall, install_platform_skill, install_platform_skill_project, - uninstall_platform_skill_project, + amp_install, amp_uninstall, hermes_skill_dst, install_platform_skill, + install_platform_skill_project, uninstall_platform_skill_project, }; pub use markdown::{ AGENTS_MD_SECTION, ANTIGRAVITY_RULES, ANTIGRAVITY_WORKFLOW, CLAUDE_MD_MARKER, diff --git a/crates/graphify-hooks/src/platform/common/uninstall_all.rs b/crates/graphify-hooks/src/platform/common/uninstall_all.rs index b211f11..bfa076b 100644 --- a/crates/graphify-hooks/src/platform/common/uninstall_all.rs +++ b/crates/graphify-hooks/src/platform/common/uninstall_all.rs @@ -42,6 +42,12 @@ pub fn uninstall_all(project_dir: &Path, purge: bool) -> Result msgs.push(msg), diff --git a/crates/graphify-hooks/src/platform/mod.rs b/crates/graphify-hooks/src/platform/mod.rs index 3e189da..438760d 100644 --- a/crates/graphify-hooks/src/platform/mod.rs +++ b/crates/graphify-hooks/src/platform/mod.rs @@ -22,7 +22,9 @@ pub mod vscode; // Re-export the full public surface so callers can use `graphify_hooks::platform::*` // as they did when everything lived in a single file. -pub use agents::{agents_install, agents_uninstall}; +pub use agents::{ + agents_install, agents_platform_install, agents_platform_uninstall, agents_uninstall, +}; pub use antigravity::{antigravity_install, antigravity_uninstall}; pub use claude::{claude_install, claude_uninstall, install_claude_hook, uninstall_claude_hook}; pub use codebuddy::{ @@ -33,8 +35,9 @@ pub use common::{ AGENTS_MD_SECTION, ANTIGRAVITY_RULES, ANTIGRAVITY_WORKFLOW, CLAUDE_MD_MARKER, CLAUDE_MD_SECTION, CURSOR_RULE, GEMINI_MD_SECTION, KIRO_STEERING, OPENCODE_PLUGIN_JS, SETTINGS_HOOK_MATCHER, VSCODE_INSTRUCTIONS_SECTION, amp_install, amp_uninstall, - install_platform_skill, install_platform_skill_project, replace_or_append_section, - resolve_graphify_exe, uninstall_all, uninstall_platform_skill_project, + hermes_skill_dst, install_platform_skill, install_platform_skill_project, + replace_or_append_section, resolve_graphify_exe, uninstall_all, + uninstall_platform_skill_project, }; pub use copilot::{copilot_install, copilot_uninstall}; pub use cursor::{cursor_install, cursor_uninstall}; diff --git a/crates/graphify-hooks/tests/parity.rs b/crates/graphify-hooks/tests/parity.rs index 0c40fa1..55da1f6 100644 --- a/crates/graphify-hooks/tests/parity.rs +++ b/crates/graphify-hooks/tests/parity.rs @@ -434,15 +434,57 @@ fn test_scripts_contain_python_detect() { use graphify_hooks::platform::{ AGENTS_MD_SECTION, ANTIGRAVITY_RULES, CLAUDE_MD_MARKER, CLAUDE_MD_SECTION, CURSOR_RULE, GEMINI_MD_SECTION, KIRO_STEERING, OPENCODE_PLUGIN_JS, VSCODE_INSTRUCTIONS_SECTION, - agents_install, agents_uninstall, amp_install, amp_uninstall, antigravity_install, - antigravity_uninstall, claude_install, claude_uninstall, codebuddy_install, - codebuddy_uninstall, cursor_install, cursor_uninstall, gemini_install, gemini_uninstall, - install_claude_hook, install_codex_hook, install_gemini_hook, install_opencode_plugin, - install_platform_skill, install_platform_skill_project, kiro_install, kiro_uninstall, - replace_or_append_section, uninstall_claude_hook, uninstall_codex_hook, uninstall_gemini_hook, - uninstall_opencode_plugin, uninstall_platform_skill_project, vscode_install, vscode_uninstall, + agents_install, agents_platform_install, agents_uninstall, amp_install, amp_uninstall, + antigravity_install, antigravity_uninstall, claude_install, claude_uninstall, + codebuddy_install, codebuddy_uninstall, cursor_install, cursor_uninstall, gemini_install, + gemini_uninstall, hermes_skill_dst, install_claude_hook, install_codex_hook, + install_gemini_hook, install_opencode_plugin, install_platform_skill, + install_platform_skill_project, kiro_install, kiro_uninstall, replace_or_append_section, + uninstall_claude_hook, uninstall_codex_hook, uninstall_gemini_hook, uninstall_opencode_plugin, + uninstall_platform_skill_project, vscode_install, vscode_uninstall, }; +// ── #1403: hermes skill destination (Windows %LOCALAPPDATA%) ───────────────── + +#[test] +fn test_hermes_skill_destination_windows_uses_localappdata() { + // On Windows, Hermes scans %LOCALAPPDATA%\hermes\skills, not ~/.hermes. + let home = Path::new("/home/user"); + let localappdata = Path::new("/tmp/AppDataLocal"); + let dst = hermes_skill_dst(home, Some(localappdata), true); + assert_eq!( + dst, + Path::new("/tmp/AppDataLocal") + .join("hermes") + .join("skills") + .join("graphify") + .join("SKILL.md") + ); +} + +#[test] +fn test_hermes_skill_destination_windows_falls_back_to_appdata_local() { + // LOCALAPPDATA unset on Windows -> /AppData/Local. + let home = Path::new("/home/user"); + let dst = hermes_skill_dst(home, None, true); + assert_eq!( + dst, + home.join("AppData") + .join("Local") + .join("hermes") + .join("skills") + .join("graphify") + .join("SKILL.md") + ); +} + +#[test] +fn test_hermes_skill_destination_posix_uses_home() { + let home = Path::new("/home/user"); + let dst = hermes_skill_dst(home, None, false); + assert!(dst.ends_with(".hermes/skills/graphify/SKILL.md"), "{dst:?}"); +} + // --------------------------------------------------------------------------- // _replace_or_append_section (test_claude_md.py indirectly, test_install.py) // --------------------------------------------------------------------------- @@ -1020,6 +1062,79 @@ fn test_install_opencode() { ); } +// ── #1432: generic `agents` platform + `skills` alias ──────────────────────── + +#[test] +#[serial(home_env)] +fn test_install_agents_user_global() { + // `--platform agents` lands the skill at ~/.agents/skills (skill-only). + let dir = tempfile::tempdir().expect("tempdir"); + install_skill_to(dir.path(), "agents"); + assert!(dir.path().join(".agents/skills/graphify/SKILL.md").exists()); + // Skill-only: no AGENTS.md from the bare install path. + assert!(!dir.path().join("AGENTS.md").exists()); +} + +#[test] +fn test_install_agents_project_uses_dot_agents() { + let dir = tempfile::tempdir().expect("tempdir"); + install_platform_skill_project("agents", dir.path()).expect("test invariant"); + assert!(dir.path().join(".agents/skills/graphify/SKILL.md").exists()); +} + +#[test] +#[serial(home_env)] +fn test_agents_subcommand_wires_skill_and_agents_md() { + // `graphify agents install` is the amp-twin: skill at ~/.agents/skills PLUS + // an AGENTS.md `## graphify` section. Running it twice stays idempotent. + let home = tempfile::tempdir().expect("tempdir"); + let proj = tempfile::tempdir().expect("tempdir"); + // SAFETY: test-only; serialised via `#[serial(home_env)]`. + unsafe { + std::env::set_var("HOME", home.path()); + } + agents_platform_install(proj.path()).expect("test invariant"); + agents_platform_install(proj.path()).expect("idempotent re-run"); + // SAFETY: test-only cleanup. + unsafe { + std::env::remove_var("HOME"); + } + assert!( + home.path() + .join(".agents/skills/graphify/SKILL.md") + .exists() + ); + let body = fs::read_to_string(proj.path().join("AGENTS.md")).expect("AGENTS.md"); + assert!(body.contains("## graphify")); + assert_eq!( + body.matches("## graphify").count(), + 1, + "AGENTS.md gained a duplicate graphify section" + ); +} + +#[test] +fn test_opencode_plugin_reminder_has_no_backticks() { + // #1413: backticks or `$(` inside the echo reminder would trigger bash + // command substitution, corrupting tool output and silently running the very + // command we only suggest. The reminder must be plain prose. + let start = OPENCODE_PLUGIN_JS + .find("echo \"") + .expect("echo reminder present") + + "echo \"".len(); + let rest = &OPENCODE_PLUGIN_JS[start..]; + let end = rest.find('"').expect("echo reminder terminator"); + let reminder = &rest[..end]; + assert!( + !reminder.contains('`'), + "reminder has a backtick: {reminder}" + ); + assert!( + !reminder.contains("$("), + "reminder has a $( construct: {reminder}" + ); +} + #[test] fn test_install_opencode_project_uses_dot_opencode() { // Project scope writes under `.opencode/`, not `.config/opencode/` (#1042). @@ -2626,9 +2741,10 @@ fn hooks_use_cross_platform_detach() { } } -/// Both rebuild bodies read `graphify-out/.graphify_root` and pass the +/// Both rebuild bodies read `/.graphify_root` and pass the /// recovered root to `_rebuild_code`, so a scoped build is not silently -/// expanded to the full repo (#1173). +/// expanded to the full repo (#1173). The output dir is resolved from +/// `GRAPHIFY_OUT` at hook-run time rather than hardcoded (#1423). #[test] fn rebuild_bodies_read_graphify_root() { for (name, script) in [ @@ -2636,9 +2752,13 @@ fn rebuild_bodies_read_graphify_root() { ("post-checkout", CHECKOUT_SCRIPT), ] { assert!( - script.contains("graphify-out/.graphify_root"), + script.contains(".graphify_root"), "{name} ignores .graphify_root" ); + assert!( + script.contains("GRAPHIFY_OUT"), + "{name} ignores the GRAPHIFY_OUT override (#1423)" + ); assert!( script.contains("_rebuild_code(_root"), "{name} does not pass recovered root" @@ -2647,6 +2767,10 @@ fn rebuild_bodies_read_graphify_root() { script.contains("read_text(encoding='utf-8')"), "{name} root read is not single-quoted (shell-quote-safe)" ); + assert!( + script.contains("from graphify.reflect import reflect"), + "{name} does not refresh the lessons doc post-rebuild (#1441)" + ); } } diff --git a/crates/graphify-html/src/callflow/loader.rs b/crates/graphify-html/src/callflow/loader.rs index 7ab2891..5d3d63b 100644 --- a/crates/graphify-html/src/callflow/loader.rs +++ b/crates/graphify-html/src/callflow/loader.rs @@ -468,11 +468,12 @@ pub fn infer_project_name(graph_path: &Path, meta: &IndexMap } else if base.join("graph.json").exists() { base.clone() } else { - base.join("graphify-out") + base.join(graphify_security::graphify_out()) }; - let project_root = if graphify_out.file_name().and_then(|n| n.to_str()) == Some("graphify-out") - { - graphify_out - .parent() - .map_or_else(|| base.clone(), Path::to_path_buf) - } else { - base.clone() - }; + let out_name = graphify_security::graphify_out_name(); + let project_root = + if graphify_out.file_name().and_then(|n| n.to_str()) == Some(out_name.as_str()) { + graphify_out + .parent() + .map_or_else(|| base.clone(), Path::to_path_buf) + } else { + base.clone() + }; let graph = opts .graph diff --git a/crates/graphify-ingest/src/error.rs b/crates/graphify-ingest/src/error.rs index 1a17d24..7581ed9 100644 --- a/crates/graphify-ingest/src/error.rs +++ b/crates/graphify-ingest/src/error.rs @@ -41,4 +41,13 @@ pub enum IngestError { /// filename plus `_1` through `_999`). #[error("ingest: could not find a free filename after 1000 attempts for {0:?}")] FilenameFull(PathBuf), + + /// `save_query_result` was given an `outcome` outside the allowed set. + #[error( + "ingest: outcome must be one of [\"useful\", \"dead_end\", \"corrected\"], got {got:?}" + )] + InvalidOutcome { + /// The rejected outcome value. + got: String, + }, } diff --git a/crates/graphify-ingest/src/lib.rs b/crates/graphify-ingest/src/lib.rs index be39077..b57c259 100644 --- a/crates/graphify-ingest/src/lib.rs +++ b/crates/graphify-ingest/src/lib.rs @@ -25,5 +25,5 @@ mod text; pub use error::IngestError; pub use ingest_fn::{ingest, ingest_with}; -pub use memory::save_query_result; +pub use memory::{OUTCOMES, save_query_result}; pub use text::{detect_url_type, html_to_markdown, safe_filename, yaml_str}; diff --git a/crates/graphify-ingest/src/memory.rs b/crates/graphify-ingest/src/memory.rs index f5bab94..13291f7 100644 --- a/crates/graphify-ingest/src/memory.rs +++ b/crates/graphify-ingest/src/memory.rs @@ -9,6 +9,9 @@ use crate::error::IngestError; use crate::regexes::RE_NON_WORD; use crate::text::yaml_str; +/// Work-memory outcome signals accepted by [`save_query_result`] (#1441). +pub const OUTCOMES: [&str; 3] = ["useful", "dead_end", "corrected"]; + /// Save a Q&A result as Markdown. /// /// Files are stored in `memory_dir` (typically `graphify-out/memory/`) @@ -19,17 +22,33 @@ use crate::text::yaml_str; /// `source_nodes` is optional; if provided, the first 10 are embedded into /// the frontmatter as a YAML list and into the body as a bullet list. /// +/// `outcome` (one of [`OUTCOMES`]) and `correction` are optional work-memory +/// signals: when set, they are written to both the frontmatter and an +/// `## Outcome` body section so `graphify reflect` can aggregate them. +/// /// # Errors /// -/// Returns [`IngestError::Io`] if the memory directory cannot be created -/// or the file cannot be written. +/// Returns [`IngestError::InvalidOutcome`] if `outcome` is set to a value +/// outside [`OUTCOMES`], or [`IngestError::Io`] if the memory directory cannot +/// be created or the file cannot be written. pub fn save_query_result( question: &str, answer: &str, memory_dir: &Path, query_type: &str, source_nodes: Option<&[String]>, + outcome: Option<&str>, + correction: Option<&str>, ) -> Result { + // Parity with graphify-py `save_query_result` (ingest.py): only `outcome` + // is validated against `OUTCOMES`. `correction` is accepted alongside any + // outcome (or none) — the aggregator simply ignores it for non-`corrected` + // docs — so no extra `correction`-requires-`corrected` guard is added here. + if let Some(o) = outcome + && !OUTCOMES.contains(&o) + { + return Err(IngestError::InvalidOutcome { got: o.to_string() }); + } std::fs::create_dir_all(memory_dir)?; let now = Utc::now(); @@ -53,6 +72,12 @@ pub fn save_query_result( format!("question: \"{}\"", yaml_str(question)), "contributor: \"graphify\"".to_string(), ]; + if let Some(o) = outcome { + frontmatter_lines.push(format!("outcome: \"{}\"", yaml_str(o))); + } + if let Some(c) = correction.filter(|c| !c.is_empty()) { + frontmatter_lines.push(format!("correction: \"{}\"", yaml_str(c))); + } if let Some(nodes) = source_nodes { let nodes_str = nodes @@ -75,6 +100,17 @@ pub fn save_query_result( answer.to_string(), ]; + if outcome.is_some() || correction.is_some_and(|c| !c.is_empty()) { + body_lines.push(String::new()); + body_lines.push("## Outcome".to_string()); + body_lines.push(String::new()); + if let Some(o) = outcome { + body_lines.push(format!("- Signal: {o}")); + } + if let Some(c) = correction.filter(|c| !c.is_empty()) { + body_lines.push(format!("- Correction: {c}")); + } + } if let Some(nodes) = source_nodes { body_lines.push(String::new()); body_lines.push("## Source Nodes".to_string()); diff --git a/crates/graphify-ingest/tests/parity.rs b/crates/graphify-ingest/tests/parity.rs index 0c47652..cf05521 100644 --- a/crates/graphify-ingest/tests/parity.rs +++ b/crates/graphify-ingest/tests/parity.rs @@ -212,8 +212,16 @@ fn html_to_markdown_strips_style() { fn test_file_created() { let tmp = tempfile::tempdir().expect("tempdir"); let mem = tmp.path().join("memory"); - let out = save_query_result("what is attention?", "Attention is...", &mem, "query", None) - .expect("save ok"); + let out = save_query_result( + "what is attention?", + "Attention is...", + &mem, + "query", + None, + None, + None, + ) + .expect("save ok"); assert!(out.exists()); } @@ -227,6 +235,8 @@ fn test_filename_format() { &mem, "query", None, + None, + None, ) .expect("save ok"); assert!( @@ -253,6 +263,8 @@ fn test_frontmatter_question() { &mem, "query", None, + None, + None, ) .expect("save ok"); let content = std::fs::read_to_string(&out).expect("read"); @@ -264,7 +276,7 @@ fn test_frontmatter_question() { fn test_frontmatter_type() { let tmp = tempfile::tempdir().expect("tempdir"); let mem = tmp.path().join("memory"); - let out = save_query_result("q", "a", &mem, "path_query", None).expect("save ok"); + let out = save_query_result("q", "a", &mem, "path_query", None, None, None).expect("save ok"); let content = std::fs::read_to_string(&out).expect("read"); assert!(content.contains("type: \"path_query\"")); } @@ -274,7 +286,8 @@ fn test_source_nodes_included() { let tmp = tempfile::tempdir().expect("tempdir"); let mem = tmp.path().join("memory"); let nodes = vec!["AttentionLayer".to_string(), "SoftmaxFunc".to_string()]; - let out = save_query_result("q", "a", &mem, "query", Some(&nodes)).expect("save ok"); + let out = + save_query_result("q", "a", &mem, "query", Some(&nodes), None, None).expect("save ok"); let content = std::fs::read_to_string(&out).expect("read"); assert!(content.contains("AttentionLayer")); assert!(content.contains("SoftmaxFunc")); @@ -285,7 +298,8 @@ fn test_source_nodes_capped_at_10() { let tmp = tempfile::tempdir().expect("tempdir"); let mem = tmp.path().join("memory"); let nodes: Vec = (0..20).map(|i| format!("Node{i}")).collect(); - let out = save_query_result("q", "a", &mem, "query", Some(&nodes)).expect("save ok"); + let out = + save_query_result("q", "a", &mem, "query", Some(&nodes), None, None).expect("save ok"); let content = std::fs::read_to_string(&out).expect("read"); // Only first 10 should appear in frontmatter source_nodes line let fm_line = content @@ -300,7 +314,7 @@ fn test_memory_dir_created() { let tmp = tempfile::tempdir().expect("tempdir"); let mem = tmp.path().join("deep").join("memory"); assert!(!mem.exists()); - save_query_result("q", "a", &mem, "query", None).expect("save ok"); + save_query_result("q", "a", &mem, "query", None, None, None).expect("save ok"); assert!(mem.exists()); } @@ -309,12 +323,70 @@ fn test_answer_in_body() { let tmp = tempfile::tempdir().expect("tempdir"); let mem = tmp.path().join("memory"); let answer = "The answer is forty-two."; - let out = - save_query_result("what is the answer?", answer, &mem, "query", None).expect("save ok"); + let out = save_query_result( + "what is the answer?", + answer, + &mem, + "query", + None, + None, + None, + ) + .expect("save ok"); let content = std::fs::read_to_string(&out).expect("read"); assert!(content.contains(answer)); } +#[test] +fn test_outcome_in_frontmatter_and_body() { + let tmp = tempfile::tempdir().expect("tempdir"); + let mem = tmp.path().join("memory"); + let out = + save_query_result("q", "a", &mem, "query", None, Some("useful"), None).expect("save ok"); + let content = std::fs::read_to_string(&out).expect("read"); + assert!(content.contains("outcome: \"useful\"")); + assert!(content.contains("## Outcome")); + assert!(content.contains("- Signal: useful")); +} + +#[test] +fn test_correction_in_frontmatter_and_body() { + let tmp = tempfile::tempdir().expect("tempdir"); + let mem = tmp.path().join("memory"); + let out = save_query_result( + "what hashes passwords?", + "MD5", + &mem, + "query", + None, + Some("corrected"), + Some("It's bcrypt, see PasswordHasher"), + ) + .expect("save ok"); + let content = std::fs::read_to_string(&out).expect("read"); + assert!(content.contains("correction: \"It's bcrypt, see PasswordHasher\"")); + assert!(content.contains("- Correction: It's bcrypt, see PasswordHasher")); +} + +#[test] +fn test_no_outcome_means_no_outcome_section() { + // Backward compatible: a result without an outcome looks exactly as before. + let tmp = tempfile::tempdir().expect("tempdir"); + let mem = tmp.path().join("memory"); + let out = save_query_result("q", "a", &mem, "query", None, None, None).expect("save ok"); + let content = std::fs::read_to_string(&out).expect("read"); + assert!(!content.contains("outcome:")); + assert!(!content.contains("## Outcome")); +} + +#[test] +fn test_invalid_outcome_rejected() { + let tmp = tempfile::tempdir().expect("tempdir"); + let mem = tmp.path().join("memory"); + let result = save_query_result("q", "a", &mem, "query", None, Some("great"), None); + assert!(result.is_err(), "invalid outcome must be rejected"); +} + // --------------------------------------------------------------------------- // ingest — URL validation (no network required) // --------------------------------------------------------------------------- diff --git a/crates/graphify-llm/src/constants.rs b/crates/graphify-llm/src/constants.rs index e2b1c11..f5581f7 100644 --- a/crates/graphify-llm/src/constants.rs +++ b/crates/graphify-llm/src/constants.rs @@ -43,8 +43,10 @@ Edge direction rule — source is always the ACTOR, target is the ACTED-UPON:\n\ - imports/references: source = the file/entity that imports or references; target = the thing imported or referenced.\n\ - implements/inherits: source = the subclass/implementor; target = the base class/interface.\n\ \n\ +Hyperedges: if 3 or more nodes clearly participate together in a shared concept, flow, or pattern that is not captured by pairwise edges alone, add a hyperedge to the top-level `hyperedges` array (e.g. all classes implementing one protocol, all functions in one auth flow even if they don't all call each other, all concepts from a paper section forming one coherent idea). Use sparingly — only when the group relationship adds information beyond the pairwise edges. Maximum 3 hyperedges per chunk.\n\ +\n\ Output exactly this schema:\n\ -{\"nodes\":[{\"id\":\"stem_entity\",\"label\":\"Human Readable Name\",\"file_type\":\"code|document|paper|image|rationale|concept\",\"source_file\":\"relative/path\",\"source_location\":null,\"source_url\":null,\"captured_at\":null,\"author\":null,\"contributor\":null}],\"edges\":[{\"source\":\"node_id\",\"target\":\"node_id\",\"relation\":\"calls|implements|references|cites|conceptually_related_to|shares_data_with|semantically_similar_to\",\"confidence\":\"EXTRACTED|INFERRED|AMBIGUOUS\",\"confidence_score\":1.0,\"source_file\":\"relative/path\",\"source_location\":null,\"weight\":1.0}],\"hyperedges\":[],\"input_tokens\":0,\"output_tokens\":0}\n\ +{\"nodes\":[{\"id\":\"stem_entity\",\"label\":\"Human Readable Name\",\"file_type\":\"code|document|paper|image|rationale|concept\",\"source_file\":\"relative/path\",\"source_location\":null,\"source_url\":null,\"captured_at\":null,\"author\":null,\"contributor\":null}],\"edges\":[{\"source\":\"node_id\",\"target\":\"node_id\",\"relation\":\"calls|implements|references|cites|conceptually_related_to|shares_data_with|semantically_similar_to\",\"confidence\":\"EXTRACTED|INFERRED|AMBIGUOUS\",\"confidence_score\":1.0,\"source_file\":\"relative/path\",\"source_location\":null,\"weight\":1.0}],\"hyperedges\":[{\"id\":\"snake_case_id\",\"label\":\"Human Readable Label\",\"nodes\":[\"node_id1\",\"node_id2\",\"node_id3\"],\"relation\":\"participate_in|implement|form\",\"confidence\":\"EXTRACTED|INFERRED\",\"confidence_score\":0.75,\"source_file\":\"relative/path\"}],\"input_tokens\":0,\"output_tokens\":0}\n\ "; /// Appended to [`EXTRACTION_SYSTEM`] in `--mode deep` to bias the model toward diff --git a/crates/graphify-llm/src/labeling.rs b/crates/graphify-llm/src/labeling.rs index dd64f4e..835ff53 100644 --- a/crates/graphify-llm/src/labeling.rs +++ b/crates/graphify-llm/src/labeling.rs @@ -13,6 +13,7 @@ //! of god-node ids (used only to bias which member labels are sampled first). use indexmap::{IndexMap, IndexSet}; +use rayon::prelude::*; use regex::Regex; use crate::LlmError; @@ -37,6 +38,16 @@ pub const LABEL_BATCH_SIZE: usize = 100; /// parse failure, bounding cost. Mirrors Python `_label_batch_with_retry`. const LABEL_MAX_DEPTH: usize = 3; +/// `true` when an env var opts a serial backend into parallel labeling (value +/// trimmed to exactly `"1"`). Mirrors the Python `GRAPHIFY_*_PARALLEL` switches. +fn env_parallel_opt_in(var: &str) -> bool { + std::env::var(var).is_ok_and(|v| v.trim() == "1") +} + +/// `(batch_index, parsed-or-error)` produced by one labeling batch — collected +/// from the worker pool, then merged in index order. +type BatchOutcome = (usize, Result, LlmError>); + /// Knobs for [`label_communities`] / [`label_communities_with`]. /// /// Mirrors the keyword arguments of Python's `label_communities`: `model`, @@ -53,6 +64,9 @@ pub struct LabelOptions<'a> { pub top_k: usize, /// Communities per LLM call. pub batch_size: usize, + /// Max batches labeled concurrently. Backends that serialise per process + /// (ollama, claude-cli) are pinned to 1 unless opted in via env (#1390). + pub max_concurrency: usize, } impl Default for LabelOptions<'_> { @@ -62,6 +76,7 @@ impl Default for LabelOptions<'_> { max_communities: None, top_k: LABEL_TOP_K, batch_size: LABEL_BATCH_SIZE, + max_concurrency: 4, } } } @@ -292,7 +307,7 @@ pub fn label_communities_with( call: F, ) -> Result, LlmError> where - F: Fn(&str, &str, u32, Option<&str>) -> Result, + F: Fn(&str, &str, u32, Option<&str>) -> Result + Sync, { let mut labels = placeholder_community_labels(communities); let cap = opts.max_communities.unwrap_or_else(|| communities.len()); @@ -306,25 +321,65 @@ where let batch_size = opts.batch_size.max(1); let total = labeled_cids.len(); let n_batches = total.div_ceil(batch_size); - let mut written = 0usize; - let mut first_error: Option = None; - for batch_idx in 0..n_batches { + // Backends that serialise per process must not fan out: ollama serves one + // request at a time per loaded model (parallel batches cause VRAM pressure + // and hollow replies) and claude-cli shells out to a single session that + // parallel subprocesses corrupt. Force serial unless opted in (#1390). + let mut max_concurrency = opts.max_concurrency; + if backend == "ollama" && !env_parallel_opt_in("GRAPHIFY_OLLAMA_PARALLEL") { + max_concurrency = 1; + } + if backend == "claude-cli" && !env_parallel_opt_in("GRAPHIFY_CLAUDE_CLI_PARALLEL") { + max_concurrency = 1; + } + let workers = max_concurrency.min(n_batches).max(1); + + let run_batch = |batch_idx: usize| -> (usize, Result, LlmError>) { let start = batch_idx * batch_size; let end = (start + batch_size).min(total); - let batch_lines = &lines[start..end]; - let batch_cids = &labeled_cids[start..end]; + let parsed = label_batch_with_retry( + &labeled_cids[start..end], + &lines[start..end], + backend, + opts.model, + 0, + &call, + ); + (batch_idx, parsed) + }; + + // Fan out batches across `workers` threads; merge on this thread so `labels` + // is never mutated concurrently. `workers == 1` keeps the sequential path. + let mut results: Vec = if workers <= 1 { + (0..n_batches).map(&run_batch).collect() + } else { + rayon::ThreadPoolBuilder::new() + .num_threads(workers) + .build() + .map_or_else( + |_| (0..n_batches).map(&run_batch).collect(), + |pool| pool.install(|| (0..n_batches).into_par_iter().map(&run_batch).collect()), + ) + }; + // Merge in batch order so the propagated error and stderr are deterministic. + results.sort_by_key(|(batch_idx, _)| *batch_idx); - match label_batch_with_retry(batch_cids, batch_lines, backend, opts.model, 0, &call) { + let mut written = 0usize; + let mut first_error: Option = None; + for (batch_idx, parsed) in results { + match parsed { Ok(parsed) => { written += parsed.len(); labels.extend(parsed); } Err(exc) => { + let start = batch_idx * batch_size; + let end = (start + batch_size).min(total); eprintln!( "[graphify label] batch {}/{n_batches} ({} communities) failed: {exc}", batch_idx + 1, - batch_cids.len(), + end - start, ); if first_error.is_none() { first_error = Some(exc); @@ -349,6 +404,9 @@ where /// Returns `(labels, source)` where `source` is `"llm"` or `"placeholder"`. /// Never errors. #[must_use] +// Labeling entry point: graph data + backend auto-detect + tuning knobs; a +// partial options-struct split would obscure the auto-detect/degrade flow. +#[allow(clippy::too_many_arguments)] pub fn generate_community_labels( communities: &IndexMap>, node_labels: &IndexMap, @@ -356,6 +414,8 @@ pub fn generate_community_labels( backend: Option<&str>, model: Option<&str>, quiet: bool, + max_concurrency: usize, + batch_size: usize, ) -> (IndexMap, &'static str) { generate_community_labels_with( communities, @@ -364,6 +424,8 @@ pub fn generate_community_labels( backend, model, quiet, + max_concurrency, + batch_size, |prompt, b, max, m| call_llm_with_model(prompt, b, max as usize, m), ) } @@ -371,6 +433,8 @@ pub fn generate_community_labels( /// [`generate_community_labels`] with an injectable LLM call — `call(prompt, /// backend, max_tokens, model)`. Used by the public wrapper and by tests. #[must_use] +// As `generate_community_labels`, plus the injectable `call` for testing. +#[allow(clippy::too_many_arguments)] pub fn generate_community_labels_with( communities: &IndexMap>, node_labels: &IndexMap, @@ -378,10 +442,12 @@ pub fn generate_community_labels_with( backend: Option<&str>, model: Option<&str>, quiet: bool, + max_concurrency: usize, + batch_size: usize, call: F, ) -> (IndexMap, &'static str) where - F: Fn(&str, &str, u32, Option<&str>) -> Result, + F: Fn(&str, &str, u32, Option<&str>) -> Result + Sync, { let resolved = match backend { Some(b) if !b.is_empty() => Some(b.to_string()), @@ -398,6 +464,8 @@ where }; let opts = LabelOptions { model, + batch_size, + max_concurrency, ..LabelOptions::default() }; match label_communities_with(communities, node_labels, gods, &backend, opts, call) { diff --git a/crates/graphify-llm/tests/labeling.rs b/crates/graphify-llm/tests/labeling.rs index ed4a763..e2ceb55 100644 --- a/crates/graphify-llm/tests/labeling.rs +++ b/crates/graphify-llm/tests/labeling.rs @@ -4,7 +4,8 @@ //! `_call_llm`; the Rust port injects the call via the `*_with` variants. #![allow(clippy::expect_used)] -use std::cell::Cell; +use std::sync::Mutex; +use std::sync::atomic::{AtomicBool, AtomicU32, AtomicUsize, Ordering}; use graphify_llm::{ LabelOptions, generate_community_labels, generate_community_labels_with, label_communities, @@ -62,7 +63,7 @@ fn cids_in_prompt(prompt: &str) -> Vec { fn label_communities_happy_path() { let (node_labels, communities) = graph(); let gods = IndexSet::new(); - let captured: Cell> = Cell::new(None); + let captured: Mutex> = Mutex::new(None); let labels = label_communities_with( &communities, @@ -71,7 +72,7 @@ fn label_communities_happy_path() { "gemini", LabelOptions::default(), |prompt, backend, _max, _model| { - captured.set(Some((prompt.to_string(), backend.to_string()))); + *captured.lock().expect("lock") = Some((prompt.to_string(), backend.to_string())); Ok(r#"{"0": "Order Management", "1": "Payment Flow"}"#.to_string()) }, ) @@ -79,7 +80,7 @@ fn label_communities_happy_path() { assert_eq!(labels[&0], "Order Management"); assert_eq!(labels[&1], "Payment Flow"); - let (prompt, backend) = captured.take().expect("call invoked"); + let (prompt, backend) = captured.lock().expect("lock").take().expect("call invoked"); assert!(prompt.contains("place_order")); assert!(prompt.contains("StripeClient")); assert_eq!(backend, "gemini"); @@ -90,7 +91,7 @@ fn label_communities_passes_model_override() { // The model override threads through to the injected call (#b304331). let (node_labels, communities) = graph(); let gods = IndexSet::new(); - let captured: Cell)>> = Cell::new(None); + let captured: Mutex)>> = Mutex::new(None); let opts = LabelOptions { model: Some("gemini-3.1-flash-lite"), @@ -103,7 +104,8 @@ fn label_communities_passes_model_override() { "gemini", opts, |_prompt, backend, _max, model| { - captured.set(Some((backend.to_string(), model.map(str::to_string)))); + *captured.lock().expect("lock") = + Some((backend.to_string(), model.map(str::to_string))); Ok(r#"{"0": "Order Management", "1": "Payment Flow"}"#.to_string()) }, ) @@ -111,7 +113,7 @@ fn label_communities_passes_model_override() { assert_eq!(labels[&0], "Order Management"); assert_eq!(labels[&1], "Payment Flow"); - let (backend, model) = captured.take().expect("call invoked"); + let (backend, model) = captured.lock().expect("lock").take().expect("call invoked"); assert_eq!(backend, "gemini"); assert_eq!(model.as_deref(), Some("gemini-3.1-flash-lite")); } @@ -217,6 +219,8 @@ fn generate_community_labels_degrades_on_error() { Some("gemini"), None, true, + 4, + 100, |_, _, _, _| Ok("not json".to_string()), ); assert_eq!(source, "placeholder"); @@ -238,7 +242,7 @@ fn generate_community_labels_no_backend() { let (node_labels, communities) = graph(); let gods = IndexSet::new(); let (labels, source) = - generate_community_labels(&communities, &node_labels, &gods, None, None, true); + generate_community_labels(&communities, &node_labels, &gods, None, None, true, 4, 100); assert_eq!(source, "placeholder"); assert_eq!(labels[&0], "Community 0"); assert_eq!(labels[&1], "Community 1"); @@ -256,6 +260,8 @@ fn generate_community_labels_degrades_loud() { Some("gemini"), None, false, + 4, + 100, |_, _, _, _| Ok("not json".to_string()), ); assert_eq!(source, "placeholder"); @@ -326,6 +332,8 @@ fn label_communities_real_path_via_custom_provider() { Some("labelprov"), None, true, + 4, + 100, ); assert_eq!(source, "llm"); assert_eq!(labels[&0], "Orders"); @@ -346,6 +354,8 @@ fn generate_community_labels_success() { Some("gemini"), None, true, + 4, + 100, |_, _, _, _| Ok(r#"{"0":"Orders","1":"Payments"}"#.to_string()), ); assert_eq!(source, "llm"); @@ -379,7 +389,7 @@ fn empty_communities_returns_placeholders() { let mut communities: IndexMap> = IndexMap::new(); communities.insert(0, vec![]); let gods = IndexSet::new(); - let called = Cell::new(false); + let called = AtomicBool::new(false); // community with no resolvable nodes -> no prompt line -> no backend call. let labels = label_communities_with( &communities, @@ -388,13 +398,13 @@ fn empty_communities_returns_placeholders() { "gemini", LabelOptions::default(), |_, _, _, _| { - called.set(true); + called.store(true, Ordering::Relaxed); Ok("{}".to_string()) }, ) .expect("labeling succeeds"); assert_eq!(labels[&0], "Community 0"); - assert!(!called.get()); + assert!(!called.load(Ordering::Relaxed)); } // --------------------------------------------------------------------------- @@ -407,10 +417,11 @@ fn empty_communities_returns_placeholders() { fn label_communities_batches_when_over_batch_size() { let (node_labels, communities) = wide_graph(250); let gods = IndexSet::new(); - let calls: std::cell::RefCell> = std::cell::RefCell::new(Vec::new()); + let calls: Mutex> = Mutex::new(Vec::new()); let opts = LabelOptions { batch_size: 100, + max_concurrency: 1, ..LabelOptions::default() }; let labels = label_communities_with( @@ -421,7 +432,7 @@ fn label_communities_batches_when_over_batch_size() { opts, |prompt, _backend, _max, _model| { let cids = cids_in_prompt(prompt); - calls.borrow_mut().push(cids.len()); + calls.lock().expect("lock").push(cids.len()); let body = cids .iter() .map(|c| format!("\"{c}\": \"Cluster {c}\"")) @@ -433,7 +444,7 @@ fn label_communities_batches_when_over_batch_size() { .expect("labeling succeeds"); // 250 communities / 100 per batch -> 3 batches (100, 100, 50). - assert_eq!(*calls.borrow(), vec![100, 100, 50]); + assert_eq!(*calls.lock().expect("lock"), vec![100, 100, 50]); // Every community got a real name, none left as a placeholder. assert_eq!(labels.len(), 250); assert!( @@ -446,10 +457,11 @@ fn label_communities_batches_when_over_batch_size() { fn label_communities_partial_batch_failure_keeps_successful_batches() { let (node_labels, communities) = wide_graph(150); let gods = IndexSet::new(); - let n_calls = Cell::new(0u32); + let n_calls = AtomicU32::new(0); let opts = LabelOptions { batch_size: 50, + max_concurrency: 1, ..LabelOptions::default() }; let labels = label_communities_with( @@ -459,9 +471,9 @@ fn label_communities_partial_batch_failure_keeps_successful_batches() { "gemini", opts, |prompt, _backend, _max, _model| { - n_calls.set(n_calls.get() + 1); + n_calls.fetch_add(1, Ordering::Relaxed); let cids = cids_in_prompt(prompt); - if n_calls.get() == 2 { + if n_calls.load(Ordering::Relaxed) == 2 { return Err(graphify_llm::LlmError::Http( "simulated transient backend failure".to_string(), )); @@ -522,7 +534,7 @@ fn label_communities_max_communities_caps_total() { // Backwards compat: explicit max_communities still caps the total labeled. let (node_labels, communities) = wide_graph(150); let gods = IndexSet::new(); - let captured: std::cell::RefCell> = std::cell::RefCell::new(Vec::new()); + let captured: Mutex> = Mutex::new(Vec::new()); let opts = LabelOptions { max_communities: Some(40), @@ -537,7 +549,7 @@ fn label_communities_max_communities_caps_total() { opts, |prompt, _backend, _max, _model| { let cids = cids_in_prompt(prompt); - captured.borrow_mut().extend(&cids); + captured.lock().expect("lock").extend(&cids); let body = cids .iter() .map(|c| format!("\"{c}\": \"X{c}\"")) @@ -549,7 +561,7 @@ fn label_communities_max_communities_caps_total() { .expect("labeling succeeds"); // Only 40 communities should have been sent to the backend. - assert_eq!(captured.borrow().len(), 40); + assert_eq!(captured.lock().expect("lock").len(), 40); } // --------------------------------------------------------------------------- @@ -564,7 +576,7 @@ fn label_batch_recovers_via_split_on_invalid_json() { // community ends up labeled — none silently dropped. let (node_labels, communities) = wide_graph(4); let gods = IndexSet::new(); - let n_calls = Cell::new(0u32); + let n_calls = AtomicU32::new(0); let labels = label_communities_with( &communities, &node_labels, @@ -572,8 +584,8 @@ fn label_batch_recovers_via_split_on_invalid_json() { "gemini", LabelOptions::default(), |prompt, _backend, _max, _model| { - n_calls.set(n_calls.get() + 1); - if n_calls.get() == 1 { + n_calls.fetch_add(1, Ordering::Relaxed); + if n_calls.load(Ordering::Relaxed) == 1 { // Broken JSON on the full batch triggers the split-and-retry. return Ok("{this is not valid json, missing quotes".to_string()); } @@ -592,8 +604,165 @@ fn label_batch_recovers_via_split_on_invalid_json() { assert_eq!(labels[&cid], format!("Label {cid}")); } assert_eq!( - n_calls.get(), + n_calls.load(Ordering::Relaxed), 3, "expected 1 initial call + 2 retry calls after split" ); } + +// --------------------------------------------------------------------------- +// #1390: parallel labeling via --max-concurrency / --batch-size. +// --------------------------------------------------------------------------- + +/// Label every community at the given concurrency with a deterministic mock. +#[must_use] +fn label_at_concurrency( + communities: &IndexMap>, + node_labels: &IndexMap, + max_concurrency: usize, +) -> IndexMap { + label_communities_with( + communities, + node_labels, + &IndexSet::new(), + "gemini", + LabelOptions { + batch_size: 1, + max_concurrency, + ..LabelOptions::default() + }, + |prompt, _b, _m, _model| { + let body = cids_in_prompt(prompt) + .iter() + .map(|c| format!("\"{c}\": \"Cluster {c}\"")) + .collect::>() + .join(", "); + Ok(format!("{{{body}}}")) + }, + ) + .expect("labeling succeeds") +} + +#[test] +fn label_communities_parallel_matches_sequential() { + let (node_labels, communities) = wide_graph(12); + let seq = label_at_concurrency(&communities, &node_labels, 1); + let par = label_at_concurrency(&communities, &node_labels, 4); + assert_eq!( + seq, par, + "parallel labeling must produce the same map as serial" + ); +} + +#[test] +fn label_communities_batch_size_controls_batch_count() { + let (node_labels, communities) = wide_graph(5); + let gods = IndexSet::new(); + let sizes: Mutex> = Mutex::new(Vec::new()); + // Pin serial so the recorded batch-size order is deterministic. + let opts = LabelOptions { + batch_size: 2, + max_concurrency: 1, + ..LabelOptions::default() + }; + label_communities_with( + &communities, + &node_labels, + &gods, + "gemini", + opts, + |prompt, _b, _m, _model| { + let cids = cids_in_prompt(prompt); + sizes.lock().expect("lock").push(cids.len()); + let body = cids + .iter() + .map(|c| format!("\"{c}\": \"C{c}\"")) + .collect::>() + .join(", "); + Ok(format!("{{{body}}}")) + }, + ) + .expect("labeling succeeds"); + // 5 communities / batch size 2 -> batches of [2, 2, 1]. + assert_eq!(*sizes.lock().expect("lock"), vec![2, 2, 1]); +} + +#[test] +fn label_communities_runs_batches_concurrently() { + let (node_labels, communities) = wide_graph(8); + let gods = IndexSet::new(); + let current = AtomicUsize::new(0); + let peak = AtomicUsize::new(0); + let opts = LabelOptions { + batch_size: 1, + max_concurrency: 4, + ..LabelOptions::default() + }; + label_communities_with( + &communities, + &node_labels, + &gods, + "gemini", + opts, + |prompt, _b, _m, _model| { + let now = current.fetch_add(1, Ordering::SeqCst) + 1; + peak.fetch_max(now, Ordering::SeqCst); + std::thread::sleep(std::time::Duration::from_millis(30)); + current.fetch_sub(1, Ordering::SeqCst); + let body = cids_in_prompt(prompt) + .iter() + .map(|c| format!("\"{c}\": \"C{c}\"")) + .collect::>() + .join(", "); + Ok(format!("{{{body}}}")) + }, + ) + .expect("labeling succeeds"); + assert!( + peak.load(Ordering::SeqCst) > 1, + "batches did not run concurrently with max_concurrency=4" + ); +} + +#[test] +#[serial] +fn label_communities_forces_serial_for_ollama() { + // Ollama serves one request at a time; without the opt-in env switch the + // guard must pin labeling to a single worker (#1390). + let mut g = EnvGuard::new(); + g.set("GRAPHIFY_OLLAMA_PARALLEL", ""); + let (node_labels, communities) = wide_graph(8); + let gods = IndexSet::new(); + let current = AtomicUsize::new(0); + let peak = AtomicUsize::new(0); + let opts = LabelOptions { + batch_size: 1, + max_concurrency: 4, + ..LabelOptions::default() + }; + label_communities_with( + &communities, + &node_labels, + &gods, + "ollama", + opts, + |prompt, _b, _m, _model| { + let now = current.fetch_add(1, Ordering::SeqCst) + 1; + peak.fetch_max(now, Ordering::SeqCst); + std::thread::sleep(std::time::Duration::from_millis(10)); + current.fetch_sub(1, Ordering::SeqCst); + let body = cids_in_prompt(prompt) + .iter() + .map(|c| format!("\"{c}\": \"C{c}\"")) + .collect::>() + .join(", "); + Ok(format!("{{{body}}}")) + }, + ) + .expect("labeling succeeds"); + assert_eq!( + peak.load(Ordering::SeqCst), + 1, + "ollama labeling must run serially" + ); +} diff --git a/crates/graphify-llm/tests/parity.rs b/crates/graphify-llm/tests/parity.rs index 7ee7bbc..9525766 100644 --- a/crates/graphify-llm/tests/parity.rs +++ b/crates/graphify-llm/tests/parity.rs @@ -1217,6 +1217,30 @@ fn test_extraction_system_states_edge_direction_rule() { assert!(sys.contains("the function/method BEING CALLED. Never reverse this.")); } +/// The extraction prompt requests hyperedges with a populated schema example +/// (not an empty array) and the guidance paragraph (#1418 follow-up). +#[test] +fn test_extraction_system_requests_hyperedges() { + let sys = graphify_llm::EXTRACTION_SYSTEM; + assert!( + !sys.contains("\"hyperedges\":[]"), + "schema must not show an empty hyperedges array" + ); + assert!( + sys.contains("\"nodes\":[\"node_id1\""), + "schema must show a populated hyperedge example" + ); + assert!(sys.contains("3 or more nodes")); + assert!(sys.to_lowercase().contains("hyperedge")); +} + +#[test] +fn test_extraction_system_hyperedge_guidance_text() { + assert!( + graphify_llm::EXTRACTION_SYSTEM.contains("3 or more nodes clearly participate together") + ); +} + #[test] fn test_extraction_system_deep_appends_suffix() { let sys = graphify_llm::extraction_system(true); diff --git a/crates/graphify-reflect/Cargo.toml b/crates/graphify-reflect/Cargo.toml new file mode 100644 index 0000000..d234e3a --- /dev/null +++ b/crates/graphify-reflect/Cargo.toml @@ -0,0 +1,23 @@ +[package] +description = "Deterministic work-memory reflection: aggregate graphify-out/memory/ outcomes into a lessons doc" +edition.workspace = true +license.workspace = true +name = "graphify-reflect" +publish.workspace = true +repository.workspace = true +rust-version.workspace = true +version.workspace = true + +[dependencies] +chrono = { workspace = true } +indexmap = { workspace = true } +regex = { workspace = true } +serde_json = { workspace = true } +thiserror = { workspace = true } + +[dev-dependencies] +graphify-ingest = { workspace = true } +tempfile = { workspace = true } + +[lints] +workspace = true diff --git a/crates/graphify-reflect/src/aggregate.rs b/crates/graphify-reflect/src/aggregate.rs new file mode 100644 index 0000000..e86ce85 --- /dev/null +++ b/crates/graphify-reflect/src/aggregate.rs @@ -0,0 +1,368 @@ +//! Scoring and aggregation of parsed memory docs into a lessons structure. + +use std::cmp::Ordering; +use std::collections::HashSet; + +use chrono::{DateTime, NaiveDate, NaiveDateTime, Utc}; +use indexmap::IndexMap; + +use crate::graph::doc_community; +use crate::parse::MemoryDoc; + +/// Rounding for the signed score keeps sort order and the contested verdict +/// stable across platforms (the last ULP of `powf` can differ). +const SCORE_NDIGITS: i32 = 9; + +/// Outcome tallies for a bucket: the three signals plus `unmarked`. +#[derive(Clone, Debug, Default, PartialEq, Eq)] +pub struct OutcomeCounts { + /// Count of `useful` docs. + pub useful: usize, + /// Count of `dead_end` docs. + pub dead_end: usize, + /// Count of `corrected` docs. + pub corrected: usize, + /// Count of docs with no (or an unrecognised) outcome. + pub unmarked: usize, +} + +/// A positive-only source node: `preferred` (corroborated) or `tentative`. +#[derive(Clone, Debug, PartialEq)] +pub struct SourceEntry { + /// The cited node id/label. + pub node: String, + /// Distinct `useful` results citing it. + pub n: usize, + /// Signed, time-decayed score (used for ordering only). + pub score: f64, +} + +/// A node with both positive and negative signals; recency decides the verdict. +#[derive(Clone, Debug, PartialEq)] +pub struct ContestedEntry { + /// The cited node id/label. + pub node: String, + /// Distinct `useful` results. + pub pos: usize, + /// Distinct `dead_end`/`corrected` results. + pub neg: usize, + /// Signed, time-decayed score. + pub score: f64, + /// `"useful"`, `"dead end"`, or `"even"` by the sign of `score`. + pub verdict: String, + /// Most recent event date seen for this node. + pub last: String, +} + +/// A `dead_end` question and the nodes it cited. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct DeadEnd { + /// The question that led nowhere. + pub question: String, + /// Cited source nodes. + pub nodes: Vec, + /// ISO date. + pub date: String, +} + +/// A `corrected` question and the right answer. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct Correction { + /// The question that was corrected. + pub question: String, + /// What the right answer was. + pub correction: String, + /// ISO date. + pub date: String, +} + +/// One community's (or the overall) finalized lessons. +#[derive(Clone, Debug)] +pub struct Bucket { + /// Outcome tallies for this bucket. + pub counts: OutcomeCounts, + /// Corroborated positive-only sources. + pub preferred: Vec, + /// Not-yet-corroborated positive-only sources. + pub tentative: Vec, + /// Mixed-signal sources. + pub contested: Vec, + /// Dead-end questions. + pub dead_ends: Vec, + /// Corrections. + pub corrections: Vec, +} + +/// The full aggregate produced by [`aggregate_lessons`]. +#[derive(Clone, Debug)] +pub struct AggResult { + /// Total docs aggregated. + pub total: usize, + /// Overall outcome tallies. + pub counts: OutcomeCounts, + /// The corroboration threshold used (echoed for rendering). + pub min_corroboration: usize, + /// Overall preferred sources. + pub preferred: Vec, + /// Overall tentative sources. + pub tentative: Vec, + /// Overall contested sources. + pub contested: Vec, + /// Overall dead ends. + pub dead_ends: Vec, + /// Overall corrections. + pub corrections: Vec, + /// Per-community buckets; empty unless a graph was supplied. + pub by_community: IndexMap, +} + +/// Mutable accumulator threaded through aggregation. +#[derive(Default)] +struct AggBucket { + counts: OutcomeCounts, + node_score: IndexMap, + node_pos: IndexMap, + node_neg: IndexMap, + node_last: IndexMap, + dead_ends: Vec, + corrections: Vec, +} + +/// Parse an ISO date/datetime to an aware UTC datetime, or `None`. +fn parse_dt(date_str: &str) -> Option> { + if date_str.is_empty() { + return None; + } + if let Ok(dt) = DateTime::parse_from_rfc3339(date_str) { + return Some(dt.with_timezone(&Utc)); + } + if let Ok(ndt) = NaiveDateTime::parse_from_str(date_str, "%Y-%m-%dT%H:%M:%S") { + return Some(ndt.and_utc()); + } + if let Ok(nd) = NaiveDate::parse_from_str(date_str, "%Y-%m-%d") { + return Some(nd.and_hms_opt(0, 0, 0)?.and_utc()); + } + None +} + +/// Time-decay weight in (0, 1]: halves every `half_life_days`. Undated/future +/// signals keep full weight (1.0). +fn decay(date_str: &str, now: DateTime, half_life_days: f64) -> f64 { + let Some(dt) = parse_dt(date_str) else { + return 1.0; + }; + if half_life_days <= 0.0 { + return 1.0; + } + // Seconds-since-epoch differences stay far within f64's exact-integer range. + #[allow(clippy::cast_precision_loss)] + let age_days = ((now - dt).num_seconds() as f64 / 86_400.0).max(0.0); + 0.5_f64.powf(age_days / half_life_days) +} + +/// Round a score to [`SCORE_NDIGITS`] decimal places. +fn round_score(x: f64) -> f64 { + let factor = 10_f64.powi(SCORE_NDIGITS); + (x * factor).round() / factor +} + +fn record_node(b: &mut AggBucket, node: &str, sign: i32, weight: f64, date: &str) { + *b.node_score.entry(node.to_owned()).or_insert(0.0) += f64::from(sign) * weight; + if sign > 0 { + *b.node_pos.entry(node.to_owned()).or_insert(0) += 1; + } else if sign < 0 { + *b.node_neg.entry(node.to_owned()).or_insert(0) += 1; + } + let cur = b.node_last.get(node).map_or("", String::as_str); + if date > cur { + b.node_last.insert(node.to_owned(), date.to_owned()); + } +} + +fn cmp_score_then_node(sa: f64, na: &str, sb: f64, nb: &str) -> Ordering { + sb.partial_cmp(&sa) + .unwrap_or(Ordering::Equal) + .then_with(|| na.cmp(nb)) +} + +/// Split a bucket's scored nodes into preferred / tentative / contested. +fn finalize_sources( + b: &AggBucket, + k: usize, +) -> (Vec, Vec, Vec) { + let mut preferred: Vec = Vec::new(); + let mut tentative: Vec = Vec::new(); + let mut contested: Vec = Vec::new(); + for (node, raw) in &b.node_score { + let pos = b.node_pos.get(node).copied().unwrap_or(0); + let neg = b.node_neg.get(node).copied().unwrap_or(0); + let score = round_score(*raw); + if pos > 0 && neg > 0 { + let verdict = if score > 0.0 { + "useful" + } else if score < 0.0 { + "dead end" + } else { + "even" + }; + contested.push(ContestedEntry { + node: node.clone(), + pos, + neg, + score, + verdict: verdict.to_owned(), + last: b.node_last.get(node).cloned().unwrap_or_default(), + }); + } else if pos > 0 { + let entry = SourceEntry { + node: node.clone(), + n: pos, + score, + }; + if pos >= k { + preferred.push(entry); + } else { + tentative.push(entry); + } + } + // negative-only nodes are surfaced via the dead-ends questions, not here. + } + preferred.sort_by(|a, c| cmp_score_then_node(a.score, &a.node, c.score, &c.node)); + tentative.sort_by(|a, c| cmp_score_then_node(a.score, &a.node, c.score, &c.node)); + contested.sort_by(|a, c| cmp_score_then_node(a.score, &a.node, c.score, &c.node)); + (preferred, tentative, contested) +} + +/// Collapse repeated questions to one entry (last/most-recent text wins), +/// ordered by (date, question). +fn dedupe_by_question( + items: Vec, + question: impl Fn(&T) -> String, + date: impl Fn(&T) -> String, +) -> Vec { + let mut latest: IndexMap = IndexMap::new(); + for item in items { + latest.insert(question(&item), item); + } + let mut out: Vec = latest.into_values().collect(); + out.sort_by_key(|item| (date(item), question(item))); + out +} + +fn dedupe_dead_ends(items: Vec) -> Vec { + dedupe_by_question(items, |d| d.question.clone(), |d| d.date.clone()) +} + +fn dedupe_corrections(items: Vec) -> Vec { + dedupe_by_question(items, |c| c.question.clone(), |c| c.date.clone()) +} + +/// Apply one doc's signal to a bucket (counts, node scores, dead ends/corrections). +fn apply_doc(b: &mut AggBucket, doc: &MemoryDoc, nodes: &[String], sign: i32, weight: f64) { + let date = doc.date.as_str(); + match doc.outcome.as_deref() { + Some("useful") => b.counts.useful += 1, + Some("dead_end") => b.counts.dead_end += 1, + Some("corrected") => b.counts.corrected += 1, + _ => b.counts.unmarked += 1, + } + if sign != 0 { + for n in nodes { + record_node(b, n, sign, weight, date); + } + } + match doc.outcome.as_deref() { + Some("dead_end") => b.dead_ends.push(DeadEnd { + question: doc.question.clone(), + nodes: nodes.to_vec(), + date: doc.date.clone(), + }), + Some("corrected") => b.corrections.push(Correction { + question: doc.question.clone(), + correction: doc.correction.clone().unwrap_or_default(), + date: doc.date.clone(), + }), + _ => {} + } +} + +fn finalize_bucket(b: &AggBucket, k: usize) -> Bucket { + let (preferred, tentative, contested) = finalize_sources(b, k); + Bucket { + counts: b.counts.clone(), + preferred, + tentative, + contested, + dead_ends: dedupe_dead_ends(b.dead_ends.clone()), + corrections: dedupe_corrections(b.corrections.clone()), + } +} + +/// Aggregate parsed memory docs into a deterministic lessons structure. +/// +/// `now` anchors the time-decay (pass it explicitly for byte-stable output). +/// `known_nodes` (when given) gates out source nodes no longer in the graph. +/// `by_community` is empty unless `node_community` is supplied and non-empty. +#[must_use] +// `None` call sites can't infer a generic hasher; callers build the default-hasher set. +#[allow(clippy::implicit_hasher)] +pub fn aggregate_lessons( + docs: &[MemoryDoc], + node_community: Option<&IndexMap>, + now: DateTime, + half_life_days: f64, + min_corroboration: usize, + known_nodes: Option<&HashSet>, +) -> AggResult { + let mut overall = AggBucket::default(); + let mut by_community: IndexMap = IndexMap::new(); + + for doc in docs { + // One event per node per doc; drop nodes the graph no longer knows. + let mut seen: HashSet<&str> = HashSet::new(); + let nodes: Vec = doc + .source_nodes + .iter() + .filter(|n| known_nodes.is_none_or(|k| k.contains(n.as_str()))) + .filter(|n| seen.insert(n.as_str())) + .cloned() + .collect(); + let community = doc_community(&nodes, node_community); + + let sign = match doc.outcome.as_deref() { + Some("useful") => 1, + Some("dead_end" | "corrected") => -1, + _ => 0, + }; + let weight = if sign == 0 { + 0.0 + } else { + decay(&doc.date, now, half_life_days) + }; + + let bucket = by_community.entry(community).or_default(); + for target in [&mut overall, bucket] { + apply_doc(target, doc, &nodes, sign, weight); + } + } + + let mut community_out: IndexMap = IndexMap::new(); + if node_community.is_some_and(|m| !m.is_empty()) { + for (label, b) in &by_community { + community_out.insert(label.clone(), finalize_bucket(b, min_corroboration)); + } + } + + let (preferred, tentative, contested) = finalize_sources(&overall, min_corroboration); + AggResult { + total: docs.len(), + counts: overall.counts.clone(), + min_corroboration, + preferred, + tentative, + contested, + dead_ends: dedupe_dead_ends(overall.dead_ends), + corrections: dedupe_corrections(overall.corrections), + by_community: community_out, + } +} diff --git a/crates/graphify-reflect/src/graph.rs b/crates/graphify-reflect/src/graph.rs new file mode 100644 index 0000000..b1b4a56 --- /dev/null +++ b/crates/graphify-reflect/src/graph.rs @@ -0,0 +1,147 @@ +//! Optional graph artifacts: community grouping and the node-existence gate. +//! +//! Mirrors how `graphify export wiki` reads `graph.json` + +//! `.graphify_analysis.json` + `.graphify_labels.json`. Community membership in +//! the analysis sidecar is keyed by node id, but `save-result` cites nodes by +//! label, so both id and label are mapped to a community. Best-effort: any +//! missing/unparseable artifact disables grouping. + +use std::collections::HashSet; +use std::path::Path; + +use indexmap::IndexMap; +use serde_json::Value; + +use crate::UNCATEGORIZED; + +/// Read and parse a JSON file, or `None` on any I/O or parse failure. +fn read_json(path: &Path) -> Option { + let text = std::fs::read_to_string(path).ok()?; + serde_json::from_str(&text).ok() +} + +/// Build a lookup from node id AND node label → community label, or `None` if +/// the graph isn't available. Label collisions resolve to the smallest community +/// id (sorted-cid iteration + first-write-wins). +#[must_use] +pub fn load_node_community( + graph_path: &Path, + analysis_path: &Path, + labels_path: &Path, +) -> Option> { + if !graph_path.exists() || !analysis_path.exists() { + return None; + } + let analysis = read_json(analysis_path)?; + let communities = analysis.get("communities").and_then(Value::as_object)?; + if communities.is_empty() { + return None; + } + let labels = read_json(labels_path) + .as_ref() + .and_then(Value::as_object) + .cloned() + .unwrap_or_default(); + + // id -> label from the graph, so a label-form citation resolves too. + let mut id_to_label: IndexMap = IndexMap::new(); + if let Some(nodes) = read_json(graph_path) + .as_ref() + .and_then(|g| g.get("nodes")) + .and_then(Value::as_array) + { + for n in nodes { + if let (Some(id), Some(label)) = (n.get("id"), n.get("label")) + && !id.is_null() + && !label.is_null() + { + id_to_label.insert(json_to_string(id), json_to_string(label)); + } + } + } + + // Sorted cid iteration + first-write-wins makes any collision deterministic. + let mut cids: Vec<&String> = communities.keys().collect(); + cids.sort(); + let mut node_community: IndexMap = IndexMap::new(); + for cid in cids { + let label = labels + .get(cid) + .and_then(Value::as_str) + .map_or_else(|| format!("Community {cid}"), str::to_string); + let Some(members) = communities.get(cid).and_then(Value::as_array) else { + continue; + }; + for member in members { + let nid = json_to_string(member); + node_community + .entry(nid.clone()) + .or_insert_with(|| label.clone()); + if let Some(nlabel) = id_to_label.get(&nid) { + node_community + .entry(nlabel.clone()) + .or_insert_with(|| label.clone()); + } + } + } + Some(node_community) +} + +/// The set of node ids AND labels in the current graph, or `None` if +/// unavailable. Used to drop source nodes whose code is gone. +#[must_use] +pub fn load_known_nodes(graph_path: &Path) -> Option> { + let nodes = read_json(graph_path)? + .get("nodes") + .and_then(Value::as_array) + .cloned()?; + let mut known: HashSet = HashSet::new(); + for n in &nodes { + if let Some(id) = n.get("id").filter(|v| !v.is_null()) { + known.insert(json_to_string(id)); + } + if let Some(label) = n.get("label").filter(|v| !v.is_null()) { + known.insert(json_to_string(label)); + } + } + if known.is_empty() { None } else { Some(known) } +} + +/// The community a doc belongs to: the plurality community of its source nodes, +/// ties broken to the lexicographically-smallest label. Docs with no resolvable +/// community fall into the `Uncategorized` bucket. +#[must_use] +pub(crate) fn doc_community( + nodes: &[String], + node_community: Option<&IndexMap>, +) -> String { + let Some(nc) = node_community.filter(|m| !m.is_empty()) else { + return UNCATEGORIZED.to_string(); + }; + let mut counts: IndexMap<&str, usize> = IndexMap::new(); + for n in nodes { + if let Some(label) = nc.get(n) { + *counts.entry(label.as_str()).or_insert(0) += 1; + } + } + if counts.is_empty() { + return UNCATEGORIZED.to_string(); + } + // min over (-count, label): highest count wins, then smallest label. + counts + .iter() + .min_by(|a, b| (std::cmp::Reverse(*a.1), *a.0).cmp(&(std::cmp::Reverse(*b.1), *b.0))) + .map_or_else( + || UNCATEGORIZED.to_string(), + |(label, _)| (*label).to_string(), + ) +} + +/// Stringify a JSON scalar the way Python's `str(node_id)` would for the +/// id/label forms `save-result` and the graph use (strings stay verbatim). +fn json_to_string(value: &Value) -> String { + match value { + Value::String(s) => s.clone(), + other => other.to_string(), + } +} diff --git a/crates/graphify-reflect/src/lib.rs b/crates/graphify-reflect/src/lib.rs new file mode 100644 index 0000000..a465ae7 --- /dev/null +++ b/crates/graphify-reflect/src/lib.rs @@ -0,0 +1,144 @@ +//! Deterministic "work memory" reflection over `graphify-out/memory/`. +//! +//! `graphify reflect` reads the Q&A memory docs that `graphify save-result` +//! files back into the graph, aggregates their outcome signals (`useful` / +//! `dead_end` / `corrected`), and writes a single lessons artifact an agent can +//! load at the start of the next session: +//! +//! - **Preferred sources** — nodes corroborated by multiple `useful` answers. +//! - **Tentative** — nodes seen useful only once (not yet corroborated). +//! - **Contested** — nodes with both positive and negative signals; recency decides. +//! - **Known dead ends** — questions/sources marked `dead_end`. +//! - **Corrections** — answers the user corrected, and the right answer. +//! +//! Source nodes are scored, not counted: each citation contributes a signed, +//! time-decayed value (`useful` positive, `dead_end`/`corrected` negative, with +//! a half-life so a fresh dead end outweighs a months-old useful). A node is +//! only promoted to "preferred" once corroborated by enough distinct results. +//! +//! It is deterministic: no LLM, stable sort orders, byte-stable output for a +//! given input and a given `now`. Ports `graphify-py/graphify/reflect.py`. + +mod aggregate; +mod graph; +mod parse; +mod render; + +use std::path::{Path, PathBuf}; + +use chrono::{DateTime, Utc}; + +pub use aggregate::{ + AggResult, Bucket, ContestedEntry, Correction, DeadEnd, OutcomeCounts, SourceEntry, + aggregate_lessons, +}; +pub use graph::{load_known_nodes, load_node_community}; +pub use parse::{MemoryDoc, load_memory_docs, parse_memory_doc}; +pub use render::render_lessons_md; + +/// A signal's weight halves every 30 days by default. +pub const DEFAULT_HALF_LIFE_DAYS: f64 = 30.0; +/// Distinct `useful` results needed to promote a node to "preferred". +pub const DEFAULT_MIN_CORROBORATION: usize = 2; +/// Bucket label for docs with no resolvable community. +pub(crate) const UNCATEGORIZED: &str = "Uncategorized"; + +/// `true` if `out_path` exists and is at least as new as every input that feeds +/// it (the memory docs, and the graph when one is used). +/// +/// Lets `graphify reflect --if-stale` skip a redundant run. A missing output is +/// never fresh (it must be built). Mtime-based and best-effort. +#[must_use] +pub fn lessons_fresh(out_path: &Path, memory_dir: &Path, graph_path: Option<&Path>) -> bool { + let Ok(out_mtime) = std::fs::metadata(out_path).and_then(|m| m.modified()) else { + return false; // missing/unreadable -> must build + }; + let mut newest = std::time::SystemTime::UNIX_EPOCH; + if memory_dir.is_dir() + && let Ok(entries) = std::fs::read_dir(memory_dir) + { + for path in entries + .flatten() + .map(|e| e.path()) + .filter(|p| p.extension().is_some_and(|e| e == "md")) + { + if let Ok(mtime) = std::fs::metadata(&path).and_then(|m| m.modified()) { + newest = newest.max(mtime); + } + } + } + if let Some(gp) = graph_path + && let Ok(mtime) = std::fs::metadata(gp).and_then(|m| m.modified()) + { + newest = newest.max(mtime); + } + out_mtime >= newest +} + +/// Optional graph artifacts that enable community grouping + the node-existence gate. +#[derive(Clone, Copy, Debug, Default)] +pub struct GraphPaths<'a> { + /// `graph.json` path; community grouping is disabled when `None`. + pub graph: Option<&'a Path>, + /// `.graphify_analysis.json` override; defaults to the graph's sibling. + pub analysis: Option<&'a Path>, + /// `.graphify_labels.json` override; defaults to the graph's sibling. + pub labels: Option<&'a Path>, +} + +/// Scan `memory_dir`, write the lessons doc to `out_path`, return (path, agg). +/// +/// When `graphs.graph` is given, lessons are grouped by community and source +/// nodes no longer in the graph are dropped; otherwise the doc is a single flat +/// section. +/// +/// # Errors +/// +/// Returns [`std::io::Error`] if the output directory cannot be created or the +/// lessons file cannot be written. +pub fn reflect( + memory_dir: &Path, + out_path: &Path, + graphs: GraphPaths<'_>, + now: DateTime, + half_life_days: f64, + min_corroboration: usize, +) -> std::io::Result<(PathBuf, AggResult)> { + let docs = load_memory_docs(memory_dir); + + let mut node_community = None; + let mut known_nodes = None; + if let Some(graph) = graphs.graph { + let analysis: PathBuf = graphs.analysis.map_or_else( + || sibling(graph, ".graphify_analysis.json"), + Path::to_path_buf, + ); + let labels: PathBuf = graphs.labels.map_or_else( + || sibling(graph, ".graphify_labels.json"), + Path::to_path_buf, + ); + node_community = load_node_community(graph, &analysis, &labels); + known_nodes = load_known_nodes(graph); + } + + let agg = aggregate_lessons( + &docs, + node_community.as_ref(), + now, + half_life_days, + min_corroboration, + known_nodes.as_ref(), + ); + + if let Some(parent) = out_path.parent() { + std::fs::create_dir_all(parent)?; + } + std::fs::write(out_path, render_lessons_md(&agg))?; + Ok((out_path.to_path_buf(), agg)) +} + +/// A path sharing `base`'s parent directory but with a different filename. +fn sibling(base: &Path, name: &str) -> PathBuf { + base.parent() + .map_or_else(|| PathBuf::from(name), |p| p.join(name)) +} diff --git a/crates/graphify-reflect/src/parse.rs b/crates/graphify-reflect/src/parse.rs new file mode 100644 index 0000000..8551c00 --- /dev/null +++ b/crates/graphify-reflect/src/parse.rs @@ -0,0 +1,171 @@ +//! Frontmatter parsing for memory docs. +//! +//! `save_query_result` writes a tiny hand-built YAML subset (no `PyYAML` +//! dependency), so we parse the same subset by hand: scalar `key: "value"` +//! lines and a `source_nodes: ["a", "b"]` flow list. Anything unrecognised is +//! ignored, so foreign `.md` files in `memory/` are skipped cleanly. + +use std::path::{Path, PathBuf}; +use std::sync::LazyLock; + +use regex::Regex; + +#[allow(clippy::expect_used)] // literal patterns; cannot fail at runtime. +static SCALAR_RE: LazyLock = + LazyLock::new(|| Regex::new(r#"^([A-Za-z_][\w-]*):\s*"(.*)"\s*$"#).expect("scalar regex")); +#[allow(clippy::expect_used)] // literal patterns; cannot fail at runtime. +static LIST_RE: LazyLock = + LazyLock::new(|| Regex::new(r"^([A-Za-z_][\w-]*):\s*\[(.*)\]\s*$").expect("list regex")); +#[allow(clippy::expect_used)] // literal patterns; cannot fail at runtime. +static DQ_ITEM_RE: LazyLock = + LazyLock::new(|| Regex::new(r#""((?:[^"\\]|\\.)*)""#).expect("item regex")); + +/// Parsed frontmatter of one memory doc. +#[derive(Clone, Debug, Default, PartialEq, Eq)] +pub struct MemoryDoc { + /// The `type` field (e.g. `query`, `explain`). + pub doc_type: Option, + /// ISO date string; empty when absent. + pub date: String, + /// The question text; empty when absent. + pub question: String, + /// Outcome signal (`useful` / `dead_end` / `corrected`), if marked. + pub outcome: Option, + /// Correction text for a `corrected` outcome. + pub correction: Option, + /// Cited source-node labels (always a list, possibly empty). + pub source_nodes: Vec, + /// Source filename, set by [`load_memory_docs`] (empty from a bare parse). + pub path: String, +} + +/// Reverse the double-quoted escaping that `ingest::yaml_str` applies. +#[must_use] +pub(crate) fn yaml_unescape(s: &str) -> String { + let chars: Vec = s.chars().collect(); + let mut out = String::with_capacity(s.len()); + let mut i = 0; + while i < chars.len() { + let ch = chars[i]; + if ch == '\\' && i + 1 < chars.len() { + let nxt = chars[i + 1]; + let simple = match nxt { + 'n' => Some('\n'), + 'r' => Some('\r'), + 't' => Some('\t'), + '0' => Some('\0'), + '"' => Some('"'), + '\\' => Some('\\'), + 'L' => Some('\u{2028}'), + 'P' => Some('\u{2029}'), + _ => None, + }; + if let Some(c) = simple { + out.push(c); + i += 2; + continue; + } + if nxt == 'x' + && i + 3 < chars.len() + && let Some(c) = hex_char(&chars[i + 2..=i + 3]) + { + out.push(c); + i += 4; + continue; + } + if nxt == 'u' + && i + 5 < chars.len() + && let Some(c) = hex_char(&chars[i + 2..=i + 5]) + { + out.push(c); + i += 6; + continue; + } + } + out.push(ch); + i += 1; + } + out +} + +/// Decode a run of hex digits to a `char`, or `None` if invalid. +fn hex_char(digits: &[char]) -> Option { + let hex: String = digits.iter().collect(); + u32::from_str_radix(&hex, 16).ok().and_then(char::from_u32) +} + +/// Parse the frontmatter of a memory doc, or `None` if it has none. +#[must_use] +pub fn parse_memory_doc(text: &str) -> Option { + if !text.starts_with("---") { + return None; + } + let lines: Vec<&str> = text.lines().collect(); + if lines.first().map(|l| l.trim()) != Some("---") { + return None; + } + let mut doc = MemoryDoc::default(); + for line in &lines[1..] { + if line.trim() == "---" { + break; + } + if let Some(caps) = LIST_RE.captures(line) + && &caps[1] == "source_nodes" + { + doc.source_nodes = DQ_ITEM_RE + .captures_iter(&caps[2]) + .map(|c| yaml_unescape(&c[1])) + .collect(); + continue; + } + if let Some(caps) = SCALAR_RE.captures(line) { + let val = yaml_unescape(&caps[2]); + match &caps[1] { + "type" => doc.doc_type = Some(val), + "date" => doc.date = val, + "question" => doc.question = val, + "outcome" => doc.outcome = Some(val), + "correction" => doc.correction = Some(val), + _ => {} + } + } + } + Some(doc) +} + +/// Parse every memory doc under `memory_dir`, sorted by date then filename. +/// +/// Docs without recognisable frontmatter (foreign `.md` files, the `LESSONS.md` +/// artifact) are skipped. +#[must_use] +pub fn load_memory_docs(memory_dir: &Path) -> Vec { + if !memory_dir.exists() { + return Vec::new(); + } + let Ok(entries) = std::fs::read_dir(memory_dir) else { + return Vec::new(); + }; + let mut paths: Vec = entries + .flatten() + .map(|e| e.path()) + .filter(|p| p.extension().is_some_and(|e| e == "md")) + .collect(); + paths.sort(); + + let mut docs: Vec = Vec::new(); + for path in paths { + let Ok(text) = std::fs::read_to_string(&path) else { + continue; + }; + if let Some(mut doc) = parse_memory_doc(&text) { + doc.path = path + .file_name() + .map(|n| n.to_string_lossy().into_owned()) + .unwrap_or_default(); + docs.push(doc); + } + } + // Stable order: (date, filename) so output is deterministic across runs. + docs.sort_by(|a, b| (&a.date, &a.path).cmp(&(&b.date, &b.path))); + docs +} diff --git a/crates/graphify-reflect/src/render.rs b/crates/graphify-reflect/src/render.rs new file mode 100644 index 0000000..ffd9839 --- /dev/null +++ b/crates/graphify-reflect/src/render.rs @@ -0,0 +1,160 @@ +//! Rendering the aggregate into the deterministic `LESSONS.md` markdown body. + +use crate::UNCATEGORIZED; +use crate::aggregate::{AggResult, Bucket, ContestedEntry, Correction, DeadEnd, SourceEntry}; + +/// Append one bucket's rendered sections to `out`. +fn render_bucket( + out: &mut Vec, + preferred: &[SourceEntry], + tentative: &[SourceEntry], + contested: &[ContestedEntry], + dead_ends: &[DeadEnd], + corrections: &[Correction], + k: usize, +) { + if !preferred.is_empty() { + out.push(format!( + "**Preferred sources** — corroborated by ≥{k} useful results; start here." + )); + out.push(String::new()); + for e in preferred { + out.push(format!("- `{}` ({}× useful)", e.node, e.n)); + } + out.push(String::new()); + } + if !tentative.is_empty() { + out.push(format!( + "**Tentative** — useful in fewer than {k} results; verify before relying." + )); + out.push(String::new()); + for e in tentative { + out.push(format!("- `{}` ({}× useful)", e.node, e.n)); + } + out.push(String::new()); + } + if !contested.is_empty() { + out.push("**Contested** — mixed signals; recency decides.".to_string()); + out.push(String::new()); + for e in contested { + let day: String = e.last.chars().take(10).collect(); + let verdict = if e.verdict == "even" { + "evenly split".to_string() + } else { + format!("recency leans **{}**", e.verdict) + }; + let suffix = if day.is_empty() { + String::new() + } else { + format!(" (latest {day})") + }; + out.push(format!( + "- `{}` — {}× useful, {}× dead end/corrected → {verdict}{suffix}", + e.node, e.pos, e.neg + )); + } + out.push(String::new()); + } + if !dead_ends.is_empty() { + out.push("**Known dead ends** — led nowhere; don't re-derive.".to_string()); + out.push(String::new()); + for d in dead_ends { + let nodes = d + .nodes + .iter() + .map(|n| format!("`{n}`")) + .collect::>() + .join(", "); + let suffix = if nodes.is_empty() { + String::new() + } else { + format!(" — {nodes}") + }; + out.push(format!("- \"{}\"{suffix}", d.question)); + } + out.push(String::new()); + } + if !corrections.is_empty() { + out.push("**Corrections** — do these differently.".to_string()); + out.push(String::new()); + for c in corrections { + out.push(format!("- \"{}\" → {}", c.question, c.correction)); + } + out.push(String::new()); + } + if preferred.is_empty() + && tentative.is_empty() + && contested.is_empty() + && dead_ends.is_empty() + && corrections.is_empty() + { + out.push("_No marked outcomes yet._".to_string()); + out.push(String::new()); + } +} + +/// Render the aggregate into the deterministic `LESSONS.md` markdown body. +#[must_use] +pub fn render_lessons_md(agg: &AggResult) -> String { + let c = &agg.counts; + let k = agg.min_corroboration; + let memory_word = if agg.total == 1 { "memory" } else { "memories" }; + let mut out: Vec = vec![ + "# Lessons".to_string(), + String::new(), + format!( + "_Auto-generated by `graphify reflect` from {} session {memory_word} in \ + graphify-out/memory/. Deterministic; no LLM. Use for orientation — verify \ + before relying, and revisit dead ends if the code has changed since._", + agg.total + ), + String::new(), + "## Summary".to_string(), + String::new(), + format!( + "- {} useful · {} dead ends · {} corrected · {} unmarked", + c.useful, c.dead_end, c.corrected, c.unmarked + ), + String::new(), + "## Lessons".to_string(), + String::new(), + ]; + render_bucket( + &mut out, + &agg.preferred, + &agg.tentative, + &agg.contested, + &agg.dead_ends, + &agg.corrections, + k, + ); + + if !agg.by_community.is_empty() { + out.push("## By topic".to_string()); + out.push(String::new()); + // Uncategorized sorts last; everything else alphabetically. + let mut labels: Vec<&String> = agg.by_community.keys().collect(); + labels.sort_by(|a, b| { + let ka = (usize::from(a.as_str() == UNCATEGORIZED), a.as_str()); + let kb = (usize::from(b.as_str() == UNCATEGORIZED), b.as_str()); + ka.cmp(&kb) + }); + for label in labels { + out.push(format!("### {label}")); + out.push(String::new()); + let b: &Bucket = &agg.by_community[label]; + render_bucket( + &mut out, + &b.preferred, + &b.tentative, + &b.contested, + &b.dead_ends, + &b.corrections, + k, + ); + } + } + + // Single trailing newline, no trailing whitespace lines. + format!("{}\n", out.join("\n").trim_end_matches('\n')) +} diff --git a/crates/graphify-reflect/tests/parity.rs b/crates/graphify-reflect/tests/parity.rs new file mode 100644 index 0000000..51daf84 --- /dev/null +++ b/crates/graphify-reflect/tests/parity.rs @@ -0,0 +1,651 @@ +//! Parity tests against `graphify-py/tests/test_reflect.py`. +#![allow(clippy::expect_used, clippy::unwrap_used)] + +use std::collections::HashSet; + +use chrono::{DateTime, Duration, TimeZone, Utc}; +use graphify_ingest::save_query_result; +use graphify_reflect::{ + AggResult, MemoryDoc, aggregate_lessons, lessons_fresh, load_memory_docs, parse_memory_doc, + reflect, render_lessons_md, +}; +use indexmap::IndexMap; + +/// Fixed clock so time-decay scoring is byte-stable (mirrors Python `_NOW`). +fn now() -> DateTime { + Utc.with_ymd_and_hms(2026, 6, 1, 0, 0, 0).unwrap() +} + +fn days_before(n: i64) -> String { + (now() - Duration::days(n)).to_rfc3339() +} + +/// Build a `MemoryDoc` (mirrors Python `_doc`). +fn doc( + outcome: Option<&str>, + nodes: &[&str], + question: &str, + correction: &str, + date: &str, +) -> MemoryDoc { + MemoryDoc { + doc_type: None, + date: date.to_string(), + question: question.to_string(), + outcome: outcome.map(str::to_string), + correction: if correction.is_empty() { + None + } else { + Some(correction.to_string()) + }, + source_nodes: nodes.iter().map(|s| (*s).to_string()).collect(), + path: String::new(), + } +} + +/// `aggregate_lessons` with the test defaults (no graph, fixed clock, k=2). +fn agg(docs: &[MemoryDoc]) -> AggResult { + aggregate_lessons(docs, None, now(), 30.0, 2, None) +} + +fn community(pairs: &[(&str, &str)]) -> IndexMap { + pairs + .iter() + .map(|(n, c)| ((*n).to_string(), (*c).to_string())) + .collect() +} + +// --- frontmatter parsing ------------------------------------------------------- + +#[test] +fn parse_round_trips_a_saved_doc() { + let tmp = tempfile::tempdir().unwrap(); + let mem = tmp.path().join("memory"); + let out = save_query_result( + "what is \"attention\"?", + "softmax", + &mem, + "explain", + Some(&["AttentionLayer".to_string(), "SoftmaxFunc".to_string()]), + Some("useful"), + None, + ) + .unwrap(); + let text = std::fs::read_to_string(&out).unwrap(); + let parsed = parse_memory_doc(&text).expect("frontmatter parses"); + assert_eq!(parsed.doc_type.as_deref(), Some("explain")); + assert_eq!(parsed.question, "what is \"attention\"?"); + assert_eq!(parsed.outcome.as_deref(), Some("useful")); + assert_eq!(parsed.source_nodes, vec!["AttentionLayer", "SoftmaxFunc"]); +} + +#[test] +fn parse_returns_none_for_foreign_doc() { + assert!(parse_memory_doc("# just a note\n\nno frontmatter here\n").is_none()); + assert!(parse_memory_doc("").is_none()); +} + +#[test] +fn round_trip_survives_backslash_newline_and_quoted_node() { + let tmp = tempfile::tempdir().unwrap(); + let mem = tmp.path().join("memory"); + let out = save_query_result( + r#"path is C:\Users and a "quote""#, + "a", + &mem, + "query", + Some(&[r#"Node"With\Quote"#.to_string()]), + Some("corrected"), + Some("line1\nline2"), + ) + .unwrap(); + let parsed = parse_memory_doc(&std::fs::read_to_string(&out).unwrap()).unwrap(); + assert_eq!(parsed.question, r#"path is C:\Users and a "quote""#); + assert_eq!(parsed.correction.as_deref(), Some("line1\nline2")); + assert_eq!(parsed.source_nodes, vec![r#"Node"With\Quote"#]); +} + +#[test] +fn parse_handles_crlf() { + let doc = "---\r\ntype: \"query\"\r\noutcome: \"useful\"\r\nsource_nodes: [\"A\"]\r\n---\r\n# body\r\n"; + let parsed = parse_memory_doc(doc).unwrap(); + assert_eq!(parsed.outcome.as_deref(), Some("useful")); + assert_eq!(parsed.source_nodes, vec!["A"]); +} + +#[test] +fn load_memory_docs_missing_dir_is_empty() { + let tmp = tempfile::tempdir().unwrap(); + assert!(load_memory_docs(&tmp.path().join("nope")).is_empty()); +} + +fn write_raw_doc(mem: &std::path::Path, filename: &str, date: &str, outcome: &str, question: &str) { + std::fs::create_dir_all(mem).unwrap(); + let body = format!( + "---\ntype: \"query\"\ndate: \"{date}\"\nquestion: \"{question}\"\ncontributor: \"graphify\"\noutcome: \"{outcome}\"\n---\n\n# Q: {question}\n" + ); + std::fs::write(mem.join(filename), body).unwrap(); +} + +#[test] +fn load_memory_docs_skips_foreign_and_sorts() { + let tmp = tempfile::tempdir().unwrap(); + let mem = tmp.path().join("memory"); + std::fs::create_dir_all(&mem).unwrap(); + std::fs::write(mem.join("foreign.md"), "# not a memory doc\n").unwrap(); + write_raw_doc(&mem, "a.md", "2026-01-01", "useful", "first"); + write_raw_doc(&mem, "b.md", "2026-01-02", "dead_end", "second"); + let docs = load_memory_docs(&mem); + assert_eq!(docs.len(), 2); + let outcomes: HashSet<&str> = docs.iter().filter_map(|d| d.outcome.as_deref()).collect(); + assert_eq!(outcomes, HashSet::from(["useful", "dead_end"])); +} + +#[test] +fn load_memory_docs_orders_by_date_then_filename() { + let tmp = tempfile::tempdir().unwrap(); + let mem = tmp.path().join("memory"); + write_raw_doc(&mem, "z.md", "2026-03-01", "dead_end", "march"); + write_raw_doc(&mem, "a.md", "2026-01-01", "dead_end", "january"); + write_raw_doc(&mem, "b.md", "2026-02-01", "dead_end", "february"); + let docs = load_memory_docs(&mem); + let questions: Vec<&str> = docs.iter().map(|d| d.question.as_str()).collect(); + assert_eq!(questions, vec!["january", "february", "march"]); +} + +// --- aggregation --------------------------------------------------------------- + +#[test] +fn aggregate_counts_each_outcome() { + let docs = [ + doc(Some("useful"), &["A"], "q", "", "2026-01-01"), + doc(Some("useful"), &["A", "B"], "q", "", "2026-01-01"), + doc(Some("dead_end"), &["C"], "q", "", "2026-01-01"), + doc(Some("corrected"), &[], "q", "use D", "2026-01-01"), + doc(None, &[], "q", "", "2026-01-01"), + ]; + let a = agg(&docs); + assert_eq!(a.total, 5); + assert_eq!(a.counts.useful, 2); + assert_eq!(a.counts.dead_end, 1); + assert_eq!(a.counts.corrected, 1); + assert_eq!(a.counts.unmarked, 1); +} + +fn node_names(entries: &[graphify_reflect::SourceEntry]) -> Vec<&str> { + entries.iter().map(|e| e.node.as_str()).collect() +} + +#[test] +fn sources_split_into_preferred_tentative_contested() { + let docs = [ + doc(Some("useful"), &["A", "B"], "q", "", "2026-01-01"), + doc(Some("useful"), &["A", "B"], "q", "", "2026-01-01"), + doc(Some("useful"), &["C"], "q", "", "2026-01-01"), + doc(Some("dead_end"), &["A"], "q", "", "2026-01-01"), + ]; + let a = agg(&docs); + assert_eq!(node_names(&a.preferred), vec!["B"]); + assert_eq!(node_names(&a.tentative), vec!["C"]); + assert_eq!( + a.contested + .iter() + .map(|e| e.node.as_str()) + .collect::>(), + vec!["A"] + ); +} + +#[test] +fn corroboration_threshold_promotes_only_repeated_nodes() { + let one = agg(&[doc(Some("useful"), &["A"], "q", "", "2026-01-01")]); + assert_eq!(node_names(&one.tentative), vec!["A"]); + assert!(one.preferred.is_empty()); + + let two = agg(&[ + doc(Some("useful"), &["A"], "q", "", "2026-01-01"), + doc(Some("useful"), &["A"], "q", "", "2026-01-01"), + ]); + assert_eq!(node_names(&two.preferred), vec!["A"]); + assert!(two.tentative.is_empty()); +} + +#[test] +fn recency_decides_contested_verdict() { + let a = agg(&[ + doc(Some("useful"), &["N"], "q", "", &days_before(120)), + doc(Some("dead_end"), &["N"], "q", "", &days_before(1)), + ]); + assert_eq!(a.contested.len(), 1); + assert_eq!(a.contested[0].node, "N"); + assert_eq!(a.contested[0].verdict, "dead end"); + + let flipped = agg(&[ + doc(Some("useful"), &["N"], "q", "", &days_before(1)), + doc(Some("dead_end"), &["N"], "q", "", &days_before(120)), + ]); + assert_eq!(flipped.contested[0].verdict, "useful"); +} + +#[test] +fn node_existence_gate_drops_stale_nodes() { + let docs = [ + doc(Some("useful"), &["Alive", "Deleted"], "q", "", "2026-01-01"), + doc(Some("useful"), &["Alive", "Deleted"], "q", "", "2026-01-01"), + ]; + let known: HashSet = HashSet::from(["Alive".to_string()]); + let a = aggregate_lessons(&docs, None, now(), 30.0, 2, Some(&known)); + let names: Vec<&str> = a + .preferred + .iter() + .chain(&a.tentative) + .map(|e| e.node.as_str()) + .collect(); + assert!(!names.contains(&"Deleted")); + assert!(names.contains(&"Alive")); +} + +#[test] +fn corroboration_counts_distinct_docs_not_citations() { + let a = agg(&[doc(Some("useful"), &["A", "A"], "q", "", "2026-01-01")]); + assert!(a.preferred.is_empty()); + assert_eq!(node_names(&a.tentative), vec!["A"]); + assert_eq!(a.tentative[0].n, 1); +} + +#[test] +fn min_corroboration_is_honored_not_hardcoded() { + let docs = [ + doc(Some("useful"), &["A"], "q", "", "2026-01-01"), + doc(Some("useful"), &["A"], "q", "", "2026-01-01"), + ]; + let k2 = aggregate_lessons(&docs, None, now(), 30.0, 2, None); + assert_eq!(node_names(&k2.preferred), vec!["A"]); + let k3 = aggregate_lessons(&docs, None, now(), 30.0, 3, None); + assert!(k3.preferred.is_empty()); + assert_eq!(node_names(&k3.tentative), vec!["A"]); +} + +#[test] +fn half_life_actually_feeds_decay() { + let docs = [ + doc(Some("useful"), &["N"], "q", "", &days_before(90)), + doc(Some("useful"), &["N"], "q", "", &days_before(90)), + doc(Some("dead_end"), &["N"], "q", "", &days_before(1)), + ]; + let long_hl = aggregate_lessons(&docs, None, now(), 100_000.0, 2, None); + let short_hl = aggregate_lessons(&docs, None, now(), 10.0, 2, None); + assert_eq!(long_hl.contested[0].verdict, "useful"); + assert_eq!(short_hl.contested[0].verdict, "dead end"); +} + +#[test] +fn evenly_split_and_nonpositive_half_life() { + let day = days_before(5); + let a = agg(&[ + doc(Some("useful"), &["N"], "q", "", &day), + doc(Some("dead_end"), &["N"], "q", "", &day), + ]); + assert_eq!(a.contested[0].verdict, "even"); + assert!(render_lessons_md(&a).contains("evenly split")); + + let docs = [ + doc(Some("useful"), &["N"], "q", "", &days_before(365)), + doc(Some("dead_end"), &["N"], "q", "", &days_before(1)), + ]; + let no_decay = aggregate_lessons(&docs, None, now(), 0.0, 2, None); + assert_eq!(no_decay.contested[0].verdict, "even"); +} + +#[test] +fn negative_only_node_absent_from_sources() { + let a = agg(&[doc(Some("dead_end"), &["Bad"], "why?", "", "2026-01-01")]); + let names: Vec<&str> = a + .preferred + .iter() + .chain(&a.tentative) + .map(|e| e.node.as_str()) + .collect(); + assert!(!names.contains(&"Bad")); + assert_eq!(a.dead_ends[0].nodes, vec!["Bad"]); +} + +#[test] +fn dead_ends_and_corrections_collected() { + let a = agg(&[ + doc( + Some("dead_end"), + &["RedisClient"], + "where is the cache?", + "", + "2026-01-01", + ), + doc( + Some("corrected"), + &[], + "what hashes pw?", + "bcrypt", + "2026-01-01", + ), + ]); + assert_eq!(a.dead_ends[0].question, "where is the cache?"); + assert_eq!(a.dead_ends[0].nodes, vec!["RedisClient"]); + assert_eq!(a.corrections[0].correction, "bcrypt"); +} + +#[test] +fn no_community_grouping_without_graph() { + let a = agg(&[doc(Some("useful"), &["A"], "q", "", "2026-01-01")]); + assert!(a.by_community.is_empty()); +} + +#[test] +fn doc_community_tie_breaks_to_smallest_label() { + let nc = community(&[("x", "Zeta"), ("y", "Alpha")]); + let a1 = aggregate_lessons( + &[doc(Some("useful"), &["x", "y"], "q", "", "2026-01-01")], + Some(&nc), + now(), + 30.0, + 2, + None, + ); + assert!(a1.by_community.contains_key("Alpha")); + assert!(!a1.by_community.contains_key("Zeta")); +} + +#[test] +fn community_grouping_uses_plurality_community() { + let nc = community(&[("A", "Auth"), ("B", "Auth"), ("C", "Cache")]); + let docs = [ + doc(Some("useful"), &["A", "B", "C"], "q", "", "2026-01-01"), + doc(Some("dead_end"), &["C"], "q", "", "2026-01-01"), + doc(Some("useful"), &["Z"], "q", "", "2026-01-01"), + ]; + let a = aggregate_lessons(&docs, Some(&nc), now(), 30.0, 2, None); + let keys: HashSet<&str> = a.by_community.keys().map(String::as_str).collect(); + assert_eq!(keys, HashSet::from(["Auth", "Cache", "Uncategorized"])); + assert_eq!(a.by_community["Auth"].counts.useful, 1); + assert_eq!(a.by_community["Cache"].counts.dead_end, 1); + assert_eq!(a.by_community["Uncategorized"].counts.useful, 1); +} + +#[test] +fn dead_ends_and_corrections_dedupe_by_question() { + let docs = [ + doc(Some("dead_end"), &[], "ws server?", "", "2026-01-01"), + doc(Some("dead_end"), &[], "ws server?", "", "2026-01-02"), + doc(Some("corrected"), &[], "hash?", "SHA-1", "2026-01-01"), + doc(Some("corrected"), &[], "hash?", "SHA-256", "2026-01-03"), + ]; + let a = agg(&docs); + assert_eq!( + a.dead_ends + .iter() + .map(|d| d.question.as_str()) + .collect::>(), + vec!["ws server?"] + ); + assert_eq!(a.corrections.len(), 1); + assert_eq!(a.corrections[0].correction, "SHA-256"); +} + +// --- rendering ----------------------------------------------------------------- + +#[test] +fn render_has_summary_and_sections() { + let docs = [ + doc(Some("useful"), &["AuthMiddleware"], "q", "", "2026-01-01"), + doc( + Some("dead_end"), + &["RedisClient"], + "where is the cache?", + "", + "2026-01-01", + ), + doc(Some("corrected"), &[], "pw?", "bcrypt", "2026-01-01"), + ]; + let md = render_lessons_md(&agg(&docs)); + assert!(md.contains("# Lessons")); + assert!(md.contains("1 useful · 1 dead ends · 1 corrected")); + assert!(md.contains("`AuthMiddleware`")); + assert!(md.contains("where is the cache?")); + assert!(md.contains("bcrypt")); + assert!(!md.contains("## By topic")); +} + +#[test] +fn render_includes_by_topic_when_graph_present() { + let nc = community(&[("A", "Auth")]); + let md = render_lessons_md(&aggregate_lessons( + &[doc(Some("useful"), &["A"], "q", "", "2026-01-01")], + Some(&nc), + now(), + 30.0, + 2, + None, + )); + assert!(md.contains("## By topic")); + assert!(md.contains("### Auth")); +} + +#[test] +fn topic_sections_alpha_with_uncategorized_last() { + let nc = community(&[("a", "Zeta"), ("b", "Alpha")]); + let docs = [ + doc(Some("useful"), &["a"], "q", "", "2026-01-01"), + doc(Some("useful"), &["b"], "q", "", "2026-01-01"), + doc(Some("useful"), &["unknown"], "q", "", "2026-01-01"), + ]; + let md = render_lessons_md(&aggregate_lessons(&docs, Some(&nc), now(), 30.0, 2, None)); + let headers: Vec<&str> = md.lines().filter_map(|l| l.strip_prefix("### ")).collect(); + assert_eq!(headers, vec!["Alpha", "Zeta", "Uncategorized"]); +} + +#[test] +fn render_byte_stable_across_independent_aggregations() { + let tmp = tempfile::tempdir().unwrap(); + let mem = tmp.path().join("memory"); + write_raw_doc(&mem, "a.md", "2026-01-01", "useful", "first"); + write_raw_doc(&mem, "b.md", "2026-01-02", "dead_end", "dead?"); + let first = render_lessons_md(&aggregate_lessons( + &load_memory_docs(&mem), + None, + now(), + 30.0, + 2, + None, + )); + let second = render_lessons_md(&aggregate_lessons( + &load_memory_docs(&mem), + None, + now(), + 30.0, + 2, + None, + )); + assert_eq!(first, second); +} + +#[test] +fn contested_node_renders_once_under_contested() { + let docs = [ + doc(Some("useful"), &["N"], "q", "", "2026-01-01"), + doc(Some("dead_end"), &["N"], "bad?", "", "2026-01-01"), + ]; + let md = render_lessons_md(&agg(&docs)); + assert!(md.contains("**Contested**")); + let lines: Vec<&str> = md + .lines() + .filter(|l| l.starts_with("- `N` —") && l.contains("useful") && l.contains("dead end")) + .collect(); + assert_eq!(lines.len(), 1); +} + +#[test] +fn header_is_cautious() { + let md = render_lessons_md(&agg(&[doc(Some("useful"), &["A"], "q", "", "2026-01-01")])); + assert!(md.contains("verify before relying")); + assert!(!md.contains("reuse what worked")); +} + +#[test] +fn lessons_artifact_cannot_be_globbed_back_into_memory() { + let md = render_lessons_md(&agg(&[doc(Some("useful"), &["A"], "q", "", "2026-01-01")])); + assert!(parse_memory_doc(&md).is_none()); + let tmp = tempfile::tempdir().unwrap(); + let mem = tmp.path().join("memory"); + std::fs::create_dir_all(&mem).unwrap(); + std::fs::write(mem.join("LESSONS.md"), &md).unwrap(); + save_query_result("real", "a", &mem, "query", None, Some("useful"), None).unwrap(); + let docs = load_memory_docs(&mem); + assert_eq!(docs.len(), 1); + assert_eq!(docs[0].question, "real"); +} + +#[test] +fn render_empty_memory_is_graceful() { + let md = render_lessons_md(&agg(&[])); + assert!(md.contains("from 0 session memories")); + assert!(md.contains("_No marked outcomes yet._")); +} + +// --- orchestrator -------------------------------------------------------------- + +#[test] +fn reflect_writes_lessons_file() { + let tmp = tempfile::tempdir().unwrap(); + let mem = tmp.path().join("memory"); + save_query_result( + "q1", + "a1", + &mem, + "query", + Some(&["A".to_string()]), + Some("useful"), + None, + ) + .unwrap(); + let out = tmp.path().join("reflections").join("LESSONS.md"); + let (out_path, a) = reflect( + &mem, + &out, + graphify_reflect::GraphPaths::default(), + now(), + 30.0, + 2, + ) + .unwrap(); + assert!(out_path.exists()); + assert_eq!(a.total, 1); + assert!(std::fs::read_to_string(&out_path).unwrap().contains("`A`")); +} + +#[test] +fn second_session_benefits_from_the_first() { + let tmp = tempfile::tempdir().unwrap(); + let out = tmp.path().join("graphify-out"); + let mem = out.join("memory"); + save_query_result( + "how does auth work?", + "JWT in middleware", + &mem, + "query", + Some(&["AuthMiddleware".to_string()]), + Some("useful"), + None, + ) + .unwrap(); + save_query_result( + "where is the cache?", + "looked at RedisClient, not it", + &mem, + "query", + Some(&["RedisClient".to_string()]), + Some("dead_end"), + None, + ) + .unwrap(); + let lessons = out.join("reflections").join("LESSONS.md"); + reflect( + &mem, + &lessons, + graphify_reflect::GraphPaths::default(), + now(), + 30.0, + 2, + ) + .unwrap(); + let body = std::fs::read_to_string(&lessons).unwrap(); + assert!(body.contains("`AuthMiddleware`")); + assert!(body.contains("where is the cache?")); +} + +// --- lessons_fresh ------------------------------------------------------------- + +#[test] +fn lessons_fresh_missing_output_is_not_fresh() { + let tmp = tempfile::tempdir().unwrap(); + let mem = tmp.path().join("memory"); + std::fs::create_dir_all(&mem).unwrap(); + std::fs::write(mem.join("q.md"), "x").unwrap(); + assert!(!lessons_fresh(&tmp.path().join("LESSONS.md"), &mem, None)); +} + +#[test] +fn lessons_fresh_true_when_output_newer_than_inputs() { + let tmp = tempfile::tempdir().unwrap(); + let mem = tmp.path().join("memory"); + std::fs::create_dir_all(&mem).unwrap(); + let doc = mem.join("q.md"); + std::fs::write(&doc, "x").unwrap(); + let out = tmp.path().join("LESSONS.md"); + std::fs::write(&out, "y").unwrap(); + set_mtime(&doc, 1000); + set_mtime(&out, 2000); + assert!(lessons_fresh(&out, &mem, None)); +} + +#[test] +fn lessons_fresh_false_when_memory_newer() { + let tmp = tempfile::tempdir().unwrap(); + let mem = tmp.path().join("memory"); + std::fs::create_dir_all(&mem).unwrap(); + let doc = mem.join("q.md"); + std::fs::write(&doc, "x").unwrap(); + let out = tmp.path().join("LESSONS.md"); + std::fs::write(&out, "y").unwrap(); + set_mtime(&out, 1000); + set_mtime(&doc, 2000); + assert!(!lessons_fresh(&out, &mem, None)); +} + +#[test] +fn lessons_fresh_false_when_graph_newer() { + let tmp = tempfile::tempdir().unwrap(); + let mem = tmp.path().join("memory"); + std::fs::create_dir_all(&mem).unwrap(); + let doc = mem.join("q.md"); + std::fs::write(&doc, "x").unwrap(); + let out = tmp.path().join("LESSONS.md"); + std::fs::write(&out, "y").unwrap(); + let graph = tmp.path().join("graph.json"); + std::fs::write(&graph, "{}").unwrap(); + set_mtime(&doc, 1000); + set_mtime(&out, 1500); + set_mtime(&graph, 2000); + assert!(!lessons_fresh(&out, &mem, Some(&graph))); +} + +/// Set a file's mtime to `secs` after the Unix epoch. +fn set_mtime(path: &std::path::Path, secs: u64) { + let mtime = std::time::SystemTime::UNIX_EPOCH + std::time::Duration::from_secs(secs); + filetime_set(path, mtime); +} + +fn filetime_set(path: &std::path::Path, mtime: std::time::SystemTime) { + // Re-open and set times via a small platform-agnostic shim: write then use + // `File::set_modified` (stable since 1.75). + let f = std::fs::OpenOptions::new().write(true).open(path).unwrap(); + f.set_modified(mtime).unwrap(); +} diff --git a/crates/graphify-security/Cargo.toml b/crates/graphify-security/Cargo.toml index 670ed8e..7858814 100644 --- a/crates/graphify-security/Cargo.toml +++ b/crates/graphify-security/Cargo.toml @@ -18,6 +18,7 @@ url = { workspace = true } [dev-dependencies] mockito = { workspace = true } +serial_test = { workspace = true } tempfile = { workspace = true } [lints] diff --git a/crates/graphify-security/src/lib.rs b/crates/graphify-security/src/lib.rs index 338dd3e..910033c 100644 --- a/crates/graphify-security/src/lib.rs +++ b/crates/graphify-security/src/lib.rs @@ -28,6 +28,7 @@ mod ip; mod label; mod metadata; mod path_guard; +pub mod paths; #[doc(hidden)] pub mod test_support; mod url_guard; @@ -44,4 +45,5 @@ pub use metadata::{ sanitize_metadata_string, sanitize_metadata_value, }; pub use path_guard::validate_graph_path; +pub use paths::{DEFAULT_GRAPHIFY_OUT, default_graph_json, graphify_out, graphify_out_name}; pub use url_guard::validate_url; diff --git a/crates/graphify-security/src/path_guard.rs b/crates/graphify-security/src/path_guard.rs index 4d24212..c9cc44e 100644 --- a/crates/graphify-security/src/path_guard.rs +++ b/crates/graphify-security/src/path_guard.rs @@ -28,8 +28,9 @@ pub fn validate_graph_path>( let hint = path.canonicalize().unwrap_or_else(|_| path.to_path_buf()); let mut found: Option = None; let mut cur = Some(hint.as_path()); + let out_name = crate::paths::graphify_out_name(); while let Some(c) = cur { - if c.file_name().is_some_and(|n| n == "graphify-out") { + if c.file_name().is_some_and(|n| n == out_name.as_str()) { found = Some(c.to_path_buf()); break; } @@ -37,8 +38,8 @@ pub fn validate_graph_path>( } found.unwrap_or_else(|| { std::env::current_dir().map_or_else( - |_| PathBuf::from("graphify-out"), - |cwd| cwd.join("graphify-out"), + |_| crate::paths::graphify_out(), + |cwd| cwd.join(crate::paths::graphify_out()), ) }) }; diff --git a/crates/graphify-security/src/paths.rs b/crates/graphify-security/src/paths.rs new file mode 100644 index 0000000..906e27b --- /dev/null +++ b/crates/graphify-security/src/paths.rs @@ -0,0 +1,50 @@ +//! Output-directory resolution — the single source of truth for the +//! `GRAPHIFY_OUT` override across the workspace. +//! +//! The output directory is `graphify-out` by default and overridable with the +//! `GRAPHIFY_OUT` environment variable (worktrees or shared-output setups). It +//! accepts a relative name (`graphify-out-feature`) or an absolute path +//! (`/shared/graphify-out`). +//! +//! Ports `graphify-py/graphify/paths.py`. Unlike Python — which snapshots the +//! value once at import time — these read the environment on every call, so a +//! test (or a process whose environment changed) always observes the current +//! value. + +use std::path::PathBuf; + +/// Output directory name used when `GRAPHIFY_OUT` is unset. +pub const DEFAULT_GRAPHIFY_OUT: &str = "graphify-out"; + +/// The configured graphify output directory, honouring `GRAPHIFY_OUT`. +/// +/// Returns a relative name (`graphify-out`) or an absolute path verbatim; the +/// caller joins it against a project root (`root.join(graphify_out())` resolves +/// correctly for both, since joining an absolute path replaces the base). +#[must_use] +pub fn graphify_out() -> PathBuf { + PathBuf::from(std::env::var("GRAPHIFY_OUT").unwrap_or_else(|_| DEFAULT_GRAPHIFY_OUT.to_owned())) +} + +/// Bare directory name even when `GRAPHIFY_OUT` is an absolute path. +/// +/// Used by the path guards that walk parents looking for the output dir by +/// name, and by the detect scan-exclude so a custom output dir is never +/// re-ingested as source. Mirrors Python's +/// `os.path.basename(os.path.normpath(GRAPHIFY_OUT))`. +#[must_use] +pub fn graphify_out_name() -> String { + graphify_out().file_name().map_or_else( + || DEFAULT_GRAPHIFY_OUT.to_owned(), + |n| n.to_string_lossy().into_owned(), + ) +} + +/// Default `graph.json` path under the configured output dir. +/// +/// The package-wide fallback so a `GRAPHIFY_OUT` override is honoured wherever +/// a graph path is not passed explicitly. +#[must_use] +pub fn default_graph_json() -> PathBuf { + graphify_out().join("graph.json") +} diff --git a/crates/graphify-security/tests/parity.rs b/crates/graphify-security/tests/parity.rs index 7bc3d7d..4b3c3da 100644 --- a/crates/graphify-security/tests/parity.rs +++ b/crates/graphify-security/tests/parity.rs @@ -1,5 +1,7 @@ //! Parity tests against `graphify-py/tests/test_security.py`. #![allow(clippy::expect_used)] +// `std::env::set_var` is unsafe in edition 2024 — test-only, serialised below. +#![allow(unsafe_code)] use std::time::Duration; @@ -239,6 +241,72 @@ fn validate_graph_path_raises_if_file_missing() { assert!(matches!(err, SecurityError::GraphFileMissing(_))); } +/// RAII guard that sets an env var and restores it on drop. +struct EnvGuard { + key: &'static str, + prev: Option, +} + +impl EnvGuard { + fn set(key: &'static str, value: &str) -> Self { + let prev = std::env::var(key).ok(); + // SAFETY: test-only, serialised via `#[serial_test::serial]`. + unsafe { std::env::set_var(key, value) }; + Self { key, prev } + } + + /// Clear `key` for the test's duration, restoring the prior value on drop. + fn unset(key: &'static str) -> Self { + let prev = std::env::var(key).ok(); + // SAFETY: test-only, serialised via `#[serial_test::serial]`. + unsafe { std::env::remove_var(key) }; + Self { key, prev } + } +} + +impl Drop for EnvGuard { + fn drop(&mut self) { + match &self.prev { + // SAFETY: test-only cleanup. + Some(v) => unsafe { std::env::set_var(self.key, v) }, + None => unsafe { std::env::remove_var(self.key) }, + } + } +} + +#[test] +#[serial_test::serial(graphify_out_env)] +fn validate_graph_path_default_base_discovers_output_dir() { + // With base omitted, the output dir is discovered by walking the path's + // parents for the configured output-dir name (default "graphify-out"). + // Clear any ambient GRAPHIFY_OUT so discovery resolves the default name. + let _guard = EnvGuard::unset("GRAPHIFY_OUT"); + let tmp = tempfile::tempdir().expect("tempdir"); + let base = tmp.path().join("graphify-out"); + std::fs::create_dir(&base).expect("mkdir"); + let graph = base.join("graph.json"); + std::fs::write(&graph, "{}").expect("write"); + let resolved = + validate_graph_path(&graph, None).expect("default base should discover graphify-out"); + assert_eq!(resolved, graph.canonicalize().expect("canonicalize")); +} + +#[test] +#[serial_test::serial(graphify_out_env)] +fn validate_graph_path_default_base_honours_graphify_out_override() { + // base=None discovery must honour GRAPHIFY_OUT, not the hardcoded literal, + // so a renamed output dir validates against the right base (#1423). + let _guard = EnvGuard::set("GRAPHIFY_OUT", "custom-out"); + let tmp = tempfile::tempdir().expect("tempdir"); + let out = tmp.path().join("custom-out"); + std::fs::create_dir(&out).expect("mkdir"); + let graph = out.join("graph.json"); + std::fs::write(&graph, "{}").expect("write"); + let resolved = + validate_graph_path(&graph, None).expect("override base should discover custom-out"); + assert_eq!(resolved, graph.canonicalize().expect("canonicalize")); +} + // --------------------------------------------------------------------------- // sanitize_label // --------------------------------------------------------------------------- diff --git a/crates/graphify-serve/src/tools.rs b/crates/graphify-serve/src/tools.rs index 75c2a53..dad5779 100644 --- a/crates/graphify-serve/src/tools.rs +++ b/crates/graphify-serve/src/tools.rs @@ -191,6 +191,25 @@ pub fn tool_get_neighbors(graph: &Graph, arguments: &serde_json::Map) -> String { + let base = format!("Community {cid}"); + if let Some(name) = community_name { + let clean = sanitize_label(Some(name)); + if !clean.is_empty() && clean != base { + return format!("{base} — {clean}"); + } + } + base +} + /// Execute the `get_community` tool. #[must_use] pub fn tool_get_community( @@ -205,7 +224,12 @@ pub fn tool_get_community( Some(ns) if !ns.is_empty() => ns, _ => return format!("Community {cid} not found."), }; - let mut lines = vec![format!("Community {cid} ({} nodes):", nodes.len())]; + let community_name = graph + .node_data(&nodes[0]) + .and_then(|d| d.get("community_name")) + .and_then(Value::as_str); + let header = community_header(cid, community_name); + let mut lines = vec![format!("{header} ({} nodes):", nodes.len())]; for n in nodes { let empty = IndexMap::new(); let d = graph.node_data(n).unwrap_or(&empty); diff --git a/crates/graphify-serve/tests/tool_handlers.rs b/crates/graphify-serve/tests/tool_handlers.rs index e69660a..67ab4eb 100644 --- a/crates/graphify-serve/tests/tool_handlers.rs +++ b/crates/graphify-serve/tests/tool_handlers.rs @@ -1,5 +1,8 @@ //! Coverage tests for the MCP tool handler functions. +// Test setup uses `.expect("test invariant")`; AGENTS.md sanctions the file-top +// `expect_used` allow for test files, so a build/setup failure surfaces as a +// clear panic rather than threading `Result` through every handler test. #![allow(clippy::expect_used)] use std::collections::HashMap; @@ -7,8 +10,8 @@ use std::collections::HashMap; use graphify_build::{Graph, build_from_json}; use graphify_serve::graph::communities_from_graph; use graphify_serve::tools::{ - tool_get_community, tool_get_neighbors, tool_get_node, tool_god_nodes, tool_graph_stats, - tool_query_graph, tool_shortest_path, + community_header, tool_get_community, tool_get_neighbors, tool_get_node, tool_god_nodes, + tool_graph_stats, tool_query_graph, tool_shortest_path, }; use serde_json::{Map, Value, json}; @@ -171,6 +174,59 @@ fn tool_get_community_not_found() { assert!(out.contains("not found")); } +#[test] +fn tool_get_community_shows_community_name() { + // #1448: the header surfaces the community label, like get_node / query. + let g = build_from_json( + json!({ + "nodes": [ + {"id": "n1", "label": "alpha", "source_file": "alpha.py", + "community": 0, "community_name": "Auth Layer", "file_type": "code"}, + {"id": "n2", "label": "beta", "source_file": "beta.py", + "community": 0, "community_name": "Auth Layer", "file_type": "code"}, + ], + "edges": [] + }), + false, + None, + ) + .expect("test invariant"); + let communities = communities_from_graph(&g); + let args = arg_map(&[("community_id", json!(0))]); + let out = tool_get_community(&g, &communities, &args); + assert!(out.contains("Community 0 — Auth Layer"), "got: {out}"); +} + +// ── community_header (#1448) ───────────────────────────────────────────────── + +#[test] +fn community_header_shows_real_name() { + assert_eq!( + community_header(12, Some("Auth & Sessions")), + "Community 12 — Auth & Sessions" + ); +} + +#[test] +fn community_header_skips_placeholder_name() { + // No "Community 12 — Community 12" doubling. + assert_eq!(community_header(12, Some("Community 12")), "Community 12"); +} + +#[test] +fn community_header_falls_back_when_no_name() { + assert_eq!(community_header(7, None), "Community 7"); + assert_eq!(community_header(7, Some("")), "Community 7"); +} + +#[test] +fn community_header_sanitizes_name() { + let out = community_header(3, Some("Pay\u{0}ments\u{1b}[31m")); + assert!(out.starts_with("Community 3 — "), "got: {out}"); + assert!(!out.contains('\u{0}')); + assert!(!out.contains('\u{1b}')); +} + // ── tool_god_nodes ────────────────────────────────────────────────────────── #[test] diff --git a/crates/graphify-validate/src/validate.rs b/crates/graphify-validate/src/validate.rs index 8ee1341..711dc92 100644 --- a/crates/graphify-validate/src/validate.rs +++ b/crates/graphify-validate/src/validate.rs @@ -63,8 +63,21 @@ pub fn validate_extraction(data: &Value) -> Vec { "Node {i} (id={id_repr}) has invalid file_type '{ft}' - must be one of {allowed:?}" )); } - if let Some(Value::String(id)) = node_obj.get("id") { - node_ids.insert(id.clone()); + match node_obj.get("id") { + // A list/dict id is non-hashable in Python; report it rather + // than crash on set construction (#1447). Numbers/bools/null + // are hashable, so they are neither reported nor collected as + // string ids. + Some(id @ (Value::Array(_) | Value::Object(_))) => { + errors.push(format!( + "Node {i} has non-hashable id {} - id must be a string", + repr(id) + )); + } + Some(Value::String(id)) => { + node_ids.insert(id.clone()); + } + _ => {} } } } @@ -94,20 +107,25 @@ pub fn validate_extraction(data: &Value) -> Vec { "Edge {i} has invalid confidence '{conf}' - must be one of {allowed:?}" )); } - if !node_ids.is_empty() { - if let Some(src) = edge_obj.get("source").and_then(Value::as_str) - && !node_ids.contains(src) - { - errors.push(format!( - "Edge {i} source '{src}' does not match any node id" - )); - } - if let Some(tgt) = edge_obj.get("target").and_then(Value::as_str) - && !node_ids.contains(tgt) - { - errors.push(format!( - "Edge {i} target '{tgt}' does not match any node id" - )); + for endpoint in ["source", "target"] { + let Some(val) = edge_obj.get(endpoint) else { + continue; + }; + match val { + // A list/dict endpoint is non-hashable in Python; report + // it rather than crash the membership test (#1447). + Value::Array(_) | Value::Object(_) => { + errors.push(format!( + "Edge {i} {endpoint} {} is non-hashable - must be a string", + repr(val) + )); + } + Value::String(s) if !node_ids.is_empty() && !node_ids.contains(s) => { + errors.push(format!( + "Edge {i} {endpoint} '{s}' does not match any node id" + )); + } + _ => {} } } } diff --git a/crates/graphify-validate/tests/parity.rs b/crates/graphify-validate/tests/parity.rs index 41d44cf..a8e7798 100644 --- a/crates/graphify-validate/tests/parity.rs +++ b/crates/graphify-validate/tests/parity.rs @@ -165,3 +165,61 @@ fn edge_must_be_object() { })); assert!(errors.iter().any(|e| e.contains("must be an object"))); } + +#[test] +fn non_hashable_node_id_reported_not_raised() { + // A list-valued id must be reported as an error, not crash the validator. + let data = json!({ + "nodes": [ + {"id": "n1", "label": "A", "file_type": "code", "source_file": "a.py"}, + {"id": ["x", "y"], "label": "B", "file_type": "code", "source_file": "b.py"}, + ], + "edges": [], + }); + let errors = validate_extraction(&data); + assert!(errors.iter().any(|e| e.contains("non-hashable id"))); +} + +#[test] +fn non_hashable_edge_endpoint_reported_not_raised() { + // A list-valued endpoint must be reported, not crash the membership test. + let data = json!({ + "nodes": [ + {"id": "n1", "label": "A", "file_type": "code", "source_file": "a.py"}, + {"id": "n2", "label": "B", "file_type": "code", "source_file": "b.py"}, + ], + "edges": [ + {"source": "n1", "target": ["n2", "n3"], "relation": "calls", + "confidence": "INFERRED", "source_file": "a.py"}, + ], + }); + let errors = validate_extraction(&data); + assert!( + errors + .iter() + .any(|e| e.contains("target") && e.contains("non-hashable")) + ); +} + +#[test] +fn non_hashable_node_id_does_not_mask_valid_ids() { + // The valid node id must still be collected so a legitimately-dangling edge + // is still flagged even when a sibling node has a bad id. + let data = json!({ + "nodes": [ + {"id": "n1", "label": "A", "file_type": "code", "source_file": "a.py"}, + {"id": {"oops": 1}, "label": "B", "file_type": "code", "source_file": "b.py"}, + ], + "edges": [ + {"source": "n1", "target": "ghost", "relation": "calls", + "confidence": "EXTRACTED", "source_file": "a.py"}, + ], + }); + let errors = validate_extraction(&data); + assert!(errors.iter().any(|e| e.contains("non-hashable id"))); + assert!( + errors + .iter() + .any(|e| e.contains("target") && e.contains("ghost")) + ); +} diff --git a/crates/graphify-watch/src/constants.rs b/crates/graphify-watch/src/constants.rs index fddbd09..f42f22a 100644 --- a/crates/graphify-watch/src/constants.rs +++ b/crates/graphify-watch/src/constants.rs @@ -1,9 +1,4 @@ -//! File-extension constants used by the watcher and the output-dir -//! environment-variable override. - -/// Default output sub-directory name, overridden by the `GRAPHIFY_OUT` -/// environment variable. -pub(crate) const DEFAULT_GRAPHIFY_OUT: &str = "graphify-out"; +//! File-extension constants used by the watcher. /// All extensions the watcher pays attention to (code + doc + paper + /// image). diff --git a/crates/graphify-watch/src/lib.rs b/crates/graphify-watch/src/lib.rs index 5cb2118..cea7ca8 100644 --- a/crates/graphify-watch/src/lib.rs +++ b/crates/graphify-watch/src/lib.rs @@ -21,7 +21,7 @@ mod watch_fn; pub use constants::WATCHED_EXTENSIONS; pub use error::WatchError; pub use lock::RebuildLock; -pub use notify::{check_update, graphify_out, notify_only}; +pub use notify::{check_update, notify_only}; pub use rebuild::{ LockPolicy, PENDING_DRAIN_MAX_PASSES, PENDING_FILENAME, RebuildOptions, check_shrink, drain_pending, git_head, merge_changed_paths, node_community_map, queue_pending, diff --git a/crates/graphify-watch/src/notify.rs b/crates/graphify-watch/src/notify.rs index a8fd7f6..15354fe 100644 --- a/crates/graphify-watch/src/notify.rs +++ b/crates/graphify-watch/src/notify.rs @@ -3,18 +3,8 @@ use std::path::Path; -use crate::constants::DEFAULT_GRAPHIFY_OUT; use crate::error::WatchError; -/// Return the effective output directory name from the environment. -/// -/// Reads `GRAPHIFY_OUT`, falling back to the compile-time default -/// (`"graphify-out"`). -#[must_use] -pub fn graphify_out() -> String { - std::env::var("GRAPHIFY_OUT").unwrap_or_else(|_| DEFAULT_GRAPHIFY_OUT.to_string()) -} - /// Write a `needs_update` flag file and print a notification. /// /// Called for non-code file changes (docs, papers, images) that require @@ -26,7 +16,7 @@ pub fn graphify_out() -> String { /// /// Returns [`WatchError::Io`] if the flag file cannot be created. pub fn notify_only(watch_path: &Path) -> Result<(), WatchError> { - let out = watch_path.join(graphify_out()); + let out = watch_path.join(graphify_security::graphify_out()); let flag = out.join("needs_update"); std::fs::create_dir_all(&out).map_err(WatchError::Io)?; std::fs::write(&flag, "1").map_err(WatchError::Io)?; @@ -49,7 +39,9 @@ pub fn notify_only(watch_path: &Path) -> Result<(), WatchError> { /// Ports `check_update` from Python. #[must_use] pub fn check_update(watch_path: &Path) -> bool { - let flag = watch_path.join(graphify_out()).join("needs_update"); + let flag = watch_path + .join(graphify_security::graphify_out()) + .join("needs_update"); if flag.exists() { println!( "[graphify check-update] Pending non-code changes in {}.", diff --git a/crates/graphify-watch/src/rebuild/mod.rs b/crates/graphify-watch/src/rebuild/mod.rs index 0d384b0..9575379 100644 --- a/crates/graphify-watch/src/rebuild/mod.rs +++ b/crates/graphify-watch/src/rebuild/mod.rs @@ -26,7 +26,6 @@ pub use shrink::check_shrink; use std::path::{Path, PathBuf}; use crate::error::WatchError; -use crate::graphify_out; use crate::lock::RebuildLock; use pipeline::rebuild_code_inner; @@ -72,7 +71,7 @@ pub fn rebuild_code( changed_paths: Option<&[PathBuf]>, opts: RebuildOptions, ) -> Result { - let out = watch_path.join(graphify_out()); + let out = watch_path.join(graphify_security::graphify_out()); match opts.lock { LockPolicy::None => { diff --git a/crates/graphify-watch/src/rebuild/pipeline.rs b/crates/graphify-watch/src/rebuild/pipeline.rs index 49a9349..962a9cf 100644 --- a/crates/graphify-watch/src/rebuild/pipeline.rs +++ b/crates/graphify-watch/src/rebuild/pipeline.rs @@ -11,14 +11,13 @@ use graphify_export::attach_hyperedges; use serde_json::Value; use crate::error::WatchError; -use crate::graphify_out; use crate::rebuild::git::git_head; use crate::rebuild::helpers::report_root_label; use crate::rebuild::pipeline_helpers::{ FinaliseArgs, build_phase, cluster_phase, compare_existing_graph, compare_existing_report, - compute_extract_targets, detect_phase, extract_phase, finalise_rebuild, load_or_default_labels, - merge_with_existing_graph, render_report_phase, resolve_project_root, run_analysis, - run_no_cluster_path, topology_unchanged, write_graph_tmp, + compute_extract_targets, compute_rebuilt_sources, detect_phase, extract_phase, + finalise_rebuild, load_or_default_labels, merge_with_existing_graph, render_report_phase, + resolve_project_root, run_analysis, run_no_cluster_path, topology_unchanged, write_graph_tmp, }; use crate::rebuild::relativize::relativize_source_files; @@ -30,6 +29,9 @@ use crate::rebuild::relativize::relativize_source_files; /// # Errors /// /// Propagates I/O and pipeline errors via `WatchError`. +// Linear detect → extract → merge → build → cluster → finalise pipeline; +// splitting the sequence across more helpers obscures the ordering it encodes. +#[allow(clippy::too_many_lines)] pub(crate) fn rebuild_code_inner( watch_path: &Path, changed_paths: Option<&[PathBuf]>, @@ -41,7 +43,7 @@ pub(crate) fn rebuild_code_inner( .unwrap_or_else(|_| watch_path.to_path_buf()); let project_root = resolve_project_root(watch_path, &watch_root); let report_root = report_root_label(watch_path); - let out = watch_path.join(graphify_out()); + let out = watch_path.join(graphify_security::graphify_out()); let (detected, code_files) = detect_phase(watch_path); if code_files.is_empty() { @@ -56,6 +58,7 @@ pub(crate) fn rebuild_code_inner( }; let extract_targets = targets.wanted; let deleted_paths = targets.deleted_paths; + let rebuilt_sources = compute_rebuilt_sources(&extract_targets, &deleted_paths, &project_root); let commit = git_head(&watch_root); let mut result = extract_phase(&extract_targets, &watch_root); @@ -97,6 +100,7 @@ pub(crate) fn rebuild_code_inner( &out, force, had_explicit_deletions, + Some(&rebuilt_sources), t_post, ); } @@ -137,6 +141,7 @@ pub(crate) fn rebuild_code_inner( no_change, force, had_explicit_deletions, + rebuilt_sources: Some(&rebuilt_sources), graph_with_hyper: &graph_with_hyper, communities: &communities, labels: &labels, diff --git a/crates/graphify-watch/src/rebuild/pipeline_helpers.rs b/crates/graphify-watch/src/rebuild/pipeline_helpers.rs index 35af7a2..37fae27 100644 --- a/crates/graphify-watch/src/rebuild/pipeline_helpers.rs +++ b/crates/graphify-watch/src/rebuild/pipeline_helpers.rs @@ -1,5 +1,6 @@ //! Helper functions split out of [`super::pipeline::rebuild_code_inner`]. +use std::collections::HashSet; use std::path::{Path, PathBuf}; use graphify_build::{Graph, build_from_json, dedupe_edges, dedupe_nodes, norm_source_file}; @@ -49,6 +50,32 @@ pub(crate) fn detect_phase(watch_path: &Path) -> (DetectResult, Vec) { (detected, code_files) } +/// The set of source files re-extracted this run, normalised to match the +/// stored graph's `source_file` values. A net shrink is legitimate when every +/// lost node belongs to one of these (or a deleted file) — see +/// [`check_shrink`](crate::rebuild::shrink::check_shrink) (#1116). +pub(crate) fn compute_rebuilt_sources( + extract_targets: &[PathBuf], + deleted_paths: &[String], + project_root: &Path, +) -> HashSet { + let root = project_root.to_string_lossy(); + let mut sources: HashSet = extract_targets + .iter() + .map(|p| { + let raw = p.to_string_lossy(); + let normalized = norm_source_file(&raw, Some(&root)); + if normalized.is_empty() { + raw.into_owned() + } else { + normalized + } + }) + .collect(); + sources.extend(deleted_paths.iter().cloned()); + sources +} + /// `true` when `path` would have been pulled into the rebuild's `code_files` /// set if it still existed on disk. Mirrors the inclusion rule used in /// [`crate::rebuild::helpers::detect_code_files`]: any `FileType::Code` plus @@ -452,6 +479,9 @@ pub(crate) fn merge_with_existing_graph( } /// Execute the `--no-cluster` shortcut: write `graph.json` only, no clustering or report. +// One-shot `--no-cluster` shortcut threading the rebuild context; an args +// struct would add indirection for a single call site. +#[allow(clippy::too_many_arguments)] pub(crate) fn run_no_cluster_path( result: &Value, existing_graph_path: &Path, @@ -459,6 +489,7 @@ pub(crate) fn run_no_cluster_path( out: &Path, force: bool, had_explicit_deletions: bool, + rebuilt_sources: Option<&HashSet>, t_post: std::time::Instant, ) -> Result { // Dedupe nodes by id and parallel edges by (source, target, relation): the @@ -501,6 +532,7 @@ pub(crate) fn run_no_cluster_path( &candidate_data, None, had_explicit_deletions, + rebuilt_sources, )?; std::fs::write(existing_graph_path, json_text(&candidate_data).as_bytes()) .map_err(WatchError::Io)?; @@ -731,6 +763,9 @@ pub(crate) struct CommitArgs<'a> { /// Skip the shrink guard when the caller has declared deletions — the /// smaller graph is expected and not a sign of silent corruption. pub had_explicit_deletions: bool, + /// Source files re-extracted this run; lets the shrink guard allow a + /// symbol removed from a rebuilt file (#1116). + pub rebuilt_sources: Option<&'a HashSet>, /// The graph JSON that was on disk before this rebuild began. pub existing_graph_data: &'a Value, /// The newly built graph JSON to be committed. @@ -764,6 +799,7 @@ pub(crate) fn commit_rebuild_outputs(args: &CommitArgs<'_>) -> Result<(), WatchE args.candidate_graph_data, Some(args.graph_tmp), args.had_explicit_deletions, + args.rebuilt_sources, )?; let _ = backup_if_protected(args.out); std::fs::rename(args.graph_tmp, args.existing_graph_path).map_err(WatchError::Io)?; @@ -844,6 +880,9 @@ pub(crate) struct FinaliseArgs<'a> { /// Skip the shrink guard when the caller has declared deletions — see /// [`check_shrink`](crate::rebuild::shrink::check_shrink) for context. pub had_explicit_deletions: bool, + /// Source files re-extracted this run; lets the shrink guard allow a + /// symbol removed from a rebuilt file (#1116). + pub rebuilt_sources: Option<&'a HashSet>, /// Final graph value including attached hyperedges. pub graph_with_hyper: &'a Graph, /// Community detection result mapping community ID → member node IDs. @@ -885,6 +924,7 @@ pub(crate) fn finalise_rebuild(args: &FinaliseArgs<'_>) -> Result<(), WatchError commit_rebuild_outputs(&CommitArgs { force: args.force, had_explicit_deletions: args.had_explicit_deletions, + rebuilt_sources: args.rebuilt_sources, existing_graph_data: args.existing_graph_data, candidate_graph_data: args.candidate_graph_data, graph_tmp: args.graph_tmp, diff --git a/crates/graphify-watch/src/rebuild/shrink.rs b/crates/graphify-watch/src/rebuild/shrink.rs index 62b3239..b233d23 100644 --- a/crates/graphify-watch/src/rebuild/shrink.rs +++ b/crates/graphify-watch/src/rebuild/shrink.rs @@ -5,8 +5,10 @@ //! Extracted from `rebuild.rs` so the shrink-detection logic is isolated and //! independently testable. +use std::collections::HashSet; use std::path::Path; +use graphify_build::norm_source_file; use serde_json::Value; use crate::error::WatchError; @@ -23,6 +25,10 @@ use crate::error::WatchError; /// - `had_explicit_deletions` is `true`, signalling that the caller already /// declared which files were removed (e.g. the post-commit hook saw a `D` /// in `git diff --name-only`) and the smaller graph is the expected outcome. +/// - `rebuilt_sources` is `Some` and every *lost* node (present before, gone +/// now) belonged to a source re-extracted this run or carries no +/// `source_file` — a symbol removed from a rebuilt file is a legitimate +/// shrink, not a failed chunk (#1116). /// /// If `tmp` is provided and the check fails, the temporary file is cleaned up /// before returning `Err`. @@ -34,41 +40,72 @@ use crate::error::WatchError; /// /// Returns [`WatchError::ShrinkRefused`] when the new graph has fewer nodes /// than the existing one and none of the bypass conditions apply. +// `None` call sites cannot infer a generic hasher parameter, and every caller +// builds the set with the default hasher. +#[allow(clippy::implicit_hasher)] pub fn check_shrink( force: bool, existing_data: &Value, new_data: &Value, tmp: Option<&Path>, had_explicit_deletions: bool, + rebuilt_sources: Option<&HashSet>, ) -> Result<(), WatchError> { if force || had_explicit_deletions { return Ok(()); } - let existing_nodes = existing_data - .get("nodes") - .and_then(Value::as_array) - .map_or(0, Vec::len); - if existing_nodes == 0 { + let existing_nodes = existing_data.get("nodes").and_then(Value::as_array); + let existing_count = existing_nodes.map_or(0, Vec::len); + if existing_count == 0 { return Ok(()); } - let new_nodes = new_data - .get("nodes") - .and_then(Value::as_array) - .map_or(0, Vec::len); - if new_nodes < existing_nodes { - if let Some(tmp_path) = tmp { - let _ = std::fs::remove_file(tmp_path); - } - eprintln!( - "[graphify] WARNING: new graph has {new_nodes} nodes but existing \ - graph.json has {existing_nodes}. Refusing to overwrite — you may be \ - missing chunk files from a previous session. \ - Pass --force to override." - ); - return Err(WatchError::ShrinkRefused { - existing: existing_nodes, - new: new_nodes, + let new_node_arr = new_data.get("nodes").and_then(Value::as_array); + let new_count = new_node_arr.map_or(0, Vec::len); + if new_count >= existing_count { + return Ok(()); + } + + // A net shrink is legitimate — not a failed chunk — when every *lost* node + // belonged to a source re-extracted this run (a symbol removed from a + // rebuilt file) or carries no source_file. Only an unexplained loss (a node + // from a file we did NOT touch) refuses the write. (#1116) + if let Some(rebuilt) = rebuilt_sources + && let Some(existing_arr) = existing_nodes + { + let new_ids: HashSet<&str> = new_node_arr + .into_iter() + .flatten() + .filter_map(|n| n.get("id").and_then(Value::as_str)) + .collect(); + let all_lost_accounted = existing_arr.iter().all(|n| { + // A node still present is not "lost". + if n.get("id") + .and_then(Value::as_str) + .is_some_and(|id| new_ids.contains(id)) + { + return true; + } + match n.get("source_file").and_then(Value::as_str) { + None | Some("") => true, + Some(sf) => rebuilt.contains(sf) || rebuilt.contains(&norm_source_file(sf, None)), + } }); + if all_lost_accounted { + return Ok(()); + } + } + + if let Some(tmp_path) = tmp { + let _ = std::fs::remove_file(tmp_path); } - Ok(()) + eprintln!( + "[graphify] WARNING: new graph has {new_count} nodes but existing \ + graph.json has {existing_count}. Refusing to overwrite — you may be \ + missing chunk files from a previous session. \ + Pass --force to override." + ); + Err(WatchError::ShrinkRefused { + existing: existing_count, + new: new_count, + }) } diff --git a/crates/graphify-watch/src/watch_fn.rs b/crates/graphify-watch/src/watch_fn.rs index ee455b2..b3802f3 100644 --- a/crates/graphify-watch/src/watch_fn.rs +++ b/crates/graphify-watch/src/watch_fn.rs @@ -8,7 +8,7 @@ use graphify_detect::{is_ignored, load_graphifyignore}; use crate::constants::WATCHED_EXTENSIONS; use crate::error::WatchError; -use crate::notify::{graphify_out, notify_only}; +use crate::notify::notify_only; use crate::rebuild; /// Re-run AST extraction + build + optional cluster + report for code @@ -72,7 +72,7 @@ pub fn watch(watch_path: &Path, debounce: f64) -> Result<(), WatchError> { use notify_debouncer_full::{DebounceEventResult, new_debouncer, notify::RecursiveMode}; let debounce_dur = Duration::from_secs_f64(debounce); - let out_dir_name = graphify_out(); + let out_dir_name = graphify_security::graphify_out_name(); // Load .graphifyignore patterns ONCE at startup (mirrors gh-928 fix). let watch_root = watch_path diff --git a/crates/graphify-watch/tests/helpers.rs b/crates/graphify-watch/tests/helpers.rs index 87f8973..099b49a 100644 --- a/crates/graphify-watch/tests/helpers.rs +++ b/crates/graphify-watch/tests/helpers.rs @@ -2,6 +2,7 @@ #![allow(clippy::expect_used)] +use std::collections::HashSet; use std::fs; use std::process::Command; @@ -161,35 +162,35 @@ fn relativize_noop_on_non_object_payload() { fn check_shrink_allows_growth() { let existing = json!({"nodes": [{"id": "a"}]}); let new = json!({"nodes": [{"id": "a"}, {"id": "b"}]}); - assert!(check_shrink(false, &existing, &new, None, false).is_ok()); + assert!(check_shrink(false, &existing, &new, None, false, None).is_ok()); } #[test] fn check_shrink_allows_same() { let existing = json!({"nodes": [{"id": "a"}]}); let new = json!({"nodes": [{"id": "b"}]}); - assert!(check_shrink(false, &existing, &new, None, false).is_ok()); + assert!(check_shrink(false, &existing, &new, None, false, None).is_ok()); } #[test] fn check_shrink_refuses_shrink() { let existing = json!({"nodes": [{"id": "a"}, {"id": "b"}]}); let new = json!({"nodes": [{"id": "a"}]}); - assert!(check_shrink(false, &existing, &new, None, false).is_err()); + assert!(check_shrink(false, &existing, &new, None, false, None).is_err()); } #[test] fn check_shrink_force_overrides() { let existing = json!({"nodes": [{"id": "a"}, {"id": "b"}]}); let new = json!({"nodes": [{"id": "a"}]}); - assert!(check_shrink(true, &existing, &new, None, false).is_ok()); + assert!(check_shrink(true, &existing, &new, None, false, None).is_ok()); } #[test] fn check_shrink_no_existing_passes() { let existing = json!({"nodes": []}); let new = json!({"nodes": [{"id": "a"}]}); - assert!(check_shrink(false, &existing, &new, None, false).is_ok()); + assert!(check_shrink(false, &existing, &new, None, false, None).is_ok()); } #[test] @@ -199,7 +200,7 @@ fn check_shrink_cleans_up_tmp_file_on_failure() { fs::write(&tmp_path, "{}").expect("write fixture"); let existing = json!({"nodes": [{"id": "a"}, {"id": "b"}]}); let new = json!({"nodes": [{"id": "a"}]}); - assert!(check_shrink(false, &existing, &new, Some(&tmp_path), false).is_err()); + assert!(check_shrink(false, &existing, &new, Some(&tmp_path), false, None).is_err()); assert!(!tmp_path.exists(), "tmp file should be cleaned up"); } @@ -208,7 +209,7 @@ fn check_shrink_allows_explicit_deletions() { let existing = json!({"nodes": (0..100).map(|i| json!({"id": format!("n{i}")})).collect::>()}); let new = json!({"nodes": (0..80).map(|i| json!({"id": format!("n{i}")})).collect::>()}); - assert!(check_shrink(false, &existing, &new, None, true).is_ok()); + assert!(check_shrink(false, &existing, &new, None, true, None).is_ok()); } #[test] @@ -219,7 +220,7 @@ fn check_shrink_keeps_tmp_when_deletions_declared() { let existing = json!({"nodes": (0..100).map(|i| json!({"id": format!("n{i}")})).collect::>()}); let new = json!({"nodes": (0..80).map(|i| json!({"id": format!("n{i}")})).collect::>()}); - assert!(check_shrink(false, &existing, &new, Some(&tmp_path), true).is_ok()); + assert!(check_shrink(false, &existing, &new, Some(&tmp_path), true, None).is_ok()); assert!( tmp_path.exists(), "tmp file must NOT be deleted when shrink is intentional — caller is about to swap it into place" @@ -273,3 +274,33 @@ fn node_community_map_returns_empty_for_missing_nodes() { let map = node_community_map(&graph); assert!(map.is_empty()); } + +#[test] +fn check_shrink_allows_shrink_within_rebuilt_sources() { + // #1116: a symbol removed from a re-extracted file is a legitimate shrink — + // every lost node belongs to a rebuilt source, so the write proceeds. + let existing = json!({"nodes": [ + {"id": "a", "source_file": "m.py"}, + {"id": "b", "source_file": "m.py"}, + {"id": "c", "source_file": "other.py"}, + ], "links": []}); + let new = json!({"nodes": [ + {"id": "a", "source_file": "m.py"}, + {"id": "c", "source_file": "other.py"}, + ], "links": []}); + let rebuilt: HashSet = ["m.py".to_string()].into_iter().collect(); + assert!(check_shrink(false, &existing, &new, None, false, Some(&rebuilt)).is_ok()); +} + +#[test] +fn check_shrink_blocks_shrink_outside_rebuilt_sources() { + // A node lost from a file we did NOT re-extract (the failed-chunk signal) is + // still refused even with rebuilt_sources set. + let existing = json!({"nodes": [ + {"id": "a", "source_file": "m.py"}, + {"id": "z", "source_file": "untouched.py"}, + ], "links": []}); + let new = json!({"nodes": [{"id": "a", "source_file": "m.py"}], "links": []}); + let rebuilt: HashSet = ["m.py".to_string()].into_iter().collect(); + assert!(check_shrink(false, &existing, &new, None, false, Some(&rebuilt)).is_err()); +} diff --git a/crates/graphify-watch/tests/rebuild_pipeline.rs b/crates/graphify-watch/tests/rebuild_pipeline.rs index 0632f85..85f1688 100644 --- a/crates/graphify-watch/tests/rebuild_pipeline.rs +++ b/crates/graphify-watch/tests/rebuild_pipeline.rs @@ -11,6 +11,14 @@ use std::path::{Path, PathBuf}; use graphify_watch::{LockPolicy, RebuildOptions, rebuild_code}; +// GRAPHIFY_OUT isolation: these tests drive `rebuild_code` against per-test +// tempdirs and read the default `graphify-out/` output dir. They deliberately +// do not isolate `GRAPHIFY_OUT` — `cargo nextest` runs each test in its own +// process, no test in this crate mutates `GRAPHIFY_OUT`, and `#[serial]` would +// not guard against an ambient override (shared equally by every test here +// that asserts on `graphify-out`). The `#[serial]` marks further down isolate +// `set_current_dir`, not the environment. + /// Parse `graph.json` at `path` and collect the given string field from every node. fn node_field_set(path: &Path, field: &str) -> std::collections::HashSet { let value: serde_json::Value = @@ -212,6 +220,52 @@ fn rebuild_code_evicts_nodes_from_deleted_files() { ); } +#[test] +fn rebuild_code_evicts_removed_symbol_from_surviving_file() { + // #1116: a symbol removed from a re-extracted (not deleted) file is a + // legitimate shrink — `graphify update` must refresh the graph WITHOUT + // --force, because every lost node belongs to a rebuilt source. + let tmp = tempfile::tempdir().expect("tempdir"); + let corpus = tmp.path(); + fs::write( + corpus.join("auth.py"), + "def login(): pass\ndef logout(): pass\ndef reset(): pass\n", + ) + .expect("write auth.py"); + + let opts = RebuildOptions { + force: false, + no_cluster: false, + lock: LockPolicy::None, + }; + assert!(rebuild_code(corpus, None, opts).expect("first rebuild")); + + let graph_path = corpus.join("graphify-out").join("graph.json"); + let before = node_field_set(&graph_path, "label"); + assert!( + before.contains("reset()"), + "reset should be present before the edit" + ); + + // Remove one function from the surviving file and re-run a full update. + fs::write( + corpus.join("auth.py"), + "def login(): pass\ndef logout(): pass\n", + ) + .expect("rewrite auth.py"); + assert!( + rebuild_code(corpus, None, opts).expect("second rebuild without --force"), + "shrink-guard must allow a symbol removed from a rebuilt source" + ); + + let after = node_field_set(&graph_path, "label"); + assert!( + !after.contains("reset()"), + "removed symbol must be pruned without --force" + ); + assert!(after.contains("login()"), "surviving symbol must be kept"); +} + #[test] fn rebuild_code_with_force_flag() { let tmp = tempfile::tempdir().expect("tempdir"); diff --git a/graphify-py b/graphify-py index dce54a0..6d3c959 160000 --- a/graphify-py +++ b/graphify-py @@ -1 +1 @@ -Subproject commit dce54a007e2ad3eda41307abd669d5a8acac3813 +Subproject commit 6d3c9594e364d12f7c5da6f4cd95a3592ab710e6 diff --git a/mise.toml b/mise.toml index cfd4d04..3d68070 100644 --- a/mise.toml +++ b/mise.toml @@ -5,7 +5,7 @@ minimum_release_age = "7d" npm.package_manager = "bun" [tools] -"npm:markdownlint-cli" = "0.48" +"npm:markdownlint-cli" = "0.49" actionlint = "1.7" bun = "1" diff --git a/src/cli/args.rs b/src/cli/args.rs index a75cc72..d2b1166 100644 --- a/src/cli/args.rs +++ b/src/cli/args.rs @@ -126,6 +126,12 @@ pub(crate) enum Command { /// Model to use for community naming (default: backend default). #[arg(long)] model: Option, + /// Max community-label batches sent concurrently (#1390). + #[arg(long = "max-concurrency", default_value_t = 4)] + max_concurrency: usize, + /// Communities per LLM labeling call (#1390). + #[arg(long = "batch-size", default_value_t = 100)] + batch_size: usize, }, /// (Re)name communities with the configured LLM backend, regenerate report. @@ -150,6 +156,12 @@ pub(crate) enum Command { /// Model to use for community naming (default: backend default). #[arg(long)] model: Option, + /// Max community-label batches sent concurrently (#1390). + #[arg(long = "max-concurrency", default_value_t = 4)] + max_concurrency: usize, + /// Communities per LLM labeling call (#1390). + #[arg(long = "batch-size", default_value_t = 100)] + batch_size: usize, }, /// Manage custom LLM providers (`graphify provider `). @@ -199,6 +211,40 @@ pub(crate) enum Command { nodes: Vec, #[arg(long = "memory-dir", default_value = "graphify-out/memory")] memory_dir: PathBuf, + /// Work-memory signal: useful | `dead_end` | corrected (#1441). + #[arg(long)] + outcome: Option, + /// What the right answer was (pairs with `--outcome corrected`). + #[arg(long)] + correction: Option, + }, + + /// Aggregate graphify-out/memory/ outcomes into a deterministic lessons doc. + Reflect { + /// Memory directory (default: `/memory`). + #[arg(long = "memory-dir")] + memory_dir: Option, + /// Output lessons file (default: `/reflections/LESSONS.md`). + #[arg(long)] + out: Option, + /// graph.json for community grouping (default: auto-detect under ``). + #[arg(long)] + graph: Option, + /// `.graphify_analysis.json` override (default: sibling of the graph). + #[arg(long)] + analysis: Option, + /// `.graphify_labels.json` override (default: sibling of the graph). + #[arg(long)] + labels: Option, + /// Signal weight halves every N days. + #[arg(long = "half-life-days", default_value_t = graphify_reflect::DEFAULT_HALF_LIFE_DAYS)] + half_life_days: f64, + /// Distinct useful results to promote a node to preferred. + #[arg(long = "min-corroboration", default_value_t = graphify_reflect::DEFAULT_MIN_CORROBORATION)] + min_corroboration: usize, + /// Skip when LESSONS.md is already newer than every input. + #[arg(long = "if-stale")] + if_stale: bool, }, /// Check `needs_update` flag and notify if semantic re-extraction is pending. @@ -537,6 +583,16 @@ pub(crate) enum Command { #[command(subcommand)] cmd: PlatformCmd, }, + /// Install or uninstall the generic cross-framework Agent-Skills integration. + Agents { + #[command(subcommand)] + cmd: PlatformCmd, + }, + /// Alias for `agents` (the Agent-Skills ecosystem calls them "skills"). + Skills { + #[command(subcommand)] + cmd: PlatformCmd, + }, } /// Subcommands for the `hook` command group. diff --git a/src/cli/cluster_only.rs b/src/cli/cluster_only.rs index 3bedc1e..8b845ae 100644 --- a/src/cli/cluster_only.rs +++ b/src/cli/cluster_only.rs @@ -16,6 +16,10 @@ pub(crate) struct LabelOptions<'a> { pub model: Option<&'a str>, /// `graphify label` always (re)names even when a labels file exists. pub force_relabel: bool, + /// Max community-label batches sent concurrently (#1390). + pub max_concurrency: usize, + /// Communities per LLM labeling call (#1390). + pub batch_size: usize, } /// Rerun community detection on an existing graph.json and regenerate the report. @@ -177,6 +181,8 @@ pub(crate) fn cmd_cluster_only( opts.backend, opts.model, false, // quiet + opts.max_concurrency, + opts.batch_size, ); labels }; diff --git a/src/cli/dispatch.rs b/src/cli/dispatch.rs index f713b17..3bee562 100644 --- a/src/cli/dispatch.rs +++ b/src/cli/dispatch.rs @@ -8,6 +8,9 @@ use crate::cli; use crate::cli::args::Command; /// Dispatch a parsed [`Command`] to its handler function. +// Exhaustive command-routing match; one arm per subcommand reads clearer flat +// than split across helpers. +#[allow(clippy::too_many_lines)] pub(crate) fn dispatch(cmd: Command) -> Result<()> { match cmd { Command::Validate { path } => cli::validate::cmd_validate(&path), @@ -32,6 +35,7 @@ pub(crate) fn dispatch(cmd: Command) -> Result<()> { Command::Path { from, to, graph } => cli::query::cmd_path(&from, &to, graph.as_deref()), Command::Explain { node, graph } => cli::query::cmd_explain(&node, graph.as_deref()), cmd @ Command::SaveResult { .. } => dispatch_save_result(cmd), + cmd @ Command::Reflect { .. } => dispatch_reflect(cmd), Command::CheckUpdate { path } => cli::watch::cmd_check_update(&path), cmd @ Command::Tree { .. } => dispatch_tree(cmd), cmd @ Command::Extract { .. } => dispatch_extract(cmd), @@ -109,6 +113,10 @@ pub(crate) fn dispatch(cmd: Command) -> Result<()> { Command::TraeCn { cmd: c } => cli::install::cmd_platform("trae-cn", &c), Command::Hermes { cmd: c } => cli::install::cmd_platform("hermes", &c), Command::Devin { cmd: c } => cli::install::cmd_platform("devin", &c), + // `agents` and its `skills` alias share one amp-twin handler (#1432). + Command::Agents { cmd: c } | Command::Skills { cmd: c } => { + cli::install::cmd_agents_platform(&c) + } } } @@ -146,6 +154,8 @@ fn dispatch_cluster_only(cmd: Command) -> Result<()> { no_label, backend, model, + max_concurrency, + batch_size, force, ) = match cmd { Command::ClusterOnly { @@ -158,6 +168,8 @@ fn dispatch_cluster_only(cmd: Command) -> Result<()> { no_label, backend, model, + max_concurrency, + batch_size, } => ( path, no_viz, @@ -168,6 +180,8 @@ fn dispatch_cluster_only(cmd: Command) -> Result<()> { no_label, backend, model, + max_concurrency, + batch_size, false, ), Command::Label { @@ -179,6 +193,8 @@ fn dispatch_cluster_only(cmd: Command) -> Result<()> { min_community_size, backend, model, + max_concurrency, + batch_size, } => ( path, no_viz, @@ -189,6 +205,8 @@ fn dispatch_cluster_only(cmd: Command) -> Result<()> { false, backend, model, + max_concurrency, + batch_size, true, ), _ => unreachable!("dispatch_cluster_only invoked with wrong variant"), @@ -205,6 +223,8 @@ fn dispatch_cluster_only(cmd: Command) -> Result<()> { backend: backend.as_deref(), model: model.as_deref(), force_relabel: force, + max_concurrency, + batch_size, }, ) } @@ -230,11 +250,47 @@ fn dispatch_save_result(cmd: Command) -> Result<()> { query_type, nodes, memory_dir, + outcome, + correction, } = cmd else { unreachable!("dispatch_save_result invoked with wrong variant") }; - cli::save_result::cmd_save_result(&question, &answer, &query_type, &nodes, &memory_dir) + cli::save_result::cmd_save_result( + &question, + &answer, + &query_type, + &nodes, + &memory_dir, + outcome.as_deref(), + correction.as_deref(), + ) +} + +fn dispatch_reflect(cmd: Command) -> Result<()> { + let Command::Reflect { + memory_dir, + out, + graph, + analysis, + labels, + half_life_days, + min_corroboration, + if_stale, + } = cmd + else { + unreachable!("dispatch_reflect invoked with wrong variant") + }; + cli::reflect::cmd_reflect(cli::reflect::ReflectArgs { + memory_dir, + out, + graph, + analysis, + labels, + half_life_days, + min_corroboration, + if_stale, + }) } fn dispatch_tree(cmd: Command) -> Result<()> { diff --git a/src/cli/install.rs b/src/cli/install.rs index 22e5cd4..6bd3309 100644 --- a/src/cli/install.rs +++ b/src/cli/install.rs @@ -9,18 +9,42 @@ use std::io::IsTerminal; use anyhow::Result; use graphify_hooks::platform::{ - agents_install, agents_uninstall, amp_install, amp_uninstall, antigravity_install, - antigravity_uninstall, claude_install, claude_uninstall, codebuddy_install, - codebuddy_uninstall, copilot_install, copilot_uninstall, cursor_install, cursor_uninstall, - devin_install, devin_project_install, devin_project_uninstall, devin_uninstall, gemini_install, - gemini_uninstall, install_kilo_skill_and_command, install_platform_skill, - install_platform_skill_project, kilo_install, kilo_uninstall, kiro_install, kiro_uninstall, - pi_install, pi_uninstall, uninstall_all, uninstall_platform_skill_project, vscode_install, - vscode_uninstall, + agents_install, agents_platform_install, agents_platform_uninstall, agents_uninstall, + amp_install, amp_uninstall, antigravity_install, antigravity_uninstall, claude_install, + claude_uninstall, codebuddy_install, codebuddy_uninstall, copilot_install, copilot_uninstall, + cursor_install, cursor_uninstall, devin_install, devin_project_install, + devin_project_uninstall, devin_uninstall, gemini_install, gemini_uninstall, + install_kilo_skill_and_command, install_platform_skill, install_platform_skill_project, + kilo_install, kilo_uninstall, kiro_install, kiro_uninstall, pi_install, pi_uninstall, + uninstall_all, uninstall_platform_skill_project, vscode_install, vscode_uninstall, }; use crate::cli::args::PlatformCmd; +/// Resolve a CLI platform alias to its canonical platform name. `skills` is the +/// friendly alias for the generic `agents` platform (#1432). Mirrors Python +/// `_canonical_platform`. +#[must_use] +pub(crate) fn canonical_platform(platform: &str) -> &str { + if platform == "skills" { + "agents" + } else { + platform + } +} + +/// `graphify agents install|uninstall` (and the `skills` alias): the amp-twin +/// of the generic Agent-Skills target — skill at `~/.agents/skills` plus the +/// always-on `AGENTS.md` section (#1432). +pub(crate) fn cmd_agents_platform(cmd: &PlatformCmd) -> Result<()> { + let cwd = std::env::current_dir()?; + let msg = match cmd { + PlatformCmd::Install { .. } => agents_platform_install(&cwd)?, + PlatformCmd::Uninstall { .. } => agents_platform_uninstall(&cwd)?, + }; + println!("{msg}"); + Ok(()) +} /// Print the amber-brain banner shown at the top of `graphify install`. /// /// TTY-only (suppressed in CI logs and pipes) and best-effort — it never fails @@ -60,6 +84,8 @@ fn print_banner() { /// under the current working directory instead of the user home /// directory. Mirrors the Python `--project` flag (#931). pub(crate) fn cmd_install(platform: &str, project: bool) -> Result<()> { + // Resolve `skills` -> `agents` before routing (#1432). + let platform = canonical_platform(platform); print_banner(); // Antigravity's project install lays down the full always-on layer // (skill + rules + workflow), not just the skill — matching graphify-py's diff --git a/src/cli/mod.rs b/src/cli/mod.rs index 3ccdb19..b01ab59 100644 --- a/src/cli/mod.rs +++ b/src/cli/mod.rs @@ -22,6 +22,7 @@ pub(crate) mod merge_chunks; pub(crate) mod provider; pub(crate) mod prs; pub(crate) mod query; +pub(crate) mod reflect; pub(crate) mod save_result; pub(crate) mod serve; pub(crate) mod tree; @@ -57,16 +58,17 @@ pub(crate) fn run() -> Result<()> { /// Return the graphify output directory, honouring the `GRAPHIFY_OUT` env var. /// -/// Python equivalent: `os.environ.get("GRAPHIFY_OUT", "graphify-out")` at -/// `__main__.py:19`. Accepts a relative name ("graphify-out-feature") or an -/// absolute path ("/shared/graphify-out"). +/// Thin re-export of [`graphify_security::graphify_out`] — the single source of +/// truth for the output-dir override (Python `graphify.paths`). Accepts a +/// relative name (`graphify-out-feature`) or an absolute path +/// (`/shared/graphify-out`). pub(crate) fn graphify_out_dir() -> PathBuf { - PathBuf::from(std::env::var("GRAPHIFY_OUT").unwrap_or_else(|_| "graphify-out".to_owned())) + graphify_security::graphify_out() } -/// Return the default graph.json path, honouring `GRAPHIFY_OUT`. +/// Return the default `graph.json` path, honouring `GRAPHIFY_OUT`. pub(crate) fn default_graph_path() -> PathBuf { - graphify_out_dir().join("graph.json") + graphify_security::default_graph_json() } /// Load and parse `graph.json` into a [`graphify_build::Graph`]. diff --git a/src/cli/reflect.rs b/src/cli/reflect.rs new file mode 100644 index 0000000..21d8b1e --- /dev/null +++ b/src/cli/reflect.rs @@ -0,0 +1,82 @@ +//! `reflect` command — aggregate `graphify-out/memory/` outcomes into a +//! deterministic lessons doc (`graphify-out/reflections/LESSONS.md`). +//! +//! Mirrors Python's `reflect` command at `__main__.py`. Output directory is +//! honoured via `GRAPHIFY_OUT` (see [`crate::cli::graphify_out_dir`]). + +use std::path::PathBuf; + +use anyhow::Result; + +use crate::cli::graphify_out_dir; + +/// Parsed `graphify reflect` arguments. +pub(crate) struct ReflectArgs { + /// Memory directory; defaults to `/memory`. + pub memory_dir: Option, + /// Output lessons path; defaults to `/reflections/LESSONS.md`. + pub out: Option, + /// `graph.json`; auto-detected under `` when absent. + pub graph: Option, + /// `.graphify_analysis.json` override. + pub analysis: Option, + /// `.graphify_labels.json` override. + pub labels: Option, + /// Time-decay half-life in days. + pub half_life_days: f64, + /// Distinct useful results to promote a node to preferred. + pub min_corroboration: usize, + /// Skip the rebuild when `LESSONS.md` is already current. + pub if_stale: bool, +} + +/// Run `graphify reflect`, writing the lessons doc and printing a summary. +/// +/// # Errors +/// +/// Returns an error if the lessons file cannot be written. +pub(crate) fn cmd_reflect(args: ReflectArgs) -> Result<()> { + let out_dir = graphify_out_dir(); + let memory_dir = args.memory_dir.unwrap_or_else(|| out_dir.join("memory")); + let out_path = args + .out + .unwrap_or_else(|| out_dir.join("reflections").join("LESSONS.md")); + + // Auto-detect graph.json under the output dir when --graph is not given, so + // lessons are grouped by community without the user wiring it up. + let graph = args.graph.or_else(|| { + let default_graph = out_dir.join("graph.json"); + default_graph.exists().then_some(default_graph) + }); + + if args.if_stale && graphify_reflect::lessons_fresh(&out_path, &memory_dir, graph.as_deref()) { + println!( + "Lessons already up to date -> {} (skipped; omit --if-stale to force)", + out_path.display() + ); + return Ok(()); + } + + let graphs = graphify_reflect::GraphPaths { + graph: graph.as_deref(), + analysis: args.analysis.as_deref(), + labels: args.labels.as_deref(), + }; + let (path, agg) = graphify_reflect::reflect( + &memory_dir, + &out_path, + graphs, + chrono::Utc::now(), + args.half_life_days, + args.min_corroboration, + )?; + println!( + "Reflected {} memories ({} useful, {} dead ends, {} corrected) -> {}", + agg.total, + agg.counts.useful, + agg.counts.dead_end, + agg.counts.corrected, + path.display() + ); + Ok(()) +} diff --git a/src/cli/save_result.rs b/src/cli/save_result.rs index a95350a..7738841 100644 --- a/src/cli/save_result.rs +++ b/src/cli/save_result.rs @@ -12,10 +12,19 @@ pub(crate) fn cmd_save_result( query_type: &str, nodes: &[String], memory_dir: &std::path::Path, + outcome: Option<&str>, + correction: Option<&str>, ) -> Result<()> { let source_nodes = if nodes.is_empty() { None } else { Some(nodes) }; - let path = - graphify_ingest::save_query_result(question, answer, memory_dir, query_type, source_nodes)?; + let path = graphify_ingest::save_query_result( + question, + answer, + memory_dir, + query_type, + source_nodes, + outcome, + correction, + )?; println!("Saved to {}", path.display()); Ok(()) } diff --git a/tests/cli_commands.rs b/tests/cli_commands.rs index d0b8c3e..2d0131f 100644 --- a/tests/cli_commands.rs +++ b/tests/cli_commands.rs @@ -121,6 +121,39 @@ fn extract_runs_without_backend_writes_graph() { ); } +#[test] +fn extract_writes_to_graphify_out_env() { + // #1423: `graphify extract` honours GRAPHIFY_OUT for where it WRITES, not + // only where readers look. Code-only corpus, so no LLM backend is needed. + let dir = tempfile::tempdir().unwrap(); + fs::write( + dir.path().join("m.py"), + "def a():\n return b()\n\n\ndef b():\n return 1\n", + ) + .unwrap(); + cli_no_backend() + .current_dir(dir.path()) + .env("GRAPHIFY_OUT", "custom-out") + .arg("extract") + .arg(".") + .arg("--no-cluster") + .assert() + .success(); + + assert!( + dir.path().join("custom-out").join("graph.json").exists(), + "graph.json not written to the GRAPHIFY_OUT override" + ); + assert!( + dir.path().join("custom-out").join("manifest.json").exists(), + "manifest.json not written to the GRAPHIFY_OUT override" + ); + assert!( + !dir.path().join("graphify-out").exists(), + "extract ignored GRAPHIFY_OUT and wrote graphify-out/" + ); +} + #[test] fn extract_mode_deep_prints_banner_and_succeeds() { let dir = tempfile::tempdir().unwrap(); @@ -882,6 +915,32 @@ fn label_accepts_model_flag() { ); } +#[test] +fn label_accepts_concurrency_flags() { + // #1390: `label --max-concurrency --batch-size` parse and thread through to + // the labeling path. With no backend the run degrades to placeholders, + // proving the flags are accepted end-to-end without error. + let dir = tempfile::tempdir().unwrap(); + write_python_project(dir.path()); + cli_no_backend() + .arg("extract") + .arg(dir.path()) + .arg("--no-cluster") + .assert() + .success(); + + cli_no_backend() + .arg("label") + .arg(dir.path()) + .arg("--max-concurrency") + .arg("8") + .arg("--batch-size") + .arg("50") + .arg("--no-viz") + .assert() + .success(); +} + /// #1347/#1350: a no-op incremental `extract --no-cluster` re-run must leave /// graph.json byte-identical. The first run persists `manifest.json` (parity with /// graphify-py `__main__.py:4492`), so the second run takes the incremental path; @@ -934,3 +993,84 @@ fn extract_no_cluster_incremental_noop_preserves_existing_graph() { ); assert_eq!(after, before, "no-op incremental run changed graph.json"); } + +// ── reflect (#1441) ────────────────────────────────────────────────────────── + +#[test] +fn reflect_end_to_end_writes_lessons() { + let dir = tempfile::tempdir().unwrap(); + cli() + .current_dir(dir.path()) + .env_remove("GRAPHIFY_OUT") + .args([ + "save-result", + "--question", + "how does auth work?", + "--answer", + "JWT", + "--nodes", + "AuthMiddleware", + "--outcome", + "useful", + ]) + .assert() + .success(); + cli() + .current_dir(dir.path()) + .env_remove("GRAPHIFY_OUT") + .arg("reflect") + .assert() + .success() + .stdout(contains("Reflected 1 memories")); + let lessons = dir + .path() + .join("graphify-out") + .join("reflections") + .join("LESSONS.md"); + assert!(lessons.exists()); + assert!( + fs::read_to_string(&lessons) + .unwrap() + .contains("`AuthMiddleware`") + ); +} + +#[test] +fn reflect_cold_start_writes_empty_lessons() { + let dir = tempfile::tempdir().unwrap(); + cli() + .current_dir(dir.path()) + .env_remove("GRAPHIFY_OUT") + .arg("reflect") + .assert() + .success() + .stdout(contains("Reflected 0 memories")); + let lessons = dir + .path() + .join("graphify-out") + .join("reflections") + .join("LESSONS.md"); + assert!( + fs::read_to_string(&lessons) + .unwrap() + .contains("from 0 session memories") + ); +} + +#[test] +fn save_result_rejects_bad_outcome() { + let dir = tempfile::tempdir().unwrap(); + cli() + .current_dir(dir.path()) + .args([ + "save-result", + "--question", + "q", + "--answer", + "a", + "--outcome", + "great", + ]) + .assert() + .failure(); +} diff --git a/tests/cli_install.rs b/tests/cli_install.rs index d32a654..5da0b0f 100644 --- a/tests/cli_install.rs +++ b/tests/cli_install.rs @@ -233,3 +233,94 @@ fn codebuddy_listed_in_help() { .success() .stdout(predicates::str::contains("codebuddy")); } + +// ── #1432: generic `agents` platform + `skills` alias ──────────────────────── + +#[test] +fn install_platform_agents_writes_skill_only() { + // `graphify install --platform agents` writes ~/.agents/skills, no AGENTS.md. + let project = tempfile::tempdir().unwrap(); + let home = tempfile::tempdir().unwrap(); + cli() + .current_dir(project.path()) + .env("HOME", home.path()) + .env_remove("CLAUDE_CONFIG_DIR") + .arg("install") + .arg("--platform") + .arg("agents") + .assert() + .success(); + assert!( + home.path() + .join(".agents/skills/graphify/SKILL.md") + .exists() + ); + assert!(!project.path().join("AGENTS.md").exists()); +} + +#[test] +fn install_platform_skills_alias_resolves_to_agents() { + // `--platform skills` is the friendly alias for `agents` (#1432). + let project = tempfile::tempdir().unwrap(); + let home = tempfile::tempdir().unwrap(); + cli() + .current_dir(project.path()) + .env("HOME", home.path()) + .env_remove("CLAUDE_CONFIG_DIR") + .arg("install") + .arg("--platform") + .arg("skills") + .assert() + .success(); + assert!( + home.path() + .join(".agents/skills/graphify/SKILL.md") + .exists() + ); +} + +#[test] +fn agents_install_writes_skill_and_agents_md() { + // `graphify agents install` is the amp-twin: skill + AGENTS.md. + let project = tempfile::tempdir().unwrap(); + let home = tempfile::tempdir().unwrap(); + cli() + .current_dir(project.path()) + .env("HOME", home.path()) + .env_remove("CLAUDE_CONFIG_DIR") + .arg("agents") + .arg("install") + .assert() + .success(); + assert!( + home.path() + .join(".agents/skills/graphify/SKILL.md") + .exists() + ); + assert!(project.path().join("AGENTS.md").exists()); +} + +#[test] +fn skills_install_aliases_agents_subcommand() { + let project = tempfile::tempdir().unwrap(); + let home = tempfile::tempdir().unwrap(); + cli() + .current_dir(project.path()) + .env("HOME", home.path()) + .env_remove("CLAUDE_CONFIG_DIR") + .arg("skills") + .arg("install") + .assert() + .success(); + assert!( + home.path() + .join(".agents/skills/graphify/SKILL.md") + .exists() + ); + assert!(project.path().join("AGENTS.md").exists()); +} + +#[test] +fn agents_uninstall_runs() { + uninstall_runs("agents"); +}