Skip to content

Commit e199ef3

Browse files
hyperpolymathclaude
andcommitted
feat(assail): word-boundary detector for UnboundedAllocation (Task #25)
Core refactor: substring `contains(...)` -> word-boundary regex for the keyword-based part of the UnboundedAllocation heuristic. Problems solved: 1. FP closed — the detector tripped on its own variable names (`has_unbounded_allocations`, `unbounded_vec_patterns`) and on legitimate `tokio::sync::mpsc::unbounded_channel` usage. Word- boundary regex (`\bunbounded\b`) does not match when the word sits inside a longer identifier (trailing `_` is a word char). 2. FN closed — `code_only.contains("limit")` disarmed the read_to_* check for any file containing `value_delimiter`, `delimiter`, or any other unrelated token with `limit` as a substring. Observed on src/main.rs (5 unbounded fs::read_to_string calls that the old check silently disarmed). Switched to `(?i)\blimit` — matches `limit`, `Limit`, `LIMIT`, `READ_LIMIT`, `limit_bytes`, and other word-starting-with-limit forms, does NOT match interior-substring `delimiter`/`sublimit`/etc. Helpers added (all OnceLock-initialised regex, compiled once per scan): has_unbounded_keyword — \b(unbounded|no_bound|no_limit| boundless|unlimited|unconstrained)\b has_infinite_word — \binfinite\b has_recursion_word — \brecursion\b has_limit_word — (?i)\blimit Also: added `read_report_bounded()` helper in main.rs and routed the 5 previously-FN-disarmed fs::read_to_string sites through it (64 MiB cap on report JSON reads). Self-scan numbers: Before refactor: UnboundedAllocation 1 critical (analyzer.rs self-reference FP) + FN'd main.rs After refactor: UnboundedAllocation 0. Total critical: 2 -> 1 (remaining 1 is tests/fixtures/example.py pickle.load — an intentional vulnerability fixture). Total findings: 11 -> 10. Regression tests added (4 new, all passing, 190/190 lib tests pass): analyze_rust_unbounded_as_identifier_substring_does_not_fire analyze_rust_unbounded_as_bare_identifier_still_fires analyze_rust_unlimited_does_not_disarm_limit_check analyze_rust_uppercase_limit_const_disarms_read_check This is Task #25 step 1 of the plan — detector refinement on assail before scaling out with Chapel. The substring -> regex move is the first structural narrowing; AST-level pattern detection (actual `Vec::with_capacity(n)` with n-from-input, loop-push without break) remains the target for zero-FN on a curated reference corpus. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 804a25d commit e199ef3

2 files changed

Lines changed: 210 additions & 21 deletions

File tree

src/assail/analyzer.rs

Lines changed: 186 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,70 @@ fn read_bounded(path: &Path, limit: u64) -> Option<String> {
4141
Some(buf)
4242
}
4343

44+
// ═══════════════════════════════════════════════════════════════════════
45+
// UnboundedAllocation detector — word-boundary keyword matchers
46+
// ═══════════════════════════════════════════════════════════════════════
47+
//
48+
// Historical context: the detector originally did substring matches like
49+
// `code_only.contains("unbounded")`. This fires on any identifier
50+
// containing those letters — including the detector's own variable names
51+
// (`has_unbounded_allocations`, `unbounded_vec_patterns`) and normal
52+
// tokio usage (`tokio::sync::mpsc::unbounded_channel`). On a self-scan,
53+
// analyzer.rs was the last residual Critical after the `.take(LIMIT)`
54+
// sweep closed the real unbounded reads.
55+
//
56+
// Fix: word-boundary regex matches. `\bunbounded\b` does NOT match
57+
// `unbounded_channel` or `unbounded_vec_patterns` because `_` is a word
58+
// character in regex, so the trailing `_` blocks the closing boundary.
59+
// It DOES match bare `unbounded` (e.g., `fn unbounded()`, `type T = Unbounded;`),
60+
// which is what we actually want the alarm for.
61+
//
62+
// The regexes are compiled once via OnceLock and reused across every
63+
// file in a scan — per-call overhead is a single pointer read.
64+
65+
/// Match the unbounded-allocation alarm keywords as whole words only.
66+
/// Keywords: unbounded, no_bound, no_limit, boundless, unlimited, unconstrained.
67+
static UNBOUNDED_KEYWORDS_RE: OnceLock<Regex> = OnceLock::new();
68+
69+
/// Match `infinite` as a whole word (not part of `is_infinite`, `infinite_loop_v2`, etc.).
70+
static INFINITE_WORD_RE: OnceLock<Regex> = OnceLock::new();
71+
72+
/// Match `recursion` as a whole word.
73+
static RECURSION_WORD_RE: OnceLock<Regex> = OnceLock::new();
74+
75+
/// Match `limit` as a word prefix, case-insensitively, to disarm
76+
/// read_to_* checks when the file has explicit bounds. Matches `limit`,
77+
/// `Limit`, `LIMIT`, `READ_LIMIT`, `limit_bytes`, `limits`; does NOT match
78+
/// `unlimited` or `sublimit` (those start inside a word).
79+
static LIMIT_WORD_RE: OnceLock<Regex> = OnceLock::new();
80+
81+
fn has_unbounded_keyword(code: &str) -> bool {
82+
UNBOUNDED_KEYWORDS_RE
83+
.get_or_init(|| {
84+
Regex::new(r"\b(unbounded|no_bound|no_limit|boundless|unlimited|unconstrained)\b")
85+
.expect("static regex is valid")
86+
})
87+
.is_match(code)
88+
}
89+
90+
fn has_infinite_word(code: &str) -> bool {
91+
INFINITE_WORD_RE
92+
.get_or_init(|| Regex::new(r"\binfinite\b").expect("static regex is valid"))
93+
.is_match(code)
94+
}
95+
96+
fn has_recursion_word(code: &str) -> bool {
97+
RECURSION_WORD_RE
98+
.get_or_init(|| Regex::new(r"\brecursion\b").expect("static regex is valid"))
99+
.is_match(code)
100+
}
101+
102+
fn has_limit_word(code: &str) -> bool {
103+
LIMIT_WORD_RE
104+
.get_or_init(|| Regex::new(r"(?i)\blimit").expect("static regex is valid"))
105+
.is_match(code)
106+
}
107+
44108
// Thread-local accumulators for migration analysis.
45109
// These collect deprecated/modern API counts across all files during a single
46110
// analyze() run, then get consumed by build_migration_metrics().
@@ -889,23 +953,23 @@ impl Analyzer {
889953
// of the explicit-keyword / tiny-capacity / unlimited-read
890954
// signals below, which remain specific enough to be useful.
891955
//
892-
// `read_to_*` is disarmed by either the lexical `limit` token
956+
// `read_to_*` is disarmed by either a `limit`-prefixed word
957+
// (case-insensitive, e.g. `LIMIT`, `READ_LIMIT`, `limit_bytes`)
893958
// OR `.take(` in the same file — both are valid bounded-read
894959
// patterns; `.take(N).read_to_end(&mut buf)` is the canonical
895-
// idiom.
896-
let read_is_bounded =
897-
code_only.contains("limit") || code_only.contains(".take(");
898-
let has_unbounded_allocations = code_only.contains("unbounded")
899-
|| code_only.contains("no_bound")
900-
|| code_only.contains("no_limit")
901-
|| code_only.contains("boundless")
902-
|| code_only.contains("unlimited")
903-
|| code_only.contains("unconstrained")
960+
// idiom. Word-boundary match avoids disarming on `unlimited`.
961+
let read_is_bounded = has_limit_word(&code_only) || code_only.contains(".take(");
962+
// Keyword alarms use word-boundary regex so the detector's own
963+
// variable names (`has_unbounded_allocations`, `unbounded_vec_*`)
964+
// and legitimate tokio types (`unbounded_channel`) don't trip
965+
// the substring heuristic. Bare keyword usage as an identifier
966+
// still fires, which is the intended signal.
967+
let has_unbounded_allocations = has_unbounded_keyword(&code_only)
904968
// `infinite` matches Rust std `f64::is_infinite()`, which is
905969
// benign. Require the word in a non-method-call context.
906-
|| (code_only.contains("infinite") && !code_only.contains("is_infinite"))
970+
|| (has_infinite_word(&code_only) && !code_only.contains("is_infinite"))
907971
// Unterminated recursion lacking any depth guard.
908-
|| (code_only.contains("recursion") && !code_only.contains("depth"))
972+
|| (has_recursion_word(&code_only) && !code_only.contains("depth"))
909973
// Suspiciously small initial capacity for a growing vector.
910974
|| code_only.contains("with_capacity(0)")
911975
|| code_only.contains("with_capacity(1)")
@@ -6401,4 +6465,114 @@ pub fn load(path: &str) -> std::io::Result<String> {
64016465
"unbounded read_to_string in production must still fire"
64026466
);
64036467
}
6468+
6469+
// ────────────────────────────────────────────────────────────────
6470+
// Word-boundary detector regression tests (Task #25 — zero-FN gate)
6471+
//
6472+
// Lock in that the substring -> word-boundary refactor does not
6473+
// reintroduce self-reference FPs or drop real signal.
6474+
// ────────────────────────────────────────────────────────────────
6475+
6476+
#[test]
6477+
fn analyze_rust_unbounded_as_identifier_substring_does_not_fire() {
6478+
// Self-reference FP regression: the detector's own variable
6479+
// names (and tokio's `unbounded_channel`) previously tripped
6480+
// the substring check. Word-boundary regex must not match
6481+
// these because the trailing `_` is a word char.
6482+
let src = "\
6483+
use tokio::sync::mpsc;
6484+
6485+
pub fn make_channel() -> (mpsc::UnboundedSender<u8>, mpsc::UnboundedReceiver<u8>) {
6486+
mpsc::unbounded_channel()
6487+
}
6488+
6489+
pub fn analyze(body: &str) -> bool {
6490+
let has_unbounded_allocations = body.contains(\"x\");
6491+
let unbounded_vec_patterns = body.len();
6492+
let unbounded_string_patterns = unbounded_vec_patterns * 2;
6493+
has_unbounded_allocations || unbounded_string_patterns > 0
6494+
}
6495+
";
6496+
let findings = analyze_rust_file("src/lib.rs", src);
6497+
let ub_findings: Vec<_> = findings
6498+
.iter()
6499+
.filter(|wp| wp.category == WeakPointCategory::UnboundedAllocation)
6500+
.collect();
6501+
assert!(
6502+
ub_findings.is_empty(),
6503+
"identifiers containing `unbounded` as substring must not fire: {:?}",
6504+
ub_findings
6505+
);
6506+
}
6507+
6508+
#[test]
6509+
fn analyze_rust_unbounded_as_bare_identifier_still_fires() {
6510+
// Sanity: when `unbounded` is actually a bare word/identifier
6511+
// (not inside a longer name), we still flag it.
6512+
let src = "\
6513+
pub fn unbounded() -> Vec<u8> {
6514+
Vec::new()
6515+
}
6516+
";
6517+
let findings = analyze_rust_file("src/lib.rs", src);
6518+
let ub_findings: Vec<_> = findings
6519+
.iter()
6520+
.filter(|wp| wp.category == WeakPointCategory::UnboundedAllocation)
6521+
.collect();
6522+
assert!(
6523+
!ub_findings.is_empty(),
6524+
"bare `unbounded` identifier should still fire the alarm"
6525+
);
6526+
}
6527+
6528+
#[test]
6529+
fn analyze_rust_unlimited_does_not_disarm_limit_check() {
6530+
// `unlimited` must NOT disarm read_to_string bound check — the
6531+
// word-prefix regex for `limit` should only match at word
6532+
// boundaries. Previously the substring `.contains("limit")`
6533+
// disarmed via the tail of `unlimited`.
6534+
let src = "\
6535+
pub fn slurp_unlimited(path: &str) -> std::io::Result<String> {
6536+
std::fs::read_to_string(path)
6537+
}
6538+
";
6539+
let findings = analyze_rust_file("src/read.rs", src);
6540+
let ub_findings: Vec<_> = findings
6541+
.iter()
6542+
.filter(|wp| wp.category == WeakPointCategory::UnboundedAllocation)
6543+
.collect();
6544+
assert!(
6545+
!ub_findings.is_empty(),
6546+
"`unlimited` in an identifier must not disarm the read check — \
6547+
unbounded read_to_string is still unbounded: {:?}",
6548+
ub_findings
6549+
);
6550+
}
6551+
6552+
#[test]
6553+
fn analyze_rust_uppercase_limit_const_disarms_read_check() {
6554+
// A `const READ_LIMIT: u64` should disarm the read_to_string
6555+
// check via the case-insensitive \blimit regex.
6556+
let src = "\
6557+
use std::io::Read;
6558+
6559+
const READ_LIMIT: u64 = 64 * 1024;
6560+
6561+
pub fn load(path: &str) -> std::io::Result<String> {
6562+
let mut buf = String::new();
6563+
std::fs::File::open(path)?.take(READ_LIMIT).read_to_string(&mut buf)?;
6564+
Ok(buf)
6565+
}
6566+
";
6567+
let findings = analyze_rust_file("src/read.rs", src);
6568+
let ub_findings: Vec<_> = findings
6569+
.iter()
6570+
.filter(|wp| wp.category == WeakPointCategory::UnboundedAllocation)
6571+
.collect();
6572+
assert!(
6573+
ub_findings.is_empty(),
6574+
"`.take(READ_LIMIT)` plus `READ_LIMIT` constant must disarm: {:?}",
6575+
ub_findings
6576+
);
6577+
}
64046578
}

src/main.rs

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -49,10 +49,27 @@ use clap::{CommandFactory, Parser, Subcommand};
4949
use clap_complete::{generate, Shell};
5050
use clap_complete_nushell::Nushell;
5151
use std::collections::HashMap;
52-
use std::fs;
53-
use std::io::{self, Write};
52+
use std::fs::{self, File};
53+
use std::io::{self, Read, Write};
5454
use std::path::{Path, PathBuf};
5555
use std::time::Duration;
56+
57+
/// Upper bound on report JSON reads in the CLI. Reports are aggregated
58+
/// scan outputs; 64 MiB is two orders of magnitude beyond any realistic
59+
/// size and bounds tampered or malformed input before parsing.
60+
const REPORT_FILE_READ_LIMIT: u64 = 64 * 1024 * 1024;
61+
62+
/// Read a file into a String, capped at `limit` bytes. Silent-truncates
63+
/// if the file is larger; returns I/O errors as-is via `?`.
64+
fn read_report_bounded(path: &Path) -> Result<String> {
65+
let mut buf = String::new();
66+
File::open(path)
67+
.with_context(|| format!("opening {}", path.display()))?
68+
.take(REPORT_FILE_READ_LIMIT)
69+
.read_to_string(&mut buf)
70+
.with_context(|| format!("reading {}", path.display()))?;
71+
Ok(buf)
72+
}
5673
use types::*;
5774

5875
macro_rules! qprintln {
@@ -1609,7 +1626,7 @@ fn run_main() -> Result<()> {
16091626
report_path.display()
16101627
);
16111628

1612-
let content = fs::read_to_string(&report_path)?;
1629+
let content = read_report_bounded(&report_path)?;
16131630
let crash: CrashReport = serde_json::from_str(&content)?;
16141631

16151632
let signatures = signatures::detect_signatures(&crash);
@@ -1633,7 +1650,7 @@ fn run_main() -> Result<()> {
16331650
}
16341651

16351652
Commands::Report { report } => {
1636-
let content = fs::read_to_string(&report)?;
1653+
let content = read_report_bounded(&report)?;
16371654
let assault_report: AssaultReport = serde_json::from_str(&content)?;
16381655
if !cli.quiet {
16391656
report::print_report(
@@ -1646,13 +1663,13 @@ fn run_main() -> Result<()> {
16461663
}
16471664

16481665
Commands::Tui { report } => {
1649-
let content = fs::read_to_string(&report)?;
1666+
let content = read_report_bounded(&report)?;
16501667
let assault_report: AssaultReport = serde_json::from_str(&content)?;
16511668
ReportTui::run(&assault_report)?;
16521669
}
16531670

16541671
Commands::Gui { report } => {
1655-
let content = fs::read_to_string(&report)?;
1672+
let content = read_report_bounded(&report)?;
16561673
let assault_report: AssaultReport = serde_json::from_str(&content)?;
16571674
report::ReportGui::run(assault_report)?;
16581675
}
@@ -1865,9 +1882,7 @@ fn run_main() -> Result<()> {
18651882
create_issues,
18661883
github_owner,
18671884
} => {
1868-
let content = fs::read_to_string(&report_path).with_context(|| {
1869-
format!("reading assemblyline report {}", report_path.display())
1870-
})?;
1885+
let content = read_report_bounded(&report_path)?;
18711886
let asmline_report: assemblyline::AssemblylineReport =
18721887
serde_json::from_str(&content)
18731888
.with_context(|| "parsing assemblyline report JSON")?;

0 commit comments

Comments
 (0)