From 3507a27f89c444c5c75270786a207314ce59e184 Mon Sep 17 00:00:00 2001 From: Bion Howard Date: Tue, 9 Jun 2026 18:59:24 -0400 Subject: [PATCH] feat: Java, Kotlin, and Swift component extraction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Java and Swift are tree-sitter formatters (treesitter/java.rs, treesitter/swift.rs) emitting source-text slices validated by the legacy mini-patterns: Java's annotation/class/method/two-word-prototype rules with the ' {\n' body gate, Swift's column-0 type headers and modifier-free func/init signatures. Kotlin is a procedural line scanner (extract/kotlin.rs): the community grammar (tree-sitter-kotlin-ng) cannot recover from the deliberately invalid constructs in the acceptance fixture — one ERROR node swallows the rest of the file — while the legacy pattern is line-anchored and keeps going. The scanner reproduces the legacy semantics: type headers captured to a brace/blank-line/column-0 stop, last-'fun '-first matching with the ' -> Word)' tail that preserves function-type parameters, enum entry lines fused onto the following fun, and the strip-comments-then-rstrip ordering whose interplay decides whether a closing paren survives a trailing line comment. All three fixture component sets match the legacy goldens byte-for-byte; trees_v1 goldens regenerated with the three parsers unstubbed. Deep-nesting robustness tests extended to the new languages. All visitors use explicit heap stacks (no AST-depth recursion). 
Co-Authored-By: Claude Fable 5 --- Cargo.lock | 22 ++ Cargo.toml | 2 + crates/tree_plus_core/Cargo.toml | 2 + crates/tree_plus_core/examples/dump_ast.rs | 3 + crates/tree_plus_core/src/extract/kotlin.rs | 260 +++++++++++++++++ crates/tree_plus_core/src/extract/mod.rs | 14 +- .../src/extract/treesitter/java.rs | 265 ++++++++++++++++++ .../src/extract/treesitter/mod.rs | 2 + .../src/extract/treesitter/swift.rs | 154 ++++++++++ crates/tree_plus_core/tests/golden_parity.rs | 3 + crates/tree_plus_core/tests/robustness.rs | 4 +- docs/language-roadmap.md | 16 +- tests/golden/generate_legacy_goldens.py | 4 +- tests/golden/legacy/trees/repo_concise.txt | 58 ++-- .../golden/legacy/trees_v1/more_languages.txt | 121 ++++++++ .../legacy/trees_v1/more_languages_group1.txt | 91 ++++++ .../legacy/trees_v1/more_languages_group3.txt | 30 ++ tests/golden/legacy/trees_v1/multi_seed.txt | 91 ++++++ 18 files changed, 1101 insertions(+), 41 deletions(-) create mode 100644 crates/tree_plus_core/src/extract/kotlin.rs create mode 100644 crates/tree_plus_core/src/extract/treesitter/java.rs create mode 100644 crates/tree_plus_core/src/extract/treesitter/swift.rs diff --git a/Cargo.lock b/Cargo.lock index 700cc68..6183d14 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -867,6 +867,16 @@ dependencies = [ "tree-sitter-language", ] +[[package]] +name = "tree-sitter-java" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0aa6cbcdc8c679b214e616fd3300da67da0e492e066df01bcf5a5921a71e90d6" +dependencies = [ + "cc", + "tree-sitter-language", +] + [[package]] name = "tree-sitter-javascript" version = "0.25.0" @@ -903,6 +913,16 @@ dependencies = [ "tree-sitter-language", ] +[[package]] +name = "tree-sitter-swift" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe36052155b9dd69ca82b3b8f1b4ccfb2d867125ac1a4db1dd7331829242668c" +dependencies = [ + "cc", + "tree-sitter-language", +] + [[package]] name = 
"tree-sitter-typescript" version = "0.23.2" @@ -939,9 +959,11 @@ dependencies = [ "tree-sitter-c", "tree-sitter-cpp", "tree-sitter-go", + "tree-sitter-java", "tree-sitter-javascript", "tree-sitter-python", "tree-sitter-rust", + "tree-sitter-swift", "tree-sitter-typescript", "unicode-width", ] diff --git a/Cargo.toml b/Cargo.toml index 528074f..e675f49 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,5 +26,7 @@ tree-sitter-typescript = "0.23" tree-sitter-c = "0.24" tree-sitter-cpp = "0.23" tree-sitter-go = "0.25" +tree-sitter-java = "0.23" +tree-sitter-swift = "0.7" rusqlite = { version = "0.32", features = ["bundled"] } criterion = "0.5" diff --git a/crates/tree_plus_core/Cargo.toml b/crates/tree_plus_core/Cargo.toml index 58bf87b..b045211 100644 --- a/crates/tree_plus_core/Cargo.toml +++ b/crates/tree_plus_core/Cargo.toml @@ -28,6 +28,8 @@ tree-sitter-c.workspace = true tree-sitter-cpp.workspace = true rusqlite = { workspace = true, optional = true } tree-sitter-go.workspace = true +tree-sitter-java.workspace = true +tree-sitter-swift.workspace = true [dev-dependencies] criterion.workspace = true diff --git a/crates/tree_plus_core/examples/dump_ast.rs b/crates/tree_plus_core/examples/dump_ast.rs index 436cea7..090c14b 100644 --- a/crates/tree_plus_core/examples/dump_ast.rs +++ b/crates/tree_plus_core/examples/dump_ast.rs @@ -23,6 +23,9 @@ fn main() { ".py" => tree_sitter_python::LANGUAGE.into(), ".rs" => tree_sitter_rust::LANGUAGE.into(), ".js" | ".ts" => tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(), + ".go" => tree_sitter_go::LANGUAGE.into(), + ".java" => tree_sitter_java::LANGUAGE.into(), + ".swift" => tree_sitter_swift::LANGUAGE.into(), other => panic!("no grammar for {other}"), }; let mut parser = tree_sitter::Parser::new(); diff --git a/crates/tree_plus_core/src/extract/kotlin.rs b/crates/tree_plus_core/src/extract/kotlin.rs new file mode 100644 index 0000000..d045c57 --- /dev/null +++ b/crates/tree_plus_core/src/extract/kotlin.rs @@ -0,0 +1,260 @@ +//! 
Kotlin component extraction, matching legacy `parse_kt`. +//! +//! This is a procedural port of the legacy combined regex rather than a +//! tree-sitter formatter: the community Kotlin grammar cannot recover from +//! the (deliberately invalid) constructs in the acceptance fixture — one +//! ERROR node swallows the rest of the file — while the legacy pattern is +//! line-anchored and keeps going. The scanner is a single forward pass. +//! +//! Legacy semantics, reproduced deliberately: +//! - type headers: `^ *(modifier )*(class|interface|object)` (no modifier +//! may be `fun`), captured until a `{`, a blank line, or a column-0 word +//! line, then right-trimmed — multi-line primary constructors survive; +//! - functions: `^.* ?fun ( )?name(args)` where `args` stops at +//! the first `)`, plus an optional `: CapitalizedType` return and an +//! optional ` -> Word)` tail (which is how `(T) -> Unit)` parameters +//! keep their closing paren), gated on a following whitespace; +//! - an indented ALL-CAPS `NAME {` line directly above a `fun` line is +//! prepended to the component (enum entries with bodies); +//! - per component: strip comments, right-trim, replace `", \n"` with +//! `",\n"`. + +use std::sync::LazyLock; + +use regex::Regex; + +use super::treesitter::rust::strip_c_comments; + +static TYPE_START_RE: LazyLock = + LazyLock::new(|| Regex::new(r"^ *((?:\w+ )*)(?:class|interface|object)[^.\n]").unwrap()); + +static ENUM_ENTRY_RE: LazyLock = LazyLock::new(|| Regex::new(r"^ +[A-Z]+ \{").unwrap()); + +static RETURN_TYPE_RE: LazyLock = + LazyLock::new(|| Regex::new(r"^\s?: [A-Z]\w*\??").unwrap()); + +static ARROW_TAIL_RE: LazyLock = LazyLock::new(|| Regex::new(r"^\s->\s\w+\)").unwrap()); + +/// Extract Kotlin components: classes/interfaces/objects and functions. 
+pub fn extract(content: &str) -> Vec { + let lines = line_offsets(content); + let mut components = Vec::new(); + let mut i = 0; + while i < lines.len() { + if let Some((component, next)) = match_type(content, &lines, i) { + components.push(clean(&component)); + i = next; + continue; + } + if let Some((component, next)) = match_fun(content, &lines, i) { + components.push(clean(&component)); + i = next; + continue; + } + i += 1; + } + components +} + +/// (byte offset, line content without trailing newline) for each line. +fn line_offsets(content: &str) -> Vec<(usize, &str)> { + let mut out = Vec::new(); + let mut offset = 0; + for line in content.split('\n') { + out.push((offset, line)); + offset += line.len() + 1; + } + out +} + +fn clean(component: &str) -> String { + strip_c_comments(component) + .trim_end() + .replace(", \n", ",\n") +} + +/// Type headers; returns (component, next line index). +fn match_type(content: &str, lines: &[(usize, &str)], i: usize) -> Option<(String, usize)> { + let (offset, line) = lines[i]; + let caps = TYPE_START_RE.captures(line)?; + // no modifier word may be `fun` (legacy `(? 
Option<(String, usize)> { + // an enum-entry line directly above the fun line joins the component + let (prefix, f) = if ENUM_ENTRY_RE.is_match(lines[i].1) + && lines.get(i + 1).is_some_and(|l| l.1.contains("fun")) + { + (Some(lines[i].1), i + 1) + } else { + (None, i) + }; + let (line_offset, line) = *lines.get(f)?; + // legacy `^.* ?fun ` is greedy: try the last `fun ` first + let mut search_end = line.len(); + while let Some(rel) = line[..search_end].rfind("fun ") { + if let Some(end) = parse_fun_tail(content, line_offset, line, rel + 4) { + let mut component = String::new(); + if let Some(p) = prefix { + component.push_str(p); + component.push('\n'); + } + component.push_str(&content[line_offset..end]); + // resume scanning on the line after the match end + let end_line = lines.partition_point(|&(o, _)| o <= end) - 1; + return Some((component, end_line + 1)); + } + search_end = rel; + } + None +} + +/// Validate `name(args)` + optional return/arrow tail starting at `name_rel` +/// (relative to `line`); returns the absolute end byte of the component. +fn parse_fun_tail(content: &str, line_offset: usize, line: &str, name_rel: usize) -> Option { + let bytes = line.as_bytes(); + let mut pos = name_rel; + // optional ` ` (balanced, same line) + if bytes.get(pos) == Some(&b'<') { + let close = balanced_angle(line, pos)?; + if bytes.get(close + 1) != Some(&b' ') { + return None; + } + pos = close + 2; + } + // name: words, dots, optional `?`, embedded `` chunks + let name_start = pos; + loop { + match bytes.get(pos) { + Some(c) if c.is_ascii_alphanumeric() || *c == b'_' || *c == b'.' => { + pos += 1; + if bytes.get(pos) == Some(&b'?') { + pos += 1; + } + } + Some(&b'<') => pos = balanced_angle(line, pos)? 
+ 1, + _ => break, + } + } + if pos == name_start || bytes.get(pos) != Some(&b'(') { + return None; + } + // params: everything to the first `)` (may span lines) + let abs_open = line_offset + pos; + let close_rel = content[abs_open..].find(')')?; + let mut end = abs_open + close_rel + 1; + // optional `: CapitalizedType`, then optional ` -> Word)` + if let Some(m) = RETURN_TYPE_RE.find(&content[end..]) { + end += m.end(); + } + if let Some(m) = ARROW_TAIL_RE.find(&content[end..]) { + end += m.end(); + } + // legacy `(?=\s)` gate + content[end..] + .chars() + .next() + .filter(|c| c.is_whitespace()) + .map(|_| end) +} + +/// End index of the `>` balancing the `<` at `open` (same line only). +fn balanced_angle(line: &str, open: usize) -> Option { + let mut depth = 0; + for (k, c) in line.char_indices().skip(open) { + match c { + '<' => depth += 1, + '>' => { + depth -= 1; + if depth == 0 { + return Some(k); + } + } + _ => {} + } + } + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn data_class_and_fun() { + let src = + "data class Person(val name: String)\n\nfun greet(person: Person) = println(person)\n"; + assert_eq!( + extract(src), + vec![ + "data class Person(val name: String)", + "fun greet(person: Person)" + ] + ); + } + + #[test] + fn function_type_param_keeps_closing_paren() { + let src = "fun processItems(items: List, processor: (T) -> Unit) {\n items.forEach { processor(it) }\n}\n"; + assert_eq!( + extract(src), + vec!["fun processItems(items: List, processor: (T) -> Unit)"] + ); + } + + #[test] + fn multiline_class_with_comments() { + let src = "class People(\n firstNames: Array, /* heard you like */\n ages: Array(42), // edge cases galore\n) {\n fun edgeCases(): Boolean {\n return True\n }\n}\n"; + assert_eq!( + extract(src), + vec![ + "class People(\n firstNames: Array,\n ages: Array(42),\n)", + " fun edgeCases(): Boolean", + ] + ); + } + + #[test] + fn enum_entry_prefix() { + let src = " enum class E : Op {\n PLUS {\n override fun 
apply(t: Int): Int = t\n };\n}\n"; + assert_eq!( + extract(src), + vec![ + " enum class E : Op", + " PLUS {\n override fun apply(t: Int): Int", + ] + ); + } + + #[test] + fn fun_modifier_blocks_type_match() { + // `(? ExtractResult { e if C_EXTENSIONS.contains(&e) => treesitter::c_cpp::extract(&content, e)?, ".rs" => treesitter::rust::extract(&content, syntax)?, ".go" => treesitter::go::extract(&content)?, + ".java" => treesitter::java::extract(&content)?, + ".kt" => kotlin::extract(&content), + ".swift" => treesitter::swift::extract(&content)?, ".jsonl" => data::extract_jsonl(&content)?, ".env" => simple::dot_env(&content), ".txt" => { diff --git a/crates/tree_plus_core/src/extract/treesitter/java.rs b/crates/tree_plus_core/src/extract/treesitter/java.rs new file mode 100644 index 0000000..bc6fbd7 --- /dev/null +++ b/crates/tree_plus_core/src/extract/treesitter/java.rs @@ -0,0 +1,265 @@ +//! Java component extraction (tree-sitter), matching legacy `parse_java`. +//! +//! The legacy extractor was one combined regex over comment-stripped text: +//! - classes: `(public )?(abstract )?class NAME( extends \w+)?( implements +//! [\w, ]+)?` followed by `\s*{`, preceded by a newline; +//! - methods/constructors: optional visibility, optional abstract/static, +//! optional one-word return type, `name(args)` followed by exactly ` {\n`; +//! - interfaces: emitted as `interface NAME` (modifiers dropped); +//! - annotations on their own lines: ` *@[\w"/()]+` (truncates at the first +//! character outside that set); +//! - bodyless methods: ` *(abstract)? \w+ \w+(args);` (exactly two words, so +//! `public String f();` never matched). +//! +//! Annotations inside parameter lists were consumed by the method match and +//! never emitted; we reproduce that by not descending into parameters. 
+ +use std::sync::LazyLock; + +use regex::Regex; +use tree_sitter::Node; + +use super::rust::strip_c_comments; +use super::{parse, ExtractResult}; + +static CLASS_RE: LazyLock = LazyLock::new(|| { + Regex::new( + r"^( *(?:public )?(?:abstract )?class\s+\w+(?: extends \w+)?(?: implements [\w, ]+)?)\s*$", + ) + .unwrap() +}); + +static INTERFACE_RE: LazyLock = + LazyLock::new(|| Regex::new(r"^ *(?:public )?interface\s+(\w+)").unwrap()); + +static METHOD_RE: LazyLock = LazyLock::new(|| { + Regex::new( + r"(?s)^ *(?:public|protected|private)? ?(?:abstract|static)? ?(?:\w+ )?\w+\([^{]*\)$", + ) + .unwrap() +}); + +static ABSTRACT_METHOD_RE: LazyLock = + LazyLock::new(|| Regex::new(r"(?s)^ *(?:abstract)? \w+ \w+\([^)]*\)$").unwrap()); + +static ANNOTATION_RE: LazyLock = LazyLock::new(|| Regex::new(r#"^ *@[\w"/()]+"#).unwrap()); + +/// Extract Java components: classes, interfaces, methods, annotations. +pub fn extract(content: &str) -> ExtractResult { + let tree = parse(content, &tree_sitter_java::LANGUAGE.into())?; + let mut extractor = JavaExtractor { + content, + components: Vec::new(), + }; + extractor.run(tree.root_node()); + Ok(extractor.components) +} + +struct JavaExtractor<'a> { + content: &'a str, + components: Vec, +} + +fn line_start(content: &str, byte: usize) -> usize { + content[..byte].rfind('\n').map(|i| i + 1).unwrap_or(0) +} + +impl<'a> JavaExtractor<'a> { + /// Depth-first via an explicit stack: AST depth is input-controlled, and + /// extraction runs on small worker-thread stacks. 
+ fn run(&mut self, root: Node<'a>) { + let mut stack = vec![root]; + while let Some(node) = stack.pop() { + match node.kind() { + "class_declaration" | "enum_declaration" | "record_declaration" => { + self.emit_annotations(node); + if node.kind() == "class_declaration" { + self.emit_class(node); + } + if let Some(body) = node.child_by_field_name("body") { + stack.push(body); + } + } + "interface_declaration" | "annotation_type_declaration" => { + self.emit_annotations(node); + if node.kind() == "interface_declaration" { + self.emit_interface(node); + } + if let Some(body) = node.child_by_field_name("body") { + stack.push(body); + } + } + "method_declaration" | "constructor_declaration" => { + self.emit_annotations(node); + match node.child_by_field_name("body") { + Some(body) => { + self.emit_method(node, body); + // local/anonymous classes inside bodies still match + stack.push(body); + } + None => self.emit_bodyless_method(node), + } + } + "field_declaration" => { + self.emit_annotations(node); + } + _ => { + let mut cursor = node.walk(); + let children: Vec> = node.named_children(&mut cursor).collect(); + stack.extend(children.into_iter().rev()); + } + } + } + } + + /// Annotations attached as modifiers, one component per annotation line. + fn emit_annotations(&mut self, node: Node<'a>) { + let mut cursor = node.walk(); + let Some(mods) = node.children(&mut cursor).find(|c| c.kind() == "modifiers") else { + return; + }; + let mut mcursor = mods.walk(); + for ann in mods.named_children(&mut mcursor) { + if !matches!(ann.kind(), "annotation" | "marker_annotation") { + continue; + } + let start = line_start(self.content, ann.start_byte()); + let slice = &self.content[start..ann.end_byte()]; + if let Some(m) = ANNOTATION_RE.find(slice) { + self.components.push(m.as_str().to_string()); + } + } + } + + /// Byte offset where the signature text begins: the line after the last + /// annotation modifier (legacy matches started past the annotations). 
+ fn signature_line_start(&self, node: Node<'a>) -> usize { + let mut pos = node.start_byte(); + let mut cursor = node.walk(); + if let Some(mods) = node.children(&mut cursor).find(|c| c.kind() == "modifiers") { + let mut mcursor = mods.walk(); + for ann in mods.named_children(&mut mcursor) { + if matches!(ann.kind(), "annotation" | "marker_annotation") { + pos = pos.max(ann.end_byte()); + } + } + } + let rest = &self.content[pos..]; + let skipped = rest.len() - rest.trim_start().len(); + line_start(self.content, pos + skipped) + } + + fn emit_class(&mut self, node: Node<'a>) { + let Some(body) = node.child_by_field_name("body") else { + return; + }; + let start = self.signature_line_start(node); + if start == 0 { + return; // legacy required a preceding newline + } + let candidate = strip_c_comments(&self.content[start..body.start_byte()]); + // stripped comment lines between annotations and the signature + // leave leading newlines behind (legacy stripped globally) + let candidate = candidate.trim_start_matches('\n'); + if let Some(caps) = CLASS_RE.captures(candidate) { + self.components + .push(caps.get(1).unwrap().as_str().trim_end().to_string()); + } + } + + fn emit_interface(&mut self, node: Node<'a>) { + let Some(body) = node.child_by_field_name("body") else { + return; + }; + let start = self.signature_line_start(node); + if start == 0 { + return; + } + let candidate = strip_c_comments(&self.content[start..body.start_byte()]); + let candidate = candidate.trim_start_matches('\n'); + if let Some(caps) = INTERFACE_RE.captures(candidate) { + self.components.push(format!("interface {}", &caps[1])); + } + } + + fn emit_method(&mut self, node: Node<'a>, body: Node<'a>) { + let start = self.signature_line_start(node); + if start == 0 { + return; + } + // legacy gate: the signature's `)` is followed by exactly ` {\n` + let before = &self.content[start..body.start_byte()]; + if !before.ends_with(' ') || before.ends_with(" ") || before.ends_with("\n ") { + return; + } 
+ if !self.content[body.start_byte() + 1..].starts_with('\n') { + return; + } + let candidate = strip_c_comments(before); + let candidate = candidate.trim_start_matches('\n').trim_end(); + if METHOD_RE.is_match(candidate) { + self.components.push(candidate.to_string()); + } + } + + /// Bodyless (abstract/interface) methods: exactly two words + `(args);`. + fn emit_bodyless_method(&mut self, node: Node<'a>) { + let start = self.signature_line_start(node); + if start == 0 { + return; + } + let raw = self.content[start..node.end_byte()].trim_end(); + let Some(raw) = raw.strip_suffix(';') else { + return; + }; + let candidate = strip_c_comments(raw); + if ABSTRACT_METHOD_RE.is_match(&candidate) { + self.components.push(candidate); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn class_method_annotation() { + let src = "\nabstract class A {\n abstract void f();\n}\n\n@Log\nclass B extends A {\n @Override\n void f() {\n g();\n }\n}\n"; + let got = extract(src).unwrap(); + assert_eq!( + got, + vec![ + "abstract class A", + " abstract void f()", + "@Log", + "class B extends A", + " @Override", + " void f()", + ] + ); + } + + #[test] + fn interface_drops_modifiers() { + let src = "\npublic interface Comm {\n String communicate();\n}\n"; + let got = extract(src).unwrap(); + assert_eq!(got, vec!["interface Comm", " String communicate()"]); + } + + #[test] + fn three_word_prototype_skipped() { + // legacy two-word rule: `public String f();` never matched + let src = "\ninterface I {\n public String f();\n}\n"; + let got = extract(src).unwrap(); + assert_eq!(got, vec!["interface I"]); + } + + #[test] + fn single_line_body_skipped() { + // legacy required ` {` followed by a newline + let src = "\nclass A {\n void f() { g(); }\n}\n"; + let got = extract(src).unwrap(); + assert_eq!(got, vec!["class A"]); + } +} diff --git a/crates/tree_plus_core/src/extract/treesitter/mod.rs b/crates/tree_plus_core/src/extract/treesitter/mod.rs index feaefe7..13038e2 100644 
--- a/crates/tree_plus_core/src/extract/treesitter/mod.rs +++ b/crates/tree_plus_core/src/extract/treesitter/mod.rs @@ -6,8 +6,10 @@ pub mod c_cpp; pub mod go; +pub mod java; pub mod python; pub mod rust; +pub mod swift; pub mod typescript; use std::cell::RefCell; diff --git a/crates/tree_plus_core/src/extract/treesitter/swift.rs b/crates/tree_plus_core/src/extract/treesitter/swift.rs new file mode 100644 index 0000000..5b893fa --- /dev/null +++ b/crates/tree_plus_core/src/extract/treesitter/swift.rs @@ -0,0 +1,154 @@ +//! Swift component extraction (tree-sitter), matching legacy `parse_swift`. +//! +//! Legacy semantics, reproduced deliberately: +//! - type headers: `^(class|struct|protocol|enum).*` up to a ` {` on the +//! same line — column 0 only, keyword first (so `public class` and +//! indented/nested types were never matched); +//! - functions and initializers: `^ *(func|init)\s*\w*\s*\(args\)` with an +//! optional `-> \w+` return, gated on a following ` {`, `;`, or newline; +//! no modifiers allowed (`override func` was never matched); +//! - matching ran on comment-stripped text, so comments vanish from +//! multi-line signatures but raw spacing/newlines are kept. + +use std::sync::LazyLock; + +use regex::Regex; +use tree_sitter::Node; + +use super::rust::strip_c_comments; +use super::{parse, ExtractResult}; + +static TYPE_RE: LazyLock = + LazyLock::new(|| Regex::new(r"^(?:class|struct|protocol|enum)").unwrap()); + +static FUNC_START_RE: LazyLock = + LazyLock::new(|| Regex::new(r"^( *)(func|init)\s*\w*\s*\(").unwrap()); + +static RETURN_RE: LazyLock = LazyLock::new(|| Regex::new(r"^\s*->\s*\w+").unwrap()); + +/// Extract Swift components: type headers, funcs, inits. 
+pub fn extract(content: &str) -> ExtractResult { + let tree = parse(content, &tree_sitter_swift::LANGUAGE.into())?; + let mut extractor = SwiftExtractor { + content, + components: Vec::new(), + }; + extractor.run(tree.root_node()); + Ok(extractor.components) +} + +struct SwiftExtractor<'a> { + content: &'a str, + components: Vec, +} + +fn line_start(content: &str, byte: usize) -> usize { + content[..byte].rfind('\n').map(|i| i + 1).unwrap_or(0) +} + +impl<'a> SwiftExtractor<'a> { + /// Depth-first via an explicit stack: AST depth is input-controlled, and + /// extraction runs on small worker-thread stacks. + fn run(&mut self, root: Node<'a>) { + let mut stack = vec![root]; + while let Some(node) = stack.pop() { + match node.kind() { + "class_declaration" | "protocol_declaration" => { + // covers class/struct/enum (one node kind) and protocol + self.emit_type_header(node); + if let Some(body) = node.child_by_field_name("body") { + stack.push(body); + } + } + "function_declaration" | "protocol_function_declaration" | "init_declaration" => { + self.emit_function(node); + if let Some(body) = node.child_by_field_name("body") { + stack.push(body); + } + } + _ => { + let mut cursor = node.walk(); + let children: Vec> = node.named_children(&mut cursor).collect(); + stack.extend(children.into_iter().rev()); + } + } + } + } + + /// `class Person`, `struct Dog: Animal`, ... — column-0 keyword line, + /// captured up to the last ` {` on that line. 
+ fn emit_type_header(&mut self, node: Node<'a>) { + let start = node.start_byte(); + if start != line_start(self.content, start) { + return; // legacy pattern had no indent allowance + } + let line = self.content[start..].split('\n').next().unwrap_or(""); + let line = strip_c_comments(line); + if !TYPE_RE.is_match(&line) { + return; + } + let Some(brace) = line.rfind(" {") else { + return; // `{` must follow on the same line after a space + }; + self.components.push(line[..brace].to_string()); + } + + /// ` *(func|init) name(args) -> T` gated on ` {`, `;`, or newline. + fn emit_function(&mut self, node: Node<'a>) { + let start = line_start(self.content, node.start_byte()); + let region = &self.content[start..]; + let Some(caps) = FUNC_START_RE.captures(region) else { + return; // modifiers (e.g. `override func`) never matched + }; + let open = caps.get(0).unwrap().end() - 1; // the `(` + let Some(close_rel) = region[open..].find(')') else { + return; + }; + let mut end = open + close_rel + 1; + // optional single-word return type + if let Some(m) = RETURN_RE.find(&region[end..]) { + end += m.end(); + } + // legacy lookahead: ` {`, `;`, or newline must follow + let after = &region[end..]; + if !(after.starts_with(" {") || after.starts_with(';') || after.starts_with('\n')) { + // tolerate extra whitespace before the brace like `\s*` did + let trimmed = after.trim_start_matches(' '); + if !(trimmed.starts_with('{') && after.starts_with(' ')) { + return; + } + } + let component = strip_c_comments(&region[..end]); + self.components.push(component.trim_end().to_string()); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn types_and_functions() { + let src = "class Person {\n init(name: String) {\n self.name = name\n }\n func greet() {\n print(\"hi\")\n }\n}\n\nfunc top() -> Int {\n return 1\n}\n"; + let got = extract(src).unwrap(); + assert_eq!( + got, + vec![ + "class Person", + " init(name: String)", + " func greet()", + "func top() -> Int", + ] + ); + } + + 
#[test] + fn indented_types_and_modified_funcs_skipped() { + let src = + "public class A {\n override func f() {\n g()\n }\n}\nstruct B: P {\n}\n"; + let got = extract(src).unwrap(); + // `public class` not keyword-first; `override func` has a modifier; + // struct B's body brace qualifies + assert_eq!(got, vec!["struct B: P"]); + } +} diff --git a/crates/tree_plus_core/tests/golden_parity.rs b/crates/tree_plus_core/tests/golden_parity.rs index 8ec8f33..4aceaa3 100644 --- a/crates/tree_plus_core/tests/golden_parity.rs +++ b/crates/tree_plus_core/tests/golden_parity.rs @@ -28,6 +28,9 @@ fn in_v1_scope(path: &str) -> bool { const V1_EXTS: &[&str] = &[ ".py", ".pyi", + ".java", + ".kt", + ".swift", ".rs", ".go", ".js", diff --git a/crates/tree_plus_core/tests/robustness.rs b/crates/tree_plus_core/tests/robustness.rs index 64e42dd..4ab531f 100644 --- a/crates/tree_plus_core/tests/robustness.rs +++ b/crates/tree_plus_core/tests/robustness.rs @@ -27,8 +27,8 @@ impl Rng { } const EXTENSIONS: &[&str] = &[ - "py", "rs", "ts", "tsx", "js", "c", "cpp", "h", "md", "json", "jsonl", "yml", "toml", "csv", - "txt", "env", "rst", + "py", "rs", "ts", "tsx", "js", "c", "cpp", "h", "java", "kt", "swift", "md", "json", "jsonl", + "yml", "toml", "csv", "txt", "env", "rst", ]; fn write_temp(name: &str, bytes: &[u8]) -> PathBuf { diff --git a/docs/language-roadmap.md b/docs/language-roadmap.md index a571141..d53fb23 100644 --- a/docs/language-roadmap.md +++ b/docs/language-roadmap.md @@ -1,6 +1,7 @@ # Language Roadmap (Rust Port) Version-1 implements: Rust, Python, JavaScript, TypeScript, C, C++, Go, +Java, Kotlin, Swift, Markdown (+ RST), JSON (package.json / schema / RPC / OpenRPC), JSONL, YAML, TOML (Cargo/pyproject), CSV, Makefile/Justfile, .env, requirements.txt, SQLite, and TODO/BUG/NOTE markers everywhere except `.md`/`.txt` (legacy @@ -15,9 +16,6 @@ availability → suggested path → missing tests. | Language | Extensions | Legacy extractor | TS grammar? 
| Suggested path | Missing tests | |---|---|---|---|---|---| -| Java | .java | parse_java | yes | tree-sitter formatter | port golden `JavaTest.java` | -| Kotlin | .kt | parse_kt | yes (community) | tree-sitter formatter | port golden `KotlinTest.kt` | -| Swift | .swift | parse_swift | yes (community) | tree-sitter formatter | port golden `swift_test.swift` | | C# | .cs | parse_cs | yes | tree-sitter formatter | port golden `csharp_test.cs` | | PHP | .php | parse_php | yes | tree-sitter formatter | port golden `php_test.php` | | Ruby | .rb | parse_rb | yes | tree-sitter formatter | port golden `ruby_test.rb` | @@ -59,6 +57,16 @@ highlighting, tiktoken tokenizers — see docs/rust-port-differences.md. ## Suggested order of attack -1. Java, Kotlin, C#, Ruby, Bash (mature grammars, heavily used). +1. C#, Ruby, Bash (mature grammars, heavily used). 2. SQL/GraphQL/Protobuf/requirements-style line formats (cheap regex ports). 3. The long tail, prioritized by user demand. + +## Implementation notes + +- Java and Swift use tree-sitter formatters (`treesitter/java.rs`, + `treesitter/swift.rs`). +- Kotlin is a procedural line scanner (`extract/kotlin.rs`): the community + grammar (tree-sitter-kotlin-ng) cannot recover from the deliberately + invalid constructs in the acceptance fixture — one ERROR node swallows + the rest of the file — while the legacy pattern is line-anchored and + keeps going. 
diff --git a/tests/golden/generate_legacy_goldens.py b/tests/golden/generate_legacy_goldens.py index 6976136..3d9bc3b 100644 --- a/tests/golden/generate_legacy_goldens.py +++ b/tests/golden/generate_legacy_goldens.py @@ -50,9 +50,9 @@ def sanitize(p: Path) -> str: DEFERRED_PARSERS = [ - "parse_php", "parse_kt", "parse_swift", "parse_bash", + "parse_php", "parse_bash", "parse_ps1", "parse_zig", "parse_rb", "parse_sql", "parse_graphql", - "parse_cs", "parse_jl", "parse_scala", "parse_java", "parse_perl", + "parse_cs", "parse_jl", "parse_scala", "parse_perl", "parse_hs", "parse_fsharp", "parse_lisp", "parse_erl", "parse_capnp", "parse_grpc", "parse_tex", "parse_lean", "parse_fortran", "parse_tf", "parse_isabelle", "parse_lua", "parse_tcl", "parse_objective_c", diff --git a/tests/golden/legacy/trees/repo_concise.txt b/tests/golden/legacy/trees/repo_concise.txt index e2786e5..987d1f2 100644 --- a/tests/golden/legacy/trees/repo_concise.txt +++ b/tests/golden/legacy/trees/repo_concise.txt @@ -1,4 +1,4 @@ -📁 tree_plus (44 folders, 443 files) +📁 tree_plus (44 folders, 445 files) ├── 📄 .env.test (4 tokens, 0 lines) ├── 📁 .github (2 folders, 4 files) │ ├── 📄 dependabot.yml (128 tokens, 11 lines) @@ -8,53 +8,55 @@ │ └── 📄 unix.yml (790 tokens, 106 lines) ├── 📄 .gitignore (226 tokens, 60 lines) ├── 📄 .mcp_server.pid (2 tokens, 1 line) -├── 📄 Cargo.toml (212 tokens, 30 lines) -├── 📄 claude-fable-5-rust-rewrite-goal.md (3,394 tokens, 434 lines) +├── 📄 Cargo.toml (225 tokens, 32 lines) ├── 📁 coverage (1 folder, 1 file) │ └── 📄 lcov.info (17,359 tokens, 2,180 lines) -├── 📁 crates (11 folders, 28 files) +├── 📁 crates (11 folders, 31 files) │ ├── 📁 tree_plus_cli (3 folders, 3 files) │ │ ├── 📄 Cargo.toml (120 tokens, 21 lines) │ │ ├── 📁 src (1 folder, 1 file) │ │ │ └── 📄 main.rs (1,339 tokens, 173 lines) │ │ └── 📁 tests (1 folder, 1 file) │ │ └── 📄 cli.rs (701 tokens, 92 lines) -│ └── 📁 tree_plus_core (7 folders, 25 files) +│ └── 📁 tree_plus_core (7 folders, 28 files) │ ├── 📁 benches 
(1 folder, 1 file) │ │ └── 📄 tree_plus_bench.rs (608 tokens, 78 lines) -│ ├── 📄 Cargo.toml (236 tokens, 37 lines) +│ ├── 📄 Cargo.toml (254 tokens, 39 lines) │ ├── 📁 examples (1 folder, 2 files) -│ │ ├── 📄 dump_ast.rs (516 tokens, 55 lines) +│ │ ├── 📄 dump_ast.rs (570 tokens, 59 lines) │ │ └── 📄 extract.rs (129 tokens, 16 lines) -│ ├── 📁 src (3 folders, 19 files) +│ ├── 📁 src (3 folders, 22 files) │ │ ├── 📄 config.rs (304 tokens, 39 lines) │ │ ├── 📄 count.rs (1,346 tokens, 203 lines) -│ │ ├── 📁 extract (2 folders, 11 files) +│ │ ├── 📁 extract (2 folders, 14 files) │ │ │ ├── 📄 data.rs (5,115 tokens, 582 lines) +│ │ │ ├── 📄 kotlin.rs (2,315 tokens, 257 lines) │ │ │ ├── 📄 markdown.rs (1,531 tokens, 180 lines) │ │ │ ├── 📄 markers.rs (438 tokens, 60 lines) -│ │ │ ├── 📄 mod.rs (2,532 tokens, 278 lines) +│ │ │ ├── 📄 mod.rs (2,569 tokens, 282 lines) │ │ │ ├── 📄 simple.rs (1,629 tokens, 216 lines) -│ │ │ └── 📁 treesitter (1 folder, 6 files) -│ │ │ ├── 📄 c_cpp.rs (5,979 tokens, 591 lines) -│ │ │ ├── 📄 go.rs (1,364 tokens, 152 lines) -│ │ │ ├── 📄 mod.rs (491 tokens, 67 lines) -│ │ │ ├── 📄 python.rs (3,487 tokens, 346 lines) -│ │ │ ├── 📄 rust.rs (2,785 tokens, 312 lines) -│ │ │ └── 📄 typescript.rs (3,897 tokens, 420 lines) +│ │ │ └── 📁 treesitter (1 folder, 8 files) +│ │ │ ├── 📄 c_cpp.rs (6,440 tokens, 635 lines) +│ │ │ ├── 📄 go.rs (1,439 tokens, 158 lines) +│ │ │ ├── 📄 java.rs (2,512 tokens, 269 lines) +│ │ │ ├── 📄 mod.rs (499 tokens, 69 lines) +│ │ │ ├── 📄 python.rs (3,635 tokens, 353 lines) +│ │ │ ├── 📄 rust.rs (2,828 tokens, 315 lines) +│ │ │ ├── 📄 swift.rs (1,474 tokens, 155 lines) +│ │ │ └── 📄 typescript.rs (4,004 tokens, 431 lines) │ │ ├── 📄 ignore.rs (2,144 tokens, 307 lines) │ │ ├── 📄 lib.rs (222 tokens, 29 lines) │ │ ├── 📄 model.rs (928 tokens, 125 lines) │ │ ├── 📄 render.rs (2,741 tokens, 347 lines) │ │ ├── 📄 sort.rs (1,693 tokens, 214 lines) -│ │ └── 📄 walk.rs (2,245 tokens, 260 lines) +│ │ └── 📄 walk.rs (2,426 tokens, 278 lines) │ └── 📁 tests (1 folder, 2 files) -│ 
├── 📄 golden_parity.rs (1,812 tokens, 244 lines) -│ └── 📄 robustness.rs (743 tokens, 86 lines) +│ ├── 📄 golden_parity.rs (1,825 tokens, 247 lines) +│ └── 📄 robustness.rs (1,220 tokens, 144 lines) ├── 📁 docs (1 folder, 4 files) │ ├── 📄 architecture.md (1,392 tokens, 113 lines) -│ ├── 📄 language-roadmap.md (1,262 tokens, 64 lines) -│ ├── 📄 performance.md (690 tokens, 64 lines) +│ ├── 📄 language-roadmap.md (1,238 tokens, 64 lines) +│ ├── 📄 performance.md (1,121 tokens, 98 lines) │ └── 📄 rust-port-differences.md (1,022 tokens, 78 lines) ├── 📄 Justfile (141 tokens, 23 lines) ├── 📄 LICENSE (2,744 tokens, 81 lines) @@ -78,7 +80,7 @@ │ │ └── 📄 logging.py (11 tokens, 0 lines) │ ├── 📁 golden (6 folders, 247 files) │ │ ├── 📄 diff_components.py (417 tokens, 59 lines) -│ │ ├── 📄 generate_legacy_goldens.py (1,496 tokens, 156 lines) +│ │ ├── 📄 generate_legacy_goldens.py (1,486 tokens, 156 lines) │ │ └── 📁 legacy (5 folders, 245 files) │ │ ├── 📁 components (1 folder, 109 files) │ │ │ ├── 📄 tests__dot_dot__my_test_file.py.json (21 tokens, 5 lines) @@ -530,20 +532,20 @@ │ │ │ ├── 📄 more_languages_group_todo.txt (1,491 tokens, 111 lines) │ │ │ ├── 📄 multi_seed.txt (3,789 tokens, 427 lines) │ │ │ ├── 📄 path_to_test.txt (445 tokens, 51 lines) -│ │ │ └── 📄 repo_concise.txt (9,776 tokens, 699 lines) +│ │ │ └── 📄 repo_concise.txt (9,829 tokens, 703 lines) │ │ └── 📁 trees_v1 (1 folder, 13 files) │ │ ├── 📄 dot_dot.txt (99 tokens, 10 lines) -│ │ ├── 📄 more_languages.txt (10,912 tokens, 1,135 lines) -│ │ ├── 📄 more_languages_group1.txt (1,130 tokens, 146 lines) +│ │ ├── 📄 more_languages.txt (12,064 tokens, 1,256 lines) +│ │ ├── 📄 more_languages_group1.txt (1,948 tokens, 237 lines) │ │ ├── 📄 more_languages_group2.txt (531 tokens, 67 lines) -│ │ ├── 📄 more_languages_group3.txt (1,240 tokens, 149 lines) +│ │ ├── 📄 more_languages_group3.txt (1,453 tokens, 179 lines) │ │ ├── 📄 more_languages_group4.txt (1,020 tokens, 123 lines) │ │ ├── 📄 more_languages_group5.txt (2,099 tokens, 235 lines) │ │ ├── 
📄 more_languages_group6.txt (2,821 tokens, 300 lines) │ │ ├── 📄 more_languages_group7.txt (705 tokens, 83 lines) │ │ ├── 📄 more_languages_group_lisp.txt (52 tokens, 5 lines) │ │ ├── 📄 more_languages_group_todo.txt (140 tokens, 13 lines) -│ │ ├── 📄 multi_seed.txt (1,783 tokens, 199 lines) +│ │ ├── 📄 multi_seed.txt (2,692 tokens, 290 lines) │ │ └── 📄 path_to_test.txt (445 tokens, 51 lines) │ ├── 📁 more_languages (10 folders, 99 files) │ │ ├── 📁 group1 (1 folder, 11 files) diff --git a/tests/golden/legacy/trees_v1/more_languages.txt b/tests/golden/legacy/trees_v1/more_languages.txt index 897f9fb..5914b28 100644 --- a/tests/golden/legacy/trees_v1/more_languages.txt +++ b/tests/golden/legacy/trees_v1/more_languages.txt @@ -3,8 +3,99 @@ │ ├── 📄 addamt.cobol (441 tokens, 40 lines) │ ├── 📄 CUSTOMER-INVOICE.CBL (412 tokens, 60 lines) │ ├── 📄 JavaTest.java (578 tokens, 86 lines) +│ │ ├── abstract class LivingBeing +│ │ ├── abstract void breathe() +│ │ ├── interface Communicator +│ │ ├── String communicate() +│ │ ├── @Log +│ │ ├── @Getter +│ │ ├── @Setter +│ │ ├── class Person extends LivingBeing implements Communicator +│ │ ├── Person(String name, int age) +│ │ ├── @Override +│ │ ├── void breathe() +│ │ ├── @Override +│ │ ├── public String communicate() +│ │ ├── void greet() +│ │ ├── String personalizedGreeting(String greeting, Optional +│ │ │ includeAge) +│ │ ├── @Singleton +│ │ ├── @RestController +│ │ ├── @SpringBootApplication +│ │ ├── public class Example +│ │ ├── @Inject +│ │ ├── public Example(Person person) +│ │ ├── @RequestMapping("/greet") +│ │ ├── String home(@RequestParam(value = "name", defaultValue = +│ │ │ "World") String name, +│ │ │ @RequestParam(value = "age", defaultValue = "30") +│ │ │ int age) +│ │ └── public static void main(String[] args) │ ├── 📄 JuliaTest.jl (381 tokens, 63 lines) │ ├── 📄 KotlinTest.kt (974 tokens, 171 lines) +│ │ ├── data class Person(val name: String) +│ │ ├── fun greet(person: Person) +│ │ ├── fun processItems(items: List, 
processor: (T) -> Unit) +│ │ ├── interface Source +│ │ ├── fun nextT(): T +│ │ ├── fun MutableList.swap(index1: Int, index2: Int) +│ │ ├── fun Any?.toString(): String +│ │ ├── tailrec fun findFixPoint(x: Double = 1.0): Double +│ │ ├── class GenericRepository +│ │ ├── fun getItem(id: Int): T? +│ │ ├── sealed interface Error +│ │ ├── sealed class IOError(): Error +│ │ ├── object Runner +│ │ ├── inline fun , T> run() : T +│ │ ├── infix fun Int.shl(x: Int): Int +│ │ ├── class MyStringCollection +│ │ ├── infix fun add(s: String) +│ │ ├── fun build() +│ │ ├── open class Base(p: Int) +│ │ ├── class Derived(p: Int) : Base(p) +│ │ ├── open class Shape +│ │ ├── open fun draw() +│ │ ├── fun fill() +│ │ ├── open fun edge(case: Int) +│ │ ├── interface Thingy +│ │ ├── fun edge() +│ │ ├── class Circle() : Shape(), Thingy +│ │ ├── override fun draw() +│ │ ├── final override fun edge(case: Int) +│ │ ├── interface Base +│ │ ├── fun print() +│ │ ├── class BaseImpl(val x: Int) : Base +│ │ ├── override fun print() +│ │ ├── internal class Derived(b: Base) : Base by b +│ │ ├── class Person constructor(firstName: String) +│ │ ├── class People( +│ │ │ firstNames: Array, +│ │ │ ages: Array(42), +│ │ │ ) +│ │ ├── fun edgeCases(): Boolean +│ │ ├── class Alien public @Inject constructor( +│ │ │ val firstName: String, +│ │ │ val lastName: String, +│ │ │ var age: Int, +│ │ │ val pets: MutableList = mutableListOf(), +│ │ │ ) +│ │ ├── fun objectOriented(): String +│ │ ├── enum class IntArithmetics : BinaryOperator, IntBinaryOperator +│ │ ├── PLUS { +│ │ │ override fun apply(t: Int, u: Int): Int +│ │ ├── TIMES { +│ │ │ override fun apply(t: Int, u: Int): Int +│ │ ├── override fun applyAsInt(t: Int, u: Int) +│ │ ├── fun reformat( +│ │ │ str: String, +│ │ │ normalizeCase: Boolean = true, +│ │ │ upperCaseFirstLetter: Boolean = true, +│ │ │ divideByCamelHumps: Boolean = false, +│ │ │ wordSeparator: Char = ' ', +│ │ │ ) +│ │ ├── operator fun Point.unaryMinus() +│ │ ├── abstract class Polygon +│ │ └── 
abstract fun draw() │ ├── 📄 lesson.cbl (635 tokens, 78 lines) │ ├── 📄 LuaTest.lua (83 tokens, 16 lines) │ ├── 📄 ObjectiveCTest.m (62 tokens, 16 lines) @@ -302,6 +393,36 @@ │ ├── 📄 hallucination.tex (1,633 tokens, 126 lines) │ ├── 📄 ruby_test.rb (138 tokens, 37 lines) │ ├── 📄 swift_test.swift (469 tokens, 110 lines) +│ │ ├── class Person +│ │ ├── init(name: String) +│ │ ├── func greet() +│ │ ├── func yEdgeCase( +│ │ │ fname: String, +│ │ │ lname: String, +│ │ │ age: Int, +│ │ │ address: String, +│ │ │ phoneNumber: String +│ │ │ ) +│ │ ├── func globalGreet() +│ │ ├── struct Point +│ │ ├── protocol Animal +│ │ ├── func speak() +│ │ ├── struct Dog: Animal +│ │ ├── class Cat: Animal +│ │ ├── init(name: String) +│ │ ├── func speak() +│ │ ├── enum CarType +│ │ ├── func getPreferredCarType() -> CarType +│ │ ├── enum CarType: UInt8 +│ │ ├── enum class CarType: UInt8 +│ │ ├── func myFunction(fname: String, age: Int) +│ │ └── func myFunctionWithMultipleParameters( +│ │ fname: String, +│ │ lname: String, +│ │ age: Int, +│ │ address: String, +│ │ phoneNumber: String +│ │ ) │ ├── 📄 test.lean (289 tokens, 42 lines) │ ├── 📄 test.capnp (117 tokens, 30 lines) │ ├── 📄 test.graphql (66 tokens, 21 lines) diff --git a/tests/golden/legacy/trees_v1/more_languages_group1.txt b/tests/golden/legacy/trees_v1/more_languages_group1.txt index 1d32827..1249bfb 100644 --- a/tests/golden/legacy/trees_v1/more_languages_group1.txt +++ b/tests/golden/legacy/trees_v1/more_languages_group1.txt @@ -2,8 +2,99 @@ ├── 📄 addamt.cobol (441 tokens, 40 lines) ├── 📄 CUSTOMER-INVOICE.CBL (412 tokens, 60 lines) ├── 📄 JavaTest.java (578 tokens, 86 lines) +│ ├── abstract class LivingBeing +│ ├── abstract void breathe() +│ ├── interface Communicator +│ ├── String communicate() +│ ├── @Log +│ ├── @Getter +│ ├── @Setter +│ ├── class Person extends LivingBeing implements Communicator +│ ├── Person(String name, int age) +│ ├── @Override +│ ├── void breathe() +│ ├── @Override +│ ├── public String communicate() +│ ├── void 
greet() +│ ├── String personalizedGreeting(String greeting, Optional +│ │ includeAge) +│ ├── @Singleton +│ ├── @RestController +│ ├── @SpringBootApplication +│ ├── public class Example +│ ├── @Inject +│ ├── public Example(Person person) +│ ├── @RequestMapping("/greet") +│ ├── String home(@RequestParam(value = "name", defaultValue = "World") +│ │ String name, +│ │ @RequestParam(value = "age", defaultValue = "30") int +│ │ age) +│ └── public static void main(String[] args) ├── 📄 JuliaTest.jl (381 tokens, 63 lines) ├── 📄 KotlinTest.kt (974 tokens, 171 lines) +│ ├── data class Person(val name: String) +│ ├── fun greet(person: Person) +│ ├── fun processItems(items: List, processor: (T) -> Unit) +│ ├── interface Source +│ ├── fun nextT(): T +│ ├── fun MutableList.swap(index1: Int, index2: Int) +│ ├── fun Any?.toString(): String +│ ├── tailrec fun findFixPoint(x: Double = 1.0): Double +│ ├── class GenericRepository +│ ├── fun getItem(id: Int): T? +│ ├── sealed interface Error +│ ├── sealed class IOError(): Error +│ ├── object Runner +│ ├── inline fun , T> run() : T +│ ├── infix fun Int.shl(x: Int): Int +│ ├── class MyStringCollection +│ ├── infix fun add(s: String) +│ ├── fun build() +│ ├── open class Base(p: Int) +│ ├── class Derived(p: Int) : Base(p) +│ ├── open class Shape +│ ├── open fun draw() +│ ├── fun fill() +│ ├── open fun edge(case: Int) +│ ├── interface Thingy +│ ├── fun edge() +│ ├── class Circle() : Shape(), Thingy +│ ├── override fun draw() +│ ├── final override fun edge(case: Int) +│ ├── interface Base +│ ├── fun print() +│ ├── class BaseImpl(val x: Int) : Base +│ ├── override fun print() +│ ├── internal class Derived(b: Base) : Base by b +│ ├── class Person constructor(firstName: String) +│ ├── class People( +│ │ firstNames: Array, +│ │ ages: Array(42), +│ │ ) +│ ├── fun edgeCases(): Boolean +│ ├── class Alien public @Inject constructor( +│ │ val firstName: String, +│ │ val lastName: String, +│ │ var age: Int, +│ │ val pets: MutableList = mutableListOf(), 
+│ │ ) +│ ├── fun objectOriented(): String +│ ├── enum class IntArithmetics : BinaryOperator, IntBinaryOperator +│ ├── PLUS { +│ │ override fun apply(t: Int, u: Int): Int +│ ├── TIMES { +│ │ override fun apply(t: Int, u: Int): Int +│ ├── override fun applyAsInt(t: Int, u: Int) +│ ├── fun reformat( +│ │ str: String, +│ │ normalizeCase: Boolean = true, +│ │ upperCaseFirstLetter: Boolean = true, +│ │ divideByCamelHumps: Boolean = false, +│ │ wordSeparator: Char = ' ', +│ │ ) +│ ├── operator fun Point.unaryMinus() +│ ├── abstract class Polygon +│ └── abstract fun draw() ├── 📄 lesson.cbl (635 tokens, 78 lines) ├── 📄 LuaTest.lua (83 tokens, 16 lines) ├── 📄 ObjectiveCTest.m (62 tokens, 16 lines) diff --git a/tests/golden/legacy/trees_v1/more_languages_group3.txt b/tests/golden/legacy/trees_v1/more_languages_group3.txt index 1b4769e..9e0123e 100644 --- a/tests/golden/legacy/trees_v1/more_languages_group3.txt +++ b/tests/golden/legacy/trees_v1/more_languages_group3.txt @@ -85,6 +85,36 @@ ├── 📄 hallucination.tex (1,633 tokens, 126 lines) ├── 📄 ruby_test.rb (138 tokens, 37 lines) ├── 📄 swift_test.swift (469 tokens, 110 lines) +│ ├── class Person +│ ├── init(name: String) +│ ├── func greet() +│ ├── func yEdgeCase( +│ │ fname: String, +│ │ lname: String, +│ │ age: Int, +│ │ address: String, +│ │ phoneNumber: String +│ │ ) +│ ├── func globalGreet() +│ ├── struct Point +│ ├── protocol Animal +│ ├── func speak() +│ ├── struct Dog: Animal +│ ├── class Cat: Animal +│ ├── init(name: String) +│ ├── func speak() +│ ├── enum CarType +│ ├── func getPreferredCarType() -> CarType +│ ├── enum CarType: UInt8 +│ ├── enum class CarType: UInt8 +│ ├── func myFunction(fname: String, age: Int) +│ └── func myFunctionWithMultipleParameters( +│ fname: String, +│ lname: String, +│ age: Int, +│ address: String, +│ phoneNumber: String +│ ) ├── 📄 test.lean (289 tokens, 42 lines) ├── 📄 test.capnp (117 tokens, 30 lines) ├── 📄 test.graphql (66 tokens, 21 lines) diff --git 
a/tests/golden/legacy/trees_v1/multi_seed.txt b/tests/golden/legacy/trees_v1/multi_seed.txt index c23221a..d8cfa50 100644 --- a/tests/golden/legacy/trees_v1/multi_seed.txt +++ b/tests/golden/legacy/trees_v1/multi_seed.txt @@ -3,8 +3,99 @@ │ ├── 📄 addamt.cobol (441 tokens, 40 lines) │ ├── 📄 CUSTOMER-INVOICE.CBL (412 tokens, 60 lines) │ ├── 📄 JavaTest.java (578 tokens, 86 lines) +│ │ ├── abstract class LivingBeing +│ │ ├── abstract void breathe() +│ │ ├── interface Communicator +│ │ ├── String communicate() +│ │ ├── @Log +│ │ ├── @Getter +│ │ ├── @Setter +│ │ ├── class Person extends LivingBeing implements Communicator +│ │ ├── Person(String name, int age) +│ │ ├── @Override +│ │ ├── void breathe() +│ │ ├── @Override +│ │ ├── public String communicate() +│ │ ├── void greet() +│ │ ├── String personalizedGreeting(String greeting, Optional +│ │ │ includeAge) +│ │ ├── @Singleton +│ │ ├── @RestController +│ │ ├── @SpringBootApplication +│ │ ├── public class Example +│ │ ├── @Inject +│ │ ├── public Example(Person person) +│ │ ├── @RequestMapping("/greet") +│ │ ├── String home(@RequestParam(value = "name", defaultValue = +│ │ │ "World") String name, +│ │ │ @RequestParam(value = "age", defaultValue = "30") +│ │ │ int age) +│ │ └── public static void main(String[] args) │ ├── 📄 JuliaTest.jl (381 tokens, 63 lines) │ ├── 📄 KotlinTest.kt (974 tokens, 171 lines) +│ │ ├── data class Person(val name: String) +│ │ ├── fun greet(person: Person) +│ │ ├── fun processItems(items: List, processor: (T) -> Unit) +│ │ ├── interface Source +│ │ ├── fun nextT(): T +│ │ ├── fun MutableList.swap(index1: Int, index2: Int) +│ │ ├── fun Any?.toString(): String +│ │ ├── tailrec fun findFixPoint(x: Double = 1.0): Double +│ │ ├── class GenericRepository +│ │ ├── fun getItem(id: Int): T? 
+│ │ ├── sealed interface Error +│ │ ├── sealed class IOError(): Error +│ │ ├── object Runner +│ │ ├── inline fun , T> run() : T +│ │ ├── infix fun Int.shl(x: Int): Int +│ │ ├── class MyStringCollection +│ │ ├── infix fun add(s: String) +│ │ ├── fun build() +│ │ ├── open class Base(p: Int) +│ │ ├── class Derived(p: Int) : Base(p) +│ │ ├── open class Shape +│ │ ├── open fun draw() +│ │ ├── fun fill() +│ │ ├── open fun edge(case: Int) +│ │ ├── interface Thingy +│ │ ├── fun edge() +│ │ ├── class Circle() : Shape(), Thingy +│ │ ├── override fun draw() +│ │ ├── final override fun edge(case: Int) +│ │ ├── interface Base +│ │ ├── fun print() +│ │ ├── class BaseImpl(val x: Int) : Base +│ │ ├── override fun print() +│ │ ├── internal class Derived(b: Base) : Base by b +│ │ ├── class Person constructor(firstName: String) +│ │ ├── class People( +│ │ │ firstNames: Array, +│ │ │ ages: Array(42), +│ │ │ ) +│ │ ├── fun edgeCases(): Boolean +│ │ ├── class Alien public @Inject constructor( +│ │ │ val firstName: String, +│ │ │ val lastName: String, +│ │ │ var age: Int, +│ │ │ val pets: MutableList = mutableListOf(), +│ │ │ ) +│ │ ├── fun objectOriented(): String +│ │ ├── enum class IntArithmetics : BinaryOperator, IntBinaryOperator +│ │ ├── PLUS { +│ │ │ override fun apply(t: Int, u: Int): Int +│ │ ├── TIMES { +│ │ │ override fun apply(t: Int, u: Int): Int +│ │ ├── override fun applyAsInt(t: Int, u: Int) +│ │ ├── fun reformat( +│ │ │ str: String, +│ │ │ normalizeCase: Boolean = true, +│ │ │ upperCaseFirstLetter: Boolean = true, +│ │ │ divideByCamelHumps: Boolean = false, +│ │ │ wordSeparator: Char = ' ', +│ │ │ ) +│ │ ├── operator fun Point.unaryMinus() +│ │ ├── abstract class Polygon +│ │ └── abstract fun draw() │ ├── 📄 lesson.cbl (635 tokens, 78 lines) │ ├── 📄 LuaTest.lua (83 tokens, 16 lines) │ ├── 📄 ObjectiveCTest.m (62 tokens, 16 lines)