diff --git a/crates/tree_plus_core/src/extract/mod.rs b/crates/tree_plus_core/src/extract/mod.rs index 5e28bd4..4485d89 100644 --- a/crates/tree_plus_core/src/extract/mod.rs +++ b/crates/tree_plus_core/src/extract/mod.rs @@ -109,11 +109,11 @@ const MARKDOWN_EXTENSIONS: &[&str] = &[".md", ".markdown", ".mdx", ".mdc"]; /// Rust port version 1. Files still get TODO/BUG/NOTE markers; component /// extraction is tracked in docs/language-roadmap.md. const DEFERRED_EXTENSIONS: &[&str] = &[ - ".php", ".kt", ".swift", ".sh", ".ps1", ".zig", ".rb", ".sql", ".graphql", ".cs", ".jl", - ".scala", ".java", ".pl", ".hs", ".fs", ".lisp", ".clj", ".scm", ".el", ".rkt", ".erl", ".hrl", - ".capnp", ".proto", ".tex", ".lean", ".f", ".for", ".f77", ".f90", ".f95", ".f03", ".f08", - ".tf", ".thy", ".lua", ".tcl", ".m", ".r", ".nb", ".wl", ".matlab", ".ml", ".cbl", ".cobol", - ".apl", ".metal", ".wgsl", ".html", + ".php", ".kt", ".swift", ".sh", ".ps1", ".zig", ".rb", ".cs", ".jl", ".scala", ".java", ".pl", + ".hs", ".fs", ".lisp", ".clj", ".scm", ".el", ".rkt", ".erl", ".hrl", ".capnp", ".tex", + ".lean", ".f", ".for", ".f77", ".f90", ".f95", ".f03", ".f08", ".tf", ".thy", ".lua", ".tcl", + ".m", ".r", ".nb", ".wl", ".matlab", ".ml", ".cbl", ".cobol", ".apl", ".metal", ".wgsl", + ".html", ]; /// Whether this extension is deferred (legacy support, no Rust port yet). @@ -207,6 +207,9 @@ fn try_extract_components(path: &Path, syntax: bool) -> ExtractResult { e if C_EXTENSIONS.contains(&e) => treesitter::c_cpp::extract(&content, e)?, ".rs" => treesitter::rust::extract(&content, syntax)?, ".go" => treesitter::go::extract(&content)?, + ".sql" => simple::sql(&content), + ".graphql" => simple::graphql(&content), + ".proto" => simple::protobuf(&content), ".jsonl" => data::extract_jsonl(&content)?, ".env" => simple::dot_env(&content), ".txt" => { diff --git a/crates/tree_plus_core/src/extract/simple.rs b/crates/tree_plus_core/src/extract/simple.rs index 38a00e1..5a47479 100644 --- a/crates/tree_plus_core/src/extract/simple.rs +++ b/crates/tree_plus_core/src/extract/simple.rs @@ -167,6 +167,59 @@ pub fn environment_ts(content: &str) -> Vec { } } +/// Legacy `parse_sql`: `CREATE TABLE name (body);` statements, with the +/// body lines re-emitted under a 3-space indent. +pub fn sql(content: &str) -> Vec { + static CREATE_TABLE_RE: LazyLock = + LazyLock::new(|| Regex::new(r"(?s)CREATE TABLE (\w+) \((.*?)\);").unwrap()); + let mut components = Vec::new(); + for caps in CREATE_TABLE_RE.captures_iter(content) { + components.push(format!("CREATE TABLE {}", &caps[1])); + for line in caps[2].trim().split('\n') { + components.push(format!(" {}", line.trim())); + } + } + components +} + +/// Legacy `parse_graphql`: type/enum headers without the brace, all other +/// non-comment lines indented; bare `}` lines skipped. +pub fn graphql(content: &str) -> Vec { + let mut components = Vec::new(); + for line in content.lines() { + let line = line.trim(); + if line == "}" { + continue; + } + if line.starts_with("type") || line.starts_with("enum") { + components.push(line.trim_end_matches([' ', '{']).to_string()); + } else if !line.is_empty() && !line.starts_with('#') { + components.push(format!(" {line}")); + } + } + components +} + +/// Legacy `parse_grpc` (protobuf): syntax/service/message/rpc lines plus +/// `=`-bearing field lines (field numbers kept, `;`-tail dropped). +pub fn protobuf(content: &str) -> Vec { + let mut components = Vec::new(); + for line in content.split('\n') { + let line = line.trim(); + if line.starts_with("syntax") { + components.push(line.trim_end_matches(';').to_string()); + } else if line.starts_with("service") || line.starts_with("message") { + components.push(line.trim_end_matches(['{', ' ']).to_string()); + } else if line.starts_with("rpc") { + components.push(format!(" {}", line.trim_end_matches([' ', '{', '}']))); + } else if !line.is_empty() && !line.starts_with("//") && line.contains('=') { + let field_info = line.split(';').next().unwrap_or(line); + components.push(format!(" {field_info}")); + } + } + components +} + #[cfg(test)] mod tests { use super::*; diff --git a/crates/tree_plus_core/tests/golden_parity.rs b/crates/tree_plus_core/tests/golden_parity.rs index 8ec8f33..256a217 100644 --- a/crates/tree_plus_core/tests/golden_parity.rs +++ b/crates/tree_plus_core/tests/golden_parity.rs @@ -28,6 +28,9 @@ fn in_v1_scope(path: &str) -> bool { const V1_EXTS: &[&str] = &[ ".py", ".pyi", + ".sql", + ".graphql", + ".proto", ".rs", ".go", ".js", diff --git a/crates/tree_plus_core/tests/robustness.rs b/crates/tree_plus_core/tests/robustness.rs index 64e42dd..da1d33b 100644 --- a/crates/tree_plus_core/tests/robustness.rs +++ b/crates/tree_plus_core/tests/robustness.rs @@ -28,7 +28,7 @@ impl Rng { const EXTENSIONS: &[&str] = &[ "py", "rs", "ts", "tsx", "js", "c", "cpp", "h", "md", "json", "jsonl", "yml", "toml", "csv", - "txt", "env", "rst", + "txt", "env", "rst", "sql", "graphql", "proto", ]; fn write_temp(name: &str, bytes: &[u8]) -> PathBuf { diff --git a/docs/language-roadmap.md b/docs/language-roadmap.md index a571141..d692c1f 100644 --- a/docs/language-roadmap.md +++ b/docs/language-roadmap.md @@ -1,7 +1,8 @@ # Language Roadmap (Rust Port) Version-1 implements: Rust, Python, JavaScript, TypeScript, C, C++, Go, -Markdown (+ RST), JSON (package.json / schema / RPC / OpenRPC), JSONL, YAML, +Markdown (+ RST), SQL, GraphQL, Protobuf, +JSON (package.json / schema / RPC / OpenRPC), JSONL, YAML, TOML (Cargo/pyproject), CSV, Makefile/Justfile, .env, requirements.txt, SQLite, and TODO/BUG/NOTE markers everywhere except `.md`/`.txt` (legacy rule). @@ -40,9 +41,6 @@ availability → suggested path → missing tests. | COBOL | .cbl .cobol | parse_cbl | no maintained | regex port (division/para lines) | port group1 goldens | | Fortran | .f .f90 ... | parse_fortran | yes (community) | regex port | port golden `fortran_test.f90` | | APL | .apl | parse_apl | no | regex port | port golden `apl_test.apl` | -| SQL | .sql | parse_sql | yes (community) | regex port (CREATE TABLE) | port golden `sql_test.sql` | -| GraphQL | .graphql | parse_graphql | yes (community) | line port (trivial) | port golden `graphql_test.graphql` | -| Protobuf | .proto | parse_grpc | yes (community) | line/regex port | port golden `proto_test.proto` | | Cap'n Proto | .capnp | parse_capnp | no | regex port | port golden `capnp_test.capnp` | | LaTeX | .tex | parse_tex | yes (community) | regex port (sections) | port golden `tex_test.tex` | | Lean | .lean | parse_lean | community | regex port | port golden `lean_test.lean` | @@ -60,5 +58,4 @@ highlighting, tiktoken tokenizers — see docs/rust-port-differences.md. ## Suggested order of attack 1. Java, Kotlin, C#, Ruby, Bash (mature grammars, heavily used). -2. SQL/GraphQL/Protobuf/requirements-style line formats (cheap regex ports). 3. The long tail, prioritized by user demand. diff --git a/tests/golden/generate_legacy_goldens.py b/tests/golden/generate_legacy_goldens.py index 6976136..ab6782b 100644 --- a/tests/golden/generate_legacy_goldens.py +++ b/tests/golden/generate_legacy_goldens.py @@ -51,10 +51,10 @@ def sanitize(p: Path) -> str: DEFERRED_PARSERS = [ "parse_php", "parse_kt", "parse_swift", "parse_bash", - "parse_ps1", "parse_zig", "parse_rb", "parse_sql", "parse_graphql", + "parse_ps1", "parse_zig", "parse_rb", "parse_cs", "parse_jl", "parse_scala", "parse_java", "parse_perl", "parse_hs", "parse_fsharp", "parse_lisp", "parse_erl", "parse_capnp", - "parse_grpc", "parse_tex", "parse_lean", "parse_fortran", "parse_tf", + "parse_tex", "parse_lean", "parse_fortran", "parse_tf", "parse_isabelle", "parse_lua", "parse_tcl", "parse_objective_c", "parse_matlab", "parse_r", "parse_mathematica", "parse_ocaml", "parse_cbl", "parse_apl", "parse_metal", "parse_wgsl", diff --git a/tests/golden/legacy/trees/repo_concise.txt b/tests/golden/legacy/trees/repo_concise.txt index e2786e5..e4238dc 100644 --- a/tests/golden/legacy/trees/repo_concise.txt +++ b/tests/golden/legacy/trees/repo_concise.txt @@ -1,4 +1,4 @@ -📁 tree_plus (44 folders, 443 files) +📁 tree_plus (44 folders, 442 files) ├── 📄 .env.test (4 tokens, 0 lines) ├── 📁 .github (2 folders, 4 files) │ ├── 📄 dependabot.yml (128 tokens, 11 lines) @@ -9,7 +9,6 @@ ├── 📄 .gitignore (226 tokens, 60 lines) ├── 📄 .mcp_server.pid (2 tokens, 1 line) ├── 📄 Cargo.toml (212 tokens, 30 lines) -├── 📄 claude-fable-5-rust-rewrite-goal.md (3,394 tokens, 434 lines) ├── 📁 coverage (1 folder, 1 file) │ └── 📄 lcov.info (17,359 tokens, 2,180 lines) ├── 📁 crates (11 folders, 28 files) @@ -33,28 +32,28 @@ │ │ │ ├── 📄 data.rs (5,115 tokens, 582 lines) │ │ │ ├── 📄 markdown.rs (1,531 tokens, 180 lines) │ │ │ ├── 📄 markers.rs (438 tokens, 60 lines) -│ │ │ ├── 📄 mod.rs (2,532 tokens, 278 lines) -│ │ │ ├── 📄 simple.rs (1,629 tokens, 216 lines) +│ │ │ ├── 📄 mod.rs (2,559 tokens, 281 lines) +│ │ │ ├── 📄 simple.rs (2,170 tokens, 269 lines) │ │ │ └── 📁 treesitter (1 folder, 6 files) -│ │ │ ├── 📄 c_cpp.rs (5,979 tokens, 591 lines) -│ │ │ ├── 📄 go.rs (1,364 tokens, 152 lines) +│ │ │ ├── 📄 c_cpp.rs (6,440 tokens, 635 lines) +│ │ │ ├── 📄 go.rs (1,439 tokens, 158 lines) │ │ │ ├── 📄 mod.rs (491 tokens, 67 lines) -│ │ │ ├── 📄 python.rs (3,487 tokens, 346 lines) -│ │ │ ├── 📄 rust.rs (2,785 tokens, 312 lines) -│ │ │ └── 📄 typescript.rs (3,897 tokens, 420 lines) +│ │ │ ├── 📄 python.rs (3,635 tokens, 353 lines) +│ │ │ ├── 📄 rust.rs (2,828 tokens, 315 lines) +│ │ │ └── 📄 typescript.rs (4,004 tokens, 431 lines) │ │ ├── 📄 ignore.rs (2,144 tokens, 307 lines) │ │ ├── 📄 lib.rs (222 tokens, 29 lines) │ │ ├── 📄 model.rs (928 tokens, 125 lines) │ │ ├── 📄 render.rs (2,741 tokens, 347 lines) │ │ ├── 📄 sort.rs (1,693 tokens, 214 lines) -│ │ └── 📄 walk.rs (2,245 tokens, 260 lines) +│ │ └── 📄 walk.rs (2,426 tokens, 278 lines) │ └── 📁 tests (1 folder, 2 files) -│ ├── 📄 golden_parity.rs (1,812 tokens, 244 lines) -│ └── 📄 robustness.rs (743 tokens, 86 lines) +│ ├── 📄 golden_parity.rs (1,826 tokens, 247 lines) +│ └── 📄 robustness.rs (1,213 tokens, 143 lines) ├── 📁 docs (1 folder, 4 files) │ ├── 📄 architecture.md (1,392 tokens, 113 lines) -│ ├── 📄 language-roadmap.md (1,262 tokens, 64 lines) -│ ├── 📄 performance.md (690 tokens, 64 lines) +│ ├── 📄 language-roadmap.md (1,238 tokens, 64 lines) +│ ├── 📄 performance.md (1,121 tokens, 98 lines) │ └── 📄 rust-port-differences.md (1,022 tokens, 78 lines) ├── 📄 Justfile (141 tokens, 23 lines) ├── 📄 LICENSE (2,744 tokens, 81 lines) @@ -78,7 +77,7 @@ │ │ └── 📄 logging.py (11 tokens, 0 lines) │ ├── 📁 golden (6 folders, 247 files) │ │ ├── 📄 diff_components.py (417 tokens, 59 lines) -│ │ ├── 📄 generate_legacy_goldens.py (1,496 tokens, 156 lines) +│ │ ├── 📄 generate_legacy_goldens.py (1,485 tokens, 156 lines) │ │ └── 📁 legacy (5 folders, 245 files) │ │ ├── 📁 components (1 folder, 109 files) │ │ │ ├── 📄 tests__dot_dot__my_test_file.py.json (21 tokens, 5 lines) @@ -530,15 +529,15 @@ │ │ │ ├── 📄 more_languages_group_todo.txt (1,491 tokens, 111 lines) │ │ │ ├── 📄 multi_seed.txt (3,789 tokens, 427 lines) │ │ │ ├── 📄 path_to_test.txt (445 tokens, 51 lines) -│ │ │ └── 📄 repo_concise.txt (9,776 tokens, 699 lines) +│ │ │ └── 📄 repo_concise.txt (9,829 tokens, 703 lines) │ │ └── 📁 trees_v1 (1 folder, 13 files) │ │ ├── 📄 dot_dot.txt (99 tokens, 10 lines) -│ │ ├── 📄 more_languages.txt (10,912 tokens, 1,135 lines) +│ │ ├── 📄 more_languages.txt (11,402 tokens, 1,189 lines) │ │ ├── 📄 more_languages_group1.txt (1,130 tokens, 146 lines) │ │ ├── 📄 more_languages_group2.txt (531 tokens, 67 lines) -│ │ ├── 📄 more_languages_group3.txt (1,240 tokens, 149 lines) +│ │ ├── 📄 more_languages_group3.txt (1,488 tokens, 181 lines) │ │ ├── 📄 more_languages_group4.txt (1,020 tokens, 123 lines) -│ │ ├── 📄 more_languages_group5.txt (2,099 tokens, 235 lines) +│ │ ├── 📄 more_languages_group5.txt (2,288 tokens, 257 lines) │ │ ├── 📄 more_languages_group6.txt (2,821 tokens, 300 lines) │ │ ├── 📄 more_languages_group7.txt (705 tokens, 83 lines) │ │ ├── 📄 more_languages_group_lisp.txt (52 tokens, 5 lines) diff --git a/tests/golden/legacy/trees_v1/more_languages.txt b/tests/golden/legacy/trees_v1/more_languages.txt index 897f9fb..dcfda06 100644 --- a/tests/golden/legacy/trees_v1/more_languages.txt +++ b/tests/golden/legacy/trees_v1/more_languages.txt @@ -305,7 +305,39 @@ │ ├── 📄 test.lean (289 tokens, 42 lines) │ ├── 📄 test.capnp (117 tokens, 30 lines) │ ├── 📄 test.graphql (66 tokens, 21 lines) +│ │ ├── type Query +│ │ ├── getBooks: [Book] +│ │ ├── getAuthors: [Author] +│ │ ├── type Mutation +│ │ ├── addBook(title: String, author: String): Book +│ │ ├── removeBook(id: ID): Book +│ │ ├── type Book +│ │ ├── id: ID +│ │ ├── title: String +│ │ ├── author: Author +│ │ ├── type Author +│ │ ├── id: ID +│ │ ├── name: String +│ │ └── books: [Book] │ ├── 📄 test.proto (142 tokens, 34 lines) +│ │ ├── syntax = "proto3" +│ │ ├── service EmployeeService +│ │ ├── rpc GetEmployee(EmployeeId) returns (EmployeeInfo) +│ │ ├── rpc AddEmployee(EmployeeData) returns (EmployeeInfo) +│ │ ├── rpc UpdateEmployee(EmployeeUpdate) returns (EmployeeInfo) +│ │ ├── message EmployeeId +│ │ ├── int32 id = 1 +│ │ ├── message EmployeeInfo +│ │ ├── int32 id = 1 +│ │ ├── string name = 2 +│ │ ├── string role = 3 +│ │ ├── message EmployeeData +│ │ ├── string name = 1 +│ │ ├── string role = 2 +│ │ ├── message EmployeeUpdate +│ │ ├── int32 id = 1 +│ │ ├── string name = 2 +│ │ └── string role = 3 │ ├── 📄 test.sqlite (0 tokens, 0 lines) │ │ ├── students table: │ │ ├── id integer primary key @@ -613,6 +645,28 @@ │ │ ├── fn draw(&self) │ │ └── fn main() │ ├── 📄 sql_test.sql (270 tokens, 51 lines) +│ │ ├── CREATE TABLE promoters +│ │ ├── user_id serial PRIMARY KEY, +│ │ ├── type varchar(20) NOT NULL, +│ │ ├── username varchar(20) NOT NULL, +│ │ ├── password varchar(20) NOT NULL, +│ │ ├── email varchar(30) NOT NULL, +│ │ ├── phone varchar(20) NOT NULL, +│ │ ├── promocode varchar(20), +│ │ ├── info json, +│ │ ├── going text[], +│ │ ├── invites text[], +│ │ ├── balance integer NOT NULL, +│ │ ├── rewards text[], +│ │ ├── created timestamp +│ │ ├── CREATE TABLE events +│ │ ├── event_id serial PRIMARY KEY, +│ │ ├── name varchar(64) NOT NULL, +│ │ ├── date varchar(64) NOT NULL, +│ │ ├── location varchar(64) NOT NULL, +│ │ ├── performer varchar(64) NOT NULL, +│ │ ├── rewards json, +│ │ └── created timestamp │ ├── 📄 standard-app-routing.module.ts (100 tokens, 16 lines) │ │ └── const routes: Routes = [ │ │ { path: '', component: HomeComponent }, diff --git a/tests/golden/legacy/trees_v1/more_languages_group3.txt b/tests/golden/legacy/trees_v1/more_languages_group3.txt index 1b4769e..f679f0f 100644 --- a/tests/golden/legacy/trees_v1/more_languages_group3.txt +++ b/tests/golden/legacy/trees_v1/more_languages_group3.txt @@ -88,7 +88,39 @@ ├── 📄 test.lean (289 tokens, 42 lines) ├── 📄 test.capnp (117 tokens, 30 lines) ├── 📄 test.graphql (66 tokens, 21 lines) +│ ├── type Query +│ ├── getBooks: [Book] +│ ├── getAuthors: [Author] +│ ├── type Mutation +│ ├── addBook(title: String, author: String): Book +│ ├── removeBook(id: ID): Book +│ ├── type Book +│ ├── id: ID +│ ├── title: String +│ ├── author: Author +│ ├── type Author +│ ├── id: ID +│ ├── name: String +│ └── books: [Book] ├── 📄 test.proto (142 tokens, 34 lines) +│ ├── syntax = "proto3" +│ ├── service EmployeeService +│ ├── rpc GetEmployee(EmployeeId) returns (EmployeeInfo) +│ ├── rpc AddEmployee(EmployeeData) returns (EmployeeInfo) +│ ├── rpc UpdateEmployee(EmployeeUpdate) returns (EmployeeInfo) +│ ├── message EmployeeId +│ ├── int32 id = 1 +│ ├── message EmployeeInfo +│ ├── int32 id = 1 +│ ├── string name = 2 +│ ├── string role = 3 +│ ├── message EmployeeData +│ ├── string name = 1 +│ ├── string role = 2 +│ ├── message EmployeeUpdate +│ ├── int32 id = 1 +│ ├── string name = 2 +│ └── string role = 3 ├── 📄 test.sqlite (0 tokens, 0 lines) │ ├── students table: │ ├── id integer primary key diff --git a/tests/golden/legacy/trees_v1/more_languages_group5.txt b/tests/golden/legacy/trees_v1/more_languages_group5.txt index 47b0a19..9acc6f2 100644 --- a/tests/golden/legacy/trees_v1/more_languages_group5.txt +++ b/tests/golden/legacy/trees_v1/more_languages_group5.txt @@ -121,6 +121,28 @@ │ ├── fn draw(&self) │ └── fn main() ├── 📄 sql_test.sql (270 tokens, 51 lines) +│ ├── CREATE TABLE promoters +│ ├── user_id serial PRIMARY KEY, +│ ├── type varchar(20) NOT NULL, +│ ├── username varchar(20) NOT NULL, +│ ├── password varchar(20) NOT NULL, +│ ├── email varchar(30) NOT NULL, +│ ├── phone varchar(20) NOT NULL, +│ ├── promocode varchar(20), +│ ├── info json, +│ ├── going text[], +│ ├── invites text[], +│ ├── balance integer NOT NULL, +│ ├── rewards text[], +│ ├── created timestamp +│ ├── CREATE TABLE events +│ ├── event_id serial PRIMARY KEY, +│ ├── name varchar(64) NOT NULL, +│ ├── date varchar(64) NOT NULL, +│ ├── location varchar(64) NOT NULL, +│ ├── performer varchar(64) NOT NULL, +│ ├── rewards json, +│ └── created timestamp ├── 📄 standard-app-routing.module.ts (100 tokens, 16 lines) │ └── const routes: Routes = [ │ { path: '', component: HomeComponent },