Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions crates/tree_plus_core/src/extract/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -109,11 +109,11 @@ const MARKDOWN_EXTENSIONS: &[&str] = &[".md", ".markdown", ".mdx", ".mdc"];
/// Rust port version 1. Files still get TODO/BUG/NOTE markers; component
/// extraction is tracked in docs/language-roadmap.md.
const DEFERRED_EXTENSIONS: &[&str] = &[
    // Each extension is listed exactly once. `.sql`, `.graphql` and `.proto` are
    // deliberately absent: `try_extract_components` dispatches them to the
    // line-based extractors in `simple`, so they are no longer deferred.
    ".php", ".kt", ".swift", ".sh", ".ps1", ".zig", ".rb", ".cs", ".jl", ".scala", ".java", ".pl",
    ".hs", ".fs", ".lisp", ".clj", ".scm", ".el", ".rkt", ".erl", ".hrl", ".capnp", ".tex",
    ".lean", ".f", ".for", ".f77", ".f90", ".f95", ".f03", ".f08", ".tf", ".thy", ".lua", ".tcl",
    ".m", ".r", ".nb", ".wl", ".matlab", ".ml", ".cbl", ".cobol", ".apl", ".metal", ".wgsl",
    ".html",
];

/// Whether this extension is deferred (legacy support, no Rust port yet).
Expand Down Expand Up @@ -207,6 +207,9 @@ fn try_extract_components(path: &Path, syntax: bool) -> ExtractResult {
e if C_EXTENSIONS.contains(&e) => treesitter::c_cpp::extract(&content, e)?,
".rs" => treesitter::rust::extract(&content, syntax)?,
".go" => treesitter::go::extract(&content)?,
".sql" => simple::sql(&content),
".graphql" => simple::graphql(&content),
".proto" => simple::protobuf(&content),
".jsonl" => data::extract_jsonl(&content)?,
".env" => simple::dot_env(&content),
".txt" => {
Expand Down
53 changes: 53 additions & 0 deletions crates/tree_plus_core/src/extract/simple.rs
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,59 @@ pub fn environment_ts(content: &str) -> Vec<String> {
}
}

/// Legacy `parse_sql`: extracts `CREATE TABLE name (body);` statements.
///
/// For every match the result holds one `CREATE TABLE <name>` header followed
/// by each line of the parenthesised body, trimmed and re-emitted under a
/// 3-space indent. Content without a matching statement yields an empty vector.
pub fn sql(content: &str) -> Vec<String> {
    // `(?s)` lets `.` cross newlines so a multi-line column list is captured
    // as one group; `.*?` is lazy so the match stops at the first `);`.
    static CREATE_TABLE_RE: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r"(?s)CREATE TABLE (\w+) \((.*?)\);")
            .expect("CREATE TABLE pattern is a valid static regex")
    });
    let mut components = Vec::new();
    for caps in CREATE_TABLE_RE.captures_iter(content) {
        components.push(format!("CREATE TABLE {}", &caps[1]));
        // Split on '\n' only; the per-line trim also drops a trailing '\r'.
        components.extend(
            caps[2]
                .trim()
                .split('\n')
                .map(|line| format!("   {}", line.trim())),
        );
    }
    components
}

/// Legacy `parse_graphql` line port.
///
/// Every line is trimmed first. A bare `}` is dropped. A line starting with
/// `type` or `enum` becomes a header with trailing spaces and `{` removed.
/// Any remaining line that is neither empty nor a `#` comment is emitted
/// with a leading indent.
pub fn graphql(content: &str) -> Vec<String> {
    content
        .lines()
        .map(str::trim)
        .filter(|text| *text != "}")
        .filter_map(|text| {
            let is_header = ["type", "enum"].iter().any(|kw| text.starts_with(*kw));
            if is_header {
                let header = text.trim_end_matches(|c: char| c == ' ' || c == '{');
                Some(header.to_owned())
            } else if text.is_empty() || text.starts_with('#') {
                None
            } else {
                Some(format!(" {text}"))
            }
        })
        .collect()
}

/// Legacy `parse_grpc` (protobuf) line port.
///
/// Each `\n`-separated line is trimmed and classified by its prefix:
/// `syntax` lines lose trailing `;`, `service`/`message` lines lose trailing
/// spaces and `{`, and `rpc` lines are indented after dropping trailing
/// spaces and braces. Any other non-`//` line containing `=` is indented and
/// cut at the first `;`, which keeps the field number. All other lines are
/// ignored.
pub fn protobuf(content: &str) -> Vec<String> {
    content
        .split('\n')
        .filter_map(|raw| {
            let text = raw.trim();
            let entry = if text.starts_with("syntax") {
                text.trim_end_matches(';').to_owned()
            } else if ["service", "message"].iter().any(|kw| text.starts_with(*kw)) {
                text.trim_end_matches(|c: char| matches!(c, '{' | ' '))
                    .to_owned()
            } else if text.starts_with("rpc") {
                let signature = text.trim_end_matches(|c: char| matches!(c, ' ' | '{' | '}'));
                format!(" {signature}")
            } else if text.contains('=') && !text.starts_with("//") {
                // Everything before the first `;`, or the whole line if there is none.
                let declaration = text.split_once(';').map_or(text, |(head, _)| head);
                format!(" {declaration}")
            } else {
                return None;
            };
            Some(entry)
        })
        .collect()
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
3 changes: 3 additions & 0 deletions crates/tree_plus_core/tests/golden_parity.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ fn in_v1_scope(path: &str) -> bool {
const V1_EXTS: &[&str] = &[
".py",
".pyi",
".sql",
".graphql",
".proto",
".rs",
".go",
".js",
Expand Down
2 changes: 1 addition & 1 deletion crates/tree_plus_core/tests/robustness.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ impl Rng {

/// Extensions (written without the leading dot) listed for the robustness
/// tests; each entry appears exactly once.
const EXTENSIONS: &[&str] = &[
    "py", "rs", "ts", "tsx", "js", "c", "cpp", "h", "md", "json", "jsonl", "yml", "toml", "csv",
    "txt", "env", "rst", "sql", "graphql", "proto",
];

fn write_temp(name: &str, bytes: &[u8]) -> PathBuf {
Expand Down
7 changes: 2 additions & 5 deletions docs/language-roadmap.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# Language Roadmap (Rust Port)

Version-1 implements: Rust, Python, JavaScript, TypeScript, C, C++, Go,
Markdown (+ RST), SQL, GraphQL, Protobuf,
JSON (package.json / schema / RPC / OpenRPC), JSONL, YAML,
TOML (Cargo/pyproject), CSV, Makefile/Justfile, .env, requirements.txt,
SQLite, and TODO/BUG/NOTE markers everywhere except `.md`/`.txt` (legacy
rule).
Expand Down Expand Up @@ -40,9 +41,6 @@ availability → suggested path → missing tests.
| COBOL | .cbl .cobol | parse_cbl | no maintained | regex port (division/para lines) | port group1 goldens |
| Fortran | .f .f90 ... | parse_fortran | yes (community) | regex port | port golden `fortran_test.f90` |
| APL | .apl | parse_apl | no | regex port | port golden `apl_test.apl` |
| SQL | .sql | parse_sql | yes (community) | regex port (CREATE TABLE) | port golden `sql_test.sql` |
| GraphQL | .graphql | parse_graphql | yes (community) | line port (trivial) | port golden `graphql_test.graphql` |
| Protobuf | .proto | parse_grpc | yes (community) | line/regex port | port golden `proto_test.proto` |
| Cap'n Proto | .capnp | parse_capnp | no | regex port | port golden `capnp_test.capnp` |
| LaTeX | .tex | parse_tex | yes (community) | regex port (sections) | port golden `tex_test.tex` |
| Lean | .lean | parse_lean | community | regex port | port golden `lean_test.lean` |
Expand All @@ -60,5 +58,4 @@ highlighting, tiktoken tokenizers — see docs/rust-port-differences.md.
## Suggested order of attack

1. Java, Kotlin, C#, Ruby, Bash (mature grammars, heavily used).
2. SQL/GraphQL/Protobuf/requirements-style line formats (cheap regex ports).
3. The long tail, prioritized by user demand.
4 changes: 2 additions & 2 deletions tests/golden/generate_legacy_goldens.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,10 @@ def sanitize(p: Path) -> str:

DEFERRED_PARSERS = [
"parse_php", "parse_kt", "parse_swift", "parse_bash",
"parse_ps1", "parse_zig", "parse_rb", "parse_sql", "parse_graphql",
"parse_ps1", "parse_zig", "parse_rb",
"parse_cs", "parse_jl", "parse_scala", "parse_java", "parse_perl",
"parse_hs", "parse_fsharp", "parse_lisp", "parse_erl", "parse_capnp",
"parse_grpc", "parse_tex", "parse_lean", "parse_fortran", "parse_tf",
"parse_tex", "parse_lean", "parse_fortran", "parse_tf",
"parse_isabelle", "parse_lua", "parse_tcl", "parse_objective_c",
"parse_matlab", "parse_r", "parse_mathematica", "parse_ocaml",
"parse_cbl", "parse_apl", "parse_metal", "parse_wgsl",
Expand Down
37 changes: 18 additions & 19 deletions tests/golden/legacy/trees/repo_concise.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
📁 tree_plus (44 folders, 443 files)
📁 tree_plus (44 folders, 442 files)
├── 📄 .env.test (4 tokens, 0 lines)
├── 📁 .github (2 folders, 4 files)
│ ├── 📄 dependabot.yml (128 tokens, 11 lines)
Expand All @@ -9,7 +9,6 @@
├── 📄 .gitignore (226 tokens, 60 lines)
├── 📄 .mcp_server.pid (2 tokens, 1 line)
├── 📄 Cargo.toml (212 tokens, 30 lines)
├── 📄 claude-fable-5-rust-rewrite-goal.md (3,394 tokens, 434 lines)
├── 📁 coverage (1 folder, 1 file)
│ └── 📄 lcov.info (17,359 tokens, 2,180 lines)
├── 📁 crates (11 folders, 28 files)
Expand All @@ -33,28 +32,28 @@
│ │ │ ├── 📄 data.rs (5,115 tokens, 582 lines)
│ │ │ ├── 📄 markdown.rs (1,531 tokens, 180 lines)
│ │ │ ├── 📄 markers.rs (438 tokens, 60 lines)
│ │ │ ├── 📄 mod.rs (2,532 tokens, 278 lines)
│ │ │ ├── 📄 simple.rs (1,629 tokens, 216 lines)
│ │ │ ├── 📄 mod.rs (2,559 tokens, 281 lines)
│ │ │ ├── 📄 simple.rs (2,170 tokens, 269 lines)
│ │ │ └── 📁 treesitter (1 folder, 6 files)
│ │ │ ├── 📄 c_cpp.rs (5,979 tokens, 591 lines)
│ │ │ ├── 📄 go.rs (1,364 tokens, 152 lines)
│ │ │ ├── 📄 c_cpp.rs (6,440 tokens, 635 lines)
│ │ │ ├── 📄 go.rs (1,439 tokens, 158 lines)
│ │ │ ├── 📄 mod.rs (491 tokens, 67 lines)
│ │ │ ├── 📄 python.rs (3,487 tokens, 346 lines)
│ │ │ ├── 📄 rust.rs (2,785 tokens, 312 lines)
│ │ │ └── 📄 typescript.rs (3,897 tokens, 420 lines)
│ │ │ ├── 📄 python.rs (3,635 tokens, 353 lines)
│ │ │ ├── 📄 rust.rs (2,828 tokens, 315 lines)
│ │ │ └── 📄 typescript.rs (4,004 tokens, 431 lines)
│ │ ├── 📄 ignore.rs (2,144 tokens, 307 lines)
│ │ ├── 📄 lib.rs (222 tokens, 29 lines)
│ │ ├── 📄 model.rs (928 tokens, 125 lines)
│ │ ├── 📄 render.rs (2,741 tokens, 347 lines)
│ │ ├── 📄 sort.rs (1,693 tokens, 214 lines)
│ │ └── 📄 walk.rs (2,245 tokens, 260 lines)
│ │ └── 📄 walk.rs (2,426 tokens, 278 lines)
│ └── 📁 tests (1 folder, 2 files)
│ ├── 📄 golden_parity.rs (1,812 tokens, 244 lines)
│ └── 📄 robustness.rs (743 tokens, 86 lines)
│ ├── 📄 golden_parity.rs (1,826 tokens, 247 lines)
│ └── 📄 robustness.rs (1,213 tokens, 143 lines)
├── 📁 docs (1 folder, 4 files)
│ ├── 📄 architecture.md (1,392 tokens, 113 lines)
│ ├── 📄 language-roadmap.md (1,262 tokens, 64 lines)
│ ├── 📄 performance.md (690 tokens, 64 lines)
│ ├── 📄 language-roadmap.md (1,238 tokens, 64 lines)
│ ├── 📄 performance.md (1,121 tokens, 98 lines)
│ └── 📄 rust-port-differences.md (1,022 tokens, 78 lines)
├── 📄 Justfile (141 tokens, 23 lines)
├── 📄 LICENSE (2,744 tokens, 81 lines)
Expand All @@ -78,7 +77,7 @@
│ │ └── 📄 logging.py (11 tokens, 0 lines)
│ ├── 📁 golden (6 folders, 247 files)
│ │ ├── 📄 diff_components.py (417 tokens, 59 lines)
│ │ ├── 📄 generate_legacy_goldens.py (1,496 tokens, 156 lines)
│ │ ├── 📄 generate_legacy_goldens.py (1,485 tokens, 156 lines)
│ │ └── 📁 legacy (5 folders, 245 files)
│ │ ├── 📁 components (1 folder, 109 files)
│ │ │ ├── 📄 tests__dot_dot__my_test_file.py.json (21 tokens, 5 lines)
Expand Down Expand Up @@ -530,15 +529,15 @@
│ │ │ ├── 📄 more_languages_group_todo.txt (1,491 tokens, 111 lines)
│ │ │ ├── 📄 multi_seed.txt (3,789 tokens, 427 lines)
│ │ │ ├── 📄 path_to_test.txt (445 tokens, 51 lines)
│ │ │ └── 📄 repo_concise.txt (9,776 tokens, 699 lines)
│ │ │ └── 📄 repo_concise.txt (9,829 tokens, 703 lines)
│ │ └── 📁 trees_v1 (1 folder, 13 files)
│ │ ├── 📄 dot_dot.txt (99 tokens, 10 lines)
│ │ ├── 📄 more_languages.txt (10,912 tokens, 1,135 lines)
│ │ ├── 📄 more_languages.txt (11,402 tokens, 1,189 lines)
│ │ ├── 📄 more_languages_group1.txt (1,130 tokens, 146 lines)
│ │ ├── 📄 more_languages_group2.txt (531 tokens, 67 lines)
│ │ ├── 📄 more_languages_group3.txt (1,240 tokens, 149 lines)
│ │ ├── 📄 more_languages_group3.txt (1,488 tokens, 181 lines)
│ │ ├── 📄 more_languages_group4.txt (1,020 tokens, 123 lines)
│ │ ├── 📄 more_languages_group5.txt (2,099 tokens, 235 lines)
│ │ ├── 📄 more_languages_group5.txt (2,288 tokens, 257 lines)
│ │ ├── 📄 more_languages_group6.txt (2,821 tokens, 300 lines)
│ │ ├── 📄 more_languages_group7.txt (705 tokens, 83 lines)
│ │ ├── 📄 more_languages_group_lisp.txt (52 tokens, 5 lines)
Expand Down
54 changes: 54 additions & 0 deletions tests/golden/legacy/trees_v1/more_languages.txt
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,39 @@
│ ├── 📄 test.lean (289 tokens, 42 lines)
│ ├── 📄 test.capnp (117 tokens, 30 lines)
│ ├── 📄 test.graphql (66 tokens, 21 lines)
│ │ ├── type Query
│ │ ├── getBooks: [Book]
│ │ ├── getAuthors: [Author]
│ │ ├── type Mutation
│ │ ├── addBook(title: String, author: String): Book
│ │ ├── removeBook(id: ID): Book
│ │ ├── type Book
│ │ ├── id: ID
│ │ ├── title: String
│ │ ├── author: Author
│ │ ├── type Author
│ │ ├── id: ID
│ │ ├── name: String
│ │ └── books: [Book]
│ ├── 📄 test.proto (142 tokens, 34 lines)
│ │ ├── syntax = "proto3"
│ │ ├── service EmployeeService
│ │ ├── rpc GetEmployee(EmployeeId) returns (EmployeeInfo)
│ │ ├── rpc AddEmployee(EmployeeData) returns (EmployeeInfo)
│ │ ├── rpc UpdateEmployee(EmployeeUpdate) returns (EmployeeInfo)
│ │ ├── message EmployeeId
│ │ ├── int32 id = 1
│ │ ├── message EmployeeInfo
│ │ ├── int32 id = 1
│ │ ├── string name = 2
│ │ ├── string role = 3
│ │ ├── message EmployeeData
│ │ ├── string name = 1
│ │ ├── string role = 2
│ │ ├── message EmployeeUpdate
│ │ ├── int32 id = 1
│ │ ├── string name = 2
│ │ └── string role = 3
│ ├── 📄 test.sqlite (0 tokens, 0 lines)
│ │ ├── students table:
│ │ ├── id integer primary key
Expand Down Expand Up @@ -613,6 +645,28 @@
│ │ ├── fn draw(&self)
│ │ └── fn main()
│ ├── 📄 sql_test.sql (270 tokens, 51 lines)
│ │ ├── CREATE TABLE promoters
│ │ ├── user_id serial PRIMARY KEY,
│ │ ├── type varchar(20) NOT NULL,
│ │ ├── username varchar(20) NOT NULL,
│ │ ├── password varchar(20) NOT NULL,
│ │ ├── email varchar(30) NOT NULL,
│ │ ├── phone varchar(20) NOT NULL,
│ │ ├── promocode varchar(20),
│ │ ├── info json,
│ │ ├── going text[],
│ │ ├── invites text[],
│ │ ├── balance integer NOT NULL,
│ │ ├── rewards text[],
│ │ ├── created timestamp
│ │ ├── CREATE TABLE events
│ │ ├── event_id serial PRIMARY KEY,
│ │ ├── name varchar(64) NOT NULL,
│ │ ├── date varchar(64) NOT NULL,
│ │ ├── location varchar(64) NOT NULL,
│ │ ├── performer varchar(64) NOT NULL,
│ │ ├── rewards json,
│ │ └── created timestamp
│ ├── 📄 standard-app-routing.module.ts (100 tokens, 16 lines)
│ │ └── const routes: Routes = [
│ │ { path: '', component: HomeComponent },
Expand Down
32 changes: 32 additions & 0 deletions tests/golden/legacy/trees_v1/more_languages_group3.txt
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,39 @@
├── 📄 test.lean (289 tokens, 42 lines)
├── 📄 test.capnp (117 tokens, 30 lines)
├── 📄 test.graphql (66 tokens, 21 lines)
│ ├── type Query
│ ├── getBooks: [Book]
│ ├── getAuthors: [Author]
│ ├── type Mutation
│ ├── addBook(title: String, author: String): Book
│ ├── removeBook(id: ID): Book
│ ├── type Book
│ ├── id: ID
│ ├── title: String
│ ├── author: Author
│ ├── type Author
│ ├── id: ID
│ ├── name: String
│ └── books: [Book]
├── 📄 test.proto (142 tokens, 34 lines)
│ ├── syntax = "proto3"
│ ├── service EmployeeService
│ ├── rpc GetEmployee(EmployeeId) returns (EmployeeInfo)
│ ├── rpc AddEmployee(EmployeeData) returns (EmployeeInfo)
│ ├── rpc UpdateEmployee(EmployeeUpdate) returns (EmployeeInfo)
│ ├── message EmployeeId
│ ├── int32 id = 1
│ ├── message EmployeeInfo
│ ├── int32 id = 1
│ ├── string name = 2
│ ├── string role = 3
│ ├── message EmployeeData
│ ├── string name = 1
│ ├── string role = 2
│ ├── message EmployeeUpdate
│ ├── int32 id = 1
│ ├── string name = 2
│ └── string role = 3
├── 📄 test.sqlite (0 tokens, 0 lines)
│ ├── students table:
│ ├── id integer primary key
Expand Down
22 changes: 22 additions & 0 deletions tests/golden/legacy/trees_v1/more_languages_group5.txt
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,28 @@
│ ├── fn draw(&self)
│ └── fn main()
├── 📄 sql_test.sql (270 tokens, 51 lines)
│ ├── CREATE TABLE promoters
│ ├── user_id serial PRIMARY KEY,
│ ├── type varchar(20) NOT NULL,
│ ├── username varchar(20) NOT NULL,
│ ├── password varchar(20) NOT NULL,
│ ├── email varchar(30) NOT NULL,
│ ├── phone varchar(20) NOT NULL,
│ ├── promocode varchar(20),
│ ├── info json,
│ ├── going text[],
│ ├── invites text[],
│ ├── balance integer NOT NULL,
│ ├── rewards text[],
│ ├── created timestamp
│ ├── CREATE TABLE events
│ ├── event_id serial PRIMARY KEY,
│ ├── name varchar(64) NOT NULL,
│ ├── date varchar(64) NOT NULL,
│ ├── location varchar(64) NOT NULL,
│ ├── performer varchar(64) NOT NULL,
│ ├── rewards json,
│ └── created timestamp
├── 📄 standard-app-routing.module.ts (100 tokens, 16 lines)
│ └── const routes: Routes = [
│ { path: '', component: HomeComponent },
Expand Down
Loading