diff --git a/crates/graphify-extract/src/extractors/dm/dmf.rs b/crates/graphify-extract/src/extractors/dm/dmf.rs
new file mode 100644
index 0000000..e1e40b2
--- /dev/null
+++ b/crates/graphify-extract/src/extractors/dm/dmf.rs
@@ -0,0 +1,137 @@
+//! `.dmf` interface-form extractor (windows + controls).
+
+use crate::ids::{file_stem, make_id, make_id1};
+use crate::types::{Edge, FileResult, Node as GNode};
+use regex::Regex;
+use std::collections::HashSet;
+use std::path::Path;
+use std::sync::LazyLock;
+
+#[allow(clippy::expect_used)] // literal pattern; compiles on first use
+static DMF_WINDOW_RE: LazyLock<Regex> =
+    LazyLock::new(|| Regex::new(r#"^\s*window\s+"([^"]+)"\s*$"#).expect("static dmf_window regex"));
+
+#[allow(clippy::expect_used)] // literal pattern; compiles on first use
+static DMF_ELEM_RE: LazyLock<Regex> =
+    LazyLock::new(|| Regex::new(r#"^\s*elem\s+"([^"]+)"\s*$"#).expect("static dmf_elem regex"));
+
+#[allow(clippy::expect_used)] // literal pattern; compiles on first use
+static DMF_TYPE_RE: LazyLock<Regex> =
+    LazyLock::new(|| Regex::new(r"^\s*type\s*=\s*(\S+)\s*$").expect("static dmf_type regex"));
+
+/// Extract windows and controls from a `.dmf` interface file.
+#[must_use]
+#[allow(clippy::too_many_lines)] // linear line scanner; verbose node/edge literals, not real complexity
+pub fn extract_dmf(path: &Path) -> FileResult {
+    let data = match std::fs::read(path) {
+        Ok(b) => b,
+        Err(e) => return FileResult::error(e.to_string()),
+    };
+    let text = String::from_utf8_lossy(&data).into_owned();
+
+    let str_path = path.to_string_lossy().into_owned();
+    let stem = file_stem(path);
+    let file_nid = make_id1(&str_path);
+    let file_label = path
+        .file_name()
+        .map_or(String::new(), |f| f.to_string_lossy().into_owned());
+    let mut nodes = vec![GNode {
+        id: file_nid.clone(),
+        label: file_label,
+        file_type: "code".to_string(),
+        source_file: str_path.clone(),
+        source_location: Some("L1".to_string()),
+        metadata: None,
+    }];
+    let mut edges: Vec<Edge> = Vec::new();
+    let mut seen: HashSet<String> = HashSet::from([file_nid.clone()]);
+
+    let mut current_window_nid: Option<String> = None;
+    let mut current_elem_nid: Option<String> = None;
+    let mut current_elem_name: Option<String> = None;
+
+    let mut line_idx: u32 = 0;
+    for line in text.lines() {
+        line_idx += 1;
+        if let Some(cap) = DMF_WINDOW_RE.captures(line) {
+            let name = &cap[1];
+            let nid = make_id(&[&stem, "window", name]);
+            if seen.insert(nid.clone()) {
+                nodes.push(GNode {
+                    id: nid.clone(),
+                    label: format!("window \"{name}\""),
+                    file_type: "code".to_string(),
+                    source_file: str_path.clone(),
+                    source_location: Some(format!("L{line_idx}")),
+                    metadata: None,
+                });
+                edges.push(Edge {
+                    source: file_nid.clone(),
+                    target: nid.clone(),
+                    relation: "contains".to_string(),
+                    confidence: "EXTRACTED".to_string(),
+                    source_file: str_path.clone(),
+                    source_location: Some(format!("L{line_idx}")),
+                    weight: 1.0,
+                    context: None,
+                    confidence_score: None,
+                    external: false,
+                });
+            }
+            current_window_nid = Some(nid);
+            current_elem_nid = None;
+            current_elem_name = None;
+            continue;
+        }
+        if let Some(cap) = DMF_ELEM_RE.captures(line)
+            && let Some(win) = current_window_nid.clone()
+        {
+            let name = cap[1].to_string();
+            let nid = make_id(&[&stem, "elem", &win, &name]);
+            if seen.insert(nid.clone()) {
+                nodes.push(GNode {
+                    id: nid.clone(),
+                    label: format!("elem \"{name}\""),
+                    file_type: "code".to_string(),
+                    source_file: str_path.clone(),
+                    source_location: Some(format!("L{line_idx}")),
+                    metadata: None,
+                });
+                edges.push(Edge {
+                    source: win,
+                    target: nid.clone(),
+                    relation: "contains".to_string(),
+                    confidence: "EXTRACTED".to_string(),
+                    source_file: str_path.clone(),
+                    source_location: Some(format!("L{line_idx}")),
+                    weight: 1.0,
+                    context: None,
+                    confidence_score: None,
+                    external: false,
+                });
+            }
+            current_elem_nid = Some(nid);
+            current_elem_name = Some(name);
+            continue;
+        }
+        if let Some(cap) = DMF_TYPE_RE.captures(line)
+            && let (Some(elem_nid), Some(elem_name)) =
+                (current_elem_nid.as_deref(), current_elem_name.as_deref())
+        {
+            let ctype = &cap[1];
+            for n in &mut nodes {
+                if n.id == elem_nid && !n.label.contains(" [") {
+                    n.label = format!("elem \"{elem_name}\" [{ctype}]");
+                    break;
+                }
+            }
+        }
+    }
+
+    FileResult {
+        nodes,
+        edges,
+        raw_calls: Vec::new(),
+        error: None,
+    }
+}
diff --git a/crates/graphify-extract/src/extractors/dm/dmi.rs b/crates/graphify-extract/src/extractors/dm/dmi.rs
new file mode 100644
index 0000000..d4817ff
--- /dev/null
+++ b/crates/graphify-extract/src/extractors/dm/dmi.rs
@@ -0,0 +1,146 @@
+//! `.dmi` icon-sheet extractor (PNG with BYOND metadata).
+
+use crate::ids::{file_stem, make_id, make_id1};
+use crate::types::{Edge, FileResult, Node as GNode};
+use std::collections::HashSet;
+use std::io::Read;
+use std::path::Path;
+
+/// Decompress up to 1 MiB of a zTXt zlib stream (best effort).
+///
+/// graphify-py lets a corrupt zlib stream raise; we degrade gracefully and keep
+/// whatever decompressed cleanly. The 1 MiB cap mirrors graphify-py's
+/// `max_length` guard against decompression bombs.
+fn decompress_capped(compressed: &[u8]) -> String {
+    let mut out = Vec::new();
+    let mut decoder = flate2::read::ZlibDecoder::new(compressed).take(1024 * 1024);
+    let _ = decoder.read_to_end(&mut out);
+    String::from_utf8_lossy(&out).into_owned()
+}
+
+/// Pull the BYOND metadata text out of a `.dmi` PNG, or `""` on failure.
+///
+/// Scans PNG chunks for a `tEXt`/`zTXt` chunk keyed `Description`; zTXt payloads
+/// are zlib-decompressed (capped). Mirrors graphify-py `_read_dmi_description`.
+fn read_dmi_description(data: &[u8]) -> String {
+    const PNG_SIG: &[u8] = b"\x89PNG\r\n\x1a\n";
+    if !data.starts_with(PNG_SIG) {
+        return String::new();
+    }
+    let mut i = 8usize;
+    while i.saturating_add(8) <= data.len() { // `i` may have saturated on the advance below
+        let length = usize::try_from(u32::from_be_bytes([
+            data[i],
+            data[i + 1],
+            data[i + 2],
+            data[i + 3],
+        ]))
+        .unwrap_or(0);
+        let chunk_type = &data[i + 4..i + 8];
+        let payload_start = i + 8;
+        let payload_end = payload_start.saturating_add(length).min(data.len());
+        let payload = &data[payload_start..payload_end];
+        if chunk_type == b"tEXt" || chunk_type == b"zTXt" {
+            let Some(nul) = payload.iter().position(|&b| b == 0) else {
+                return String::new();
+            };
+            if &payload[..nul] == b"Description" {
+                if chunk_type == b"zTXt" {
+                    // zTXt: keyword \0 compression_method(1 byte) compressed_data
+                    return decompress_capped(payload.get(nul + 2..).unwrap_or(&[]));
+                }
+                // tEXt: keyword \0 text
+                return String::from_utf8_lossy(payload.get(nul + 1..).unwrap_or(&[])).into_owned();
+            }
+        }
+        i = i.saturating_add(8).saturating_add(length).saturating_add(4);
+    }
+    String::new()
+}
+
+/// Extract icon state names from a `.dmi` (BYOND PNG icon sheet).
+#[must_use]
+pub fn extract_dmi(path: &Path) -> FileResult {
+    let data = match std::fs::read(path) {
+        Ok(b) => b,
+        Err(e) => return FileResult::error(e.to_string()),
+    };
+    let str_path = path.to_string_lossy().into_owned();
+    let stem = file_stem(path);
+    let file_nid = make_id1(&str_path);
+    let file_label = path
+        .file_name()
+        .map_or(String::new(), |f| f.to_string_lossy().into_owned());
+
+    let mut nodes = vec![GNode {
+        id: file_nid.clone(),
+        label: file_label,
+        file_type: "code".to_string(),
+        source_file: str_path.clone(),
+        source_location: Some("L1".to_string()),
+        metadata: None,
+    }];
+    let mut edges: Vec<Edge> = Vec::new();
+    let mut seen: HashSet<String> = HashSet::from([file_nid.clone()]);
+
+    let description = read_dmi_description(&data);
+    if description.is_empty() {
+        return FileResult {
+            nodes,
+            edges,
+            raw_calls: Vec::new(),
+            error: None,
+        };
+    }
+
+    let mut line_no: u32 = 0;
+    for raw_line in description.lines() {
+        line_no += 1;
+        let stripped = raw_line.trim();
+        if !stripped.starts_with("state =") {
+            continue;
+        }
+        let value = stripped.split_once('=').map_or("", |(_, v)| v).trim();
+        let state_name = if value.starts_with('"') && value.ends_with('"') && value.len() >= 2 {
+            &value[1..value.len() - 1]
+        } else {
+            value
+        };
+        if state_name.is_empty() {
+            continue;
+        }
+        let nid = make_id(&[&stem, "state", state_name]);
+        if !seen.insert(nid.clone()) {
+            continue;
+        }
+        nodes.push(GNode {
+            id: nid.clone(),
+            label: format!("\"{state_name}\""),
+            file_type: "code".to_string(),
+            source_file: str_path.clone(),
+            source_location: Some(format!("L{line_no}")),
+            metadata: None,
+        });
+        edges.push(Edge {
+            source: file_nid.clone(),
+            target: nid,
+            relation: "contains".to_string(),
+            confidence: "EXTRACTED".to_string(),
+            source_file: str_path.clone(),
+            source_location: Some(format!("L{line_no}")),
+            weight: 1.0,
+            context: None,
+            confidence_score: None,
+            external: false,
+        });
+    }
+
+    FileResult {
+        nodes,
+        edges,
+        raw_calls: Vec::new(),
+        error: None,
+    }
+}
+
+// ── .dmm (BYOND map files) ─────────────────────────────────────────────────────
diff --git a/crates/graphify-extract/src/extractors/dm/dmm.rs b/crates/graphify-extract/src/extractors/dm/dmm.rs
new file mode 100644
index 0000000..0b20c97
--- /dev/null
+++ b/crates/graphify-extract/src/extractors/dm/dmm.rs
@@ -0,0 +1,184 @@
+//! `.dmm` map-file extractor (tile dictionary type references).
+
+use crate::ids::make_id1;
+use crate::types::{Edge, FileResult, Node as GNode};
+use regex::Regex;
+use std::collections::HashSet;
+use std::path::Path;
+use std::sync::LazyLock;
+
+/// Matches the start of a `.dmm` grid section: `(x,y,z) = ...`.
+#[allow(clippy::expect_used)] // literal pattern; compiles on first use
+static DMM_GRID_RE: LazyLock<Regex> = LazyLock::new(|| {
+    Regex::new(r"(?m)^\(\s*\d+\s*,\s*\d+\s*,\s*\d+\s*\)\s*=").expect("static dmm_grid regex")
+});
+
+/// Split a tile-dictionary body on top-level commas, respecting `(){}[]`
+/// nesting and string literals. Mirrors graphify-py `_split_dmm_tile`.
+fn split_dmm_tile(body: &str) -> Vec<String> {
+    let mut out: Vec<String> = Vec::new();
+    let mut buf = String::new();
+    let mut depth: i32 = 0;
+    let mut in_string = false;
+    let mut escape = false;
+    for ch in body.chars() {
+        if escape {
+            buf.push(ch);
+            escape = false;
+            continue;
+        }
+        if in_string {
+            buf.push(ch);
+            if ch == '\\' {
+                escape = true;
+            } else if ch == '"' {
+                in_string = false;
+            }
+            continue;
+        }
+        match ch {
+            '"' => {
+                in_string = true;
+                buf.push(ch);
+            }
+            '(' | '{' | '[' => {
+                depth += 1;
+                buf.push(ch);
+            }
+            ')' | '}' | ']' => {
+                depth -= 1;
+                buf.push(ch);
+            }
+            ',' if depth == 0 => {
+                out.push(buf.trim().to_string());
+                buf.clear();
+            }
+            _ => buf.push(ch),
+        }
+    }
+    let tail = buf.trim();
+    if !tail.is_empty() {
+        out.push(tail.to_string());
+    }
+    out
+}
+
+/// Strip a `{var=val; ...}` override suffix off a tile entry, leaving the type path.
+fn dmm_type_path(entry: &str) -> String {
+    entry
+        .find('{')
+        .map_or(entry, |b| &entry[..b])
+        .trim()
+        .to_string()
+}
+
+/// Extract type-path references from a `.dmm` map file's tile dictionary.
+#[must_use]
+pub fn extract_dmm(path: &Path) -> FileResult {
+    match std::fs::metadata(path) {
+        Ok(m) if m.len() > 50 * 1024 * 1024 => {
+            return FileResult::error("file too large (>50 MB)");
+        }
+        Ok(_) => {}
+        Err(e) => return FileResult::error(e.to_string()),
+    }
+    let data = match std::fs::read(path) {
+        Ok(b) => b,
+        Err(e) => return FileResult::error(e.to_string()),
+    };
+    let text = String::from_utf8_lossy(&data).into_owned();
+
+    let str_path = path.to_string_lossy().into_owned();
+    let file_nid = make_id1(&str_path);
+    let file_label = path
+        .file_name()
+        .map_or(String::new(), |f| f.to_string_lossy().into_owned());
+    let nodes = vec![GNode {
+        id: file_nid.clone(),
+        label: file_label,
+        file_type: "code".to_string(),
+        source_file: str_path.clone(),
+        source_location: Some("L1".to_string()),
+        metadata: None,
+    }];
+    let mut edges: Vec<Edge> = Vec::new();
+
+    // Only the dictionary section (before the grid) names type paths.
+    let dict_text = match DMM_GRID_RE.find(&text) {
+        Some(m) => &text[..m.start()],
+        None => &text[..],
+    };
+
+    let mut seen_targets: HashSet<String> = HashSet::new();
+    let mut buf = String::new();
+    let mut open_line: u32 = 0;
+    let mut depth: i32 = 0;
+    let mut in_string = false;
+    let mut escape = false;
+    let mut line_idx: u32 = 0;
+    for line in dict_text.lines() {
+        line_idx += 1;
+        for ch in line.chars() {
+            if escape {
+                escape = false;
+            } else if in_string {
+                if ch == '\\' {
+                    escape = true;
+                } else if ch == '"' {
+                    in_string = false;
+                }
+            } else if ch == '"' {
+                in_string = true;
+            } else if ch == '(' {
+                if depth == 0 {
+                    open_line = line_idx;
+                }
+                depth += 1;
+            } else if ch == ')' {
+                depth -= 1;
+            }
+            buf.push(ch);
+        }
+        buf.push('\n');
+        if depth == 0 && !buf.is_empty() {
+            let chunk = std::mem::take(&mut buf);
+            let (Some(lp), Some(rp)) = (chunk.find('('), chunk.rfind(')')) else {
+                continue;
+            };
+            if rp <= lp {
+                continue;
+            }
+            for entry in split_dmm_tile(&chunk[lp + 1..rp]) {
+                let tpath = dmm_type_path(&entry);
+                if !tpath.starts_with('/') {
+                    continue;
+                }
+                let tgt = make_id1(&tpath);
+                if !seen_targets.insert(tgt.clone()) {
+                    continue;
+                }
+                edges.push(Edge {
+                    source: file_nid.clone(),
+                    target: tgt,
+                    relation: "uses".to_string(),
+                    confidence: "EXTRACTED".to_string(),
+                    source_file: str_path.clone(),
+                    source_location: Some(format!("L{open_line}")),
+                    weight: 1.0,
+                    context: Some("map".to_string()),
+                    confidence_score: None,
+                    external: false,
+                });
+            }
+        }
+    }
+
+    FileResult {
+        nodes,
+        edges,
+        raw_calls: Vec::new(),
+        error: None,
+    }
+}
+
+// ── .dmf (BYOND interface forms) ────────────────────────────────────────────────
diff --git a/crates/graphify-extract/src/extractors/dm/mod.rs b/crates/graphify-extract/src/extractors/dm/mod.rs
new file mode 100644
index 0000000..3a55788
--- /dev/null
+++ b/crates/graphify-extract/src/extractors/dm/mod.rs
@@ -0,0 +1,19 @@
+//! BYOND `DreamMaker` extractors.
+//!
+//! Ports the DM section of `graphify-py/graphify/extract.py`:
+//! - [`extract_dm`] — `.dm`/`.dme` source via tree-sitter (types, procs,
+//!   includes, calls). DM identity is path-based (`/datum/object/proc/New()`),
+//!   so this uses a bespoke walk rather than the generic class-body walker.
+//! - [`extract_dmi`] — `.dmi` icon sheets (PNG with a BYOND metadata text chunk).
+//! - [`extract_dmm`] — `.dmm` map files (tile dictionary type references).
+//! - [`extract_dmf`] — `.dmf` interface forms (windows + controls).
+
+mod dmf;
+mod dmi;
+mod dmm;
+mod source;
+
+pub use dmf::extract_dmf;
+pub use dmi::extract_dmi;
+pub use dmm::extract_dmm;
+pub use source::extract_dm;
diff --git a/crates/graphify-extract/src/extractors/dm.rs b/crates/graphify-extract/src/extractors/dm/source.rs
similarity index 52%
rename from crates/graphify-extract/src/extractors/dm.rs
rename to crates/graphify-extract/src/extractors/dm/source.rs
index 6f18905..8b788a6 100644
--- a/crates/graphify-extract/src/extractors/dm.rs
+++ b/crates/graphify-extract/src/extractors/dm/source.rs
@@ -1,24 +1,11 @@
-//! BYOND `DreamMaker` extractors.
-//!
-//! Ports the DM section of `graphify-py/graphify/extract.py`:
-//! - [`extract_dm`] — `.dm`/`.dme` source via tree-sitter (types, procs,
-//!   includes, calls). DM identity is path-based (`/datum/object/proc/New()`),
-//!   so this uses a bespoke walk rather than the generic class-body walker.
-//! - [`extract_dmi`] — `.dmi` icon sheets (PNG with a BYOND metadata text chunk).
-//! - [`extract_dmm`] — `.dmm` map files (tile dictionary type references).
-//! - [`extract_dmf`] — `.dmf` interface forms (windows + controls).
+//! `.dm` / `.dme` `DreamMaker` source extractor (tree-sitter).
+use crate::ids::{file_stem, make_id, make_id1}; +use crate::types::{Edge, FileResult, Node as GNode, RawCall}; use std::collections::{HashMap, HashSet}; -use std::io::Read; use std::path::Path; -use std::sync::LazyLock; - -use regex::Regex; use tree_sitter::Node; -use crate::ids::{file_stem, make_id, make_id1}; -use crate::types::{Edge, FileResult, Node as GNode, RawCall}; - /// Byte range covered by `node`, decoded as UTF-8 (`""` on bad UTF-8). fn read_text<'b>(node: Node<'_>, source: &'b [u8]) -> &'b str { std::str::from_utf8(&source[node.start_byte()..node.end_byte()]).unwrap_or("") @@ -458,445 +445,3 @@ pub fn extract_dm(path: &Path) -> FileResult { } // ── .dmi (BYOND icon sheets) ─────────────────────────────────────────────────── - -/// Decompress up to 1 MiB of a zTXt zlib stream (best effort). -/// -/// graphify-py lets a corrupt zlib stream raise; we degrade gracefully and keep -/// whatever decompressed cleanly. The 1 MiB cap mirrors graphify-py's -/// `max_length` guard against decompression bombs. -fn decompress_capped(compressed: &[u8]) -> String { - let mut out = Vec::new(); - let mut decoder = flate2::read::ZlibDecoder::new(compressed).take(1024 * 1024); - let _ = decoder.read_to_end(&mut out); - String::from_utf8_lossy(&out).into_owned() -} - -/// Pull the BYOND metadata text out of a `.dmi` PNG, or `""` on failure. -/// -/// Scans PNG chunks for a `tEXt`/`zTXt` chunk keyed `Description`; zTXt payloads -/// are zlib-decompressed (capped). Mirrors graphify-py `_read_dmi_description`. 
-fn read_dmi_description(data: &[u8]) -> String { - const PNG_SIG: &[u8] = b"\x89PNG\r\n\x1a\n"; - if !data.starts_with(PNG_SIG) { - return String::new(); - } - let mut i = 8usize; - while i + 8 <= data.len() { - let length = usize::try_from(u32::from_be_bytes([ - data[i], - data[i + 1], - data[i + 2], - data[i + 3], - ])) - .unwrap_or(0); - let chunk_type = &data[i + 4..i + 8]; - let payload_start = i + 8; - let payload_end = payload_start.saturating_add(length).min(data.len()); - let payload = &data[payload_start..payload_end]; - if chunk_type == b"tEXt" || chunk_type == b"zTXt" { - let Some(nul) = payload.iter().position(|&b| b == 0) else { - return String::new(); - }; - if &payload[..nul] == b"Description" { - if chunk_type == b"zTXt" { - // zTXt: keyword \0 compression_method(1 byte) compressed_data - return decompress_capped(payload.get(nul + 2..).unwrap_or(&[])); - } - // tEXt: keyword \0 text - return String::from_utf8_lossy(payload.get(nul + 1..).unwrap_or(&[])).into_owned(); - } - } - i = i.saturating_add(8).saturating_add(length).saturating_add(4); - } - String::new() -} - -/// Extract icon state names from a `.dmi` (BYOND PNG icon sheet). 
-#[must_use] -pub fn extract_dmi(path: &Path) -> FileResult { - let data = match std::fs::read(path) { - Ok(b) => b, - Err(e) => return FileResult::error(e.to_string()), - }; - let str_path = path.to_string_lossy().into_owned(); - let stem = file_stem(path); - let file_nid = make_id1(&str_path); - let file_label = path - .file_name() - .map_or(String::new(), |f| f.to_string_lossy().into_owned()); - - let mut nodes = vec![GNode { - id: file_nid.clone(), - label: file_label, - file_type: "code".to_string(), - source_file: str_path.clone(), - source_location: Some("L1".to_string()), - metadata: None, - }]; - let mut edges: Vec = Vec::new(); - let mut seen: HashSet = HashSet::from([file_nid.clone()]); - - let description = read_dmi_description(&data); - if description.is_empty() { - return FileResult { - nodes, - edges, - raw_calls: Vec::new(), - error: None, - }; - } - - let mut line_no: u32 = 0; - for raw_line in description.lines() { - line_no += 1; - let stripped = raw_line.trim(); - if !stripped.starts_with("state =") { - continue; - } - let value = stripped.split_once('=').map_or("", |(_, v)| v).trim(); - let state_name = if value.starts_with('"') && value.ends_with('"') && value.len() >= 2 { - &value[1..value.len() - 1] - } else { - value - }; - if state_name.is_empty() { - continue; - } - let nid = make_id(&[&stem, "state", state_name]); - if !seen.insert(nid.clone()) { - continue; - } - nodes.push(GNode { - id: nid.clone(), - label: format!("\"{state_name}\""), - file_type: "code".to_string(), - source_file: str_path.clone(), - source_location: Some(format!("L{line_no}")), - metadata: None, - }); - edges.push(Edge { - source: file_nid.clone(), - target: nid, - relation: "contains".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: str_path.clone(), - source_location: Some(format!("L{line_no}")), - weight: 1.0, - context: None, - confidence_score: None, - external: false, - }); - } - - FileResult { - nodes, - edges, - raw_calls: Vec::new(), - 
error: None, - } -} - -// ── .dmm (BYOND map files) ───────────────────────────────────────────────────── - -/// Matches the start of a `.dmm` grid section: `(x,y,z) = ...`. -#[allow(clippy::expect_used)] // literal pattern; compiles on first use -static DMM_GRID_RE: LazyLock = LazyLock::new(|| { - Regex::new(r"(?m)^\(\s*\d+\s*,\s*\d+\s*,\s*\d+\s*\)\s*=").expect("static dmm_grid regex") -}); - -/// Split a tile-dictionary body on top-level commas, respecting `(){}[]` -/// nesting and string literals. Mirrors graphify-py `_split_dmm_tile`. -fn split_dmm_tile(body: &str) -> Vec { - let mut out: Vec = Vec::new(); - let mut buf = String::new(); - let mut depth: i32 = 0; - let mut in_string = false; - let mut escape = false; - for ch in body.chars() { - if escape { - buf.push(ch); - escape = false; - continue; - } - if in_string { - buf.push(ch); - if ch == '\\' { - escape = true; - } else if ch == '"' { - in_string = false; - } - continue; - } - match ch { - '"' => { - in_string = true; - buf.push(ch); - } - '(' | '{' | '[' => { - depth += 1; - buf.push(ch); - } - ')' | '}' | ']' => { - depth -= 1; - buf.push(ch); - } - ',' if depth == 0 => { - out.push(buf.trim().to_string()); - buf.clear(); - } - _ => buf.push(ch), - } - } - let tail = buf.trim(); - if !tail.is_empty() { - out.push(tail.to_string()); - } - out -} - -/// Strip a `{var=val; ...}` override suffix off a tile entry, leaving the type path. -fn dmm_type_path(entry: &str) -> String { - entry - .find('{') - .map_or(entry, |b| &entry[..b]) - .trim() - .to_string() -} - -/// Extract type-path references from a `.dmm` map file's tile dictionary. 
-#[must_use] -pub fn extract_dmm(path: &Path) -> FileResult { - match std::fs::metadata(path) { - Ok(m) if m.len() > 50 * 1024 * 1024 => { - return FileResult::error("file too large (>50 MB)"); - } - Ok(_) => {} - Err(e) => return FileResult::error(e.to_string()), - } - let data = match std::fs::read(path) { - Ok(b) => b, - Err(e) => return FileResult::error(e.to_string()), - }; - let text = String::from_utf8_lossy(&data).into_owned(); - - let str_path = path.to_string_lossy().into_owned(); - let file_nid = make_id1(&str_path); - let file_label = path - .file_name() - .map_or(String::new(), |f| f.to_string_lossy().into_owned()); - let nodes = vec![GNode { - id: file_nid.clone(), - label: file_label, - file_type: "code".to_string(), - source_file: str_path.clone(), - source_location: Some("L1".to_string()), - metadata: None, - }]; - let mut edges: Vec = Vec::new(); - - // Only the dictionary section (before the grid) names type paths. - let dict_text = match DMM_GRID_RE.find(&text) { - Some(m) => &text[..m.start()], - None => &text[..], - }; - - let mut seen_targets: HashSet = HashSet::new(); - let mut buf = String::new(); - let mut open_line: u32 = 0; - let mut depth: i32 = 0; - let mut in_string = false; - let mut escape = false; - let mut line_idx: u32 = 0; - for line in dict_text.lines() { - line_idx += 1; - for ch in line.chars() { - if escape { - escape = false; - } else if in_string { - if ch == '\\' { - escape = true; - } else if ch == '"' { - in_string = false; - } - } else if ch == '"' { - in_string = true; - } else if ch == '(' { - if depth == 0 { - open_line = line_idx; - } - depth += 1; - } else if ch == ')' { - depth -= 1; - } - buf.push(ch); - } - buf.push('\n'); - if depth == 0 && !buf.is_empty() { - let chunk = std::mem::take(&mut buf); - let (Some(lp), Some(rp)) = (chunk.find('('), chunk.rfind(')')) else { - continue; - }; - if rp <= lp { - continue; - } - for entry in split_dmm_tile(&chunk[lp + 1..rp]) { - let tpath = dmm_type_path(&entry); - if 
!tpath.starts_with('/') { - continue; - } - let tgt = make_id1(&tpath); - if !seen_targets.insert(tgt.clone()) { - continue; - } - edges.push(Edge { - source: file_nid.clone(), - target: tgt, - relation: "uses".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: str_path.clone(), - source_location: Some(format!("L{open_line}")), - weight: 1.0, - context: Some("map".to_string()), - confidence_score: None, - external: false, - }); - } - } - } - - FileResult { - nodes, - edges, - raw_calls: Vec::new(), - error: None, - } -} - -// ── .dmf (BYOND interface forms) ──────────────────────────────────────────────── - -#[allow(clippy::expect_used)] // literal pattern; compiles on first use -static DMF_WINDOW_RE: LazyLock = - LazyLock::new(|| Regex::new(r#"^\s*window\s+"([^"]+)"\s*$"#).expect("static dmf_window regex")); -#[allow(clippy::expect_used)] // literal pattern; compiles on first use -static DMF_ELEM_RE: LazyLock = - LazyLock::new(|| Regex::new(r#"^\s*elem\s+"([^"]+)"\s*$"#).expect("static dmf_elem regex")); -#[allow(clippy::expect_used)] // literal pattern; compiles on first use -static DMF_TYPE_RE: LazyLock = - LazyLock::new(|| Regex::new(r"^\s*type\s*=\s*(\S+)\s*$").expect("static dmf_type regex")); - -/// Extract windows and controls from a `.dmf` interface file. 
-#[must_use] -#[allow(clippy::too_many_lines)] // linear line scanner; verbose node/edge literals, not real complexity -pub fn extract_dmf(path: &Path) -> FileResult { - let data = match std::fs::read(path) { - Ok(b) => b, - Err(e) => return FileResult::error(e.to_string()), - }; - let text = String::from_utf8_lossy(&data).into_owned(); - - let str_path = path.to_string_lossy().into_owned(); - let stem = file_stem(path); - let file_nid = make_id1(&str_path); - let file_label = path - .file_name() - .map_or(String::new(), |f| f.to_string_lossy().into_owned()); - let mut nodes = vec![GNode { - id: file_nid.clone(), - label: file_label, - file_type: "code".to_string(), - source_file: str_path.clone(), - source_location: Some("L1".to_string()), - metadata: None, - }]; - let mut edges: Vec = Vec::new(); - let mut seen: HashSet = HashSet::from([file_nid.clone()]); - - let mut current_window_nid: Option = None; - let mut current_elem_nid: Option = None; - let mut current_elem_name: Option = None; - - let mut line_idx: u32 = 0; - for line in text.lines() { - line_idx += 1; - if let Some(cap) = DMF_WINDOW_RE.captures(line) { - let name = &cap[1]; - let nid = make_id(&[&stem, "window", name]); - if seen.insert(nid.clone()) { - nodes.push(GNode { - id: nid.clone(), - label: format!("window \"{name}\""), - file_type: "code".to_string(), - source_file: str_path.clone(), - source_location: Some(format!("L{line_idx}")), - metadata: None, - }); - edges.push(Edge { - source: file_nid.clone(), - target: nid.clone(), - relation: "contains".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: str_path.clone(), - source_location: Some(format!("L{line_idx}")), - weight: 1.0, - context: None, - confidence_score: None, - external: false, - }); - } - current_window_nid = Some(nid); - current_elem_nid = None; - current_elem_name = None; - continue; - } - if let Some(cap) = DMF_ELEM_RE.captures(line) - && let Some(win) = current_window_nid.clone() - { - let name = 
cap[1].to_string(); - let nid = make_id(&[&stem, "elem", &win, &name]); - if seen.insert(nid.clone()) { - nodes.push(GNode { - id: nid.clone(), - label: format!("elem \"{name}\""), - file_type: "code".to_string(), - source_file: str_path.clone(), - source_location: Some(format!("L{line_idx}")), - metadata: None, - }); - edges.push(Edge { - source: win, - target: nid.clone(), - relation: "contains".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: str_path.clone(), - source_location: Some(format!("L{line_idx}")), - weight: 1.0, - context: None, - confidence_score: None, - external: false, - }); - } - current_elem_nid = Some(nid); - current_elem_name = Some(name); - continue; - } - if let Some(cap) = DMF_TYPE_RE.captures(line) - && let (Some(elem_nid), Some(elem_name)) = - (current_elem_nid.as_deref(), current_elem_name.as_deref()) - { - let ctype = &cap[1]; - for n in &mut nodes { - if n.id == elem_nid && !n.label.contains(" [") { - n.label = format!("elem \"{elem_name}\" [{ctype}]"); - break; - } - } - } - } - - FileResult { - nodes, - edges, - raw_calls: Vec::new(), - error: None, - } -} diff --git a/crates/graphify-extract/src/extractors/dotnet.rs b/crates/graphify-extract/src/extractors/dotnet.rs deleted file mode 100644 index 53b08b9..0000000 --- a/crates/graphify-extract/src/extractors/dotnet.rs +++ /dev/null @@ -1,1082 +0,0 @@ -//! .NET project-file extractors: `.sln`, `.csproj` / `.fsproj` / `.vbproj`, `.razor` / `.cshtml`. -//! -//! Ports `graphify-py/graphify/extract.py::extract_sln`, -//! `extract_csproj`, and `extract_razor`. The Python originals are three -//! discrete top-level helpers; in Rust they're co-located here because they -//! share the same target ecosystem and small helpers. 
- -use std::collections::HashSet; -use std::path::Path; - -use quick_xml::events::{BytesStart, Event}; -use quick_xml::reader::Reader; -use regex::Regex; -use std::sync::LazyLock; - -use crate::ids::{file_stem, make_id, make_id1}; -use crate::types::{Edge, FileResult, Node}; - -/// `MSBuild` project files (`.csproj` / `.fsproj` / `.vbproj`) larger than this -/// are skipped with an error. Real-world projects are well under 2 MiB; the -/// cap protects the extractor against accidentally being pointed at a -/// committed binary or a multi-megabyte generated artefact. Matches the -/// literal 2 MiB constant in `graphify-py` `extract.py::extract_csproj`, -/// so the cap is intentionally not configurable — raising or lowering it -/// across the Python/Rust pair belongs in a separate parity-bumping change. -const CSPROJ_MAX_BYTES: u64 = 2_097_152; - -/// Text events between an opening element tag and its matching close get -/// routed through this enum when the start tag was a `` or -/// `` element. 
-enum TextCapture { - None, - TargetFramework, - TargetFrameworks, -} - -#[allow(clippy::expect_used)] // literal pattern; build cannot fail -static SLN_PROJECT_RE: LazyLock = LazyLock::new(|| { - Regex::new(r#"Project\("[^"]*"\)\s*=\s*"([^"]+)"\s*,\s*"([^"]+)"\s*,\s*"([^"]*)""#) - .expect("static sln project regex") -}); -#[allow(clippy::expect_used)] -static SLN_DEP_RE: LazyLock = LazyLock::new(|| { - Regex::new(r"\{([0-9a-fA-F-]+)\}\s*=\s*\{([0-9a-fA-F-]+)\}") - .expect("static sln dependency regex") -}); -#[allow(clippy::expect_used)] -static SLN_PROJECT_LINE_RE: LazyLock = LazyLock::new(|| { - Regex::new(r#"Project\("[^"]*"\)\s*=\s*"[^"]+"\s*,\s*"[^"]+"\s*,\s*"\{([^}]+)\}""#) - .expect("static sln project-line regex") -}); - -#[allow(clippy::expect_used)] -static RAZOR_USING_RE: LazyLock = - LazyLock::new(|| Regex::new(r"^@using\s+([\w.]+)").expect("static razor @using regex")); -#[allow(clippy::expect_used)] -static RAZOR_INJECT_RE: LazyLock = LazyLock::new(|| { - Regex::new(r"^@inject\s+([\w.<>\[\]]+)\s+(\w+)").expect("static razor @inject regex") -}); -#[allow(clippy::expect_used)] -static RAZOR_INHERITS_RE: LazyLock = LazyLock::new(|| { - Regex::new(r"^@inherits\s+([\w.<>\[\]]+)").expect("static razor @inherits regex") -}); -#[allow(clippy::expect_used)] -static RAZOR_MODEL_RE: LazyLock = - LazyLock::new(|| Regex::new(r"^@model\s+([\w.<>\[\]]+)").expect("static razor @model regex")); -#[allow(clippy::expect_used)] -static RAZOR_PAGE_RE: LazyLock = - LazyLock::new(|| Regex::new(r#"^@page\s+"([^"]+)""#).expect("static razor @page regex")); -#[allow(clippy::expect_used)] -static RAZOR_COMPONENT_RE: LazyLock = LazyLock::new(|| { - Regex::new(r"<([A-Z][A-Za-z0-9]+)[\s/>]").expect("static razor component regex") -}); -#[allow(clippy::expect_used)] -static RAZOR_CODE_BLOCK_RE: LazyLock = - LazyLock::new(|| Regex::new(r"(?m)@code\s*\{").expect("static razor @code regex")); -#[allow(clippy::expect_used)] -static RAZOR_METHOD_RE: LazyLock = LazyLock::new(|| { - 
Regex::new( - r"(?:public|private|protected|internal|static|async|override|virtual|abstract)\s+[\w<>\[\],\s]+\s+(\w+)\s*\(", - ) - .expect("static razor method regex") -}); - -const RAZOR_HTML_TAGS: &[&str] = &[ - "DOCTYPE", "Html", "Head", "Body", "Div", "Span", "Table", "Form", "Input", "Button", "Select", - "Option", "Label", "Textarea", "Script", "Style", "Link", "Meta", "Title", "Header", "Footer", - "Nav", "Main", "Section", "Article", "Aside", -]; - -// ── .sln ──────────────────────────────────────────────────────────────────── - -/// Extract project nodes and inter-project dependency edges from a `.sln` file. -/// -/// Each `Project(...) = ...` block becomes a node attached to the solution -/// file via `contains`; `ProjectSection(ProjectDependencies)` entries become -/// `imports` edges between projects identified by GUID. Mirrors -/// `graphify-py` `extract_sln`. -#[must_use] -#[allow(clippy::too_many_lines)] // two linear passes over .sln plus node/edge bookkeeping -pub fn extract_sln(path: &Path) -> FileResult { - let Ok(src) = std::fs::read_to_string(path) else { - return FileResult::error(format!("cannot read {}", path.display())); - }; - - let str_path = path.to_string_lossy().into_owned(); - let file_nid = make_id1(&str_path); - - let mut nodes: Vec = vec![Node { - id: file_nid.clone(), - label: path - .file_name() - .map_or(String::new(), |f| f.to_string_lossy().into_owned()), - file_type: "code".to_string(), - source_file: str_path.clone(), - source_location: None, - metadata: None, - }]; - let mut edges: Vec = Vec::new(); - let mut seen_ids: HashSet = HashSet::new(); - seen_ids.insert(file_nid.clone()); - - let mut guid_to_nid: std::collections::HashMap = - std::collections::HashMap::new(); - - for cap in SLN_PROJECT_RE.captures_iter(&src) { - let proj_name = cap.get(1).map_or("", |m| m.as_str()).to_string(); - let proj_path = cap.get(2).map_or("", |m| m.as_str()).replace('\\', "/"); - let proj_guid = cap - .get(3) - .map_or("", |m| m.as_str()) 
- .trim_matches(|c| c == '{' || c == '}') - .to_string(); - - let abs_proj = path - .parent() - .map(|p| p.join(&proj_path)) - .and_then(|p| p.canonicalize().ok()) - .map_or_else(|| proj_path.clone(), |p| p.to_string_lossy().into_owned()); - let proj_nid = make_id1(&abs_proj); - if !proj_nid.is_empty() && seen_ids.insert(proj_nid.clone()) { - nodes.push(Node { - id: proj_nid.clone(), - label: proj_name, - file_type: "code".to_string(), - source_file: abs_proj.clone(), - source_location: None, - metadata: None, - }); - edges.push(Edge { - external: false, - source: file_nid.clone(), - target: proj_nid.clone(), - relation: "contains".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: str_path.clone(), - source_location: None, - weight: 1.0, - context: None, - confidence_score: None, - }); - } - if !proj_guid.is_empty() { - guid_to_nid.insert(proj_guid.to_lowercase(), proj_nid); - } - } - - // Second pass: project-dependency sections. Each block is nested inside - // a Project(...)/EndProject pair so we track the currently open project's - // GUID and emit `imports` edges to each declared dependency. 
- let mut in_dep_section = false; - let mut current_proj_guid: Option = None; - for line in src.lines() { - if let Some(cap) = SLN_PROJECT_LINE_RE.captures(line) { - current_proj_guid = cap.get(1).map(|m| m.as_str().to_lowercase()); - continue; - } - if line.trim() == "EndProject" { - current_proj_guid = None; - continue; - } - if line.contains("ProjectSection(ProjectDependencies)") { - in_dep_section = true; - continue; - } - if in_dep_section && line.contains("EndProjectSection") { - in_dep_section = false; - continue; - } - if in_dep_section - && let Some(ref from_guid) = current_proj_guid - && let Some(dep_cap) = SLN_DEP_RE.captures(line) - { - let to_guid = dep_cap.get(1).map_or("", |m| m.as_str()).to_lowercase(); - let from_nid = guid_to_nid.get(from_guid); - let to_nid = guid_to_nid.get(&to_guid); - if let (Some(from), Some(to)) = (from_nid, to_nid) - && from != to - { - edges.push(Edge { - external: false, - source: from.clone(), - target: to.clone(), - relation: "imports".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: str_path.clone(), - source_location: None, - weight: 1.0, - context: None, - confidence_score: None, - }); - } - } - } - - FileResult { - nodes, - edges, - raw_calls: Vec::new(), - error: None, - } -} - -// ── .slnx ───────────────────────────────────────────────────────────────── - -/// Shared mutable state threaded through the `.slnx` streaming parse. -struct SlnxCtx<'a> { - path: &'a Path, - str_path: &'a str, - file_nid: &'a str, - nodes: &'a mut Vec, - edges: &'a mut Vec, - seen_ids: &'a mut HashSet, - project_nids: &'a mut HashSet, - /// Candidate `(from_nid, to_nid)` build dependencies, filtered against - /// `project_nids` only after the whole document is parsed (a dependency may - /// reference a project declared later in the file). - dep_candidates: &'a mut Vec<(String, String)>, - /// `from_nid`s of currently open `` elements, so a nested - /// `` attaches to its nearest enclosing project. 
An empty - /// string marks an open `` without a `Path` (keeps push/pop balanced). - proj_stack: &'a mut Vec, -} - -impl SlnxCtx<'_> { - /// Resolve a project path relative to the solution file, mirroring the - /// `.sln` resolver: canonicalise when the target exists, otherwise fall - /// back to the slash-normalised relative path so ids stay deterministic. - fn resolve(&self, proj_path: &str) -> String { - let norm = proj_path.replace('\\', "/"); - self.path - .parent() - .map(|p| p.join(&norm)) - .and_then(|p| p.canonicalize().ok()) - .map_or(norm, |p| p.to_string_lossy().into_owned()) - } - - /// Handle one `` / `` element. `has_children` is - /// `true` for a `Start` tag (a matching `End` will pop the stack) and - /// `false` for a self-closing `Empty` tag. - fn on_element(&mut self, e: &BytesStart<'_>, has_children: bool) { - match local_name(e).as_str() { - "Project" => { - let path_attr = attr_ci(e, "Path").filter(|s| !s.is_empty()); - let proj_nid = match &path_attr { - Some(proj_path) => { - let abs = self.resolve(proj_path); - let nid = make_id1(&abs); - if !nid.is_empty() { - if self.seen_ids.insert(nid.clone()) { - let label = Path::new(proj_path).file_stem().map_or_else( - || proj_path.clone(), - |s| s.to_string_lossy().into_owned(), - ); - self.nodes.push(Node { - id: nid.clone(), - label, - file_type: "code".to_string(), - source_file: abs.clone(), - source_location: None, - metadata: None, - }); - self.edges.push(Edge { - external: false, - source: self.file_nid.to_string(), - target: nid.clone(), - relation: "contains".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: self.str_path.to_string(), - source_location: None, - weight: 1.0, - context: None, - confidence_score: None, - }); - } - self.project_nids.insert(nid.clone()); - } - nid - } - None => String::new(), - }; - if has_children { - self.proj_stack.push(proj_nid); - } - } - "BuildDependency" => { - if let Some(dep_path) = attr_ci(e, "Project").filter(|s| !s.is_empty()) { - 
let to_nid = make_id1(&self.resolve(&dep_path)); - if let Some(from) = self.proj_stack.last() - && !from.is_empty() - && !to_nid.is_empty() - && *from != to_nid - { - self.dep_candidates.push((from.clone(), to_nid)); - } - } - } - _ => {} - } - } -} - -/// Extract project nodes and inter-project build-order dependencies from a -/// `.slnx` file — the XML-based replacement for `.sln`. -/// -/// `` elements (anywhere in the tree, including inside -/// ``) become nodes attached to the solution via `contains`; -/// `` children become `imports` edges between -/// known projects. Unlike `.sln` there are no GUIDs — projects are identified -/// by their resolved path. Mirrors `graphify-py` `extract_slnx`. -#[must_use] -pub fn extract_slnx(path: &Path) -> FileResult { - let Ok(bytes) = std::fs::read(path) else { - return FileResult::error(format!("cannot read {}", path.display())); - }; - if bytes.len() as u64 > CSPROJ_MAX_BYTES { - return FileResult::error("project file too large"); - } - if !crate::extractors::project_xml_is_safe(&bytes) { - return FileResult::error("refusing XML with DOCTYPE/ENTITY declaration"); - } - - let str_path = path.to_string_lossy().into_owned(); - let file_nid = make_id1(&str_path); - - let mut nodes: Vec = vec![Node { - id: file_nid.clone(), - label: path - .file_name() - .map_or(String::new(), |f| f.to_string_lossy().into_owned()), - file_type: "code".to_string(), - source_file: str_path.clone(), - source_location: None, - metadata: None, - }]; - let mut edges: Vec = Vec::new(); - let mut seen_ids: HashSet = HashSet::new(); - seen_ids.insert(file_nid.clone()); - - let mut project_nids: HashSet = HashSet::new(); - let mut dep_candidates: Vec<(String, String)> = Vec::new(); - let mut proj_stack: Vec = Vec::new(); - - let mut reader = Reader::from_reader(&*bytes); - reader.config_mut().trim_text(true); - let mut buf = Vec::new(); - { - let mut ctx = SlnxCtx { - path, - str_path: &str_path, - file_nid: &file_nid, - nodes: &mut nodes, - edges: 
&mut edges, - seen_ids: &mut seen_ids, - project_nids: &mut project_nids, - dep_candidates: &mut dep_candidates, - proj_stack: &mut proj_stack, - }; - loop { - match reader.read_event_into(&mut buf) { - Err(e) => return FileResult::error(format!("XML parse error: {e}")), - Ok(Event::Eof) => break, - Ok(Event::Start(ref e)) => ctx.on_element(e, true), - Ok(Event::Empty(ref e)) => ctx.on_element(e, false), - Ok(Event::End(ref e)) => { - // `BytesEnd` is a distinct type from `BytesStart`, so strip - // the namespace prefix inline rather than via `local_name`. - let name = e.name(); - let raw = name.as_ref(); - let local = raw - .iter() - .rposition(|&b| b == b':') - .map_or(raw, |i| &raw[i + 1..]); - if local == b"Project" { - ctx.proj_stack.pop(); - } - } - _ => {} - } - buf.clear(); - } - } - - // Build-order dependencies between known projects. - for (from, to) in dep_candidates { - if project_nids.contains(&to) { - edges.push(Edge { - external: false, - source: from, - target: to, - relation: "imports".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: str_path.clone(), - source_location: None, - weight: 1.0, - context: None, - confidence_score: None, - }); - } - } - - FileResult { - nodes, - edges, - raw_calls: Vec::new(), - error: None, - } -} - -// ── .csproj / .fsproj / .vbproj ───────────────────────────────────────────── - -/// Strip an XML element's namespace prefix so callers can match on the local -/// tag name. Matches Python's `tag.split('}')[1]` pattern. -fn local_name(start: &BytesStart<'_>) -> String { - let name = start.name(); - let raw = name.as_ref(); - let local = raw - .iter() - .rposition(|&b| b == b':') - .map_or(raw, |i| &raw[i + 1..]); - String::from_utf8_lossy(local).into_owned() -} - -/// Find `attr` on a `BytesStart`, falling back to its lowercased variant — -/// mirrors Python's case-insensitive `Include`/`include` lookup. Returns -/// `None` when neither attribute is present. 
-fn attr_ci(start: &BytesStart<'_>, attr: &str) -> Option { - start - .try_get_attribute(attr) - .ok() - .flatten() - .or_else(|| { - start - .try_get_attribute(attr.to_lowercase().as_str()) - .ok() - .flatten() - }) - // `normalized_value` decodes XML entities (`&` → `&`, `/` - // → `/`, etc.) and collapses whitespace per the XML attribute-value - // normalization rules. Python's ElementTree returns already-decoded - // attribute text, so we match that here — a - // `PackageReference Include="A&B"` becomes the literal `A&B` - // node label instead of `A&B`. - .and_then(|a| { - // Treat the document as XML 1.0 when the declaration was - // omitted (csproj files almost never carry an `` prolog). - a.normalized_value(quick_xml::XmlVersion::Implicit1_0) - .ok() - .map(std::borrow::Cow::into_owned) - }) -} - -/// Extract packages, project references, target frameworks, and SDK from an -/// `MSBuild` project file (`.csproj` / `.fsproj` / `.vbproj`). Mirrors -/// `graphify-py` `extract_csproj`. 
-#[must_use] -#[allow(clippy::too_many_lines)] // linear element dispatch, hard to split without losing locality -pub fn extract_csproj(path: &Path) -> FileResult { - let Ok(bytes) = std::fs::read(path) else { - return FileResult::error(format!("cannot read {}", path.display())); - }; - if bytes.len() as u64 > CSPROJ_MAX_BYTES { - return FileResult::error("project file too large"); - } - if !crate::extractors::project_xml_is_safe(&bytes) { - return FileResult::error("refusing XML with DOCTYPE/ENTITY declaration"); - } - - let str_path = path.to_string_lossy().into_owned(); - let file_nid = make_id1(&str_path); - - let mut nodes: Vec = vec![Node { - id: file_nid.clone(), - label: path - .file_name() - .map_or(String::new(), |f| f.to_string_lossy().into_owned()), - file_type: "code".to_string(), - source_file: str_path.clone(), - source_location: None, - metadata: None, - }]; - let mut edges: Vec = Vec::new(); - let mut seen_ids: HashSet = HashSet::new(); - seen_ids.insert(file_nid.clone()); - - let mut reader = Reader::from_reader(&*bytes); - reader.config_mut().trim_text(true); - - // Root-level SDK attribute (read on the first encountered start tag). - let mut root_sdk: Option = None; - let mut root_seen = false; - - // Track text content of `` / `` since - // quick-xml delivers text in a separate event. 
- let mut capture = TextCapture::None; - - let mut buf = Vec::new(); - loop { - match reader.read_event_into(&mut buf) { - Err(e) => { - return FileResult::error(format!("XML parse error: {e}")); - } - Ok(Event::Eof) => break, - Ok(Event::Start(ref e) | Event::Empty(ref e)) => { - if !root_seen { - root_seen = true; - root_sdk = attr_ci(e, "Sdk"); - } - let name = local_name(e); - match name.as_str() { - "TargetFramework" => { - capture = TextCapture::TargetFramework; - } - "TargetFrameworks" => { - capture = TextCapture::TargetFrameworks; - } - "PackageReference" => { - let Some(pkg_name) = attr_ci(e, "Include") else { - continue; - }; - let version = attr_ci(e, "Version").unwrap_or_default(); - let pkg_nid = make_id(&["nuget", &pkg_name]); - if pkg_nid.is_empty() { - continue; - } - let label = if version.is_empty() { - pkg_name.clone() - } else { - format!("{pkg_name} ({version})") - }; - if seen_ids.insert(pkg_nid.clone()) { - nodes.push(Node { - id: pkg_nid.clone(), - label, - file_type: "code".to_string(), - source_file: str_path.clone(), - source_location: None, - metadata: None, - }); - } - edges.push(Edge { - external: false, - source: file_nid.clone(), - target: pkg_nid, - relation: "imports".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: str_path.clone(), - source_location: None, - weight: 1.0, - context: None, - confidence_score: None, - }); - } - "ProjectReference" => { - let Some(ref_path) = attr_ci(e, "Include") else { - continue; - }; - let ref_norm = ref_path.replace('\\', "/"); - let abs_ref = path - .parent() - .map(|p| p.join(&ref_norm)) - .and_then(|p| p.canonicalize().ok()) - .map_or_else(|| ref_norm.clone(), |p| p.to_string_lossy().into_owned()); - let proj_nid = make_id1(&abs_ref); - if proj_nid.is_empty() { - continue; - } - let proj_label = Path::new(&ref_norm) - .file_name() - .map_or_else(|| ref_norm.clone(), |n| n.to_string_lossy().into_owned()); - if seen_ids.insert(proj_nid.clone()) { - nodes.push(Node { - id: 
proj_nid.clone(), - label: proj_label, - file_type: "code".to_string(), - source_file: abs_ref, - source_location: None, - metadata: None, - }); - } - edges.push(Edge { - external: false, - source: file_nid.clone(), - target: proj_nid, - relation: "imports".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: str_path.clone(), - source_location: None, - weight: 1.0, - context: None, - confidence_score: None, - }); - } - _ => {} - } - } - Ok(Event::Text(t)) => { - let text = match t.decode() { - Ok(s) => s.into_owned(), - Err(_) => continue, - }; - match capture { - TextCapture::TargetFramework => { - let fw = text.trim().to_string(); - if !fw.is_empty() { - add_framework_node( - &fw, - &str_path, - &file_nid, - &mut nodes, - &mut edges, - &mut seen_ids, - ); - } - } - TextCapture::TargetFrameworks => { - for fw_raw in text.trim().split(';') { - let fw = fw_raw.trim(); - if !fw.is_empty() { - add_framework_node( - fw, - &str_path, - &file_nid, - &mut nodes, - &mut edges, - &mut seen_ids, - ); - } - } - } - TextCapture::None => {} - } - capture = TextCapture::None; - } - Ok(Event::End(_)) => { - capture = TextCapture::None; - } - _ => {} - } - buf.clear(); - } - - if let Some(sdk) = root_sdk - && !sdk.is_empty() - { - let sdk_nid = make_id(&["sdk", &sdk]); - if !sdk_nid.is_empty() && seen_ids.insert(sdk_nid.clone()) { - nodes.push(Node { - id: sdk_nid.clone(), - label: sdk, - file_type: "concept".to_string(), - source_file: str_path.clone(), - source_location: None, - metadata: None, - }); - edges.push(Edge { - external: false, - source: file_nid.clone(), - target: sdk_nid, - relation: "references".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: str_path.clone(), - source_location: None, - weight: 1.0, - context: None, - confidence_score: None, - }); - } - } - - FileResult { - nodes, - edges, - raw_calls: Vec::new(), - error: None, - } -} - -fn add_framework_node( - fw: &str, - str_path: &str, - file_nid: &str, - nodes: &mut Vec, - 
edges: &mut Vec, - seen_ids: &mut HashSet, -) { - let fw_nid = make_id(&["framework", fw]); - if fw_nid.is_empty() || !seen_ids.insert(fw_nid.clone()) { - return; - } - nodes.push(Node { - id: fw_nid.clone(), - label: fw.to_string(), - file_type: "concept".to_string(), - source_file: str_path.to_string(), - source_location: None, - metadata: None, - }); - edges.push(Edge { - external: false, - source: file_nid.to_string(), - target: fw_nid, - relation: "references".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: str_path.to_string(), - source_location: None, - weight: 1.0, - context: None, - confidence_score: None, - }); -} - -// ── .razor / .cshtml ──────────────────────────────────────────────────────── - -/// Extract directives, component refs, and `@code` methods from a `.razor` / -/// `.cshtml` file. Mirrors `graphify-py` `extract_razor`. -#[must_use] -#[allow(clippy::too_many_lines)] // linear directive dispatch + component scan + @code body parse -pub fn extract_razor(path: &Path) -> FileResult { - let Ok(src) = std::fs::read_to_string(path) else { - return FileResult::error(format!("cannot read {}", path.display())); - }; - - let str_path = path.to_string_lossy().into_owned(); - let file_nid = make_id1(&str_path); - - let mut nodes: Vec = vec![Node { - id: file_nid.clone(), - label: path - .file_name() - .map_or(String::new(), |f| f.to_string_lossy().into_owned()), - file_type: "code".to_string(), - source_file: str_path.clone(), - source_location: None, - metadata: None, - }]; - let mut edges: Vec = Vec::new(); - let mut seen_ids: HashSet = HashSet::new(); - seen_ids.insert(file_nid.clone()); - - let add_ref = |target_name: &str, - relation: &str, - line: usize, - nodes: &mut Vec, - edges: &mut Vec, - seen_ids: &mut HashSet| { - let tgt_nid = make_id1(target_name); - if tgt_nid.is_empty() { - return; - } - if seen_ids.insert(tgt_nid.clone()) { - nodes.push(Node { - id: tgt_nid.clone(), - label: target_name.to_string(), - file_type: 
"code".to_string(), - source_file: str_path.clone(), - source_location: Some(format!("L{line}")), - metadata: None, - }); - } - edges.push(Edge { - external: false, - source: file_nid.clone(), - target: tgt_nid, - relation: relation.to_string(), - confidence: "EXTRACTED".to_string(), - source_file: str_path.clone(), - source_location: Some(format!("L{line}")), - weight: 1.0, - context: None, - confidence_score: None, - }); - }; - - for (idx, line) in src.lines().enumerate() { - let i = idx + 1; - if let Some(cap) = RAZOR_USING_RE.captures(line) { - if let Some(m) = cap.get(1) { - add_ref( - m.as_str(), - "imports", - i, - &mut nodes, - &mut edges, - &mut seen_ids, - ); - } - continue; - } - if let Some(cap) = RAZOR_INJECT_RE.captures(line) { - if let Some(m) = cap.get(1) { - add_ref( - m.as_str(), - "imports", - i, - &mut nodes, - &mut edges, - &mut seen_ids, - ); - } - continue; - } - if let Some(cap) = RAZOR_INHERITS_RE.captures(line) { - if let Some(m) = cap.get(1) { - add_ref( - m.as_str(), - "inherits", - i, - &mut nodes, - &mut edges, - &mut seen_ids, - ); - } - continue; - } - if let Some(cap) = RAZOR_MODEL_RE.captures(line) { - if let Some(m) = cap.get(1) { - add_ref( - m.as_str(), - "references", - i, - &mut nodes, - &mut edges, - &mut seen_ids, - ); - } - continue; - } - if let Some(cap) = RAZOR_PAGE_RE.captures(line) - && let Some(m) = cap.get(1) - { - let route = m.as_str(); - let route_nid = make_id(&["route", route]); - if !route_nid.is_empty() && seen_ids.insert(route_nid.clone()) { - nodes.push(Node { - id: route_nid.clone(), - label: format!("route:{route}"), - file_type: "concept".to_string(), - source_file: str_path.clone(), - source_location: Some(format!("L{i}")), - metadata: None, - }); - edges.push(Edge { - external: false, - source: file_nid.clone(), - target: route_nid, - relation: "references".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: str_path.clone(), - source_location: None, - weight: 1.0, - context: None, - 
confidence_score: None, - }); - } - } - } - - // Component references: capitalised tag names that aren't known HTML elements. - for m in RAZOR_COMPONENT_RE.captures_iter(&src) { - let Some(name_m) = m.get(1) else { continue }; - let comp_name = name_m.as_str(); - if RAZOR_HTML_TAGS.contains(&comp_name) { - continue; - } - let abs_pos = name_m.start(); - let line_num = src[..abs_pos].chars().filter(|&c| c == '\n').count() + 1; - add_ref( - comp_name, - "calls", - line_num, - &mut nodes, - &mut edges, - &mut seen_ids, - ); - } - - // @code { ... } method extraction. Find each `@code {` opening, walk - // braces tracking C# lexical context (line comments, block comments, - // regular strings, verbatim strings, char literals) so braces inside - // those don't confuse the depth counter. - // - // Divergence from `graphify-py` `extract_razor` (intentional): the - // Python brace counter is purely structural, which means a method - // body containing `"}{"` would truncate `block_end` early and - // silently drop every method below that point. Run-aware scanning - // costs O(n) extra work but produces the right block boundary. 
- let stem = file_stem(path); - let src_bytes = src.as_bytes(); - for cap in RAZOR_CODE_BLOCK_RE.find_iter(&src) { - let block_start = cap.end(); - let block_end = find_csharp_block_end(src_bytes, block_start); - if block_end <= block_start { - continue; - } - let block_body = &src[block_start..block_end]; - for mm in RAZOR_METHOD_RE.captures_iter(block_body) { - let Some(name_m) = mm.get(1) else { continue }; - let method_name = name_m.as_str(); - let abs_pos = block_start + name_m.start(); - let method_line = src[..abs_pos].chars().filter(|&c| c == '\n').count() + 1; - let method_nid = make_id(&[&stem, method_name]); - if method_nid.is_empty() { - continue; - } - if seen_ids.insert(method_nid.clone()) { - nodes.push(Node { - id: method_nid.clone(), - label: method_name.to_string(), - file_type: "code".to_string(), - source_file: str_path.clone(), - source_location: Some(format!("L{method_line}")), - metadata: None, - }); - } - edges.push(Edge { - external: false, - source: file_nid.clone(), - target: method_nid, - relation: "contains".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: str_path.clone(), - source_location: None, - weight: 1.0, - context: None, - confidence_score: None, - }); - } - } - - FileResult { - nodes, - edges, - raw_calls: Vec::new(), - error: None, - } -} - -/// Find the byte index of the closing `}` that matches the opening `{` of -/// an `@code {` block. -/// -/// Walks `src` from `start` tracking C# lexical state: line comments, -/// block comments, regular strings (with `\` escape), verbatim strings -/// (with `""` escape), interpolated strings, and char literals. Braces -/// inside any of those don't count toward the depth. -/// -/// Returns the byte offset of the matching `}` (the byte one past the -/// last byte of the block body). When the closing `}` is missing, -/// returns `src.len()` so the caller fails open and still scans -/// whatever body it has. 
-#[allow(clippy::too_many_lines)] // linear state-machine dispatch; splitting per-state would just spread the transition table -fn find_csharp_block_end(src: &[u8], start: usize) -> usize { - #[derive(Clone, Copy, PartialEq, Eq)] - enum State { - Code, - LineComment, - BlockComment, - String, - VerbatimString, - Char, - } - let mut state = State::Code; - let mut depth: i32 = 1; - let mut pos = start; - while pos < src.len() { - let b = src[pos]; - match state { - State::Code => { - let next = src.get(pos + 1).copied().unwrap_or(0); - if b == b'/' && next == b'/' { - state = State::LineComment; - pos += 2; - continue; - } - if b == b'/' && next == b'*' { - state = State::BlockComment; - pos += 2; - continue; - } - if (b == b'@' || b == b'$') && next == b'"' { - // `@"..."` is verbatim (no `\` escape, `""` is the - // embedded-quote). `$"..."` is interpolated — the - // embedded text between holes honours the regular - // `\"` escape, so route it through the regular - // string state. - state = if b == b'@' { - State::VerbatimString - } else { - State::String - }; - pos += 2; - continue; - } - if b == b'"' { - state = State::String; - pos += 1; - continue; - } - if b == b'\'' { - state = State::Char; - pos += 1; - continue; - } - if b == b'{' { - depth += 1; - } else if b == b'}' { - depth -= 1; - if depth == 0 { - return pos; - } - } - pos += 1; - } - State::LineComment => { - if b == b'\n' { - state = State::Code; - } - pos += 1; - } - State::BlockComment => { - if b == b'*' && src.get(pos + 1).copied() == Some(b'/') { - state = State::Code; - pos += 2; - } else { - pos += 1; - } - } - State::String => { - if b == b'\\' && pos + 1 < src.len() { - // Skip the escaped char (covers `\"`, `\\`, `\n`, ...). 
- pos += 2; - } else if b == b'"' { - state = State::Code; - pos += 1; - } else { - pos += 1; - } - } - State::VerbatimString => { - if b == b'"' && src.get(pos + 1).copied() == Some(b'"') { - pos += 2; - } else if b == b'"' { - state = State::Code; - pos += 1; - } else { - pos += 1; - } - } - State::Char => { - if b == b'\\' && pos + 1 < src.len() { - pos += 2; - } else if b == b'\'' { - state = State::Code; - pos += 1; - } else { - pos += 1; - } - } - } - } - src.len() -} diff --git a/crates/graphify-extract/src/extractors/dotnet/csproj.rs b/crates/graphify-extract/src/extractors/dotnet/csproj.rs new file mode 100644 index 0000000..e6c0753 --- /dev/null +++ b/crates/graphify-extract/src/extractors/dotnet/csproj.rs @@ -0,0 +1,288 @@ +//! `.csproj` / `.fsproj` / `.vbproj` `MSBuild` project-file extractor. + +use super::{CSPROJ_MAX_BYTES, attr_ci, local_name}; +use crate::ids::{make_id, make_id1}; +use crate::types::{Edge, FileResult, Node}; +use quick_xml::events::Event; +use quick_xml::reader::Reader; +use std::collections::HashSet; +use std::path::Path; + +/// Text events between an opening element tag and its matching close get +/// routed through this enum when the start tag was a `` or +/// `` element. +enum TextCapture { + None, + TargetFramework, + TargetFrameworks, +} + +/// Extract packages, project references, target frameworks, and SDK from an +/// `MSBuild` project file (`.csproj` / `.fsproj` / `.vbproj`). Mirrors +/// `graphify-py` `extract_csproj`. 
+#[must_use]
+#[allow(clippy::too_many_lines)] // linear element dispatch, hard to split without losing locality
+pub fn extract_csproj(path: &Path) -> FileResult {
+    let Ok(bytes) = std::fs::read(path) else {
+        return FileResult::error(format!("cannot read {}", path.display()));
+    };
+    if bytes.len() as u64 > CSPROJ_MAX_BYTES {
+        return FileResult::error("project file too large");
+    }
+    if !crate::extractors::project_xml_is_safe(&bytes) {
+        return FileResult::error("refusing XML with DOCTYPE/ENTITY declaration");
+    }
+
+    let str_path = path.to_string_lossy().into_owned();
+    let file_nid = make_id1(&str_path);
+
+    let mut nodes: Vec<Node> = vec![Node {
+        id: file_nid.clone(),
+        label: path
+            .file_name()
+            .map_or(String::new(), |f| f.to_string_lossy().into_owned()),
+        file_type: "code".to_string(),
+        source_file: str_path.clone(),
+        source_location: None,
+        metadata: None,
+    }];
+    let mut edges: Vec<Edge> = Vec::new();
+    let mut seen_ids: HashSet<String> = HashSet::new();
+    seen_ids.insert(file_nid.clone());
+
+    let mut reader = Reader::from_reader(&*bytes);
+    reader.config_mut().trim_text(true);
+
+    // Root-level SDK attribute (read on the first encountered start tag).
+    let mut root_sdk: Option<String> = None;
+    let mut root_seen = false;
+
+    // Track text content of `<TargetFramework>` / `<TargetFrameworks>` since
+    // quick-xml delivers text in a separate event.
+    let mut capture = TextCapture::None;
+
+    let mut buf = Vec::new();
+    loop {
+        match reader.read_event_into(&mut buf) {
+            Err(e) => {
+                return FileResult::error(format!("XML parse error: {e}"));
+            }
+            Ok(Event::Eof) => break,
+            Ok(start @ (Event::Start(_) | Event::Empty(_))) => {
+                // A self-closing `<TargetFramework/>` (an `Empty` event) carries no
+                // text, so only a real open tag arms the capture; otherwise the flag
+                // would dangle and misattribute the next element's text. graphify-py
+                // reads `tf.text` (None for self-closing tags), so this matches it.
+                let is_empty = matches!(start, Event::Empty(_));
+                let (Event::Start(e) | Event::Empty(e)) = &start else {
+                    continue;
+                };
+                if !root_seen {
+                    root_seen = true;
+                    root_sdk = attr_ci(e, "Sdk");
+                }
+                let name = local_name(e);
+                match name.as_str() {
+                    "TargetFramework" if !is_empty => {
+                        capture = TextCapture::TargetFramework;
+                    }
+                    "TargetFrameworks" if !is_empty => {
+                        capture = TextCapture::TargetFrameworks;
+                    }
+                    "PackageReference" => {
+                        let Some(pkg_name) = attr_ci(e, "Include") else {
+                            continue;
+                        };
+                        let version = attr_ci(e, "Version").unwrap_or_default();
+                        let pkg_nid = make_id(&["nuget", &pkg_name]);
+                        if pkg_nid.is_empty() {
+                            continue;
+                        }
+                        let label = if version.is_empty() {
+                            pkg_name.clone()
+                        } else {
+                            format!("{pkg_name} ({version})")
+                        };
+                        if seen_ids.insert(pkg_nid.clone()) {
+                            nodes.push(Node {
+                                id: pkg_nid.clone(),
+                                label,
+                                file_type: "code".to_string(),
+                                source_file: str_path.clone(),
+                                source_location: None,
+                                metadata: None,
+                            });
+                        }
+                        edges.push(Edge {
+                            external: false,
+                            source: file_nid.clone(),
+                            target: pkg_nid,
+                            relation: "imports".to_string(),
+                            confidence: "EXTRACTED".to_string(),
+                            source_file: str_path.clone(),
+                            source_location: None,
+                            weight: 1.0,
+                            context: None,
+                            confidence_score: None,
+                        });
+                    }
+                    "ProjectReference" => {
+                        let Some(ref_path) = attr_ci(e, "Include") else {
+                            continue;
+                        };
+                        let ref_norm = ref_path.replace('\\', "/");
+                        let abs_ref = path
+                            .parent()
+                            .map(|p| p.join(&ref_norm))
+                            .and_then(|p| p.canonicalize().ok())
+                            .map_or_else(|| ref_norm.clone(), |p| p.to_string_lossy().into_owned());
+                        let proj_nid = make_id1(&abs_ref);
+                        if proj_nid.is_empty() {
+                            continue;
+                        }
+                        let proj_label = Path::new(&ref_norm)
+                            .file_name()
+                            .map_or_else(|| ref_norm.clone(), |n| n.to_string_lossy().into_owned());
+                        if seen_ids.insert(proj_nid.clone()) {
+                            nodes.push(Node {
+                                id: proj_nid.clone(),
+                                label: proj_label,
+                                file_type: "code".to_string(),
+                                source_file: abs_ref,
+                                source_location: None,
+                                metadata: None,
+                            });
+                        }
+                        edges.push(Edge {
+                            external: false,
+                            source: file_nid.clone(),
+                            target: proj_nid,
+                            relation: "imports".to_string(),
+                            confidence: "EXTRACTED".to_string(),
+                            source_file: str_path.clone(),
+                            source_location: None,
+                            weight: 1.0,
+                            context: None,
+                            confidence_score: None,
+                        });
+                    }
+                    _ => {}
+                }
+            }
+            Ok(Event::Text(t)) => {
+                let text = match t.decode() {
+                    Ok(s) => s.into_owned(),
+                    Err(_) => continue,
+                };
+                match capture {
+                    TextCapture::TargetFramework => {
+                        let fw = text.trim().to_string();
+                        if !fw.is_empty() {
+                            add_framework_node(
+                                &fw,
+                                &str_path,
+                                &file_nid,
+                                &mut nodes,
+                                &mut edges,
+                                &mut seen_ids,
+                            );
+                        }
+                    }
+                    TextCapture::TargetFrameworks => {
+                        for fw_raw in text.trim().split(';') {
+                            let fw = fw_raw.trim();
+                            if !fw.is_empty() {
+                                add_framework_node(
+                                    fw,
+                                    &str_path,
+                                    &file_nid,
+                                    &mut nodes,
+                                    &mut edges,
+                                    &mut seen_ids,
+                                );
+                            }
+                        }
+                    }
+                    TextCapture::None => {}
+                }
+                capture = TextCapture::None;
+            }
+            Ok(Event::End(_)) => {
+                capture = TextCapture::None;
+            }
+            _ => {}
+        }
+        buf.clear();
+    }
+
+    if let Some(sdk) = root_sdk
+        && !sdk.is_empty()
+    {
+        let sdk_nid = make_id(&["sdk", &sdk]);
+        if !sdk_nid.is_empty() && seen_ids.insert(sdk_nid.clone()) {
+            nodes.push(Node {
+                id: sdk_nid.clone(),
+                label: sdk,
+                file_type: "concept".to_string(),
+                source_file: str_path.clone(),
+                source_location: None,
+                metadata: None,
+            });
+            edges.push(Edge {
+                external: false,
+                source: file_nid.clone(),
+                target: sdk_nid,
+                relation: "references".to_string(),
+                confidence: "EXTRACTED".to_string(),
+                source_file: str_path.clone(),
+                source_location: None,
+                weight: 1.0,
+                context: None,
+                confidence_score: None,
+            });
+        }
+    }
+
+    FileResult {
+        nodes,
+        edges,
+        raw_calls: Vec::new(),
+        error: None,
+    }
+}
+
+fn add_framework_node(
+    fw: &str,
+    str_path: &str,
+    file_nid: &str,
+    nodes: &mut Vec<Node>,
+    edges: &mut Vec<Edge>,
+    seen_ids: &mut HashSet<String>,
+) {
+    let fw_nid = make_id(&["framework", fw]);
+    if fw_nid.is_empty() || !seen_ids.insert(fw_nid.clone()) {
+        return;
+    }
+    nodes.push(Node {
+        id: fw_nid.clone(),
+        label: fw.to_string(),
+        file_type: "concept".to_string(),
+        source_file: str_path.to_string(),
+        source_location: None,
+        metadata: None,
+    });
+    edges.push(Edge {
+        external: false,
+        source: file_nid.to_string(),
+        target: fw_nid,
+        relation: "references".to_string(),
+        confidence: "EXTRACTED".to_string(),
+        source_file: str_path.to_string(),
+        source_location: None,
+        weight: 1.0,
+        context: None,
+        confidence_score: None,
+    });
+}
+
+// ── .razor / .cshtml ────────────────────────────────────────────────────────
diff --git a/crates/graphify-extract/src/extractors/dotnet/mod.rs b/crates/graphify-extract/src/extractors/dotnet/mod.rs
new file mode 100644
index 0000000..8a0270b
--- /dev/null
+++ b/crates/graphify-extract/src/extractors/dotnet/mod.rs
@@ -0,0 +1,68 @@
+//! .NET project-file extractors: `.sln`, `.csproj` / `.fsproj` / `.vbproj`, `.razor` / `.cshtml`.
+//!
+//! Ports `graphify-py/graphify/extract.py::extract_sln`,
+//! `extract_csproj`, and `extract_razor`. The Python originals are three
+//! discrete top-level helpers; in Rust they're co-located here because they
+//! share the same target ecosystem and small helpers.
+
+mod csproj;
+mod razor;
+mod sln;
+mod slnx;
+
+pub use csproj::extract_csproj;
+pub use razor::extract_razor;
+pub use sln::extract_sln;
+pub use slnx::extract_slnx;
+
+use quick_xml::events::BytesStart;
+
+/// `MSBuild` project files (`.csproj` / `.fsproj` / `.vbproj`) larger than this
+/// are skipped with an error. Real-world projects are well under 2 MiB; the
+/// cap protects the extractor against accidentally being pointed at a
+/// committed binary or a multi-megabyte generated artefact. Matches the
+/// literal 2 MiB constant in `graphify-py` `extract.py::extract_csproj`,
+/// so the cap is intentionally not configurable — raising or lowering it
+/// across the Python/Rust pair belongs in a separate parity-bumping change.
+const CSPROJ_MAX_BYTES: u64 = 2_097_152; + +/// Strip an XML element's namespace prefix so callers can match on the local +/// tag name. Matches Python's `tag.split('}')[1]` pattern. +fn local_name(start: &BytesStart<'_>) -> String { + let name = start.name(); + let raw = name.as_ref(); + let local = raw + .iter() + .rposition(|&b| b == b':') + .map_or(raw, |i| &raw[i + 1..]); + String::from_utf8_lossy(local).into_owned() +} + +/// Find `attr` on a `BytesStart`, falling back to its lowercased variant — +/// mirrors Python's case-insensitive `Include`/`include` lookup. Returns +/// `None` when neither attribute is present. +fn attr_ci(start: &BytesStart<'_>, attr: &str) -> Option { + start + .try_get_attribute(attr) + .ok() + .flatten() + .or_else(|| { + start + .try_get_attribute(attr.to_lowercase().as_str()) + .ok() + .flatten() + }) + // `normalized_value` decodes XML entities (`&` → `&`, `/` + // → `/`, etc.) and collapses whitespace per the XML attribute-value + // normalization rules. Python's ElementTree returns already-decoded + // attribute text, so we match that here — a + // `PackageReference Include="A&B"` becomes the literal `A&B` + // node label instead of `A&B`. + .and_then(|a| { + // Treat the document as XML 1.0 when the declaration was + // omitted (csproj files almost never carry an `` prolog). + a.normalized_value(quick_xml::XmlVersion::Implicit1_0) + .ok() + .map(std::borrow::Cow::into_owned) + }) +} diff --git a/crates/graphify-extract/src/extractors/dotnet/razor.rs b/crates/graphify-extract/src/extractors/dotnet/razor.rs new file mode 100644 index 0000000..502ebee --- /dev/null +++ b/crates/graphify-extract/src/extractors/dotnet/razor.rs @@ -0,0 +1,404 @@ +//! `.razor` / `.cshtml` component extractor. 
+ +use crate::ids::{file_stem, make_id, make_id1}; +use crate::types::{Edge, FileResult, Node}; +use regex::Regex; +use std::collections::HashSet; +use std::path::Path; +use std::sync::LazyLock; + +#[allow(clippy::expect_used)] +static RAZOR_USING_RE: LazyLock = + LazyLock::new(|| Regex::new(r"^@using\s+([\w.]+)").expect("static razor @using regex")); + +#[allow(clippy::expect_used)] +static RAZOR_INJECT_RE: LazyLock = LazyLock::new(|| { + Regex::new(r"^@inject\s+([\w.<>\[\]]+)\s+(\w+)").expect("static razor @inject regex") +}); + +#[allow(clippy::expect_used)] +static RAZOR_INHERITS_RE: LazyLock = LazyLock::new(|| { + Regex::new(r"^@inherits\s+([\w.<>\[\]]+)").expect("static razor @inherits regex") +}); + +#[allow(clippy::expect_used)] +static RAZOR_MODEL_RE: LazyLock = + LazyLock::new(|| Regex::new(r"^@model\s+([\w.<>\[\]]+)").expect("static razor @model regex")); + +#[allow(clippy::expect_used)] +static RAZOR_PAGE_RE: LazyLock = + LazyLock::new(|| Regex::new(r#"^@page\s+"([^"]+)""#).expect("static razor @page regex")); + +#[allow(clippy::expect_used)] +static RAZOR_COMPONENT_RE: LazyLock = LazyLock::new(|| { + Regex::new(r"<([A-Z][A-Za-z0-9]+)[\s/>]").expect("static razor component regex") +}); + +#[allow(clippy::expect_used)] +static RAZOR_CODE_BLOCK_RE: LazyLock = + LazyLock::new(|| Regex::new(r"(?m)@code\s*\{").expect("static razor @code regex")); + +#[allow(clippy::expect_used)] +static RAZOR_METHOD_RE: LazyLock = LazyLock::new(|| { + Regex::new( + r"(?:public|private|protected|internal|static|async|override|virtual|abstract)\s+[\w<>\[\],\s]+\s+(\w+)\s*\(", + ) + .expect("static razor method regex") +}); + +const RAZOR_HTML_TAGS: &[&str] = &[ + "DOCTYPE", "Html", "Head", "Body", "Div", "Span", "Table", "Form", "Input", "Button", "Select", + "Option", "Label", "Textarea", "Script", "Style", "Link", "Meta", "Title", "Header", "Footer", + "Nav", "Main", "Section", "Article", "Aside", +]; + +// ── .sln 
──────────────────────────────────────────────────────────────────── + +/// Extract directives, component refs, and `@code` methods from a `.razor` / +/// `.cshtml` file. Mirrors `graphify-py` `extract_razor`. +#[must_use] +#[allow(clippy::too_many_lines)] // linear directive dispatch + component scan + @code body parse +pub fn extract_razor(path: &Path) -> FileResult { + let Ok(src) = std::fs::read_to_string(path) else { + return FileResult::error(format!("cannot read {}", path.display())); + }; + + let str_path = path.to_string_lossy().into_owned(); + let file_nid = make_id1(&str_path); + + let mut nodes: Vec = vec![Node { + id: file_nid.clone(), + label: path + .file_name() + .map_or(String::new(), |f| f.to_string_lossy().into_owned()), + file_type: "code".to_string(), + source_file: str_path.clone(), + source_location: None, + metadata: None, + }]; + let mut edges: Vec = Vec::new(); + let mut seen_ids: HashSet = HashSet::new(); + seen_ids.insert(file_nid.clone()); + + let add_ref = |target_name: &str, + relation: &str, + line: usize, + nodes: &mut Vec, + edges: &mut Vec, + seen_ids: &mut HashSet| { + let tgt_nid = make_id1(target_name); + if tgt_nid.is_empty() { + return; + } + if seen_ids.insert(tgt_nid.clone()) { + nodes.push(Node { + id: tgt_nid.clone(), + label: target_name.to_string(), + file_type: "code".to_string(), + source_file: str_path.clone(), + source_location: Some(format!("L{line}")), + metadata: None, + }); + } + edges.push(Edge { + external: false, + source: file_nid.clone(), + target: tgt_nid, + relation: relation.to_string(), + confidence: "EXTRACTED".to_string(), + source_file: str_path.clone(), + source_location: Some(format!("L{line}")), + weight: 1.0, + context: None, + confidence_score: None, + }); + }; + + for (idx, line) in src.lines().enumerate() { + let i = idx + 1; + if let Some(cap) = RAZOR_USING_RE.captures(line) { + if let Some(m) = cap.get(1) { + add_ref( + m.as_str(), + "imports", + i, + &mut nodes, + &mut edges, + &mut 
seen_ids, + ); + } + continue; + } + if let Some(cap) = RAZOR_INJECT_RE.captures(line) { + if let Some(m) = cap.get(1) { + add_ref( + m.as_str(), + "imports", + i, + &mut nodes, + &mut edges, + &mut seen_ids, + ); + } + continue; + } + if let Some(cap) = RAZOR_INHERITS_RE.captures(line) { + if let Some(m) = cap.get(1) { + add_ref( + m.as_str(), + "inherits", + i, + &mut nodes, + &mut edges, + &mut seen_ids, + ); + } + continue; + } + if let Some(cap) = RAZOR_MODEL_RE.captures(line) { + if let Some(m) = cap.get(1) { + add_ref( + m.as_str(), + "references", + i, + &mut nodes, + &mut edges, + &mut seen_ids, + ); + } + continue; + } + if let Some(cap) = RAZOR_PAGE_RE.captures(line) + && let Some(m) = cap.get(1) + { + let route = m.as_str(); + let route_nid = make_id(&["route", route]); + if !route_nid.is_empty() && seen_ids.insert(route_nid.clone()) { + nodes.push(Node { + id: route_nid.clone(), + label: format!("route:{route}"), + file_type: "concept".to_string(), + source_file: str_path.clone(), + source_location: Some(format!("L{i}")), + metadata: None, + }); + edges.push(Edge { + external: false, + source: file_nid.clone(), + target: route_nid, + relation: "references".to_string(), + confidence: "EXTRACTED".to_string(), + source_file: str_path.clone(), + source_location: None, + weight: 1.0, + context: None, + confidence_score: None, + }); + } + } + } + + // Component references: capitalised tag names that aren't known HTML elements. + for m in RAZOR_COMPONENT_RE.captures_iter(&src) { + let Some(name_m) = m.get(1) else { continue }; + let comp_name = name_m.as_str(); + if RAZOR_HTML_TAGS.contains(&comp_name) { + continue; + } + let abs_pos = name_m.start(); + let line_num = src[..abs_pos].chars().filter(|&c| c == '\n').count() + 1; + add_ref( + comp_name, + "calls", + line_num, + &mut nodes, + &mut edges, + &mut seen_ids, + ); + } + + // @code { ... } method extraction. 
Find each `@code {` opening, walk + // braces tracking C# lexical context (line comments, block comments, + // regular strings, verbatim strings, char literals) so braces inside + // those don't confuse the depth counter. + // + // Divergence from `graphify-py` `extract_razor` (intentional): the + // Python brace counter is purely structural, which means a method + // body containing `"}{"` would truncate `block_end` early and + // silently drop every method below that point. Run-aware scanning + // costs O(n) extra work but produces the right block boundary. + let stem = file_stem(path); + let src_bytes = src.as_bytes(); + for cap in RAZOR_CODE_BLOCK_RE.find_iter(&src) { + let block_start = cap.end(); + let block_end = find_csharp_block_end(src_bytes, block_start); + if block_end <= block_start { + continue; + } + let block_body = &src[block_start..block_end]; + for mm in RAZOR_METHOD_RE.captures_iter(block_body) { + let Some(name_m) = mm.get(1) else { continue }; + let method_name = name_m.as_str(); + let abs_pos = block_start + name_m.start(); + let method_line = src[..abs_pos].chars().filter(|&c| c == '\n').count() + 1; + let method_nid = make_id(&[&stem, method_name]); + if method_nid.is_empty() { + continue; + } + if seen_ids.insert(method_nid.clone()) { + nodes.push(Node { + id: method_nid.clone(), + label: method_name.to_string(), + file_type: "code".to_string(), + source_file: str_path.clone(), + source_location: Some(format!("L{method_line}")), + metadata: None, + }); + } + edges.push(Edge { + external: false, + source: file_nid.clone(), + target: method_nid, + relation: "contains".to_string(), + confidence: "EXTRACTED".to_string(), + source_file: str_path.clone(), + source_location: None, + weight: 1.0, + context: None, + confidence_score: None, + }); + } + } + + FileResult { + nodes, + edges, + raw_calls: Vec::new(), + error: None, + } +} + +/// Find the byte index of the closing `}` that matches the opening `{` of +/// an `@code {` block. 
/// Locate the closing `}` that matches the already-consumed opening `{` of
/// an `@code {` block.
///
/// Walks `src` from `start` (the byte just past the opening brace) tracking
/// C# lexical state: line comments, block comments, regular strings (with
/// `\` escape), verbatim strings (with `""` escape), interpolated strings,
/// and char literals. Braces inside any of those don't count toward the
/// depth.
///
/// String prefixes are handled as follows:
///
/// * `@"..."`, `@$"..."` and `$@"..."` are verbatim: `\` is an ordinary
///   character and `""` is an embedded quote.
/// * `$"..."` honours the regular `\"` escape, so it is scanned like a
///   plain string.
///
/// Returns the byte offset of the matching `}` (the byte one past the
/// last byte of the block body). When the closing `}` is missing,
/// returns `src.len()` so the caller fails open and still scans
/// whatever body it has.
fn find_csharp_block_end(src: &[u8], start: usize) -> usize {
    #[derive(Clone, Copy, PartialEq, Eq)]
    enum State {
        Code,
        LineComment,
        BlockComment,
        String,
        VerbatimString,
        Char,
    }

    let mut state = State::Code;
    // The caller has already consumed the opening brace.
    let mut depth: i32 = 1;
    let mut pos = start;
    while pos < src.len() {
        let b = src[pos];
        let next = src.get(pos + 1).copied();
        match state {
            State::Code => {
                let rest = &src[pos..];
                if rest.starts_with(b"//") {
                    state = State::LineComment;
                    pos += 2;
                } else if rest.starts_with(b"/*") {
                    state = State::BlockComment;
                    pos += 2;
                } else if rest.starts_with(b"@$\"") || rest.starts_with(b"$@\"") {
                    // Verbatim-interpolated in either prefix order. Must be
                    // tested before the two-byte prefixes, otherwise `@$"`
                    // degrades to a regular string and a trailing `\`
                    // swallows the closing quote.
                    state = State::VerbatimString;
                    pos += 3;
                } else if rest.starts_with(b"@\"") {
                    state = State::VerbatimString;
                    pos += 2;
                } else if rest.starts_with(b"$\"") {
                    state = State::String;
                    pos += 2;
                } else {
                    match b {
                        b'"' => state = State::String,
                        b'\'' => state = State::Char,
                        b'{' => depth += 1,
                        b'}' => {
                            depth -= 1;
                            if depth == 0 {
                                return pos;
                            }
                        }
                        _ => {}
                    }
                    pos += 1;
                }
            }
            State::LineComment => {
                if b == b'\n' {
                    state = State::Code;
                }
                pos += 1;
            }
            State::BlockComment => {
                if b == b'*' && next == Some(b'/') {
                    state = State::Code;
                    pos += 2;
                } else {
                    pos += 1;
                }
            }
            State::String | State::Char => {
                let terminator = if state == State::String { b'"' } else { b'\'' };
                if b == b'\\' && next.is_some() {
                    // Skip the escaped char (covers `\"`, `\'`, `\\`, `\n`, ...).
                    pos += 2;
                } else {
                    if b == terminator {
                        state = State::Code;
                    }
                    pos += 1;
                }
            }
            State::VerbatimString => {
                if b == b'"' && next == Some(b'"') {
                    // `""` is an embedded quote, not the terminator.
                    pos += 2;
                } else {
                    if b == b'"' {
                        state = State::Code;
                    }
                    pos += 1;
                }
            }
        }
    }
    src.len()
}
+ +use crate::ids::make_id1; +use crate::types::{Edge, FileResult, Node}; +use regex::Regex; +use std::collections::HashSet; +use std::path::Path; +use std::sync::LazyLock; + +#[allow(clippy::expect_used)] // literal pattern; build cannot fail +static SLN_PROJECT_RE: LazyLock = LazyLock::new(|| { + Regex::new(r#"Project\("[^"]*"\)\s*=\s*"([^"]+)"\s*,\s*"([^"]+)"\s*,\s*"([^"]*)""#) + .expect("static sln project regex") +}); + +#[allow(clippy::expect_used)] +static SLN_DEP_RE: LazyLock = LazyLock::new(|| { + Regex::new(r"\{([0-9a-fA-F-]+)\}\s*=\s*\{([0-9a-fA-F-]+)\}") + .expect("static sln dependency regex") +}); + +#[allow(clippy::expect_used)] +static SLN_PROJECT_LINE_RE: LazyLock = LazyLock::new(|| { + Regex::new(r#"Project\("[^"]*"\)\s*=\s*"[^"]+"\s*,\s*"[^"]+"\s*,\s*"\{([^}]+)\}""#) + .expect("static sln project-line regex") +}); + +/// Extract project nodes and inter-project dependency edges from a `.sln` file. +/// +/// Each `Project(...) = ...` block becomes a node attached to the solution +/// file via `contains`; `ProjectSection(ProjectDependencies)` entries become +/// `imports` edges between projects identified by GUID. Mirrors +/// `graphify-py` `extract_sln`. 
+#[must_use] +#[allow(clippy::too_many_lines)] // two linear passes over .sln plus node/edge bookkeeping +pub fn extract_sln(path: &Path) -> FileResult { + let Ok(src) = std::fs::read_to_string(path) else { + return FileResult::error(format!("cannot read {}", path.display())); + }; + + let str_path = path.to_string_lossy().into_owned(); + let file_nid = make_id1(&str_path); + + let mut nodes: Vec = vec![Node { + id: file_nid.clone(), + label: path + .file_name() + .map_or(String::new(), |f| f.to_string_lossy().into_owned()), + file_type: "code".to_string(), + source_file: str_path.clone(), + source_location: None, + metadata: None, + }]; + let mut edges: Vec = Vec::new(); + let mut seen_ids: HashSet = HashSet::new(); + seen_ids.insert(file_nid.clone()); + + let mut guid_to_nid: std::collections::HashMap = + std::collections::HashMap::new(); + + for cap in SLN_PROJECT_RE.captures_iter(&src) { + let proj_name = cap.get(1).map_or("", |m| m.as_str()).to_string(); + let proj_path = cap.get(2).map_or("", |m| m.as_str()).replace('\\', "/"); + let proj_guid = cap + .get(3) + .map_or("", |m| m.as_str()) + .trim_matches(|c| c == '{' || c == '}') + .to_string(); + + let abs_proj = path + .parent() + .map(|p| p.join(&proj_path)) + .and_then(|p| p.canonicalize().ok()) + .map_or_else(|| proj_path.clone(), |p| p.to_string_lossy().into_owned()); + let proj_nid = make_id1(&abs_proj); + if !proj_nid.is_empty() && seen_ids.insert(proj_nid.clone()) { + nodes.push(Node { + id: proj_nid.clone(), + label: proj_name, + file_type: "code".to_string(), + source_file: abs_proj.clone(), + source_location: None, + metadata: None, + }); + edges.push(Edge { + external: false, + source: file_nid.clone(), + target: proj_nid.clone(), + relation: "contains".to_string(), + confidence: "EXTRACTED".to_string(), + source_file: str_path.clone(), + source_location: None, + weight: 1.0, + context: None, + confidence_score: None, + }); + } + if !proj_guid.is_empty() { + 
guid_to_nid.insert(proj_guid.to_lowercase(), proj_nid); + } + } + + // Second pass: project-dependency sections. Each block is nested inside + // a Project(...)/EndProject pair so we track the currently open project's + // GUID and emit `imports` edges to each declared dependency. + let mut in_dep_section = false; + let mut current_proj_guid: Option = None; + for line in src.lines() { + if let Some(cap) = SLN_PROJECT_LINE_RE.captures(line) { + current_proj_guid = cap.get(1).map(|m| m.as_str().to_lowercase()); + continue; + } + if line.trim() == "EndProject" { + current_proj_guid = None; + continue; + } + if line.contains("ProjectSection(ProjectDependencies)") { + in_dep_section = true; + continue; + } + if in_dep_section && line.contains("EndProjectSection") { + in_dep_section = false; + continue; + } + if in_dep_section + && let Some(ref from_guid) = current_proj_guid + && let Some(dep_cap) = SLN_DEP_RE.captures(line) + { + let to_guid = dep_cap.get(1).map_or("", |m| m.as_str()).to_lowercase(); + let from_nid = guid_to_nid.get(from_guid); + let to_nid = guid_to_nid.get(&to_guid); + if let (Some(from), Some(to)) = (from_nid, to_nid) + && from != to + { + edges.push(Edge { + external: false, + source: from.clone(), + target: to.clone(), + relation: "imports".to_string(), + confidence: "EXTRACTED".to_string(), + source_file: str_path.clone(), + source_location: None, + weight: 1.0, + context: None, + confidence_score: None, + }); + } + } + } + + FileResult { + nodes, + edges, + raw_calls: Vec::new(), + error: None, + } +} + +// ── .slnx ───────────────────────────────────────────────────────────────── diff --git a/crates/graphify-extract/src/extractors/dotnet/slnx.rs b/crates/graphify-extract/src/extractors/dotnet/slnx.rs new file mode 100644 index 0000000..1f2ec11 --- /dev/null +++ b/crates/graphify-extract/src/extractors/dotnet/slnx.rs @@ -0,0 +1,215 @@ +//! `.slnx` (XML solution) extractor. 
+ +use super::{CSPROJ_MAX_BYTES, attr_ci, local_name}; +use crate::ids::make_id1; +use crate::types::{Edge, FileResult, Node}; +use quick_xml::events::{BytesStart, Event}; +use quick_xml::reader::Reader; +use std::collections::HashSet; +use std::path::Path; + +/// Shared mutable state threaded through the `.slnx` streaming parse. +struct SlnxCtx<'a> { + path: &'a Path, + str_path: &'a str, + file_nid: &'a str, + nodes: &'a mut Vec, + edges: &'a mut Vec, + seen_ids: &'a mut HashSet, + project_nids: &'a mut HashSet, + /// Candidate `(from_nid, to_nid)` build dependencies, filtered against + /// `project_nids` only after the whole document is parsed (a dependency may + /// reference a project declared later in the file). + dep_candidates: &'a mut Vec<(String, String)>, + /// `from_nid`s of currently open `` elements, so a nested + /// `` attaches to its nearest enclosing project. An empty + /// string marks an open `` without a `Path` (keeps push/pop balanced). + proj_stack: &'a mut Vec, +} + +impl SlnxCtx<'_> { + /// Resolve a project path relative to the solution file, mirroring the + /// `.sln` resolver: canonicalise when the target exists, otherwise fall + /// back to the slash-normalised relative path so ids stay deterministic. + fn resolve(&self, proj_path: &str) -> String { + let norm = proj_path.replace('\\', "/"); + self.path + .parent() + .map(|p| p.join(&norm)) + .and_then(|p| p.canonicalize().ok()) + .map_or(norm, |p| p.to_string_lossy().into_owned()) + } + + /// Handle one `` / `` element. `has_children` is + /// `true` for a `Start` tag (a matching `End` will pop the stack) and + /// `false` for a self-closing `Empty` tag. 
+ fn on_element(&mut self, e: &BytesStart<'_>, has_children: bool) { + match local_name(e).as_str() { + "Project" => { + let path_attr = attr_ci(e, "Path").filter(|s| !s.is_empty()); + let proj_nid = match &path_attr { + Some(proj_path) => { + let abs = self.resolve(proj_path); + let nid = make_id1(&abs); + if !nid.is_empty() { + if self.seen_ids.insert(nid.clone()) { + let label = Path::new(proj_path).file_stem().map_or_else( + || proj_path.clone(), + |s| s.to_string_lossy().into_owned(), + ); + self.nodes.push(Node { + id: nid.clone(), + label, + file_type: "code".to_string(), + source_file: abs.clone(), + source_location: None, + metadata: None, + }); + self.edges.push(Edge { + external: false, + source: self.file_nid.to_string(), + target: nid.clone(), + relation: "contains".to_string(), + confidence: "EXTRACTED".to_string(), + source_file: self.str_path.to_string(), + source_location: None, + weight: 1.0, + context: None, + confidence_score: None, + }); + } + self.project_nids.insert(nid.clone()); + } + nid + } + None => String::new(), + }; + if has_children { + self.proj_stack.push(proj_nid); + } + } + "BuildDependency" => { + if let Some(dep_path) = attr_ci(e, "Project").filter(|s| !s.is_empty()) { + let to_nid = make_id1(&self.resolve(&dep_path)); + if let Some(from) = self.proj_stack.last() + && !from.is_empty() + && !to_nid.is_empty() + && *from != to_nid + { + self.dep_candidates.push((from.clone(), to_nid)); + } + } + } + _ => {} + } + } +} + +/// Extract project nodes and inter-project build-order dependencies from a +/// `.slnx` file — the XML-based replacement for `.sln`. +/// +/// `` elements (anywhere in the tree, including inside +/// ``) become nodes attached to the solution via `contains`; +/// `` children become `imports` edges between +/// known projects. Unlike `.sln` there are no GUIDs — projects are identified +/// by their resolved path. Mirrors `graphify-py` `extract_slnx`. 
+#[must_use] +pub fn extract_slnx(path: &Path) -> FileResult { + let Ok(bytes) = std::fs::read(path) else { + return FileResult::error(format!("cannot read {}", path.display())); + }; + if bytes.len() as u64 > CSPROJ_MAX_BYTES { + return FileResult::error("project file too large"); + } + if !crate::extractors::project_xml_is_safe(&bytes) { + return FileResult::error("refusing XML with DOCTYPE/ENTITY declaration"); + } + + let str_path = path.to_string_lossy().into_owned(); + let file_nid = make_id1(&str_path); + + let mut nodes: Vec = vec![Node { + id: file_nid.clone(), + label: path + .file_name() + .map_or(String::new(), |f| f.to_string_lossy().into_owned()), + file_type: "code".to_string(), + source_file: str_path.clone(), + source_location: None, + metadata: None, + }]; + let mut edges: Vec = Vec::new(); + let mut seen_ids: HashSet = HashSet::new(); + seen_ids.insert(file_nid.clone()); + + let mut project_nids: HashSet = HashSet::new(); + let mut dep_candidates: Vec<(String, String)> = Vec::new(); + let mut proj_stack: Vec = Vec::new(); + + let mut reader = Reader::from_reader(&*bytes); + reader.config_mut().trim_text(true); + let mut buf = Vec::new(); + { + let mut ctx = SlnxCtx { + path, + str_path: &str_path, + file_nid: &file_nid, + nodes: &mut nodes, + edges: &mut edges, + seen_ids: &mut seen_ids, + project_nids: &mut project_nids, + dep_candidates: &mut dep_candidates, + proj_stack: &mut proj_stack, + }; + loop { + match reader.read_event_into(&mut buf) { + Err(e) => return FileResult::error(format!("XML parse error: {e}")), + Ok(Event::Eof) => break, + Ok(Event::Start(ref e)) => ctx.on_element(e, true), + Ok(Event::Empty(ref e)) => ctx.on_element(e, false), + Ok(Event::End(ref e)) => { + // `BytesEnd` is a distinct type from `BytesStart`, so strip + // the namespace prefix inline rather than via `local_name`. 
+ let name = e.name(); + let raw = name.as_ref(); + let local = raw + .iter() + .rposition(|&b| b == b':') + .map_or(raw, |i| &raw[i + 1..]); + if local == b"Project" { + ctx.proj_stack.pop(); + } + } + _ => {} + } + buf.clear(); + } + } + + // Build-order dependencies between known projects. + for (from, to) in dep_candidates { + if project_nids.contains(&to) { + edges.push(Edge { + external: false, + source: from, + target: to, + relation: "imports".to_string(), + confidence: "EXTRACTED".to_string(), + source_file: str_path.clone(), + source_location: None, + weight: 1.0, + context: None, + confidence_score: None, + }); + } + } + + FileResult { + nodes, + edges, + raw_calls: Vec::new(), + error: None, + } +} + +// ── .csproj / .fsproj / .vbproj ───────────────────────────────────────────── diff --git a/crates/graphify-extract/src/extractors/go.rs b/crates/graphify-extract/src/extractors/go.rs deleted file mode 100644 index 4c40ed2..0000000 --- a/crates/graphify-extract/src/extractors/go.rs +++ /dev/null @@ -1,1042 +0,0 @@ -//! Go extractor — custom walk over tree-sitter-go AST. - -use std::collections::{HashMap, HashSet}; -use std::path::Path; - -use crate::ids::{file_stem, make_id, make_id1}; -use crate::types::{Edge, FileResult, Node, RawCall}; - -/// Return the source bytes covered by `node` as a UTF-8 `&str`, or `""` on bad UTF-8. -fn read_text<'a>(node: tree_sitter::Node<'_>, source: &'a [u8]) -> &'a str { - std::str::from_utf8(&source[node.start_byte()..node.end_byte()]).unwrap_or("") -} - -/// Go's predeclared type identifiers — never emitted as semantic type references. 
-const GO_PREDECLARED_TYPES: &[&str] = &[ - "bool", - "byte", - "complex64", - "complex128", - "error", - "float32", - "float64", - "int", - "int8", - "int16", - "int32", - "int64", - "rune", - "string", - "uint", - "uint8", - "uint16", - "uint32", - "uint64", - "uintptr", - "any", - "comparable", -]; - -/// Walk a Go type expression, appending `(name, is_generic_arg)` tuples for each -/// user-defined type referenced. Predeclared types are skipped. Mirrors Python -/// `_go_collect_type_refs`. -fn go_collect_type_refs( - node: tree_sitter::Node<'_>, - source: &[u8], - generic: bool, - out: &mut Vec<(String, bool)>, -) { - match node.kind() { - "type_identifier" => { - let text = read_text(node, source); - if !text.is_empty() && !GO_PREDECLARED_TYPES.contains(&text) { - out.push((text.to_string(), generic)); - } - } - "qualified_type" => { - let full = read_text(node, source); - let text = full.rsplit('.').next().unwrap_or(full); - if !text.is_empty() && !GO_PREDECLARED_TYPES.contains(&text) { - out.push((text.to_string(), generic)); - } - } - "generic_type" => { - if let Some(type_field) = node.child_by_field_name("type") { - go_collect_type_refs(type_field, source, generic, out); - } - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - if cur.node().kind() == "type_arguments" { - let mut acur = cur.node().walk(); - if acur.goto_first_child() { - loop { - if acur.node().is_named() { - go_collect_type_refs(acur.node(), source, true, out); - } - if !acur.goto_next_sibling() { - break; - } - } - } - } - if !cur.goto_next_sibling() { - break; - } - } - } - } - "pointer_type" | "slice_type" | "array_type" | "map_type" | "channel_type" - | "parenthesized_type" => { - recurse_named_children(node, source, generic, out); - } - _ if node.is_named() => recurse_named_children(node, source, generic, out), - _ => {} - } -} - -/// Recurse `go_collect_type_refs` over every named child of `node`. 
-fn recurse_named_children( - node: tree_sitter::Node<'_>, - source: &[u8], - generic: bool, - out: &mut Vec<(String, bool)>, -) { - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - if cur.node().is_named() { - go_collect_type_refs(cur.node(), source, generic, out); - } - if !cur.goto_next_sibling() { - break; - } - } - } -} - -/// Mutable graph state for the Go semantic-reference passes. Constructed by -/// reborrowing the structural-walk locals at each call site so these passes -/// never need to thread the full [`GoWalkCtx`]. -struct GoRefCtx<'a> { - source: &'a [u8], - pkg_scope: &'a str, - str_path: &'a str, - nodes: &'a mut Vec, - edges: &'a mut Vec, - seen_ids: &'a mut HashSet, -} - -impl GoRefCtx<'_> { - /// Return the NID for a named type, creating a bare placeholder node when no - /// package-qualified node already exists. Mirrors Go's `ensure_named_node`. - fn ensure_named_node(&mut self, name: &str, line: usize) -> String { - let nid1 = make_id(&[self.pkg_scope, name]); - if self.seen_ids.contains(&nid1) { - return nid1; - } - let nid2 = make_id1(name); - if self.seen_ids.insert(nid2.clone()) { - self.nodes.push(Node { - id: nid2.clone(), - label: name.to_string(), - file_type: "code".to_string(), - source_file: self.str_path.to_string(), - source_location: Some(format!("L{line}")), - metadata: None, - }); - } - nid2 - } - - /// Push a `references` edge from `src` to `tgt` with the given context. - fn push_ref(&mut self, src: &str, tgt: &str, context: &str, line: usize) { - self.edges.push(Edge { - external: false, - source: src.to_string(), - target: tgt.to_string(), - relation: "references".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: self.str_path.to_string(), - source_location: Some(format!("L{line}")), - weight: 1.0, - context: Some(context.to_string()), - confidence_score: None, - }); - } - - /// Push a plain `embeds` edge from `src` to `tgt` (Go struct/interface embedding). 
- fn push_embeds(&mut self, src: &str, tgt: &str, line: usize) { - self.edges.push(Edge { - external: false, - source: src.to_string(), - target: tgt.to_string(), - relation: "embeds".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: self.str_path.to_string(), - source_location: Some(format!("L{line}")), - weight: 1.0, - context: None, - confidence_score: None, - }); - } -} - -/// Emit `references` edges for a function/method's parameter and result types. -/// -/// Mirrors Python `emit_go_method_refs`: direct param types use the -/// `parameter_type` context, result types use `return_type`, and any generic -/// arguments use `generic_arg`. -fn emit_go_method_refs( - rc: &mut GoRefCtx<'_>, - func_node: tree_sitter::Node<'_>, - func_nid: &str, - line: usize, -) { - if let Some(params) = func_node.child_by_field_name("parameters") { - let mut cur = params.walk(); - if cur.goto_first_child() { - loop { - let p = cur.node(); - if p.kind() == "parameter_declaration" - && let Some(type_node) = p.child_by_field_name("type") - { - let mut refs: Vec<(String, bool)> = Vec::new(); - go_collect_type_refs(type_node, rc.source, false, &mut refs); - for (ref_name, is_generic) in refs { - let ctx = if is_generic { - "generic_arg" - } else { - "parameter_type" - }; - let tgt = rc.ensure_named_node(&ref_name, line); - if tgt != func_nid { - rc.push_ref(func_nid, &tgt, ctx, line); - } - } - } - if !cur.goto_next_sibling() { - break; - } - } - } - } - let Some(result) = func_node.child_by_field_name("result") else { - return; - }; - if result.kind() == "parameter_list" { - let mut cur = result.walk(); - if cur.goto_first_child() { - loop { - let p = cur.node(); - if p.kind() == "parameter_declaration" { - let type_node = p.child_by_field_name("type").or_else(|| { - let mut c = p.walk(); - if c.goto_first_child() { - loop { - if c.node().is_named() { - return Some(c.node()); - } - if !c.goto_next_sibling() { - break; - } - } - } - None - }); - if let Some(tn) = 
type_node { - emit_go_result_refs(rc, tn, func_nid, line); - } - } - if !cur.goto_next_sibling() { - break; - } - } - } - } else { - emit_go_result_refs(rc, result, func_nid, line); - } -} - -/// Emit `return_type` / `generic_arg` references from a single result type node. -fn emit_go_result_refs( - rc: &mut GoRefCtx<'_>, - type_node: tree_sitter::Node<'_>, - func_nid: &str, - line: usize, -) { - let mut refs: Vec<(String, bool)> = Vec::new(); - go_collect_type_refs(type_node, rc.source, false, &mut refs); - for (ref_name, is_generic) in refs { - let ctx = if is_generic { - "generic_arg" - } else { - "return_type" - }; - let tgt = rc.ensure_named_node(&ref_name, line); - if tgt != func_nid { - rc.push_ref(func_nid, &tgt, ctx, line); - } - } -} - -/// Emit `embeds` / `references[field]` edges for a `type_spec`'s struct fields, -/// and `embeds` / `references[generic_arg]` edges for interface embedding. -/// -/// A struct field with no name and a direct (non-generic) type is an embedded -/// field → `embeds`; named fields and generic args become `references`. Mirrors -/// the struct/interface body handling added to Python `extract_go`. 
-fn emit_go_type_body_refs(rc: &mut GoRefCtx<'_>, type_spec: tree_sitter::Node<'_>, type_nid: &str) { - let mut type_body: Option> = None; - let mut cur = type_spec.walk(); - if cur.goto_first_child() { - loop { - if matches!(cur.node().kind(), "struct_type" | "interface_type") { - type_body = Some(cur.node()); - break; - } - if !cur.goto_next_sibling() { - break; - } - } - } - let Some(type_body) = type_body else { - return; - }; - - if type_body.kind() == "struct_type" { - let mut fdl_cur = type_body.walk(); - if !fdl_cur.goto_first_child() { - return; - } - loop { - if fdl_cur.node().kind() == "field_declaration_list" { - let mut fcur = fdl_cur.node().walk(); - if fcur.goto_first_child() { - loop { - if fcur.node().kind() == "field_declaration" { - emit_go_struct_field_refs(rc, fcur.node(), type_nid); - } - if !fcur.goto_next_sibling() { - break; - } - } - } - } - if !fdl_cur.goto_next_sibling() { - break; - } - } - } else { - // interface_type — embedded interfaces appear as `type_elem`. - let mut ecur = type_body.walk(); - if !ecur.goto_first_child() { - return; - } - loop { - if ecur.node().kind() == "type_elem" { - let line = ecur.node().start_position().row + 1; - let mut refs: Vec<(String, bool)> = Vec::new(); - let mut scur = ecur.node().walk(); - if scur.goto_first_child() { - loop { - if scur.node().is_named() { - go_collect_type_refs(scur.node(), rc.source, false, &mut refs); - } - if !scur.goto_next_sibling() { - break; - } - } - } - for (ref_name, is_generic) in refs { - let tgt = rc.ensure_named_node(&ref_name, line); - if tgt == type_nid { - continue; - } - if is_generic { - rc.push_ref(type_nid, &tgt, "generic_arg", line); - } else { - rc.push_embeds(type_nid, &tgt, line); - } - } - } - if !ecur.goto_next_sibling() { - break; - } - } - } -} - -/// Emit edges for a single Go struct `field_declaration`. 
-fn emit_go_struct_field_refs(rc: &mut GoRefCtx<'_>, field: tree_sitter::Node<'_>, type_nid: &str) { - let line = field.start_position().row + 1; - let mut has_name = false; - let mut fallback_type: Option> = None; - let mut fcur = field.walk(); - if fcur.goto_first_child() { - loop { - let fc = fcur.node(); - if fc.kind() == "field_identifier" { - has_name = true; - } else if fallback_type.is_none() && fc.is_named() { - fallback_type = Some(fc); - } - if !fcur.goto_next_sibling() { - break; - } - } - } - let Some(type_node) = field.child_by_field_name("type").or(fallback_type) else { - return; - }; - let mut refs: Vec<(String, bool)> = Vec::new(); - go_collect_type_refs(type_node, rc.source, false, &mut refs); - for (ref_name, is_generic) in refs { - let tgt = rc.ensure_named_node(&ref_name, line); - if tgt == type_nid { - continue; - } - if !has_name && !is_generic { - rc.push_embeds(type_nid, &tgt, line); - } else { - let ctx = if is_generic { "generic_arg" } else { "field" }; - rc.push_ref(type_nid, &tgt, ctx, line); - } - } -} - -/// Extract functions, methods, type declarations, and imports from a `.go` file. 
-#[must_use] -pub fn extract_go(path: &Path) -> FileResult { - let Some((source, tree)) = parse_go_source(path) else { - return FileResult { - nodes: vec![], - edges: vec![], - raw_calls: vec![], - error: Some("parse failed".to_string()), - }; - }; - - let stem = file_stem(path); - let pkg_scope = derive_pkg_scope(path, &stem); - let str_path = path.to_string_lossy().into_owned(); - let file_nid = make_id1(&str_path); - - let mut nodes: Vec = vec![Node { - id: file_nid.clone(), - label: path - .file_name() - .map_or(String::new(), |f| f.to_string_lossy().into_owned()), - file_type: "code".to_string(), - source_file: str_path.clone(), - source_location: Some("L1".to_string()), - metadata: None, - }]; - let mut edges: Vec = Vec::new(); - let mut seen_ids: HashSet = HashSet::from([file_nid.clone()]); - let mut function_bodies: Vec<(String, usize, usize)> = Vec::new(); - let mut go_imported_pkgs: HashSet = HashSet::new(); - - { - let mut walk_ctx = GoWalkCtx { - str_path: &str_path, - stem: &stem, - pkg_scope: &pkg_scope, - file_nid: &file_nid, - nodes: &mut nodes, - edges: &mut edges, - seen_ids: &mut seen_ids, - function_bodies: &mut function_bodies, - go_imported_pkgs: &mut go_imported_pkgs, - }; - walk_go(&mut walk_ctx, tree.root_node(), &source); - } - - let label_to_nid = build_go_label_map(&nodes); - let raw_calls = resolve_go_function_calls(GoResolveArgs { - tree: &tree, - source: &source, - str_path: &str_path, - function_bodies: &function_bodies, - label_to_nid: &label_to_nid, - go_imported_pkgs: &go_imported_pkgs, - edges: &mut edges, - }); - crate::forward_refs::reconcile_forward_refs(&mut nodes, &mut edges); - // Validate dangling edges against the reconciled graph: reconcile may have - // folded placeholder nodes away, so rebuild the valid-id set from the - // surviving nodes rather than trusting the now-stale `seen_ids`. 
- let valid_ids: HashSet = nodes.iter().map(|n| n.id.clone()).collect(); - let clean_edges = filter_dangling_edges(edges, &valid_ids); - - FileResult { - nodes, - edges: clean_edges, - raw_calls, - error: None, - } -} - -/// Read the file and parse with tree-sitter-go. `None` on any I/O or parse error. -fn parse_go_source(path: &Path) -> Option<(Vec, tree_sitter::Tree)> { - let source = std::fs::read(path).ok()?; - let mut parser = tree_sitter::Parser::new(); - parser.set_language(&tree_sitter_go::LANGUAGE.into()).ok()?; - let tree = parser.parse(&source, None)?; - Some((source, tree)) -} - -/// Use the directory name as package scope so methods on the same type share a -/// canonical type node across files in the same package. -fn derive_pkg_scope(path: &Path, fallback_stem: &str) -> String { - path.parent() - .and_then(|p| p.file_name()) - .and_then(|n| n.to_str()) - .filter(|s| !s.is_empty()) - .unwrap_or(fallback_stem) - .to_string() -} - -/// Build a `normalised_label → nid` map for intra-file call resolution. -fn build_go_label_map(nodes: &[Node]) -> HashMap { - let mut label_to_nid: HashMap = HashMap::new(); - for n in nodes { - let normalised = n.label.trim_end_matches("()").trim_start_matches('.'); - label_to_nid.insert(normalised.to_lowercase(), n.id.clone()); - } - label_to_nid -} - -/// Bundle of shared inputs for [`resolve_go_function_calls`]. -struct GoResolveArgs<'a> { - tree: &'a tree_sitter::Tree, - source: &'a [u8], - str_path: &'a str, - function_bodies: &'a [(String, usize, usize)], - label_to_nid: &'a HashMap, - go_imported_pkgs: &'a HashSet, - edges: &'a mut Vec, -} - -/// Walk each function body to emit call edges and `RawCall` entries. 
-fn resolve_go_function_calls(args: GoResolveArgs<'_>) -> Vec { - let GoResolveArgs { - tree, - source, - str_path, - function_bodies, - label_to_nid, - go_imported_pkgs, - edges, - } = args; - let mut seen_call_pairs: HashSet<(String, String)> = HashSet::new(); - let mut raw_calls: Vec = Vec::new(); - { - let mut call_ctx = GoCallCtx { - str_path, - label_to_nid, - go_imported_pkgs, - edges, - seen_call_pairs: &mut seen_call_pairs, - raw_calls: &mut raw_calls, - }; - for (caller_nid, body_start, body_end) in function_bodies { - walk_calls_go( - &mut call_ctx, - tree.root_node(), - source, - caller_nid, - *body_start, - *body_end, - ); - } - } - raw_calls -} - -/// Drop edges whose endpoints aren't in `valid_ids` (except for `imports` edges). -fn filter_dangling_edges(edges: Vec, valid_ids: &HashSet) -> Vec { - edges - .into_iter() - .filter(|e| { - valid_ids.contains(&e.source) - && (valid_ids.contains(&e.target) - || matches!(e.relation.as_str(), "imports" | "imports_from")) - }) - .collect() -} - -/// Recursively walk a Go AST emitting nodes and edges for functions, methods, and type declarations. -/// -/// Handles `function_declaration`, `method_declaration`, `type_declaration`, and `import_declaration` -/// nodes. Descends into all child nodes. Mirrors Python `_walk_go`. -/// Shared state threaded through every [`walk_go`] recursion. 
-struct GoWalkCtx<'a> { - str_path: &'a str, - stem: &'a str, - pkg_scope: &'a str, - file_nid: &'a str, - nodes: &'a mut Vec, - edges: &'a mut Vec, - seen_ids: &'a mut HashSet, - function_bodies: &'a mut Vec<(String, usize, usize)>, - go_imported_pkgs: &'a mut HashSet, -} - -#[allow(clippy::too_many_lines)] // linear dispatch over Go's AST node kinds -fn walk_go(ctx: &mut GoWalkCtx<'_>, node: tree_sitter::Node<'_>, source: &[u8]) { - let str_path = ctx.str_path; - let stem = ctx.stem; - let pkg_scope = ctx.pkg_scope; - let file_nid = ctx.file_nid; - let nodes = &mut *ctx.nodes; - let edges = &mut *ctx.edges; - let seen_ids = &mut *ctx.seen_ids; - let function_bodies = &mut *ctx.function_bodies; - let go_imported_pkgs = &mut *ctx.go_imported_pkgs; - let t = node.kind(); - - match t { - "function_declaration" => { - if let Some(name_node) = node.child_by_field_name("name") { - let func_name = read_text(name_node, source); - let line = node.start_position().row + 1; - let func_nid = make_id(&[stem, func_name]); - if seen_ids.insert(func_nid.clone()) { - nodes.push(Node { - id: func_nid.clone(), - label: format!("{func_name}()"), - file_type: "code".to_string(), - source_file: str_path.to_string(), - source_location: Some(format!("L{line}")), - metadata: None, - }); - } - edges.push(Edge { - external: false, - source: file_nid.to_string(), - target: func_nid.clone(), - relation: "contains".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: str_path.to_string(), - source_location: Some(format!("L{line}")), - weight: 1.0, - context: None, - confidence_score: None, - }); - let mut rc = GoRefCtx { - source, - pkg_scope, - str_path, - nodes: &mut *nodes, - edges: &mut *edges, - seen_ids: &mut *seen_ids, - }; - emit_go_method_refs(&mut rc, node, &func_nid, line); - if let Some(body) = node.child_by_field_name("body") { - function_bodies.push((func_nid, body.start_byte(), body.end_byte())); - } - } - } - "method_declaration" => { - let receiver = 
node.child_by_field_name("receiver"); - let mut receiver_type: Option = None; - if let Some(recv) = receiver { - let mut cur = recv.walk(); - if cur.goto_first_child() { - loop { - let param = cur.node(); - if param.kind() == "parameter_declaration" { - if let Some(type_node) = param.child_by_field_name("type") { - let raw = read_text(type_node, source) - .trim_start_matches('*') - .trim() - .to_string(); - receiver_type = Some(raw); - } - break; - } - if !cur.goto_next_sibling() { - break; - } - } - } - } - if let Some(name_node) = node.child_by_field_name("name") { - let method_name = read_text(name_node, source); - let line = node.start_position().row + 1; - let method_nid = if let Some(ref rt) = receiver_type { - let parent_nid = make_id(&[pkg_scope, rt]); - if seen_ids.insert(parent_nid.clone()) { - nodes.push(Node { - id: parent_nid.clone(), - label: rt.clone(), - file_type: "code".to_string(), - source_file: str_path.to_string(), - source_location: Some(format!("L{line}")), - metadata: None, - }); - } - let mnid = make_id(&[&parent_nid, method_name]); - if seen_ids.insert(mnid.clone()) { - nodes.push(Node { - id: mnid.clone(), - label: format!(".{method_name}()"), - file_type: "code".to_string(), - source_file: str_path.to_string(), - source_location: Some(format!("L{line}")), - metadata: None, - }); - } - edges.push(Edge { - external: false, - source: parent_nid, - target: mnid.clone(), - relation: "method".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: str_path.to_string(), - source_location: Some(format!("L{line}")), - weight: 1.0, - context: None, - confidence_score: None, - }); - mnid - } else { - let mnid = make_id(&[stem, method_name]); - if seen_ids.insert(mnid.clone()) { - nodes.push(Node { - id: mnid.clone(), - label: format!("{method_name}()"), - file_type: "code".to_string(), - source_file: str_path.to_string(), - source_location: Some(format!("L{line}")), - metadata: None, - }); - } - edges.push(Edge { - external: false, - 
source: file_nid.to_string(), - target: mnid.clone(), - relation: "contains".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: str_path.to_string(), - source_location: Some(format!("L{line}")), - weight: 1.0, - context: None, - confidence_score: None, - }); - mnid - }; - let mut rc = GoRefCtx { - source, - pkg_scope, - str_path, - nodes: &mut *nodes, - edges: &mut *edges, - seen_ids: &mut *seen_ids, - }; - emit_go_method_refs(&mut rc, node, &method_nid, line); - if let Some(body) = node.child_by_field_name("body") { - function_bodies.push((method_nid, body.start_byte(), body.end_byte())); - } - } - } - "type_declaration" => { - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - let child = cur.node(); - if child.kind() == "type_spec" - && let Some(name_node) = child.child_by_field_name("name") - { - let type_name = read_text(name_node, source); - let line = child.start_position().row + 1; - let type_nid = make_id(&[pkg_scope, type_name]); - if seen_ids.insert(type_nid.clone()) { - nodes.push(Node { - id: type_nid.clone(), - label: type_name.to_string(), - file_type: "code".to_string(), - source_file: str_path.to_string(), - source_location: Some(format!("L{line}")), - metadata: None, - }); - } - edges.push(Edge { - external: false, - source: file_nid.to_string(), - target: type_nid.clone(), - relation: "contains".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: str_path.to_string(), - source_location: Some(format!("L{line}")), - weight: 1.0, - context: None, - confidence_score: None, - }); - // Struct field embeds/references and interface embedding. 
- let mut rc = GoRefCtx { - source, - pkg_scope, - str_path, - nodes: &mut *nodes, - edges: &mut *edges, - seen_ids: &mut *seen_ids, - }; - emit_go_type_body_refs(&mut rc, child, &type_nid); - } - if !cur.goto_next_sibling() { - break; - } - } - } - } - "import_declaration" => { - walk_go_imports( - node, - source, - str_path, - file_nid, - edges, - seen_ids, - go_imported_pkgs, - ); - } - _ => { - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - walk_go(ctx, cur.node(), source); - if !cur.goto_next_sibling() { - break; - } - } - } - } - } -} - -/// Walk an `import_declaration` subtree, delegating each `import_spec` to `emit_go_import_spec`. -fn walk_go_imports( - node: tree_sitter::Node<'_>, - source: &[u8], - str_path: &str, - file_nid: &str, - edges: &mut Vec, - seen_ids: &mut HashSet, - go_imported_pkgs: &mut HashSet, -) { - let mut cur = node.walk(); - if !cur.goto_first_child() { - return; - } - loop { - let child = cur.node(); - match child.kind() { - "import_spec_list" => { - let mut c2 = child.walk(); - if c2.goto_first_child() { - loop { - let spec = c2.node(); - if spec.kind() == "import_spec" { - emit_go_import_spec( - spec, - source, - str_path, - file_nid, - edges, - seen_ids, - go_imported_pkgs, - ); - } - if !c2.goto_next_sibling() { - break; - } - } - } - } - "import_spec" => { - emit_go_import_spec( - child, - source, - str_path, - file_nid, - edges, - seen_ids, - go_imported_pkgs, - ); - } - _ => {} - } - if !cur.goto_next_sibling() { - break; - } - } -} - -/// Emit a single `imports_from` edge for one Go `import_spec` node. -/// -/// The target NID is derived from the import path string (e.g. `"fmt"` → `go::pkg::fmt`). -/// The package name is also recorded in `go_imported_pkgs` for use during call resolution. 
-fn emit_go_import_spec( - spec: tree_sitter::Node<'_>, - source: &[u8], - str_path: &str, - file_nid: &str, - edges: &mut Vec, - _seen_ids: &mut HashSet, - go_imported_pkgs: &mut HashSet, -) { - if let Some(path_node) = spec.child_by_field_name("path") { - let raw = read_text(path_node, source).trim_matches('"'); - let tgt_nid = make_id(&["go", "pkg", raw]); - let line = spec.start_position().row + 1; - edges.push(Edge { - external: false, - source: file_nid.to_string(), - target: tgt_nid, - relation: "imports_from".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: str_path.to_string(), - source_location: Some(format!("L{line}")), - weight: 1.0, - context: Some("import".to_string()), - confidence_score: None, - }); - // Track local name (alias or last path segment) - let alias = spec.child_by_field_name("name"); - let local_name = if let Some(a) = alias { - read_text(a, source).to_string() - } else { - raw.split('/').next_back().unwrap_or("").to_string() - }; - if !local_name.is_empty() && local_name != "_" && local_name != "." { - go_imported_pkgs.insert(local_name); - } - } -} - -/// Collect `calls` edges within a Go function or method body. -/// -/// Recurses through the body AST, emitting `calls` edges for `call_expression` nodes whose -/// callee matches a known function NID in this file. Selector expressions (package.Func) are -/// resolved against `go_imported_pkgs`. Mirrors Python `_walk_calls_go`. -/// Shared state threaded through every [`walk_calls_go`] recursion. 
-struct GoCallCtx<'a> { - str_path: &'a str, - label_to_nid: &'a HashMap, - go_imported_pkgs: &'a HashSet, - edges: &'a mut Vec, - seen_call_pairs: &'a mut HashSet<(String, String)>, - raw_calls: &'a mut Vec, -} - -#[allow(clippy::too_many_lines)] // linear dispatch over Go's call-site AST shapes -fn walk_calls_go( - ctx: &mut GoCallCtx<'_>, - node: tree_sitter::Node<'_>, - source: &[u8], - caller_nid: &str, - body_start: usize, - body_end: usize, -) { - let str_path = ctx.str_path; - let label_to_nid = ctx.label_to_nid; - let go_imported_pkgs = ctx.go_imported_pkgs; - let edges = &mut *ctx.edges; - let seen_call_pairs = &mut *ctx.seen_call_pairs; - let raw_calls = &mut *ctx.raw_calls; - // Only visit nodes within the body range - if node.start_byte() >= body_end || node.end_byte() <= body_start { - return; - } - - match node.kind() { - "function_declaration" | "method_declaration" => { - // Don't recurse into nested functions - } - "call_expression" => { - if let Some(func_node) = node.child_by_field_name("function") { - let mut callee_name: Option = None; - let mut is_member_call = false; - match func_node.kind() { - "identifier" => { - callee_name = Some(read_text(func_node, source).to_string()); - } - "selector_expression" => { - let field = func_node.child_by_field_name("field"); - let operand = func_node.child_by_field_name("operand"); - let receiver_name = operand - .map(|n| read_text(n, source).to_string()) - .unwrap_or_default(); - // Package-qualified call: fmt.Println → not a member call - is_member_call = !go_imported_pkgs.contains(&receiver_name); - if let Some(f) = field { - callee_name = Some(read_text(f, source).to_string()); - } - } - _ => {} - } - // Built-in suppression applies only to unqualified identifier - // calls; a selector call (`obj.len()`, `pkg.len()`) names a method - // or package function, not the language built-in, so it must not - // be filtered. 
- let is_unqualified = func_node.kind() == "identifier"; - if let Some(cn) = callee_name { - let tgt_nid = label_to_nid.get(&cn.to_lowercase()).cloned(); - if let Some(tgt) = tgt_nid { - if tgt != caller_nid { - let pair = (caller_nid.to_string(), tgt.clone()); - if seen_call_pairs.insert(pair) { - let line = node.start_position().row + 1; - edges.push(Edge { - external: false, - source: caller_nid.to_string(), - target: tgt, - relation: "calls".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: str_path.to_string(), - source_location: Some(format!("L{line}")), - weight: 1.0, - context: Some("call".to_string()), - confidence_score: None, - }); - } - } - } else if !(is_unqualified && crate::builtins::is_language_builtin_global(&cn)) - { - raw_calls.push(RawCall { - caller_nid: caller_nid.to_string(), - callee: cn, - is_member_call, - source_file: str_path.to_string(), - source_location: format!("L{}", node.start_position().row + 1), - receiver: None, - }); - } - } - } - // Recurse into arguments - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - walk_calls_go(ctx, cur.node(), source, caller_nid, body_start, body_end); - if !cur.goto_next_sibling() { - break; - } - } - } - } - _ => { - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - walk_calls_go(ctx, cur.node(), source, caller_nid, body_start, body_end); - if !cur.goto_next_sibling() { - break; - } - } - } - } - } -} diff --git a/crates/graphify-extract/src/extractors/go/calls.rs b/crates/graphify-extract/src/extractors/go/calls.rs new file mode 100644 index 0000000..eb2abb7 --- /dev/null +++ b/crates/graphify-extract/src/extractors/go/calls.rs @@ -0,0 +1,130 @@ +//! Go call-graph pass. + +use super::read_text; +use crate::types::{Edge, RawCall}; +use std::collections::{HashMap, HashSet}; + +/// Collect `calls` edges within a Go function or method body. 
+/// +/// Recurses through the body AST, emitting `calls` edges for `call_expression` nodes whose +/// callee matches a known function NID in this file. Selector expressions (package.Func) are +/// resolved against `go_imported_pkgs`. Mirrors Python `_walk_calls_go`. +/// Shared state threaded through every [`walk_calls_go`] recursion. +pub(super) struct GoCallCtx<'a> { + pub(super) str_path: &'a str, + pub(super) label_to_nid: &'a HashMap, + pub(super) go_imported_pkgs: &'a HashSet, + pub(super) edges: &'a mut Vec, + pub(super) seen_call_pairs: &'a mut HashSet<(String, String)>, + pub(super) raw_calls: &'a mut Vec, +} + +#[allow(clippy::too_many_lines)] // linear dispatch over Go's call-site AST shapes +pub(super) fn walk_calls_go( + ctx: &mut GoCallCtx<'_>, + node: tree_sitter::Node<'_>, + source: &[u8], + caller_nid: &str, + body_start: usize, + body_end: usize, +) { + let str_path = ctx.str_path; + let label_to_nid = ctx.label_to_nid; + let go_imported_pkgs = ctx.go_imported_pkgs; + let edges = &mut *ctx.edges; + let seen_call_pairs = &mut *ctx.seen_call_pairs; + let raw_calls = &mut *ctx.raw_calls; + // Only visit nodes within the body range + if node.start_byte() >= body_end || node.end_byte() <= body_start { + return; + } + + match node.kind() { + "function_declaration" | "method_declaration" => { + // Don't recurse into nested functions + } + "call_expression" => { + if let Some(func_node) = node.child_by_field_name("function") { + let mut callee_name: Option = None; + let mut is_member_call = false; + match func_node.kind() { + "identifier" => { + callee_name = Some(read_text(func_node, source).to_string()); + } + "selector_expression" => { + let field = func_node.child_by_field_name("field"); + let operand = func_node.child_by_field_name("operand"); + let receiver_name = operand + .map(|n| read_text(n, source).to_string()) + .unwrap_or_default(); + // Package-qualified call: fmt.Println → not a member call + is_member_call = 
!go_imported_pkgs.contains(&receiver_name); + if let Some(f) = field { + callee_name = Some(read_text(f, source).to_string()); + } + } + _ => {} + } + // Built-in suppression applies only to unqualified identifier + // calls; a selector call (`obj.len()`, `pkg.len()`) names a method + // or package function, not the language built-in, so it must not + // be filtered. + let is_unqualified = func_node.kind() == "identifier"; + if let Some(cn) = callee_name { + let tgt_nid = label_to_nid.get(&cn.to_lowercase()).cloned(); + if let Some(tgt) = tgt_nid { + if tgt != caller_nid { + let pair = (caller_nid.to_string(), tgt.clone()); + if seen_call_pairs.insert(pair) { + let line = node.start_position().row + 1; + edges.push(Edge { + external: false, + source: caller_nid.to_string(), + target: tgt, + relation: "calls".to_string(), + confidence: "EXTRACTED".to_string(), + source_file: str_path.to_string(), + source_location: Some(format!("L{line}")), + weight: 1.0, + context: Some("call".to_string()), + confidence_score: None, + }); + } + } + } else if !(is_unqualified && crate::builtins::is_language_builtin_global(&cn)) + { + raw_calls.push(RawCall { + caller_nid: caller_nid.to_string(), + callee: cn, + is_member_call, + source_file: str_path.to_string(), + source_location: format!("L{}", node.start_position().row + 1), + receiver: None, + }); + } + } + } + // Recurse into arguments + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + walk_calls_go(ctx, cur.node(), source, caller_nid, body_start, body_end); + if !cur.goto_next_sibling() { + break; + } + } + } + } + _ => { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + walk_calls_go(ctx, cur.node(), source, caller_nid, body_start, body_end); + if !cur.goto_next_sibling() { + break; + } + } + } + } + } +} diff --git a/crates/graphify-extract/src/extractors/go/mod.rs b/crates/graphify-extract/src/extractors/go/mod.rs new file mode 100644 index 0000000..95616a5 --- /dev/null +++ 
b/crates/graphify-extract/src/extractors/go/mod.rs @@ -0,0 +1,178 @@ +//! Go extractor — custom walk over tree-sitter-go AST. + +mod calls; +mod refs; +mod walk; + +use crate::ids::{file_stem, make_id1}; +use crate::types::{Edge, FileResult, Node, RawCall}; +use calls::{GoCallCtx, walk_calls_go}; +use std::collections::{HashMap, HashSet}; +use std::path::Path; +use walk::{GoWalkCtx, walk_go}; + +/// Return the source bytes covered by `node` as a UTF-8 `&str`, or `""` on bad UTF-8. +fn read_text<'a>(node: tree_sitter::Node<'_>, source: &'a [u8]) -> &'a str { + std::str::from_utf8(&source[node.start_byte()..node.end_byte()]).unwrap_or("") +} + +/// Extract functions, methods, type declarations, and imports from a `.go` file. +#[must_use] +pub fn extract_go(path: &Path) -> FileResult { + let Some((source, tree)) = parse_go_source(path) else { + return FileResult { + nodes: vec![], + edges: vec![], + raw_calls: vec![], + error: Some("parse failed".to_string()), + }; + }; + + let stem = file_stem(path); + let pkg_scope = derive_pkg_scope(path, &stem); + let str_path = path.to_string_lossy().into_owned(); + let file_nid = make_id1(&str_path); + + let mut nodes: Vec = vec![Node { + id: file_nid.clone(), + label: path + .file_name() + .map_or(String::new(), |f| f.to_string_lossy().into_owned()), + file_type: "code".to_string(), + source_file: str_path.clone(), + source_location: Some("L1".to_string()), + metadata: None, + }]; + let mut edges: Vec = Vec::new(); + let mut seen_ids: HashSet = HashSet::from([file_nid.clone()]); + let mut function_bodies: Vec<(String, usize, usize)> = Vec::new(); + let mut go_imported_pkgs: HashSet = HashSet::new(); + + { + let mut walk_ctx = GoWalkCtx { + str_path: &str_path, + stem: &stem, + pkg_scope: &pkg_scope, + file_nid: &file_nid, + nodes: &mut nodes, + edges: &mut edges, + seen_ids: &mut seen_ids, + function_bodies: &mut function_bodies, + go_imported_pkgs: &mut go_imported_pkgs, + }; + walk_go(&mut walk_ctx, tree.root_node(), 
&source); + } + + let label_to_nid = build_go_label_map(&nodes); + let raw_calls = resolve_go_function_calls(GoResolveArgs { + tree: &tree, + source: &source, + str_path: &str_path, + function_bodies: &function_bodies, + label_to_nid: &label_to_nid, + go_imported_pkgs: &go_imported_pkgs, + edges: &mut edges, + }); + crate::forward_refs::reconcile_forward_refs(&mut nodes, &mut edges); + // Validate dangling edges against the reconciled graph: reconcile may have + // folded placeholder nodes away, so rebuild the valid-id set from the + // surviving nodes rather than trusting the now-stale `seen_ids`. + let valid_ids: HashSet = nodes.iter().map(|n| n.id.clone()).collect(); + let clean_edges = filter_dangling_edges(edges, &valid_ids); + + FileResult { + nodes, + edges: clean_edges, + raw_calls, + error: None, + } +} + +/// Read the file and parse with tree-sitter-go. `None` on any I/O or parse error. +fn parse_go_source(path: &Path) -> Option<(Vec, tree_sitter::Tree)> { + let source = std::fs::read(path).ok()?; + let mut parser = tree_sitter::Parser::new(); + parser.set_language(&tree_sitter_go::LANGUAGE.into()).ok()?; + let tree = parser.parse(&source, None)?; + Some((source, tree)) +} + +/// Use the directory name as package scope so methods on the same type share a +/// canonical type node across files in the same package. +fn derive_pkg_scope(path: &Path, fallback_stem: &str) -> String { + path.parent() + .and_then(|p| p.file_name()) + .and_then(|n| n.to_str()) + .filter(|s| !s.is_empty()) + .unwrap_or(fallback_stem) + .to_string() +} + +/// Build a `normalised_label → nid` map for intra-file call resolution. +fn build_go_label_map(nodes: &[Node]) -> HashMap { + let mut label_to_nid: HashMap = HashMap::new(); + for n in nodes { + let normalised = n.label.trim_end_matches("()").trim_start_matches('.'); + label_to_nid.insert(normalised.to_lowercase(), n.id.clone()); + } + label_to_nid +} + +/// Bundle of shared inputs for [`resolve_go_function_calls`]. 
+struct GoResolveArgs<'a> { + tree: &'a tree_sitter::Tree, + source: &'a [u8], + str_path: &'a str, + function_bodies: &'a [(String, usize, usize)], + label_to_nid: &'a HashMap, + go_imported_pkgs: &'a HashSet, + edges: &'a mut Vec, +} + +/// Walk each function body to emit call edges and `RawCall` entries. +fn resolve_go_function_calls(args: GoResolveArgs<'_>) -> Vec { + let GoResolveArgs { + tree, + source, + str_path, + function_bodies, + label_to_nid, + go_imported_pkgs, + edges, + } = args; + let mut seen_call_pairs: HashSet<(String, String)> = HashSet::new(); + let mut raw_calls: Vec = Vec::new(); + { + let mut call_ctx = GoCallCtx { + str_path, + label_to_nid, + go_imported_pkgs, + edges, + seen_call_pairs: &mut seen_call_pairs, + raw_calls: &mut raw_calls, + }; + for (caller_nid, body_start, body_end) in function_bodies { + walk_calls_go( + &mut call_ctx, + tree.root_node(), + source, + caller_nid, + *body_start, + *body_end, + ); + } + } + raw_calls +} + +/// Drop edges whose endpoints aren't in `valid_ids` (except for `imports` edges). +fn filter_dangling_edges(edges: Vec, valid_ids: &HashSet) -> Vec { + edges + .into_iter() + .filter(|e| { + valid_ids.contains(&e.source) + && (valid_ids.contains(&e.target) + || matches!(e.relation.as_str(), "imports" | "imports_from")) + }) + .collect() +} diff --git a/crates/graphify-extract/src/extractors/go/refs.rs b/crates/graphify-extract/src/extractors/go/refs.rs new file mode 100644 index 0000000..eb2aa6b --- /dev/null +++ b/crates/graphify-extract/src/extractors/go/refs.rs @@ -0,0 +1,404 @@ +//! Go type-reference + struct/interface field edge emitters. + +use super::read_text; +use crate::ids::{make_id, make_id1}; +use crate::types::{Edge, Node}; +use std::collections::HashSet; + +/// Go's predeclared type identifiers — never emitted as semantic type references. 
+const GO_PREDECLARED_TYPES: &[&str] = &[ + "bool", + "byte", + "complex64", + "complex128", + "error", + "float32", + "float64", + "int", + "int8", + "int16", + "int32", + "int64", + "rune", + "string", + "uint", + "uint8", + "uint16", + "uint32", + "uint64", + "uintptr", + "any", + "comparable", +]; + +/// Walk a Go type expression, appending `(name, is_generic_arg)` tuples for each +/// user-defined type referenced. Predeclared types are skipped. Mirrors Python +/// `_go_collect_type_refs`. +fn go_collect_type_refs( + node: tree_sitter::Node<'_>, + source: &[u8], + generic: bool, + out: &mut Vec<(String, bool)>, +) { + match node.kind() { + "type_identifier" => { + let text = read_text(node, source); + if !text.is_empty() && !GO_PREDECLARED_TYPES.contains(&text) { + out.push((text.to_string(), generic)); + } + } + "qualified_type" => { + let full = read_text(node, source); + let text = full.rsplit('.').next().unwrap_or(full); + if !text.is_empty() && !GO_PREDECLARED_TYPES.contains(&text) { + out.push((text.to_string(), generic)); + } + } + "generic_type" => { + if let Some(type_field) = node.child_by_field_name("type") { + go_collect_type_refs(type_field, source, generic, out); + } + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + if cur.node().kind() == "type_arguments" { + let mut acur = cur.node().walk(); + if acur.goto_first_child() { + loop { + if acur.node().is_named() { + go_collect_type_refs(acur.node(), source, true, out); + } + if !acur.goto_next_sibling() { + break; + } + } + } + } + if !cur.goto_next_sibling() { + break; + } + } + } + } + "pointer_type" | "slice_type" | "array_type" | "map_type" | "channel_type" + | "parenthesized_type" => { + recurse_named_children(node, source, generic, out); + } + _ if node.is_named() => recurse_named_children(node, source, generic, out), + _ => {} + } +} + +/// Recurse `go_collect_type_refs` over every named child of `node`. 
+fn recurse_named_children( + node: tree_sitter::Node<'_>, + source: &[u8], + generic: bool, + out: &mut Vec<(String, bool)>, +) { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + if cur.node().is_named() { + go_collect_type_refs(cur.node(), source, generic, out); + } + if !cur.goto_next_sibling() { + break; + } + } + } +} + +/// Mutable graph state for the Go semantic-reference passes. Constructed by +/// reborrowing the structural-walk locals at each call site so these passes +/// never need to thread the full [`GoWalkCtx`]. +pub(super) struct GoRefCtx<'a> { + pub(super) source: &'a [u8], + pub(super) pkg_scope: &'a str, + pub(super) str_path: &'a str, + pub(super) nodes: &'a mut Vec, + pub(super) edges: &'a mut Vec, + pub(super) seen_ids: &'a mut HashSet, +} + +impl GoRefCtx<'_> { + /// Return the NID for a named type, creating a bare placeholder node when no + /// package-qualified node already exists. Mirrors Go's `ensure_named_node`. + fn ensure_named_node(&mut self, name: &str, line: usize) -> String { + let nid1 = make_id(&[self.pkg_scope, name]); + if self.seen_ids.contains(&nid1) { + return nid1; + } + let nid2 = make_id1(name); + if self.seen_ids.insert(nid2.clone()) { + self.nodes.push(Node { + id: nid2.clone(), + label: name.to_string(), + file_type: "code".to_string(), + source_file: self.str_path.to_string(), + source_location: Some(format!("L{line}")), + metadata: None, + }); + } + nid2 + } + + /// Push a `references` edge from `src` to `tgt` with the given context. 
+ fn push_ref(&mut self, src: &str, tgt: &str, context: &str, line: usize) { + self.edges.push(Edge { + external: false, + source: src.to_string(), + target: tgt.to_string(), + relation: "references".to_string(), + confidence: "EXTRACTED".to_string(), + source_file: self.str_path.to_string(), + source_location: Some(format!("L{line}")), + weight: 1.0, + context: Some(context.to_string()), + confidence_score: None, + }); + } + + /// Push a plain `embeds` edge from `src` to `tgt` (Go struct/interface embedding). + fn push_embeds(&mut self, src: &str, tgt: &str, line: usize) { + self.edges.push(Edge { + external: false, + source: src.to_string(), + target: tgt.to_string(), + relation: "embeds".to_string(), + confidence: "EXTRACTED".to_string(), + source_file: self.str_path.to_string(), + source_location: Some(format!("L{line}")), + weight: 1.0, + context: None, + confidence_score: None, + }); + } +} + +/// Emit `references` edges for a function/method's parameter and result types. +/// +/// Mirrors Python `emit_go_method_refs`: direct param types use the +/// `parameter_type` context, result types use `return_type`, and any generic +/// arguments use `generic_arg`. 
+pub(super) fn emit_go_method_refs( + rc: &mut GoRefCtx<'_>, + func_node: tree_sitter::Node<'_>, + func_nid: &str, + line: usize, +) { + if let Some(params) = func_node.child_by_field_name("parameters") { + let mut cur = params.walk(); + if cur.goto_first_child() { + loop { + let p = cur.node(); + if p.kind() == "parameter_declaration" + && let Some(type_node) = p.child_by_field_name("type") + { + let mut refs: Vec<(String, bool)> = Vec::new(); + go_collect_type_refs(type_node, rc.source, false, &mut refs); + for (ref_name, is_generic) in refs { + let ctx = if is_generic { + "generic_arg" + } else { + "parameter_type" + }; + let tgt = rc.ensure_named_node(&ref_name, line); + if tgt != func_nid { + rc.push_ref(func_nid, &tgt, ctx, line); + } + } + } + if !cur.goto_next_sibling() { + break; + } + } + } + } + let Some(result) = func_node.child_by_field_name("result") else { + return; + }; + if result.kind() == "parameter_list" { + let mut cur = result.walk(); + if cur.goto_first_child() { + loop { + let p = cur.node(); + if p.kind() == "parameter_declaration" { + let type_node = p.child_by_field_name("type").or_else(|| { + let mut c = p.walk(); + if c.goto_first_child() { + loop { + if c.node().is_named() { + return Some(c.node()); + } + if !c.goto_next_sibling() { + break; + } + } + } + None + }); + if let Some(tn) = type_node { + emit_go_result_refs(rc, tn, func_nid, line); + } + } + if !cur.goto_next_sibling() { + break; + } + } + } + } else { + emit_go_result_refs(rc, result, func_nid, line); + } +} + +/// Emit `return_type` / `generic_arg` references from a single result type node. 
+fn emit_go_result_refs( + rc: &mut GoRefCtx<'_>, + type_node: tree_sitter::Node<'_>, + func_nid: &str, + line: usize, +) { + let mut refs: Vec<(String, bool)> = Vec::new(); + go_collect_type_refs(type_node, rc.source, false, &mut refs); + for (ref_name, is_generic) in refs { + let ctx = if is_generic { + "generic_arg" + } else { + "return_type" + }; + let tgt = rc.ensure_named_node(&ref_name, line); + if tgt != func_nid { + rc.push_ref(func_nid, &tgt, ctx, line); + } + } +} + +/// Emit `embeds` / `references[field]` edges for a `type_spec`'s struct fields, +/// and `embeds` / `references[generic_arg]` edges for interface embedding. +/// +/// A struct field with no name and a direct (non-generic) type is an embedded +/// field → `embeds`; named fields and generic args become `references`. Mirrors +/// the struct/interface body handling added to Python `extract_go`. +pub(super) fn emit_go_type_body_refs( + rc: &mut GoRefCtx<'_>, + type_spec: tree_sitter::Node<'_>, + type_nid: &str, +) { + let mut type_body: Option> = None; + let mut cur = type_spec.walk(); + if cur.goto_first_child() { + loop { + if matches!(cur.node().kind(), "struct_type" | "interface_type") { + type_body = Some(cur.node()); + break; + } + if !cur.goto_next_sibling() { + break; + } + } + } + let Some(type_body) = type_body else { + return; + }; + + if type_body.kind() == "struct_type" { + let mut fdl_cur = type_body.walk(); + if !fdl_cur.goto_first_child() { + return; + } + loop { + if fdl_cur.node().kind() == "field_declaration_list" { + let mut fcur = fdl_cur.node().walk(); + if fcur.goto_first_child() { + loop { + if fcur.node().kind() == "field_declaration" { + emit_go_struct_field_refs(rc, fcur.node(), type_nid); + } + if !fcur.goto_next_sibling() { + break; + } + } + } + } + if !fdl_cur.goto_next_sibling() { + break; + } + } + } else { + // interface_type — embedded interfaces appear as `type_elem`. 
+ let mut ecur = type_body.walk(); + if !ecur.goto_first_child() { + return; + } + loop { + if ecur.node().kind() == "type_elem" { + let line = ecur.node().start_position().row + 1; + let mut refs: Vec<(String, bool)> = Vec::new(); + let mut scur = ecur.node().walk(); + if scur.goto_first_child() { + loop { + if scur.node().is_named() { + go_collect_type_refs(scur.node(), rc.source, false, &mut refs); + } + if !scur.goto_next_sibling() { + break; + } + } + } + for (ref_name, is_generic) in refs { + let tgt = rc.ensure_named_node(&ref_name, line); + if tgt == type_nid { + continue; + } + if is_generic { + rc.push_ref(type_nid, &tgt, "generic_arg", line); + } else { + rc.push_embeds(type_nid, &tgt, line); + } + } + } + if !ecur.goto_next_sibling() { + break; + } + } + } +} + +/// Emit edges for a single Go struct `field_declaration`. +fn emit_go_struct_field_refs(rc: &mut GoRefCtx<'_>, field: tree_sitter::Node<'_>, type_nid: &str) { + let line = field.start_position().row + 1; + let mut has_name = false; + let mut fallback_type: Option> = None; + let mut fcur = field.walk(); + if fcur.goto_first_child() { + loop { + let fc = fcur.node(); + if fc.kind() == "field_identifier" { + has_name = true; + } else if fallback_type.is_none() && fc.is_named() { + fallback_type = Some(fc); + } + if !fcur.goto_next_sibling() { + break; + } + } + } + let Some(type_node) = field.child_by_field_name("type").or(fallback_type) else { + return; + }; + let mut refs: Vec<(String, bool)> = Vec::new(); + go_collect_type_refs(type_node, rc.source, false, &mut refs); + for (ref_name, is_generic) in refs { + let tgt = rc.ensure_named_node(&ref_name, line); + if tgt == type_nid { + continue; + } + if !has_name && !is_generic { + rc.push_embeds(type_nid, &tgt, line); + } else { + let ctx = if is_generic { "generic_arg" } else { "field" }; + rc.push_ref(type_nid, &tgt, ctx, line); + } + } +} diff --git a/crates/graphify-extract/src/extractors/go/walk.rs 
b/crates/graphify-extract/src/extractors/go/walk.rs new file mode 100644 index 0000000..3810187 --- /dev/null +++ b/crates/graphify-extract/src/extractors/go/walk.rs @@ -0,0 +1,357 @@ +//! Go structural AST walk (functions, methods, types, imports). + +use super::read_text; +use super::refs::{GoRefCtx, emit_go_method_refs, emit_go_type_body_refs}; +use crate::ids::make_id; +use crate::types::{Edge, Node}; +use std::collections::HashSet; + +/// Recursively walk a Go AST emitting nodes and edges for functions, methods, and type declarations. +/// +/// Handles `function_declaration`, `method_declaration`, `type_declaration`, and `import_declaration` +/// nodes. Descends into all child nodes. Mirrors Python `_walk_go`. +/// Shared state threaded through every [`walk_go`] recursion. +pub(super) struct GoWalkCtx<'a> { + pub(super) str_path: &'a str, + pub(super) stem: &'a str, + pub(super) pkg_scope: &'a str, + pub(super) file_nid: &'a str, + pub(super) nodes: &'a mut Vec, + pub(super) edges: &'a mut Vec, + pub(super) seen_ids: &'a mut HashSet, + pub(super) function_bodies: &'a mut Vec<(String, usize, usize)>, + pub(super) go_imported_pkgs: &'a mut HashSet, +} + +#[allow(clippy::too_many_lines)] // linear dispatch over Go's AST node kinds +pub(super) fn walk_go(ctx: &mut GoWalkCtx<'_>, node: tree_sitter::Node<'_>, source: &[u8]) { + let str_path = ctx.str_path; + let stem = ctx.stem; + let pkg_scope = ctx.pkg_scope; + let file_nid = ctx.file_nid; + let nodes = &mut *ctx.nodes; + let edges = &mut *ctx.edges; + let seen_ids = &mut *ctx.seen_ids; + let function_bodies = &mut *ctx.function_bodies; + let go_imported_pkgs = &mut *ctx.go_imported_pkgs; + let t = node.kind(); + + match t { + "function_declaration" => { + if let Some(name_node) = node.child_by_field_name("name") { + let func_name = read_text(name_node, source); + let line = node.start_position().row + 1; + let func_nid = make_id(&[stem, func_name]); + if seen_ids.insert(func_nid.clone()) { + nodes.push(Node { + 
id: func_nid.clone(), + label: format!("{func_name}()"), + file_type: "code".to_string(), + source_file: str_path.to_string(), + source_location: Some(format!("L{line}")), + metadata: None, + }); + } + edges.push(Edge { + external: false, + source: file_nid.to_string(), + target: func_nid.clone(), + relation: "contains".to_string(), + confidence: "EXTRACTED".to_string(), + source_file: str_path.to_string(), + source_location: Some(format!("L{line}")), + weight: 1.0, + context: None, + confidence_score: None, + }); + let mut rc = GoRefCtx { + source, + pkg_scope, + str_path, + nodes: &mut *nodes, + edges: &mut *edges, + seen_ids: &mut *seen_ids, + }; + emit_go_method_refs(&mut rc, node, &func_nid, line); + if let Some(body) = node.child_by_field_name("body") { + function_bodies.push((func_nid, body.start_byte(), body.end_byte())); + } + } + } + "method_declaration" => { + let receiver = node.child_by_field_name("receiver"); + let mut receiver_type: Option = None; + if let Some(recv) = receiver { + let mut cur = recv.walk(); + if cur.goto_first_child() { + loop { + let param = cur.node(); + if param.kind() == "parameter_declaration" { + if let Some(type_node) = param.child_by_field_name("type") { + let raw = read_text(type_node, source) + .trim_start_matches('*') + .trim() + .to_string(); + receiver_type = Some(raw); + } + break; + } + if !cur.goto_next_sibling() { + break; + } + } + } + } + if let Some(name_node) = node.child_by_field_name("name") { + let method_name = read_text(name_node, source); + let line = node.start_position().row + 1; + let method_nid = if let Some(ref rt) = receiver_type { + let parent_nid = make_id(&[pkg_scope, rt]); + if seen_ids.insert(parent_nid.clone()) { + nodes.push(Node { + id: parent_nid.clone(), + label: rt.clone(), + file_type: "code".to_string(), + source_file: str_path.to_string(), + source_location: Some(format!("L{line}")), + metadata: None, + }); + } + let mnid = make_id(&[&parent_nid, method_name]); + if 
seen_ids.insert(mnid.clone()) { + nodes.push(Node { + id: mnid.clone(), + label: format!(".{method_name}()"), + file_type: "code".to_string(), + source_file: str_path.to_string(), + source_location: Some(format!("L{line}")), + metadata: None, + }); + } + edges.push(Edge { + external: false, + source: parent_nid, + target: mnid.clone(), + relation: "method".to_string(), + confidence: "EXTRACTED".to_string(), + source_file: str_path.to_string(), + source_location: Some(format!("L{line}")), + weight: 1.0, + context: None, + confidence_score: None, + }); + mnid + } else { + let mnid = make_id(&[stem, method_name]); + if seen_ids.insert(mnid.clone()) { + nodes.push(Node { + id: mnid.clone(), + label: format!("{method_name}()"), + file_type: "code".to_string(), + source_file: str_path.to_string(), + source_location: Some(format!("L{line}")), + metadata: None, + }); + } + edges.push(Edge { + external: false, + source: file_nid.to_string(), + target: mnid.clone(), + relation: "contains".to_string(), + confidence: "EXTRACTED".to_string(), + source_file: str_path.to_string(), + source_location: Some(format!("L{line}")), + weight: 1.0, + context: None, + confidence_score: None, + }); + mnid + }; + let mut rc = GoRefCtx { + source, + pkg_scope, + str_path, + nodes: &mut *nodes, + edges: &mut *edges, + seen_ids: &mut *seen_ids, + }; + emit_go_method_refs(&mut rc, node, &method_nid, line); + if let Some(body) = node.child_by_field_name("body") { + function_bodies.push((method_nid, body.start_byte(), body.end_byte())); + } + } + } + "type_declaration" => { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + let child = cur.node(); + if child.kind() == "type_spec" + && let Some(name_node) = child.child_by_field_name("name") + { + let type_name = read_text(name_node, source); + let line = child.start_position().row + 1; + let type_nid = make_id(&[pkg_scope, type_name]); + if seen_ids.insert(type_nid.clone()) { + nodes.push(Node { + id: type_nid.clone(), + label: 
type_name.to_string(), + file_type: "code".to_string(), + source_file: str_path.to_string(), + source_location: Some(format!("L{line}")), + metadata: None, + }); + } + edges.push(Edge { + external: false, + source: file_nid.to_string(), + target: type_nid.clone(), + relation: "contains".to_string(), + confidence: "EXTRACTED".to_string(), + source_file: str_path.to_string(), + source_location: Some(format!("L{line}")), + weight: 1.0, + context: None, + confidence_score: None, + }); + // Struct field embeds/references and interface embedding. + let mut rc = GoRefCtx { + source, + pkg_scope, + str_path, + nodes: &mut *nodes, + edges: &mut *edges, + seen_ids: &mut *seen_ids, + }; + emit_go_type_body_refs(&mut rc, child, &type_nid); + } + if !cur.goto_next_sibling() { + break; + } + } + } + } + "import_declaration" => { + walk_go_imports( + node, + source, + str_path, + file_nid, + edges, + seen_ids, + go_imported_pkgs, + ); + } + _ => { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + walk_go(ctx, cur.node(), source); + if !cur.goto_next_sibling() { + break; + } + } + } + } + } +} + +/// Walk an `import_declaration` subtree, delegating each `import_spec` to `emit_go_import_spec`. 
+fn walk_go_imports( + node: tree_sitter::Node<'_>, + source: &[u8], + str_path: &str, + file_nid: &str, + edges: &mut Vec, + seen_ids: &mut HashSet, + go_imported_pkgs: &mut HashSet, +) { + let mut cur = node.walk(); + if !cur.goto_first_child() { + return; + } + loop { + let child = cur.node(); + match child.kind() { + "import_spec_list" => { + let mut c2 = child.walk(); + if c2.goto_first_child() { + loop { + let spec = c2.node(); + if spec.kind() == "import_spec" { + emit_go_import_spec( + spec, + source, + str_path, + file_nid, + edges, + seen_ids, + go_imported_pkgs, + ); + } + if !c2.goto_next_sibling() { + break; + } + } + } + } + "import_spec" => { + emit_go_import_spec( + child, + source, + str_path, + file_nid, + edges, + seen_ids, + go_imported_pkgs, + ); + } + _ => {} + } + if !cur.goto_next_sibling() { + break; + } + } +} + +/// Emit a single `imports_from` edge for one Go `import_spec` node. +/// +/// The target NID is derived from the import path string (e.g. `"fmt"` → `go::pkg::fmt`). +/// The package name is also recorded in `go_imported_pkgs` for use during call resolution. 
+fn emit_go_import_spec( + spec: tree_sitter::Node<'_>, + source: &[u8], + str_path: &str, + file_nid: &str, + edges: &mut Vec, + _seen_ids: &mut HashSet, + go_imported_pkgs: &mut HashSet, +) { + if let Some(path_node) = spec.child_by_field_name("path") { + let raw = read_text(path_node, source).trim_matches('"'); + let tgt_nid = make_id(&["go", "pkg", raw]); + let line = spec.start_position().row + 1; + edges.push(Edge { + external: false, + source: file_nid.to_string(), + target: tgt_nid, + relation: "imports_from".to_string(), + confidence: "EXTRACTED".to_string(), + source_file: str_path.to_string(), + source_location: Some(format!("L{line}")), + weight: 1.0, + context: Some("import".to_string()), + confidence_score: None, + }); + // Track local name (alias or last path segment) + let alias = spec.child_by_field_name("name"); + let local_name = if let Some(a) = alias { + read_text(a, source).to_string() + } else { + raw.split('/').next_back().unwrap_or("").to_string() + }; + if !local_name.is_empty() && local_name != "_" && local_name != "." { + go_imported_pkgs.insert(local_name); + } + } +} diff --git a/crates/graphify-extract/src/extractors/groovy.rs b/crates/graphify-extract/src/extractors/groovy.rs new file mode 100644 index 0000000..5849927 --- /dev/null +++ b/crates/graphify-extract/src/extractors/groovy.rs @@ -0,0 +1,213 @@ +//! Groovy / Gradle extractor with Spock-test regex fallback. 
+ +use std::collections::HashSet; +use std::path::Path; +use std::sync::LazyLock; + +use regex::Regex; + +use crate::generic::extract_generic; +use crate::lang_configs; +use crate::types::FileResult; + +#[allow(clippy::expect_used)] // literal pattern; build cannot panic +static CLASS_RE: LazyLock = LazyLock::new(|| { + Regex::new(r"^\s*(?:[\w@]+\s+)*class\s+(\w+)").expect("static spock class regex") +}); +#[allow(clippy::expect_used)] +static FEATURE_RE: LazyLock = LazyLock::new(|| { + Regex::new(r#"^\s*def\s+(?:"([^"]+)"|'([^']+)')\s*\("#).expect("static spock feature regex") +}); +#[allow(clippy::expect_used)] +static PLAIN_METHOD_RE: LazyLock = + LazyLock::new(|| Regex::new(r"^\s*def\s+(\w+)\s*\(").expect("static spock method regex")); +static SPOCK_KWS: LazyLock> = LazyLock::new(|| { + ["if", "while", "for", "switch", "catch"] + .into_iter() + .collect() +}); + +/// Extract classes, methods, constructors, and imports from a `.groovy`/`.gradle` file. +/// Falls back to regex-based Spock extractor when needed. +#[must_use] +pub fn extract_groovy(path: &Path) -> FileResult { + let result = extract_generic(path, &lang_configs::GROOVY); + if is_spock_file(path) { + extract_spock_fallback(path, result) + } else { + result + } +} + +/// Return `true` if the Groovy file contains Spock-style `def "feature"()` test methods. +/// +/// Spock test methods use quoted string names that the generic tree-sitter extractor misses; +/// this heuristic triggers the regex fallback when any line starts with `def "` or `def '`. +fn is_spock_file(path: &Path) -> bool { + let Ok(src) = std::fs::read_to_string(path) else { + return false; + }; + // Check for `def "feature"()` patterns + src.lines().any(|l| { + let t = l.trim(); + t.starts_with("def \"") || t.starts_with("def '") + }) +} + +/// Extract class and method nodes from a Spock test file using regex scanning. +/// +/// The generic tree-sitter pass already ran (`ts_result`) but cannot handle Spock's quoted +/// method names. 
This function discards the tree-sitter node/method edges, keeps the file +/// node and import edges, then re-scans line-by-line with three regexes: +/// `class`, `def "feature"()`, and `def plainMethod()`. Mirrors Python `_extract_spock_fallback`. +#[allow(clippy::too_many_lines, clippy::cast_possible_truncation)] +// ↑ literal regex patterns; function is a direct port; row→u32 is safe +fn extract_spock_fallback(path: &Path, ts_result: FileResult) -> FileResult { + use crate::ids::{file_stem, make_id, make_id1}; + use crate::types::{Edge, Node}; + + let Ok(source) = std::fs::read_to_string(path) else { + return ts_result; + }; + let str_path = path.to_string_lossy().into_owned(); + let stem = file_stem(path); + + // Keep file node + import edges from tree-sitter pass + let file_node = ts_result + .nodes + .iter() + .find(|n| { + path.file_name() + .is_some_and(|f| f.to_string_lossy() == n.label) + }) + .cloned(); + let mut nodes: Vec = file_node.into_iter().collect(); + let mut edges: Vec = ts_result + .edges + .into_iter() + .filter(|e| e.context.as_deref() == Some("import")) + .collect(); + let mut seen_ids: HashSet = nodes.iter().map(|n| n.id.clone()).collect(); + + let file_nid = make_id1(&str_path); + if !seen_ids.contains(&file_nid) { + nodes.push(Node { + id: file_nid.clone(), + label: path + .file_name() + .map_or(String::new(), |f| f.to_string_lossy().into_owned()), + file_type: "code".to_string(), + source_file: str_path.clone(), + source_location: Some("L1".to_string()), + metadata: None, + }); + seen_ids.insert(file_nid.clone()); + } + + let mut current_class_nid: Option = None; + + for (lineno, line) in source.lines().enumerate() { + let lineno = lineno + 1; + if let Some(cap) = CLASS_RE.captures(line) { + let class_name = cap.get(1).map_or("", |m| m.as_str()); + let class_nid = make_id(&[&stem, class_name]); + if !seen_ids.contains(&class_nid) { + seen_ids.insert(class_nid.clone()); + nodes.push(Node { + id: class_nid.clone(), + label: 
class_name.to_string(), + file_type: "code".to_string(), + source_file: str_path.clone(), + source_location: Some(format!("L{lineno}")), + metadata: None, + }); + } + edges.push(Edge { + external: false, + source: file_nid.clone(), + target: class_nid.clone(), + relation: "contains".to_string(), + confidence: "EXTRACTED".to_string(), + source_file: str_path.clone(), + source_location: Some(format!("L{lineno}")), + weight: 1.0, + context: None, + confidence_score: None, + }); + current_class_nid = Some(class_nid); + continue; + } + + let Some(ref class_nid) = current_class_nid else { + continue; + }; + + if let Some(cap) = FEATURE_RE.captures(line) { + let method_name = cap.get(1).or_else(|| cap.get(2)).map_or("", |m| m.as_str()); + let method_label = format!("\"{method_name}\""); + let method_nid = make_id(&[class_nid, method_name]); + if !seen_ids.contains(&method_nid) { + seen_ids.insert(method_nid.clone()); + nodes.push(Node { + id: method_nid.clone(), + label: method_label, + file_type: "code".to_string(), + source_file: str_path.clone(), + source_location: Some(format!("L{lineno}")), + metadata: None, + }); + } + edges.push(Edge { + external: false, + source: class_nid.clone(), + target: method_nid, + relation: "method".to_string(), + confidence: "EXTRACTED".to_string(), + source_file: str_path.clone(), + source_location: Some(format!("L{lineno}")), + weight: 1.0, + context: None, + confidence_score: None, + }); + continue; + } + + if let Some(cap) = PLAIN_METHOD_RE.captures(line) { + let method_name = cap.get(1).map_or("", |m| m.as_str()); + if !SPOCK_KWS.contains(method_name) { + let method_label = format!(".{method_name}()"); + let method_nid = make_id(&[class_nid, method_name]); + if !seen_ids.contains(&method_nid) { + seen_ids.insert(method_nid.clone()); + nodes.push(Node { + id: method_nid.clone(), + label: method_label, + file_type: "code".to_string(), + source_file: str_path.clone(), + source_location: Some(format!("L{lineno}")), + metadata: None, + 
}); + } + edges.push(Edge { + external: false, + source: class_nid.clone(), + target: method_nid, + relation: "method".to_string(), + confidence: "EXTRACTED".to_string(), + source_file: str_path.clone(), + source_location: Some(format!("L{lineno}")), + weight: 1.0, + context: None, + confidence_score: None, + }); + } + } + } + + FileResult { + nodes, + edges, + raw_calls: Vec::new(), + error: None, + } +} diff --git a/crates/graphify-extract/src/extractors/julia/calls.rs b/crates/graphify-extract/src/extractors/julia/calls.rs new file mode 100644 index 0000000..1d522e4 --- /dev/null +++ b/crates/graphify-extract/src/extractors/julia/calls.rs @@ -0,0 +1,151 @@ +//! Julia call-graph pass. + +use super::read_text; +use crate::ids::make_id; +use crate::types::Edge; +use std::collections::HashSet; + +/// Collect `calls` edges within a Julia function body's byte range. +/// +/// Skips nested `function_definition` nodes. Emits `calls` edges for `call_expression` nodes +/// whose callee matches a known NID. Mirrors Python `_walk_calls_julia`. +/// Shared state threaded through every [`walk_calls_julia`] recursion. 
+pub(super) struct JuliaCallCtx<'a> { + pub(super) str_path: &'a str, + pub(super) stem: &'a str, + pub(super) edges: &'a mut Vec, + pub(super) seen_ids: &'a HashSet, +} + +pub(super) fn walk_calls_julia( + ctx: &mut JuliaCallCtx<'_>, + node: tree_sitter::Node<'_>, + source: &[u8], + func_nid: &str, + body_start: usize, + body_end: usize, +) { + if node.start_byte() >= body_end || node.end_byte() <= body_start { + return; + } + if matches!( + node.kind(), + "function_definition" | "short_function_definition" + ) { + return; + } + if node.kind() == "call_expression" && node.child_count() > 0 { + let callee = { + let mut cur = node.walk(); + if cur.goto_first_child() { + Some(cur.node()) + } else { + None + } + }; + if let Some(callee_node) = callee { + if callee_node.kind() == "identifier" { + let callee_name = read_text(callee_node, source); + let target_nid = make_id(&[ctx.stem, callee_name]); + if ctx.seen_ids.contains(&target_nid) && target_nid != func_nid { + ctx.edges.push(Edge { + external: false, + source: func_nid.to_string(), + target: target_nid, + relation: "calls".to_string(), + confidence: "EXTRACTED".to_string(), + source_file: ctx.str_path.to_string(), + source_location: Some(format!("L{}", node.start_position().row + 1)), + weight: 1.0, + context: Some("call".to_string()), + confidence_score: None, + }); + } + } else if callee_node.kind() == "field_expression" && callee_node.child_count() >= 3 { + let count = u32::try_from(callee_node.child_count()).unwrap_or(0); + let method_node = callee_node.child(count - 1); + if let Some(mn) = method_node { + let method_name = read_text(mn, source); + let target_nid = make_id(&[ctx.stem, method_name]); + if ctx.seen_ids.contains(&target_nid) && target_nid != func_nid { + ctx.edges.push(Edge { + external: false, + source: func_nid.to_string(), + target: target_nid, + relation: "calls".to_string(), + confidence: "EXTRACTED".to_string(), + source_file: ctx.str_path.to_string(), + source_location: 
Some(format!("L{}", node.start_position().row + 1)), + weight: 1.0, + context: Some("call".to_string()), + confidence_score: None, + }); + } + } + } + } + } + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + walk_calls_julia(ctx, cur.node(), source, func_nid, body_start, body_end); + if !cur.goto_next_sibling() { + break; + } + } + } +} + +/// Walk the body children of a `function_definition` node, calling `walk_calls_julia` on each. +/// +/// Finds the `function_definition` node by byte range, then iterates its children starting +/// after the signature, so nested function bodies are attributed to the right caller. +// Walk children of a function_definition node (skipping signature) +pub(super) fn walk_calls_julia_children( + ctx: &mut JuliaCallCtx<'_>, + tree_root: tree_sitter::Node<'_>, + source: &[u8], + func_nid: &str, + node_start: usize, + node_end: usize, +) { + // Find the function_definition node by byte range + /// Search the subtree rooted at `n` for a `function_definition` node matching `start`/`end` byte offsets. 
+ fn find_node( + n: tree_sitter::Node<'_>, + start: usize, + end: usize, + ) -> Option> { + if n.start_byte() == start && n.end_byte() == end && n.kind() == "function_definition" { + return Some(n); + } + let mut cur = n.walk(); + if cur.goto_first_child() { + loop { + if let Some(found) = find_node(cur.node(), start, end) { + return Some(found); + } + if !cur.goto_next_sibling() { + break; + } + } + } + None + } + + let Some(func_node) = find_node(tree_root, node_start, node_end) else { + return; + }; + let mut cur = func_node.walk(); + if cur.goto_first_child() { + loop { + let child = cur.node(); + if child.kind() != "signature" { + walk_calls_julia(ctx, child, source, func_nid, node_start, node_end); + } + if !cur.goto_next_sibling() { + break; + } + } + } +} diff --git a/crates/graphify-extract/src/extractors/julia/mod.rs b/crates/graphify-extract/src/extractors/julia/mod.rs new file mode 100644 index 0000000..3d3ed10 --- /dev/null +++ b/crates/graphify-extract/src/extractors/julia/mod.rs @@ -0,0 +1,129 @@ +//! Julia extractor — custom walk over tree-sitter-julia AST. + +mod calls; +mod walk; + +use crate::ids::{file_stem, make_id1}; +use crate::types::{Edge, FileResult, Node}; +use calls::{JuliaCallCtx, walk_calls_julia, walk_calls_julia_children}; +use std::collections::HashSet; +use std::path::Path; +use walk::{JuliaWalkCtx, walk_julia}; + +/// Return the source bytes covered by `node` as a UTF-8 `&str`, or `""` on bad UTF-8. +fn read_text<'a>(node: tree_sitter::Node<'_>, source: &'a [u8]) -> &'a str { + std::str::from_utf8(&source[node.start_byte()..node.end_byte()]).unwrap_or("") +} + +/// Extract modules, structs, functions, imports, and calls from a `.jl` file. 
+#[must_use] +pub fn extract_julia(path: &Path) -> FileResult { + let source = match std::fs::read(path) { + Ok(b) => b, + Err(e) => { + return FileResult { + nodes: vec![], + edges: vec![], + raw_calls: vec![], + error: Some(e.to_string()), + }; + } + }; + + let mut parser = tree_sitter::Parser::new(); + if parser + .set_language(&tree_sitter_julia::LANGUAGE.into()) + .is_err() + { + return FileResult { + nodes: vec![], + edges: vec![], + raw_calls: vec![], + error: Some("failed to set julia language".to_string()), + }; + } + let Some(tree) = parser.parse(&source, None) else { + return FileResult { + nodes: vec![], + edges: vec![], + raw_calls: vec![], + error: Some("parse failed".to_string()), + }; + }; + + let stem = file_stem(path); + let str_path = path.to_string_lossy().into_owned(); + + let mut nodes: Vec = Vec::new(); + let mut edges: Vec = Vec::new(); + let mut seen_ids: HashSet = HashSet::new(); + // (func_nid, node_start_byte, node_end_byte, is_function_def) + let mut function_bodies: Vec<(String, usize, usize, bool)> = Vec::new(); + + let file_nid = make_id1(&str_path); + seen_ids.insert(file_nid.clone()); + nodes.push(Node { + id: file_nid.clone(), + label: path + .file_name() + .map_or(String::new(), |f| f.to_string_lossy().into_owned()), + file_type: "code".to_string(), + source_file: str_path.clone(), + source_location: Some("L1".to_string()), + metadata: None, + }); + + let root = tree.root_node(); + { + let mut walk_ctx = JuliaWalkCtx { + str_path: &str_path, + stem: &stem, + file_nid: &file_nid, + nodes: &mut nodes, + edges: &mut edges, + seen_ids: &mut seen_ids, + function_bodies: &mut function_bodies, + }; + walk_julia(&mut walk_ctx, root, &source, &file_nid); + } + + // Second pass: call edges + { + let mut call_ctx = JuliaCallCtx { + str_path: &str_path, + stem: &stem, + edges: &mut edges, + seen_ids: &seen_ids, + }; + for (func_nid, node_start, node_end, is_func_def) in &function_bodies { + let tree_root = tree.root_node(); + if *is_func_def 
{ + walk_calls_julia_children( + &mut call_ctx, + tree_root, + &source, + func_nid, + *node_start, + *node_end, + ); + } else { + walk_calls_julia( + &mut call_ctx, + tree_root, + &source, + func_nid, + *node_start, + *node_end, + ); + } + } + } + + crate::forward_refs::reconcile_forward_refs(&mut nodes, &mut edges); + FileResult { + nodes, + edges, + raw_calls: vec![], + error: None, + } +} diff --git a/crates/graphify-extract/src/extractors/julia.rs b/crates/graphify-extract/src/extractors/julia/walk.rs similarity index 73% rename from crates/graphify-extract/src/extractors/julia.rs rename to crates/graphify-extract/src/extractors/julia/walk.rs index c5b3a17..d477ead 100644 --- a/crates/graphify-extract/src/extractors/julia.rs +++ b/crates/graphify-extract/src/extractors/julia/walk.rs @@ -1,128 +1,9 @@ -//! Julia extractor — custom walk over tree-sitter-julia AST. +//! Julia structural AST walk (modules, structs, functions, imports). +use super::read_text; +use crate::ids::{make_id, make_id1}; +use crate::types::{Edge, Node}; use std::collections::HashSet; -use std::path::Path; - -use crate::ids::{file_stem, make_id, make_id1}; -use crate::types::{Edge, FileResult, Node}; - -/// Return the source bytes covered by `node` as a UTF-8 `&str`, or `""` on bad UTF-8. -fn read_text<'a>(node: tree_sitter::Node<'_>, source: &'a [u8]) -> &'a str { - std::str::from_utf8(&source[node.start_byte()..node.end_byte()]).unwrap_or("") -} - -/// Extract modules, structs, functions, imports, and calls from a `.jl` file. 
-#[must_use] -pub fn extract_julia(path: &Path) -> FileResult { - let source = match std::fs::read(path) { - Ok(b) => b, - Err(e) => { - return FileResult { - nodes: vec![], - edges: vec![], - raw_calls: vec![], - error: Some(e.to_string()), - }; - } - }; - - let mut parser = tree_sitter::Parser::new(); - if parser - .set_language(&tree_sitter_julia::LANGUAGE.into()) - .is_err() - { - return FileResult { - nodes: vec![], - edges: vec![], - raw_calls: vec![], - error: Some("failed to set julia language".to_string()), - }; - } - let Some(tree) = parser.parse(&source, None) else { - return FileResult { - nodes: vec![], - edges: vec![], - raw_calls: vec![], - error: Some("parse failed".to_string()), - }; - }; - - let stem = file_stem(path); - let str_path = path.to_string_lossy().into_owned(); - - let mut nodes: Vec = Vec::new(); - let mut edges: Vec = Vec::new(); - let mut seen_ids: HashSet = HashSet::new(); - // (func_nid, node_start_byte, node_end_byte, is_function_def) - let mut function_bodies: Vec<(String, usize, usize, bool)> = Vec::new(); - - let file_nid = make_id1(&str_path); - seen_ids.insert(file_nid.clone()); - nodes.push(Node { - id: file_nid.clone(), - label: path - .file_name() - .map_or(String::new(), |f| f.to_string_lossy().into_owned()), - file_type: "code".to_string(), - source_file: str_path.clone(), - source_location: Some("L1".to_string()), - metadata: None, - }); - - let root = tree.root_node(); - { - let mut walk_ctx = JuliaWalkCtx { - str_path: &str_path, - stem: &stem, - file_nid: &file_nid, - nodes: &mut nodes, - edges: &mut edges, - seen_ids: &mut seen_ids, - function_bodies: &mut function_bodies, - }; - walk_julia(&mut walk_ctx, root, &source, &file_nid); - } - - // Second pass: call edges - { - let mut call_ctx = JuliaCallCtx { - str_path: &str_path, - stem: &stem, - edges: &mut edges, - seen_ids: &seen_ids, - }; - for (func_nid, node_start, node_end, is_func_def) in &function_bodies { - let tree_root = tree.root_node(); - if *is_func_def 
{ - walk_calls_julia_children( - &mut call_ctx, - tree_root, - &source, - func_nid, - *node_start, - *node_end, - ); - } else { - walk_calls_julia( - &mut call_ctx, - tree_root, - &source, - func_nid, - *node_start, - *node_end, - ); - } - } - } - - crate::forward_refs::reconcile_forward_refs(&mut nodes, &mut edges); - FileResult { - nodes, - edges, - raw_calls: vec![], - error: None, - } -} /// Extract the function name from a Julia function signature node. /// @@ -134,11 +15,10 @@ fn func_name_from_signature(sig_node: tree_sitter::Node<'_>, source: &[u8]) -> O loop { let child = cur.node(); if child.kind() == "call_expression" { - let callee = child.walk().goto_first_child().then(|| { + let callee = { let mut c = child.walk(); - c.goto_first_child(); - c.node() - }); + c.goto_first_child().then(|| c.node()) + }; if let Some(callee_node) = callee && callee_node.kind() == "identifier" { @@ -170,14 +50,14 @@ fn func_name_from_signature(sig_node: tree_sitter::Node<'_>, source: &[u8]) -> O /// Handles `module_definition`, `struct_definition`, `function_definition`, `macro_definition`, /// and `import_statement`/`using_statement`. Mirrors Python `_walk_julia`. /// Shared state threaded through every [`walk_julia`] recursion. 
-struct JuliaWalkCtx<'a> { - str_path: &'a str, - stem: &'a str, - file_nid: &'a str, - nodes: &'a mut Vec, - edges: &'a mut Vec, - seen_ids: &'a mut HashSet, - function_bodies: &'a mut Vec<(String, usize, usize, bool)>, +pub(super) struct JuliaWalkCtx<'a> { + pub(super) str_path: &'a str, + pub(super) stem: &'a str, + pub(super) file_nid: &'a str, + pub(super) nodes: &'a mut Vec, + pub(super) edges: &'a mut Vec, + pub(super) seen_ids: &'a mut HashSet, + pub(super) function_bodies: &'a mut Vec<(String, usize, usize, bool)>, } impl JuliaWalkCtx<'_> { @@ -264,7 +144,7 @@ fn emit_julia_struct_fields( } #[allow(clippy::too_many_lines)] // linear dispatch over Julia's AST node kinds -fn walk_julia( +pub(super) fn walk_julia( ctx: &mut JuliaWalkCtx<'_>, node: tree_sitter::Node<'_>, source: &[u8], @@ -745,148 +625,3 @@ fn walk_julia( } } } - -/// Collect `calls` edges within a Julia function body's byte range. -/// -/// Skips nested `function_definition` nodes. Emits `calls` edges for `call_expression` nodes -/// whose callee matches a known NID. Mirrors Python `_walk_calls_julia`. -/// Shared state threaded through every [`walk_calls_julia`] recursion. 
-struct JuliaCallCtx<'a> { - str_path: &'a str, - stem: &'a str, - edges: &'a mut Vec, - seen_ids: &'a HashSet, -} - -fn walk_calls_julia( - ctx: &mut JuliaCallCtx<'_>, - node: tree_sitter::Node<'_>, - source: &[u8], - func_nid: &str, - body_start: usize, - body_end: usize, -) { - if node.start_byte() >= body_end || node.end_byte() <= body_start { - return; - } - if matches!( - node.kind(), - "function_definition" | "short_function_definition" - ) { - return; - } - if node.kind() == "call_expression" && node.child_count() > 0 { - let callee = { - let mut cur = node.walk(); - if cur.goto_first_child() { - Some(cur.node()) - } else { - None - } - }; - if let Some(callee_node) = callee { - if callee_node.kind() == "identifier" { - let callee_name = read_text(callee_node, source); - let target_nid = make_id(&[ctx.stem, callee_name]); - if ctx.seen_ids.contains(&target_nid) && target_nid != func_nid { - ctx.edges.push(Edge { - external: false, - source: func_nid.to_string(), - target: target_nid, - relation: "calls".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: ctx.str_path.to_string(), - source_location: Some(format!("L{}", node.start_position().row + 1)), - weight: 1.0, - context: Some("call".to_string()), - confidence_score: None, - }); - } - } else if callee_node.kind() == "field_expression" && callee_node.child_count() >= 3 { - let count = u32::try_from(callee_node.child_count()).unwrap_or(0); - let method_node = callee_node.child(count - 1); - if let Some(mn) = method_node { - let method_name = read_text(mn, source); - let target_nid = make_id(&[ctx.stem, method_name]); - if ctx.seen_ids.contains(&target_nid) && target_nid != func_nid { - ctx.edges.push(Edge { - external: false, - source: func_nid.to_string(), - target: target_nid, - relation: "calls".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: ctx.str_path.to_string(), - source_location: Some(format!("L{}", node.start_position().row + 1)), - weight: 1.0, - context: 
Some("call".to_string()), - confidence_score: None, - }); - } - } - } - } - } - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - walk_calls_julia(ctx, cur.node(), source, func_nid, body_start, body_end); - if !cur.goto_next_sibling() { - break; - } - } - } -} - -/// Walk the body children of a `function_definition` node, calling `walk_calls_julia` on each. -/// -/// Finds the `function_definition` node by byte range, then iterates its children starting -/// after the signature, so nested function bodies are attributed to the right caller. -// Walk children of a function_definition node (skipping signature) -fn walk_calls_julia_children( - ctx: &mut JuliaCallCtx<'_>, - tree_root: tree_sitter::Node<'_>, - source: &[u8], - func_nid: &str, - node_start: usize, - node_end: usize, -) { - // Find the function_definition node by byte range - /// Search the subtree rooted at `n` for a `function_definition` node matching `start`/`end` byte offsets. - fn find_node( - n: tree_sitter::Node<'_>, - start: usize, - end: usize, - ) -> Option> { - if n.start_byte() == start && n.end_byte() == end && n.kind() == "function_definition" { - return Some(n); - } - let mut cur = n.walk(); - if cur.goto_first_child() { - loop { - if let Some(found) = find_node(cur.node(), start, end) { - return Some(found); - } - if !cur.goto_next_sibling() { - break; - } - } - } - None - } - - let Some(func_node) = find_node(tree_root, node_start, node_end) else { - return; - }; - let mut cur = func_node.walk(); - if cur.goto_first_child() { - loop { - let child = cur.node(); - if child.kind() != "signature" { - walk_calls_julia(ctx, child, source, func_nid, node_start, node_end); - } - if !cur.goto_next_sibling() { - break; - } - } - } -} diff --git a/crates/graphify-extract/src/extractors/mod.rs b/crates/graphify-extract/src/extractors/mod.rs index 3a992a1..47ba519 100644 --- a/crates/graphify-extract/src/extractors/mod.rs +++ b/crates/graphify-extract/src/extractors/mod.rs @@ -11,6 
+11,7 @@ pub mod dotnet; pub mod elixir; pub mod fortran; pub mod go; +pub mod groovy; pub mod json_lang; pub mod julia; pub mod manifest_ingest; @@ -27,24 +28,17 @@ pub mod terraform; pub mod verilog; pub mod zig; +mod python_rationale; + use std::path::Path; use crate::generic::extract_generic; use crate::lang_configs; use crate::types::FileResult; +pub use groovy::extract_groovy; pub use multi::extract; -const RATIONALE_PREFIXES: &[&str] = &[ - "# NOTE:", - "# IMPORTANT:", - "# HACK:", - "# WHY:", - "# RATIONALE:", - "# TODO:", - "# FIXME:", -]; - /// Size cap for project XML files (`.csproj` / `.fsproj` / `.vbproj` / `.lpk`). /// Real files are well under 2 MiB; anything larger is malformed or hostile. /// Mirrors `_PROJECT_XML_MAX_BYTES` in `graphify-py`. @@ -75,7 +69,7 @@ pub(crate) fn project_xml_is_safe(src: &[u8]) -> bool { pub fn extract_python(path: &Path) -> FileResult { let mut result = extract_generic(path, &lang_configs::PYTHON); if result.error.is_none() { - extract_python_rationale(path, &mut result); + python_rationale::extract_python_rationale(path, &mut result); } result } @@ -101,209 +95,6 @@ pub fn extract_java(path: &Path) -> FileResult { extract_generic(path, &lang_configs::JAVA) } -// ── Groovy ──────────────────────────────────────────────────────────────────── - -/// Extract classes, methods, constructors, and imports from a `.groovy`/`.gradle` file. -/// Falls back to regex-based Spock extractor when needed. -#[must_use] -pub fn extract_groovy(path: &Path) -> FileResult { - let result = extract_generic(path, &lang_configs::GROOVY); - if is_spock_file(path) { - extract_spock_fallback(path, result) - } else { - result - } -} - -/// Return `true` if the Groovy file contains Spock-style `def "feature"()` test methods. -/// -/// Spock test methods use quoted string names that the generic tree-sitter extractor misses; -/// this heuristic triggers the regex fallback when any line starts with `def "` or `def '`. 
-fn is_spock_file(path: &Path) -> bool { - let Ok(src) = std::fs::read_to_string(path) else { - return false; - }; - // Check for `def "feature"()` patterns - src.lines().any(|l| { - let t = l.trim(); - t.starts_with("def \"") || t.starts_with("def '") - }) -} - -/// Extract class and method nodes from a Spock test file using regex scanning. -/// -/// The generic tree-sitter pass already ran (`ts_result`) but cannot handle Spock's quoted -/// method names. This function discards the tree-sitter node/method edges, keeps the file -/// node and import edges, then re-scans line-by-line with three regexes: -/// `class`, `def "feature"()`, and `def plainMethod()`. Mirrors Python `_extract_spock_fallback`. -#[allow( - clippy::too_many_lines, - clippy::expect_used, - clippy::cast_possible_truncation -)] -// ↑ literal regex patterns; function is a direct port; row→u32 is safe -fn extract_spock_fallback(path: &Path, ts_result: FileResult) -> FileResult { - use crate::ids::{file_stem, make_id, make_id1}; - use crate::types::{Edge, Node}; - use std::collections::HashSet; - - let Ok(source) = std::fs::read_to_string(path) else { - return ts_result; - }; - let str_path = path.to_string_lossy().into_owned(); - let stem = file_stem(path); - - // Keep file node + import edges from tree-sitter pass - let file_node = ts_result - .nodes - .iter() - .find(|n| { - path.file_name() - .is_some_and(|f| f.to_string_lossy() == n.label) - }) - .cloned(); - let mut nodes: Vec = file_node.into_iter().collect(); - let mut edges: Vec = ts_result - .edges - .into_iter() - .filter(|e| e.context.as_deref() == Some("import")) - .collect(); - let mut seen_ids: HashSet = nodes.iter().map(|n| n.id.clone()).collect(); - - let file_nid = make_id1(&str_path); - if !seen_ids.contains(&file_nid) { - nodes.push(Node { - id: file_nid.clone(), - label: path - .file_name() - .map_or(String::new(), |f| f.to_string_lossy().into_owned()), - file_type: "code".to_string(), - source_file: str_path.clone(), - 
source_location: Some("L1".to_string()), - metadata: None, - }); - seen_ids.insert(file_nid.clone()); - } - - let class_re = - regex::Regex::new(r"^\s*(?:[\w@]+\s+)*class\s+(\w+)").expect("static spock class regex"); - let feature_re = regex::Regex::new(r#"^\s*def\s+(?:"([^"]+)"|'([^']+)')\s*\("#) - .expect("static spock feature regex"); - let plain_method_re = - regex::Regex::new(r"^\s*def\s+(\w+)\s*\(").expect("static spock method regex"); - let kws: std::collections::HashSet<&str> = ["if", "while", "for", "switch", "catch"] - .iter() - .copied() - .collect(); - - let mut current_class_nid: Option = None; - - for (lineno, line) in source.lines().enumerate() { - let lineno = lineno + 1; - if let Some(cap) = class_re.captures(line) { - let class_name = cap.get(1).map_or("", |m| m.as_str()); - let class_nid = make_id(&[&stem, class_name]); - if !seen_ids.contains(&class_nid) { - seen_ids.insert(class_nid.clone()); - nodes.push(Node { - id: class_nid.clone(), - label: class_name.to_string(), - file_type: "code".to_string(), - source_file: str_path.clone(), - source_location: Some(format!("L{lineno}")), - metadata: None, - }); - } - edges.push(Edge { - external: false, - source: file_nid.clone(), - target: class_nid.clone(), - relation: "contains".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: str_path.clone(), - source_location: Some(format!("L{lineno}")), - weight: 1.0, - context: None, - confidence_score: None, - }); - current_class_nid = Some(class_nid); - continue; - } - - let Some(ref class_nid) = current_class_nid else { - continue; - }; - - if let Some(cap) = feature_re.captures(line) { - let method_name = cap.get(1).or_else(|| cap.get(2)).map_or("", |m| m.as_str()); - let method_label = format!("\"{method_name}\""); - let method_nid = make_id(&[class_nid, method_name]); - if !seen_ids.contains(&method_nid) { - seen_ids.insert(method_nid.clone()); - nodes.push(Node { - id: method_nid.clone(), - label: method_label, - file_type: 
"code".to_string(), - source_file: str_path.clone(), - source_location: Some(format!("L{lineno}")), - metadata: None, - }); - } - edges.push(Edge { - external: false, - source: class_nid.clone(), - target: method_nid, - relation: "method".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: str_path.clone(), - source_location: Some(format!("L{lineno}")), - weight: 1.0, - context: None, - confidence_score: None, - }); - continue; - } - - if let Some(cap) = plain_method_re.captures(line) { - let method_name = cap.get(1).map_or("", |m| m.as_str()); - if !kws.contains(method_name) { - let method_label = format!(".{method_name}()"); - let method_nid = make_id(&[class_nid, method_name]); - if !seen_ids.contains(&method_nid) { - seen_ids.insert(method_nid.clone()); - nodes.push(Node { - id: method_nid.clone(), - label: method_label, - file_type: "code".to_string(), - source_file: str_path.clone(), - source_location: Some(format!("L{lineno}")), - metadata: None, - }); - } - edges.push(Edge { - external: false, - source: class_nid.clone(), - target: method_nid, - relation: "method".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: str_path.clone(), - source_location: Some(format!("L{lineno}")), - weight: 1.0, - context: None, - confidence_score: None, - }); - } - } - } - - FileResult { - nodes, - edges, - raw_calls: Vec::new(), - error: None, - } -} - // ── C ───────────────────────────────────────────────────────────────────────── /// Extract functions and includes from a `.c`/`.h` file. @@ -446,296 +237,3 @@ pub use blade::extract_blade; // ── .NET (.sln / .slnx / .csproj / .razor) ───────────────────────────────────── pub use dotnet::{extract_csproj, extract_razor, extract_sln, extract_slnx}; - -// ── Python rationale extraction ─────────────────────────────────────────────── - -/// Augment a Python extraction result with rationale nodes sourced from docstrings and comments. 
-/// -/// Walks the file's AST for module, class, and function docstrings (> 20 chars) and scans -/// source lines for `RATIONALE_PREFIXES` comments. Each rationale becomes a node of -/// `file_type = "rationale"` connected via a `rationale_for` edge to the containing entity. -/// Auto-generated files (migrations, protobuf, Alembic) are silently skipped. -/// Mirrors Python `_extract_rationale`. -fn extract_python_rationale(path: &Path, result: &mut FileResult) { - use crate::ids::{file_stem, make_id, make_id1}; - use crate::types::{Edge, Node}; - use std::collections::HashSet; - use tree_sitter::Parser; - - let Ok(source) = std::fs::read(path) else { - return; - }; - - let mut parser = Parser::new(); - if parser - .set_language(&tree_sitter_python::LANGUAGE.into()) - .is_err() - { - return; - } - let Some(tree) = parser.parse(&source, None) else { - return; - }; - - let stem = file_stem(path); - let str_path = path.to_string_lossy().into_owned(); - let file_nid = make_id1(&str_path); - let mut seen_ids: HashSet = result.nodes.iter().map(|n| n.id.clone()).collect(); - - let add_rationale = |text: &str, - line: u32, - parent_nid: &str, - seen: &mut HashSet, - nodes: &mut Vec, - edges: &mut Vec| { - let label: String = text - .chars() - .take(80) - .collect::() - .replace("\r\n", " ") - .replace(['\r', '\n'], " ") - .trim() - .to_string(); - let rid = make_id(&[&stem, "rationale", &line.to_string()]); - if seen.insert(rid.clone()) { - nodes.push(Node { - id: rid.clone(), - label, - file_type: "rationale".to_string(), - source_file: str_path.clone(), - source_location: Some(format!("L{line}")), - metadata: None, - }); - } - edges.push(Edge { - external: false, - source: rid, - target: parent_nid.to_string(), - relation: "rationale_for".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: str_path.clone(), - source_location: Some(format!("L{line}")), - weight: 1.0, - context: None, - confidence_score: None, - }); - }; - - // Module-level docstring — 
skipped for auto-generated files (Alembic / - // Django migrations, protobuf stubs) whose module docstrings are revision - // annotations, not architectural rationale. Class/function docstrings and - // `# NOTE:`-style comments below are still extracted (Python parity). - let root = tree.root_node(); - if !is_autogenerated_python(&source) - && let Some((doc, line)) = get_docstring(root, &source) - { - add_rationale( - &doc, - line, - &file_nid, - &mut seen_ids, - &mut result.nodes, - &mut result.edges, - ); - } - - // Walk class/function docstrings - { - let mut doc_ctx = DocstringWalkCtx { - stem: &stem, - file_nid: &file_nid, - seen_ids: &mut seen_ids, - nodes: &mut result.nodes, - edges: &mut result.edges, - add_rationale: &add_rationale, - }; - walk_docstrings(&mut doc_ctx, root, &file_nid, &source); - } - - // Rationale comments - let source_text = String::from_utf8_lossy(&source).into_owned(); - for (lineno, line_text) in source_text.lines().enumerate() { - let stripped = line_text.trim(); - if RATIONALE_PREFIXES.iter().any(|p| stripped.starts_with(p)) { - add_rationale( - stripped, - u32::try_from(lineno).unwrap_or(u32::MAX).saturating_add(1), - &file_nid, - &mut seen_ids, - &mut result.nodes, - &mut result.edges, - ); - } - } -} - -/// Return `true` when the Python source is auto-generated and should not have rationale extracted. -/// -/// Checks the first 2048 bytes for `DO NOT EDIT`, `@generated`, or protobuf markers, and also -/// detects Alembic/Flask-Migrate migration files and Django migration classes. Mirrors Python -/// `_is_autogenerated`. 
-fn is_autogenerated_python(source: &[u8]) -> bool { - let head = String::from_utf8_lossy(&source[..source.len().min(2048)]).into_owned(); - if head.contains("DO NOT EDIT") - || head.contains("@generated") - || head.contains("Generated by the protocol buffer") - { - return true; - } - // Alembic / Flask-Migrate - if head.contains("def upgrade(") - && head.contains("down_revision") - && head.lines().any(|l| { - let t = l.trim(); - t.starts_with("revision") && (t.contains(':') || t.contains('=')) - }) - { - return true; - } - // Django migrations - if head.contains("class Migration(migrations.Migration)") && head.contains("operations") { - return true; - } - false -} - -/// Extract the first triple-quoted docstring from a Python AST node's first child. -/// -/// Looks for an `expression_statement` as the first child containing a `string` or -/// `concatenated_string` node; returns `(cleaned_text, line_number)` when the cleaned text -/// exceeds 20 characters (too-short strings are likely not real docstrings). 
-fn get_docstring(node: tree_sitter::Node<'_>, source: &[u8]) -> Option<(String, u32)> { - let mut cur = node.walk(); - if !cur.goto_first_child() { - return None; - } - let child = cur.node(); - if child.kind() == "expression_statement" { - let mut ecur = child.walk(); - if ecur.goto_first_child() { - loop { - let sub = ecur.node(); - if matches!(sub.kind(), "string" | "concatenated_string") { - let text = String::from_utf8_lossy(&source[sub.start_byte()..sub.end_byte()]) - .into_owned(); - let clean = text - .trim_matches('"') - .trim_matches('\'') - .trim_start_matches("\"\"\"") - .trim_end_matches("\"\"\"") - .trim_start_matches("'''") - .trim_end_matches("'''") - .trim() - .to_string(); - if clean.len() > 20 { - let row = child.start_position().row; - return Some(( - clean, - u32::try_from(row).unwrap_or(u32::MAX).saturating_add(1), - )); - } - } - if !ecur.goto_next_sibling() { - break; - } - } - } - } - None -} - -/// Recursively walk a Python AST node extracting docstrings from class and function bodies. -/// -/// For `class_definition` nodes, extracts the class body docstring and recurses into methods. -/// For `function_definition` nodes, extracts the function body docstring and stops recursing. -/// All other nodes are traversed without emitting rationale. Called by `extract_python_rationale`. -/// Shared state threaded through every [`walk_docstrings`] recursion. 
-struct DocstringWalkCtx<'a, F> -where - F: Fn( - &str, - u32, - &str, - &mut std::collections::HashSet, - &mut Vec, - &mut Vec, - ), -{ - stem: &'a str, - file_nid: &'a str, - seen_ids: &'a mut std::collections::HashSet, - nodes: &'a mut Vec, - edges: &'a mut Vec, - add_rationale: &'a F, -} - -fn walk_docstrings( - ctx: &mut DocstringWalkCtx<'_, F>, - node: tree_sitter::Node<'_>, - parent_nid: &str, - source: &[u8], -) where - F: Fn( - &str, - u32, - &str, - &mut std::collections::HashSet, - &mut Vec, - &mut Vec, - ), -{ - use crate::ids::make_id; - let t = node.kind(); - if t == "class_definition" { - if let Some(name_node) = node.child_by_field_name("name") { - let class_name = - String::from_utf8_lossy(&source[name_node.start_byte()..name_node.end_byte()]) - .into_owned(); - let nid = make_id(&[ctx.stem, &class_name]); - if let Some(body) = node.child_by_field_name("body") { - if let Some((doc, line)) = get_docstring(body, source) { - (ctx.add_rationale)(&doc, line, &nid, ctx.seen_ids, ctx.nodes, ctx.edges); - } - let mut cur = body.walk(); - if cur.goto_first_child() { - loop { - let child = cur.node(); - walk_docstrings(ctx, child, &nid, source); - if !cur.goto_next_sibling() { - break; - } - } - } - } - } - return; - } - if t == "function_definition" { - if let Some(name_node) = node.child_by_field_name("name") { - let func_name = - String::from_utf8_lossy(&source[name_node.start_byte()..name_node.end_byte()]) - .into_owned(); - let nid = if parent_nid == ctx.file_nid { - make_id(&[ctx.stem, &func_name]) - } else { - make_id(&[parent_nid, &func_name]) - }; - if let Some(body) = node.child_by_field_name("body") - && let Some((doc, line)) = get_docstring(body, source) - { - (ctx.add_rationale)(&doc, line, &nid, ctx.seen_ids, ctx.nodes, ctx.edges); - } - } - return; - } - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - let child = cur.node(); - walk_docstrings(ctx, child, parent_nid, source); - if !cur.goto_next_sibling() { - break; - } - } 
- } -} diff --git a/crates/graphify-extract/src/extractors/multi.rs b/crates/graphify-extract/src/extractors/multi.rs deleted file mode 100644 index b294ae1..0000000 --- a/crates/graphify-extract/src/extractors/multi.rs +++ /dev/null @@ -1,2541 +0,0 @@ -//! Multi-file extraction orchestrator. -//! -//! Mirrors Python `extract()` from `extract.py`: -//! - Per-file dispatch via extension (or `.blade.php` suffix) -//! - Cache integration (graphify-cache) -//! - Parallel extraction via rayon for large batches -//! - Cross-file Python import resolution -//! - Cross-file Java import resolution -//! - Cross-file `raw_call` resolution -//! - ID relativisation (absolute → project-relative) -//! - `source_file` field relativisation - -// Source file labels use lowercase extensions; case-insensitive comparison -// would misidentify e.g. ".PY" which does not exist in practice. -#![allow(clippy::case_sensitive_file_extension_comparisons)] - -use std::collections::{HashMap, HashSet}; -use std::path::{Path, PathBuf}; - -use rayon::prelude::*; -use serde_json::Value; - -use crate::extractors::{ - extract_apex, extract_astro, extract_bash, extract_blade, extract_c, extract_cpp, - extract_csharp, extract_csproj, extract_dart, extract_delphi_form, extract_dm, extract_dmf, - extract_dmi, extract_dmm, extract_elixir, extract_fortran, extract_go, extract_groovy, - extract_java, extract_js, extract_json, extract_julia, extract_kotlin, extract_lazarus_form, - extract_lazarus_package, extract_lua, extract_markdown, extract_mcp_config, extract_objc, - extract_package_manifest, extract_pascal, extract_php, extract_powershell, - extract_powershell_manifest, extract_python, extract_razor, extract_ruby, extract_rust, - extract_scala, extract_sln, extract_slnx, extract_sql, extract_svelte, extract_swift, - extract_terraform, extract_verilog, extract_zig, is_mcp_config_path, -}; -use crate::ids::make_id1; -use crate::import_handlers::make_edge; -use crate::types::{Edge, ExtractOutput, FileResult, 
Node, RawCall}; - -const PARALLEL_THRESHOLD: usize = 20; - -// ── Dispatch table ──────────────────────────────────────────────────────────── - -type ExtractFn = fn(&Path) -> FileResult; - -/// Return the per-language extractor function for a given file path, or `None` for unknown types. -/// -/// Blade templates are identified by the `.blade.php` suffix before the extension is checked, so -/// that `foo.blade.php` routes to `extract_blade` rather than `extract_php`. All other languages -/// are dispatched solely on the file extension. -fn get_extractor(path: &Path) -> Option { - // Blade templates: checked by suffix before extension - let name = path.file_name().map_or("", |n| n.to_str().unwrap_or("")); - if name.ends_with(".blade.php") { - return Some(extract_blade); - } - // MCP config files (.mcp.json, claude_desktop_config.json, ...) are routed - // by filename before generic .json dispatch so they get MCP-aware nodes - // (servers, commands, packages, env vars) instead of opaque JSON keys. - if is_mcp_config_path(path) { - return Some(extract_mcp_config); - } - // Package manifests (apm.yml/pyproject.toml/go.mod/pom.xml) -> a canonical - // package node + depends_on edges, by filename before generic suffix dispatch - // (#1377). apm.yml would otherwise be a .yml document handled by the LLM. 
- if graphify_detect::is_package_manifest_path(path) { - return Some(extract_package_manifest); - } - let ext = path.extension().and_then(|e| e.to_str()).unwrap_or(""); - match ext { - "py" => Some(extract_python), - "js" | "jsx" | "mjs" | "ts" | "tsx" | "vue" => Some(extract_js), - "go" => Some(extract_go), - "rs" => Some(extract_rust), - "java" => Some(extract_java), - "groovy" | "gradle" => Some(extract_groovy), - "c" | "h" => Some(extract_c), - "cpp" | "cc" | "cxx" | "hpp" => Some(extract_cpp), - "rb" => Some(extract_ruby), - "cs" => Some(extract_csharp), - "kt" | "kts" => Some(extract_kotlin), - "scala" => Some(extract_scala), - "php" => Some(extract_php), - "swift" => Some(extract_swift), - "lua" | "luau" | "toc" => Some(extract_lua), - "zig" => Some(extract_zig), - "ps1" | "psm1" => Some(extract_powershell), - "psd1" => Some(extract_powershell_manifest), - "ex" | "exs" => Some(extract_elixir), - "m" | "mm" => Some(extract_objc), - "jl" => Some(extract_julia), - "f" | "F" | "f90" | "F90" | "f95" | "F95" | "f03" | "F03" | "f08" | "F08" => { - Some(extract_fortran) - } - "svelte" => Some(extract_svelte), - "astro" => Some(extract_astro), - "dart" => Some(extract_dart), - "v" | "sv" | "svh" => Some(extract_verilog), - "sql" => Some(extract_sql), - "md" | "mdx" | "qmd" => Some(extract_markdown), - "pas" | "pp" | "dpr" | "dpk" | "lpr" | "inc" => Some(extract_pascal), - "dfm" => Some(extract_delphi_form), - "lfm" => Some(extract_lazarus_form), - "lpk" => Some(extract_lazarus_package), - "sh" | "bash" => Some(extract_bash), - "json" => Some(extract_json), - "dm" | "dme" => Some(extract_dm), - "dmi" => Some(extract_dmi), - "dmm" => Some(extract_dmm), - "dmf" => Some(extract_dmf), - "sln" => Some(extract_sln), - "slnx" => Some(extract_slnx), - "cls" | "trigger" => Some(extract_apex), - "tf" | "tfvars" | "hcl" => Some(extract_terraform), - "csproj" | "fsproj" | "vbproj" => Some(extract_csproj), - "razor" | "cshtml" => Some(extract_razor), - _ => None, - } -} - -// ── 
Cache helpers (thin wrappers around graphify-cache) ─────────────────────── - -/// Serialise a `FileResult` to a `serde_json::Value` suitable for caching. -/// -/// Converts nodes, edges, and `raw_calls` to JSON arrays. Used as the write side of the -/// graphify-cache pair; see `value_to_file_result` for the read side. -fn file_result_to_value(result: &FileResult) -> Value { - let nodes: Vec = result - .nodes - .iter() - .map(|n| serde_json::to_value(n).unwrap_or(Value::Null)) - .collect(); - let edges: Vec = result - .edges - .iter() - .map(|e| serde_json::to_value(e).unwrap_or(Value::Null)) - .collect(); - let raw_calls: Vec = result - .raw_calls - .iter() - .map(|rc| { - serde_json::json!({ - "caller_nid": rc.caller_nid, - "callee": rc.callee, - "is_member_call": rc.is_member_call, - "source_file": rc.source_file, - "source_location": rc.source_location, - "receiver": rc.receiver, - }) - }) - .collect(); - serde_json::json!({ - "nodes": nodes, - "edges": edges, - "raw_calls": raw_calls, - }) -} - -/// Deserialise a cached `serde_json::Value` back into a `FileResult`. -/// -/// Missing or malformed sub-fields silently fall back to empty `Vec`s. -/// Counterpart to `file_result_to_value`. 
-fn value_to_file_result(v: &Value) -> FileResult { - let nodes = v - .get("nodes") - .and_then(Value::as_array) - .map(|arr| { - arr.iter() - .filter_map(|n| serde_json::from_value::(n.clone()).ok()) - .collect() - }) - .unwrap_or_default(); - let edges = v - .get("edges") - .and_then(Value::as_array) - .map(|arr| { - arr.iter() - .filter_map(|e| serde_json::from_value::(e.clone()).ok()) - .collect() - }) - .unwrap_or_default(); - let raw_calls = v - .get("raw_calls") - .and_then(Value::as_array) - .map(|arr| { - arr.iter() - .filter_map(|rc| { - Some(RawCall { - caller_nid: rc.get("caller_nid")?.as_str()?.to_string(), - callee: rc.get("callee")?.as_str()?.to_string(), - is_member_call: rc - .get("is_member_call") - .and_then(Value::as_bool) - .unwrap_or(false), - source_file: rc - .get("source_file") - .and_then(Value::as_str) - .unwrap_or("") - .to_string(), - source_location: rc - .get("source_location") - .and_then(Value::as_str) - .unwrap_or("") - .to_string(), - // `receiver` (#1356) reads back as `None` when absent. - // Safe without a Swift cache bypass or schema-version - // check: the AST cache is namespaced by crate version - // (`cache/ast/v{version}/` via graphify-cache's - // EXTRACTOR_VERSION), so a pre-`receiver` entry sits - // under an older version dir `load_cached` never reads, - // invalidated by the version bump that shipped the field. - receiver: rc - .get("receiver") - .and_then(Value::as_str) - .map(str::to_string), - }) - }) - .collect() - }) - .unwrap_or_default(); - FileResult { - nodes, - edges, - raw_calls, - error: None, - } -} - -// ── Extract a single file (with cache) ─────────────────────────────────────── - -/// File suffixes whose per-file AST extraction is never cached: their cross-file -/// import resolution depends on sibling files that can appear or change between -/// runs, so a cached result would serve a stale (unresolved) import edge. -/// Mirrors Python `_JS_CACHE_BYPASS_SUFFIXES`. 
-const JS_CACHE_BYPASS_SUFFIXES: [&str; 7] = ["js", "jsx", "mjs", "ts", "tsx", "vue", "svelte"]; - -/// Extract a single file, returning a cached result when available. -/// -/// Looks up the on-disk AST cache first; on a miss, dispatches to the language-specific -/// extractor and writes the result back to the cache. Files with no matching extractor -/// return an empty `FileResult` rather than an error. -fn extract_single_file(path: &Path, effective_root: &Path) -> FileResult { - // JS/TS files bypass the AST cache so workspace/sibling import resolution is - // recomputed each run (#9a7dbfb): a result cached while a sibling was absent - // would otherwise pin a stale unresolved import edge. - let bypass_cache = path - .extension() - .and_then(|e| e.to_str()) - .is_some_and(|ext| JS_CACHE_BYPASS_SUFFIXES.contains(&ext)); - - if !bypass_cache && let Some(v) = graphify_cache::load_cached(path, effective_root, "ast") { - return value_to_file_result(&v); - } - - let Some(extractor) = get_extractor(path) else { - return FileResult { - nodes: vec![], - edges: vec![], - raw_calls: vec![], - error: None, - }; - }; - - let result = extractor(path); - if !bypass_cache && result.error.is_none() { - let v = file_result_to_value(&result); - // best-effort save; ignore failures - let _ = graphify_cache::save_cached(path, &v, effective_root, "ast"); - } - result -} - -// ── Cross-file Python import resolution helpers ─────────────────────────────── - -/// Recursively walk a Python AST collecting `from X import Y` statements. -/// -/// On finding an `import_from_statement`, resolves the source module to a known stem via -/// `bare_to_qualified`, then emits `uses` edges from each local class to each imported symbol -/// that is present in `stem_to_entities`. Mirrors Python `_walk_imports` from `extract.py`. -/// Shared state threaded through every [`walk_imports`] recursion. 
-struct ImportWalkCtx<'a> { - path: &'a Path, - stem_to_entities: &'a HashMap>, - bare_to_qualified: &'a HashMap, - local_classes: &'a [String], - str_path: &'a str, - new_edges: &'a mut Vec, -} - -#[allow(clippy::too_many_lines)] // linear dispatch over Python's import_from_statement variants -fn walk_imports(ctx: &mut ImportWalkCtx<'_>, node: tree_sitter::Node<'_>, source: &[u8]) { - if node.kind() == "import_from_statement" { - let mut target_fq: Option = None; - let mut past_import_kw = false; - let mut imported_names: Vec = Vec::new(); - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - let child = cur.node(); - if child.kind() == "relative_import" { - let mut rc = child.walk(); - if rc.goto_first_child() { - loop { - let sub = rc.node(); - if sub.kind() == "dotted_name" { - let raw = - std::str::from_utf8(&source[sub.start_byte()..sub.end_byte()]) - .unwrap_or(""); - let bare = raw.split('.').next_back().unwrap_or("").to_string(); - let candidate = ctx - .path - .parent() - .unwrap_or(ctx.path) - .join(format!("{bare}.py")); - target_fq = Some(crate::ids::file_stem(&candidate)); - break; - } - if !rc.goto_next_sibling() { - break; - } - } - } - break; - } - if child.kind() == "dotted_name" && target_fq.is_none() { - let raw = std::str::from_utf8(&source[child.start_byte()..child.end_byte()]) - .unwrap_or(""); - let bare = raw.split('.').next_back().unwrap_or(""); - target_fq = ctx.bare_to_qualified.get(bare).cloned(); - } - if child.kind() == "import" { - past_import_kw = true; - } else if past_import_kw { - if child.kind() == "dotted_name" { - imported_names.push( - std::str::from_utf8(&source[child.start_byte()..child.end_byte()]) - .unwrap_or("") - .to_string(), - ); - } else if child.kind() == "aliased_import" - && let Some(name_node) = child.child_by_field_name("name") - { - imported_names.push( - std::str::from_utf8( - &source[name_node.start_byte()..name_node.end_byte()], - ) - .unwrap_or("") - .to_string(), - ); - } - } - if 
!cur.goto_next_sibling() { - break; - } - } - } - - let Some(fq) = target_fq else { return }; - let Some(entities) = ctx.stem_to_entities.get(&fq) else { - return; - }; - let line = node.start_position().row + 1; - for name in &imported_names { - if let Some(tgt_nid) = entities.get(name) { - for src_class_nid in ctx.local_classes { - ctx.new_edges.push(Edge { - external: false, - source: src_class_nid.clone(), - target: tgt_nid.clone(), - relation: "uses".to_string(), - confidence: "INFERRED".to_string(), - source_file: ctx.str_path.to_string(), - source_location: Some(format!("L{line}")), - weight: 0.8, - context: None, - confidence_score: None, - }); - } - } - } - return; - } - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - walk_imports(ctx, cur.node(), source); - if !cur.goto_next_sibling() { - break; - } - } - } -} - -/// Recursively walk a Java AST collecting `import` declarations and resolving them to graph edges. -/// -/// On finding an `import_declaration`, extracts the class name (or second-to-last component for -/// static method imports), looks it up in `name_to_ids`, and emits `imports` edges from the -/// current file node to any matching class nodes. Wildcard imports (`.*`) are silently skipped. -/// Mirrors Python `_walk_java` from `extract.py`. 
-fn walk_java( - node: tree_sitter::Node<'_>, - source: &[u8], - file_nid: &str, - path: &Path, - name_to_ids: &HashMap>, - new_edges: &mut Vec, - seen_pairs: &mut std::collections::HashSet<(String, String)>, -) { - if node.kind() == "import_declaration" { - let raw = std::str::from_utf8(&source[node.start_byte()..node.end_byte()]) - .unwrap_or("") - .trim() - .to_string(); - let body = raw - .trim_start_matches("import") - .trim() - .trim_end_matches(';') - .trim() - .trim_start_matches("static ") - .trim() - .to_string(); - if body.ends_with(".*") { - return; - } - let parts: Vec<&str> = body.split('.').collect(); - if parts.is_empty() { - return; - } - let last = parts.last().copied().unwrap_or(""); - // If last part is lowercase, try second-to-last (method static import) - let class_name = if last.chars().next().is_some_and(char::is_lowercase) && parts.len() >= 2 - { - parts[parts.len() - 2] - } else { - last - }; - let at_line = node.start_position().row + 1; - for tgt_nid in name_to_ids.get(class_name).into_iter().flatten() { - if tgt_nid == file_nid { - continue; - } - let key = (file_nid.to_string(), tgt_nid.clone()); - if seen_pairs.insert(key) { - new_edges.push(Edge { - external: false, - source: file_nid.to_string(), - target: tgt_nid.clone(), - relation: "imports".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: path.to_string_lossy().into_owned(), - source_location: Some(format!("L{at_line}")), - weight: 1.0, - context: None, - confidence_score: Some(1.0), - }); - } - } - return; - } - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - walk_java( - cur.node(), - source, - file_nid, - path, - name_to_ids, - new_edges, - seen_pairs, - ); - if !cur.goto_next_sibling() { - break; - } - } - } -} - -// ── Cross-file Python import resolution ────────────────────────────────────── - -/// Emit `uses` edges connecting Python classes to the symbols they import from other files. 
-/// -/// Two-pass: first builds a map of (file-qualified-stem → label → nid) and -/// (bare stem → qualified stem); then re-parses each Python file to find -/// `from X import Y` statements and emit edges. Mirrors Python `_resolve_cross_file_imports`. -fn resolve_cross_file_python_imports(per_file: &[FileResult], paths: &[PathBuf]) -> Vec { - let mut probe = tree_sitter::Parser::new(); - if probe - .set_language(&tree_sitter_python::LANGUAGE.into()) - .is_err() - { - return vec![]; - } - drop(probe); - - let (stem_to_entities, bare_to_qualified) = build_python_symbol_maps(per_file); - let work: Vec<(&FileResult, &PathBuf)> = per_file.iter().zip(paths.iter()).collect(); - let init_parser = || -> tree_sitter::Parser { - let mut p = tree_sitter::Parser::new(); - let _ = p.set_language(&tree_sitter_python::LANGUAGE.into()); - p - }; - if work.len() >= PARALLEL_THRESHOLD { - work.par_iter() - .map_init(init_parser, |parser, (result, path)| { - python_per_file_edges(result, path, parser, &stem_to_entities, &bare_to_qualified) - }) - .reduce(Vec::new, |mut a, b| { - a.extend(b); - a - }) - } else { - let mut parser = init_parser(); - work.iter() - .flat_map(|(result, path)| { - python_per_file_edges( - result, - path, - &mut parser, - &stem_to_entities, - &bare_to_qualified, - ) - }) - .collect() - } -} - -/// Pass 1: build `(stem → {label → nid})` + `(bare stem → qualified stem)` maps. 
-fn build_python_symbol_maps( - per_file: &[FileResult], -) -> ( - HashMap>, - HashMap, -) { - use crate::ids::file_stem; - let mut stem_to_entities: HashMap> = HashMap::new(); - let mut bare_to_qualified: HashMap = HashMap::new(); - for result in per_file { - for node in &result.nodes { - if node.source_file.is_empty() { - continue; - } - let label = &node.label; - if label.is_empty() - || label.ends_with(')') - || label.to_lowercase().ends_with(".py") - || label.starts_with('_') - || node.file_type == "rationale" - { - continue; - } - let src_path = PathBuf::from(&node.source_file); - let fq_stem = file_stem(&src_path); - stem_to_entities - .entry(fq_stem.clone()) - .or_default() - .insert(label.clone(), node.id.clone()); - let bare = src_path - .file_stem() - .map_or(String::new(), |s| s.to_string_lossy().into_owned()); - bare_to_qualified.entry(bare).or_insert(fq_stem); - } - } - (stem_to_entities, bare_to_qualified) -} - -/// Pass 2: per-file Python parse + import-edge emission. -fn python_per_file_edges( - result: &FileResult, - path: &Path, - parser: &mut tree_sitter::Parser, - stem_to_entities: &HashMap>, - bare_to_qualified: &HashMap, -) -> Vec { - use crate::ids::file_stem; - let mut local_edges: Vec = Vec::new(); - let str_path = path.to_string_lossy().into_owned(); - let this_stem = file_stem(path); - let this_file_nid = make_id1(&str_path); - let local_classes: Vec = result - .nodes - .iter() - .filter(|n| { - n.source_file == str_path - && !n.label.ends_with(')') - && !n.label.to_lowercase().ends_with(".py") - && n.id != this_file_nid - && n.id != make_id1(&this_stem) - && n.file_type != "rationale" - }) - .map(|n| n.id.clone()) - .collect(); - if local_classes.is_empty() { - return local_edges; - } - let Ok(source) = std::fs::read(path) else { - return local_edges; - }; - let Some(tree) = parser.parse(&source, None) else { - return local_edges; - }; - let mut import_ctx = ImportWalkCtx { - path, - stem_to_entities, - bare_to_qualified, - 
local_classes: &local_classes, - str_path: &str_path, - new_edges: &mut local_edges, - }; - walk_imports(&mut import_ctx, tree.root_node(), &source); - local_edges -} - -// ── Cross-file Java import resolution ──────────────────────────────────────── - -/// Emit `imports` edges by resolving Java `import` statements across all extracted files. -/// -/// Two-pass: first builds a map of (class-name → [nid]) from all capitalised node labels; -/// then re-parses each `.java` file to find `import_declaration` nodes and emit edges. -/// Mirrors Python `_resolve_cross_file_java_imports`. -#[allow(clippy::too_many_lines)] -fn resolve_cross_file_java_imports(per_file: &[FileResult], paths: &[PathBuf]) -> Vec { - let mut parser = tree_sitter::Parser::new(); - if parser - .set_language(&tree_sitter_java::LANGUAGE.into()) - .is_err() - { - return vec![]; - } - - // Pass 1: class-name → [node_id] - let mut name_to_ids: HashMap> = HashMap::new(); - for result in per_file { - for node in &result.nodes { - let label = &node.label; - if label.is_empty() - || node.source_file.is_empty() - || label.ends_with(')') - || label.to_lowercase().ends_with(".java") - { - continue; - } - if !label - .chars() - .next() - .is_some_and(|c| c.is_alphabetic() && c.is_uppercase()) - { - continue; - } - name_to_ids - .entry(label.clone()) - .or_default() - .push(node.id.clone()); - } - } - - // Pass 2: resolve imports — fan out across Rayon. Per-file work is - // independent; we drop the seed parser and give each worker its own. - // `seen_pairs` is partitioned per-file (each thread accumulates its - // own pairs); the final dedupe runs sequentially after the parallel - // reduce so edge ordering matches the sequential implementation - // wherever it would have been preserved. 
- drop(parser); - - let init_parser = || -> tree_sitter::Parser { - let mut p = tree_sitter::Parser::new(); - let _ = p.set_language(&tree_sitter_java::LANGUAGE.into()); - p - }; - - let per_file_edges = |path: &PathBuf, parser: &mut tree_sitter::Parser| -> Vec { - let file_nid = make_id1(&path.to_string_lossy()); - let Ok(source) = std::fs::read(path) else { - return Vec::new(); - }; - let Some(tree) = parser.parse(&source, None) else { - return Vec::new(); - }; - let mut local_edges = Vec::new(); - let mut local_seen: std::collections::HashSet<(String, String)> = - std::collections::HashSet::new(); - walk_java( - tree.root_node(), - &source, - &file_nid, - path, - &name_to_ids, - &mut local_edges, - &mut local_seen, - ); - local_edges - }; - - let collected: Vec = if paths.len() >= PARALLEL_THRESHOLD { - paths - .par_iter() - .map_init(init_parser, |parser, path| per_file_edges(path, parser)) - .reduce(Vec::new, |mut a, b| { - a.extend(b); - a - }) - } else { - let mut parser = init_parser(); - paths - .iter() - .flat_map(|p| per_file_edges(p, &mut parser)) - .collect() - }; - - // Global dedupe: per-file `local_seen` only guards within a single - // file, but the original sequential code shared `seen_pairs` across - // every file. Recreate that property with a final pass over the - // merged Vec to drop later duplicates. - let mut new_edges: Vec = Vec::with_capacity(collected.len()); - let mut seen_pairs: std::collections::HashSet<(String, String)> = - std::collections::HashSet::new(); - for e in collected { - let key = (e.source.clone(), e.target.clone()); - if seen_pairs.insert(key) { - new_edges.push(e); - } - } - new_edges -} - -/// Result of cross-file JS/TS default-import resolution (#6dc23db). -struct JsDefaultResolution { - /// `imports` edges wiring an importer file node to the origin symbol of a - /// default export, even when the local binding is renamed. 
- edges: Vec, - /// `(caller_file_node_id, local_binding_lowercased) -> origin symbol node id`, - /// so a call through a renamed default-import binding (`import mk from - /// './foo'; mk()`) resolves to the origin during cross-file call resolution. - aliases: HashMap<(String, String), String>, -} - -/// The tree-sitter grammar for a JS/TS file, by extension (vue/others skipped). -fn js_grammar_for(path: &Path) -> Option { - match path.extension().and_then(|e| e.to_str()) { - Some("ts") => Some(tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into()), - Some("tsx") => Some(tree_sitter_typescript::LANGUAGE_TSX.into()), - Some("js" | "jsx" | "mjs" | "cjs") => Some(tree_sitter_javascript::LANGUAGE.into()), - _ => None, - } -} - -/// UTF-8 slice of a node's source span (empty on invalid UTF-8). -fn js_node_text<'a>(node: tree_sitter::Node<'_>, source: &'a [u8]) -> &'a str { - std::str::from_utf8(&source[node.start_byte()..node.end_byte()]).unwrap_or("") -} - -/// Local name of a default export, or `None` for an anonymous default. -/// -/// Handles `export default class Foo {}` / `export default function foo() {}` -/// (name on the `declaration` field) and `export default Foo` (identifier on -/// the `value` field). Mirrors graphify-py `_js_default_export_name`. -fn js_default_export_name(node: tree_sitter::Node<'_>, source: &[u8]) -> Option { - let mut c = node.walk(); - if !node.children(&mut c).any(|ch| ch.kind() == "default") { - return None; - } - if let Some(decl) = node.child_by_field_name("declaration") { - return decl - .child_by_field_name("name") - .map(|n| js_node_text(n, source).to_string()); - } - let value = node.child_by_field_name("value")?; - (value.kind() == "identifier").then(|| js_node_text(value, source).to_string()) -} - -/// Local binding of a default import — the `Foo` in `import Foo from './x'` -/// (also the leading binding of `import Foo, { Bar } from './x'`). Mirrors -/// graphify-py `_js_default_import_name`. 
-fn js_default_import_name(node: tree_sitter::Node<'_>, source: &[u8]) -> Option { - let mut c = node.walk(); - let clause = node - .children(&mut c) - .find(|ch| ch.kind() == "import_clause")?; - let mut cc = clause.walk(); - clause - .children(&mut cc) - .find(|sub| sub.kind() == "identifier") - .map(|id| js_node_text(id, source).to_string()) -} - -/// The source-module string literal (`'./x'`) of an import/export statement. -fn js_import_source(node: tree_sitter::Node<'_>, source: &[u8]) -> Option { - let mut c = node.walk(); - let s = node.children(&mut c).find(|ch| ch.kind() == "string")?; - Some( - js_node_text(s, source) - .trim_matches(|c| c == '\'' || c == '"' || c == '`' || c == ' ') - .to_string(), - ) -} - -/// A default import occurrence: `(file index, local binding, source string, line)`. -type JsDefaultImport = (usize, String, String, u32); - -/// Default-export names (by file index) and default imports gathered per file. -struct JsDefaultFacts { - export_name: HashMap, - imports: Vec, -} - -/// Parse each JS/TS file once, collecting its default-export name (by file -/// index) and its default imports. Files without a JS/TS grammar or that fail to -/// read/parse are skipped. 
-fn collect_js_default_facts(paths: &[PathBuf]) -> JsDefaultFacts { - let mut export_name: HashMap = HashMap::new(); - let mut imports: Vec = Vec::new(); - for (i, path) in paths.iter().enumerate() { - let Some(lang) = js_grammar_for(path) else { - continue; - }; - let mut parser = tree_sitter::Parser::new(); - if parser.set_language(&lang).is_err() { - continue; - } - let Ok(source) = std::fs::read(path) else { - continue; - }; - let Some(tree) = parser.parse(&source, None) else { - continue; - }; - let mut stack = vec![tree.root_node()]; - while let Some(node) = stack.pop() { - match node.kind() { - "export_statement" => { - if let Some(name) = js_default_export_name(node, &source) { - export_name.entry(i).or_insert(name); - } - } - "import_statement" => { - if let Some(local) = js_default_import_name(node, &source) - && let Some(src) = js_import_source(node, &source) - { - let line = u32::try_from(node.start_position().row) - .unwrap_or(0) - .saturating_add(1); - imports.push((i, local, src, line)); - } - } - _ => {} - } - let mut c = node.walk(); - stack.extend(node.children(&mut c)); - } - } - JsDefaultFacts { - export_name, - imports, - } -} - -/// Resolve JS/TS default imports to the origin symbol of the matching default -/// export across files (#6dc23db). -/// -/// graphify-py threads default imports/exports through its -/// `_collect_js_symbol_resolution_facts` pass; the Rust port resolves JS imports -/// per-file, so this adds the cross-file default case as a focused resolver -/// parallel to [`resolve_cross_file_python_imports`] / -/// [`resolve_cross_file_java_imports`]. Runs after id remapping so it works in -/// the final node-id space. `all_nodes` is the post-remap node set. 
-fn resolve_js_default_imports( - all_nodes: &[Node], - paths: &[PathBuf], - root: &Path, -) -> JsDefaultResolution { - use crate::ids::file_node_id; - - let file_nid_of = |path: &Path| -> String { - let rel = relativise_under_root(path, root).unwrap_or_else(|| path.to_path_buf()); - file_node_id(&rel) - }; - - // (file_node_id, normalised label) -> node id, so a default-export name - // resolves to the concrete symbol node in that file. The label is normalised - // the same way the call resolver normalises call labels (strip a trailing - // `()` and a leading `.`) so a function export (`makeFoo`, stored as the node - // label `makeFoo()`) still matches the bare export name. - let mut by_file_label: HashMap<(String, String), String> = HashMap::new(); - for n in all_nodes { - if n.source_file.is_empty() || n.label.is_empty() { - continue; - } - let sf = PathBuf::from(&n.source_file); - let file_nid = if sf.is_absolute() { - file_nid_of(&sf) - } else { - file_node_id(&sf) - }; - let label = n.label.trim_end_matches("()").trim_start_matches('.'); - if label.is_empty() { - continue; - } - by_file_label - .entry((file_nid, label.to_string())) - .or_insert_with(|| n.id.clone()); - } - - // Per file: default-export name + default imports. - let JsDefaultFacts { - export_name, - imports, - } = collect_js_default_facts(paths); - - // Match each canonicalised path to its index, so a resolved import target - // maps back to the file whose default export we recorded. 
- let mut idx_by_path: HashMap = HashMap::new(); - for (i, p) in paths.iter().enumerate() { - idx_by_path.entry(p.clone()).or_insert(i); - if let Ok(c) = p.canonicalize() { - idx_by_path.entry(c).or_insert(i); - } - } - - let mut edges = Vec::new(); - let mut aliases = HashMap::new(); - let mut seen: std::collections::HashSet<(String, String)> = std::collections::HashSet::new(); - for (imp_idx, local, raw, line) in imports { - let importer = &paths[imp_idx]; - let str_path = importer.to_string_lossy(); - let (_, resolved) = crate::generic::resolve_js_import_target(&raw, &str_path); - let Some(resolved) = resolved else { continue }; - let tgt_idx = idx_by_path - .get(&resolved) - .or_else(|| { - resolved - .canonicalize() - .ok() - .and_then(|c| idx_by_path.get(&c)) - }) - .copied(); - let Some(tgt_idx) = tgt_idx else { continue }; - let Some(name) = export_name.get(&tgt_idx) else { - continue; - }; - let tgt_file_nid = file_nid_of(&paths[tgt_idx]); - let Some(origin) = by_file_label.get(&(tgt_file_nid, name.clone())) else { - continue; - }; - let importer_nid = file_nid_of(importer); - if seen.insert((importer_nid.clone(), origin.clone())) { - edges.push(make_edge( - &importer_nid, - origin, - "imports", - Some("import"), - &str_path, - line, - )); - } - aliases.insert((importer_nid, local.to_lowercase()), origin.clone()); - } - - JsDefaultResolution { edges, aliases } -} - -/// Per-file JS/TS export/import specifier facts used to resolve barrel -/// re-export chains to their origin symbols (#barrel-resolution). Collected by -/// [`collect_js_reexport_facts`]. -#[derive(Default)] -struct JsReexportFile { - /// `export { S as P } from './x'` → `(public, source_raw, source_name)`. - reexports: Vec<(String, String, String)>, - /// `export * from './x'` → `source_raw`. - star_sources: Vec, - /// `export { L as P }` (no `from`) → `(public, local)`. - local_reexports: Vec<(String, String)>, - /// `export const X = …` → `X` (the public exported binding name). 
- exported_const_names: Vec, - /// `import { I as L } from './x'` → `local → (source_raw, imported)`. - named_imports: HashMap, - /// `const B = A` / `export const B = A` (bare-identifier RHS) → `alias → target`. - local_aliases: HashMap, - /// Named imports as consumer facts: `(local_binding, source_raw, imported, line)`. - consumer_imports: Vec<(String, String, String, u32)>, -} - -/// Extract `(name, alias)` from an `import_specifier` / `export_specifier`. -fn js_spec_name_alias( - spec: tree_sitter::Node<'_>, - source: &[u8], -) -> Option<(String, Option)> { - let name = spec.child_by_field_name("name").or_else(|| { - let mut c = spec.walk(); - spec.children(&mut c) - .find(|n| matches!(n.kind(), "identifier" | "property_identifier")) - })?; - let alias = spec - .child_by_field_name("alias") - .map(|a| js_node_text(a, source).to_string()); - Some((js_node_text(name, source).to_string(), alias)) -} - -/// Record `const B = A` bare-identifier aliases from a `lexical_declaration`. -fn collect_js_lexical_aliases(node: tree_sitter::Node<'_>, source: &[u8], f: &mut JsReexportFile) { - let mut cur = node.walk(); - for d in node.children(&mut cur) { - if d.kind() == "variable_declarator" - && let Some(name) = d.child_by_field_name("name") - && let Some(value) = d.child_by_field_name("value") - && value.kind() == "identifier" - { - f.local_aliases.insert( - js_node_text(name, source).to_string(), - js_node_text(value, source).to_string(), - ); - } - } -} - -/// Record named imports (`import { I as L } from './x'`) from an `import_statement`. 
-fn collect_js_import_stmt(node: tree_sitter::Node<'_>, source: &[u8], f: &mut JsReexportFile) { - let Some(src) = js_import_source(node, source) else { - return; - }; - let line = u32::try_from(node.start_position().row) - .unwrap_or(0) - .saturating_add(1); - let mut cur = node.walk(); - for child in node.children(&mut cur) { - if child.kind() != "import_clause" { - continue; - } - let mut cc = child.walk(); - for sub in child.children(&mut cc) { - if sub.kind() != "named_imports" { - continue; - } - let mut nc = sub.walk(); - for spec in sub.children(&mut nc) { - if spec.kind() == "import_specifier" - && let Some((name, alias)) = js_spec_name_alias(spec, source) - { - let local = alias.unwrap_or_else(|| name.clone()); - f.named_imports - .insert(local.clone(), (src.clone(), name.clone())); - f.consumer_imports.push((local, src.clone(), name, line)); - } - } - } - } -} - -/// Record re-exports / star re-exports / local re-exports / exported consts -/// from an `export_statement`. -fn collect_js_export_stmt(node: tree_sitter::Node<'_>, source: &[u8], f: &mut JsReexportFile) { - let src = js_import_source(node, source); - let mut cur = node.walk(); - let children: Vec> = node.children(&mut cur).collect(); - let export_clause = children - .iter() - .find(|c| c.kind() == "export_clause") - .copied(); - let has_namespace = children.iter().any(|c| c.kind() == "namespace_export"); - let lexical = children - .iter() - .find(|c| c.kind() == "lexical_declaration") - .copied(); - - if let Some(clause) = export_clause { - let mut cc = clause.walk(); - for spec in clause.children(&mut cc) { - if spec.kind() == "export_specifier" - && let Some((name, alias)) = js_spec_name_alias(spec, source) - { - let public = alias.unwrap_or_else(|| name.clone()); - match &src { - Some(s) => f.reexports.push((public, s.clone(), name)), - None => f.local_reexports.push((public, name)), - } - } - } - } else if let Some(s) = &src { - if !has_namespace { - f.star_sources.push(s.clone()); - } - } 
else if let Some(lex) = lexical { - collect_js_lexical_aliases(lex, source, f); - let mut lc = lex.walk(); - for d in lex.children(&mut lc) { - if d.kind() == "variable_declarator" - && let Some(nn) = d.child_by_field_name("name") - { - f.exported_const_names - .push(js_node_text(nn, source).to_string()); - } - } - } -} - -/// Parse each JS/TS file once, collecting its barrel re-export facts (indexed by -/// `paths` position). Files without a JS/TS grammar are recorded as empty. -fn collect_js_reexport_facts(paths: &[PathBuf]) -> Vec { - let mut out: Vec = Vec::with_capacity(paths.len()); - for path in paths { - let mut f = JsReexportFile::default(); - if let Some(lang) = js_grammar_for(path) - && let Ok(source) = std::fs::read(path) - { - let mut parser = tree_sitter::Parser::new(); - if parser.set_language(&lang).is_ok() - && let Some(tree) = parser.parse(&source, None) - { - let root = tree.root_node(); - let mut cur = root.walk(); - for stmt in root.children(&mut cur) { - match stmt.kind() { - "export_statement" => collect_js_export_stmt(stmt, &source, &mut f), - "import_statement" => collect_js_import_stmt(stmt, &source, &mut f), - "lexical_declaration" => collect_js_lexical_aliases(stmt, &source, &mut f), - _ => {} - } - } - } - } - out.push(f); - } - out -} - -/// Re-export chain resolver over the collected [`JsReexportFile`] facts. -struct ReexportResolver<'a> { - facts: &'a [JsReexportFile], - idx_by_path: &'a HashMap, - paths: &'a [PathBuf], - file_nids: &'a [String], - by_file_label: &'a HashMap<(String, String), String>, -} - -impl ReexportResolver<'_> { - /// `true` when `name` is declared as a real symbol node in file `idx`. - fn is_declared(&self, idx: usize, name: &str) -> bool { - self.by_file_label - .contains_key(&(self.file_nids[idx].clone(), name.to_string())) - } - - /// Resolve an import-source string (`'./x'`) to the `paths` index it targets. 
- fn resolve_src(&self, file_idx: usize, src_raw: &str) -> Option { - let str_path = self.paths[file_idx].to_string_lossy(); - let (_, resolved) = crate::generic::resolve_js_import_target(src_raw, &str_path); - let resolved = resolved?; - self.idx_by_path - .get(&resolved) - .or_else(|| { - resolved - .canonicalize() - .ok() - .and_then(|c| self.idx_by_path.get(&c)) - }) - .copied() - } - - /// Resolve `name` exported from file `file_idx` to its origin - /// `(file_idx, declared_name)`, following named/aliased/star re-exports, - /// local aliases, and named imports. `visited` guards against cycles. - fn resolve( - &self, - file_idx: usize, - name: &str, - visited: &mut HashSet<(usize, String)>, - ) -> Option<(usize, String)> { - if !visited.insert((file_idx, name.to_string())) { - return None; - } - let f = &self.facts[file_idx]; - for (public, src_raw, src_name) in &f.reexports { - if public == name - && let Some(tgt) = self.resolve_src(file_idx, src_raw) - && let Some(r) = self.resolve(tgt, src_name, visited) - { - return Some(r); - } - } - for (public, local) in &f.local_reexports { - if public == name - && local != name - && let Some(r) = self.resolve(file_idx, local, visited) - { - return Some(r); - } - } - if let Some(target) = f.local_aliases.get(name) - && let Some(r) = self.resolve(file_idx, target, visited) - { - return Some(r); - } - if let Some((src_raw, imported)) = f.named_imports.get(name) - && let Some(tgt) = self.resolve_src(file_idx, src_raw) - && let Some(r) = self.resolve(tgt, imported, visited) - { - return Some(r); - } - for src_raw in &f.star_sources { - if let Some(tgt) = self.resolve_src(file_idx, src_raw) - && let Some(r) = self.resolve(tgt, name, visited) - { - return Some(r); - } - } - if self.is_declared(file_idx, name) { - return Some((file_idx, name.to_string())); - } - None - } - - /// File→file `re_exports` edges for every barrel export that resolves to an - /// origin file other than the barrel itself. 
- fn reexport_edges(&self) -> Vec { - let mut edges = Vec::new(); - let mut seen: HashSet<(String, String)> = HashSet::new(); - for (idx, f) in self.facts.iter().enumerate() { - let barrel_nid = &self.file_nids[idx]; - let str_path = self.paths[idx].to_string_lossy(); - let publics = f - .reexports - .iter() - .map(|(p, _, _)| p) - .chain(f.local_reexports.iter().map(|(p, _)| p)) - .chain(f.exported_const_names.iter()); - for public in publics { - let mut visited = HashSet::new(); - if let Some((origin_idx, _)) = self.resolve(idx, public, &mut visited) - && origin_idx != idx - && seen.insert((barrel_nid.clone(), self.file_nids[origin_idx].clone())) - { - edges.push(make_edge( - barrel_nid, - &self.file_nids[origin_idx], - "re_exports", - Some("re-export"), - &str_path, - 1, - )); - } - } - for src_raw in &f.star_sources { - if let Some(tgt) = self.resolve_src(idx, src_raw) - && tgt != idx - && seen.insert((barrel_nid.clone(), self.file_nids[tgt].clone())) - { - edges.push(make_edge( - barrel_nid, - &self.file_nids[tgt], - "re_exports", - Some("re-export"), - &str_path, - 1, - )); - } - } - } - edges - } - - /// Consumer `imports` edges + call aliases for named imports that travel - /// through a barrel to an origin symbol in a different file. 
- fn consumer_import_edges(&self) -> (Vec, HashMap<(String, String), String>) { - let mut edges = Vec::new(); - let mut aliases: HashMap<(String, String), String> = HashMap::new(); - let mut seen: HashSet<(String, String)> = HashSet::new(); - for (idx, f) in self.facts.iter().enumerate() { - let consumer_nid = &self.file_nids[idx]; - let str_path = self.paths[idx].to_string_lossy(); - for (local, src_raw, imported, line) in &f.consumer_imports { - let Some(barrel_idx) = self.resolve_src(idx, src_raw) else { - continue; - }; - let mut visited = HashSet::new(); - let Some((origin_idx, origin_name)) = - self.resolve(barrel_idx, imported, &mut visited) - else { - continue; - }; - // origin == directly-imported file ⇒ plain import handled per-file. - if origin_idx == barrel_idx { - continue; - } - let Some(origin_sym) = self - .by_file_label - .get(&(self.file_nids[origin_idx].clone(), origin_name.clone())) - else { - continue; - }; - if seen.insert((consumer_nid.clone(), origin_sym.clone())) { - edges.push(make_edge( - consumer_nid, - origin_sym, - "imports", - Some("import"), - &str_path, - *line, - )); - } - aliases.insert( - (consumer_nid.clone(), local.to_lowercase()), - origin_sym.clone(), - ); - } - } - (edges, aliases) - } -} - -/// Resolve JS/TS named/aliased/star barrel re-export chains to their origin -/// symbols, emitting file→file `re_exports` edges, consumer→origin `imports` -/// edges, and call aliases (so a call through a barrel-imported binding targets -/// the origin symbol). Mirrors the observable output of graphify-py's -/// `_collect_js_symbol_resolution_facts` / `_apply_symbol_resolution_facts` -/// barrel handling, integrated with the existing per-file resolution. 
-fn resolve_js_reexport_imports( - all_nodes: &[Node], - paths: &[PathBuf], - root: &Path, -) -> JsDefaultResolution { - use crate::ids::file_node_id; - - let file_nid_of = |path: &Path| -> String { - let rel = relativise_under_root(path, root).unwrap_or_else(|| path.to_path_buf()); - file_node_id(&rel) - }; - let mut by_file_label: HashMap<(String, String), String> = HashMap::new(); - for n in all_nodes { - if n.source_file.is_empty() || n.label.is_empty() { - continue; - } - let sf = PathBuf::from(&n.source_file); - let file_nid = if sf.is_absolute() { - file_nid_of(&sf) - } else { - file_node_id(&sf) - }; - let label = n.label.trim_end_matches("()").trim_start_matches('.'); - if label.is_empty() { - continue; - } - by_file_label - .entry((file_nid, label.to_string())) - .or_insert_with(|| n.id.clone()); - } - - let facts = collect_js_reexport_facts(paths); - let mut idx_by_path: HashMap = HashMap::new(); - for (i, p) in paths.iter().enumerate() { - idx_by_path.entry(p.clone()).or_insert(i); - if let Ok(c) = p.canonicalize() { - idx_by_path.entry(c).or_insert(i); - } - } - let file_nids: Vec = paths.iter().map(|p| file_nid_of(p)).collect(); - let resolver = ReexportResolver { - facts: &facts, - idx_by_path: &idx_by_path, - paths, - file_nids: &file_nids, - by_file_label: &by_file_label, - }; - - let mut edges = resolver.reexport_edges(); - let (import_edges, aliases) = resolver.consumer_import_edges(); - edges.extend(import_edges); - - JsDefaultResolution { edges, aliases } -} - -/// `(module_raw, [(imported_name, local_or_public_name)])` from a Python -/// `import_from_statement` (alias-aware, unlike on-disk-only `python_imported_names`). 
-fn python_import_from_specs( - source: &[u8], - node: tree_sitter::Node<'_>, -) -> Option<(String, Vec<(String, String)>)> { - let module = node.child_by_field_name("module_name")?; - let module_raw = js_node_text(module, source).to_string(); - let mut specs = Vec::new(); - let mut past_import = false; - let mut cur = node.walk(); - for child in node.children(&mut cur) { - match child.kind() { - "import" => past_import = true, - "dotted_name" if past_import => { - let n = js_node_text(child, source).to_string(); - specs.push((n.clone(), n)); - } - "aliased_import" if past_import => { - if let Some(nn) = child.child_by_field_name("name") { - let imported = js_node_text(nn, source).to_string(); - let local = child - .child_by_field_name("alias") - .map_or_else(|| imported.clone(), |a| js_node_text(a, source).to_string()); - specs.push((imported, local)); - } - } - _ => {} - } - } - Some((module_raw, specs)) -} - -/// Candidate file paths a relative Python module reference can resolve to, -/// against `from_path`. A `.foo` reference can name either a module file -/// (`foo.py`) or a package (`foo/__init__.py`); `from . import x` names the -/// current package's `__init__.py`. Returns an empty list for a non-relative -/// module. The caller picks the first candidate present in the scan set. 
-fn python_relative_module_candidates(from_path: &Path, module_raw: &str) -> Vec { - if !module_raw.starts_with('.') { - return Vec::new(); - } - let dots = module_raw.len() - module_raw.trim_start_matches('.').len(); - let module_name = module_raw.trim_start_matches('.'); - let Some(mut base) = from_path.parent().map(Path::to_path_buf) else { - return Vec::new(); - }; - for _ in 0..dots.saturating_sub(1) { - let Some(parent) = base.parent() else { - return Vec::new(); - }; - base = parent.to_path_buf(); - } - if module_name.is_empty() { - return vec![base.join("__init__.py")]; - } - let rel = module_name.replace('.', "/"); - vec![ - base.join(format!("{rel}.py")), - base.join(&rel).join("__init__.py"), - ] -} - -/// Look up a path's `paths` index, falling back to its canonicalised form. -fn py_idx_of(idx_by_path: &HashMap, p: &Path) -> Option { - idx_by_path - .get(p) - .or_else(|| p.canonicalize().ok().and_then(|c| idx_by_path.get(&c))) - .copied() -} - -/// Parse a Python file, returning its source bytes + tree. -fn parse_python_file(path: &Path) -> Option<(Vec, tree_sitter::Tree)> { - let mut parser = tree_sitter::Parser::new(); - parser - .set_language(&tree_sitter_python::LANGUAGE.into()) - .ok()?; - let source = std::fs::read(path).ok()?; - let tree = parser.parse(&source, None)?; - Some((source, tree)) -} - -/// `(init_idx, public_name) → (origin_idx, origin_name)` package re-export map. -type PyPkgReexports = HashMap<(usize, String), (usize, String)>; - -/// Shared maps for Python package re-export resolution. -struct PyReexportResolver<'a> { - paths: &'a [PathBuf], - idx_by_path: &'a HashMap, - file_nids: &'a [String], - by_file_label: &'a HashMap<(String, String), String>, -} - -impl PyReexportResolver<'_> { - /// Scan every `__init__.py` for `from .sub import N as A`, building a - /// `(init_idx, public) → (origin_idx, origin_name)` map and emitting - /// file→file `re_exports` edges. 
- fn pkg_reexports(&self) -> (PyPkgReexports, Vec) { - let mut map: PyPkgReexports = HashMap::new(); - let mut edges = Vec::new(); - let mut seen: HashSet<(usize, usize)> = HashSet::new(); - for (idx, path) in self.paths.iter().enumerate() { - if path.file_name().and_then(|n| n.to_str()) != Some("__init__.py") { - continue; - } - let Some((source, tree)) = parse_python_file(path) else { - continue; - }; - let mut cur = tree.root_node().walk(); - for stmt in tree.root_node().children(&mut cur) { - if stmt.kind() != "import_from_statement" { - continue; - } - let Some((module_raw, specs)) = python_import_from_specs(&source, stmt) else { - continue; - }; - let Some(sub_idx) = python_relative_module_candidates(path, &module_raw) - .iter() - .find_map(|cand| py_idx_of(self.idx_by_path, cand)) - else { - continue; - }; - for (imported, public) in specs { - map.insert((idx, public), (sub_idx, imported)); - } - if seen.insert((idx, sub_idx)) { - edges.push(make_edge( - &self.file_nids[idx], - &self.file_nids[sub_idx], - "re_exports", - Some("re-export"), - &path.to_string_lossy(), - 1, - )); - } - } - } - (map, edges) - } - - /// Resolve each `from pkg import N` against the package re-export map, - /// emitting consumer→origin `imports` edges and call aliases. 
- fn consumer_edges( - &self, - pkg_reexports: &PyPkgReexports, - ) -> (Vec, HashMap<(String, String), String>) { - let mut edges = Vec::new(); - let mut aliases: HashMap<(String, String), String> = HashMap::new(); - let mut seen: HashSet<(usize, String)> = HashSet::new(); - for (idx, path) in self.paths.iter().enumerate() { - let str_path = path.to_string_lossy(); - let Some((source, tree)) = parse_python_file(path) else { - continue; - }; - let mut cur = tree.root_node().walk(); - for stmt in tree.root_node().children(&mut cur) { - if stmt.kind() != "import_from_statement" { - continue; - } - let Some((module_raw, specs)) = python_import_from_specs(&source, stmt) else { - continue; - }; - if module_raw.starts_with('.') { - continue; - } - let Some(pkg_dir) = - crate::import_handlers::resolve_python_package_dir(&module_raw, &str_path) - else { - continue; - }; - let Some(init_idx) = py_idx_of(self.idx_by_path, &pkg_dir.join("__init__.py")) - else { - continue; - }; - for (imported, local) in specs { - let Some((origin_idx, origin_name)) = pkg_reexports.get(&(init_idx, imported)) - else { - continue; - }; - let label = origin_name.trim_end_matches("()").trim_start_matches('.'); - let Some(origin_sym) = self - .by_file_label - .get(&(self.file_nids[*origin_idx].clone(), label.to_string())) - else { - continue; - }; - if seen.insert((idx, origin_sym.clone())) { - edges.push(make_edge( - &self.file_nids[idx], - origin_sym, - "imports", - Some("import"), - &str_path, - 1, - )); - } - aliases.insert( - (self.file_nids[idx].clone(), local.to_lowercase()), - origin_sym.clone(), - ); - } - } - } - (edges, aliases) - } -} - -/// Resolve Python package re-exports (`pkg/__init__.py` doing -/// `from .sub import Name as Alias`) so a consumer's `from pkg import Alias` -/// (and calls through it) target the origin symbol. Mirrors the observable -/// output of graphify-py's `_collect_python_symbol_resolution_facts`. 
-fn resolve_python_reexport_imports( - all_nodes: &[Node], - paths: &[PathBuf], - root: &Path, -) -> JsDefaultResolution { - use crate::ids::file_node_id; - - let file_nid_of = |path: &Path| -> String { - let rel = relativise_under_root(path, root).unwrap_or_else(|| path.to_path_buf()); - file_node_id(&rel) - }; - let mut by_file_label: HashMap<(String, String), String> = HashMap::new(); - for n in all_nodes { - if n.source_file.is_empty() || n.label.is_empty() { - continue; - } - let sf = PathBuf::from(&n.source_file); - let file_nid = if sf.is_absolute() { - file_nid_of(&sf) - } else { - file_node_id(&sf) - }; - let label = n.label.trim_end_matches("()").trim_start_matches('.'); - if !label.is_empty() { - by_file_label - .entry((file_nid, label.to_string())) - .or_insert_with(|| n.id.clone()); - } - } - let mut idx_by_path: HashMap = HashMap::new(); - for (i, p) in paths.iter().enumerate() { - idx_by_path.entry(p.clone()).or_insert(i); - if let Ok(c) = p.canonicalize() { - idx_by_path.entry(c).or_insert(i); - } - } - let file_nids: Vec = paths.iter().map(|p| file_nid_of(p)).collect(); - let resolver = PyReexportResolver { - paths, - idx_by_path: &idx_by_path, - file_nids: &file_nids, - by_file_label: &by_file_label, - }; - let (pkg_reexports, mut edges) = resolver.pkg_reexports(); - let (import_edges, aliases) = resolver.consumer_edges(&pkg_reexports); - edges.extend(import_edges); - JsDefaultResolution { edges, aliases } -} - -/// Relativise `path` against `root`, falling back to canonicalising the path -/// when a lexical strip fails (e.g. the path is relative, or differs from -/// `root` only by a symlink such as macOS's `/var` → `/private/var`). -/// -/// Mirrors Python's `path.relative_to(root)` with its -/// `path.resolve().relative_to(root)` fallback. Returns `None` only when the -/// path is genuinely outside `root`. 
-#[must_use] -fn relativise_under_root(path: &Path, root: &Path) -> Option { - if let Ok(rel) = path.strip_prefix(root) { - return Some(rel.to_path_buf()); - } - path.canonicalize() - .ok() - .and_then(|c| c.strip_prefix(root).map(Path::to_path_buf).ok()) -} - -/// Recursively collect the `package` declaration and `import`s (simple name -> -/// FQN, capitalised type imports only) from a parsed Java file. Mirrors the -/// inner `walk` in Python `_resolve_java_type_references`. -fn collect_java_pkg_imports( - node: tree_sitter::Node<'_>, - source: &[u8], - pkg: &mut String, - imps: &mut HashMap, -) { - match node.kind() { - "package_declaration" => { - let txt = node.utf8_text(source).unwrap_or(""); - *pkg = txt - .trim() - .strip_prefix("package") - .unwrap_or(txt) - .trim() - .trim_end_matches(';') - .trim() - .to_string(); - } - "import_declaration" => { - let txt = node.utf8_text(source).unwrap_or(""); - let stripped = txt - .trim() - .strip_prefix("import") - .unwrap_or(txt) - .trim() - .trim_end_matches(';') - .trim(); - let body = stripped.strip_prefix("static ").map_or(stripped, str::trim); - if !body.ends_with(".*") - && body.contains('.') - && let Some(simple) = body.rsplit('.').next() - && !simple.is_empty() - && simple.chars().next().is_some_and(char::is_uppercase) - { - imps.insert(simple.to_string(), body.to_string()); - } - } - _ => {} - } - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - collect_java_pkg_imports(cur.node(), source, pkg, imps); - if !cur.goto_next_sibling() { - break; - } - } - } -} - -// Java edge relations re-pointed from shadow stubs to real defs by -// `resolve_java_type_references`. `imports` is included so a file-level import -// edge that also landed on the shadow stub gets re-pointed too, leaving the stub -// unreferenced (and dropped). External/stdlib imports never resolve, so their -// edges correctly stay on their stub. 
-const JAVA_REPOINT_RELATIONS: &[&str] = &["implements", "inherits", "extends", "imports"]; - -/// Re-point dangling Java `implements`/`inherits`/`extends`/`imports` edges that -/// bare-name resolution left on sourceless shadow stubs, using each referencing -/// file's `import` statements (then its package) to disambiguate same-named types -/// across packages (#1318). Drops shadow stubs no edge references anymore. -/// -/// Mirrors Python `_resolve_java_type_references`. Runs after id-disambiguation -/// and `rewire_unique_stub_nodes` (so it only handles the ambiguous remainder), -/// in the final node-id space; keyed by the absolute `source_file` strings the -/// nodes/edges still carry before the closing relativisation pass. -fn resolve_java_type_references( - java_paths: &[PathBuf], - all_nodes: &mut Vec, - all_edges: &mut [Edge], -) { - let mut parser = tree_sitter::Parser::new(); - if parser - .set_language(&tree_sitter_java::LANGUAGE.into()) - .is_err() - { - return; - } - let mut pkg_by_file: HashMap = HashMap::new(); - let mut imports_by_file: HashMap> = HashMap::new(); - for path in java_paths { - let Ok(source) = std::fs::read(path) else { - continue; - }; - let Some(tree) = parser.parse(&source, None) else { - continue; - }; - let mut pkg = String::new(); - let mut imps: HashMap = HashMap::new(); - collect_java_pkg_imports(tree.root_node(), &source, &mut pkg, &mut imps); - let src = path.to_string_lossy().into_owned(); - pkg_by_file.insert(src.clone(), pkg); - imports_by_file.insert(src, imps); - } - - // FQN (`package.Class`) -> definition node id, for source-backed type-like defs. 
- let mut fqn_to_id: HashMap = HashMap::new(); - for n in all_nodes.iter() { - if n.label.is_empty() || n.source_file.is_empty() || n.id.is_empty() { - continue; - } - let Some(pkg) = pkg_by_file.get(&n.source_file) else { - continue; - }; - let first_upper = n.label.chars().next().is_some_and(char::is_uppercase); - if !first_upper || n.label.ends_with(')') || n.label.ends_with(".java") { - continue; - } - let fqn = if pkg.is_empty() { - n.label.clone() - } else { - format!("{pkg}.{}", n.label) - }; - fqn_to_id.entry(fqn).or_insert_with(|| n.id.clone()); - } - - // Bare shadow stubs: no source_file, capitalised (type-like) label. - let stub_label: HashMap = all_nodes - .iter() - .filter(|n| { - !n.id.is_empty() - && n.source_file.is_empty() - && n.label.chars().next().is_some_and(char::is_uppercase) - }) - .map(|n| (n.id.clone(), n.label.clone())) - .collect(); - if stub_label.is_empty() { - return; - } - - let mut repointed_from: std::collections::HashSet = std::collections::HashSet::new(); - for edge in all_edges.iter_mut() { - if !JAVA_REPOINT_RELATIONS.contains(&edge.relation.as_str()) { - continue; - } - let Some(label) = stub_label.get(&edge.target) else { - continue; - }; - let resolved: Option = { - let ref_file = edge.source_file.as_str(); - imports_by_file - .get(ref_file) - .and_then(|imps| imps.get(label)) - .and_then(|fqn| fqn_to_id.get(fqn)) - .or_else(|| { - // Same-package reference (no explicit import). - let pkg = pkg_by_file.get(ref_file).map_or("", String::as_str); - let fqn = if pkg.is_empty() { - label.clone() - } else { - format!("{pkg}.{label}") - }; - fqn_to_id.get(&fqn) - }) - .cloned() - }; - if let Some(r) = resolved - && r != edge.target - { - repointed_from.insert(std::mem::replace(&mut edge.target, r)); - } - } - if repointed_from.is_empty() { - return; - } - - // Drop shadow stubs that no edge references anymore. 
- let still_referenced: std::collections::HashSet<&str> = all_edges - .iter() - .flat_map(|e| [e.source.as_str(), e.target.as_str()]) - .collect(); - all_nodes - .retain(|n| !repointed_from.contains(&n.id) || still_referenced.contains(n.id.as_str())); -} - -/// `_is_type_like_definition`: a real type def (not a method, not a qualified or -/// decorated reference). Mirrors the Python predicate. -fn is_type_like_definition(node: &Node) -> bool { - let label = node.label.trim(); - !label.is_empty() - && !label.ends_with(')') - && !label.starts_with('.') - && !label.contains('.') - && node.file_type == "code" -} - -/// Re-parse a Swift file's AST into a `local name -> type name` table, from -/// property declarations (type annotation, else constructor inference) and -/// function parameters. Feeds [`resolve_swift_member_calls`]. Rebuilt by -/// re-parsing (like the Java type-reference pass) rather than threaded through a -/// `FileResult` sidecar. -fn collect_swift_type_table( - node: tree_sitter::Node<'_>, - source: &[u8], - table: &mut HashMap, -) { - use crate::generic::references::{ - RefRole, swift_collect_type_refs, swift_constructor_type, swift_property_name, - swift_property_type_node, - }; - match node.kind() { - "property_declaration" => { - let mut prop_type: Option = None; - if let Some(anno) = swift_property_type_node(node) { - let mut refs: Vec<(String, RefRole)> = Vec::new(); - swift_collect_type_refs(anno, source, false, &mut refs); - prop_type = refs - .into_iter() - .find(|(_, r)| *r == RefRole::Direct) - .map(|(n, _)| n); - } - if prop_type.is_none() { - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - if cur.node().kind() == "call_expression" - && let Some(ctor) = swift_constructor_type(cur.node(), source) - { - prop_type = Some(ctor); - break; - } - if !cur.goto_next_sibling() { - break; - } - } - } - } - if let (Some(name), Some(ty)) = (swift_property_name(node, source), prop_type) { - table.insert(name, ty); - } - } - 
"parameter" => { - if let Some(type_node) = node.child_by_field_name("type") { - let mut refs: Vec<(String, RefRole)> = Vec::new(); - swift_collect_type_refs(type_node, source, false, &mut refs); - if let Some((ty, _)) = refs.into_iter().find(|(_, r)| *r == RefRole::Direct) - && let Some(name_node) = node.child_by_field_name("name") - { - let pname = name_node.utf8_text(source).unwrap_or(""); - if !pname.is_empty() { - table.insert(pname.to_string(), ty); - } - } - } - } - _ => {} - } - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - collect_swift_type_table(cur.node(), source, table); - if !cur.goto_next_sibling() { - break; - } - } - } -} - -/// Resolve cross-file Swift member calls (`recv.method()`) to the receiver's -/// real type definition (#1356). The shared call pass drops every -/// `is_member_call` (a bare method name collides across the corpus); this pass -/// types the receiver via the file's local type table (or treats an upper-cased -/// receiver as a type itself), then emits an edge ONLY when the type name -/// resolves to exactly one definition (god-node guard). Everything it adds is -/// INFERRED (type inference, not an explicit import). 
-#[allow(clippy::too_many_lines)] // linear: re-parse type tables, build indexes, resolve each member call -fn resolve_swift_member_calls( - swift_paths: &[PathBuf], - all_nodes: &[Node], - all_edges: &mut Vec, - all_raw_calls: &[RawCall], -) { - let mut parser = tree_sitter::Parser::new(); - if parser - .set_language(&tree_sitter_swift::LANGUAGE.into()) - .is_err() - { - return; - } - let mut type_table_by_file: HashMap> = HashMap::new(); - for path in swift_paths { - let Ok(source) = std::fs::read(path) else { - continue; - }; - let Some(tree) = parser.parse(&source, None) else { - continue; - }; - let mut table: HashMap = HashMap::new(); - collect_swift_type_table(tree.root_node(), &source, &mut table); - type_table_by_file.insert(path.to_string_lossy().into_owned(), table); - } - if type_table_by_file.is_empty() { - return; - } - - let key = |s: &str| -> String { - s.chars() - .filter(char::is_ascii_alphanumeric) - .collect::() - .to_lowercase() - }; - - // A genuine type is the target of a `contains` edge from its file; bare type - // references create same-label shadow nodes that are NOT contained, so this - // keeps a shadow from making a real type name look ambiguous. - let contained: std::collections::HashSet<&str> = all_edges - .iter() - .filter(|e| e.relation == "contains") - .map(|e| e.target.as_str()) - .collect(); - let mut type_def_nids: HashMap> = HashMap::new(); - let mut node_by_id: HashMap<&str, &Node> = HashMap::new(); - for n in all_nodes { - node_by_id.insert(n.id.as_str(), n); - if !n.source_file.is_empty() - && contained.contains(n.id.as_str()) - && is_type_like_definition(n) - { - type_def_nids - .entry(key(n.label.as_str())) - .or_default() - .push(n.id.clone()); - } - } - - // (type_node_id, method_key) -> method_node_id, from `method` edges. 
- let mut method_index: HashMap<(String, String), String> = HashMap::new(); - for e in all_edges.iter() { - if e.relation == "method" - && let Some(tnode) = node_by_id.get(e.target.as_str()) - { - method_index.insert( - (e.source.clone(), key(tnode.label.as_str())), - e.target.clone(), - ); - } - } - - let mut existing_pairs: std::collections::HashSet<(String, String)> = all_edges - .iter() - .map(|e| (e.source.clone(), e.target.clone())) - .collect(); - - let mut new_edges: Vec = Vec::new(); - for rc in all_raw_calls { - if !rc.is_member_call || rc.callee.is_empty() || rc.caller_nid.is_empty() { - continue; - } - let Some(receiver) = rc.receiver.as_deref() else { - continue; - }; - // An upper-cased receiver is itself a type (`Type.staticMethod()`, - // `Singleton.shared.x()`); otherwise look it up in the declaring file's - // local type table. - let type_name = if receiver.chars().next().is_some_and(char::is_uppercase) { - receiver.to_string() - } else if let Some(t) = type_table_by_file - .get(&rc.source_file) - .and_then(|tbl| tbl.get(receiver)) - { - t.clone() - } else { - continue; - }; - let type_nid = match type_def_nids.get(&key(type_name.as_str())) { - Some(defs) if defs.len() == 1 => &defs[0], - _ => continue, // ambiguous or absent -> god-node guard - }; - let (target, relation) = - match method_index.get(&(type_nid.clone(), key(rc.callee.as_str()))) { - Some(method) => (method.clone(), "calls"), - None => (type_nid.clone(), "references"), - }; - if target == rc.caller_nid - || existing_pairs.contains(&(rc.caller_nid.clone(), target.clone())) - { - continue; - } - existing_pairs.insert((rc.caller_nid.clone(), target.clone())); - new_edges.push(Edge { - external: false, - source: rc.caller_nid.clone(), - target, - relation: relation.to_string(), - confidence: "INFERRED".to_string(), - source_file: rc.source_file.clone(), - source_location: Some(rc.source_location.clone()), - weight: 1.0, - context: Some("call".to_string()), - confidence_score: Some(0.8), 
- }); - } - all_edges.extend(new_edges); -} - -// ── Main extract() ──────────────────────────────────────────────────────────── - -/// Extract AST nodes and edges from a list of code files. -/// -/// Two-pass process: -/// 1. Per-file structural extraction (classes, functions, imports) — parallel if ≥ 20 uncached -/// 2. Cross-file import + call resolution -#[must_use] -#[allow(clippy::too_many_lines)] -pub fn extract(paths: &[PathBuf], cache_root: Option<&Path>) -> ExtractOutput { - if paths.is_empty() { - return ExtractOutput { - nodes: vec![], - edges: vec![], - input_tokens: 0, - output_tokens: 0, - }; - } - - // Workspace package manifests/globs can change between repeated extractions - // (e.g. a new package added) or during `watch`; clear the cache so each run - // re-scans. Mirrors Python `extract()`'s `_WORKSPACE_PACKAGE_CACHE.clear()`. - crate::workspace::clear_workspace_cache(); - - // Infer common root for ID relativisation - let root: PathBuf = { - let inferred = if paths.len() == 1 { - paths[0] - .parent() - .map_or_else(|| PathBuf::from("."), PathBuf::from) - } else { - let min_parts = paths - .iter() - .map(|p| p.components().count()) - .min() - .unwrap_or(0); - let mut common_len = 0usize; - 'outer: for i in 0..min_parts { - let first = paths[0].components().nth(i); - for p in paths.iter().skip(1) { - if p.components().nth(i) != first { - break 'outer; - } - } - common_len = i + 1; - } - if common_len == 0 { - PathBuf::from(".") - } else { - paths[0].components().take(common_len).collect() - } - }; - // An explicit `cache_root` overrides the inferred prefix, matching - // Python's `if cache_root is not None: root = cache_root`. The root - // drives both cache keys and the #1033 file-node-id relativisation, so - // a divergence here splits AST/semantic file nodes apart. 
- let base = cache_root.map_or(inferred, Path::to_path_buf); - base.canonicalize().unwrap_or(base) - }; - - let effective_root: &Path = cache_root.unwrap_or(&root); - - // Phase 1: extract per file (cached or fresh) - let uncached_work: Vec<(usize, &PathBuf)> = paths - .iter() - .enumerate() - .filter(|(_, p)| get_extractor(p).is_some()) - .collect(); - - let mut per_file: Vec = paths.iter().map(|_| FileResult::default()).collect(); - - if uncached_work.len() >= PARALLEL_THRESHOLD { - // Parallel via rayon - let results: Vec<(usize, FileResult)> = uncached_work - .par_iter() - .map(|(idx, path)| (*idx, extract_single_file(path, effective_root))) - .collect(); - for (idx, result) in results { - per_file[idx] = result; - } - } else { - // Sequential - for (idx, path) in &uncached_work { - per_file[*idx] = extract_single_file(path, effective_root); - } - } - - // Cross-file Python import resolution — must run BEFORE per_file is - // drained into `all_*`, otherwise `resolve_cross_file_*` sees empty - // FileResults and emits no cross-module edges. 
- let mut cross_edges: Vec = Vec::new(); - let py_indices: Vec = paths - .iter() - .enumerate() - .filter(|(_, p)| p.extension().is_some_and(|e| e == "py")) - .map(|(i, _)| i) - .collect(); - if !py_indices.is_empty() { - let py_results: Vec = py_indices.iter().map(|&i| per_file[i].clone()).collect(); - let py_paths: Vec = py_indices.iter().map(|&i| paths[i].clone()).collect(); - cross_edges.extend(resolve_cross_file_python_imports(&py_results, &py_paths)); - } - - // Cross-file Java import resolution - let java_indices: Vec = paths - .iter() - .enumerate() - .filter(|(_, p)| p.extension().is_some_and(|e| e == "java")) - .map(|(i, _)| i) - .collect(); - if !java_indices.is_empty() { - let java_results: Vec = - java_indices.iter().map(|&i| per_file[i].clone()).collect(); - let java_paths: Vec = java_indices.iter().map(|&i| paths[i].clone()).collect(); - cross_edges.extend(resolve_cross_file_java_imports(&java_results, &java_paths)); - } - - let mut all_nodes: Vec = Vec::new(); - let mut all_edges: Vec = Vec::new(); - let mut all_raw_calls: Vec = Vec::new(); - - for result in &mut per_file { - all_nodes.append(&mut result.nodes); - all_edges.append(&mut result.edges); - all_raw_calls.append(&mut result.raw_calls); - } - all_edges.extend(cross_edges); - - // Remap absolute file-node IDs to the canonical `{parent_dir}_{stem}` spec - // form so (a) edge endpoints are stable across machines (#502) and (b) AST - // file nodes match the IDs semantic subagents generate (#1033). - let mut id_remap: HashMap = HashMap::new(); - // Symbol node IDs embed the file stem the extractor saw as a prefix. For a - // root-level file that stem picks up the absolute parent directory name, so - // a symbol becomes `_main_run` while the file node correctly - // relativises to `main` and the spec wants `main_run` — splitting the symbol - // into AST/semantic ghosts (#1096). 
Relativise the symbol prefix the same - // way, gated by `source_file` so two files sharing a prefix can't - // cross-contaminate. Keyed by the path string the extractor recorded in - // `source_file` → (old_prefix, new_prefix). - let mut prefix_remap: HashMap = HashMap::new(); - for path in paths { - let old_id = make_id1(&path.to_string_lossy()); - // Resolve relative-to-root; a lexical strip can fail (path is relative, or - // differs from `root` only by a symlink), so fall back to canonicalising — - // mirrors Python's `resolve().relative_to(root)` fallback. - let Some(rel) = relativise_under_root(path, &root) else { - continue; - }; - let new_id = crate::ids::file_node_id(&rel); - if old_id != new_id { - id_remap.insert(old_id, new_id.clone()); - } - // Import resolution (e.g. the pnpm `.`-package entry, #1083) canonicalises - // the resolved path, which on macOS rewrites `/tmp` → `/private/tmp`. That - // id differs from the input-path id keyed above, so an edge targeting the - // canonical spelling would dangle off the relativised file node. Map the - // canonical spelling to the same node so the resolved edge connects. 
- if let Ok(canon) = path.canonicalize() { - let canon_id = make_id1(&canon.to_string_lossy()); - if canon_id != new_id { - id_remap.entry(canon_id).or_insert_with(|| new_id.clone()); - } - } - let old_pref = crate::ids::file_node_id(path); - if old_pref != new_id { - prefix_remap.insert(path.to_string_lossy().into_owned(), (old_pref, new_id)); - } - } - if !id_remap.is_empty() { - for n in &mut all_nodes { - if let Some(new_id) = id_remap.get(&n.id) { - n.id = new_id.clone(); - } - } - for e in &mut all_edges { - if let Some(new_id) = id_remap.get(&e.source) { - e.source = new_id.clone(); - } - if let Some(new_id) = id_remap.get(&e.target) { - e.target = new_id.clone(); - } - } - } - if !prefix_remap.is_empty() { - let mut sym_remap: HashMap = HashMap::new(); - for n in &all_nodes { - if n.source_file.is_empty() { - continue; - } - // Package (#1377) and Swift module (#1327) anchor nodes carry a - // canonical name-keyed id (`pkg_` / the shared module id) that - // must stay identical across every manifest/file that references them, - // so they are exempt from the file-stem prefix remap. - if n.metadata - .as_ref() - .and_then(|m| m.get("type")) - .and_then(Value::as_str) - .is_some_and(|t| t == "package" || t == "module") - { - continue; - } - let Some((old_pref, new_pref)) = prefix_remap.get(&n.source_file) else { - continue; - }; - // IDs are make_id output (lowercase word chars + `_`), so slicing at - // a byte offset is always on a char boundary. 
- if n.id.len() > old_pref.len() - && n.id.starts_with(old_pref.as_str()) - && n.id.as_bytes()[old_pref.len()] == b'_' - { - let new_nid = format!("{new_pref}{}", &n.id[old_pref.len()..]); - if new_nid != n.id { - sym_remap.insert(n.id.clone(), new_nid); - } - } - } - if !sym_remap.is_empty() { - for n in &mut all_nodes { - if let Some(new_id) = sym_remap.get(&n.id) { - n.id = new_id.clone(); - } - } - for e in &mut all_edges { - if let Some(new_id) = sym_remap.get(&e.source) { - e.source = new_id.clone(); - } - if let Some(new_id) = sym_remap.get(&e.target) { - e.target = new_id.clone(); - } - } - // raw_calls carry caller_nid (a symbol id) consumed by the cross-file - // call pass below — rewrite it too or those edges dangle on a stale - // source (#1096). - for rc in &mut all_raw_calls { - if let Some(new_id) = sym_remap.get(&rc.caller_nid) { - rc.caller_nid = new_id.clone(); - } - } - } - } - - // Disambiguate node IDs that collide across two or more distinct - // source files (e.g. two `Program.cs` files in different directories). - // Runs before cross-file call resolution so the call resolver sees - // already-qualified IDs. - crate::postprocess::disambiguate_colliding_node_ids( - &mut all_nodes, - &mut all_edges, - &mut all_raw_calls, - &root, - ); - - // Rewire cross-language inheritance stub nodes (no `source_file`) onto - // a unique real definition with the same label. Drops the stub when - // the rewire succeeds. - crate::postprocess::rewire_unique_stub_nodes(&mut all_nodes, &mut all_edges); - - // Re-point dangling Java implements/inherits edges left on shadow stubs by - // bare-name resolution, using imports for exact-package disambiguation - // (#1318). After rewire_unique_stub_nodes so it only handles the ambiguous - // remainder; before the closing source_file relativisation so node/edge - // source_files still match the parsed Java file paths. 
- let java_type_paths: Vec = paths - .iter() - .filter(|p| p.extension().is_some_and(|e| e == "java")) - .cloned() - .collect(); - if !java_type_paths.is_empty() { - resolve_java_type_references(&java_type_paths, &mut all_nodes, &mut all_edges); - } - - // Collapse Swift `extension Foo` nodes onto the canonical `class Foo` - // declaration. Mirrors `_merge_swift_extensions` in graphify-py. - crate::postprocess::merge_swift_extensions(paths, &mut all_nodes, &mut all_edges); - - // Cross-file JS/TS default-import resolution (#6dc23db). Runs in the final - // node-id space (after remap/disambiguation); the `imports` edges feed the - // import-evidence index below and the aliases let calls through a renamed - // default binding resolve to the origin symbol. - let js_default = resolve_js_default_imports(&all_nodes, paths, &root); - all_edges.extend(js_default.edges); - let mut js_default_aliases = js_default.aliases; - // Cross-file JS/TS barrel re-export resolution: chain named/aliased/star - // re-exports (and local-alias re-exports) to the origin symbol so consumer - // imports + calls through a barrel target the real declaration. - let js_reexport = resolve_js_reexport_imports(&all_nodes, paths, &root); - all_edges.extend(js_reexport.edges); - js_default_aliases.extend(js_reexport.aliases); - // Cross-file Python package re-export resolution: `pkg/__init__.py` doing - // `from .sub import N as A` lets `from pkg import A` (and calls through it) - // target the origin symbol in `sub`. 
- let py_reexport = resolve_python_reexport_imports(&all_nodes, paths, &root); - all_edges.extend(py_reexport.edges); - js_default_aliases.extend(py_reexport.aliases); - - // Cross-file call resolution via raw_calls - // Build label → [nid] (skip rationale) - let mut global_label_to_nids: HashMap> = HashMap::new(); - for n in &all_nodes { - if n.file_type == "rationale" { - continue; - } - let normalised = n.label.trim_end_matches("()").trim_start_matches('.'); - if !normalised.is_empty() { - global_label_to_nids - .entry(normalised.to_lowercase()) - .or_default() - .push(n.id.clone()); - } - } - - // Import evidence indexes - let mut file_to_symbol_imports: HashMap> = - HashMap::new(); - let mut file_to_module_imports: HashMap> = - HashMap::new(); - for e in &all_edges { - if e.relation == "imports" { - file_to_symbol_imports - .entry(e.source.clone()) - .or_default() - .insert(e.target.clone()); - } else if e.relation == "imports_from" { - file_to_module_imports - .entry(e.source.clone()) - .or_default() - .insert(e.target.clone()); - } - } - - // Map node → file_nid - let mut nid_to_file_nid: HashMap = HashMap::new(); - for n in &all_nodes { - if n.source_file.is_empty() { - continue; - } - let sf_path = PathBuf::from(&n.source_file); - // Relativise the same way `id_remap` does so a symbol's file-nid matches - // its (relativised) file node id — including the canonicalise fallback - // for absolute paths that differ from `root` only by a symlink. Relative - // source paths are used verbatim (mirrors Python). 
- let sf_rel = if sf_path.is_absolute() { - relativise_under_root(&sf_path, &root).unwrap_or(sf_path) - } else { - sf_path - }; - nid_to_file_nid.insert(n.id.clone(), crate::ids::file_node_id(&sf_rel)); - } - - let mut existing_pairs: std::collections::HashSet<(String, String)> = all_edges - .iter() - .map(|e| (e.source.clone(), e.target.clone())) - .collect(); - - for rc in &all_raw_calls { - // No built-in pre-filter here: the per-language extractors already drop - // *unresolved* built-in calls at the source, so any raw_call that reaches - // this cross-file pass is a genuine unresolved symbol. Filtering on the - // name alone would wrongly suppress a project symbol that happens to - // share a built-in name and resolves uniquely below. - if rc.is_member_call { - continue; - } - let callee_key = rc.callee.to_lowercase(); - let caller = &rc.caller_nid; - let caller_file_nid = nid_to_file_nid.get(caller); - // A renamed default-import binding (`import mk from './foo'; mk()`) aliases - // the local name to the origin symbol; prefer that over global label - // matching, since the local name has no node of its own (#6dc23db). 
- let alias_tgt = - caller_file_nid.and_then(|f| js_default_aliases.get(&(f.clone(), callee_key.clone()))); - let candidates: Vec<&String> = match alias_tgt { - Some(t) => vec![t], - None => global_label_to_nids - .get(&callee_key) - .map_or_else(Vec::new, |v| v.iter().collect()), - }; - // Only resolve unambiguous matches - if candidates.len() != 1 { - continue; - } - let tgt = candidates[0]; - if tgt == caller { - continue; - } - let pair = (caller.clone(), tgt.clone()); - if existing_pairs.contains(&pair) { - continue; - } - - let tgt_file_nid = nid_to_file_nid.get(tgt); - let imported_symbols = caller_file_nid - .and_then(|f| file_to_symbol_imports.get(f)) - .is_some_and(|s| s.contains(tgt)); - let imported_module = caller_file_nid - .and_then(|f| file_to_module_imports.get(f)) - .zip(tgt_file_nid) - .is_some_and(|(m, cfn)| m.contains(cfn)); - let has_import_evidence = imported_symbols || imported_module; - - let (confidence, confidence_score) = if has_import_evidence { - ("EXTRACTED".to_string(), 1.0f64) - } else { - ("INFERRED".to_string(), 0.8f64) - }; - - existing_pairs.insert(pair); - all_edges.push(Edge { - external: false, - source: caller.clone(), - target: tgt.clone(), - relation: "calls".to_string(), - confidence, - source_file: rc.source_file.clone(), - source_location: Some(rc.source_location.clone()), - weight: 1.0, - context: Some("call".to_string()), - confidence_score: Some(confidence_score), - }); - } - - // Cross-file Swift member-call resolution (#1356): after the shared call pass - // (node ids and caller_nids final) and before source_file relativisation (the - // type-table re-parse keys on the absolute paths nodes/raw_calls still carry). 
- let swift_paths: Vec = paths - .iter() - .filter(|p| p.extension().is_some_and(|e| e == "swift")) - .cloned() - .collect(); - if !swift_paths.is_empty() { - resolve_swift_member_calls(&swift_paths, &all_nodes, &mut all_edges, &all_raw_calls); - } - - // Relativise source_file fields - for n in &mut all_nodes { - let sf_path = PathBuf::from(&n.source_file); - if sf_path.is_absolute() - && let Ok(rel) = sf_path.strip_prefix(&root) - { - n.source_file = rel.to_string_lossy().into_owned(); - } - } - for e in &mut all_edges { - let sf_path = PathBuf::from(&e.source_file); - if sf_path.is_absolute() - && let Ok(rel) = sf_path.strip_prefix(&root) - { - e.source_file = rel.to_string_lossy().into_owned(); - } - } - - // Convert to IndexMap for ordered serialisation. The per-item serde - // conversion is independent and dominates wall time on large corpora, - // so fan out via Rayon above the per-file threshold. - let to_indexmap = |v: Value| -> Option> { - if let Value::Object(m) = v { - Some(m.into_iter().collect()) - } else { - None - } - }; - let mut nodes_out: Vec> = - if all_nodes.len() >= PARALLEL_THRESHOLD { - all_nodes - .into_par_iter() - .filter_map(|n| serde_json::to_value(n).ok().and_then(to_indexmap)) - .collect() - } else { - all_nodes - .into_iter() - .filter_map(|n| serde_json::to_value(n).ok().and_then(to_indexmap)) - .collect() - }; - // Tag AST provenance so the incremental watch rebuild can distinguish - // AST-extracted nodes from semantic/LLM nodes. On a full re-extraction the - // watcher drops any AST-marked node missing from the fresh output even when - // its source file still exists (#1116/#1118). 
- for n in &mut nodes_out { - n.insert("_origin".to_string(), Value::String("ast".to_string())); - } - let edges_out: Vec> = if all_edges.len() >= PARALLEL_THRESHOLD - { - all_edges - .into_par_iter() - .filter_map(|e| serde_json::to_value(e).ok().and_then(to_indexmap)) - .collect() - } else { - all_edges - .into_iter() - .filter_map(|e| serde_json::to_value(e).ok().and_then(to_indexmap)) - .collect() - }; - - ExtractOutput { - nodes: nodes_out, - edges: edges_out, - input_tokens: 0, - output_tokens: 0, - } -} diff --git a/crates/graphify-extract/src/extractors/multi/cache.rs b/crates/graphify-extract/src/extractors/multi/cache.rs new file mode 100644 index 0000000..edb0870 --- /dev/null +++ b/crates/graphify-extract/src/extractors/multi/cache.rs @@ -0,0 +1,159 @@ +//! Per-file extraction cache helpers (thin wrappers around graphify-cache). +#![allow(clippy::case_sensitive_file_extension_comparisons)] + +use super::get_extractor; +use crate::types::{Edge, FileResult, Node, RawCall}; +use serde_json::Value; +use std::path::Path; + +/// Serialise a `FileResult` to a `serde_json::Value` suitable for caching. +/// +/// Converts nodes, edges, and `raw_calls` to JSON arrays. Used as the write side of the +/// graphify-cache pair; see `value_to_file_result` for the read side. 
+fn file_result_to_value(result: &FileResult) -> Value { + let nodes: Vec = result + .nodes + .iter() + .map(|n| serde_json::to_value(n).unwrap_or(Value::Null)) + .collect(); + let edges: Vec = result + .edges + .iter() + .map(|e| serde_json::to_value(e).unwrap_or(Value::Null)) + .collect(); + let raw_calls: Vec = result + .raw_calls + .iter() + .map(|rc| { + serde_json::json!({ + "caller_nid": rc.caller_nid, + "callee": rc.callee, + "is_member_call": rc.is_member_call, + "source_file": rc.source_file, + "source_location": rc.source_location, + "receiver": rc.receiver, + }) + }) + .collect(); + serde_json::json!({ + "nodes": nodes, + "edges": edges, + "raw_calls": raw_calls, + }) +} + +/// Deserialise a cached `serde_json::Value` back into a `FileResult`. +/// +/// Missing or malformed sub-fields silently fall back to empty `Vec`s. +/// Counterpart to `file_result_to_value`. +fn value_to_file_result(v: &Value) -> FileResult { + let nodes = v + .get("nodes") + .and_then(Value::as_array) + .map(|arr| { + arr.iter() + .filter_map(|n| serde_json::from_value::(n.clone()).ok()) + .collect() + }) + .unwrap_or_default(); + let edges = v + .get("edges") + .and_then(Value::as_array) + .map(|arr| { + arr.iter() + .filter_map(|e| serde_json::from_value::(e.clone()).ok()) + .collect() + }) + .unwrap_or_default(); + let raw_calls = v + .get("raw_calls") + .and_then(Value::as_array) + .map(|arr| { + arr.iter() + .filter_map(|rc| { + Some(RawCall { + caller_nid: rc.get("caller_nid")?.as_str()?.to_string(), + callee: rc.get("callee")?.as_str()?.to_string(), + is_member_call: rc + .get("is_member_call") + .and_then(Value::as_bool) + .unwrap_or(false), + source_file: rc + .get("source_file") + .and_then(Value::as_str) + .unwrap_or("") + .to_string(), + source_location: rc + .get("source_location") + .and_then(Value::as_str) + .unwrap_or("") + .to_string(), + // `receiver` (#1356) reads back as `None` when absent. 
+ // Safe without a Swift cache bypass or schema-version + // check: the AST cache is namespaced by crate version + // (`cache/ast/v{version}/` via graphify-cache's + // EXTRACTOR_VERSION), so a pre-`receiver` entry sits + // under an older version dir `load_cached` never reads, + // invalidated by the version bump that shipped the field. + receiver: rc + .get("receiver") + .and_then(Value::as_str) + .map(str::to_string), + }) + }) + .collect() + }) + .unwrap_or_default(); + FileResult { + nodes, + edges, + raw_calls, + error: None, + } +} + +// ── Extract a single file (with cache) ─────────────────────────────────────── + +/// File suffixes whose per-file AST extraction is never cached: their cross-file +/// import resolution depends on sibling files that can appear or change between +/// runs, so a cached result would serve a stale (unresolved) import edge. +/// Mirrors Python `_JS_CACHE_BYPASS_SUFFIXES`. +const JS_CACHE_BYPASS_SUFFIXES: [&str; 7] = ["js", "jsx", "mjs", "ts", "tsx", "vue", "svelte"]; + +/// Extract a single file, returning a cached result when available. +/// +/// Looks up the on-disk AST cache first; on a miss, dispatches to the language-specific +/// extractor and writes the result back to the cache. Files with no matching extractor +/// return an empty `FileResult` rather than an error. +pub(super) fn extract_single_file(path: &Path, effective_root: &Path) -> FileResult { + // JS/TS files bypass the AST cache so workspace/sibling import resolution is + // recomputed each run (#9a7dbfb): a result cached while a sibling was absent + // would otherwise pin a stale unresolved import edge. 
+ let bypass_cache = path + .extension() + .and_then(|e| e.to_str()) + .is_some_and(|ext| JS_CACHE_BYPASS_SUFFIXES.contains(&ext)); + + if !bypass_cache && let Some(v) = graphify_cache::load_cached(path, effective_root, "ast") { + return value_to_file_result(&v); + } + + let Some(extractor) = get_extractor(path) else { + return FileResult { + nodes: vec![], + edges: vec![], + raw_calls: vec![], + error: None, + }; + }; + + let result = extractor(path); + if !bypass_cache && result.error.is_none() { + let v = file_result_to_value(&result); + // best-effort save; ignore failures + let _ = graphify_cache::save_cached(path, &v, effective_root, "ast"); + } + result +} + +// ── Cross-file Python import resolution helpers ─────────────────────────────── diff --git a/crates/graphify-extract/src/extractors/multi/java.rs b/crates/graphify-extract/src/extractors/multi/java.rs new file mode 100644 index 0000000..d55bbe4 --- /dev/null +++ b/crates/graphify-extract/src/extractors/multi/java.rs @@ -0,0 +1,398 @@ +//! Cross-file Java import + type-reference resolution. +#![allow(clippy::case_sensitive_file_extension_comparisons)] + +use super::PARALLEL_THRESHOLD; +use crate::ids::make_id1; +use crate::types::{Edge, FileResult, Node}; +use rayon::prelude::*; +use std::collections::HashMap; +use std::path::{Path, PathBuf}; + +/// Recursively walk a Java AST collecting `import` declarations and resolving them to graph edges. +/// +/// On finding an `import_declaration`, extracts the class name (or second-to-last component for +/// static method imports), looks it up in `name_to_ids`, and emits `imports` edges from the +/// current file node to any matching class nodes. Wildcard imports (`.*`) are silently skipped. +/// Mirrors Python `_walk_java` from `extract.py`. 
+fn walk_java( + node: tree_sitter::Node<'_>, + source: &[u8], + file_nid: &str, + path: &Path, + name_to_ids: &HashMap>, + new_edges: &mut Vec, + seen_pairs: &mut std::collections::HashSet<(String, String)>, +) { + if node.kind() == "import_declaration" { + let raw = std::str::from_utf8(&source[node.start_byte()..node.end_byte()]) + .unwrap_or("") + .trim() + .to_string(); + let body = raw + .trim_start_matches("import") + .trim() + .trim_end_matches(';') + .trim() + .trim_start_matches("static ") + .trim() + .to_string(); + if body.ends_with(".*") { + return; + } + let parts: Vec<&str> = body.split('.').collect(); + if parts.is_empty() { + return; + } + let last = parts.last().copied().unwrap_or(""); + // If last part is lowercase, try second-to-last (method static import) + let class_name = if last.chars().next().is_some_and(char::is_lowercase) && parts.len() >= 2 + { + parts[parts.len() - 2] + } else { + last + }; + let at_line = node.start_position().row + 1; + for tgt_nid in name_to_ids.get(class_name).into_iter().flatten() { + if tgt_nid == file_nid { + continue; + } + let key = (file_nid.to_string(), tgt_nid.clone()); + if seen_pairs.insert(key) { + new_edges.push(Edge { + external: false, + source: file_nid.to_string(), + target: tgt_nid.clone(), + relation: "imports".to_string(), + confidence: "EXTRACTED".to_string(), + source_file: path.to_string_lossy().into_owned(), + source_location: Some(format!("L{at_line}")), + weight: 1.0, + context: None, + confidence_score: Some(1.0), + }); + } + } + return; + } + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + walk_java( + cur.node(), + source, + file_nid, + path, + name_to_ids, + new_edges, + seen_pairs, + ); + if !cur.goto_next_sibling() { + break; + } + } + } +} + +// ── Cross-file Python import resolution ────────────────────────────────────── + +/// Emit `imports` edges by resolving Java `import` statements across all extracted files. 
+/// +/// Two-pass: first builds a map of (class-name → [nid]) from all capitalised node labels; +/// then re-parses each `.java` file to find `import_declaration` nodes and emit edges. +/// Mirrors Python `_resolve_cross_file_java_imports`. +#[allow(clippy::too_many_lines)] +pub(super) fn resolve_cross_file_java_imports( + per_file: &[FileResult], + paths: &[PathBuf], +) -> Vec { + let mut parser = tree_sitter::Parser::new(); + if parser + .set_language(&tree_sitter_java::LANGUAGE.into()) + .is_err() + { + return vec![]; + } + + // Pass 1: class-name → [node_id] + let mut name_to_ids: HashMap> = HashMap::new(); + for result in per_file { + for node in &result.nodes { + let label = &node.label; + if label.is_empty() + || node.source_file.is_empty() + || label.ends_with(')') + || label.to_lowercase().ends_with(".java") + { + continue; + } + if !label + .chars() + .next() + .is_some_and(|c| c.is_alphabetic() && c.is_uppercase()) + { + continue; + } + name_to_ids + .entry(label.clone()) + .or_default() + .push(node.id.clone()); + } + } + + // Pass 2: resolve imports — fan out across Rayon. Per-file work is + // independent; we drop the seed parser and give each worker its own. + // `seen_pairs` is partitioned per-file (each thread accumulates its + // own pairs); the final dedupe runs sequentially after the parallel + // reduce so edge ordering matches the sequential implementation + // wherever it would have been preserved. 
+ drop(parser); + + let init_parser = || -> tree_sitter::Parser { + let mut p = tree_sitter::Parser::new(); + let _ = p.set_language(&tree_sitter_java::LANGUAGE.into()); + p + }; + + let per_file_edges = |path: &PathBuf, parser: &mut tree_sitter::Parser| -> Vec { + let file_nid = make_id1(&path.to_string_lossy()); + let Ok(source) = std::fs::read(path) else { + return Vec::new(); + }; + let Some(tree) = parser.parse(&source, None) else { + return Vec::new(); + }; + let mut local_edges = Vec::new(); + let mut local_seen: std::collections::HashSet<(String, String)> = + std::collections::HashSet::new(); + walk_java( + tree.root_node(), + &source, + &file_nid, + path, + &name_to_ids, + &mut local_edges, + &mut local_seen, + ); + local_edges + }; + + let collected: Vec = if paths.len() >= PARALLEL_THRESHOLD { + paths + .par_iter() + .map_init(init_parser, |parser, path| per_file_edges(path, parser)) + .reduce(Vec::new, |mut a, b| { + a.extend(b); + a + }) + } else { + let mut parser = init_parser(); + paths + .iter() + .flat_map(|p| per_file_edges(p, &mut parser)) + .collect() + }; + + // Global dedupe: per-file `local_seen` only guards within a single + // file, but the original sequential code shared `seen_pairs` across + // every file. Recreate that property with a final pass over the + // merged Vec to drop later duplicates. + let mut new_edges: Vec = Vec::with_capacity(collected.len()); + let mut seen_pairs: std::collections::HashSet<(String, String)> = + std::collections::HashSet::new(); + for e in collected { + let key = (e.source.clone(), e.target.clone()); + if seen_pairs.insert(key) { + new_edges.push(e); + } + } + new_edges +} + +/// Recursively collect the `package` declaration and `import`s (simple name -> +/// FQN, capitalised type imports only) from a parsed Java file. Mirrors the +/// inner `walk` in Python `_resolve_java_type_references`. 
+fn collect_java_pkg_imports( + node: tree_sitter::Node<'_>, + source: &[u8], + pkg: &mut String, + imps: &mut HashMap, +) { + match node.kind() { + "package_declaration" => { + let txt = node.utf8_text(source).unwrap_or(""); + *pkg = txt + .trim() + .strip_prefix("package") + .unwrap_or(txt) + .trim() + .trim_end_matches(';') + .trim() + .to_string(); + } + "import_declaration" => { + let txt = node.utf8_text(source).unwrap_or(""); + let stripped = txt + .trim() + .strip_prefix("import") + .unwrap_or(txt) + .trim() + .trim_end_matches(';') + .trim(); + let body = stripped.strip_prefix("static ").map_or(stripped, str::trim); + if !body.ends_with(".*") + && body.contains('.') + && let Some(simple) = body.rsplit('.').next() + && !simple.is_empty() + && simple.chars().next().is_some_and(char::is_uppercase) + { + imps.insert(simple.to_string(), body.to_string()); + } + } + _ => {} + } + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + collect_java_pkg_imports(cur.node(), source, pkg, imps); + if !cur.goto_next_sibling() { + break; + } + } + } +} + +// Java edge relations re-pointed from shadow stubs to real defs by +// `resolve_java_type_references`. `imports` is included so a file-level import +// edge that also landed on the shadow stub gets re-pointed too, leaving the stub +// unreferenced (and dropped). External/stdlib imports never resolve, so their +// edges correctly stay on their stub. +const JAVA_REPOINT_RELATIONS: &[&str] = &["implements", "inherits", "extends", "imports"]; + +/// Re-point dangling Java `implements`/`inherits`/`extends`/`imports` edges that +/// bare-name resolution left on sourceless shadow stubs, using each referencing +/// file's `import` statements (then its package) to disambiguate same-named types +/// across packages (#1318). Drops shadow stubs no edge references anymore. +/// +/// Mirrors Python `_resolve_java_type_references`. 
Runs after id-disambiguation +/// and `rewire_unique_stub_nodes` (so it only handles the ambiguous remainder), +/// in the final node-id space; keyed by the absolute `source_file` strings the +/// nodes/edges still carry before the closing relativisation pass. +pub(super) fn resolve_java_type_references( + java_paths: &[PathBuf], + all_nodes: &mut Vec, + all_edges: &mut [Edge], +) { + let mut parser = tree_sitter::Parser::new(); + if parser + .set_language(&tree_sitter_java::LANGUAGE.into()) + .is_err() + { + return; + } + let mut pkg_by_file: HashMap = HashMap::new(); + let mut imports_by_file: HashMap> = HashMap::new(); + for path in java_paths { + let Ok(source) = std::fs::read(path) else { + continue; + }; + let Some(tree) = parser.parse(&source, None) else { + continue; + }; + let mut pkg = String::new(); + let mut imps: HashMap = HashMap::new(); + collect_java_pkg_imports(tree.root_node(), &source, &mut pkg, &mut imps); + let src = path.to_string_lossy().into_owned(); + pkg_by_file.insert(src.clone(), pkg); + imports_by_file.insert(src, imps); + } + + // FQN (`package.Class`) -> definition node id, for source-backed type-like defs. + let mut fqn_to_id: HashMap = HashMap::new(); + for n in all_nodes.iter() { + if n.label.is_empty() || n.source_file.is_empty() || n.id.is_empty() { + continue; + } + let Some(pkg) = pkg_by_file.get(&n.source_file) else { + continue; + }; + let first_upper = n.label.chars().next().is_some_and(char::is_uppercase); + if !first_upper || n.label.ends_with(')') || n.label.ends_with(".java") { + continue; + } + let fqn = if pkg.is_empty() { + n.label.clone() + } else { + format!("{pkg}.{}", n.label) + }; + fqn_to_id.entry(fqn).or_insert_with(|| n.id.clone()); + } + + // Bare shadow stubs: no source_file, capitalised (type-like) label. 
+ let stub_label: HashMap = all_nodes + .iter() + .filter(|n| { + !n.id.is_empty() + && n.source_file.is_empty() + && n.label.chars().next().is_some_and(char::is_uppercase) + }) + .map(|n| (n.id.clone(), n.label.clone())) + .collect(); + if stub_label.is_empty() { + return; + } + + let mut repointed_from: std::collections::HashSet = std::collections::HashSet::new(); + for edge in all_edges.iter_mut() { + if !JAVA_REPOINT_RELATIONS.contains(&edge.relation.as_str()) { + continue; + } + let Some(label) = stub_label.get(&edge.target) else { + continue; + }; + let resolved: Option = { + let ref_file = edge.source_file.as_str(); + imports_by_file + .get(ref_file) + .and_then(|imps| imps.get(label)) + .and_then(|fqn| fqn_to_id.get(fqn)) + .or_else(|| { + // Same-package reference (no explicit import). + let pkg = pkg_by_file.get(ref_file).map_or("", String::as_str); + let fqn = if pkg.is_empty() { + label.clone() + } else { + format!("{pkg}.{label}") + }; + fqn_to_id.get(&fqn) + }) + .cloned() + }; + if let Some(r) = resolved + && r != edge.target + { + repointed_from.insert(std::mem::replace(&mut edge.target, r)); + } + } + if repointed_from.is_empty() { + return; + } + + // Drop shadow stubs that no edge references anymore. + let still_referenced: std::collections::HashSet<&str> = all_edges + .iter() + .flat_map(|e| [e.source.as_str(), e.target.as_str()]) + .collect(); + all_nodes + .retain(|n| !repointed_from.contains(&n.id) || still_referenced.contains(n.id.as_str())); +} + +/// `_is_type_like_definition`: a real type def (not a method, not a qualified or +/// decorated reference). Mirrors the Python predicate. 
+pub(super) fn is_type_like_definition(node: &Node) -> bool { + let label = node.label.trim(); + !label.is_empty() + && !label.ends_with(')') + && !label.starts_with('.') + && !label.contains('.') + && node.file_type == "code" +} diff --git a/crates/graphify-extract/src/extractors/multi/js.rs b/crates/graphify-extract/src/extractors/multi/js.rs new file mode 100644 index 0000000..1756509 --- /dev/null +++ b/crates/graphify-extract/src/extractors/multi/js.rs @@ -0,0 +1,639 @@ +//! Cross-file JS/TS default-import + barrel re-export resolution. +#![allow(clippy::case_sensitive_file_extension_comparisons)] + +use super::{JsDefaultResolution, relativise_under_root}; +use crate::import_handlers::make_edge; +use crate::types::{Edge, Node}; +use std::collections::{HashMap, HashSet}; +use std::path::{Path, PathBuf}; + +/// The tree-sitter grammar for a JS/TS file, by extension (vue/others skipped). +fn js_grammar_for(path: &Path) -> Option { + match path.extension().and_then(|e| e.to_str()) { + Some("ts") => Some(tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into()), + Some("tsx") => Some(tree_sitter_typescript::LANGUAGE_TSX.into()), + Some("js" | "jsx" | "mjs" | "cjs") => Some(tree_sitter_javascript::LANGUAGE.into()), + _ => None, + } +} + +/// UTF-8 slice of a node's source span (empty on invalid UTF-8). +pub(super) fn js_node_text<'a>(node: tree_sitter::Node<'_>, source: &'a [u8]) -> &'a str { + std::str::from_utf8(&source[node.start_byte()..node.end_byte()]).unwrap_or("") +} + +/// Local name of a default export, or `None` for an anonymous default. +/// +/// Handles `export default class Foo {}` / `export default function foo() {}` +/// (name on the `declaration` field) and `export default Foo` (identifier on +/// the `value` field). Mirrors graphify-py `_js_default_export_name`. 
+fn js_default_export_name(node: tree_sitter::Node<'_>, source: &[u8]) -> Option { + let mut c = node.walk(); + if !node.children(&mut c).any(|ch| ch.kind() == "default") { + return None; + } + if let Some(decl) = node.child_by_field_name("declaration") { + return decl + .child_by_field_name("name") + .map(|n| js_node_text(n, source).to_string()); + } + let value = node.child_by_field_name("value")?; + (value.kind() == "identifier").then(|| js_node_text(value, source).to_string()) +} + +/// Local binding of a default import — the `Foo` in `import Foo from './x'` +/// (also the leading binding of `import Foo, { Bar } from './x'`). Mirrors +/// graphify-py `_js_default_import_name`. +fn js_default_import_name(node: tree_sitter::Node<'_>, source: &[u8]) -> Option { + let mut c = node.walk(); + let clause = node + .children(&mut c) + .find(|ch| ch.kind() == "import_clause")?; + let mut cc = clause.walk(); + clause + .children(&mut cc) + .find(|sub| sub.kind() == "identifier") + .map(|id| js_node_text(id, source).to_string()) +} + +/// The source-module string literal (`'./x'`) of an import/export statement. +fn js_import_source(node: tree_sitter::Node<'_>, source: &[u8]) -> Option { + let mut c = node.walk(); + let s = node.children(&mut c).find(|ch| ch.kind() == "string")?; + Some( + js_node_text(s, source) + .trim_matches(|c| c == '\'' || c == '"' || c == '`' || c == ' ') + .to_string(), + ) +} + +/// A default import occurrence: `(file index, local binding, source string, line)`. +type JsDefaultImport = (usize, String, String, u32); + +/// Default-export names (by file index) and default imports gathered per file. +struct JsDefaultFacts { + export_name: HashMap, + imports: Vec, +} + +/// Parse each JS/TS file once, collecting its default-export name (by file +/// index) and its default imports. Files without a JS/TS grammar or that fail to +/// read/parse are skipped. 
+fn collect_js_default_facts(paths: &[PathBuf]) -> JsDefaultFacts { + let mut export_name: HashMap = HashMap::new(); + let mut imports: Vec = Vec::new(); + for (i, path) in paths.iter().enumerate() { + let Some(lang) = js_grammar_for(path) else { + continue; + }; + let mut parser = tree_sitter::Parser::new(); + if parser.set_language(&lang).is_err() { + continue; + } + let Ok(source) = std::fs::read(path) else { + continue; + }; + let Some(tree) = parser.parse(&source, None) else { + continue; + }; + let mut stack = vec![tree.root_node()]; + while let Some(node) = stack.pop() { + match node.kind() { + "export_statement" => { + if let Some(name) = js_default_export_name(node, &source) { + export_name.entry(i).or_insert(name); + } + } + "import_statement" => { + if let Some(local) = js_default_import_name(node, &source) + && let Some(src) = js_import_source(node, &source) + { + let line = u32::try_from(node.start_position().row) + .unwrap_or(0) + .saturating_add(1); + imports.push((i, local, src, line)); + } + } + _ => {} + } + let mut c = node.walk(); + stack.extend(node.children(&mut c)); + } + } + JsDefaultFacts { + export_name, + imports, + } +} + +/// Resolve JS/TS default imports to the origin symbol of the matching default +/// export across files (#6dc23db). +/// +/// graphify-py threads default imports/exports through its +/// `_collect_js_symbol_resolution_facts` pass; the Rust port resolves JS imports +/// per-file, so this adds the cross-file default case as a focused resolver +/// parallel to [`resolve_cross_file_python_imports`] / +/// [`resolve_cross_file_java_imports`]. Runs after id remapping so it works in +/// the final node-id space. `all_nodes` is the post-remap node set. 
+pub(super) fn resolve_js_default_imports( + all_nodes: &[Node], + paths: &[PathBuf], + root: &Path, +) -> JsDefaultResolution { + use crate::ids::file_node_id; + + let file_nid_of = |path: &Path| -> String { + let rel = relativise_under_root(path, root).unwrap_or_else(|| path.to_path_buf()); + file_node_id(&rel) + }; + + // (file_node_id, normalised label) -> node id, so a default-export name + // resolves to the concrete symbol node in that file. The label is normalised + // the same way the call resolver normalises call labels (strip a trailing + // `()` and a leading `.`) so a function export (`makeFoo`, stored as the node + // label `makeFoo()`) still matches the bare export name. + let mut by_file_label: HashMap<(String, String), String> = HashMap::new(); + for n in all_nodes { + if n.source_file.is_empty() || n.label.is_empty() { + continue; + } + let sf = PathBuf::from(&n.source_file); + let file_nid = if sf.is_absolute() { + file_nid_of(&sf) + } else { + file_node_id(&sf) + }; + let label = n.label.trim_end_matches("()").trim_start_matches('.'); + if label.is_empty() { + continue; + } + by_file_label + .entry((file_nid, label.to_string())) + .or_insert_with(|| n.id.clone()); + } + + // Per file: default-export name + default imports. + let JsDefaultFacts { + export_name, + imports, + } = collect_js_default_facts(paths); + + // Match each canonicalised path to its index, so a resolved import target + // maps back to the file whose default export we recorded. 
+ let mut idx_by_path: HashMap = HashMap::new(); + for (i, p) in paths.iter().enumerate() { + idx_by_path.entry(p.clone()).or_insert(i); + if let Ok(c) = p.canonicalize() { + idx_by_path.entry(c).or_insert(i); + } + } + + let mut edges = Vec::new(); + let mut aliases = HashMap::new(); + let mut seen: std::collections::HashSet<(String, String)> = std::collections::HashSet::new(); + for (imp_idx, local, raw, line) in imports { + let importer = &paths[imp_idx]; + let str_path = importer.to_string_lossy(); + let (_, resolved) = crate::generic::resolve_js_import_target(&raw, &str_path); + let Some(resolved) = resolved else { continue }; + let tgt_idx = idx_by_path + .get(&resolved) + .or_else(|| { + resolved + .canonicalize() + .ok() + .and_then(|c| idx_by_path.get(&c)) + }) + .copied(); + let Some(tgt_idx) = tgt_idx else { continue }; + let Some(name) = export_name.get(&tgt_idx) else { + continue; + }; + let tgt_file_nid = file_nid_of(&paths[tgt_idx]); + let Some(origin) = by_file_label.get(&(tgt_file_nid, name.clone())) else { + continue; + }; + let importer_nid = file_nid_of(importer); + if seen.insert((importer_nid.clone(), origin.clone())) { + edges.push(make_edge( + &importer_nid, + origin, + "imports", + Some("import"), + &str_path, + line, + )); + } + aliases.insert((importer_nid, local.to_lowercase()), origin.clone()); + } + + JsDefaultResolution { edges, aliases } +} + +/// Per-file JS/TS export/import specifier facts used to resolve barrel +/// re-export chains to their origin symbols (#barrel-resolution). Collected by +/// [`collect_js_reexport_facts`]. +#[derive(Default)] +struct JsReexportFile { + /// `export { S as P } from './x'` → `(public, source_raw, source_name)`. + reexports: Vec<(String, String, String)>, + /// `export * from './x'` → `source_raw`. + star_sources: Vec, + /// `export { L as P }` (no `from`) → `(public, local)`. + local_reexports: Vec<(String, String)>, + /// `export const X = …` → `X` (the public exported binding name). 
+ exported_const_names: Vec, + /// `import { I as L } from './x'` → `local → (source_raw, imported)`. + named_imports: HashMap, + /// `const B = A` / `export const B = A` (bare-identifier RHS) → `alias → target`. + local_aliases: HashMap, + /// Named imports as consumer facts: `(local_binding, source_raw, imported, line)`. + consumer_imports: Vec<(String, String, String, u32)>, +} + +/// Extract `(name, alias)` from an `import_specifier` / `export_specifier`. +fn js_spec_name_alias( + spec: tree_sitter::Node<'_>, + source: &[u8], +) -> Option<(String, Option)> { + let name = spec.child_by_field_name("name").or_else(|| { + let mut c = spec.walk(); + spec.children(&mut c) + .find(|n| matches!(n.kind(), "identifier" | "property_identifier")) + })?; + let alias = spec + .child_by_field_name("alias") + .map(|a| js_node_text(a, source).to_string()); + Some((js_node_text(name, source).to_string(), alias)) +} + +/// Record `const B = A` bare-identifier aliases from a `lexical_declaration`. +fn collect_js_lexical_aliases(node: tree_sitter::Node<'_>, source: &[u8], f: &mut JsReexportFile) { + let mut cur = node.walk(); + for d in node.children(&mut cur) { + if d.kind() == "variable_declarator" + && let Some(name) = d.child_by_field_name("name") + && let Some(value) = d.child_by_field_name("value") + && value.kind() == "identifier" + { + f.local_aliases.insert( + js_node_text(name, source).to_string(), + js_node_text(value, source).to_string(), + ); + } + } +} + +/// Record named imports (`import { I as L } from './x'`) from an `import_statement`. 
+fn collect_js_import_stmt(node: tree_sitter::Node<'_>, source: &[u8], f: &mut JsReexportFile) { + let Some(src) = js_import_source(node, source) else { + return; + }; + let line = u32::try_from(node.start_position().row) + .unwrap_or(0) + .saturating_add(1); + let mut cur = node.walk(); + for child in node.children(&mut cur) { + if child.kind() != "import_clause" { + continue; + } + let mut cc = child.walk(); + for sub in child.children(&mut cc) { + if sub.kind() != "named_imports" { + continue; + } + let mut nc = sub.walk(); + for spec in sub.children(&mut nc) { + if spec.kind() == "import_specifier" + && let Some((name, alias)) = js_spec_name_alias(spec, source) + { + let local = alias.unwrap_or_else(|| name.clone()); + f.named_imports + .insert(local.clone(), (src.clone(), name.clone())); + f.consumer_imports.push((local, src.clone(), name, line)); + } + } + } + } +} + +/// Record re-exports / star re-exports / local re-exports / exported consts +/// from an `export_statement`. +fn collect_js_export_stmt(node: tree_sitter::Node<'_>, source: &[u8], f: &mut JsReexportFile) { + let src = js_import_source(node, source); + let mut cur = node.walk(); + let children: Vec> = node.children(&mut cur).collect(); + let export_clause = children + .iter() + .find(|c| c.kind() == "export_clause") + .copied(); + let has_namespace = children.iter().any(|c| c.kind() == "namespace_export"); + let lexical = children + .iter() + .find(|c| c.kind() == "lexical_declaration") + .copied(); + + if let Some(clause) = export_clause { + let mut cc = clause.walk(); + for spec in clause.children(&mut cc) { + if spec.kind() == "export_specifier" + && let Some((name, alias)) = js_spec_name_alias(spec, source) + { + let public = alias.unwrap_or_else(|| name.clone()); + match &src { + Some(s) => f.reexports.push((public, s.clone(), name)), + None => f.local_reexports.push((public, name)), + } + } + } + } else if let Some(s) = &src { + if !has_namespace { + f.star_sources.push(s.clone()); + } + } 
else if let Some(lex) = lexical { + collect_js_lexical_aliases(lex, source, f); + let mut lc = lex.walk(); + for d in lex.children(&mut lc) { + if d.kind() == "variable_declarator" + && let Some(nn) = d.child_by_field_name("name") + { + f.exported_const_names + .push(js_node_text(nn, source).to_string()); + } + } + } +} + +/// Parse each JS/TS file once, collecting its barrel re-export facts (indexed by +/// `paths` position). Files without a JS/TS grammar are recorded as empty. +fn collect_js_reexport_facts(paths: &[PathBuf]) -> Vec { + let mut out: Vec = Vec::with_capacity(paths.len()); + for path in paths { + let mut f = JsReexportFile::default(); + if let Some(lang) = js_grammar_for(path) + && let Ok(source) = std::fs::read(path) + { + let mut parser = tree_sitter::Parser::new(); + if parser.set_language(&lang).is_ok() + && let Some(tree) = parser.parse(&source, None) + { + let root = tree.root_node(); + let mut cur = root.walk(); + for stmt in root.children(&mut cur) { + match stmt.kind() { + "export_statement" => collect_js_export_stmt(stmt, &source, &mut f), + "import_statement" => collect_js_import_stmt(stmt, &source, &mut f), + "lexical_declaration" => collect_js_lexical_aliases(stmt, &source, &mut f), + _ => {} + } + } + } + } + out.push(f); + } + out +} + +/// Re-export chain resolver over the collected [`JsReexportFile`] facts. +struct ReexportResolver<'a> { + facts: &'a [JsReexportFile], + idx_by_path: &'a HashMap, + paths: &'a [PathBuf], + file_nids: &'a [String], + by_file_label: &'a HashMap<(String, String), String>, +} + +impl ReexportResolver<'_> { + /// `true` when `name` is declared as a real symbol node in file `idx`. + fn is_declared(&self, idx: usize, name: &str) -> bool { + self.by_file_label + .contains_key(&(self.file_nids[idx].clone(), name.to_string())) + } + + /// Resolve an import-source string (`'./x'`) to the `paths` index it targets. 
+ fn resolve_src(&self, file_idx: usize, src_raw: &str) -> Option { + let str_path = self.paths[file_idx].to_string_lossy(); + let (_, resolved) = crate::generic::resolve_js_import_target(src_raw, &str_path); + let resolved = resolved?; + self.idx_by_path + .get(&resolved) + .or_else(|| { + resolved + .canonicalize() + .ok() + .and_then(|c| self.idx_by_path.get(&c)) + }) + .copied() + } + + /// Resolve `name` exported from file `file_idx` to its origin + /// `(file_idx, declared_name)`, following named/aliased/star re-exports, + /// local aliases, and named imports. `visited` guards against cycles. + fn resolve( + &self, + file_idx: usize, + name: &str, + visited: &mut HashSet<(usize, String)>, + ) -> Option<(usize, String)> { + if !visited.insert((file_idx, name.to_string())) { + return None; + } + let f = &self.facts[file_idx]; + for (public, src_raw, src_name) in &f.reexports { + if public == name + && let Some(tgt) = self.resolve_src(file_idx, src_raw) + && let Some(r) = self.resolve(tgt, src_name, visited) + { + return Some(r); + } + } + for (public, local) in &f.local_reexports { + if public == name + && local != name + && let Some(r) = self.resolve(file_idx, local, visited) + { + return Some(r); + } + } + if let Some(target) = f.local_aliases.get(name) + && let Some(r) = self.resolve(file_idx, target, visited) + { + return Some(r); + } + if let Some((src_raw, imported)) = f.named_imports.get(name) + && let Some(tgt) = self.resolve_src(file_idx, src_raw) + && let Some(r) = self.resolve(tgt, imported, visited) + { + return Some(r); + } + for src_raw in &f.star_sources { + if let Some(tgt) = self.resolve_src(file_idx, src_raw) + && let Some(r) = self.resolve(tgt, name, visited) + { + return Some(r); + } + } + if self.is_declared(file_idx, name) { + return Some((file_idx, name.to_string())); + } + None + } + + /// File→file `re_exports` edges for every barrel export that resolves to an + /// origin file other than the barrel itself. 
+ fn reexport_edges(&self) -> Vec { + let mut edges = Vec::new(); + let mut seen: HashSet<(String, String)> = HashSet::new(); + for (idx, f) in self.facts.iter().enumerate() { + let barrel_nid = &self.file_nids[idx]; + let str_path = self.paths[idx].to_string_lossy(); + let publics = f + .reexports + .iter() + .map(|(p, _, _)| p) + .chain(f.local_reexports.iter().map(|(p, _)| p)) + .chain(f.exported_const_names.iter()); + for public in publics { + let mut visited = HashSet::new(); + if let Some((origin_idx, _)) = self.resolve(idx, public, &mut visited) + && origin_idx != idx + && seen.insert((barrel_nid.clone(), self.file_nids[origin_idx].clone())) + { + edges.push(make_edge( + barrel_nid, + &self.file_nids[origin_idx], + "re_exports", + Some("re-export"), + &str_path, + 1, + )); + } + } + for src_raw in &f.star_sources { + if let Some(tgt) = self.resolve_src(idx, src_raw) + && tgt != idx + && seen.insert((barrel_nid.clone(), self.file_nids[tgt].clone())) + { + edges.push(make_edge( + barrel_nid, + &self.file_nids[tgt], + "re_exports", + Some("re-export"), + &str_path, + 1, + )); + } + } + } + edges + } + + /// Consumer `imports` edges + call aliases for named imports that travel + /// through a barrel to an origin symbol in a different file. 
+ fn consumer_import_edges(&self) -> (Vec, HashMap<(String, String), String>) { + let mut edges = Vec::new(); + let mut aliases: HashMap<(String, String), String> = HashMap::new(); + let mut seen: HashSet<(String, String)> = HashSet::new(); + for (idx, f) in self.facts.iter().enumerate() { + let consumer_nid = &self.file_nids[idx]; + let str_path = self.paths[idx].to_string_lossy(); + for (local, src_raw, imported, line) in &f.consumer_imports { + let Some(barrel_idx) = self.resolve_src(idx, src_raw) else { + continue; + }; + let mut visited = HashSet::new(); + let Some((origin_idx, origin_name)) = + self.resolve(barrel_idx, imported, &mut visited) + else { + continue; + }; + // origin == directly-imported file ⇒ plain import handled per-file. + if origin_idx == barrel_idx { + continue; + } + let Some(origin_sym) = self + .by_file_label + .get(&(self.file_nids[origin_idx].clone(), origin_name.clone())) + else { + continue; + }; + if seen.insert((consumer_nid.clone(), origin_sym.clone())) { + edges.push(make_edge( + consumer_nid, + origin_sym, + "imports", + Some("import"), + &str_path, + *line, + )); + } + aliases.insert( + (consumer_nid.clone(), local.to_lowercase()), + origin_sym.clone(), + ); + } + } + (edges, aliases) + } +} + +/// Resolve JS/TS named/aliased/star barrel re-export chains to their origin +/// symbols, emitting file→file `re_exports` edges, consumer→origin `imports` +/// edges, and call aliases (so a call through a barrel-imported binding targets +/// the origin symbol). Mirrors the observable output of graphify-py's +/// `_collect_js_symbol_resolution_facts` / `_apply_symbol_resolution_facts` +/// barrel handling, integrated with the existing per-file resolution. 
+pub(super) fn resolve_js_reexport_imports( + all_nodes: &[Node], + paths: &[PathBuf], + root: &Path, +) -> JsDefaultResolution { + use crate::ids::file_node_id; + + let file_nid_of = |path: &Path| -> String { + let rel = relativise_under_root(path, root).unwrap_or_else(|| path.to_path_buf()); + file_node_id(&rel) + }; + let mut by_file_label: HashMap<(String, String), String> = HashMap::new(); + for n in all_nodes { + if n.source_file.is_empty() || n.label.is_empty() { + continue; + } + let sf = PathBuf::from(&n.source_file); + let file_nid = if sf.is_absolute() { + file_nid_of(&sf) + } else { + file_node_id(&sf) + }; + let label = n.label.trim_end_matches("()").trim_start_matches('.'); + if label.is_empty() { + continue; + } + by_file_label + .entry((file_nid, label.to_string())) + .or_insert_with(|| n.id.clone()); + } + + let facts = collect_js_reexport_facts(paths); + let mut idx_by_path: HashMap = HashMap::new(); + for (i, p) in paths.iter().enumerate() { + idx_by_path.entry(p.clone()).or_insert(i); + if let Ok(c) = p.canonicalize() { + idx_by_path.entry(c).or_insert(i); + } + } + let file_nids: Vec = paths.iter().map(|p| file_nid_of(p)).collect(); + let resolver = ReexportResolver { + facts: &facts, + idx_by_path: &idx_by_path, + paths, + file_nids: &file_nids, + by_file_label: &by_file_label, + }; + + let mut edges = resolver.reexport_edges(); + let (import_edges, aliases) = resolver.consumer_import_edges(); + edges.extend(import_edges); + + JsDefaultResolution { edges, aliases } +} diff --git a/crates/graphify-extract/src/extractors/multi/mod.rs b/crates/graphify-extract/src/extractors/multi/mod.rs new file mode 100644 index 0000000..ec9493c --- /dev/null +++ b/crates/graphify-extract/src/extractors/multi/mod.rs @@ -0,0 +1,653 @@ +//! Multi-file extraction orchestrator. +//! +//! Mirrors Python `extract()` from `extract.py`: +//! - Per-file dispatch via extension (or `.blade.php` suffix) +//! - Cache integration (graphify-cache) +//! 
- Parallel extraction via rayon for large batches +//! - Cross-file Python import resolution +//! - Cross-file Java import resolution +//! - Cross-file `raw_call` resolution +//! - ID relativisation (absolute → project-relative) +//! - `source_file` field relativisation + +#![allow(clippy::case_sensitive_file_extension_comparisons)] + +mod cache; +mod java; +mod js; +mod python; +mod swift; + +use crate::extractors::{ + extract_apex, extract_astro, extract_bash, extract_blade, extract_c, extract_cpp, + extract_csharp, extract_csproj, extract_dart, extract_delphi_form, extract_dm, extract_dmf, + extract_dmi, extract_dmm, extract_elixir, extract_fortran, extract_go, extract_groovy, + extract_java, extract_js, extract_json, extract_julia, extract_kotlin, extract_lazarus_form, + extract_lazarus_package, extract_lua, extract_markdown, extract_mcp_config, extract_objc, + extract_package_manifest, extract_pascal, extract_php, extract_powershell, + extract_powershell_manifest, extract_python, extract_razor, extract_ruby, extract_rust, + extract_scala, extract_sln, extract_slnx, extract_sql, extract_svelte, extract_swift, + extract_terraform, extract_verilog, extract_zig, is_mcp_config_path, +}; +use crate::ids::make_id1; +use crate::types::{Edge, ExtractOutput, FileResult, Node, RawCall}; +use cache::extract_single_file; +use java::{resolve_cross_file_java_imports, resolve_java_type_references}; +use js::{resolve_js_default_imports, resolve_js_reexport_imports}; +use python::{resolve_cross_file_python_imports, resolve_python_reexport_imports}; +use rayon::prelude::*; +use serde_json::Value; +use std::collections::HashMap; +use std::path::{Path, PathBuf}; +use swift::resolve_swift_member_calls; + +const PARALLEL_THRESHOLD: usize = 20; + +// ── Dispatch table ──────────────────────────────────────────────────────────── + +type ExtractFn = fn(&Path) -> FileResult; + +/// Return the per-language extractor function for a given file path, or `None` for unknown types. 
+/// +/// Blade templates are identified by the `.blade.php` suffix before the extension is checked, so +/// that `foo.blade.php` routes to `extract_blade` rather than `extract_php`. All other languages +/// are dispatched solely on the file extension. +fn get_extractor(path: &Path) -> Option { + // Blade templates: checked by suffix before extension + let name = path.file_name().map_or("", |n| n.to_str().unwrap_or("")); + if name.ends_with(".blade.php") { + return Some(extract_blade); + } + // MCP config files (.mcp.json, claude_desktop_config.json, ...) are routed + // by filename before generic .json dispatch so they get MCP-aware nodes + // (servers, commands, packages, env vars) instead of opaque JSON keys. + if is_mcp_config_path(path) { + return Some(extract_mcp_config); + } + // Package manifests (apm.yml/pyproject.toml/go.mod/pom.xml) -> a canonical + // package node + depends_on edges, by filename before generic suffix dispatch + // (#1377). apm.yml would otherwise be a .yml document handled by the LLM. 
+ if graphify_detect::is_package_manifest_path(path) { + return Some(extract_package_manifest); + } + let ext = path.extension().and_then(|e| e.to_str()).unwrap_or(""); + match ext { + "py" => Some(extract_python), + "js" | "jsx" | "mjs" | "ts" | "tsx" | "vue" => Some(extract_js), + "go" => Some(extract_go), + "rs" => Some(extract_rust), + "java" => Some(extract_java), + "groovy" | "gradle" => Some(extract_groovy), + "c" | "h" => Some(extract_c), + "cpp" | "cc" | "cxx" | "hpp" => Some(extract_cpp), + "rb" => Some(extract_ruby), + "cs" => Some(extract_csharp), + "kt" | "kts" => Some(extract_kotlin), + "scala" => Some(extract_scala), + "php" => Some(extract_php), + "swift" => Some(extract_swift), + "lua" | "luau" | "toc" => Some(extract_lua), + "zig" => Some(extract_zig), + "ps1" | "psm1" => Some(extract_powershell), + "psd1" => Some(extract_powershell_manifest), + "ex" | "exs" => Some(extract_elixir), + "m" | "mm" => Some(extract_objc), + "jl" => Some(extract_julia), + "f" | "F" | "f90" | "F90" | "f95" | "F95" | "f03" | "F03" | "f08" | "F08" => { + Some(extract_fortran) + } + "svelte" => Some(extract_svelte), + "astro" => Some(extract_astro), + "dart" => Some(extract_dart), + "v" | "sv" | "svh" => Some(extract_verilog), + "sql" => Some(extract_sql), + "md" | "mdx" | "qmd" => Some(extract_markdown), + "pas" | "pp" | "dpr" | "dpk" | "lpr" | "inc" => Some(extract_pascal), + "dfm" => Some(extract_delphi_form), + "lfm" => Some(extract_lazarus_form), + "lpk" => Some(extract_lazarus_package), + "sh" | "bash" => Some(extract_bash), + "json" => Some(extract_json), + "dm" | "dme" => Some(extract_dm), + "dmi" => Some(extract_dmi), + "dmm" => Some(extract_dmm), + "dmf" => Some(extract_dmf), + "sln" => Some(extract_sln), + "slnx" => Some(extract_slnx), + "cls" | "trigger" => Some(extract_apex), + "tf" | "tfvars" | "hcl" => Some(extract_terraform), + "csproj" | "fsproj" | "vbproj" => Some(extract_csproj), + "razor" | "cshtml" => Some(extract_razor), + _ => None, + } +} + +// ── 
Cache helpers (thin wrappers around graphify-cache) ─────────────────────── + +/// Result of cross-file JS/TS default-import resolution (#6dc23db). +struct JsDefaultResolution { + /// `imports` edges wiring an importer file node to the origin symbol of a + /// default export, even when the local binding is renamed. + edges: Vec, + /// `(caller_file_node_id, local_binding_lowercased) -> origin symbol node id`, + /// so a call through a renamed default-import binding (`import mk from + /// './foo'; mk()`) resolves to the origin during cross-file call resolution. + aliases: HashMap<(String, String), String>, +} + +/// Relativise `path` against `root`, falling back to canonicalising the path +/// when a lexical strip fails (e.g. the path is relative, or differs from +/// `root` only by a symlink such as macOS's `/var` → `/private/var`). +/// +/// Mirrors Python's `path.relative_to(root)` with its +/// `path.resolve().relative_to(root)` fallback. Returns `None` only when the +/// path is genuinely outside `root`. +#[must_use] +fn relativise_under_root(path: &Path, root: &Path) -> Option { + if let Ok(rel) = path.strip_prefix(root) { + return Some(rel.to_path_buf()); + } + path.canonicalize() + .ok() + .and_then(|c| c.strip_prefix(root).map(Path::to_path_buf).ok()) +} + +/// Extract AST nodes and edges from a list of code files. +/// +/// Two-pass process: +/// 1. Per-file structural extraction (classes, functions, imports) — parallel if ≥ 20 uncached +/// 2. Cross-file import + call resolution +#[must_use] +#[allow(clippy::too_many_lines)] +pub fn extract(paths: &[PathBuf], cache_root: Option<&Path>) -> ExtractOutput { + if paths.is_empty() { + return ExtractOutput { + nodes: vec![], + edges: vec![], + input_tokens: 0, + output_tokens: 0, + }; + } + + // Workspace package manifests/globs can change between repeated extractions + // (e.g. a new package added) or during `watch`; clear the cache so each run + // re-scans. 
Mirrors Python `extract()`'s `_WORKSPACE_PACKAGE_CACHE.clear()`. + crate::workspace::clear_workspace_cache(); + + // Infer common root for ID relativisation + let root: PathBuf = { + let inferred = if paths.len() == 1 { + paths[0] + .parent() + .map_or_else(|| PathBuf::from("."), PathBuf::from) + } else { + let min_parts = paths + .iter() + .map(|p| p.components().count()) + .min() + .unwrap_or(0); + let mut common_len = 0usize; + 'outer: for i in 0..min_parts { + let first = paths[0].components().nth(i); + for p in paths.iter().skip(1) { + if p.components().nth(i) != first { + break 'outer; + } + } + common_len = i + 1; + } + if common_len == 0 { + PathBuf::from(".") + } else { + paths[0].components().take(common_len).collect() + } + }; + // An explicit `cache_root` overrides the inferred prefix, matching + // Python's `if cache_root is not None: root = cache_root`. The root + // drives both cache keys and the #1033 file-node-id relativisation, so + // a divergence here splits AST/semantic file nodes apart. 
+ let base = cache_root.map_or(inferred, Path::to_path_buf); + base.canonicalize().unwrap_or(base) + }; + + let effective_root: &Path = cache_root.unwrap_or(&root); + + // Phase 1: extract per file (cached or fresh) + let uncached_work: Vec<(usize, &PathBuf)> = paths + .iter() + .enumerate() + .filter(|(_, p)| get_extractor(p).is_some()) + .collect(); + + let mut per_file: Vec = paths.iter().map(|_| FileResult::default()).collect(); + + if uncached_work.len() >= PARALLEL_THRESHOLD { + // Parallel via rayon + let results: Vec<(usize, FileResult)> = uncached_work + .par_iter() + .map(|(idx, path)| (*idx, extract_single_file(path, effective_root))) + .collect(); + for (idx, result) in results { + per_file[idx] = result; + } + } else { + // Sequential + for (idx, path) in &uncached_work { + per_file[*idx] = extract_single_file(path, effective_root); + } + } + + // Cross-file Python import resolution — must run BEFORE per_file is + // drained into `all_*`, otherwise `resolve_cross_file_*` sees empty + // FileResults and emits no cross-module edges. 
+ let mut cross_edges: Vec = Vec::new(); + let py_indices: Vec = paths + .iter() + .enumerate() + .filter(|(_, p)| p.extension().is_some_and(|e| e == "py")) + .map(|(i, _)| i) + .collect(); + if !py_indices.is_empty() { + let py_results: Vec = py_indices.iter().map(|&i| per_file[i].clone()).collect(); + let py_paths: Vec = py_indices.iter().map(|&i| paths[i].clone()).collect(); + cross_edges.extend(resolve_cross_file_python_imports(&py_results, &py_paths)); + } + + // Cross-file Java import resolution + let java_indices: Vec = paths + .iter() + .enumerate() + .filter(|(_, p)| p.extension().is_some_and(|e| e == "java")) + .map(|(i, _)| i) + .collect(); + if !java_indices.is_empty() { + let java_results: Vec = + java_indices.iter().map(|&i| per_file[i].clone()).collect(); + let java_paths: Vec = java_indices.iter().map(|&i| paths[i].clone()).collect(); + cross_edges.extend(resolve_cross_file_java_imports(&java_results, &java_paths)); + } + + let mut all_nodes: Vec = Vec::new(); + let mut all_edges: Vec = Vec::new(); + let mut all_raw_calls: Vec = Vec::new(); + + for result in &mut per_file { + all_nodes.append(&mut result.nodes); + all_edges.append(&mut result.edges); + all_raw_calls.append(&mut result.raw_calls); + } + all_edges.extend(cross_edges); + + // Remap absolute file-node IDs to the canonical `{parent_dir}_{stem}` spec + // form so (a) edge endpoints are stable across machines (#502) and (b) AST + // file nodes match the IDs semantic subagents generate (#1033). + let mut id_remap: HashMap = HashMap::new(); + // Symbol node IDs embed the file stem the extractor saw as a prefix. For a + // root-level file that stem picks up the absolute parent directory name, so + // a symbol becomes `_main_run` while the file node correctly + // relativises to `main` and the spec wants `main_run` — splitting the symbol + // into AST/semantic ghosts (#1096). 
Relativise the symbol prefix the same + // way, gated by `source_file` so two files sharing a prefix can't + // cross-contaminate. Keyed by the path string the extractor recorded in + // `source_file` → (old_prefix, new_prefix). + let mut prefix_remap: HashMap = HashMap::new(); + for path in paths { + let old_id = make_id1(&path.to_string_lossy()); + // Resolve relative-to-root; a lexical strip can fail (path is relative, or + // differs from `root` only by a symlink), so fall back to canonicalising — + // mirrors Python's `resolve().relative_to(root)` fallback. + let Some(rel) = relativise_under_root(path, &root) else { + continue; + }; + let new_id = crate::ids::file_node_id(&rel); + if old_id != new_id { + id_remap.insert(old_id, new_id.clone()); + } + // Import resolution (e.g. the pnpm `.`-package entry, #1083) canonicalises + // the resolved path, which on macOS rewrites `/tmp` → `/private/tmp`. That + // id differs from the input-path id keyed above, so an edge targeting the + // canonical spelling would dangle off the relativised file node. Map the + // canonical spelling to the same node so the resolved edge connects. 
+ if let Ok(canon) = path.canonicalize() { + let canon_id = make_id1(&canon.to_string_lossy()); + if canon_id != new_id { + id_remap.entry(canon_id).or_insert_with(|| new_id.clone()); + } + } + let old_pref = crate::ids::file_node_id(path); + if old_pref != new_id { + prefix_remap.insert(path.to_string_lossy().into_owned(), (old_pref, new_id)); + } + } + if !id_remap.is_empty() { + for n in &mut all_nodes { + if let Some(new_id) = id_remap.get(&n.id) { + n.id = new_id.clone(); + } + } + for e in &mut all_edges { + if let Some(new_id) = id_remap.get(&e.source) { + e.source = new_id.clone(); + } + if let Some(new_id) = id_remap.get(&e.target) { + e.target = new_id.clone(); + } + } + } + if !prefix_remap.is_empty() { + let mut sym_remap: HashMap = HashMap::new(); + for n in &all_nodes { + if n.source_file.is_empty() { + continue; + } + // Package (#1377) and Swift module (#1327) anchor nodes carry a + // canonical name-keyed id (`pkg_` / the shared module id) that + // must stay identical across every manifest/file that references them, + // so they are exempt from the file-stem prefix remap. + if n.metadata + .as_ref() + .and_then(|m| m.get("type")) + .and_then(Value::as_str) + .is_some_and(|t| t == "package" || t == "module") + { + continue; + } + let Some((old_pref, new_pref)) = prefix_remap.get(&n.source_file) else { + continue; + }; + // IDs are make_id output (lowercase word chars + `_`), so slicing at + // a byte offset is always on a char boundary. 
+ if n.id.len() > old_pref.len() + && n.id.starts_with(old_pref.as_str()) + && n.id.as_bytes()[old_pref.len()] == b'_' + { + let new_nid = format!("{new_pref}{}", &n.id[old_pref.len()..]); + if new_nid != n.id { + sym_remap.insert(n.id.clone(), new_nid); + } + } + } + if !sym_remap.is_empty() { + for n in &mut all_nodes { + if let Some(new_id) = sym_remap.get(&n.id) { + n.id = new_id.clone(); + } + } + for e in &mut all_edges { + if let Some(new_id) = sym_remap.get(&e.source) { + e.source = new_id.clone(); + } + if let Some(new_id) = sym_remap.get(&e.target) { + e.target = new_id.clone(); + } + } + // raw_calls carry caller_nid (a symbol id) consumed by the cross-file + // call pass below — rewrite it too or those edges dangle on a stale + // source (#1096). + for rc in &mut all_raw_calls { + if let Some(new_id) = sym_remap.get(&rc.caller_nid) { + rc.caller_nid = new_id.clone(); + } + } + } + } + + // Disambiguate node IDs that collide across two or more distinct + // source files (e.g. two `Program.cs` files in different directories). + // Runs before cross-file call resolution so the call resolver sees + // already-qualified IDs. + crate::postprocess::disambiguate_colliding_node_ids( + &mut all_nodes, + &mut all_edges, + &mut all_raw_calls, + &root, + ); + + // Rewire cross-language inheritance stub nodes (no `source_file`) onto + // a unique real definition with the same label. Drops the stub when + // the rewire succeeds. + crate::postprocess::rewire_unique_stub_nodes(&mut all_nodes, &mut all_edges); + + // Re-point dangling Java implements/inherits edges left on shadow stubs by + // bare-name resolution, using imports for exact-package disambiguation + // (#1318). After rewire_unique_stub_nodes so it only handles the ambiguous + // remainder; before the closing source_file relativisation so node/edge + // source_files still match the parsed Java file paths. 
+ let java_type_paths: Vec = paths + .iter() + .filter(|p| p.extension().is_some_and(|e| e == "java")) + .cloned() + .collect(); + if !java_type_paths.is_empty() { + resolve_java_type_references(&java_type_paths, &mut all_nodes, &mut all_edges); + } + + // Collapse Swift `extension Foo` nodes onto the canonical `class Foo` + // declaration. Mirrors `_merge_swift_extensions` in graphify-py. + crate::postprocess::merge_swift_extensions(paths, &mut all_nodes, &mut all_edges); + + // Cross-file JS/TS default-import resolution (#6dc23db). Runs in the final + // node-id space (after remap/disambiguation); the `imports` edges feed the + // import-evidence index below and the aliases let calls through a renamed + // default binding resolve to the origin symbol. + let js_default = resolve_js_default_imports(&all_nodes, paths, &root); + all_edges.extend(js_default.edges); + let mut js_default_aliases = js_default.aliases; + // Cross-file JS/TS barrel re-export resolution: chain named/aliased/star + // re-exports (and local-alias re-exports) to the origin symbol so consumer + // imports + calls through a barrel target the real declaration. + let js_reexport = resolve_js_reexport_imports(&all_nodes, paths, &root); + all_edges.extend(js_reexport.edges); + js_default_aliases.extend(js_reexport.aliases); + // Cross-file Python package re-export resolution: `pkg/__init__.py` doing + // `from .sub import N as A` lets `from pkg import A` (and calls through it) + // target the origin symbol in `sub`. 
+ let py_reexport = resolve_python_reexport_imports(&all_nodes, paths, &root); + all_edges.extend(py_reexport.edges); + js_default_aliases.extend(py_reexport.aliases); + + // Cross-file call resolution via raw_calls + // Build label → [nid] (skip rationale) + let mut global_label_to_nids: HashMap> = HashMap::new(); + for n in &all_nodes { + if n.file_type == "rationale" { + continue; + } + let normalised = n.label.trim_end_matches("()").trim_start_matches('.'); + if !normalised.is_empty() { + global_label_to_nids + .entry(normalised.to_lowercase()) + .or_default() + .push(n.id.clone()); + } + } + + // Import evidence indexes + let mut file_to_symbol_imports: HashMap> = + HashMap::new(); + let mut file_to_module_imports: HashMap> = + HashMap::new(); + for e in &all_edges { + if e.relation == "imports" { + file_to_symbol_imports + .entry(e.source.clone()) + .or_default() + .insert(e.target.clone()); + } else if e.relation == "imports_from" { + file_to_module_imports + .entry(e.source.clone()) + .or_default() + .insert(e.target.clone()); + } + } + + // Map node → file_nid + let mut nid_to_file_nid: HashMap = HashMap::new(); + for n in &all_nodes { + if n.source_file.is_empty() { + continue; + } + let sf_path = PathBuf::from(&n.source_file); + // Relativise the same way `id_remap` does so a symbol's file-nid matches + // its (relativised) file node id — including the canonicalise fallback + // for absolute paths that differ from `root` only by a symlink. Relative + // source paths are used verbatim (mirrors Python). 
+ let sf_rel = if sf_path.is_absolute() { + relativise_under_root(&sf_path, &root).unwrap_or(sf_path) + } else { + sf_path + }; + nid_to_file_nid.insert(n.id.clone(), crate::ids::file_node_id(&sf_rel)); + } + + let mut existing_pairs: std::collections::HashSet<(String, String)> = all_edges + .iter() + .map(|e| (e.source.clone(), e.target.clone())) + .collect(); + + for rc in &all_raw_calls { + // No built-in pre-filter here: the per-language extractors already drop + // *unresolved* built-in calls at the source, so any raw_call that reaches + // this cross-file pass is a genuine unresolved symbol. Filtering on the + // name alone would wrongly suppress a project symbol that happens to + // share a built-in name and resolves uniquely below. + if rc.is_member_call { + continue; + } + let callee_key = rc.callee.to_lowercase(); + let caller = &rc.caller_nid; + let caller_file_nid = nid_to_file_nid.get(caller); + // A renamed default-import binding (`import mk from './foo'; mk()`) aliases + // the local name to the origin symbol; prefer that over global label + // matching, since the local name has no node of its own (#6dc23db). 
+ let alias_tgt = + caller_file_nid.and_then(|f| js_default_aliases.get(&(f.clone(), callee_key.clone()))); + let candidates: Vec<&String> = match alias_tgt { + Some(t) => vec![t], + None => global_label_to_nids + .get(&callee_key) + .map_or_else(Vec::new, |v| v.iter().collect()), + }; + // Only resolve unambiguous matches + if candidates.len() != 1 { + continue; + } + let tgt = candidates[0]; + if tgt == caller { + continue; + } + let pair = (caller.clone(), tgt.clone()); + if existing_pairs.contains(&pair) { + continue; + } + + let tgt_file_nid = nid_to_file_nid.get(tgt); + let imported_symbols = caller_file_nid + .and_then(|f| file_to_symbol_imports.get(f)) + .is_some_and(|s| s.contains(tgt)); + let imported_module = caller_file_nid + .and_then(|f| file_to_module_imports.get(f)) + .zip(tgt_file_nid) + .is_some_and(|(m, cfn)| m.contains(cfn)); + let has_import_evidence = imported_symbols || imported_module; + + let (confidence, confidence_score) = if has_import_evidence { + ("EXTRACTED".to_string(), 1.0f64) + } else { + ("INFERRED".to_string(), 0.8f64) + }; + + existing_pairs.insert(pair); + all_edges.push(Edge { + external: false, + source: caller.clone(), + target: tgt.clone(), + relation: "calls".to_string(), + confidence, + source_file: rc.source_file.clone(), + source_location: Some(rc.source_location.clone()), + weight: 1.0, + context: Some("call".to_string()), + confidence_score: Some(confidence_score), + }); + } + + // Cross-file Swift member-call resolution (#1356): after the shared call pass + // (node ids and caller_nids final) and before source_file relativisation (the + // type-table re-parse keys on the absolute paths nodes/raw_calls still carry). 
+ let swift_paths: Vec = paths + .iter() + .filter(|p| p.extension().is_some_and(|e| e == "swift")) + .cloned() + .collect(); + if !swift_paths.is_empty() { + resolve_swift_member_calls(&swift_paths, &all_nodes, &mut all_edges, &all_raw_calls); + } + + // Relativise source_file fields + for n in &mut all_nodes { + let sf_path = PathBuf::from(&n.source_file); + if sf_path.is_absolute() + && let Some(rel) = relativise_under_root(&sf_path, &root) + { + n.source_file = rel.to_string_lossy().into_owned(); + } + } + for e in &mut all_edges { + let sf_path = PathBuf::from(&e.source_file); + if sf_path.is_absolute() + && let Some(rel) = relativise_under_root(&sf_path, &root) + { + e.source_file = rel.to_string_lossy().into_owned(); + } + } + + // Convert to IndexMap for ordered serialisation. The per-item serde + // conversion is independent and dominates wall time on large corpora, + // so fan out via Rayon above the per-file threshold. + let to_indexmap = |v: Value| -> Option> { + if let Value::Object(m) = v { + Some(m.into_iter().collect()) + } else { + None + } + }; + let mut nodes_out: Vec> = + if all_nodes.len() >= PARALLEL_THRESHOLD { + all_nodes + .into_par_iter() + .filter_map(|n| serde_json::to_value(n).ok().and_then(to_indexmap)) + .collect() + } else { + all_nodes + .into_iter() + .filter_map(|n| serde_json::to_value(n).ok().and_then(to_indexmap)) + .collect() + }; + // Tag AST provenance so the incremental watch rebuild can distinguish + // AST-extracted nodes from semantic/LLM nodes. On a full re-extraction the + // watcher drops any AST-marked node missing from the fresh output even when + // its source file still exists (#1116/#1118). 
+ for n in &mut nodes_out { + n.insert("_origin".to_string(), Value::String("ast".to_string())); + } + let edges_out: Vec> = if all_edges.len() >= PARALLEL_THRESHOLD + { + all_edges + .into_par_iter() + .filter_map(|e| serde_json::to_value(e).ok().and_then(to_indexmap)) + .collect() + } else { + all_edges + .into_iter() + .filter_map(|e| serde_json::to_value(e).ok().and_then(to_indexmap)) + .collect() + }; + + ExtractOutput { + nodes: nodes_out, + edges: edges_out, + input_tokens: 0, + output_tokens: 0, + } +} diff --git a/crates/graphify-extract/src/extractors/multi/python.rs b/crates/graphify-extract/src/extractors/multi/python.rs new file mode 100644 index 0000000..11c5c2b --- /dev/null +++ b/crates/graphify-extract/src/extractors/multi/python.rs @@ -0,0 +1,528 @@ +//! Cross-file Python import + package re-export resolution. +#![allow(clippy::case_sensitive_file_extension_comparisons)] + +use super::js::js_node_text; +use super::{JsDefaultResolution, PARALLEL_THRESHOLD, relativise_under_root}; +use crate::ids::make_id1; +use crate::import_handlers::make_edge; +use crate::types::{Edge, FileResult, Node}; +use rayon::prelude::*; +use std::collections::{HashMap, HashSet}; +use std::path::{Path, PathBuf}; + +/// Recursively walk a Python AST collecting `from X import Y` statements. +/// +/// On finding an `import_from_statement`, resolves the source module to a known stem via +/// `bare_to_qualified`, then emits `uses` edges from each local class to each imported symbol +/// that is present in `stem_to_entities`. Mirrors Python `_walk_imports` from `extract.py`. +/// Shared state threaded through every [`walk_imports`] recursion. 
+struct ImportWalkCtx<'a> { + path: &'a Path, + stem_to_entities: &'a HashMap>, + bare_to_qualified: &'a HashMap, + local_classes: &'a [String], + str_path: &'a str, + new_edges: &'a mut Vec, +} + +#[allow(clippy::too_many_lines)] // linear dispatch over Python's import_from_statement variants +fn walk_imports(ctx: &mut ImportWalkCtx<'_>, node: tree_sitter::Node<'_>, source: &[u8]) { + if node.kind() == "import_from_statement" { + let mut target_fq: Option = None; + let mut past_import_kw = false; + let mut imported_names: Vec = Vec::new(); + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + let child = cur.node(); + if child.kind() == "relative_import" { + let mut rc = child.walk(); + if rc.goto_first_child() { + loop { + let sub = rc.node(); + if sub.kind() == "dotted_name" { + let raw = + std::str::from_utf8(&source[sub.start_byte()..sub.end_byte()]) + .unwrap_or(""); + let bare = raw.split('.').next_back().unwrap_or("").to_string(); + let candidate = ctx + .path + .parent() + .unwrap_or(ctx.path) + .join(format!("{bare}.py")); + target_fq = Some(crate::ids::file_stem(&candidate)); + break; + } + if !rc.goto_next_sibling() { + break; + } + } + } + break; + } + if child.kind() == "dotted_name" && target_fq.is_none() { + let raw = std::str::from_utf8(&source[child.start_byte()..child.end_byte()]) + .unwrap_or(""); + let bare = raw.split('.').next_back().unwrap_or(""); + target_fq = ctx.bare_to_qualified.get(bare).cloned(); + } + if child.kind() == "import" { + past_import_kw = true; + } else if past_import_kw { + if child.kind() == "dotted_name" { + imported_names.push( + std::str::from_utf8(&source[child.start_byte()..child.end_byte()]) + .unwrap_or("") + .to_string(), + ); + } else if child.kind() == "aliased_import" + && let Some(name_node) = child.child_by_field_name("name") + { + imported_names.push( + std::str::from_utf8( + &source[name_node.start_byte()..name_node.end_byte()], + ) + .unwrap_or("") + .to_string(), + ); + } + } + if 
!cur.goto_next_sibling() { + break; + } + } + } + + let Some(fq) = target_fq else { return }; + let Some(entities) = ctx.stem_to_entities.get(&fq) else { + return; + }; + let line = node.start_position().row + 1; + for name in &imported_names { + if let Some(tgt_nid) = entities.get(name) { + for src_class_nid in ctx.local_classes { + ctx.new_edges.push(Edge { + external: false, + source: src_class_nid.clone(), + target: tgt_nid.clone(), + relation: "uses".to_string(), + confidence: "INFERRED".to_string(), + source_file: ctx.str_path.to_string(), + source_location: Some(format!("L{line}")), + weight: 0.8, + context: None, + confidence_score: None, + }); + } + } + } + return; + } + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + walk_imports(ctx, cur.node(), source); + if !cur.goto_next_sibling() { + break; + } + } + } +} + +/// Emit `uses` edges connecting Python classes to the symbols they import from other files. +/// +/// Two-pass: first builds a map of (file-qualified-stem → label → nid) and +/// (bare stem → qualified stem); then re-parses each Python file to find +/// `from X import Y` statements and emit edges. Mirrors Python `_resolve_cross_file_imports`. 
+pub(super) fn resolve_cross_file_python_imports( + per_file: &[FileResult], + paths: &[PathBuf], +) -> Vec { + let mut probe = tree_sitter::Parser::new(); + if probe + .set_language(&tree_sitter_python::LANGUAGE.into()) + .is_err() + { + return vec![]; + } + drop(probe); + + let (stem_to_entities, bare_to_qualified) = build_python_symbol_maps(per_file); + let work: Vec<(&FileResult, &PathBuf)> = per_file.iter().zip(paths.iter()).collect(); + let init_parser = || -> tree_sitter::Parser { + let mut p = tree_sitter::Parser::new(); + let _ = p.set_language(&tree_sitter_python::LANGUAGE.into()); + p + }; + if work.len() >= PARALLEL_THRESHOLD { + work.par_iter() + .map_init(init_parser, |parser, (result, path)| { + python_per_file_edges(result, path, parser, &stem_to_entities, &bare_to_qualified) + }) + .reduce(Vec::new, |mut a, b| { + a.extend(b); + a + }) + } else { + let mut parser = init_parser(); + work.iter() + .flat_map(|(result, path)| { + python_per_file_edges( + result, + path, + &mut parser, + &stem_to_entities, + &bare_to_qualified, + ) + }) + .collect() + } +} + +/// Pass 1: build `(stem → {label → nid})` + `(bare stem → qualified stem)` maps. 
+fn build_python_symbol_maps( + per_file: &[FileResult], +) -> ( + HashMap>, + HashMap, +) { + use crate::ids::file_stem; + let mut stem_to_entities: HashMap> = HashMap::new(); + let mut bare_to_qualified: HashMap = HashMap::new(); + for result in per_file { + for node in &result.nodes { + if node.source_file.is_empty() { + continue; + } + let label = &node.label; + if label.is_empty() + || label.ends_with(')') + || label.to_lowercase().ends_with(".py") + || label.starts_with('_') + || node.file_type == "rationale" + { + continue; + } + let src_path = PathBuf::from(&node.source_file); + let fq_stem = file_stem(&src_path); + stem_to_entities + .entry(fq_stem.clone()) + .or_default() + .insert(label.clone(), node.id.clone()); + let bare = src_path + .file_stem() + .map_or(String::new(), |s| s.to_string_lossy().into_owned()); + bare_to_qualified.entry(bare).or_insert(fq_stem); + } + } + (stem_to_entities, bare_to_qualified) +} + +/// Pass 2: per-file Python parse + import-edge emission. +fn python_per_file_edges( + result: &FileResult, + path: &Path, + parser: &mut tree_sitter::Parser, + stem_to_entities: &HashMap>, + bare_to_qualified: &HashMap, +) -> Vec { + use crate::ids::file_stem; + let mut local_edges: Vec = Vec::new(); + let str_path = path.to_string_lossy().into_owned(); + let this_stem = file_stem(path); + let this_file_nid = make_id1(&str_path); + let local_classes: Vec = result + .nodes + .iter() + .filter(|n| { + n.source_file == str_path + && !n.label.ends_with(')') + && !n.label.to_lowercase().ends_with(".py") + && n.id != this_file_nid + && n.id != make_id1(&this_stem) + && n.file_type != "rationale" + }) + .map(|n| n.id.clone()) + .collect(); + if local_classes.is_empty() { + return local_edges; + } + let Ok(source) = std::fs::read(path) else { + return local_edges; + }; + let Some(tree) = parser.parse(&source, None) else { + return local_edges; + }; + let mut import_ctx = ImportWalkCtx { + path, + stem_to_entities, + bare_to_qualified, + 
local_classes: &local_classes, + str_path: &str_path, + new_edges: &mut local_edges, + }; + walk_imports(&mut import_ctx, tree.root_node(), &source); + local_edges +} + +// ── Cross-file Java import resolution ──────────────────────────────────────── + +/// `(module_raw, [(imported_name, local_or_public_name)])` from a Python +/// `import_from_statement` (alias-aware, unlike on-disk-only `python_imported_names`). +fn python_import_from_specs( + source: &[u8], + node: tree_sitter::Node<'_>, +) -> Option<(String, Vec<(String, String)>)> { + let module = node.child_by_field_name("module_name")?; + let module_raw = js_node_text(module, source).to_string(); + let mut specs = Vec::new(); + let mut past_import = false; + let mut cur = node.walk(); + for child in node.children(&mut cur) { + match child.kind() { + "import" => past_import = true, + "dotted_name" if past_import => { + let n = js_node_text(child, source).to_string(); + specs.push((n.clone(), n)); + } + "aliased_import" if past_import => { + if let Some(nn) = child.child_by_field_name("name") { + let imported = js_node_text(nn, source).to_string(); + let local = child + .child_by_field_name("alias") + .map_or_else(|| imported.clone(), |a| js_node_text(a, source).to_string()); + specs.push((imported, local)); + } + } + _ => {} + } + } + Some((module_raw, specs)) +} + +/// Candidate file paths a relative Python module reference can resolve to, +/// against `from_path`. A `.foo` reference can name either a module file +/// (`foo.py`) or a package (`foo/__init__.py`); `from . import x` names the +/// current package's `__init__.py`. Returns an empty list for a non-relative +/// module. The caller picks the first candidate present in the scan set. 
+fn python_relative_module_candidates(from_path: &Path, module_raw: &str) -> Vec { + if !module_raw.starts_with('.') { + return Vec::new(); + } + let dots = module_raw.len() - module_raw.trim_start_matches('.').len(); + let module_name = module_raw.trim_start_matches('.'); + let Some(mut base) = from_path.parent().map(Path::to_path_buf) else { + return Vec::new(); + }; + for _ in 0..dots.saturating_sub(1) { + let Some(parent) = base.parent() else { + return Vec::new(); + }; + base = parent.to_path_buf(); + } + if module_name.is_empty() { + return vec![base.join("__init__.py")]; + } + let rel = module_name.replace('.', "/"); + vec![ + base.join(format!("{rel}.py")), + base.join(&rel).join("__init__.py"), + ] +} + +/// Look up a path's `paths` index, falling back to its canonicalised form. +fn py_idx_of(idx_by_path: &HashMap, p: &Path) -> Option { + idx_by_path + .get(p) + .or_else(|| p.canonicalize().ok().and_then(|c| idx_by_path.get(&c))) + .copied() +} + +/// Parse a Python file, returning its source bytes + tree. +fn parse_python_file(path: &Path) -> Option<(Vec, tree_sitter::Tree)> { + let mut parser = tree_sitter::Parser::new(); + parser + .set_language(&tree_sitter_python::LANGUAGE.into()) + .ok()?; + let source = std::fs::read(path).ok()?; + let tree = parser.parse(&source, None)?; + Some((source, tree)) +} + +/// `(init_idx, public_name) → (origin_idx, origin_name)` package re-export map. +type PyPkgReexports = HashMap<(usize, String), (usize, String)>; + +/// Shared maps for Python package re-export resolution. +struct PyReexportResolver<'a> { + paths: &'a [PathBuf], + idx_by_path: &'a HashMap, + file_nids: &'a [String], + by_file_label: &'a HashMap<(String, String), String>, +} + +impl PyReexportResolver<'_> { + /// Scan every `__init__.py` for `from .sub import N as A`, building a + /// `(init_idx, public) → (origin_idx, origin_name)` map and emitting + /// file→file `re_exports` edges. 
+ fn pkg_reexports(&self) -> (PyPkgReexports, Vec) { + let mut map: PyPkgReexports = HashMap::new(); + let mut edges = Vec::new(); + let mut seen: HashSet<(usize, usize)> = HashSet::new(); + for (idx, path) in self.paths.iter().enumerate() { + if path.file_name().and_then(|n| n.to_str()) != Some("__init__.py") { + continue; + } + let Some((source, tree)) = parse_python_file(path) else { + continue; + }; + let mut cur = tree.root_node().walk(); + for stmt in tree.root_node().children(&mut cur) { + if stmt.kind() != "import_from_statement" { + continue; + } + let Some((module_raw, specs)) = python_import_from_specs(&source, stmt) else { + continue; + }; + let Some(sub_idx) = python_relative_module_candidates(path, &module_raw) + .iter() + .find_map(|cand| py_idx_of(self.idx_by_path, cand)) + else { + continue; + }; + for (imported, public) in specs { + map.insert((idx, public), (sub_idx, imported)); + } + if seen.insert((idx, sub_idx)) { + edges.push(make_edge( + &self.file_nids[idx], + &self.file_nids[sub_idx], + "re_exports", + Some("re-export"), + &path.to_string_lossy(), + 1, + )); + } + } + } + (map, edges) + } + + /// Resolve each `from pkg import N` against the package re-export map, + /// emitting consumer→origin `imports` edges and call aliases. 
+ fn consumer_edges( + &self, + pkg_reexports: &PyPkgReexports, + ) -> (Vec, HashMap<(String, String), String>) { + let mut edges = Vec::new(); + let mut aliases: HashMap<(String, String), String> = HashMap::new(); + let mut seen: HashSet<(usize, String)> = HashSet::new(); + for (idx, path) in self.paths.iter().enumerate() { + let str_path = path.to_string_lossy(); + let Some((source, tree)) = parse_python_file(path) else { + continue; + }; + let mut cur = tree.root_node().walk(); + for stmt in tree.root_node().children(&mut cur) { + if stmt.kind() != "import_from_statement" { + continue; + } + let Some((module_raw, specs)) = python_import_from_specs(&source, stmt) else { + continue; + }; + if module_raw.starts_with('.') { + continue; + } + let Some(pkg_dir) = + crate::import_handlers::resolve_python_package_dir(&module_raw, &str_path) + else { + continue; + }; + let Some(init_idx) = py_idx_of(self.idx_by_path, &pkg_dir.join("__init__.py")) + else { + continue; + }; + for (imported, local) in specs { + let Some((origin_idx, origin_name)) = pkg_reexports.get(&(init_idx, imported)) + else { + continue; + }; + let label = origin_name.trim_end_matches("()").trim_start_matches('.'); + let Some(origin_sym) = self + .by_file_label + .get(&(self.file_nids[*origin_idx].clone(), label.to_string())) + else { + continue; + }; + if seen.insert((idx, origin_sym.clone())) { + edges.push(make_edge( + &self.file_nids[idx], + origin_sym, + "imports", + Some("import"), + &str_path, + 1, + )); + } + aliases.insert( + (self.file_nids[idx].clone(), local.to_lowercase()), + origin_sym.clone(), + ); + } + } + } + (edges, aliases) + } +} + +/// Resolve Python package re-exports (`pkg/__init__.py` doing +/// `from .sub import Name as Alias`) so a consumer's `from pkg import Alias` +/// (and calls through it) target the origin symbol. Mirrors the observable +/// output of graphify-py's `_collect_python_symbol_resolution_facts`. 
+pub(super) fn resolve_python_reexport_imports( + all_nodes: &[Node], + paths: &[PathBuf], + root: &Path, +) -> JsDefaultResolution { + use crate::ids::file_node_id; + + let file_nid_of = |path: &Path| -> String { + let rel = relativise_under_root(path, root).unwrap_or_else(|| path.to_path_buf()); + file_node_id(&rel) + }; + let mut by_file_label: HashMap<(String, String), String> = HashMap::new(); + for n in all_nodes { + if n.source_file.is_empty() || n.label.is_empty() { + continue; + } + let sf = PathBuf::from(&n.source_file); + let file_nid = if sf.is_absolute() { + file_nid_of(&sf) + } else { + file_node_id(&sf) + }; + let label = n.label.trim_end_matches("()").trim_start_matches('.'); + if !label.is_empty() { + by_file_label + .entry((file_nid, label.to_string())) + .or_insert_with(|| n.id.clone()); + } + } + let mut idx_by_path: HashMap = HashMap::new(); + for (i, p) in paths.iter().enumerate() { + idx_by_path.entry(p.clone()).or_insert(i); + if let Ok(c) = p.canonicalize() { + idx_by_path.entry(c).or_insert(i); + } + } + let file_nids: Vec = paths.iter().map(|p| file_nid_of(p)).collect(); + let resolver = PyReexportResolver { + paths, + idx_by_path: &idx_by_path, + file_nids: &file_nids, + by_file_label: &by_file_label, + }; + let (pkg_reexports, mut edges) = resolver.pkg_reexports(); + let (import_edges, aliases) = resolver.consumer_edges(&pkg_reexports); + edges.extend(import_edges); + JsDefaultResolution { edges, aliases } +} diff --git a/crates/graphify-extract/src/extractors/multi/swift.rs b/crates/graphify-extract/src/extractors/multi/swift.rs new file mode 100644 index 0000000..c0c5019 --- /dev/null +++ b/crates/graphify-extract/src/extractors/multi/swift.rs @@ -0,0 +1,217 @@ +//! Cross-file Swift member-call resolution. 
+ +use super::java::is_type_like_definition; +use crate::types::{Edge, Node, RawCall}; +use std::collections::HashMap; +use std::path::PathBuf; + +/// Re-parse a Swift file's AST into a `local name -> type name` table, from +/// property declarations (type annotation, else constructor inference) and +/// function parameters. Feeds [`resolve_swift_member_calls`]. Rebuilt by +/// re-parsing (like the Java type-reference pass) rather than threaded through a +/// `FileResult` sidecar. +fn collect_swift_type_table( + node: tree_sitter::Node<'_>, + source: &[u8], + table: &mut HashMap, +) { + use crate::generic::references::{ + RefRole, swift_collect_type_refs, swift_constructor_type, swift_property_name, + swift_property_type_node, + }; + match node.kind() { + "property_declaration" => { + let mut prop_type: Option = None; + if let Some(anno) = swift_property_type_node(node) { + let mut refs: Vec<(String, RefRole)> = Vec::new(); + swift_collect_type_refs(anno, source, false, &mut refs); + prop_type = refs + .into_iter() + .find(|(_, r)| *r == RefRole::Direct) + .map(|(n, _)| n); + } + if prop_type.is_none() { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + if cur.node().kind() == "call_expression" + && let Some(ctor) = swift_constructor_type(cur.node(), source) + { + prop_type = Some(ctor); + break; + } + if !cur.goto_next_sibling() { + break; + } + } + } + } + if let (Some(name), Some(ty)) = (swift_property_name(node, source), prop_type) { + table.insert(name, ty); + } + } + "parameter" => { + if let Some(type_node) = node.child_by_field_name("type") { + let mut refs: Vec<(String, RefRole)> = Vec::new(); + swift_collect_type_refs(type_node, source, false, &mut refs); + if let Some((ty, _)) = refs.into_iter().find(|(_, r)| *r == RefRole::Direct) + && let Some(name_node) = node.child_by_field_name("name") + { + let pname = name_node.utf8_text(source).unwrap_or(""); + if !pname.is_empty() { + table.insert(pname.to_string(), ty); + } + } + } + } + _ 
=> {} + } + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + collect_swift_type_table(cur.node(), source, table); + if !cur.goto_next_sibling() { + break; + } + } + } +} + +/// Resolve cross-file Swift member calls (`recv.method()`) to the receiver's +/// real type definition (#1356). The shared call pass drops every +/// `is_member_call` (a bare method name collides across the corpus); this pass +/// types the receiver via the file's local type table (or treats an upper-cased +/// receiver as a type itself), then emits an edge ONLY when the type name +/// resolves to exactly one definition (god-node guard). Everything it adds is +/// INFERRED (type inference, not an explicit import). +#[allow(clippy::too_many_lines)] // linear: re-parse type tables, build indexes, resolve each member call +pub(super) fn resolve_swift_member_calls( + swift_paths: &[PathBuf], + all_nodes: &[Node], + all_edges: &mut Vec, + all_raw_calls: &[RawCall], +) { + let mut parser = tree_sitter::Parser::new(); + if parser + .set_language(&tree_sitter_swift::LANGUAGE.into()) + .is_err() + { + return; + } + let mut type_table_by_file: HashMap> = HashMap::new(); + for path in swift_paths { + let Ok(source) = std::fs::read(path) else { + continue; + }; + let Some(tree) = parser.parse(&source, None) else { + continue; + }; + let mut table: HashMap = HashMap::new(); + collect_swift_type_table(tree.root_node(), &source, &mut table); + type_table_by_file.insert(path.to_string_lossy().into_owned(), table); + } + if type_table_by_file.is_empty() { + return; + } + + let key = |s: &str| -> String { + s.chars() + .filter(char::is_ascii_alphanumeric) + .collect::() + .to_lowercase() + }; + + // A genuine type is the target of a `contains` edge from its file; bare type + // references create same-label shadow nodes that are NOT contained, so this + // keeps a shadow from making a real type name look ambiguous. 
+ let contained: std::collections::HashSet<&str> = all_edges + .iter() + .filter(|e| e.relation == "contains") + .map(|e| e.target.as_str()) + .collect(); + let mut type_def_nids: HashMap> = HashMap::new(); + let mut node_by_id: HashMap<&str, &Node> = HashMap::new(); + for n in all_nodes { + node_by_id.insert(n.id.as_str(), n); + if !n.source_file.is_empty() + && contained.contains(n.id.as_str()) + && is_type_like_definition(n) + { + type_def_nids + .entry(key(n.label.as_str())) + .or_default() + .push(n.id.clone()); + } + } + + // (type_node_id, method_key) -> method_node_id, from `method` edges. + let mut method_index: HashMap<(String, String), String> = HashMap::new(); + for e in all_edges.iter() { + if e.relation == "method" + && let Some(tnode) = node_by_id.get(e.target.as_str()) + { + method_index.insert( + (e.source.clone(), key(tnode.label.as_str())), + e.target.clone(), + ); + } + } + + let mut existing_pairs: std::collections::HashSet<(String, String)> = all_edges + .iter() + .map(|e| (e.source.clone(), e.target.clone())) + .collect(); + + let mut new_edges: Vec = Vec::new(); + for rc in all_raw_calls { + if !rc.is_member_call || rc.callee.is_empty() || rc.caller_nid.is_empty() { + continue; + } + let Some(receiver) = rc.receiver.as_deref() else { + continue; + }; + // An upper-cased receiver is itself a type (`Type.staticMethod()`, + // `Singleton.shared.x()`); otherwise look it up in the declaring file's + // local type table. 
+ let type_name = if receiver.chars().next().is_some_and(char::is_uppercase) { + receiver.to_string() + } else if let Some(t) = type_table_by_file + .get(&rc.source_file) + .and_then(|tbl| tbl.get(receiver)) + { + t.clone() + } else { + continue; + }; + let type_nid = match type_def_nids.get(&key(type_name.as_str())) { + Some(defs) if defs.len() == 1 => &defs[0], + _ => continue, // ambiguous or absent -> god-node guard + }; + let (target, relation) = + match method_index.get(&(type_nid.clone(), key(rc.callee.as_str()))) { + Some(method) => (method.clone(), "calls"), + None => (type_nid.clone(), "references"), + }; + if target == rc.caller_nid + || existing_pairs.contains(&(rc.caller_nid.clone(), target.clone())) + { + continue; + } + existing_pairs.insert((rc.caller_nid.clone(), target.clone())); + new_edges.push(Edge { + external: false, + source: rc.caller_nid.clone(), + target, + relation: relation.to_string(), + confidence: "INFERRED".to_string(), + source_file: rc.source_file.clone(), + source_location: Some(rc.source_location.clone()), + weight: 1.0, + context: Some("call".to_string()), + confidence_score: Some(0.8), + }); + } + all_edges.extend(new_edges); +} + +// ── Main extract() ──────────────────────────────────────────────────────────── diff --git a/crates/graphify-extract/src/extractors/pascal/forms.rs b/crates/graphify-extract/src/extractors/pascal/forms.rs new file mode 100644 index 0000000..cf533ae --- /dev/null +++ b/crates/graphify-extract/src/extractors/pascal/forms.rs @@ -0,0 +1,187 @@ +//! Lazarus `.lfm` / Delphi `.dfm` text-form extractors. + +use crate::ids::{file_stem, make_id, make_id1}; +use crate::types::{Edge, FileResult, Node}; +use regex::Regex; +use std::collections::HashSet; +use std::path::Path; + +/// Parse a Lazarus `.lfm` or Delphi `.dfm` text-form file, emitting component and event nodes. +/// +/// Scans line-by-line for `object Name : ClassName` declarations and `OnXxx = Handler` event +/// bindings. 
Component nodes are connected via `contains` edges; event handlers produce `handles` +/// edges. Shared by `extract_lazarus_form` and `extract_delphi_form`. +#[allow(clippy::too_many_lines)] +fn parse_form_text(text: &str, path: &Path) -> FileResult { + #[allow(clippy::expect_used)] + let obj_re = Regex::new(r"(?i)^\s*object\s+\w+\s*:\s*(\w+)").expect("static lfm object regex"); + #[allow(clippy::expect_used)] + let event_re = Regex::new(r"(?i)^\s*On\w+\s*=\s*(\w+)").expect("static lfm event regex"); + #[allow(clippy::expect_used)] + let end_re = Regex::new(r"(?i)^\s*end\s*$").expect("static lfm end regex"); + + let str_path = path.to_string_lossy().into_owned(); + let stem = file_stem(path); + + let mut nodes: Vec = Vec::new(); + let mut edges: Vec = Vec::new(); + let mut seen_ids: HashSet = HashSet::new(); + let mut seen_edge_pairs: HashSet<(String, String, String)> = HashSet::new(); + + let add_node = |nodes: &mut Vec, + seen_ids: &mut HashSet, + nid: String, + label: String, + line: usize, + str_path: &str| { + if seen_ids.insert(nid.clone()) { + nodes.push(Node { + id: nid, + label, + file_type: "code".to_string(), + source_file: str_path.to_string(), + source_location: Some(format!("L{line}")), + metadata: None, + }); + } + }; + + let add_edge = |edges: &mut Vec, + seen_edge_pairs: &mut HashSet<(String, String, String)>, + src: String, + tgt: String, + relation: String, + line: usize, + context: Option, + str_path: &str| { + let key = (src.clone(), tgt.clone(), relation.clone()); + if seen_edge_pairs.insert(key) { + edges.push(Edge { + external: false, + source: src, + target: tgt, + relation, + confidence: "EXTRACTED".to_string(), + source_file: str_path.to_string(), + source_location: Some(format!("L{line}")), + weight: 1.0, + context, + confidence_score: None, + }); + } + }; + + let file_nid = make_id1(&str_path); + add_node( + &mut nodes, + &mut seen_ids, + file_nid.clone(), + path.file_name() + .map_or(String::new(), |f| 
f.to_string_lossy().into_owned()), + 1, + &str_path, + ); + + let mut stack: Vec = vec![file_nid]; + + for (lineno, line) in text.lines().enumerate() { + let lineno = lineno + 1; + if let Some(cap) = obj_re.captures(line) { + let class_name = cap.get(1).map_or("", |m| m.as_str()); + let nid = make_id(&[&stem, class_name]); + add_node( + &mut nodes, + &mut seen_ids, + nid.clone(), + class_name.to_string(), + lineno, + &str_path, + ); + let parent = stack.last().cloned().unwrap_or_default(); + add_edge( + &mut edges, + &mut seen_edge_pairs, + parent, + nid.clone(), + "contains".to_string(), + lineno, + None, + &str_path, + ); + stack.push(nid); + continue; + } + if let Some(cap) = event_re.captures(line) { + if stack.len() > 1 { + let handler = cap.get(1).map_or("", |m| m.as_str()); + let handler_nid = make_id(&[&stem, handler]); + add_node( + &mut nodes, + &mut seen_ids, + handler_nid.clone(), + format!("{handler}()"), + lineno, + &str_path, + ); + let parent = stack.last().cloned().unwrap_or_default(); + add_edge( + &mut edges, + &mut seen_edge_pairs, + parent, + handler_nid, + "references".to_string(), + lineno, + Some("event".to_string()), + &str_path, + ); + } + continue; + } + if end_re.is_match(line) && stack.len() > 1 { + stack.pop(); + } + } + + FileResult { + nodes, + edges, + raw_calls: vec![], + error: None, + } +} + +// ── extract_lazarus_form (.lfm) ─────────────────────────────────────────────── + +/// Extract component hierarchy from a Lazarus `.lfm` form file. +#[must_use] +pub fn extract_lazarus_form(path: &Path) -> FileResult { + match std::fs::read_to_string(path) { + Ok(text) => parse_form_text(&text, path), + Err(e) => FileResult::error(e.to_string()), + } +} + +// ── extract_delphi_form (.dfm) ──────────────────────────────────────────────── + +/// Extract component hierarchy from a Delphi `.dfm` form file. +/// +/// Binary DFM files (magic bytes `FF 0A`) are returned as an error. 
+#[must_use] +pub fn extract_delphi_form(path: &Path) -> FileResult { + let raw = match std::fs::read(path) { + Ok(b) => b, + Err(e) => return FileResult::error(e.to_string()), + }; + // Binary DFM detection + if raw.starts_with(b"\xff\x0a") { + return FileResult::error(format!( + "binary DFM (convert to text in Delphi IDE to index): {}", + path.file_name() + .map_or(String::new(), |f| f.to_string_lossy().into_owned()) + )); + } + let text = String::from_utf8_lossy(&raw).into_owned(); + parse_form_text(&text, path) +} + +// ── extract_lazarus_package (.lpk) ─────────────────────────────────────────── diff --git a/crates/graphify-extract/src/extractors/pascal.rs b/crates/graphify-extract/src/extractors/pascal/mod.rs similarity index 69% rename from crates/graphify-extract/src/extractors/pascal.rs rename to crates/graphify-extract/src/extractors/pascal/mod.rs index dacddf6..711696b 100644 --- a/crates/graphify-extract/src/extractors/pascal.rs +++ b/crates/graphify-extract/src/extractors/pascal/mod.rs @@ -5,16 +5,18 @@ //! `extract_delphi_form` — .dfm component hierarchy. //! `extract_lazarus_package` — .lpk XML package metadata. 
-use std::collections::HashSet; -use std::path::Path; -use std::sync::LazyLock; +mod forms; +mod package; -use regex::Regex; +pub use forms::{extract_delphi_form, extract_lazarus_form}; +pub use package::extract_lazarus_package; use crate::ids::{file_stem, make_id, make_id1}; use crate::types::{Edge, FileResult, Node}; - -// ── Regex patterns ──────────────────────────────────────────────────────────── +use regex::Regex; +use std::collections::HashSet; +use std::path::Path; +use std::sync::LazyLock; #[allow(clippy::expect_used)] // literal pattern; build cannot panic static PAS_TOKEN_RE: LazyLock = LazyLock::new(|| { @@ -730,329 +732,3 @@ pub fn extract_pascal(path: &Path) -> FileResult { } // ── Shared form parser for .lfm / .dfm ─────────────────────────────────────── - -/// Parse a Lazarus `.lfm` or Delphi `.dfm` text-form file, emitting component and event nodes. -/// -/// Scans line-by-line for `object Name : ClassName` declarations and `OnXxx = Handler` event -/// bindings. Component nodes are connected via `contains` edges; event handlers produce `handles` -/// edges. Shared by `extract_lazarus_form` and `extract_delphi_form`. 
-#[allow(clippy::too_many_lines)] -fn parse_form_text(text: &str, path: &Path) -> FileResult { - #[allow(clippy::expect_used)] - let obj_re = Regex::new(r"(?i)^\s*object\s+\w+\s*:\s*(\w+)").expect("static lfm object regex"); - #[allow(clippy::expect_used)] - let event_re = Regex::new(r"(?i)^\s*On\w+\s*=\s*(\w+)").expect("static lfm event regex"); - #[allow(clippy::expect_used)] - let end_re = Regex::new(r"(?i)^\s*end\s*$").expect("static lfm end regex"); - - let str_path = path.to_string_lossy().into_owned(); - let stem = file_stem(path); - - let mut nodes: Vec = Vec::new(); - let mut edges: Vec = Vec::new(); - let mut seen_ids: HashSet = HashSet::new(); - let mut seen_edge_pairs: HashSet<(String, String, String)> = HashSet::new(); - - let add_node = |nodes: &mut Vec, - seen_ids: &mut HashSet, - nid: String, - label: String, - line: usize, - str_path: &str| { - if seen_ids.insert(nid.clone()) { - nodes.push(Node { - id: nid, - label, - file_type: "code".to_string(), - source_file: str_path.to_string(), - source_location: Some(format!("L{line}")), - metadata: None, - }); - } - }; - - let add_edge = |edges: &mut Vec, - seen_edge_pairs: &mut HashSet<(String, String, String)>, - src: String, - tgt: String, - relation: String, - line: usize, - context: Option, - str_path: &str| { - let key = (src.clone(), tgt.clone(), relation.clone()); - if seen_edge_pairs.insert(key) { - edges.push(Edge { - external: false, - source: src, - target: tgt, - relation, - confidence: "EXTRACTED".to_string(), - source_file: str_path.to_string(), - source_location: Some(format!("L{line}")), - weight: 1.0, - context, - confidence_score: None, - }); - } - }; - - let file_nid = make_id1(&str_path); - add_node( - &mut nodes, - &mut seen_ids, - file_nid.clone(), - path.file_name() - .map_or(String::new(), |f| f.to_string_lossy().into_owned()), - 1, - &str_path, - ); - - let mut stack: Vec = vec![file_nid]; - - for (lineno, line) in text.lines().enumerate() { - let lineno = lineno + 1; - if let 
Some(cap) = obj_re.captures(line) { - let class_name = cap.get(1).map_or("", |m| m.as_str()); - let nid = make_id(&[&stem, class_name]); - add_node( - &mut nodes, - &mut seen_ids, - nid.clone(), - class_name.to_string(), - lineno, - &str_path, - ); - let parent = stack.last().cloned().unwrap_or_default(); - add_edge( - &mut edges, - &mut seen_edge_pairs, - parent, - nid.clone(), - "contains".to_string(), - lineno, - None, - &str_path, - ); - stack.push(nid); - continue; - } - if let Some(cap) = event_re.captures(line) { - if stack.len() > 1 { - let handler = cap.get(1).map_or("", |m| m.as_str()); - let handler_nid = make_id(&[&stem, handler]); - add_node( - &mut nodes, - &mut seen_ids, - handler_nid.clone(), - format!("{handler}()"), - lineno, - &str_path, - ); - let parent = stack.last().cloned().unwrap_or_default(); - add_edge( - &mut edges, - &mut seen_edge_pairs, - parent, - handler_nid, - "references".to_string(), - lineno, - Some("event".to_string()), - &str_path, - ); - } - continue; - } - if end_re.is_match(line) && stack.len() > 1 { - stack.pop(); - } - } - - FileResult { - nodes, - edges, - raw_calls: vec![], - error: None, - } -} - -// ── extract_lazarus_form (.lfm) ─────────────────────────────────────────────── - -/// Extract component hierarchy from a Lazarus `.lfm` form file. -#[must_use] -pub fn extract_lazarus_form(path: &Path) -> FileResult { - match std::fs::read_to_string(path) { - Ok(text) => parse_form_text(&text, path), - Err(e) => FileResult::error(e.to_string()), - } -} - -// ── extract_delphi_form (.dfm) ──────────────────────────────────────────────── - -/// Extract component hierarchy from a Delphi `.dfm` form file. -/// -/// Binary DFM files (magic bytes `FF 0A`) are returned as an error. 
-#[must_use] -pub fn extract_delphi_form(path: &Path) -> FileResult { - let raw = match std::fs::read(path) { - Ok(b) => b, - Err(e) => return FileResult::error(e.to_string()), - }; - // Binary DFM detection - if raw.starts_with(b"\xff\x0a") { - return FileResult::error(format!( - "binary DFM (convert to text in Delphi IDE to index): {}", - path.file_name() - .map_or(String::new(), |f| f.to_string_lossy().into_owned()) - )); - } - let text = String::from_utf8_lossy(&raw).into_owned(); - parse_form_text(&text, path) -} - -// ── extract_lazarus_package (.lpk) ─────────────────────────────────────────── - -/// Extract package metadata from a Lazarus `.lpk` package file (XML). -#[must_use] -#[allow(clippy::too_many_lines, clippy::missing_panics_doc)] -pub fn extract_lazarus_package(path: &Path) -> FileResult { - // Check the on-disk size before reading so an oversized file can't force a - // multi-megabyte allocation just to be rejected. - match std::fs::metadata(path) { - Ok(meta) if meta.len() > crate::extractors::PROJECT_XML_MAX_BYTES => { - return FileResult::error("package file too large"); - } - Ok(_) => {} - Err(e) => return FileResult::error(e.to_string()), - } - let raw = match std::fs::read(path) { - Ok(b) => b, - Err(e) => return FileResult::error(e.to_string()), - }; - if raw.len() as u64 > crate::extractors::PROJECT_XML_MAX_BYTES { - return FileResult::error("package file too large"); - } - if !crate::extractors::project_xml_is_safe(&raw) { - return FileResult::error("refusing XML with DOCTYPE/ENTITY declaration"); - } - let text = String::from_utf8_lossy(&raw).into_owned(); - - let str_path = path.to_string_lossy().into_owned(); - let stem = file_stem(path); - let mut nodes: Vec = Vec::new(); - let mut edges: Vec = Vec::new(); - let mut seen_ids: HashSet = HashSet::new(); - - let make_node = |nid: &str, label: &str, str_path: &str| -> Node { - Node { - id: nid.to_string(), - label: label.to_string(), - file_type: "code".to_string(), - source_file: 
str_path.to_string(), - source_location: Some("L1".to_string()), - metadata: None, - } - }; - - let file_nid = make_id1(&str_path); - seen_ids.insert(file_nid.clone()); - nodes.push(make_node( - &file_nid, - &path - .file_name() - .map_or(String::new(), |f| f.to_string_lossy().into_owned()), - &str_path, - )); - - // Simple XML parse for .lpk using regex (avoid pulling in an XML crate) - #[allow(clippy::expect_used)] - let pkg_name_re = - Regex::new(r#"(?i) FileResult { + // Check the on-disk size before reading so an oversized file can't force a + // multi-megabyte allocation just to be rejected. + match std::fs::metadata(path) { + Ok(meta) if meta.len() > crate::extractors::PROJECT_XML_MAX_BYTES => { + return FileResult::error("package file too large"); + } + Ok(_) => {} + Err(e) => return FileResult::error(e.to_string()), + } + let raw = match std::fs::read(path) { + Ok(b) => b, + Err(e) => return FileResult::error(e.to_string()), + }; + if raw.len() as u64 > crate::extractors::PROJECT_XML_MAX_BYTES { + return FileResult::error("package file too large"); + } + if !crate::extractors::project_xml_is_safe(&raw) { + return FileResult::error("refusing XML with DOCTYPE/ENTITY declaration"); + } + let text = String::from_utf8_lossy(&raw).into_owned(); + + let str_path = path.to_string_lossy().into_owned(); + let stem = file_stem(path); + let mut nodes: Vec = Vec::new(); + let mut edges: Vec = Vec::new(); + let mut seen_ids: HashSet = HashSet::new(); + + let make_node = |nid: &str, label: &str, str_path: &str| -> Node { + Node { + id: nid.to_string(), + label: label.to_string(), + file_type: "code".to_string(), + source_file: str_path.to_string(), + source_location: Some("L1".to_string()), + metadata: None, + } + }; + + let file_nid = make_id1(&str_path); + seen_ids.insert(file_nid.clone()); + nodes.push(make_node( + &file_nid, + &path + .file_name() + .map_or(String::new(), |f| f.to_string_lossy().into_owned()), + &str_path, + )); + + // Simple XML parse for .lpk using 
regex (avoid pulling in an XML crate) + #[allow(clippy::expect_used)] + let pkg_name_re = + Regex::new(r#"(?i)> = LazyLock::new(|| { + ["RootModule", "NestedModules", "RequiredModules"] + .into_iter() + .collect() +}); + +/// Derive a bare module name from a raw string value: strip the path prefix and +/// extension (`MyModule.psm1` -> `MyModule`, `./sub/Util.psm1` -> `Util`). +/// Mirrors `_psd1_module_name`. +fn psd1_module_name(raw: &str) -> String { + let normalized = raw.replace('\\', "/"); + let basename = normalized.rsplit('/').next().unwrap_or(""); + let no_ext = basename.rsplit_once('.').map_or(basename, |(base, _)| base); + no_ext.trim().to_string() +} + +/// Recursively collect all `string_literal` text values (surrounding quotes +/// stripped) under `node`. Mirrors `_psd1_collect_string_literals`. +fn psd1_collect_string_literals(node: tree_sitter::Node<'_>, source: &[u8], out: &mut Vec) { + if node.kind() == "string_literal" { + out.push( + read_text(node, source) + .trim_matches(['\'', '"']) + .to_string(), + ); + return; + } + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + psd1_collect_string_literals(cur.node(), source, out); + if !cur.goto_next_sibling() { + break; + } + } + } +} + +/// Like [`psd1_collect_string_literals`] but keeps each string's start byte so a +/// caller can distinguish strings nested inside a `hash_entry` from direct ones. 
+fn psd1_collect_string_nodes( + node: tree_sitter::Node<'_>, + source: &[u8], + out: &mut Vec<(usize, String)>, +) { + if node.kind() == "string_literal" { + out.push(( + node.start_byte(), + read_text(node, source) + .trim_matches(['\'', '"']) + .to_string(), + )); + return; + } + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + psd1_collect_string_nodes(cur.node(), source, out); + if !cur.goto_next_sibling() { + break; + } + } + } +} + +/// For `RequiredModules`: collect `ModuleName` values from hashtable specs and +/// record every string nested in a `hash_entry` (so the caller can treat only +/// the remaining direct strings as simple module names, and `ModuleVersion` +/// values never leak in). Mirrors the inner `find_modulename_entries`. +fn psd1_find_modulename_entries( + node: tree_sitter::Node<'_>, + source: &[u8], + module_names: &mut Vec, + inside_hash: &mut HashSet, +) { + if node.kind() == "hash_entry" { + let sub_key = first_child_kind(node, "key_expression"); + let sk_text = sub_key.map_or_else(String::new, |k| read_text(k, source).trim().to_string()); + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + if cur.node().kind() == "pipeline" { + let mut found = Vec::new(); + psd1_collect_string_nodes(cur.node(), source, &mut found); + for (sb, s) in found { + inside_hash.insert(sb); + if sk_text == "ModuleName" { + module_names.push(s); + } + } + } + if !cur.goto_next_sibling() { + break; + } + } + } + return; // don't recurse further into this hash_entry + } + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + psd1_find_modulename_entries(cur.node(), source, module_names, inside_hash); + if !cur.goto_next_sibling() { + break; + } + } + } +} + +/// Push a `file -> module` `imports_from` edge for a raw `.psd1` module value. 
+fn add_psd1_import_edge( + edges: &mut Vec, + file_nid: &str, + str_path: &str, + module_raw: &str, + line: usize, +) { + let name = psd1_module_name(module_raw); + if name.is_empty() { + return; + } + edges.push(Edge { + external: false, + source: file_nid.to_string(), + target: make_id1(&name), + relation: "imports_from".to_string(), + confidence: "EXTRACTED".to_string(), + source_file: str_path.to_string(), + source_location: Some(format!("L{line}")), + weight: 1.0, + context: Some("import".to_string()), + confidence_score: None, + }); +} + +/// Walk a `.psd1` AST, emitting `imports_from` edges for `RootModule`, +/// `NestedModules`, and `RequiredModules` entries. Mirrors `walk_manifest`. +fn walk_psd1_manifest( + node: tree_sitter::Node<'_>, + source: &[u8], + file_nid: &str, + str_path: &str, + edges: &mut Vec, +) { + if node.kind() != "hash_entry" { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + walk_psd1_manifest(cur.node(), source, file_nid, str_path, edges); + if !cur.goto_next_sibling() { + break; + } + } + } + return; + } + let Some(key_node) = first_child_kind(node, "key_expression") else { + return; + }; + let key_text = read_text(key_node, source).trim().to_string(); + if !PSD1_IMPORT_KEYS.contains(key_text.as_str()) { + return; + } + let line = node.start_position().row + 1; + let Some(value_node) = first_child_kind(node, "pipeline") else { + return; + }; + match key_text.as_str() { + "RootModule" | "NestedModules" => { + let mut strings = Vec::new(); + psd1_collect_string_literals(value_node, source, &mut strings); + for s in strings { + add_psd1_import_edge(edges, file_nid, str_path, &s, line); + } + } + "RequiredModules" => { + // Two forms: plain 'Module' strings, and @{ ModuleName='Foo'; ... } + // specs (follow only ModuleName; ModuleVersion etc. are excluded). 
+ let mut module_names = Vec::new(); + let mut inside_hash = HashSet::new(); + psd1_find_modulename_entries(value_node, source, &mut module_names, &mut inside_hash); + let mut all_strings = Vec::new(); + psd1_collect_string_nodes(value_node, source, &mut all_strings); + for (sb, s) in &all_strings { + if !inside_hash.contains(sb) { + add_psd1_import_edge(edges, file_nid, str_path, s, line); + } + } + for s in &module_names { + add_psd1_import_edge(edges, file_nid, str_path, s, line); + } + } + _ => {} + } +} + +/// Extract module dependency edges from a PowerShell `.psd1` manifest. +/// +/// `.psd1` files are PowerShell data hashtables (syntactically valid PowerShell), +/// so tree-sitter parses them. Emits a file node plus `imports_from` edges for +/// every module named under `RootModule`, `NestedModules`, and `RequiredModules` +/// (both the plain-string and `@{ ModuleName=... }` forms). Mirrors +/// `extract_powershell_manifest`. +#[must_use] +pub fn extract_powershell_manifest(path: &Path) -> FileResult { + let source = match std::fs::read(path) { + Ok(s) => s, + Err(e) => return FileResult::error(format!("powershell manifest read error: {e}")), + }; + let mut parser = tree_sitter::Parser::new(); + if parser + .set_language(&tree_sitter_powershell::LANGUAGE.into()) + .is_err() + { + return FileResult::error("tree_sitter_powershell language load failed"); + } + let Some(tree) = parser.parse(&source, None) else { + return FileResult::error("powershell manifest parse failed"); + }; + + let str_path = path.to_string_lossy().into_owned(); + let file_nid = make_id1(&str_path); + let nodes = vec![Node { + id: file_nid.clone(), + label: path + .file_name() + .map_or(String::new(), |f| f.to_string_lossy().into_owned()), + file_type: "code".to_string(), + source_file: str_path.clone(), + source_location: Some("L1".to_string()), + metadata: None, + }]; + let mut edges: Vec = Vec::new(); + walk_psd1_manifest(tree.root_node(), &source, &file_nid, &str_path, &mut edges); + 
+ FileResult { + nodes, + edges, + raw_calls: vec![], + error: None, + } +} diff --git a/crates/graphify-extract/src/extractors/powershell.rs b/crates/graphify-extract/src/extractors/powershell/mod.rs similarity index 78% rename from crates/graphify-extract/src/extractors/powershell.rs rename to crates/graphify-extract/src/extractors/powershell/mod.rs index 43c260f..e465268 100644 --- a/crates/graphify-extract/src/extractors/powershell.rs +++ b/crates/graphify-extract/src/extractors/powershell/mod.rs @@ -1,12 +1,15 @@ -//! PowerShell extractor — custom walk over tree-sitter-powershell AST. +//! PowerShell source extractor (`.ps1` / `.psm1`) over tree-sitter-powershell. -use std::collections::{HashMap, HashSet}; -use std::path::Path; -use std::sync::LazyLock; +mod manifest; + +pub use manifest::extract_powershell_manifest; use crate::generic::walk::first_child_kind; use crate::ids::{file_stem, make_id, make_id1}; use crate::types::{Edge, FileResult, Node, RawCall}; +use std::collections::{HashMap, HashSet}; +use std::path::Path; +use std::sync::LazyLock; static PS_SKIP: LazyLock> = LazyLock::new(|| { [ @@ -788,248 +791,3 @@ fn walk_calls_ps( } } } - -/// `.psd1` manifest keys whose values are module names/paths treated as imports. -/// Mirrors `_PSD1_IMPORT_KEYS`. -static PSD1_IMPORT_KEYS: LazyLock> = LazyLock::new(|| { - ["RootModule", "NestedModules", "RequiredModules"] - .into_iter() - .collect() -}); - -/// Derive a bare module name from a raw string value: strip the path prefix and -/// extension (`MyModule.psm1` -> `MyModule`, `./sub/Util.psm1` -> `Util`). -/// Mirrors `_psd1_module_name`. -fn psd1_module_name(raw: &str) -> String { - let normalized = raw.replace('\\', "/"); - let basename = normalized.rsplit('/').next().unwrap_or(""); - let no_ext = basename.rsplit_once('.').map_or(basename, |(base, _)| base); - no_ext.trim().to_string() -} - -/// Recursively collect all `string_literal` text values (surrounding quotes -/// stripped) under `node`. 
Mirrors `_psd1_collect_string_literals`. -fn psd1_collect_string_literals(node: tree_sitter::Node<'_>, source: &[u8], out: &mut Vec) { - if node.kind() == "string_literal" { - out.push( - read_text(node, source) - .trim_matches(['\'', '"']) - .to_string(), - ); - return; - } - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - psd1_collect_string_literals(cur.node(), source, out); - if !cur.goto_next_sibling() { - break; - } - } - } -} - -/// Like [`psd1_collect_string_literals`] but keeps each string's start byte so a -/// caller can distinguish strings nested inside a `hash_entry` from direct ones. -fn psd1_collect_string_nodes( - node: tree_sitter::Node<'_>, - source: &[u8], - out: &mut Vec<(usize, String)>, -) { - if node.kind() == "string_literal" { - out.push(( - node.start_byte(), - read_text(node, source) - .trim_matches(['\'', '"']) - .to_string(), - )); - return; - } - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - psd1_collect_string_nodes(cur.node(), source, out); - if !cur.goto_next_sibling() { - break; - } - } - } -} - -/// For `RequiredModules`: collect `ModuleName` values from hashtable specs and -/// record every string nested in a `hash_entry` (so the caller can treat only -/// the remaining direct strings as simple module names, and `ModuleVersion` -/// values never leak in). Mirrors the inner `find_modulename_entries`. 
-fn psd1_find_modulename_entries( - node: tree_sitter::Node<'_>, - source: &[u8], - module_names: &mut Vec, - inside_hash: &mut HashSet, -) { - if node.kind() == "hash_entry" { - let sub_key = first_child_kind(node, "key_expression"); - let sk_text = sub_key.map_or_else(String::new, |k| read_text(k, source).trim().to_string()); - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - if cur.node().kind() == "pipeline" { - let mut found = Vec::new(); - psd1_collect_string_nodes(cur.node(), source, &mut found); - for (sb, s) in found { - inside_hash.insert(sb); - if sk_text == "ModuleName" { - module_names.push(s); - } - } - } - if !cur.goto_next_sibling() { - break; - } - } - } - return; // don't recurse further into this hash_entry - } - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - psd1_find_modulename_entries(cur.node(), source, module_names, inside_hash); - if !cur.goto_next_sibling() { - break; - } - } - } -} - -/// Push a `file -> module` `imports_from` edge for a raw `.psd1` module value. -fn add_psd1_import_edge( - edges: &mut Vec, - file_nid: &str, - str_path: &str, - module_raw: &str, - line: usize, -) { - let name = psd1_module_name(module_raw); - if name.is_empty() { - return; - } - edges.push(Edge { - external: false, - source: file_nid.to_string(), - target: make_id1(&name), - relation: "imports_from".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: str_path.to_string(), - source_location: Some(format!("L{line}")), - weight: 1.0, - context: Some("import".to_string()), - confidence_score: None, - }); -} - -/// Walk a `.psd1` AST, emitting `imports_from` edges for `RootModule`, -/// `NestedModules`, and `RequiredModules` entries. Mirrors `walk_manifest`. 
-fn walk_psd1_manifest( - node: tree_sitter::Node<'_>, - source: &[u8], - file_nid: &str, - str_path: &str, - edges: &mut Vec, -) { - if node.kind() != "hash_entry" { - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - walk_psd1_manifest(cur.node(), source, file_nid, str_path, edges); - if !cur.goto_next_sibling() { - break; - } - } - } - return; - } - let Some(key_node) = first_child_kind(node, "key_expression") else { - return; - }; - let key_text = read_text(key_node, source).trim().to_string(); - if !PSD1_IMPORT_KEYS.contains(key_text.as_str()) { - return; - } - let line = node.start_position().row + 1; - let Some(value_node) = first_child_kind(node, "pipeline") else { - return; - }; - match key_text.as_str() { - "RootModule" | "NestedModules" => { - let mut strings = Vec::new(); - psd1_collect_string_literals(value_node, source, &mut strings); - for s in strings { - add_psd1_import_edge(edges, file_nid, str_path, &s, line); - } - } - "RequiredModules" => { - // Two forms: plain 'Module' strings, and @{ ModuleName='Foo'; ... } - // specs (follow only ModuleName; ModuleVersion etc. are excluded). - let mut module_names = Vec::new(); - let mut inside_hash = HashSet::new(); - psd1_find_modulename_entries(value_node, source, &mut module_names, &mut inside_hash); - let mut all_strings = Vec::new(); - psd1_collect_string_nodes(value_node, source, &mut all_strings); - for (sb, s) in &all_strings { - if !inside_hash.contains(sb) { - add_psd1_import_edge(edges, file_nid, str_path, s, line); - } - } - for s in &module_names { - add_psd1_import_edge(edges, file_nid, str_path, s, line); - } - } - _ => {} - } -} - -/// Extract module dependency edges from a PowerShell `.psd1` manifest. -/// -/// `.psd1` files are PowerShell data hashtables (syntactically valid PowerShell), -/// so tree-sitter parses them. 
Emits a file node plus `imports_from` edges for -/// every module named under `RootModule`, `NestedModules`, and `RequiredModules` -/// (both the plain-string and `@{ ModuleName=... }` forms). Mirrors -/// `extract_powershell_manifest`. -#[must_use] -pub fn extract_powershell_manifest(path: &Path) -> FileResult { - let source = match std::fs::read(path) { - Ok(s) => s, - Err(e) => return FileResult::error(format!("powershell manifest read error: {e}")), - }; - let mut parser = tree_sitter::Parser::new(); - if parser - .set_language(&tree_sitter_powershell::LANGUAGE.into()) - .is_err() - { - return FileResult::error("tree_sitter_powershell language load failed"); - } - let Some(tree) = parser.parse(&source, None) else { - return FileResult::error("powershell manifest parse failed"); - }; - - let str_path = path.to_string_lossy().into_owned(); - let file_nid = make_id1(&str_path); - let nodes = vec![Node { - id: file_nid.clone(), - label: path - .file_name() - .map_or(String::new(), |f| f.to_string_lossy().into_owned()), - file_type: "code".to_string(), - source_file: str_path.clone(), - source_location: Some("L1".to_string()), - metadata: None, - }]; - let mut edges: Vec = Vec::new(); - walk_psd1_manifest(tree.root_node(), &source, &file_nid, &str_path, &mut edges); - - FileResult { - nodes, - edges, - raw_calls: vec![], - error: None, - } -} diff --git a/crates/graphify-extract/src/extractors/python_rationale.rs b/crates/graphify-extract/src/extractors/python_rationale.rs new file mode 100644 index 0000000..b4bc117 --- /dev/null +++ b/crates/graphify-extract/src/extractors/python_rationale.rs @@ -0,0 +1,306 @@ +//! Python rationale extraction (docstrings + RATIONALE-prefixed comments). 
+ +use std::path::Path; + +use crate::types::FileResult; + +const RATIONALE_PREFIXES: &[&str] = &[ + "# NOTE:", + "# IMPORTANT:", + "# HACK:", + "# WHY:", + "# RATIONALE:", + "# TODO:", + "# FIXME:", +]; + +/// Augment a Python extraction result with rationale nodes sourced from docstrings and comments. +/// +/// Walks the file's AST for module, class, and function docstrings (> 20 chars) and scans +/// source lines for `RATIONALE_PREFIXES` comments. Each rationale becomes a node of +/// `file_type = "rationale"` connected via a `rationale_for` edge to the containing entity. +/// Auto-generated files (migrations, protobuf, Alembic) are silently skipped. +/// Mirrors Python `_extract_rationale`. +pub(super) fn extract_python_rationale(path: &Path, result: &mut FileResult) { + use crate::ids::{file_stem, make_id, make_id1}; + use crate::types::{Edge, Node}; + use std::collections::HashSet; + use tree_sitter::Parser; + + let Ok(source) = std::fs::read(path) else { + return; + }; + + let mut parser = Parser::new(); + if parser + .set_language(&tree_sitter_python::LANGUAGE.into()) + .is_err() + { + return; + } + let Some(tree) = parser.parse(&source, None) else { + return; + }; + + let stem = file_stem(path); + let str_path = path.to_string_lossy().into_owned(); + let file_nid = make_id1(&str_path); + let mut seen_ids: HashSet = result.nodes.iter().map(|n| n.id.clone()).collect(); + + let add_rationale = |text: &str, + line: u32, + parent_nid: &str, + seen: &mut HashSet, + nodes: &mut Vec, + edges: &mut Vec| { + let label: String = text + .chars() + .take(80) + .collect::() + .replace("\r\n", " ") + .replace(['\r', '\n'], " ") + .trim() + .to_string(); + let rid = make_id(&[&stem, "rationale", &line.to_string()]); + if seen.insert(rid.clone()) { + nodes.push(Node { + id: rid.clone(), + label, + file_type: "rationale".to_string(), + source_file: str_path.clone(), + source_location: Some(format!("L{line}")), + metadata: None, + }); + } + edges.push(Edge { + external: 
false, + source: rid, + target: parent_nid.to_string(), + relation: "rationale_for".to_string(), + confidence: "EXTRACTED".to_string(), + source_file: str_path.clone(), + source_location: Some(format!("L{line}")), + weight: 1.0, + context: None, + confidence_score: None, + }); + }; + + // Module-level docstring — skipped for auto-generated files (Alembic / + // Django migrations, protobuf stubs) whose module docstrings are revision + // annotations, not architectural rationale. Class/function docstrings and + // `# NOTE:`-style comments below are still extracted (Python parity). + let root = tree.root_node(); + if !is_autogenerated_python(&source) + && let Some((doc, line)) = get_docstring(root, &source) + { + add_rationale( + &doc, + line, + &file_nid, + &mut seen_ids, + &mut result.nodes, + &mut result.edges, + ); + } + + // Walk class/function docstrings + { + let mut doc_ctx = DocstringWalkCtx { + stem: &stem, + file_nid: &file_nid, + seen_ids: &mut seen_ids, + nodes: &mut result.nodes, + edges: &mut result.edges, + add_rationale: &add_rationale, + }; + walk_docstrings(&mut doc_ctx, root, &file_nid, &source); + } + + // Rationale comments + let source_text = String::from_utf8_lossy(&source).into_owned(); + for (lineno, line_text) in source_text.lines().enumerate() { + let stripped = line_text.trim(); + if RATIONALE_PREFIXES.iter().any(|p| stripped.starts_with(p)) { + add_rationale( + stripped, + u32::try_from(lineno).unwrap_or(u32::MAX).saturating_add(1), + &file_nid, + &mut seen_ids, + &mut result.nodes, + &mut result.edges, + ); + } + } +} + +/// Return `true` when the Python source is auto-generated and should not have rationale extracted. +/// +/// Checks the first 2048 bytes for `DO NOT EDIT`, `@generated`, or protobuf markers, and also +/// detects Alembic/Flask-Migrate migration files and Django migration classes. Mirrors Python +/// `_is_autogenerated`. 
+fn is_autogenerated_python(source: &[u8]) -> bool { + let head = String::from_utf8_lossy(&source[..source.len().min(2048)]).into_owned(); + if head.contains("DO NOT EDIT") + || head.contains("@generated") + || head.contains("Generated by the protocol buffer") + { + return true; + } + // Alembic / Flask-Migrate + if head.contains("def upgrade(") + && head.contains("down_revision") + && head.lines().any(|l| { + let t = l.trim(); + t.starts_with("revision") && (t.contains(':') || t.contains('=')) + }) + { + return true; + } + // Django migrations + if head.contains("class Migration(migrations.Migration)") && head.contains("operations") { + return true; + } + false +} + +/// Extract the first triple-quoted docstring from a Python AST node's first child. +/// +/// Looks for an `expression_statement` as the first child containing a `string` or +/// `concatenated_string` node; returns `(cleaned_text, line_number)` when the cleaned text +/// exceeds 20 characters (too-short strings are likely not real docstrings). 
+fn get_docstring(node: tree_sitter::Node<'_>, source: &[u8]) -> Option<(String, u32)> { + let mut cur = node.walk(); + if !cur.goto_first_child() { + return None; + } + let child = cur.node(); + if child.kind() == "expression_statement" { + let mut ecur = child.walk(); + if ecur.goto_first_child() { + loop { + let sub = ecur.node(); + if matches!(sub.kind(), "string" | "concatenated_string") { + let text = String::from_utf8_lossy(&source[sub.start_byte()..sub.end_byte()]) + .into_owned(); + let clean = text + .trim_matches('"') + .trim_matches('\'') + .trim_start_matches("\"\"\"") + .trim_end_matches("\"\"\"") + .trim_start_matches("'''") + .trim_end_matches("'''") + .trim() + .to_string(); + if clean.len() > 20 { + let row = child.start_position().row; + return Some(( + clean, + u32::try_from(row).unwrap_or(u32::MAX).saturating_add(1), + )); + } + } + if !ecur.goto_next_sibling() { + break; + } + } + } + } + None +} + +/// Recursively walk a Python AST node extracting docstrings from class and function bodies. +/// +/// For `class_definition` nodes, extracts the class body docstring and recurses into methods. +/// For `function_definition` nodes, extracts the function body docstring and stops recursing. +/// All other nodes are traversed without emitting rationale. Called by `extract_python_rationale`. +/// Shared state threaded through every [`walk_docstrings`] recursion. 
+struct DocstringWalkCtx<'a, F> +where + F: Fn( + &str, + u32, + &str, + &mut std::collections::HashSet, + &mut Vec, + &mut Vec, + ), +{ + stem: &'a str, + file_nid: &'a str, + seen_ids: &'a mut std::collections::HashSet, + nodes: &'a mut Vec, + edges: &'a mut Vec, + add_rationale: &'a F, +} + +fn walk_docstrings( + ctx: &mut DocstringWalkCtx<'_, F>, + node: tree_sitter::Node<'_>, + parent_nid: &str, + source: &[u8], +) where + F: Fn( + &str, + u32, + &str, + &mut std::collections::HashSet, + &mut Vec, + &mut Vec, + ), +{ + use crate::ids::make_id; + let t = node.kind(); + if t == "class_definition" { + if let Some(name_node) = node.child_by_field_name("name") { + let class_name = + String::from_utf8_lossy(&source[name_node.start_byte()..name_node.end_byte()]) + .into_owned(); + let nid = make_id(&[ctx.stem, &class_name]); + if let Some(body) = node.child_by_field_name("body") { + if let Some((doc, line)) = get_docstring(body, source) { + (ctx.add_rationale)(&doc, line, &nid, ctx.seen_ids, ctx.nodes, ctx.edges); + } + let mut cur = body.walk(); + if cur.goto_first_child() { + loop { + let child = cur.node(); + walk_docstrings(ctx, child, &nid, source); + if !cur.goto_next_sibling() { + break; + } + } + } + } + } + return; + } + if t == "function_definition" { + if let Some(name_node) = node.child_by_field_name("name") { + let func_name = + String::from_utf8_lossy(&source[name_node.start_byte()..name_node.end_byte()]) + .into_owned(); + let nid = if parent_nid == ctx.file_nid { + make_id(&[ctx.stem, &func_name]) + } else { + make_id(&[parent_nid, &func_name]) + }; + if let Some(body) = node.child_by_field_name("body") + && let Some((doc, line)) = get_docstring(body, source) + { + (ctx.add_rationale)(&doc, line, &nid, ctx.seen_ids, ctx.nodes, ctx.edges); + } + } + return; + } + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + let child = cur.node(); + walk_docstrings(ctx, child, parent_nid, source); + if !cur.goto_next_sibling() { + break; + } + } 
+ } +} diff --git a/crates/graphify-extract/src/extractors/rust_lang/calls.rs b/crates/graphify-extract/src/extractors/rust_lang/calls.rs new file mode 100644 index 0000000..70d8be6 --- /dev/null +++ b/crates/graphify-extract/src/extractors/rust_lang/calls.rs @@ -0,0 +1,107 @@ +//! Rust call-graph pass. + +use super::{RUST_TRAIT_METHOD_BLOCKLIST, read_text}; +use crate::types::{Edge, RawCall}; +use std::collections::{HashMap, HashSet}; + +/// Collect `calls` ctx.edges within a Rust function body's byte range. +/// +/// Recurses through the body AST, emitting `calls` ctx.edges for `call_expression` and +/// `macro_invocation` ctx.nodes whose callee matches a known NID. Mirrors Python `_walk_calls_rust`. +/// Shared state threaded through every [`walk_calls_rust`] recursion. +pub(super) struct RustCallCtx<'a> { + pub(super) str_path: &'a str, + pub(super) label_to_nid: &'a HashMap, + pub(super) edges: &'a mut Vec, + pub(super) seen_call_pairs: &'a mut HashSet<(String, String)>, + pub(super) raw_calls: &'a mut Vec, +} + +pub(super) fn walk_calls_rust( + ctx: &mut RustCallCtx<'_>, + node: tree_sitter::Node<'_>, + source: &[u8], + caller_nid: &str, + body_start: usize, + body_end: usize, +) { + if node.start_byte() >= body_end || node.end_byte() <= body_start { + return; + } + if node.kind() == "function_item" { + return; + } + + if node.kind() == "call_expression" + && let Some(func_node) = node.child_by_field_name("function") + { + let mut callee_name: Option = None; + let mut is_member_call = false; + let mut is_scoped_call = false; + match func_node.kind() { + "identifier" => { + callee_name = Some(read_text(func_node, source).to_string()); + } + "field_expression" => { + is_member_call = true; + if let Some(field) = func_node.child_by_field_name("field") { + callee_name = Some(read_text(field, source).to_string()); + } + } + "scoped_identifier" => { + is_scoped_call = true; + if let Some(name) = func_node.child_by_field_name("name") { + callee_name = 
Some(read_text(name, source).to_string()); + } + } + _ => {} + } + if let Some(cn) = callee_name { + // Resolve first so a built-in name backing a real local symbol is + // kept; only drop unresolved built-ins (god-node guard, #726). + let tgt_nid = ctx.label_to_nid.get(&cn.to_lowercase()).cloned(); + if let Some(tgt) = tgt_nid { + if tgt != caller_nid { + let pair = (caller_nid.to_string(), tgt.clone()); + if ctx.seen_call_pairs.insert(pair) { + let line = node.start_position().row + 1; + ctx.edges.push(Edge { + external: false, + source: caller_nid.to_string(), + target: tgt, + relation: "calls".to_string(), + confidence: "EXTRACTED".to_string(), + source_file: ctx.str_path.to_string(), + source_location: Some(format!("L{line}")), + weight: 1.0, + context: Some("call".to_string()), + confidence_score: None, + }); + } + } + } else if !is_scoped_call + && !RUST_TRAIT_METHOD_BLOCKLIST.contains(cn.to_lowercase().as_str()) + && !crate::builtins::is_language_builtin_global(&cn) + { + ctx.raw_calls.push(RawCall { + caller_nid: caller_nid.to_string(), + callee: cn, + is_member_call, + source_file: ctx.str_path.to_string(), + source_location: format!("L{}", node.start_position().row + 1), + receiver: None, + }); + } + } + } + + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + walk_calls_rust(ctx, cur.node(), source, caller_nid, body_start, body_end); + if !cur.goto_next_sibling() { + break; + } + } + } +} diff --git a/crates/graphify-extract/src/extractors/rust_lang/mod.rs b/crates/graphify-extract/src/extractors/rust_lang/mod.rs new file mode 100644 index 0000000..b987fcb --- /dev/null +++ b/crates/graphify-extract/src/extractors/rust_lang/mod.rs @@ -0,0 +1,169 @@ +//! Rust extractor — custom walk over tree-sitter-rust AST. 
+ +mod calls; +mod refs; +mod walk; + +use crate::ids::{file_stem, make_id1}; +use crate::types::{Edge, FileResult, Node, RawCall}; +use calls::{RustCallCtx, walk_calls_rust}; +use std::collections::{HashMap, HashSet}; +use std::path::Path; +use std::sync::LazyLock; +use walk::{RustWalkCtx, walk_rust}; + +/// Common Rust trait/stdlib method names that appear in virtually every codebase. +/// Resolving these cross-file produces spurious INFERRED edges — skip them from +/// the unresolved-call queue entirely. +static RUST_TRAIT_METHOD_BLOCKLIST: LazyLock> = LazyLock::new(|| { + [ + "new", + "default", + "parse", + "from_str", + "now", + "clone", + "into", + "from", + "to_string", + "to_owned", + "len", + "is_empty", + "iter", + "next", + "build", + "start", + "run", + "init", + "app", + "get", + "set", + "push", + "pop", + "insert", + "remove", + "contains", + "collect", + "map", + "filter", + "unwrap", + "expect", + "ok", + "err", + "some", + "none", + "send", + "recv", + "lock", + "read", + "write", + ] + .into_iter() + .collect() +}); + +/// Return the source bytes covered by `node` as a UTF-8 `&str`, or `""` on bad UTF-8. +fn read_text<'a>(node: tree_sitter::Node<'_>, source: &'a [u8]) -> &'a str { + std::str::from_utf8(&source[node.start_byte()..node.end_byte()]).unwrap_or("") +} + +/// Extract functions, structs, enums, traits, impl methods, and use declarations from a `.rs` file. 
+#[must_use] +pub fn extract_rust(path: &Path) -> FileResult { + let Some((source, tree)) = parse_rust_source(path) else { + return FileResult { + nodes: vec![], + edges: vec![], + raw_calls: vec![], + error: Some("parse failed".to_string()), + }; + }; + let stem = file_stem(path); + let str_path = path.to_string_lossy().into_owned(); + let file_nid = make_id1(&str_path); + let mut nodes: Vec = vec![Node { + id: file_nid.clone(), + label: path + .file_name() + .map_or(String::new(), |f| f.to_string_lossy().into_owned()), + file_type: "code".to_string(), + source_file: str_path.clone(), + source_location: Some("L1".to_string()), + metadata: None, + }]; + let mut edges: Vec = Vec::new(); + let mut seen_ids: HashSet = HashSet::from([file_nid.clone()]); + let mut function_bodies: Vec<(String, usize, usize)> = Vec::new(); + + { + let mut walk_ctx = RustWalkCtx { + str_path: &str_path, + stem: &stem, + file_nid: &file_nid, + nodes: &mut nodes, + edges: &mut edges, + seen_ids: &mut seen_ids, + function_bodies: &mut function_bodies, + }; + walk_rust(&mut walk_ctx, tree.root_node(), &source, None); + } + + let mut label_to_nid: HashMap = HashMap::new(); + for n in &nodes { + let normalised = n.label.trim_end_matches("()").trim_start_matches('.'); + label_to_nid.insert(normalised.to_lowercase(), n.id.clone()); + } + + let mut seen_call_pairs: HashSet<(String, String)> = HashSet::new(); + let mut raw_calls: Vec = Vec::new(); + { + let mut call_ctx = RustCallCtx { + str_path: &str_path, + label_to_nid: &label_to_nid, + edges: &mut edges, + seen_call_pairs: &mut seen_call_pairs, + raw_calls: &mut raw_calls, + }; + for (caller_nid, body_start, body_end) in &function_bodies { + walk_calls_rust( + &mut call_ctx, + tree.root_node(), + &source, + caller_nid, + *body_start, + *body_end, + ); + } + } + + crate::forward_refs::reconcile_forward_refs(&mut nodes, &mut edges); + // Validate dangling edges against the reconciled graph rather than the + // now-stale `seen_ids`, which still 
lists any placeholder ids reconcile + // folded away. + let valid_ids: HashSet = nodes.iter().map(|n| n.id.clone()).collect(); + let clean_edges: Vec = edges + .into_iter() + .filter(|e| { + valid_ids.contains(&e.source) + && (valid_ids.contains(&e.target) + || matches!(e.relation.as_str(), "imports" | "imports_from")) + }) + .collect(); + FileResult { + nodes, + edges: clean_edges, + raw_calls, + error: None, + } +} + +/// Read + tree-sitter-parse a `.rs` file. `None` on any I/O or parse error. +fn parse_rust_source(path: &Path) -> Option<(Vec, tree_sitter::Tree)> { + let source = std::fs::read(path).ok()?; + let mut parser = tree_sitter::Parser::new(); + parser + .set_language(&tree_sitter_rust::LANGUAGE.into()) + .ok()?; + let tree = parser.parse(&source, None)?; + Some((source, tree)) +} diff --git a/crates/graphify-extract/src/extractors/rust_lang/refs.rs b/crates/graphify-extract/src/extractors/rust_lang/refs.rs new file mode 100644 index 0000000..964e17c --- /dev/null +++ b/crates/graphify-extract/src/extractors/rust_lang/refs.rs @@ -0,0 +1,102 @@ +//! Rust type-reference name collectors. + +use super::read_text; + +/// Walk a Rust type expression, appending `(name, is_generic_arg)` tuples for +/// each user-defined type referenced. Primitive types are skipped. Mirrors +/// Python `_rust_collect_type_refs`. 
+pub(super) fn rust_collect_type_refs( + node: tree_sitter::Node<'_>, + source: &[u8], + generic: bool, + out: &mut Vec<(String, bool)>, +) { + match node.kind() { + "primitive_type" => {} + "type_identifier" => { + let text = read_text(node, source); + if !text.is_empty() { + out.push((text.to_string(), generic)); + } + } + "scoped_type_identifier" => { + let full = read_text(node, source); + let text = full.rsplit("::").next().unwrap_or(full); + if !text.is_empty() { + out.push((text.to_string(), generic)); + } + } + "generic_type" => { + let name_node = node.child_by_field_name("type").or_else(|| { + let mut c = node.walk(); + if c.goto_first_child() { + loop { + if matches!( + c.node().kind(), + "type_identifier" | "scoped_type_identifier" + ) { + return Some(c.node()); + } + if !c.goto_next_sibling() { + break; + } + } + } + None + }); + if let Some(nn) = name_node { + let full = read_text(nn, source); + let text = full.rsplit("::").next().unwrap_or(full); + if !text.is_empty() { + out.push((text.to_string(), generic)); + } + } + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + if cur.node().kind() == "type_arguments" { + let mut acur = cur.node().walk(); + if acur.goto_first_child() { + loop { + if acur.node().is_named() { + rust_collect_type_refs(acur.node(), source, true, out); + } + if !acur.goto_next_sibling() { + break; + } + } + } + } + if !cur.goto_next_sibling() { + break; + } + } + } + } + "reference_type" | "pointer_type" | "array_type" | "tuple_type" | "slice_type" => { + rust_recurse_named(node, source, generic, out); + } + _ if node.is_named() => rust_recurse_named(node, source, generic, out), + _ => {} + } +} + +/// Recurse `rust_collect_type_refs` over every named child of `node`. 
+fn rust_recurse_named( + node: tree_sitter::Node<'_>, + source: &[u8], + generic: bool, + out: &mut Vec<(String, bool)>, +) { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + if cur.node().is_named() { + rust_collect_type_refs(cur.node(), source, generic, out); + } + if !cur.goto_next_sibling() { + break; + } + } + } +} diff --git a/crates/graphify-extract/src/extractors/rust_lang.rs b/crates/graphify-extract/src/extractors/rust_lang/walk.rs similarity index 56% rename from crates/graphify-extract/src/extractors/rust_lang.rs rename to crates/graphify-extract/src/extractors/rust_lang/walk.rs index af8c1ab..93e8ddb 100644 --- a/crates/graphify-extract/src/extractors/rust_lang.rs +++ b/crates/graphify-extract/src/extractors/rust_lang/walk.rs @@ -1,185 +1,28 @@ -//! Rust extractor — custom walk over tree-sitter-rust AST. +//! Rust structural AST walk + type-reference edge emitters. -use std::collections::{HashMap, HashSet}; -use std::path::Path; -use std::sync::LazyLock; - -use crate::ids::{file_stem, make_id, make_id1}; -use crate::types::{Edge, FileResult, Node, RawCall}; - -/// Common Rust trait/stdlib method names that appear in virtually every codebase. -/// Resolving these cross-file produces spurious INFERRED edges — skip them from -/// the unresolved-call queue entirely. -static RUST_TRAIT_METHOD_BLOCKLIST: LazyLock> = LazyLock::new(|| { - [ - "new", - "default", - "parse", - "from_str", - "now", - "clone", - "into", - "from", - "to_string", - "to_owned", - "len", - "is_empty", - "iter", - "next", - "build", - "start", - "run", - "init", - "app", - "get", - "set", - "push", - "pop", - "insert", - "remove", - "contains", - "collect", - "map", - "filter", - "unwrap", - "expect", - "ok", - "err", - "some", - "none", - "send", - "recv", - "lock", - "read", - "write", - ] - .into_iter() - .collect() -}); - -/// Return the source bytes covered by `node` as a UTF-8 `&str`, or `""` on bad UTF-8. 
-fn read_text<'a>(node: tree_sitter::Node<'_>, source: &'a [u8]) -> &'a str { - std::str::from_utf8(&source[node.start_byte()..node.end_byte()]).unwrap_or("") -} - -/// Extract functions, structs, enums, traits, impl methods, and use declarations from a `.rs` file. -#[must_use] -pub fn extract_rust(path: &Path) -> FileResult { - let Some((source, tree)) = parse_rust_source(path) else { - return FileResult { - nodes: vec![], - edges: vec![], - raw_calls: vec![], - error: Some("parse failed".to_string()), - }; - }; - let stem = file_stem(path); - let str_path = path.to_string_lossy().into_owned(); - let file_nid = make_id1(&str_path); - let mut nodes: Vec = vec![Node { - id: file_nid.clone(), - label: path - .file_name() - .map_or(String::new(), |f| f.to_string_lossy().into_owned()), - file_type: "code".to_string(), - source_file: str_path.clone(), - source_location: Some("L1".to_string()), - metadata: None, - }]; - let mut edges: Vec = Vec::new(); - let mut seen_ids: HashSet = HashSet::from([file_nid.clone()]); - let mut function_bodies: Vec<(String, usize, usize)> = Vec::new(); - - { - let mut walk_ctx = RustWalkCtx { - str_path: &str_path, - stem: &stem, - file_nid: &file_nid, - nodes: &mut nodes, - edges: &mut edges, - seen_ids: &mut seen_ids, - function_bodies: &mut function_bodies, - }; - walk_rust(&mut walk_ctx, tree.root_node(), &source, None); - } - - let mut label_to_nid: HashMap = HashMap::new(); - for n in &nodes { - let normalised = n.label.trim_end_matches("()").trim_start_matches('.'); - label_to_nid.insert(normalised.to_lowercase(), n.id.clone()); - } - - let mut seen_call_pairs: HashSet<(String, String)> = HashSet::new(); - let mut raw_calls: Vec = Vec::new(); - { - let mut call_ctx = RustCallCtx { - str_path: &str_path, - label_to_nid: &label_to_nid, - edges: &mut edges, - seen_call_pairs: &mut seen_call_pairs, - raw_calls: &mut raw_calls, - }; - for (caller_nid, body_start, body_end) in &function_bodies { - walk_calls_rust( - &mut call_ctx, - 
tree.root_node(), - &source, - caller_nid, - *body_start, - *body_end, - ); - } - } - - crate::forward_refs::reconcile_forward_refs(&mut nodes, &mut edges); - // Validate dangling edges against the reconciled graph rather than the - // now-stale `seen_ids`, which still lists any placeholder ids reconcile - // folded away. - let valid_ids: HashSet = nodes.iter().map(|n| n.id.clone()).collect(); - let clean_edges: Vec = edges - .into_iter() - .filter(|e| { - valid_ids.contains(&e.source) - && (valid_ids.contains(&e.target) - || matches!(e.relation.as_str(), "imports" | "imports_from")) - }) - .collect(); - FileResult { - nodes, - edges: clean_edges, - raw_calls, - error: None, - } -} - -/// Read + tree-sitter-parse a `.rs` file. `None` on any I/O or parse error. -fn parse_rust_source(path: &Path) -> Option<(Vec, tree_sitter::Tree)> { - let source = std::fs::read(path).ok()?; - let mut parser = tree_sitter::Parser::new(); - parser - .set_language(&tree_sitter_rust::LANGUAGE.into()) - .ok()?; - let tree = parser.parse(&source, None)?; - Some((source, tree)) -} +use super::read_text; +use super::refs::rust_collect_type_refs; +use crate::ids::{make_id, make_id1}; +use crate::types::{Edge, Node}; +use std::collections::HashSet; /// Recursively walk a Rust AST emitting nodes for functions, structs, enums, traits, and impls. /// /// Records function body byte ranges for the subsequent call-graph pass. Handles `use_declaration` /// to produce import edges. Mirrors Python `_walk_rust`. /// Shared state threaded through every [`walk_rust`] recursion. 
-struct RustWalkCtx<'a> { - str_path: &'a str, - stem: &'a str, - file_nid: &'a str, - nodes: &'a mut Vec, - edges: &'a mut Vec, - seen_ids: &'a mut HashSet, - function_bodies: &'a mut Vec<(String, usize, usize)>, +pub(super) struct RustWalkCtx<'a> { + pub(super) str_path: &'a str, + pub(super) stem: &'a str, + pub(super) file_nid: &'a str, + pub(super) nodes: &'a mut Vec, + pub(super) edges: &'a mut Vec, + pub(super) seen_ids: &'a mut HashSet, + pub(super) function_bodies: &'a mut Vec<(String, usize, usize)>, } #[allow(clippy::too_many_lines)] // linear dispatch over Rust's AST node kinds -fn walk_rust( +pub(super) fn walk_rust( ctx: &mut RustWalkCtx<'_>, node: tree_sitter::Node<'_>, source: &[u8], @@ -320,7 +163,10 @@ fn walk_rust( .trim_end_matches('*') .trim_end_matches(':') .to_string(); - let module_name = clean.split("::").last().unwrap_or("").trim().to_string(); + // Strip any `as` alias (`use foo::bar as baz` -> `bar`). Diverges + // from graphify-py (extract.py:6813), which keeps `bar as baz`. + let base = clean.split_once(" as ").map_or(clean.as_str(), |(b, _)| b); + let module_name = base.split("::").last().unwrap_or("").trim().to_string(); if !module_name.is_empty() { let tgt_nid = make_id1(&module_name); let line = node.start_position().row + 1; @@ -358,105 +204,6 @@ fn walk_rust( } } -/// Walk a Rust type expression, appending `(name, is_generic_arg)` tuples for -/// each user-defined type referenced. Primitive types are skipped. Mirrors -/// Python `_rust_collect_type_refs`. 
-fn rust_collect_type_refs( - node: tree_sitter::Node<'_>, - source: &[u8], - generic: bool, - out: &mut Vec<(String, bool)>, -) { - match node.kind() { - "primitive_type" => {} - "type_identifier" => { - let text = read_text(node, source); - if !text.is_empty() { - out.push((text.to_string(), generic)); - } - } - "scoped_type_identifier" => { - let full = read_text(node, source); - let text = full.rsplit("::").next().unwrap_or(full); - if !text.is_empty() { - out.push((text.to_string(), generic)); - } - } - "generic_type" => { - let name_node = node.child_by_field_name("type").or_else(|| { - let mut c = node.walk(); - if c.goto_first_child() { - loop { - if matches!( - c.node().kind(), - "type_identifier" | "scoped_type_identifier" - ) { - return Some(c.node()); - } - if !c.goto_next_sibling() { - break; - } - } - } - None - }); - if let Some(nn) = name_node { - let full = read_text(nn, source); - let text = full.rsplit("::").next().unwrap_or(full); - if !text.is_empty() { - out.push((text.to_string(), generic)); - } - } - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - if cur.node().kind() == "type_arguments" { - let mut acur = cur.node().walk(); - if acur.goto_first_child() { - loop { - if acur.node().is_named() { - rust_collect_type_refs(acur.node(), source, true, out); - } - if !acur.goto_next_sibling() { - break; - } - } - } - } - if !cur.goto_next_sibling() { - break; - } - } - } - } - "reference_type" | "pointer_type" | "array_type" | "tuple_type" | "slice_type" => { - rust_recurse_named(node, source, generic, out); - } - _ if node.is_named() => rust_recurse_named(node, source, generic, out), - _ => {} - } -} - -/// Recurse `rust_collect_type_refs` over every named child of `node`. 
-fn rust_recurse_named( - node: tree_sitter::Node<'_>, - source: &[u8], - generic: bool, - out: &mut Vec<(String, bool)>, -) { - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - if cur.node().is_named() { - rust_collect_type_refs(cur.node(), source, generic, out); - } - if !cur.goto_next_sibling() { - break; - } - } - } -} - impl RustWalkCtx<'_> { /// Return the NID for a named type, creating a bare placeholder node when no /// file-qualified node already exists. Mirrors Rust's `ensure_named_node`. @@ -713,105 +460,3 @@ fn emit_rust_impl_trait( } } } - -/// Collect `calls` ctx.edges within a Rust function body's byte range. -/// -/// Recurses through the body AST, emitting `calls` ctx.edges for `call_expression` and -/// `macro_invocation` ctx.nodes whose callee matches a known NID. Mirrors Python `_walk_calls_rust`. -/// Shared state threaded through every [`walk_calls_rust`] recursion. -struct RustCallCtx<'a> { - str_path: &'a str, - label_to_nid: &'a HashMap, - edges: &'a mut Vec, - seen_call_pairs: &'a mut HashSet<(String, String)>, - raw_calls: &'a mut Vec, -} - -fn walk_calls_rust( - ctx: &mut RustCallCtx<'_>, - node: tree_sitter::Node<'_>, - source: &[u8], - caller_nid: &str, - body_start: usize, - body_end: usize, -) { - if node.start_byte() >= body_end || node.end_byte() <= body_start { - return; - } - if node.kind() == "function_item" { - return; - } - - if node.kind() == "call_expression" - && let Some(func_node) = node.child_by_field_name("function") - { - let mut callee_name: Option = None; - let mut is_member_call = false; - let mut is_scoped_call = false; - match func_node.kind() { - "identifier" => { - callee_name = Some(read_text(func_node, source).to_string()); - } - "field_expression" => { - is_member_call = true; - if let Some(field) = func_node.child_by_field_name("field") { - callee_name = Some(read_text(field, source).to_string()); - } - } - "scoped_identifier" => { - is_scoped_call = true; - if let Some(name) = 
func_node.child_by_field_name("name") { - callee_name = Some(read_text(name, source).to_string()); - } - } - _ => {} - } - if let Some(cn) = callee_name { - // Resolve first so a built-in name backing a real local symbol is - // kept; only drop unresolved built-ins (god-node guard, #726). - let tgt_nid = ctx.label_to_nid.get(&cn.to_lowercase()).cloned(); - if let Some(tgt) = tgt_nid { - if tgt != caller_nid { - let pair = (caller_nid.to_string(), tgt.clone()); - if ctx.seen_call_pairs.insert(pair) { - let line = node.start_position().row + 1; - ctx.edges.push(Edge { - external: false, - source: caller_nid.to_string(), - target: tgt, - relation: "calls".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: ctx.str_path.to_string(), - source_location: Some(format!("L{line}")), - weight: 1.0, - context: Some("call".to_string()), - confidence_score: None, - }); - } - } - } else if !is_scoped_call - && !RUST_TRAIT_METHOD_BLOCKLIST.contains(cn.to_lowercase().as_str()) - && !crate::builtins::is_language_builtin_global(&cn) - { - ctx.raw_calls.push(RawCall { - caller_nid: caller_nid.to_string(), - callee: cn, - is_member_call, - source_file: ctx.str_path.to_string(), - source_location: format!("L{}", node.start_position().row + 1), - receiver: None, - }); - } - } - } - - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - walk_calls_rust(ctx, cur.node(), source, caller_nid, body_start, body_end); - if !cur.goto_next_sibling() { - break; - } - } - } -} diff --git a/crates/graphify-extract/src/extractors/sql/mod.rs b/crates/graphify-extract/src/extractors/sql/mod.rs new file mode 100644 index 0000000..d00db1d --- /dev/null +++ b/crates/graphify-extract/src/extractors/sql/mod.rs @@ -0,0 +1,245 @@ +//! SQL extractor — tables, views, functions, triggers, and relationships. 
+ +mod refs; +mod walk; + +use std::collections::HashSet; +use std::path::Path; +use std::sync::LazyLock; + +use crate::ids::{file_stem, make_id, make_id1}; +use crate::types::{Edge, FileResult, Node}; +use regex::Regex; +use walk::{SqlWalkCtx, walk_sql}; + +/// Matches `REFERENCES ` in SQL fragments for FK extraction. +#[allow(clippy::expect_used)] // literal pattern; build cannot panic +static SQL_REF_RE: LazyLock = LazyLock::new(|| { + Regex::new(r"(?i)\bREFERENCES\s+([\w$]+)").expect("static sql references regex") +}); + +#[allow(clippy::expect_used)] // literal pattern; build cannot panic +static SQL_FB_HDR_RE: LazyLock = LazyLock::new(|| { + Regex::new( + r"(?i)CREATE\s+(?:OR\s+(?:REPLACE|ALTER)\s+)?(PROCEDURE|TRIGGER|FUNCTION)\s+([\w$]+)", + ) + .expect("static sql fb-header regex") +}); + +#[allow(clippy::expect_used)] // literal pattern; build cannot panic +static SQL_FOR_RE: LazyLock = + LazyLock::new(|| Regex::new(r"(?i)\bFOR\s+([\w$]+)").expect("static sql for regex")); + +#[allow(clippy::expect_used)] // literal pattern; build cannot panic +static SQL_FROM_RE: LazyLock = LazyLock::new(|| { + Regex::new(r"(?i)\b(?:FROM|JOIN|INTO)\s+([\w$]+)").expect("static sql from regex") +}); + +#[allow(clippy::expect_used)] // literal pattern; build cannot panic +static SQL_UPDATE_RE: LazyLock = + LazyLock::new(|| Regex::new(r"(?i)\bUPDATE\s+([\w$]+)").expect("static sql update regex")); + +static SQL_NON_TABLES: LazyLock> = LazyLock::new(|| { + [ + "select", "where", "set", "dual", "null", "true", "false", "first", "skip", "rows", "next", + "only", "lateral", + ] + .into_iter() + .collect() +}); + +#[allow(clippy::expect_used)] // literal pattern; build cannot panic +static SQL_CREATE_TABLE_RE: LazyLock = LazyLock::new(|| { + Regex::new(r"(?i)CREATE\s+TABLE\s+([\w$]+)\s*\(").expect("static sql create-table regex") +}); + +#[allow(clippy::expect_used)] // literal pattern; build cannot panic +static SQL_END_RE: LazyLock = LazyLock::new(|| { + 
Regex::new(r"(?i)(?:^|\n)(?:CREATE|SET\s+TERM|ALTER)\s").expect("static sql end regex") +}); + +/// Return the source bytes covered by `node` as a UTF-8 `&str`, or `""` on bad UTF-8. +fn read_text<'a>(node: tree_sitter::Node<'_>, source: &'a [u8]) -> &'a str { + std::str::from_utf8(&source[node.start_byte()..node.end_byte()]).unwrap_or("") +} + +/// Extract the text of the first `object_reference` child of `n`. +/// +/// Used to pull table/view names from SQL DDL statement nodes such as +/// `create_table_statement`, `create_view_statement`, etc. +fn obj_name<'a>(n: tree_sitter::Node<'_>, source: &'a [u8]) -> Option<&'a str> { + let mut cur = n.walk(); + n.children(&mut cur) + .find(|c| c.kind() == "object_reference") + .map(|c| read_text(c, source)) +} + +/// Extract tables, views, functions, and relationships from `.sql` files via tree-sitter. +#[must_use] +pub fn extract_sql(path: &Path) -> FileResult { + let source = match std::fs::read(path) { + Ok(b) => b, + Err(e) => { + return FileResult { + nodes: vec![], + edges: vec![], + raw_calls: vec![], + error: Some(e.to_string()), + }; + } + }; + extract_sql_from_source(path, &source) +} + +/// Like [`extract_sql`] but parses in-memory `content` while still attributing +/// nodes/edges to `path`. Used by `--postgres` introspection, which reconstructs +/// DDL in memory and attributes it to a virtual `postgresql://host/db` path. +/// Mirrors the `content=` parameter of graphify-py's `extract_sql`. +#[must_use] +pub fn extract_sql_with_content(path: &Path, content: &[u8]) -> FileResult { + extract_sql_from_source(path, content) +} + +/// Shared body of [`extract_sql`] / [`extract_sql_with_content`]: parse `source` +/// and build the graph, attributing every node/edge to `path`. 
+#[allow(clippy::too_many_lines)] +fn extract_sql_from_source(path: &Path, source: &[u8]) -> FileResult { + let mut parser = tree_sitter::Parser::new(); + if parser + .set_language(&tree_sitter_sequel::LANGUAGE.into()) + .is_err() + { + return FileResult { + nodes: vec![], + edges: vec![], + raw_calls: vec![], + error: Some("failed to set sql language".to_string()), + }; + } + let Some(tree) = parser.parse(source, None) else { + return FileResult { + nodes: vec![], + edges: vec![], + raw_calls: vec![], + error: Some("parse failed".to_string()), + }; + }; + + let stem = file_stem(path); + let str_path = path.to_string_lossy().into_owned(); + + let mut nodes: Vec = Vec::new(); + let mut edges: Vec = Vec::new(); + let mut seen_ids: HashSet = HashSet::new(); + let mut table_nids: std::collections::HashMap = + std::collections::HashMap::new(); + + let file_nid = make_id1(&str_path); + seen_ids.insert(file_nid.clone()); + nodes.push(Node { + id: file_nid.clone(), + label: path + .file_name() + .map_or(String::new(), |f| f.to_string_lossy().into_owned()), + file_type: "code".to_string(), + source_file: str_path.clone(), + source_location: None, + metadata: None, + }); + + let root = tree.root_node(); + + // Walk top-level statements + let mut cur = root.walk(); + if cur.goto_first_child() { + let mut walk_ctx = SqlWalkCtx { + str_path: &str_path, + stem: &stem, + file_nid: &file_nid, + nodes: &mut nodes, + edges: &mut edges, + seen_ids: &mut seen_ids, + table_nids: &mut table_nids, + }; + loop { + let stmt = cur.node(); + if stmt.kind() == "statement" { + let mut sc = stmt.walk(); + if sc.goto_first_child() { + loop { + walk_sql(&mut walk_ctx, sc.node(), source); + if !sc.goto_next_sibling() { + break; + } + } + } + } else if matches!( + stmt.kind(), + "fb_proc_or_trigger" | "set_term" | "declare_external_function" + ) { + walk_sql(&mut walk_ctx, stmt, source); + } + if !cur.goto_next_sibling() { + break; + } + } + } + + // Global regex fallback for REFERENCES not 
captured by the tree + let src_text = String::from_utf8_lossy(source).into_owned(); + let emitted: HashSet<(String, String)> = edges + .iter() + .filter(|e| e.relation == "references") + .map(|e| (e.source.clone(), e.target.clone())) + .collect(); + let mut emitted = emitted; + + for m in SQL_CREATE_TABLE_RE.find_iter(&src_text) { + let cap = SQL_CREATE_TABLE_RE + .captures(&src_text[m.start()..]) + .and_then(|c| c.get(1).map(|g| g.as_str().to_string())); + let Some(tbl_name) = cap else { continue }; + let Some(tbl_nid) = table_nids.get(&tbl_name.to_lowercase()).cloned() else { + continue; + }; + let tbl_line = src_text[..m.start()].chars().filter(|&c| c == '\n').count() + 1; + let tail = &src_text[m.start()..]; + let block_end = SQL_END_RE + .find(&tail[1..]) + .map_or(tail.len(), |em| em.start() + 1); + let block = &tail[..block_end]; + for rm in SQL_REF_RE.find_iter(block) { + let rcap = SQL_REF_RE + .captures(&block[rm.start()..]) + .and_then(|c| c.get(1).map(|g| g.as_str().to_string())); + let Some(ref_name) = rcap else { continue }; + let ref_nid = table_nids + .get(&ref_name.to_lowercase()) + .cloned() + .unwrap_or_else(|| make_id(&[&stem, &ref_name])); + let key = (tbl_nid.clone(), ref_nid.clone()); + if !emitted.contains(&key) { + emitted.insert(key); + edges.push(Edge { + external: false, + source: tbl_nid.clone(), + target: ref_nid, + relation: "references".to_string(), + confidence: "EXTRACTED".to_string(), + source_file: str_path.clone(), + source_location: Some(format!("L{tbl_line}")), + weight: 1.0, + context: None, + confidence_score: None, + }); + } + } + } + + FileResult { + nodes, + edges, + raw_calls: vec![], + error: None, + } +} diff --git a/crates/graphify-extract/src/extractors/sql/refs.rs b/crates/graphify-extract/src/extractors/sql/refs.rs new file mode 100644 index 0000000..5f42cf4 --- /dev/null +++ b/crates/graphify-extract/src/extractors/sql/refs.rs @@ -0,0 +1,65 @@ +//! SQL FROM/JOIN data-flow reference edges. 
+ +use super::read_text; +use crate::ids::make_id; +use crate::types::Edge; + +/// Recursively walk a SQL AST finding `FROM` and `JOIN` clauses and emitting `references` edges. +/// +/// Used to add query-time data-flow edges from functions/views to the tables they read. +/// Mirrors Python `_walk_from_refs`. +pub(super) fn walk_from_refs( + node: tree_sitter::Node<'_>, + source: &[u8], + str_path: &str, + stem: &str, + caller_nid: &str, + edges: &mut Vec, +) { + if matches!(node.kind(), "from" | "join") { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + if cur.node().kind() == "relation" { + let mut rc = cur.node().walk(); + if rc.goto_first_child() { + loop { + if rc.node().kind() == "object_reference" { + let tbl = read_text(rc.node(), source); + let tbl_nid = make_id(&[stem, tbl]); + let line = rc.node().start_position().row + 1; + edges.push(Edge { + external: false, + source: caller_nid.to_string(), + target: tbl_nid, + relation: "reads_from".to_string(), + confidence: "EXTRACTED".to_string(), + source_file: str_path.to_string(), + source_location: Some(format!("L{line}")), + weight: 1.0, + context: None, + confidence_score: None, + }); + } + if !rc.goto_next_sibling() { + break; + } + } + } + } + if !cur.goto_next_sibling() { + break; + } + } + } + } + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + walk_from_refs(cur.node(), source, str_path, stem, caller_nid, edges); + if !cur.goto_next_sibling() { + break; + } + } + } +} diff --git a/crates/graphify-extract/src/extractors/sql.rs b/crates/graphify-extract/src/extractors/sql/walk.rs similarity index 67% rename from crates/graphify-extract/src/extractors/sql.rs rename to crates/graphify-extract/src/extractors/sql/walk.rs index 79b466c..548e353 100644 --- a/crates/graphify-extract/src/extractors/sql.rs +++ b/crates/graphify-extract/src/extractors/sql/walk.rs @@ -1,252 +1,13 @@ -//! SQL extractor — tables, views, functions, triggers, and relationships. +//! 
SQL structural AST walk (tables, views, functions, triggers). +use super::refs::walk_from_refs; +use super::{ + SQL_FB_HDR_RE, SQL_FOR_RE, SQL_FROM_RE, SQL_NON_TABLES, SQL_REF_RE, SQL_UPDATE_RE, obj_name, + read_text, +}; +use crate::ids::make_id; +use crate::types::{Edge, Node}; use std::collections::HashSet; -use std::path::Path; -use std::sync::LazyLock; - -use crate::ids::{file_stem, make_id, make_id1}; -use crate::types::{Edge, FileResult, Node}; -use regex::Regex; - -/// Matches `REFERENCES
` in SQL fragments for FK extraction. -#[allow(clippy::expect_used)] // literal pattern; build cannot panic -static SQL_REF_RE: LazyLock = LazyLock::new(|| { - Regex::new(r"(?i)\bREFERENCES\s+([\w$]+)").expect("static sql references regex") -}); - -#[allow(clippy::expect_used)] // literal pattern; build cannot panic -static SQL_FB_HDR_RE: LazyLock = LazyLock::new(|| { - Regex::new( - r"(?i)CREATE\s+(?:OR\s+(?:REPLACE|ALTER)\s+)?(PROCEDURE|TRIGGER|FUNCTION)\s+([\w$]+)", - ) - .expect("static sql fb-header regex") -}); - -#[allow(clippy::expect_used)] // literal pattern; build cannot panic -static SQL_FOR_RE: LazyLock = - LazyLock::new(|| Regex::new(r"(?i)\bFOR\s+([\w$]+)").expect("static sql for regex")); - -#[allow(clippy::expect_used)] // literal pattern; build cannot panic -static SQL_FROM_RE: LazyLock = LazyLock::new(|| { - Regex::new(r"(?i)\b(?:FROM|JOIN|INTO)\s+([\w$]+)").expect("static sql from regex") -}); - -#[allow(clippy::expect_used)] // literal pattern; build cannot panic -static SQL_UPDATE_RE: LazyLock = - LazyLock::new(|| Regex::new(r"(?i)\bUPDATE\s+([\w$]+)").expect("static sql update regex")); - -static SQL_NON_TABLES: LazyLock> = LazyLock::new(|| { - [ - "select", "where", "set", "dual", "null", "true", "false", "first", "skip", "rows", "next", - "only", "lateral", - ] - .into_iter() - .collect() -}); - -#[allow(clippy::expect_used)] // literal pattern; build cannot panic -static SQL_CREATE_TABLE_RE: LazyLock = LazyLock::new(|| { - Regex::new(r"(?i)CREATE\s+TABLE\s+([\w$]+)\s*\(").expect("static sql create-table regex") -}); - -#[allow(clippy::expect_used)] // literal pattern; build cannot panic -static SQL_END_RE: LazyLock = LazyLock::new(|| { - Regex::new(r"(?i)(?:^|\n)(?:CREATE|SET\s+TERM|ALTER)\s").expect("static sql end regex") -}); - -/// Return the source bytes covered by `node` as a UTF-8 `&str`, or `""` on bad UTF-8. 
-fn read_text<'a>(node: tree_sitter::Node<'_>, source: &'a [u8]) -> &'a str { - std::str::from_utf8(&source[node.start_byte()..node.end_byte()]).unwrap_or("") -} - -/// Extract the text of the first `object_reference` child of `n`. -/// -/// Used to pull table/view names from SQL DDL statement nodes such as -/// `create_table_statement`, `create_view_statement`, etc. -fn obj_name<'a>(n: tree_sitter::Node<'_>, source: &'a [u8]) -> Option<&'a str> { - let mut cur = n.walk(); - if cur.goto_first_child() { - loop { - if cur.node().kind() == "object_reference" { - return Some(read_text(cur.node(), source)); - } - if !cur.goto_next_sibling() { - break; - } - } - } - None -} - -/// Extract tables, views, functions, and relationships from `.sql` files via tree-sitter. -#[must_use] -pub fn extract_sql(path: &Path) -> FileResult { - let source = match std::fs::read(path) { - Ok(b) => b, - Err(e) => { - return FileResult { - nodes: vec![], - edges: vec![], - raw_calls: vec![], - error: Some(e.to_string()), - }; - } - }; - extract_sql_from_source(path, &source) -} - -/// Like [`extract_sql`] but parses in-memory `content` while still attributing -/// nodes/edges to `path`. Used by `--postgres` introspection, which reconstructs -/// DDL in memory and attributes it to a virtual `postgresql://host/db` path. -/// Mirrors the `content=` parameter of graphify-py's `extract_sql`. -#[must_use] -pub fn extract_sql_with_content(path: &Path, content: &[u8]) -> FileResult { - extract_sql_from_source(path, content) -} - -/// Shared body of [`extract_sql`] / [`extract_sql_with_content`]: parse `source` -/// and build the graph, attributing every node/edge to `path`. 
-#[allow(clippy::too_many_lines)] -fn extract_sql_from_source(path: &Path, source: &[u8]) -> FileResult { - let mut parser = tree_sitter::Parser::new(); - if parser - .set_language(&tree_sitter_sequel::LANGUAGE.into()) - .is_err() - { - return FileResult { - nodes: vec![], - edges: vec![], - raw_calls: vec![], - error: Some("failed to set sql language".to_string()), - }; - } - let Some(tree) = parser.parse(source, None) else { - return FileResult { - nodes: vec![], - edges: vec![], - raw_calls: vec![], - error: Some("parse failed".to_string()), - }; - }; - - let stem = file_stem(path); - let str_path = path.to_string_lossy().into_owned(); - - let mut nodes: Vec = Vec::new(); - let mut edges: Vec = Vec::new(); - let mut seen_ids: HashSet = HashSet::new(); - let mut table_nids: std::collections::HashMap = - std::collections::HashMap::new(); - - let file_nid = make_id1(&str_path); - seen_ids.insert(file_nid.clone()); - nodes.push(Node { - id: file_nid.clone(), - label: path - .file_name() - .map_or(String::new(), |f| f.to_string_lossy().into_owned()), - file_type: "code".to_string(), - source_file: str_path.clone(), - source_location: None, - metadata: None, - }); - - let root = tree.root_node(); - - // Walk top-level statements - let mut cur = root.walk(); - if cur.goto_first_child() { - let mut walk_ctx = SqlWalkCtx { - str_path: &str_path, - stem: &stem, - file_nid: &file_nid, - nodes: &mut nodes, - edges: &mut edges, - seen_ids: &mut seen_ids, - table_nids: &mut table_nids, - }; - loop { - let stmt = cur.node(); - if stmt.kind() == "statement" { - let mut sc = stmt.walk(); - if sc.goto_first_child() { - loop { - walk_sql(&mut walk_ctx, sc.node(), source); - if !sc.goto_next_sibling() { - break; - } - } - } - } else if matches!( - stmt.kind(), - "fb_proc_or_trigger" | "set_term" | "declare_external_function" - ) { - walk_sql(&mut walk_ctx, stmt, source); - } - if !cur.goto_next_sibling() { - break; - } - } - } - - // Global regex fallback for REFERENCES not 
captured by the tree - let src_text = String::from_utf8_lossy(source).into_owned(); - let emitted: HashSet<(String, String)> = edges - .iter() - .filter(|e| e.relation == "references") - .map(|e| (e.source.clone(), e.target.clone())) - .collect(); - let mut emitted = emitted; - - for m in SQL_CREATE_TABLE_RE.find_iter(&src_text) { - let cap = SQL_CREATE_TABLE_RE - .captures(&src_text[m.start()..]) - .and_then(|c| c.get(1).map(|g| g.as_str().to_string())); - let Some(tbl_name) = cap else { continue }; - let Some(tbl_nid) = table_nids.get(&tbl_name.to_lowercase()).cloned() else { - continue; - }; - let tbl_line = src_text[..m.start()].chars().filter(|&c| c == '\n').count() + 1; - let tail = &src_text[m.start()..]; - let block_end = SQL_END_RE - .find(&tail[1..]) - .map_or(tail.len(), |em| em.start() + 1); - let block = &tail[..block_end]; - for rm in SQL_REF_RE.find_iter(block) { - let rcap = SQL_REF_RE - .captures(&block[rm.start()..]) - .and_then(|c| c.get(1).map(|g| g.as_str().to_string())); - let Some(ref_name) = rcap else { continue }; - let ref_nid = table_nids - .get(&ref_name.to_lowercase()) - .cloned() - .unwrap_or_else(|| make_id(&[&stem, &ref_name])); - let key = (tbl_nid.clone(), ref_nid.clone()); - if !emitted.contains(&key) { - emitted.insert(key); - edges.push(Edge { - external: false, - source: tbl_nid.clone(), - target: ref_nid, - relation: "references".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: str_path.clone(), - source_location: Some(format!("L{tbl_line}")), - weight: 1.0, - context: None, - confidence_score: None, - }); - } - } - } - - FileResult { - nodes, - edges, - raw_calls: vec![], - error: None, - } -} /// Recursively walk a SQL AST emitting nodes for tables, views, and functions. /// @@ -254,18 +15,18 @@ fn extract_sql_from_source(path: &Path, source: &[u8]) -> FileResult { /// and `create_procedure_statement`. Also records `table_nids` for use by `walk_from_refs`. /// Mirrors Python `_walk_sql`. 
/// Shared state threaded through every [`walk_sql`] recursion. -struct SqlWalkCtx<'a> { - str_path: &'a str, - stem: &'a str, - file_nid: &'a str, - nodes: &'a mut Vec, - edges: &'a mut Vec, - seen_ids: &'a mut HashSet, - table_nids: &'a mut std::collections::HashMap, +pub(super) struct SqlWalkCtx<'a> { + pub(super) str_path: &'a str, + pub(super) stem: &'a str, + pub(super) file_nid: &'a str, + pub(super) nodes: &'a mut Vec, + pub(super) edges: &'a mut Vec, + pub(super) seen_ids: &'a mut HashSet, + pub(super) table_nids: &'a mut std::collections::HashMap, } #[allow(clippy::too_many_lines)] // linear dispatch over SQL's AST node kinds -fn walk_sql(ctx: &mut SqlWalkCtx<'_>, node: tree_sitter::Node<'_>, source: &[u8]) { +pub(super) fn walk_sql(ctx: &mut SqlWalkCtx<'_>, node: tree_sitter::Node<'_>, source: &[u8]) { let t = node.kind(); let line = node.start_position().row + 1; @@ -681,6 +442,9 @@ fn walk_sql(ctx: &mut SqlWalkCtx<'_>, node: tree_sitter::Node<'_>, source: &[u8] }); } } + // Divergence from graphify-py (extract.py:5702), which labels + // UPDATE targets `reads_from`: an UPDATE mutates its target, so + // emit `writes_to` for the correct data-flow direction. for rm in SQL_UPDATE_RE.captures_iter(text) { let tbl = rm[1].to_string(); if !SQL_NON_TABLES.contains(tbl.to_lowercase().as_str()) @@ -696,7 +460,7 @@ fn walk_sql(ctx: &mut SqlWalkCtx<'_>, node: tree_sitter::Node<'_>, source: &[u8] external: false, source: obj_nid.clone(), target: tbl_nid, - relation: "reads_from".to_string(), + relation: "writes_to".to_string(), confidence: "EXTRACTED".to_string(), source_file: ctx.str_path.to_string(), source_location: Some(format!("L{line}")), @@ -721,63 +485,3 @@ fn walk_sql(ctx: &mut SqlWalkCtx<'_>, node: tree_sitter::Node<'_>, source: &[u8] } } } - -/// Recursively walk a SQL AST finding `FROM` and `JOIN` clauses and emitting `references` edges. -/// -/// Used to add query-time data-flow edges from functions/views to the tables they read. 
-/// Mirrors Python `_walk_from_refs`. -fn walk_from_refs( - node: tree_sitter::Node<'_>, - source: &[u8], - str_path: &str, - stem: &str, - caller_nid: &str, - edges: &mut Vec, -) { - if matches!(node.kind(), "from" | "join") { - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - if cur.node().kind() == "relation" { - let mut rc = cur.node().walk(); - if rc.goto_first_child() { - loop { - if rc.node().kind() == "object_reference" { - let tbl = read_text(rc.node(), source); - let tbl_nid = make_id(&[stem, tbl]); - let line = rc.node().start_position().row + 1; - edges.push(Edge { - external: false, - source: caller_nid.to_string(), - target: tbl_nid, - relation: "reads_from".to_string(), - confidence: "EXTRACTED".to_string(), - source_file: str_path.to_string(), - source_location: Some(format!("L{line}")), - weight: 1.0, - context: None, - confidence_score: None, - }); - } - if !rc.goto_next_sibling() { - break; - } - } - } - } - if !cur.goto_next_sibling() { - break; - } - } - } - } - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - walk_from_refs(cur.node(), source, str_path, stem, caller_nid, edges); - if !cur.goto_next_sibling() { - break; - } - } - } -} diff --git a/crates/graphify-extract/src/generic/graph.rs b/crates/graphify-extract/src/generic/graph.rs new file mode 100644 index 0000000..1a1bd2c --- /dev/null +++ b/crates/graphify-extract/src/generic/graph.rs @@ -0,0 +1,175 @@ +//! Low-level graph + AST primitives shared across the generic extractor. +//! +//! `add_node` / `add_edge` build the node/edge lists; the AST helpers +//! (`named_children`, `first_child_kind`, `any_child_kind`, `find_body`, +//! `ensure_named_node`) are reused by every per-language submodule. 
+ +#![allow(clippy::cast_possible_truncation)] + +use super::config::LangConfig; +use crate::ids::{make_id, make_id1}; +use crate::types::{Edge, Node as GNode}; +use std::collections::HashSet; +use tree_sitter::Node; + +/// Insert a new graph node if `nid` has not been seen before. +/// +/// The `seen_ids` set is the deduplication gate — a second call with the same +/// `nid` is silently dropped so that multiple structural passes (e.g. +/// file-level node + function-level) cannot produce duplicate node entries. +pub(crate) fn add_node( + nid: &str, + label: &str, + line: u32, + str_path: &str, + nodes: &mut Vec, + seen_ids: &mut HashSet, +) { + if seen_ids.insert(nid.to_string()) { + nodes.push(GNode { + id: nid.to_string(), + label: label.to_string(), + file_type: "code".to_string(), + source_file: str_path.to_string(), + source_location: Some(format!("L{line}")), + metadata: None, + }); + } +} + +/// Append an edge to the edge list. +/// +/// Unlike nodes, edges are not deduplicated here — the caller is responsible +/// for deduplication via `seen_call_pairs` or the final clean pass in +/// [`extract_generic`]. +pub(crate) fn add_edge( + src: &str, + tgt: &str, + relation: &str, + line: u32, + str_path: &str, + context: Option<&str>, + edges: &mut Vec, +) { + edges.push(Edge { + external: false, + source: src.to_string(), + target: tgt.to_string(), + relation: relation.to_string(), + confidence: "EXTRACTED".to_string(), + source_file: str_path.to_string(), + source_location: Some(format!("L{line}")), + weight: 1.0, + context: context.map(str::to_string), + confidence_score: None, + }); +} + +// ── Small AST helpers ────────────────────────────────────────────────────────── + +/// Collect the named children of `node` into a `Vec`. 
+#[must_use] +pub(crate) fn named_children(node: Node<'_>) -> Vec> { + let mut out = Vec::new(); + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + if cur.node().is_named() { + out.push(cur.node()); + } + if !cur.goto_next_sibling() { + break; + } + } + } + out +} + +/// Return the first child of `node` whose kind is `kind`. +#[must_use] +pub(crate) fn first_child_kind<'tree>(node: Node<'tree>, kind: &str) -> Option> { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + if cur.node().kind() == kind { + return Some(cur.node()); + } + if !cur.goto_next_sibling() { + break; + } + } + } + None +} + +/// `true` if any child of `node` has the given `kind` (allocation-free). +#[must_use] +pub(crate) fn any_child_kind(node: Node<'_>, kind: &str) -> bool { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + if cur.node().kind() == kind { + return true; + } + if !cur.goto_next_sibling() { + break; + } + } + } + false +} + +// ── Body finder ─────────────────────────────────────────────────────────────── + +/// Locate the body child of a class or function node. +/// +/// First tries the grammar's `body` field; falls back to scanning for a child +/// whose kind appears in `config.body_fallback_child_types`. The fallback is +/// needed for languages like Kotlin whose grammar uses `class_body` or +/// `function_body` node types rather than a named field. 
+#[must_use] +pub(crate) fn find_body<'tree>(node: Node<'tree>, config: &LangConfig) -> Option> { + if let Some(b) = node.child_by_field_name(config.body_field) { + return Some(b); + } + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + let child = cur.node(); + if config.body_fallback_child_types.contains(&child.kind()) { + return Some(child); + } + if !cur.goto_next_sibling() { + break; + } + } + } + None +} + +// ── ensure_named_node ───────────────────────────────────────────────────────── + +/// Return the NID for a named entity, creating a placeholder node if needed. +/// +/// First checks for a file-qualified ID (`_`); if already seen, +/// returns that ID. Otherwise ensures the bare-name node exists (creating it +/// when absent) and returns the bare NID. Used for cross-file type references +/// in C# `field_declaration` processing. +pub(crate) fn ensure_named_node( + name: &str, + line: u32, + stem: &str, + str_path: &str, + nodes: &mut Vec, + seen_ids: &mut HashSet, +) -> String { + let nid1 = make_id(&[stem, name]); + if seen_ids.contains(&nid1) { + return nid1; + } + let nid2 = make_id1(name); + if !seen_ids.contains(&nid2) { + add_node(&nid2, name, line, str_path, nodes, seen_ids); + } + nid2 +} diff --git a/crates/graphify-extract/src/generic/inherit.rs b/crates/graphify-extract/src/generic/inherit.rs deleted file mode 100644 index cc50750..0000000 --- a/crates/graphify-extract/src/generic/inherit.rs +++ /dev/null @@ -1,1076 +0,0 @@ -//! Per-language inheritance-edge emitters. -//! -//! Each `emit_*_inheritance` function is called from the structural `walk` -//! pass when a class node is encountered for the corresponding language. -//! They inspect language-specific child nodes (e.g. `base_list`, `superclass`, -//! `base_class_clause`) and push `inherits` / `extends` / `implements` edges. - -// Tree-sitter row numbers are source line indices; files with 2^32+ lines do -// not exist in practice, so usize→u32 truncation is safe. 
-#![allow(clippy::cast_possible_truncation)] - -use std::collections::HashSet; - -use tree_sitter::Node; - -use crate::types::{Edge, Node as GNode}; - -use super::names::read_text_owned; -use super::walk::{add_edge, first_child_kind, named_children}; - -// ── Shared helper ───────────────────────────────────────────────────────────── - -/// Ensure a base-class node exists and return its NID. -pub(super) fn emit_base_node( - base: &str, - _line: u32, - stem: &str, - _str_path: &str, - nodes: &mut Vec, - seen_ids: &mut HashSet, -) -> String { - use crate::ids::{make_id, make_id1}; - - let nid1 = make_id(&[stem, base]); - if seen_ids.contains(&nid1) { - return nid1; - } - let nid2 = make_id1(base); - if !seen_ids.contains(&nid2) { - nodes.push(GNode { - id: nid2.clone(), - label: base.to_string(), - file_type: "code".to_string(), - source_file: String::new(), - source_location: None, - metadata: None, - }); - seen_ids.insert(nid2.clone()); - } - nid2 -} - -// ── Swift ───────────────────────────────────────────────────────────────────── - -/// Return the leading kind keyword for a Swift `class_declaration` -/// (`class` / `struct` / `enum` / `extension` / `actor`), if present. -#[must_use] -pub(super) fn swift_declaration_keyword(node: Node<'_>) -> Option<&'static str> { - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - let c = cur.node(); - if !c.is_named() { - match c.kind() { - "class" => return Some("class"), - "struct" => return Some("struct"), - "enum" => return Some("enum"), - "extension" => return Some("extension"), - "actor" => return Some("actor"), - _ => {} - } - } - if !cur.goto_next_sibling() { - break; - } - } - } - None -} - -/// Pre-scan a Swift compilation unit, returning `(protocol_names, class_like_names)`. -/// -/// Used to classify each `inheritance_specifier` entry as `inherits` (a class) -/// or `implements` (a protocol). Mirrors Python `_swift_pre_scan`. 
-#[must_use] -pub(super) fn swift_pre_scan(root: Node<'_>, source: &[u8]) -> (HashSet, HashSet) { - let mut protocols: HashSet = HashSet::new(); - let mut classes: HashSet = HashSet::new(); - let mut stack: Vec> = vec![root]; - while let Some(n) = stack.pop() { - if n.kind() == "protocol_declaration" { - let name_node = n - .child_by_field_name("name") - .or_else(|| first_child_kind(n, "type_identifier")); - if let Some(nn) = name_node { - let text = read_text_owned(nn, source); - if !text.is_empty() { - protocols.insert(text); - } - } - } else if n.kind() == "class_declaration" - && matches!( - swift_declaration_keyword(n), - Some("class" | "struct" | "enum" | "actor") - ) - && let Some(nn) = n.child_by_field_name("name") - { - let text = read_text_owned(nn, source); - if !text.is_empty() { - classes.insert(text); - } - } - let mut cur = n.walk(); - if cur.goto_first_child() { - loop { - stack.push(cur.node()); - if !cur.goto_next_sibling() { - break; - } - } - } - } - (protocols, classes) -} - -/// Classify a Swift inheritance entry as `inherits` or `implements`. -/// -/// Declared protocols → `implements`; declared classes → `inherits`. A -/// `struct`/`enum`/`extension`/`actor` can only conform to protocols, so all -/// of its entries are `implements`. For a `class`, the first entry is the base -/// class (`inherits`) and the rest are protocol conformances (`implements`). -/// Mirrors Python `_swift_classify_base`. 
-fn swift_classify_base( - name: &str, - kind: Option<&str>, - is_first: bool, - protocols: &HashSet, - classes: &HashSet, -) -> &'static str { - if protocols.contains(name) { - return "implements"; - } - if classes.contains(name) { - return "inherits"; - } - if matches!(kind, Some("struct" | "enum" | "extension" | "actor")) { - return "implements"; - } - if is_first { "inherits" } else { "implements" } -} - -/// Emit `inherits` / `implements` edges for a Swift class/protocol/extension's -/// `inheritance_specifier` children, plus `references[generic_arg]` edges for -/// any generic arguments on a base type. Mirrors Python `_extract_swift`. -#[allow(clippy::too_many_lines)] // linear walk over inheritance specifiers + their generic args -pub(super) fn emit_swift_inheritance( - ctx: &mut super::walk::WalkCtx<'_, '_>, - node: Node<'_>, - source: &[u8], - class_nid: &str, - line: u32, -) { - use super::references::{RefRole, swift_collect_type_refs, swift_user_type_name}; - - let stem = ctx.stem; - let str_path = ctx.str_path; - let protocols = ctx.swift_protocol_names; - let classes = ctx.swift_class_names; - let is_protocol = node.kind() == "protocol_declaration"; - let kind = if node.kind() == "class_declaration" { - swift_declaration_keyword(node) - } else { - Some("protocol") - }; - let nodes = &mut *ctx.nodes; - let edges = &mut *ctx.edges; - let seen_ids = &mut *ctx.seen_ids; - - let mut seen_base = false; - let mut cur = node.walk(); - if !cur.goto_first_child() { - return; - } - loop { - let child = cur.node(); - if child.kind() == "inheritance_specifier" { - // Resolve the base name (and the user_type carrying any generics). 
- let mut base_name: Option = None; - let mut user_type_node: Option> = None; - let mut scur = child.walk(); - if scur.goto_first_child() { - loop { - let sub = scur.node(); - if sub.kind() == "user_type" { - user_type_node = Some(sub); - base_name = swift_user_type_name(sub, source); - break; - } - if sub.kind() == "type_identifier" { - let t = read_text_owned(sub, source); - base_name = (!t.is_empty()).then_some(t); - break; - } - if !scur.goto_next_sibling() { - break; - } - } - } - if let Some(base_name) = base_name { - let base_nid = emit_base_node(&base_name, line, stem, str_path, nodes, seen_ids); - let relation = if is_protocol { - "inherits" - } else { - swift_classify_base(&base_name, kind, !seen_base, protocols, classes) - }; - seen_base = true; - add_edge(class_nid, &base_nid, relation, line, str_path, None, edges); - // Generic arguments on the base type → references[generic_arg]. - if let Some(ut) = user_type_node { - let mut tacur = ut.walk(); - if tacur.goto_first_child() { - loop { - if tacur.node().kind() == "type_arguments" { - let mut acur = tacur.node().walk(); - if acur.goto_first_child() { - loop { - if acur.node().is_named() { - let mut refs: Vec<(String, RefRole)> = Vec::new(); - swift_collect_type_refs( - acur.node(), - source, - true, - &mut refs, - ); - for (ref_name, _role) in refs { - let target = super::walk::ensure_named_node( - &ref_name, line, stem, str_path, nodes, - seen_ids, - ); - add_edge( - class_nid, - &target, - "references", - line, - str_path, - Some("generic_arg"), - edges, - ); - } - } - if !acur.goto_next_sibling() { - break; - } - } - } - } - if !tacur.goto_next_sibling() { - break; - } - } - } - } - } - } - if !cur.goto_next_sibling() { - break; - } - } -} - -// ── C# ──────────────────────────────────────────────────────────────────────── - -/// Walk the whole tree and return the set of identifiers declared as -/// `interface` in this C# compilation unit. 
-/// -/// Used by [`emit_csharp_inheritance`] to classify each entry in a -/// `base_list`: declared interfaces produce an `implements` edge, everything -/// else falls back to the I-prefix heuristic (`IFoo` with a capital second -/// letter) or is treated as a base class (`inherits`). -/// -/// Mirrors Python `_csharp_pre_scan_interfaces`. -#[must_use] -pub(super) fn csharp_pre_scan_interfaces(root: Node<'_>, source: &[u8]) -> HashSet { - let mut out = HashSet::new(); - let mut stack: Vec> = vec![root]; - while let Some(n) = stack.pop() { - if n.kind() == "interface_declaration" - && let Some(name_node) = n.child_by_field_name("name") - { - let text = read_text_owned(name_node, source); - if !text.is_empty() { - out.insert(text); - } - } - let mut cur = n.walk(); - if cur.goto_first_child() { - loop { - stack.push(cur.node()); - if !cur.goto_next_sibling() { - break; - } - } - } - } - out -} - -/// Classify a C# base-list entry as `implements` or `inherits`. -/// -/// An entry is `implements` when the name was declared as `interface` in this -/// compilation unit, or when it follows the C# `I…` interface -/// naming convention. Otherwise it is `inherits`. -fn csharp_classify_base(name: &str, interface_names: &HashSet) -> &'static str { - if interface_names.contains(name) { - return "implements"; - } - let mut chars = name.chars(); - if let (Some(first), Some(second)) = (chars.next(), chars.next()) - && first == 'I' - && second.is_uppercase() - { - return "implements"; - } - "inherits" -} - -/// Walk a C# type-argument tree and append `(name, role)` tuples where role is -/// `"generic_arg"` for arguments nested inside a `type_argument_list`. -/// -/// Mirrors Python `_csharp_collect_type_refs` restricted to the generic case. 
-fn csharp_collect_type_arg_refs(node: Node<'_>, source: &[u8], out: &mut Vec) { - let t = node.kind(); - if t == "predefined_type" { - return; - } - if t == "identifier" { - let name = read_text_owned(node, source); - if !name.is_empty() { - out.push(name); - } - return; - } - if t == "qualified_name" { - let text = read_text_owned(node, source); - let tail = text.rsplit('.').next().unwrap_or(&text).to_string(); - if !tail.is_empty() { - out.push(tail); - } - return; - } - if t == "generic_name" { - let name_node = node.child_by_field_name("name").or_else(|| { - let mut sc = node.walk(); - if sc.goto_first_child() { - loop { - if sc.node().kind() == "identifier" { - return Some(sc.node()); - } - if !sc.goto_next_sibling() { - break; - } - } - } - None - }); - if let Some(nn) = name_node { - let name = read_text_owned(nn, source); - if !name.is_empty() { - out.push(name); - } - } - let mut sc = node.walk(); - if sc.goto_first_child() { - loop { - if sc.node().kind() == "type_argument_list" { - let mut acur = sc.node().walk(); - if acur.goto_first_child() { - loop { - if acur.node().is_named() { - csharp_collect_type_arg_refs(acur.node(), source, out); - } - if !acur.goto_next_sibling() { - break; - } - } - } - } - if !sc.goto_next_sibling() { - break; - } - } - } - return; - } - if matches!( - t, - "nullable_type" | "array_type" | "pointer_type" | "ref_type" - ) { - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - if cur.node().is_named() { - csharp_collect_type_arg_refs(cur.node(), source, out); - } - if !cur.goto_next_sibling() { - break; - } - } - } - return; - } - if node.is_named() { - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - if cur.node().is_named() { - csharp_collect_type_arg_refs(cur.node(), source, out); - } - if !cur.goto_next_sibling() { - break; - } - } - } - } -} - -/// Emit `inherits` / `implements` edges from a C# `base_list` node. 
-/// -/// Each base-list entry is classified by [`csharp_classify_base`]; declared -/// interfaces (and `I…`-named types) produce `implements`, -/// everything else `inherits`. When the entry is a `generic_name`, its type -/// arguments also produce `references` edges with `context = generic_arg` so -/// downstream queries can tell `class Foo : IBar` introduces a usage of -/// `Baz`. -pub(super) fn emit_csharp_inheritance( - ctx: &mut super::walk::WalkCtx<'_, '_>, - node: Node<'_>, - source: &[u8], - class_nid: &str, - line: u32, -) { - let stem = ctx.stem; - let str_path = ctx.str_path; - let nodes = &mut *ctx.nodes; - let edges = &mut *ctx.edges; - let seen_ids = &mut *ctx.seen_ids; - let interface_names = ctx.csharp_interface_names; - let mut cur = node.walk(); - if !cur.goto_first_child() { - return; - } - loop { - let child = cur.node(); - if child.kind() == "base_list" { - let mut scur = child.walk(); - if scur.goto_first_child() { - loop { - let sub = scur.node(); - let base = match sub.kind() { - "identifier" => Some(read_text_owned(sub, source)), - "qualified_name" => { - let full = read_text_owned(sub, source); - Some(full.rsplit('.').next().unwrap_or(&full).to_string()) - } - "generic_name" => { - if let Some(nc) = sub.child_by_field_name("name") { - Some(read_text_owned(nc, source)) - } else { - { - let mut tc = sub.walk(); - if tc.goto_first_child() { - Some(tc.node()) - } else { - None - } - } - .map(|first| read_text_owned(first, source)) - } - } - _ => None, - }; - if let Some(b) = base - && !b.is_empty() - { - let base_nid = emit_base_node(&b, line, stem, str_path, nodes, seen_ids); - let relation = csharp_classify_base(&b, interface_names); - add_edge(class_nid, &base_nid, relation, line, str_path, None, edges); - if sub.kind() == "generic_name" { - let mut tc = sub.walk(); - if tc.goto_first_child() { - loop { - if tc.node().kind() == "type_argument_list" { - let mut acur = tc.node().walk(); - if acur.goto_first_child() { - loop { - if 
acur.node().is_named() { - let mut refs: Vec = Vec::new(); - csharp_collect_type_arg_refs( - acur.node(), - source, - &mut refs, - ); - for ref_name in refs { - let target = emit_base_node( - &ref_name, line, stem, str_path, nodes, - seen_ids, - ); - add_edge( - class_nid, - &target, - "references", - line, - str_path, - Some("generic_arg"), - edges, - ); - } - } - if !acur.goto_next_sibling() { - break; - } - } - } - } - if !tc.goto_next_sibling() { - break; - } - } - } - } - } - if !scur.goto_next_sibling() { - break; - } - } - } - } - if !cur.goto_next_sibling() { - break; - } - } -} - -// ── Java ────────────────────────────────────────────────────────────────────── - -/// Emit `inherits` and `implements` edges for a Java class or interface node. -/// -/// Java's source-level `extends` keyword (class extending a superclass or -/// interface extending other interfaces) is normalised to the `inherits` -/// relation so cross-language consumers see the same shape as C#, Swift, and -/// C++. `implements` (class implementing an interface) is kept as-is. All -/// three cases are handled here to match Python `_extract_java`. 
-#[allow(clippy::too_many_lines)] // sequential dispatch over Java's three inheritance shapes -pub(super) fn emit_java_inheritance( - ctx: &mut super::walk::WalkCtx<'_, '_>, - node: Node<'_>, - source: &[u8], - class_nid: &str, - node_type: &str, - line: u32, -) { - let stem = ctx.stem; - let str_path = ctx.str_path; - let nodes = &mut *ctx.nodes; - let edges = &mut *ctx.edges; - let seen_ids = &mut *ctx.seen_ids; - let emit = |base_name: &str, - rel: &str, - nodes: &mut Vec, - edges: &mut Vec, - seen_ids: &mut HashSet| { - if base_name.is_empty() { - return; - } - let base_nid = emit_base_node(base_name, line, stem, str_path, nodes, seen_ids); - add_edge(class_nid, &base_nid, rel, line, str_path, None, edges); - }; - - if let Some(sup) = node.child_by_field_name("superclass") { - let mut cur = sup.walk(); - if cur.goto_first_child() { - loop { - let sub = cur.node(); - if sub.kind() == "type_identifier" { - emit( - &read_text_owned(sub, source), - "inherits", - nodes, - edges, - seen_ids, - ); - break; - } - if !cur.goto_next_sibling() { - break; - } - } - } - } - - if let Some(ifs) = node.child_by_field_name("interfaces") { - let mut cur = ifs.walk(); - if cur.goto_first_child() { - loop { - let sub = cur.node(); - if sub.kind() == "type_list" { - let mut tcur = sub.walk(); - if tcur.goto_first_child() { - loop { - let tid = tcur.node(); - if tid.kind() == "type_identifier" { - emit( - &read_text_owned(tid, source), - "implements", - nodes, - edges, - seen_ids, - ); - } - if !tcur.goto_next_sibling() { - break; - } - } - } - } - if !cur.goto_next_sibling() { - break; - } - } - } - } - - if node_type == "interface_declaration" { - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - let child = cur.node(); - if child.kind() == "extends_interfaces" { - let mut scur = child.walk(); - if scur.goto_first_child() { - loop { - let sub = scur.node(); - if sub.kind() == "type_list" { - let mut tcur = sub.walk(); - if tcur.goto_first_child() { - loop { - let 
tid = tcur.node(); - if tid.kind() == "type_identifier" { - emit( - &read_text_owned(tid, source), - "inherits", - nodes, - edges, - seen_ids, - ); - } - if !tcur.goto_next_sibling() { - break; - } - } - } - } - if !scur.goto_next_sibling() { - break; - } - } - } - } - if !cur.goto_next_sibling() { - break; - } - } - } - } -} - -// ── TypeScript / JavaScript ────────────────────────────────────────────────── - -/// Emit `inherits` / `implements` edges for a TS class declaration's -/// `class_heritage` child. -/// -/// TS distinguishes `extends_clause` (single class) from `implements_clause` -/// (one or more interfaces). `extends` is normalised to `inherits` so all -/// languages share a single relation name for class extension. The `name` -/// field's type-arguments are NOT walked here — that happens in the field / -/// method passes via `ts_collect_type_refs`. -pub(super) fn emit_ts_inheritance( - ctx: &mut super::walk::WalkCtx<'_, '_>, - node: Node<'_>, - source: &[u8], - class_nid: &str, - line: u32, -) { - let stem = ctx.stem; - let str_path = ctx.str_path; - let nodes = &mut *ctx.nodes; - let edges = &mut *ctx.edges; - let seen_ids = &mut *ctx.seen_ids; - let mut cur = node.walk(); - if !cur.goto_first_child() { - return; - } - loop { - let child = cur.node(); - if child.kind() == "class_heritage" { - let mut hcur = child.walk(); - if hcur.goto_first_child() { - loop { - let clause = hcur.node(); - let relation = match clause.kind() { - "extends_clause" => Some("inherits"), - "implements_clause" => Some("implements"), - _ => None, - }; - if let Some(rel) = relation { - for name in super::references::ts_heritage_clause_entries(clause, source) { - let base_nid = - emit_base_node(&name, line, stem, str_path, nodes, seen_ids); - add_edge(class_nid, &base_nid, rel, line, str_path, None, edges); - } - } - if !hcur.goto_next_sibling() { - break; - } - } - } - } else if child.kind() == "extends_type_clause" { - // Interface heritage (`interface A extends B, C`) is an 
- // extends_type_clause node directly under the declaration, NOT - // wrapped in class_heritage. Its base entries are the same node types - // extends_clause holds, so the entry helper is reusable. Without this - // branch interface inheritance is dropped entirely (#1095). - for name in super::references::ts_heritage_clause_entries(child, source) { - let base_nid = emit_base_node(&name, line, stem, str_path, nodes, seen_ids); - add_edge( - class_nid, &base_nid, "inherits", line, str_path, None, edges, - ); - } - } - if !cur.goto_next_sibling() { - break; - } - } -} - -// ── C++ ─────────────────────────────────────────────────────────────────────── - -/// Emit `inherits` edges from a C++ `base_class_clause` node. -/// -/// C++ allows multiple inheritance; all entries in the clause produce -/// `inherits` edges regardless of access specifier (`public`, `protected`, -/// `private`), matching Python `_extract_cpp`. -pub(super) fn emit_cpp_inheritance( - ctx: &mut super::walk::WalkCtx<'_, '_>, - node: Node<'_>, - source: &[u8], - class_nid: &str, - line: u32, -) { - let stem = ctx.stem; - let str_path = ctx.str_path; - let nodes = &mut *ctx.nodes; - let edges = &mut *ctx.edges; - let seen_ids = &mut *ctx.seen_ids; - let mut cur = node.walk(); - if !cur.goto_first_child() { - return; - } - loop { - let child = cur.node(); - if child.kind() == "base_class_clause" { - let mut scur = child.walk(); - if scur.goto_first_child() { - loop { - let sub = scur.node(); - let base = match sub.kind() { - "type_identifier" => Some(read_text_owned(sub, source)), - "qualified_identifier" | "template_type" => { - if let Some(tail) = sub.child_by_field_name("name") { - Some(read_text_owned(tail, source)) - } else { - Some(read_text_owned(sub, source)) - } - } - _ => None, - }; - if let Some(b) = base - && !b.is_empty() - { - let base_nid = emit_base_node(&b, line, stem, str_path, nodes, seen_ids); - add_edge( - class_nid, &base_nid, "inherits", line, str_path, None, edges, - ); - } - if 
!scur.goto_next_sibling() { - break; - } - } - } - } - if !cur.goto_next_sibling() { - break; - } - } -} - -// ── PHP ─────────────────────────────────────────────────────────────────────── - -/// Emit `inherits` (`extends`) / `implements` (`implements`) / `mixes_in` -/// (trait `use`) edges for a PHP class. Mirrors Python `_extract_php`. -pub(super) fn emit_php_inheritance( - ctx: &mut super::walk::WalkCtx<'_, '_>, - node: Node<'_>, - source: &[u8], - class_nid: &str, - _line: u32, -) { - let stem = ctx.stem; - let str_path = ctx.str_path; - let nodes = &mut *ctx.nodes; - let edges = &mut *ctx.edges; - let seen_ids = &mut *ctx.seen_ids; - - let emit = |base_name: Option, - rel: &str, - at_line: u32, - nodes: &mut Vec, - edges: &mut Vec, - seen_ids: &mut HashSet| { - let Some(base_name) = base_name else { return }; - if base_name.is_empty() { - return; - } - let base_nid = emit_base_node(&base_name, at_line, stem, str_path, nodes, seen_ids); - add_edge(class_nid, &base_nid, rel, at_line, str_path, None, edges); - }; - - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - let child = cur.node(); - let child_line = child.start_position().row as u32 + 1; - match child.kind() { - "base_clause" => { - for sub in named_children(child) { - if matches!(sub.kind(), "name" | "qualified_name") { - emit( - super::references::php_name_text(sub, source), - "inherits", - child_line, - nodes, - edges, - seen_ids, - ); - } - } - } - "class_interface_clause" => { - for sub in named_children(child) { - if matches!(sub.kind(), "name" | "qualified_name") { - emit( - super::references::php_name_text(sub, source), - "implements", - child_line, - nodes, - edges, - seen_ids, - ); - } - } - } - _ => {} - } - if !cur.goto_next_sibling() { - break; - } - } - } - - // Trait `use` declarations inside the class body → `mixes_in`. 
- let body = node - .child_by_field_name("body") - .or_else(|| first_child_kind(node, "declaration_list")); - if let Some(body) = body { - for member in named_children(body) { - if member.kind() != "use_declaration" { - continue; - } - let member_line = member.start_position().row as u32 + 1; - for sub in named_children(member) { - if matches!(sub.kind(), "name" | "qualified_name") { - emit( - super::references::php_name_text(sub, source), - "mixes_in", - member_line, - nodes, - edges, - seen_ids, - ); - } - } - } - } -} - -// ── Kotlin ──────────────────────────────────────────────────────────────────── - -/// Emit `inherits` (`: Base()`) / `implements` (`: Interface`) edges for a -/// Kotlin class's `delegation_specifiers`, plus `references[generic_arg]` for -/// type arguments on the base. Mirrors Python `_extract_kotlin`. -pub(super) fn emit_kotlin_inheritance( - ctx: &mut super::walk::WalkCtx<'_, '_>, - node: Node<'_>, - source: &[u8], - class_nid: &str, - line: u32, -) { - use super::references::{RefRole, kotlin_collect_type_refs, kotlin_user_type_name}; - let stem = ctx.stem; - let str_path = ctx.str_path; - let nodes = &mut *ctx.nodes; - let edges = &mut *ctx.edges; - let seen_ids = &mut *ctx.seen_ids; - - for child in named_children(node) { - if child.kind() != "delegation_specifiers" { - continue; - } - for spec in named_children(child) { - if spec.kind() != "delegation_specifier" { - continue; - } - let mut relation = "implements"; - let mut user_type_node: Option> = None; - for sub in named_children(spec) { - if sub.kind() == "constructor_invocation" { - relation = "inherits"; - user_type_node = first_child_kind(sub, "user_type"); - break; - } - if sub.kind() == "user_type" { - user_type_node = Some(sub); - break; - } - } - let Some(ut) = user_type_node else { continue }; - // Skip empty base names (consistent with the PHP emitter) so a - // malformed `user_type` never spawns an empty-label node. 
- let Some(base) = kotlin_user_type_name(ut, source).filter(|b| !b.is_empty()) else { - continue; - }; - let base_nid = emit_base_node(&base, line, stem, str_path, nodes, seen_ids); - add_edge(class_nid, &base_nid, relation, line, str_path, None, edges); - for arg_child in named_children(ut) { - if arg_child.kind() != "type_arguments" { - continue; - } - for arg in named_children(arg_child) { - let mut refs: Vec<(String, RefRole)> = Vec::new(); - if arg.kind() == "type_projection" { - for inner in named_children(arg) { - kotlin_collect_type_refs(inner, source, true, &mut refs); - } - } else { - kotlin_collect_type_refs(arg, source, true, &mut refs); - } - for (ref_name, _role) in refs { - let target = super::walk::ensure_named_node( - &ref_name, line, stem, str_path, nodes, seen_ids, - ); - add_edge( - class_nid, - &target, - "references", - line, - str_path, - Some("generic_arg"), - edges, - ); - } - } - } - } - } -} - -// ── Scala ───────────────────────────────────────────────────────────────────── - -/// Emit `inherits` (first base after `extends`) / `mixes_in` (each `with` -/// trait) edges plus `references[field]` edges for constructor parameters. -/// Mirrors Python `_extract_scala`. -pub(super) fn emit_scala_inheritance( - ctx: &mut super::walk::WalkCtx<'_, '_>, - node: Node<'_>, - source: &[u8], - class_nid: &str, - _line: u32, -) { - use super::references::{RefRole, scala_collect_type_refs}; - let stem = ctx.stem; - let str_path = ctx.str_path; - let nodes = &mut *ctx.nodes; - let edges = &mut *ctx.edges; - let seen_ids = &mut *ctx.seen_ids; - - let extend = node - .child_by_field_name("extend") - .or_else(|| first_child_kind(node, "extends_clause")); - if let Some(extend) = extend { - let mut bases: Vec<(String, u32)> = Vec::new(); - for c in named_children(extend) { - let c_line = c.start_position().row as u32 + 1; - // Skip empty base names (consistent with the PHP emitter) so a - // malformed node never spawns an empty-label node. 
- if c.kind() == "type_identifier" { - let name = read_text_owned(c, source); - if !name.is_empty() { - bases.push((name, c_line)); - } - } else if c.kind() == "generic_type" { - let base = c - .child_by_field_name("type") - .or_else(|| first_child_kind(c, "type_identifier")); - if let Some(base) = base { - let name = read_text_owned(base, source); - if !name.is_empty() { - bases.push((name, c_line)); - } - } - } - } - for (idx, (base_name, base_line)) in bases.into_iter().enumerate() { - let rel = if idx == 0 { "inherits" } else { "mixes_in" }; - let base_nid = super::walk::ensure_named_node( - &base_name, base_line, stem, str_path, nodes, seen_ids, - ); - if base_nid != class_nid { - add_edge(class_nid, &base_nid, rel, base_line, str_path, None, edges); - } - } - } - - for c in named_children(node) { - if c.kind() != "class_parameters" { - continue; - } - for cp in named_children(c) { - if cp.kind() != "class_parameter" { - continue; - } - let Some(ptype) = cp.child_by_field_name("type") else { - continue; - }; - let cp_line = cp.start_position().row as u32 + 1; - let mut refs: Vec<(String, RefRole)> = Vec::new(); - scala_collect_type_refs(ptype, source, false, &mut refs); - for (ref_name, role) in refs { - let context = role.into_context("field"); - let target = super::walk::ensure_named_node( - &ref_name, cp_line, stem, str_path, nodes, seen_ids, - ); - if target != class_nid { - add_edge( - class_nid, - &target, - "references", - cp_line, - str_path, - Some(context), - edges, - ); - } - } - } - } -} diff --git a/crates/graphify-extract/src/generic/inherit/cpp.rs b/crates/graphify-extract/src/generic/inherit/cpp.rs new file mode 100644 index 0000000..df6970f --- /dev/null +++ b/crates/graphify-extract/src/generic/inherit/cpp.rs @@ -0,0 +1,65 @@ +//! C++ inheritance-edge emitter. 
+ +use super::emit_base_node; +use crate::generic::names::read_text_owned; +use crate::generic::walk::add_edge; +use tree_sitter::Node; + +/// Emit `inherits` edges from a C++ `base_class_clause` node. +/// +/// C++ allows multiple inheritance; all entries in the clause produce +/// `inherits` edges regardless of access specifier (`public`, `protected`, +/// `private`), matching Python `_extract_cpp`. +pub(crate) fn emit_cpp_inheritance( + ctx: &mut crate::generic::walk::WalkCtx<'_, '_>, + node: Node<'_>, + source: &[u8], + class_nid: &str, + line: u32, +) { + let stem = ctx.stem; + let str_path = ctx.str_path; + let nodes = &mut *ctx.nodes; + let edges = &mut *ctx.edges; + let seen_ids = &mut *ctx.seen_ids; + let mut cur = node.walk(); + if !cur.goto_first_child() { + return; + } + loop { + let child = cur.node(); + if child.kind() == "base_class_clause" { + let mut scur = child.walk(); + if scur.goto_first_child() { + loop { + let sub = scur.node(); + let base = match sub.kind() { + "type_identifier" => Some(read_text_owned(sub, source)), + "qualified_identifier" | "template_type" => { + if let Some(tail) = sub.child_by_field_name("name") { + Some(read_text_owned(tail, source)) + } else { + Some(read_text_owned(sub, source)) + } + } + _ => None, + }; + if let Some(b) = base + && !b.is_empty() + { + let base_nid = emit_base_node(&b, line, stem, str_path, nodes, seen_ids); + add_edge( + class_nid, &base_nid, "inherits", line, str_path, None, edges, + ); + } + if !scur.goto_next_sibling() { + break; + } + } + } + } + if !cur.goto_next_sibling() { + break; + } + } +} diff --git a/crates/graphify-extract/src/generic/inherit/csharp.rs b/crates/graphify-extract/src/generic/inherit/csharp.rs new file mode 100644 index 0000000..99d90b6 --- /dev/null +++ b/crates/graphify-extract/src/generic/inherit/csharp.rs @@ -0,0 +1,278 @@ +//! C# inheritance-edge emitter. 
+ +use super::emit_base_node; +use crate::generic::names::read_text_owned; +use crate::generic::walk::add_edge; +use std::collections::HashSet; +use tree_sitter::Node; + +/// Walk the whole tree and return the set of identifiers declared as +/// `interface` in this C# compilation unit. +/// +/// Used by [`emit_csharp_inheritance`] to classify each entry in a +/// `base_list`: declared interfaces produce an `implements` edge, everything +/// else falls back to the I-prefix heuristic (`IFoo` with a capital second +/// letter) or is treated as a base class (`inherits`). +/// +/// Mirrors Python `_csharp_pre_scan_interfaces`. +#[must_use] +pub(crate) fn csharp_pre_scan_interfaces(root: Node<'_>, source: &[u8]) -> HashSet { + let mut out = HashSet::new(); + let mut stack: Vec> = vec![root]; + while let Some(n) = stack.pop() { + if n.kind() == "interface_declaration" + && let Some(name_node) = n.child_by_field_name("name") + { + let text = read_text_owned(name_node, source); + if !text.is_empty() { + out.insert(text); + } + } + let mut cur = n.walk(); + if cur.goto_first_child() { + loop { + stack.push(cur.node()); + if !cur.goto_next_sibling() { + break; + } + } + } + } + out +} + +/// Classify a C# base-list entry as `implements` or `inherits`. +/// +/// An entry is `implements` when the name was declared as `interface` in this +/// compilation unit, or when it follows the C# `I…` interface +/// naming convention. Otherwise it is `inherits`. +fn csharp_classify_base(name: &str, interface_names: &HashSet) -> &'static str { + if interface_names.contains(name) { + return "implements"; + } + let mut chars = name.chars(); + if let (Some(first), Some(second)) = (chars.next(), chars.next()) + && first == 'I' + && second.is_uppercase() + { + return "implements"; + } + "inherits" +} + +/// Walk a C# type-argument tree and append `(name, role)` tuples where role is +/// `"generic_arg"` for arguments nested inside a `type_argument_list`. 
+/// +/// Mirrors Python `_csharp_collect_type_refs` restricted to the generic case. +fn csharp_collect_type_arg_refs(node: Node<'_>, source: &[u8], out: &mut Vec) { + let t = node.kind(); + if t == "predefined_type" { + return; + } + if t == "identifier" { + let name = read_text_owned(node, source); + if !name.is_empty() { + out.push(name); + } + return; + } + if t == "qualified_name" { + let text = read_text_owned(node, source); + let tail = text.rsplit('.').next().unwrap_or(&text).to_string(); + if !tail.is_empty() { + out.push(tail); + } + return; + } + if t == "generic_name" { + let name_node = node.child_by_field_name("name").or_else(|| { + let mut sc = node.walk(); + if sc.goto_first_child() { + loop { + if sc.node().kind() == "identifier" { + return Some(sc.node()); + } + if !sc.goto_next_sibling() { + break; + } + } + } + None + }); + if let Some(nn) = name_node { + let name = read_text_owned(nn, source); + if !name.is_empty() { + out.push(name); + } + } + let mut sc = node.walk(); + if sc.goto_first_child() { + loop { + if sc.node().kind() == "type_argument_list" { + let mut acur = sc.node().walk(); + if acur.goto_first_child() { + loop { + if acur.node().is_named() { + csharp_collect_type_arg_refs(acur.node(), source, out); + } + if !acur.goto_next_sibling() { + break; + } + } + } + } + if !sc.goto_next_sibling() { + break; + } + } + } + return; + } + if matches!( + t, + "nullable_type" | "array_type" | "pointer_type" | "ref_type" + ) { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + if cur.node().is_named() { + csharp_collect_type_arg_refs(cur.node(), source, out); + } + if !cur.goto_next_sibling() { + break; + } + } + } + return; + } + if node.is_named() { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + if cur.node().is_named() { + csharp_collect_type_arg_refs(cur.node(), source, out); + } + if !cur.goto_next_sibling() { + break; + } + } + } + } +} + +/// Emit `inherits` / `implements` edges from a C# 
`base_list` node. +/// +/// Each base-list entry is classified by [`csharp_classify_base`]; declared +/// interfaces (and `I…`-named types) produce `implements`, +/// everything else `inherits`. When the entry is a `generic_name`, its type +/// arguments also produce `references` edges with `context = generic_arg` so +/// downstream queries can tell `class Foo : IBar` introduces a usage of +/// `Baz`. +pub(crate) fn emit_csharp_inheritance( + ctx: &mut crate::generic::walk::WalkCtx<'_, '_>, + node: Node<'_>, + source: &[u8], + class_nid: &str, + line: u32, +) { + let stem = ctx.stem; + let str_path = ctx.str_path; + let nodes = &mut *ctx.nodes; + let edges = &mut *ctx.edges; + let seen_ids = &mut *ctx.seen_ids; + let interface_names = ctx.csharp_interface_names; + let mut cur = node.walk(); + if !cur.goto_first_child() { + return; + } + loop { + let child = cur.node(); + if child.kind() == "base_list" { + let mut scur = child.walk(); + if scur.goto_first_child() { + loop { + let sub = scur.node(); + let base = match sub.kind() { + "identifier" => Some(read_text_owned(sub, source)), + "qualified_name" => { + let full = read_text_owned(sub, source); + Some(full.rsplit('.').next().unwrap_or(&full).to_string()) + } + "generic_name" => { + if let Some(nc) = sub.child_by_field_name("name") { + Some(read_text_owned(nc, source)) + } else { + { + let mut tc = sub.walk(); + if tc.goto_first_child() { + Some(tc.node()) + } else { + None + } + } + .map(|first| read_text_owned(first, source)) + } + } + _ => None, + }; + if let Some(b) = base + && !b.is_empty() + { + let base_nid = emit_base_node(&b, line, stem, str_path, nodes, seen_ids); + let relation = csharp_classify_base(&b, interface_names); + add_edge(class_nid, &base_nid, relation, line, str_path, None, edges); + if sub.kind() == "generic_name" { + let mut tc = sub.walk(); + if tc.goto_first_child() { + loop { + if tc.node().kind() == "type_argument_list" { + let mut acur = tc.node().walk(); + if acur.goto_first_child() 
{ + loop { + if acur.node().is_named() { + let mut refs: Vec = Vec::new(); + csharp_collect_type_arg_refs( + acur.node(), + source, + &mut refs, + ); + for ref_name in refs { + let target = emit_base_node( + &ref_name, line, stem, str_path, nodes, + seen_ids, + ); + add_edge( + class_nid, + &target, + "references", + line, + str_path, + Some("generic_arg"), + edges, + ); + } + } + if !acur.goto_next_sibling() { + break; + } + } + } + } + if !tc.goto_next_sibling() { + break; + } + } + } + } + } + if !scur.goto_next_sibling() { + break; + } + } + } + } + if !cur.goto_next_sibling() { + break; + } + } +} diff --git a/crates/graphify-extract/src/generic/inherit/java.rs b/crates/graphify-extract/src/generic/inherit/java.rs new file mode 100644 index 0000000..cbd4091 --- /dev/null +++ b/crates/graphify-extract/src/generic/inherit/java.rs @@ -0,0 +1,158 @@ +//! Java inheritance-edge emitter. + +use super::emit_base_node; +use crate::generic::names::read_text_owned; +use crate::generic::walk::add_edge; +use crate::types::{Edge, Node as GNode}; +use std::collections::HashSet; +use tree_sitter::Node; + +/// Emit `inherits` and `implements` edges for a Java class or interface node. +/// +/// Java's source-level `extends` keyword (class extending a superclass or +/// interface extending other interfaces) is normalised to the `inherits` +/// relation so cross-language consumers see the same shape as C#, Swift, and +/// C++. `implements` (class implementing an interface) is kept as-is. All +/// three cases are handled here to match Python `_extract_java`. 
+#[allow(clippy::too_many_lines)] // sequential dispatch over Java's three inheritance shapes +pub(crate) fn emit_java_inheritance( + ctx: &mut crate::generic::walk::WalkCtx<'_, '_>, + node: Node<'_>, + source: &[u8], + class_nid: &str, + node_type: &str, + line: u32, +) { + let stem = ctx.stem; + let str_path = ctx.str_path; + let nodes = &mut *ctx.nodes; + let edges = &mut *ctx.edges; + let seen_ids = &mut *ctx.seen_ids; + let emit = |base_name: &str, + rel: &str, + nodes: &mut Vec, + edges: &mut Vec, + seen_ids: &mut HashSet| { + if base_name.is_empty() { + return; + } + let base_nid = emit_base_node(base_name, line, stem, str_path, nodes, seen_ids); + add_edge(class_nid, &base_nid, rel, line, str_path, None, edges); + }; + + if let Some(sup) = node.child_by_field_name("superclass") { + let mut cur = sup.walk(); + if cur.goto_first_child() { + loop { + let sub = cur.node(); + if let Some(name) = java_base_name(sub, source) { + emit(&name, "inherits", nodes, edges, seen_ids); + break; + } + if !cur.goto_next_sibling() { + break; + } + } + } + } + + if let Some(ifs) = node.child_by_field_name("interfaces") { + let mut cur = ifs.walk(); + if cur.goto_first_child() { + loop { + let sub = cur.node(); + if sub.kind() == "type_list" { + let mut tcur = sub.walk(); + if tcur.goto_first_child() { + loop { + let tid = tcur.node(); + if let Some(name) = java_base_name(tid, source) { + emit(&name, "implements", nodes, edges, seen_ids); + } + if !tcur.goto_next_sibling() { + break; + } + } + } + } + if !cur.goto_next_sibling() { + break; + } + } + } + } + + if node_type == "interface_declaration" { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + let child = cur.node(); + if child.kind() == "extends_interfaces" { + let mut scur = child.walk(); + if scur.goto_first_child() { + loop { + let sub = scur.node(); + if sub.kind() == "type_list" { + let mut tcur = sub.walk(); + if tcur.goto_first_child() { + loop { + let tid = tcur.node(); + if let Some(name) = 
java_base_name(tid, source) { + emit(&name, "inherits", nodes, edges, seen_ids); + } + if !tcur.goto_next_sibling() { + break; + } + } + } + } + if !scur.goto_next_sibling() { + break; + } + } + } + } + if !cur.goto_next_sibling() { + break; + } + } + } + } +} + +/// Extract the base type name from a Java inheritance entry: a plain +/// `type_identifier`, a qualified `scoped_type_identifier` (tail after the +/// final `.`), or a `generic_type` (its base, qualified-tail when scoped). +/// Returns `None` for non-type nodes such as the `extends` keyword. +/// +/// Divergence from graphify-py `_extract_java` (extract.py:2777-2799), which +/// matches only `type_identifier` and silently drops qualified/generic bases. +fn java_base_name(node: Node<'_>, source: &[u8]) -> Option { + match node.kind() { + "type_identifier" => { + let name = read_text_owned(node, source); + (!name.is_empty()).then_some(name) + } + "scoped_type_identifier" => { + let text = read_text_owned(node, source); + let tail = text.rsplit('.').next().unwrap_or(&text); + (!tail.is_empty()).then(|| tail.to_string()) + } + "generic_type" => { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + let child = cur.node(); + if matches!(child.kind(), "type_identifier" | "scoped_type_identifier") { + return java_base_name(child, source); + } + if !cur.goto_next_sibling() { + break; + } + } + } + None + } + _ => None, + } +} diff --git a/crates/graphify-extract/src/generic/inherit/kotlin.rs b/crates/graphify-extract/src/generic/inherit/kotlin.rs new file mode 100644 index 0000000..225dcfe --- /dev/null +++ b/crates/graphify-extract/src/generic/inherit/kotlin.rs @@ -0,0 +1,84 @@ +//! Kotlin inheritance-edge emitter. 
+ +use super::emit_base_node; +use crate::generic::walk::{add_edge, first_child_kind, named_children}; +use tree_sitter::Node; + +/// Emit `inherits` (`: Base()`) / `implements` (`: Interface`) edges for a +/// Kotlin class's `delegation_specifiers`, plus `references[generic_arg]` for +/// type arguments on the base. Mirrors Python `_extract_kotlin`. +pub(crate) fn emit_kotlin_inheritance( + ctx: &mut crate::generic::walk::WalkCtx<'_, '_>, + node: Node<'_>, + source: &[u8], + class_nid: &str, + line: u32, +) { + use crate::generic::references::{RefRole, kotlin_collect_type_refs, kotlin_user_type_name}; + let stem = ctx.stem; + let str_path = ctx.str_path; + let nodes = &mut *ctx.nodes; + let edges = &mut *ctx.edges; + let seen_ids = &mut *ctx.seen_ids; + + for child in named_children(node) { + if child.kind() != "delegation_specifiers" { + continue; + } + for spec in named_children(child) { + if spec.kind() != "delegation_specifier" { + continue; + } + let mut relation = "implements"; + let mut user_type_node: Option> = None; + for sub in named_children(spec) { + if sub.kind() == "constructor_invocation" { + relation = "inherits"; + user_type_node = first_child_kind(sub, "user_type"); + break; + } + if sub.kind() == "user_type" { + user_type_node = Some(sub); + break; + } + } + let Some(ut) = user_type_node else { continue }; + // Skip empty base names (consistent with the PHP emitter) so a + // malformed `user_type` never spawns an empty-label node. 
+ let Some(base) = kotlin_user_type_name(ut, source).filter(|b| !b.is_empty()) else { + continue; + }; + let base_nid = emit_base_node(&base, line, stem, str_path, nodes, seen_ids); + add_edge(class_nid, &base_nid, relation, line, str_path, None, edges); + for arg_child in named_children(ut) { + if arg_child.kind() != "type_arguments" { + continue; + } + for arg in named_children(arg_child) { + let mut refs: Vec<(String, RefRole)> = Vec::new(); + if arg.kind() == "type_projection" { + for inner in named_children(arg) { + kotlin_collect_type_refs(inner, source, true, &mut refs); + } + } else { + kotlin_collect_type_refs(arg, source, true, &mut refs); + } + for (ref_name, _role) in refs { + let target = crate::generic::walk::ensure_named_node( + &ref_name, line, stem, str_path, nodes, seen_ids, + ); + add_edge( + class_nid, + &target, + "references", + line, + str_path, + Some("generic_arg"), + edges, + ); + } + } + } + } + } +} diff --git a/crates/graphify-extract/src/generic/inherit/mod.rs b/crates/graphify-extract/src/generic/inherit/mod.rs new file mode 100644 index 0000000..6a7e51e --- /dev/null +++ b/crates/graphify-extract/src/generic/inherit/mod.rs @@ -0,0 +1,60 @@ +//! Per-language inheritance-edge emitters. +//! +//! Each `emit_*_inheritance` function is called from the structural `walk` +//! pass when a class node is encountered for the corresponding language. +//! They inspect language-specific child nodes (e.g. `base_list`, `superclass`, +//! `base_class_clause`) and push `inherits` / `extends` / `implements` edges. +//! +//! One submodule per language; this file holds the shared `emit_base_node`. 
+ +use std::collections::HashSet; + +use crate::types::Node as GNode; + +mod cpp; +mod csharp; +mod java; +mod kotlin; +mod php; +mod scala; +mod swift; +mod ts; + +pub(crate) use cpp::*; +pub(crate) use csharp::*; +pub(crate) use java::*; +pub(crate) use kotlin::*; +pub(crate) use php::*; +pub(crate) use scala::*; +pub(crate) use swift::*; +pub(crate) use ts::*; + +/// Ensure a base-class node exists and return its NID. +pub(crate) fn emit_base_node( + base: &str, + _line: u32, + stem: &str, + _str_path: &str, + nodes: &mut Vec, + seen_ids: &mut HashSet, +) -> String { + use crate::ids::{make_id, make_id1}; + + let nid1 = make_id(&[stem, base]); + if seen_ids.contains(&nid1) { + return nid1; + } + let nid2 = make_id1(base); + if !seen_ids.contains(&nid2) { + nodes.push(GNode { + id: nid2.clone(), + label: base.to_string(), + file_type: "code".to_string(), + source_file: String::new(), + source_location: None, + metadata: None, + }); + seen_ids.insert(nid2.clone()); + } + nid2 +} diff --git a/crates/graphify-extract/src/generic/inherit/php.rs b/crates/graphify-extract/src/generic/inherit/php.rs new file mode 100644 index 0000000..4b5b49a --- /dev/null +++ b/crates/graphify-extract/src/generic/inherit/php.rs @@ -0,0 +1,106 @@ +//! PHP inheritance-edge emitter. + +#![allow(clippy::cast_possible_truncation)] + +use super::emit_base_node; +use crate::generic::walk::{add_edge, first_child_kind, named_children}; +use crate::types::{Edge, Node as GNode}; +use std::collections::HashSet; +use tree_sitter::Node; + +/// Emit `inherits` (`extends`) / `implements` (`implements`) / `mixes_in` +/// (trait `use`) edges for a PHP class. Mirrors Python `_extract_php`. 
+pub(crate) fn emit_php_inheritance( + ctx: &mut crate::generic::walk::WalkCtx<'_, '_>, + node: Node<'_>, + source: &[u8], + class_nid: &str, + _line: u32, +) { + let stem = ctx.stem; + let str_path = ctx.str_path; + let nodes = &mut *ctx.nodes; + let edges = &mut *ctx.edges; + let seen_ids = &mut *ctx.seen_ids; + + let emit = |base_name: Option, + rel: &str, + at_line: u32, + nodes: &mut Vec, + edges: &mut Vec, + seen_ids: &mut HashSet| { + let Some(base_name) = base_name else { return }; + if base_name.is_empty() { + return; + } + let base_nid = emit_base_node(&base_name, at_line, stem, str_path, nodes, seen_ids); + add_edge(class_nid, &base_nid, rel, at_line, str_path, None, edges); + }; + + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + let child = cur.node(); + let child_line = child.start_position().row as u32 + 1; + match child.kind() { + "base_clause" => { + for sub in named_children(child) { + if matches!(sub.kind(), "name" | "qualified_name") { + emit( + crate::generic::references::php_name_text(sub, source), + "inherits", + child_line, + nodes, + edges, + seen_ids, + ); + } + } + } + "class_interface_clause" => { + for sub in named_children(child) { + if matches!(sub.kind(), "name" | "qualified_name") { + emit( + crate::generic::references::php_name_text(sub, source), + "implements", + child_line, + nodes, + edges, + seen_ids, + ); + } + } + } + _ => {} + } + if !cur.goto_next_sibling() { + break; + } + } + } + + // Trait `use` declarations inside the class body → `mixes_in`. 
+ let body = node + .child_by_field_name("body") + .or_else(|| first_child_kind(node, "declaration_list")); + if let Some(body) = body { + for member in named_children(body) { + if member.kind() != "use_declaration" { + continue; + } + let member_line = member.start_position().row as u32 + 1; + for sub in named_children(member) { + if matches!(sub.kind(), "name" | "qualified_name") { + emit( + crate::generic::references::php_name_text(sub, source), + "mixes_in", + member_line, + nodes, + edges, + seen_ids, + ); + } + } + } + } +} diff --git a/crates/graphify-extract/src/generic/inherit/scala.rs b/crates/graphify-extract/src/generic/inherit/scala.rs new file mode 100644 index 0000000..bdc532e --- /dev/null +++ b/crates/graphify-extract/src/generic/inherit/scala.rs @@ -0,0 +1,96 @@ +//! Scala inheritance-edge emitter. + +#![allow(clippy::cast_possible_truncation)] + +use crate::generic::names::read_text_owned; +use crate::generic::walk::{add_edge, first_child_kind, named_children}; +use tree_sitter::Node; + +/// Emit `inherits` (first base after `extends`) / `mixes_in` (each `with` +/// trait) edges plus `references[field]` edges for constructor parameters. +/// Mirrors Python `_extract_scala`. +pub(crate) fn emit_scala_inheritance( + ctx: &mut crate::generic::walk::WalkCtx<'_, '_>, + node: Node<'_>, + source: &[u8], + class_nid: &str, + _line: u32, +) { + use crate::generic::references::{RefRole, scala_collect_type_refs}; + let stem = ctx.stem; + let str_path = ctx.str_path; + let nodes = &mut *ctx.nodes; + let edges = &mut *ctx.edges; + let seen_ids = &mut *ctx.seen_ids; + + let extend = node + .child_by_field_name("extend") + .or_else(|| first_child_kind(node, "extends_clause")); + if let Some(extend) = extend { + let mut bases: Vec<(String, u32)> = Vec::new(); + for c in named_children(extend) { + let c_line = c.start_position().row as u32 + 1; + // Skip empty base names (consistent with the PHP emitter) so a + // malformed node never spawns an empty-label node. 
+ if c.kind() == "type_identifier" { + let name = read_text_owned(c, source); + if !name.is_empty() { + bases.push((name, c_line)); + } + } else if c.kind() == "generic_type" { + let base = c + .child_by_field_name("type") + .or_else(|| first_child_kind(c, "type_identifier")); + if let Some(base) = base { + let name = read_text_owned(base, source); + if !name.is_empty() { + bases.push((name, c_line)); + } + } + } + } + for (idx, (base_name, base_line)) in bases.into_iter().enumerate() { + let rel = if idx == 0 { "inherits" } else { "mixes_in" }; + let base_nid = crate::generic::walk::ensure_named_node( + &base_name, base_line, stem, str_path, nodes, seen_ids, + ); + if base_nid != class_nid { + add_edge(class_nid, &base_nid, rel, base_line, str_path, None, edges); + } + } + } + + for c in named_children(node) { + if c.kind() != "class_parameters" { + continue; + } + for cp in named_children(c) { + if cp.kind() != "class_parameter" { + continue; + } + let Some(ptype) = cp.child_by_field_name("type") else { + continue; + }; + let cp_line = cp.start_position().row as u32 + 1; + let mut refs: Vec<(String, RefRole)> = Vec::new(); + scala_collect_type_refs(ptype, source, false, &mut refs); + for (ref_name, role) in refs { + let context = role.into_context("field"); + let target = crate::generic::walk::ensure_named_node( + &ref_name, cp_line, stem, str_path, nodes, seen_ids, + ); + if target != class_nid { + add_edge( + class_nid, + &target, + "references", + cp_line, + str_path, + Some(context), + edges, + ); + } + } + } + } +} diff --git a/crates/graphify-extract/src/generic/inherit/swift.rs b/crates/graphify-extract/src/generic/inherit/swift.rs new file mode 100644 index 0000000..f7b816a --- /dev/null +++ b/crates/graphify-extract/src/generic/inherit/swift.rs @@ -0,0 +1,224 @@ +//! Swift inheritance-edge emitter. 
+ +use super::emit_base_node; +use crate::generic::names::read_text_owned; +use crate::generic::walk::{add_edge, first_child_kind}; +use std::collections::HashSet; +use tree_sitter::Node; + +/// Return the leading kind keyword for a Swift `class_declaration` +/// (`class` / `struct` / `enum` / `extension` / `actor`), if present. +#[must_use] +pub(crate) fn swift_declaration_keyword(node: Node<'_>) -> Option<&'static str> { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + let c = cur.node(); + if !c.is_named() { + match c.kind() { + "class" => return Some("class"), + "struct" => return Some("struct"), + "enum" => return Some("enum"), + "extension" => return Some("extension"), + "actor" => return Some("actor"), + _ => {} + } + } + if !cur.goto_next_sibling() { + break; + } + } + } + None +} + +/// Pre-scan a Swift compilation unit, returning `(protocol_names, class_like_names)`. +/// +/// Used to classify each `inheritance_specifier` entry as `inherits` (a class) +/// or `implements` (a protocol). Mirrors Python `_swift_pre_scan`. 
+#[must_use] +pub(crate) fn swift_pre_scan(root: Node<'_>, source: &[u8]) -> (HashSet, HashSet) { + let mut protocols: HashSet = HashSet::new(); + let mut classes: HashSet = HashSet::new(); + let mut stack: Vec> = vec![root]; + while let Some(n) = stack.pop() { + if n.kind() == "protocol_declaration" { + let name_node = n + .child_by_field_name("name") + .or_else(|| first_child_kind(n, "type_identifier")); + if let Some(nn) = name_node { + let text = read_text_owned(nn, source); + if !text.is_empty() { + protocols.insert(text); + } + } + } else if n.kind() == "class_declaration" + && matches!( + swift_declaration_keyword(n), + Some("class" | "struct" | "enum" | "actor") + ) + && let Some(nn) = n.child_by_field_name("name") + { + let text = read_text_owned(nn, source); + if !text.is_empty() { + classes.insert(text); + } + } + let mut cur = n.walk(); + if cur.goto_first_child() { + loop { + stack.push(cur.node()); + if !cur.goto_next_sibling() { + break; + } + } + } + } + (protocols, classes) +} + +/// Classify a Swift inheritance entry as `inherits` or `implements`. +/// +/// Declared protocols → `implements`; declared classes → `inherits`. A +/// `struct`/`enum`/`extension`/`actor` can only conform to protocols, so all +/// of its entries are `implements`. For a `class`, the first entry is the base +/// class (`inherits`) and the rest are protocol conformances (`implements`). +/// Mirrors Python `_swift_classify_base`. 
+fn swift_classify_base( + name: &str, + kind: Option<&str>, + is_first: bool, + protocols: &HashSet, + classes: &HashSet, +) -> &'static str { + if protocols.contains(name) { + return "implements"; + } + if classes.contains(name) { + return "inherits"; + } + if matches!(kind, Some("struct" | "enum" | "extension" | "actor")) { + return "implements"; + } + if is_first { "inherits" } else { "implements" } +} + +/// Emit `inherits` / `implements` edges for a Swift class/protocol/extension's +/// `inheritance_specifier` children, plus `references[generic_arg]` edges for +/// any generic arguments on a base type. Mirrors Python `_extract_swift`. +#[allow(clippy::too_many_lines)] // linear walk over inheritance specifiers + their generic args +pub(crate) fn emit_swift_inheritance( + ctx: &mut crate::generic::walk::WalkCtx<'_, '_>, + node: Node<'_>, + source: &[u8], + class_nid: &str, + line: u32, +) { + use crate::generic::references::{RefRole, swift_collect_type_refs, swift_user_type_name}; + + let stem = ctx.stem; + let str_path = ctx.str_path; + let protocols = ctx.swift_protocol_names; + let classes = ctx.swift_class_names; + let is_protocol = node.kind() == "protocol_declaration"; + let kind = if node.kind() == "class_declaration" { + swift_declaration_keyword(node) + } else { + Some("protocol") + }; + let nodes = &mut *ctx.nodes; + let edges = &mut *ctx.edges; + let seen_ids = &mut *ctx.seen_ids; + + let mut seen_base = false; + let mut cur = node.walk(); + if !cur.goto_first_child() { + return; + } + loop { + let child = cur.node(); + if child.kind() == "inheritance_specifier" { + // Resolve the base name (and the user_type carrying any generics). 
+ let mut base_name: Option = None; + let mut user_type_node: Option> = None; + let mut scur = child.walk(); + if scur.goto_first_child() { + loop { + let sub = scur.node(); + if sub.kind() == "user_type" { + user_type_node = Some(sub); + base_name = swift_user_type_name(sub, source); + break; + } + if sub.kind() == "type_identifier" { + let t = read_text_owned(sub, source); + base_name = (!t.is_empty()).then_some(t); + break; + } + if !scur.goto_next_sibling() { + break; + } + } + } + if let Some(base_name) = base_name { + let base_nid = emit_base_node(&base_name, line, stem, str_path, nodes, seen_ids); + let relation = if is_protocol { + "inherits" + } else { + swift_classify_base(&base_name, kind, !seen_base, protocols, classes) + }; + seen_base = true; + add_edge(class_nid, &base_nid, relation, line, str_path, None, edges); + // Generic arguments on the base type → references[generic_arg]. + if let Some(ut) = user_type_node { + let mut tacur = ut.walk(); + if tacur.goto_first_child() { + loop { + if tacur.node().kind() == "type_arguments" { + let mut acur = tacur.node().walk(); + if acur.goto_first_child() { + loop { + if acur.node().is_named() { + let mut refs: Vec<(String, RefRole)> = Vec::new(); + swift_collect_type_refs( + acur.node(), + source, + true, + &mut refs, + ); + for (ref_name, _role) in refs { + let target = + crate::generic::walk::ensure_named_node( + &ref_name, line, stem, str_path, nodes, + seen_ids, + ); + add_edge( + class_nid, + &target, + "references", + line, + str_path, + Some("generic_arg"), + edges, + ); + } + } + if !acur.goto_next_sibling() { + break; + } + } + } + } + if !tacur.goto_next_sibling() { + break; + } + } + } + } + } + } + if !cur.goto_next_sibling() { + break; + } + } +} diff --git a/crates/graphify-extract/src/generic/inherit/ts.rs b/crates/graphify-extract/src/generic/inherit/ts.rs new file mode 100644 index 0000000..b9f5a7a --- /dev/null +++ b/crates/graphify-extract/src/generic/inherit/ts.rs @@ -0,0 +1,74 @@ +//! 
TypeScript / JavaScript inheritance-edge emitter. + +use super::emit_base_node; +use crate::generic::walk::add_edge; +use tree_sitter::Node; + +/// Emit `inherits` / `implements` edges for a TS class declaration's +/// `class_heritage` child. +/// +/// TS distinguishes `extends_clause` (single class) from `implements_clause` +/// (one or more interfaces). `extends` is normalised to `inherits` so all +/// languages share a single relation name for class extension. The `name` +/// field's type-arguments are NOT walked here — that happens in the field / +/// method passes via `ts_collect_type_refs`. +pub(crate) fn emit_ts_inheritance( + ctx: &mut crate::generic::walk::WalkCtx<'_, '_>, + node: Node<'_>, + source: &[u8], + class_nid: &str, + line: u32, +) { + let stem = ctx.stem; + let str_path = ctx.str_path; + let nodes = &mut *ctx.nodes; + let edges = &mut *ctx.edges; + let seen_ids = &mut *ctx.seen_ids; + let mut cur = node.walk(); + if !cur.goto_first_child() { + return; + } + loop { + let child = cur.node(); + if child.kind() == "class_heritage" { + let mut hcur = child.walk(); + if hcur.goto_first_child() { + loop { + let clause = hcur.node(); + let relation = match clause.kind() { + "extends_clause" => Some("inherits"), + "implements_clause" => Some("implements"), + _ => None, + }; + if let Some(rel) = relation { + for name in + crate::generic::references::ts_heritage_clause_entries(clause, source) + { + let base_nid = + emit_base_node(&name, line, stem, str_path, nodes, seen_ids); + add_edge(class_nid, &base_nid, rel, line, str_path, None, edges); + } + } + if !hcur.goto_next_sibling() { + break; + } + } + } + } else if child.kind() == "extends_type_clause" { + // Interface heritage (`interface A extends B, C`) is an + // extends_type_clause node directly under the declaration, NOT + // wrapped in class_heritage. Its base entries are the same node types + // extends_clause holds, so the entry helper is reusable. 
Without this + // branch interface inheritance is dropped entirely (#1095). + for name in crate::generic::references::ts_heritage_clause_entries(child, source) { + let base_nid = emit_base_node(&name, line, stem, str_path, nodes, seen_ids); + add_edge( + class_nid, &base_nid, "inherits", line, str_path, None, edges, + ); + } + } + if !cur.goto_next_sibling() { + break; + } + } +} diff --git a/crates/graphify-extract/src/generic/mod.rs b/crates/graphify-extract/src/generic/mod.rs index 56a853a..529bf00 100644 --- a/crates/graphify-extract/src/generic/mod.rs +++ b/crates/graphify-extract/src/generic/mod.rs @@ -15,6 +15,7 @@ mod calls; pub mod config; +mod graph; mod inherit; mod js_extra; mod names; diff --git a/crates/graphify-extract/src/generic/references.rs b/crates/graphify-extract/src/generic/references.rs deleted file mode 100644 index 3adab76..0000000 --- a/crates/graphify-extract/src/generic/references.rs +++ /dev/null @@ -1,1402 +0,0 @@ -//! Per-language type-reference emitters for function/method declarations. -//! -//! These helpers walk parameter lists, return types, and annotations on -//! function nodes to emit `references` edges with a `context` set to -//! `parameter_type`, `return_type`, `generic_arg`, or `attribute`. -//! -//! Mirrors the `_python_*` / `_csharp_*` / `_java_*` helpers added to -//! `graphify-py/graphify/extract.py` in ab4e542. - -use tree_sitter::Node; - -use super::names::read_text_owned; -use super::walk::first_child_kind; - -/// Role of a collected type reference. `Direct` = used as the type itself -/// (e.g. `def f(x: Foo)`), `Generic` = used as a type argument to a generic -/// (e.g. `def f(x: list[Foo])`). -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub(crate) enum RefRole { - Direct, - Generic, -} - -impl RefRole { - /// Map a role into the canonical `context` string used on the emitted edge. - /// `Direct` becomes the supplied `direct_ctx` (e.g. 
`"parameter_type"` or - /// `"return_type"`); `Generic` always becomes `"generic_arg"`. - pub(super) fn into_context(self, direct_ctx: &'static str) -> &'static str { - match self { - Self::Direct => direct_ctx, - Self::Generic => "generic_arg", - } - } -} - -// ── Python ──────────────────────────────────────────────────────────────────── - -/// Python `typing` containers that are not themselves user-defined types and -/// must therefore be skipped when collecting reference names — but their -/// nested arguments still count as `generic_arg`. -/// -/// Mirrors `_PYTHON_TYPE_CONTAINERS` in `extract.py`. -const PYTHON_TYPE_CONTAINERS: &[&str] = &[ - "list", - "dict", - "set", - "tuple", - "frozenset", - "type", - "List", - "Dict", - "Set", - "Tuple", - "FrozenSet", - "Type", - "Optional", - "Union", - "Sequence", - "Iterable", - "Mapping", - "MutableMapping", - "Iterator", - "Callable", - "Awaitable", - "AsyncIterable", - "AsyncIterator", - "Coroutine", - "Generator", - "AsyncGenerator", - "ContextManager", - "AsyncContextManager", - "Annotated", - "ClassVar", - "Final", - "Literal", - "Concatenate", - "ParamSpec", - "TypeVar", - "None", - "Ellipsis", -]; - -/// Scalar builtins and `unittest.mock` names that appear as type annotations but -/// carry no useful semantic meaning as graph nodes (#1147). Suppressed at the -/// annotation walker level so they are never created as nodes or emitted as -/// edges. Mirrors `_PYTHON_ANNOTATION_NOISE` in `extract.py`. 
-const PYTHON_ANNOTATION_NOISE: &[&str] = &[ - // scalar builtins - "str", - "int", - "float", - "bool", - "bytes", - "bytearray", - "complex", - "object", - "True", - "False", - // unittest.mock - "MagicMock", - "Mock", - "AsyncMock", - "NonCallableMock", - "NonCallableMagicMock", - "PropertyMock", - "patch", - "sentinel", -]; - -fn is_python_container(name: &str) -> bool { - PYTHON_TYPE_CONTAINERS.contains(&name) -} - -fn is_python_annotation_noise(name: &str) -> bool { - PYTHON_ANNOTATION_NOISE.contains(&name) -} - -/// Walk a Python type annotation tree and append `(name, role)` pairs. -/// -/// `generic = true` means we entered the function from a `subscript` value or -/// `type_arguments` child, so every emitted name takes the `Generic` role. -#[allow(clippy::too_many_lines)] // single recursive dispatch over tree-sitter Python type kinds; splitting would fragment the per-kind branches -pub(super) fn python_collect_type_refs( - node: Node<'_>, - source: &[u8], - generic: bool, - out: &mut Vec<(String, RefRole)>, -) { - let t = node.kind(); - if t == "type" { - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - let child = cur.node(); - if child.is_named() { - python_collect_type_refs(child, source, generic, out); - } - if !cur.goto_next_sibling() { - break; - } - } - } - return; - } - if t == "identifier" { - let name = read_text_owned(node, source); - if !name.is_empty() && !is_python_container(&name) && !is_python_annotation_noise(&name) { - let role = if generic { - RefRole::Generic - } else { - RefRole::Direct - }; - out.push((name, role)); - } - return; - } - if t == "attribute" { - let text = read_text_owned(node, source); - let tail = text.rsplit('.').next().unwrap_or(&text); - if !tail.is_empty() && !is_python_container(tail) && !is_python_annotation_noise(tail) { - let role = if generic { - RefRole::Generic - } else { - RefRole::Direct - }; - out.push((tail.to_string(), role)); - } - return; - } - if t == "generic_type" { - let mut 
cur = node.walk(); - if cur.goto_first_child() { - loop { - let child = cur.node(); - if child.kind() == "identifier" { - let container = read_text_owned(child, source); - if !container.is_empty() - && !is_python_container(&container) - && !is_python_annotation_noise(&container) - { - let role = if generic { - RefRole::Generic - } else { - RefRole::Direct - }; - out.push((container, role)); - } - } else if child.kind() == "type_parameter" { - let mut sc = child.walk(); - if sc.goto_first_child() { - loop { - if sc.node().is_named() { - python_collect_type_refs(sc.node(), source, true, out); - } - if !sc.goto_next_sibling() { - break; - } - } - } - } - if !cur.goto_next_sibling() { - break; - } - } - } - return; - } - if t == "subscript" { - let value = node.child_by_field_name("value"); - if let Some(v) = value { - python_collect_type_refs(v, source, generic, out); - } - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - let child = cur.node(); - if Some(child) != value && child.is_named() { - python_collect_type_refs(child, source, true, out); - } - if !cur.goto_next_sibling() { - break; - } - } - } - return; - } - if node.is_named() { - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - if cur.node().is_named() { - python_collect_type_refs(cur.node(), source, generic, out); - } - if !cur.goto_next_sibling() { - break; - } - } - } - } -} - -/// Collect type references from each typed parameter under a `parameters` node. 
-#[must_use] -pub(super) fn python_collect_param_refs( - params_node: Option>, - source: &[u8], -) -> Vec<(String, RefRole)> { - let mut out = Vec::new(); - let Some(params) = params_node else { - return out; - }; - let mut cur = params.walk(); - if !cur.goto_first_child() { - return out; - } - loop { - let child = cur.node(); - if matches!(child.kind(), "typed_parameter" | "typed_default_parameter") - && let Some(type_node) = child.child_by_field_name("type") - { - python_collect_type_refs(type_node, source, false, &mut out); - } - if !cur.goto_next_sibling() { - break; - } - } - out -} - -// ── C# ──────────────────────────────────────────────────────────────────────── - -#[allow(clippy::too_many_lines)] // single recursive dispatch over tree-sitter C# type kinds -pub(super) fn csharp_collect_type_refs( - node: Node<'_>, - source: &[u8], - generic: bool, - out: &mut Vec<(String, RefRole)>, -) { - let t = node.kind(); - if t == "predefined_type" { - return; - } - if t == "identifier" { - let name = read_text_owned(node, source); - if !name.is_empty() { - let role = if generic { - RefRole::Generic - } else { - RefRole::Direct - }; - out.push((name, role)); - } - return; - } - if t == "qualified_name" { - let full = read_text_owned(node, source); - let tail = full.rsplit('.').next().unwrap_or(&full); - if !tail.is_empty() { - let role = if generic { - RefRole::Generic - } else { - RefRole::Direct - }; - out.push((tail.to_string(), role)); - } - return; - } - if t == "generic_name" { - let name_node = node.child_by_field_name("name").or_else(|| { - let mut sc = node.walk(); - if sc.goto_first_child() { - loop { - if sc.node().kind() == "identifier" { - return Some(sc.node()); - } - if !sc.goto_next_sibling() { - break; - } - } - } - None - }); - if let Some(nn) = name_node { - let name = read_text_owned(nn, source); - if !name.is_empty() { - let role = if generic { - RefRole::Generic - } else { - RefRole::Direct - }; - out.push((name, role)); - } - } - let mut sc = 
node.walk(); - if sc.goto_first_child() { - loop { - if sc.node().kind() == "type_argument_list" { - let mut acur = sc.node().walk(); - if acur.goto_first_child() { - loop { - if acur.node().is_named() { - csharp_collect_type_refs(acur.node(), source, true, out); - } - if !acur.goto_next_sibling() { - break; - } - } - } - } - if !sc.goto_next_sibling() { - break; - } - } - } - return; - } - if matches!( - t, - "nullable_type" | "array_type" | "pointer_type" | "ref_type" - ) { - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - if cur.node().is_named() { - csharp_collect_type_refs(cur.node(), source, generic, out); - } - if !cur.goto_next_sibling() { - break; - } - } - } - return; - } - if node.is_named() { - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - if cur.node().is_named() { - csharp_collect_type_refs(cur.node(), source, generic, out); - } - if !cur.goto_next_sibling() { - break; - } - } - } - } -} - -/// Collect attribute names from a C# method's `attribute_list` children. -/// -/// `[Authorize, Route("/api")]` on a method produces `["Authorize", "Route"]`. 
-#[must_use] -pub(super) fn csharp_attribute_names(method_node: Node<'_>, source: &[u8]) -> Vec { - let mut names = Vec::new(); - let mut cur = method_node.walk(); - if !cur.goto_first_child() { - return names; - } - loop { - let child = cur.node(); - if child.kind() == "attribute_list" { - let mut acur = child.walk(); - if acur.goto_first_child() { - loop { - let attr = acur.node(); - if attr.kind() == "attribute" { - let name_node = attr.child_by_field_name("name").or_else(|| { - let mut sc = attr.walk(); - if sc.goto_first_child() { - loop { - if matches!(sc.node().kind(), "identifier" | "qualified_name") { - return Some(sc.node()); - } - if !sc.goto_next_sibling() { - break; - } - } - } - None - }); - if let Some(nn) = name_node { - let text = read_text_owned(nn, source); - let tail = text.rsplit('.').next().unwrap_or(&text); - if !tail.is_empty() { - names.push(tail.to_string()); - } - } - } - if !acur.goto_next_sibling() { - break; - } - } - } - } - if !cur.goto_next_sibling() { - break; - } - } - names -} - -// ── Java ────────────────────────────────────────────────────────────────────── - -#[allow(clippy::too_many_lines)] // single recursive dispatch over tree-sitter Java type kinds -pub(super) fn java_collect_type_refs( - node: Node<'_>, - source: &[u8], - generic: bool, - out: &mut Vec<(String, RefRole)>, -) { - let t = node.kind(); - if matches!( - t, - "integral_type" | "floating_point_type" | "boolean_type" | "void_type" - ) { - return; - } - if t == "type_identifier" { - let name = read_text_owned(node, source); - if !name.is_empty() { - let role = if generic { - RefRole::Generic - } else { - RefRole::Direct - }; - out.push((name, role)); - } - return; - } - if t == "scoped_type_identifier" { - let text = read_text_owned(node, source); - let tail = text.rsplit('.').next().unwrap_or(&text); - if !tail.is_empty() { - let role = if generic { - RefRole::Generic - } else { - RefRole::Direct - }; - out.push((tail.to_string(), role)); - } - return; - } - if 
t == "generic_type" { - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - let child = cur.node(); - if matches!(child.kind(), "type_identifier" | "scoped_type_identifier") { - let text = read_text_owned(child, source); - let tail = text.rsplit('.').next().unwrap_or(&text); - if !tail.is_empty() { - let role = if generic { - RefRole::Generic - } else { - RefRole::Direct - }; - out.push((tail.to_string(), role)); - } - break; - } - if !cur.goto_next_sibling() { - break; - } - } - } - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - let child = cur.node(); - if child.kind() == "type_arguments" { - let mut acur = child.walk(); - if acur.goto_first_child() { - loop { - if acur.node().is_named() { - java_collect_type_refs(acur.node(), source, true, out); - } - if !acur.goto_next_sibling() { - break; - } - } - } - } - if !cur.goto_next_sibling() { - break; - } - } - } - return; - } - if t == "array_type" { - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - if cur.node().is_named() { - java_collect_type_refs(cur.node(), source, generic, out); - } - if !cur.goto_next_sibling() { - break; - } - } - } - return; - } - if node.is_named() { - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - if cur.node().is_named() { - java_collect_type_refs(cur.node(), source, generic, out); - } - if !cur.goto_next_sibling() { - break; - } - } - } - } -} - -/// Find the `modifiers` child of a Java method declaration, if any. 
-fn find_modifiers(method_node: Node<'_>) -> Option> { - let mut cur = method_node.walk(); - if !cur.goto_first_child() { - return None; - } - loop { - if cur.node().kind() == "modifiers" { - return Some(cur.node()); - } - if !cur.goto_next_sibling() { - return None; - } - } -} - -// ── TypeScript / JavaScript ────────────────────────────────────────────────── - -/// TS/JS primitive type names that are emitted by tree-sitter as `identifier` -/// or `type_identifier` but do not represent user-defined types. We skip them -/// when collecting reference names to avoid noise like `string` / `number`. -/// -/// Mirrors `_JS_PRIMITIVE_TYPES` in `extract.py`. -const JS_PRIMITIVE_TYPES: &[&str] = &[ - "string", - "number", - "boolean", - "any", - "unknown", - "void", - "never", - "object", - "null", - "undefined", - "bigint", - "symbol", - "this", -]; - -fn is_js_primitive(name: &str) -> bool { - JS_PRIMITIVE_TYPES.contains(&name) -} - -/// Walk a TypeScript type annotation tree and append `(name, role)` tuples. -/// -/// Mirrors Python `_ts_collect_type_refs`. 
-#[allow(clippy::too_many_lines)] // single recursive dispatch over tree-sitter TypeScript type kinds -pub(super) fn ts_collect_type_refs( - node: Node<'_>, - source: &[u8], - generic: bool, - out: &mut Vec<(String, RefRole)>, -) { - let t = node.kind(); - if t == "type_annotation" { - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - if cur.node().is_named() { - ts_collect_type_refs(cur.node(), source, generic, out); - } - if !cur.goto_next_sibling() { - break; - } - } - } - return; - } - if matches!(t, "type_identifier" | "identifier") { - let name = read_text_owned(node, source); - if !name.is_empty() && !is_js_primitive(&name) { - let role = if generic { - RefRole::Generic - } else { - RefRole::Direct - }; - out.push((name, role)); - } - return; - } - if t == "nested_type_identifier" { - let text = read_text_owned(node, source); - let tail = text.rsplit('.').next().unwrap_or(&text); - if !tail.is_empty() && !is_js_primitive(tail) { - let role = if generic { - RefRole::Generic - } else { - RefRole::Direct - }; - out.push((tail.to_string(), role)); - } - return; - } - if t == "generic_type" { - let name_node = node.child_by_field_name("name"); - if let Some(nn) = name_node { - let text = read_text_owned(nn, source); - let tail = text.rsplit('.').next().unwrap_or(&text); - if !tail.is_empty() && !is_js_primitive(tail) { - let role = if generic { - RefRole::Generic - } else { - RefRole::Direct - }; - out.push((tail.to_string(), role)); - } - } else { - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - if matches!( - cur.node().kind(), - "type_identifier" | "nested_type_identifier" - ) { - let text = read_text_owned(cur.node(), source); - let tail = text.rsplit('.').next().unwrap_or(&text); - if !tail.is_empty() && !is_js_primitive(tail) { - let role = if generic { - RefRole::Generic - } else { - RefRole::Direct - }; - out.push((tail.to_string(), role)); - } - break; - } - if !cur.goto_next_sibling() { - break; - } - } - } - } - 
let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - if cur.node().kind() == "type_arguments" { - let mut acur = cur.node().walk(); - if acur.goto_first_child() { - loop { - if acur.node().is_named() { - ts_collect_type_refs(acur.node(), source, true, out); - } - if !acur.goto_next_sibling() { - break; - } - } - } - } - if !cur.goto_next_sibling() { - break; - } - } - } - return; - } - if node.is_named() { - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - if cur.node().is_named() { - ts_collect_type_refs(cur.node(), source, generic, out); - } - if !cur.goto_next_sibling() { - break; - } - } - } - } -} - -/// Return the type-identifier names extracted from an `extends_clause` or -/// `implements_clause`. Both clauses can list multiple types (e.g. -/// `implements A, B`); each name is returned as the tail of any -/// qualified path (`Foo.Bar` → `"Bar"`). -/// -/// Mirrors Python `_ts_heritage_clause_entries`. -#[must_use] -pub(super) fn ts_heritage_clause_entries(clause: Node<'_>, source: &[u8]) -> Vec { - let mut out = Vec::new(); - let mut cur = clause.walk(); - if !cur.goto_first_child() { - return out; - } - loop { - let child = cur.node(); - if child.is_named() { - match child.kind() { - "identifier" | "type_identifier" => { - let name = read_text_owned(child, source); - if !name.is_empty() { - out.push(name); - } - } - "generic_type" => { - let name_node = child.child_by_field_name("name").or_else(|| { - let mut sc = child.walk(); - if sc.goto_first_child() { - loop { - if matches!( - sc.node().kind(), - "type_identifier" | "nested_type_identifier" | "identifier" - ) { - return Some(sc.node()); - } - if !sc.goto_next_sibling() { - break; - } - } - } - None - }); - if let Some(nn) = name_node { - let text = read_text_owned(nn, source); - let tail = text.rsplit('.').next().unwrap_or(&text); - if !tail.is_empty() { - out.push(tail.to_string()); - } - } - } - "nested_type_identifier" => { - let text = read_text_owned(child, 
source); - let tail = text.rsplit('.').next().unwrap_or(&text); - if !tail.is_empty() { - out.push(tail.to_string()); - } - } - _ => {} - } - } - if !cur.goto_next_sibling() { - break; - } - } - out -} - -/// Collect annotation names from a Java method's `modifiers` child. -/// -/// `@Override @Deprecated public void foo()` yields `["Override", "Deprecated"]`. -#[must_use] -pub(super) fn java_method_annotation_names(method_node: Node<'_>, source: &[u8]) -> Vec { - let mut names = Vec::new(); - let Some(modifiers) = find_modifiers(method_node) else { - return names; - }; - let mut acur = modifiers.walk(); - if !acur.goto_first_child() { - return names; - } - loop { - let anno = acur.node(); - if matches!(anno.kind(), "marker_annotation" | "annotation") { - let name_node = anno.child_by_field_name("name").or_else(|| { - let mut sc = anno.walk(); - if sc.goto_first_child() { - loop { - if matches!( - sc.node().kind(), - "identifier" | "scoped_identifier" | "type_identifier" - ) { - return Some(sc.node()); - } - if !sc.goto_next_sibling() { - break; - } - } - } - None - }); - if let Some(nn) = name_node { - let text = read_text_owned(nn, source); - let tail = text.rsplit('.').next().unwrap_or(&text); - if !tail.is_empty() { - names.push(tail.to_string()); - } - } - } - if !acur.goto_next_sibling() { - break; - } - } - names -} - -// ── Shared helpers for the v0.8.25 cross-language type-ref collectors ────────── - -/// Map a `generic` flag to the corresponding [`RefRole`]. -fn role_of(generic: bool) -> RefRole { - if generic { - RefRole::Generic - } else { - RefRole::Direct - } -} - -/// A language type-reference collector: walks a type node, appending -/// `(name, role)` tuples for each referenced user type. -pub(super) type RefCollector = fn(Node<'_>, &[u8], bool, &mut Vec<(String, RefRole)>); - -/// Recurse `collect` over every named child of `node`, preserving `generic`. 
-fn recurse_named_refs( - node: Node<'_>, - source: &[u8], - generic: bool, - out: &mut Vec<(String, RefRole)>, - collect: RefCollector, -) { - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - if cur.node().is_named() { - collect(cur.node(), source, generic, out); - } - if !cur.goto_next_sibling() { - break; - } - } - } -} - -// ── Swift ─────────────────────────────────────────────────────────────────── - -/// Return the head `type_identifier` text from a Swift `user_type` node. -#[must_use] -pub(super) fn swift_user_type_name(user_type_node: Node<'_>, source: &[u8]) -> Option { - first_child_kind(user_type_node, "type_identifier") - .map(|n| read_text_owned(n, source)) - .filter(|t| !t.is_empty()) -} - -/// Return the `type_annotation` child of a Swift `property_declaration`, if any. -#[must_use] -pub(crate) fn swift_property_type_node(property_node: Node<'_>) -> Option> { - first_child_kind(property_node, "type_annotation") -} - -/// Return the bound name of a Swift property (`let x` / `var x = ...`). Mirrors -/// `_swift_property_name`. -#[must_use] -pub(crate) fn swift_property_name(property_node: Node<'_>, source: &[u8]) -> Option { - let mut cur = property_node.walk(); - if cur.goto_first_child() { - loop { - let c = cur.node(); - if c.kind() == "pattern" - && let Some(id) = first_child_kind(c, "simple_identifier") - { - return Some(read_text_owned(id, source)); - } - if c.kind() == "simple_identifier" { - return Some(read_text_owned(c, source)); - } - if !cur.goto_next_sibling() { - break; - } - } - } - None -} - -/// If a Swift call expression is a constructor (`Foo()`), return the type name. -/// Only upper-cased callees are treated as types so a free-function call like -/// `configure()` in an initializer is not mistaken for a constructor. Mirrors -/// `_swift_constructor_type`. 
-#[must_use] -pub(crate) fn swift_constructor_type(call_node: Node<'_>, source: &[u8]) -> Option { - let first = call_node.child(0)?; - if first.kind() == "simple_identifier" { - let text = read_text_owned(first, source); - if text.chars().next().is_some_and(char::is_uppercase) { - return Some(text); - } - } - None -} - -/// Walk a Swift type expression; append `(name, role)` tuples. Mirrors -/// Python `_swift_collect_type_refs`. -pub(crate) fn swift_collect_type_refs( - node: Node<'_>, - source: &[u8], - generic: bool, - out: &mut Vec<(String, RefRole)>, -) { - match node.kind() { - "user_type" => { - if let Some(head) = first_child_kind(node, "type_identifier") { - let text = read_text_owned(head, source); - if !text.is_empty() { - out.push((text, role_of(generic))); - } - } - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - if cur.node().kind() == "type_arguments" { - recurse_named_refs(cur.node(), source, true, out, swift_collect_type_refs); - } - if !cur.goto_next_sibling() { - break; - } - } - } - } - "type_identifier" => { - let text = read_text_owned(node, source); - if !text.is_empty() { - out.push((text, role_of(generic))); - } - } - // `optional_type`, `array_type`, `dictionary_type`, `tuple_type`, etc. - // are all named wrappers handled identically by the fallback below. - _ if node.is_named() => { - recurse_named_refs(node, source, generic, out, swift_collect_type_refs); - } - _ => {} - } -} - -// ── PHP ─────────────────────────────────────────────────────────────────────── - -/// Return the unqualified tail of a PHP `name` / `qualified_name` node. -#[must_use] -pub(super) fn php_name_text(node: Node<'_>, source: &[u8]) -> Option { - let full = read_text_owned(node, source); - let tail = full.rsplit('\\').next().unwrap_or(&full); - if tail.is_empty() { - None - } else { - Some(tail.to_string()) - } -} - -/// PHP type-node kinds that count as a type annotation on params/properties. 
-pub(super) const PHP_TYPE_NODE_KINDS: &[&str] = &[ - "named_type", - "primitive_type", - "nullable_type", - "union_type", - "intersection_type", - "optional_type", -]; - -/// Return the return-type node following `formal_parameters` on a PHP method. -#[must_use] -pub(super) fn php_method_return_type_node(method_node: Node<'_>) -> Option> { - let mut saw_params = false; - let mut cur = method_node.walk(); - if cur.goto_first_child() { - loop { - let c = cur.node(); - if c.kind() == "formal_parameters" { - saw_params = true; - } else if saw_params - && c.is_named() - && c.kind() != "compound_statement" - && PHP_TYPE_NODE_KINDS.contains(&c.kind()) - { - return Some(c); - } - if !cur.goto_next_sibling() { - break; - } - } - } - None -} - -/// Walk a PHP type expression; append `(name, role)` tuples. Mirrors -/// Python `_php_collect_type_refs`. -pub(super) fn php_collect_type_refs( - node: Node<'_>, - source: &[u8], - generic: bool, - out: &mut Vec<(String, RefRole)>, -) { - match node.kind() { - "primitive_type" => {} - "named_type" => { - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - if matches!(cur.node().kind(), "name" | "qualified_name") { - if let Some(text) = php_name_text(cur.node(), source) { - out.push((text, role_of(generic))); - } - return; - } - if !cur.goto_next_sibling() { - break; - } - } - } - } - "name" | "qualified_name" => { - if let Some(text) = php_name_text(node, source) { - out.push((text, role_of(generic))); - } - } - // `nullable_type` / `union_type` / `intersection_type` / `optional_type` - // are named wrappers handled identically by the fallback below. - _ if node.is_named() => { - recurse_named_refs(node, source, generic, out, php_collect_type_refs); - } - _ => {} - } -} - -// ── Kotlin ──────────────────────────────────────────────────────────────────── - -/// Return the head identifier text from a Kotlin `user_type` node. 
-#[must_use] -pub(super) fn kotlin_user_type_name(user_type_node: Node<'_>, source: &[u8]) -> Option { - let mut cur = user_type_node.walk(); - if cur.goto_first_child() { - loop { - let c = cur.node(); - match c.kind() { - "type_identifier" | "identifier" => { - let text = read_text_owned(c, source); - return if text.is_empty() { None } else { Some(text) }; - } - "simple_user_type" => { - if let Some(sub) = first_named_identifier(c) { - let text = read_text_owned(sub, source); - return if text.is_empty() { None } else { Some(text) }; - } - } - _ => {} - } - if !cur.goto_next_sibling() { - break; - } - } - } - None -} - -/// Return the first `identifier` / `type_identifier` child of `node`. -fn first_named_identifier(node: Node<'_>) -> Option> { - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - if matches!(cur.node().kind(), "identifier" | "type_identifier") { - return Some(cur.node()); - } - if !cur.goto_next_sibling() { - break; - } - } - } - None -} - -/// Find the type node within a Kotlin `property_declaration`. -#[must_use] -pub(super) fn kotlin_property_type_node(property_node: Node<'_>) -> Option> { - let mut cur = property_node.walk(); - if cur.goto_first_child() { - loop { - let c = cur.node(); - if c.kind() == "variable_declaration" - && let Some(sub) = kotlin_type_child(c) - { - return Some(sub); - } - if matches!(c.kind(), "user_type" | "nullable_type" | "type_reference") { - return Some(c); - } - if !cur.goto_next_sibling() { - break; - } - } - } - None -} - -fn kotlin_type_child(node: Node<'_>) -> Option> { - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - if matches!( - cur.node().kind(), - "user_type" | "nullable_type" | "type_reference" - ) { - return Some(cur.node()); - } - if !cur.goto_next_sibling() { - break; - } - } - } - None -} - -/// Find the return-type node of a Kotlin `function_declaration`. 
-#[must_use] -pub(super) fn kotlin_function_return_type_node(func_node: Node<'_>) -> Option> { - let mut saw_params = false; - let mut saw_colon = false; - let mut cur = func_node.walk(); - if cur.goto_first_child() { - loop { - let c = cur.node(); - if c.kind() == "function_value_parameters" { - saw_params = true; - } else if saw_params && c.kind() == ":" { - saw_colon = true; - } else if saw_colon && c.is_named() { - return Some(c); - } - if !cur.goto_next_sibling() { - break; - } - } - } - None -} - -/// Walk a Kotlin type expression; append `(name, role)` tuples. Mirrors -/// Python `_kotlin_collect_type_refs`. -pub(super) fn kotlin_collect_type_refs( - node: Node<'_>, - source: &[u8], - generic: bool, - out: &mut Vec<(String, RefRole)>, -) { - match node.kind() { - "integral_literal" | "boolean_literal" => {} - "user_type" => { - if let Some(head) = kotlin_user_type_head(node) { - let text = read_text_owned(head, source); - if !text.is_empty() { - out.push((text, role_of(generic))); - } - } - kotlin_collect_type_arguments(node, source, out); - } - "identifier" | "type_identifier" => { - let text = read_text_owned(node, source); - if !text.is_empty() { - out.push((text, role_of(generic))); - } - } - // `nullable_type` / `parenthesized_type` / `type_reference` are named - // wrappers handled identically by the fallback below. - _ if node.is_named() => { - recurse_named_refs(node, source, generic, out, kotlin_collect_type_refs); - } - _ => {} - } -} - -/// Return the head `identifier`/`type_identifier` node of a Kotlin `user_type`, -/// drilling through a `simple_user_type` wrapper. 
-fn kotlin_user_type_head(node: Node<'_>) -> Option> { - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - let c = cur.node(); - if matches!(c.kind(), "identifier" | "type_identifier") { - return Some(c); - } - if c.kind() == "simple_user_type" { - return first_named_identifier(c); - } - if !cur.goto_next_sibling() { - break; - } - } - } - None -} - -/// Recurse into a Kotlin `user_type`'s `type_arguments`, marking refs generic. -fn kotlin_collect_type_arguments(node: Node<'_>, source: &[u8], out: &mut Vec<(String, RefRole)>) { - let mut cur = node.walk(); - if !cur.goto_first_child() { - return; - } - loop { - if cur.node().kind() == "type_arguments" { - let mut acur = cur.node().walk(); - if acur.goto_first_child() { - loop { - let arg = acur.node(); - if arg.kind() == "type_projection" { - recurse_named_refs(arg, source, true, out, kotlin_collect_type_refs); - } else if arg.is_named() { - kotlin_collect_type_refs(arg, source, true, out); - } - if !acur.goto_next_sibling() { - break; - } - } - } - } - if !cur.goto_next_sibling() { - break; - } - } -} - -// ── Scala ───────────────────────────────────────────────────────────────────── - -/// Walk a Scala type expression; append `(name, role)` tuples. Mirrors -/// Python `_scala_collect_type_refs`. 
-pub(super) fn scala_collect_type_refs( - node: Node<'_>, - source: &[u8], - generic: bool, - out: &mut Vec<(String, RefRole)>, -) { - match node.kind() { - "type_identifier" => { - let text = read_text_owned(node, source); - if !text.is_empty() { - out.push((text, role_of(generic))); - } - } - "generic_type" => { - let base = node - .child_by_field_name("type") - .or_else(|| first_child_kind(node, "type_identifier")); - if let Some(base) = base - && base.kind() == "type_identifier" - { - let text = read_text_owned(base, source); - if !text.is_empty() { - out.push((text, role_of(generic))); - } - } - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - if cur.node().kind() == "type_arguments" { - recurse_named_refs(cur.node(), source, true, out, scala_collect_type_refs); - } - if !cur.goto_next_sibling() { - break; - } - } - } - } - "compound_type" | "infix_type" | "function_type" | "tuple_type" | "annotated_type" - | "projected_type" => { - recurse_named_refs(node, source, generic, out, scala_collect_type_refs); - } - // No catch-all recurse: graphify-py's `_scala_collect_type_refs` - // (extract.py) handles only `type_identifier`, `generic_type`, and the - // wrapper kinds above, so other named nodes are intentionally ignored to - // preserve parity. - _ => {} - } -} - -// ── C / C++ ───────────────────────────────────────────────────────────────── - -/// Node kinds that are C/C++ primitive types and never yield a type reference. -const C_PRIMITIVE_TYPE_NODES: &[&str] = &[ - "primitive_type", - "sized_type_specifier", - "auto", - "placeholder_type_specifier", -]; - -/// Walk a C type expression; append `(name, role)` tuples for user-defined types. 
-pub(super) fn c_collect_type_refs( - node: Node<'_>, - source: &[u8], - generic: bool, - out: &mut Vec<(String, RefRole)>, -) { - if C_PRIMITIVE_TYPE_NODES.contains(&node.kind()) { - return; - } - match node.kind() { - "type_identifier" => { - let text = read_text_owned(node, source); - if !text.is_empty() { - out.push((text, role_of(generic))); - } - } - "pointer_declarator" - | "reference_declarator" - | "array_declarator" - | "type_qualifier" - | "type_descriptor" - | "abstract_pointer_declarator" - | "abstract_reference_declarator" - | "abstract_array_declarator" => { - recurse_named_refs(node, source, generic, out, c_collect_type_refs); - } - _ => {} - } -} - -/// Walk a C++ type expression; append `(name, role)` tuples. Resolves -/// `qualified_identifier` tails and `template_type` base + arguments. -pub(super) fn cpp_collect_type_refs( - node: Node<'_>, - source: &[u8], - generic: bool, - out: &mut Vec<(String, RefRole)>, -) { - if C_PRIMITIVE_TYPE_NODES.contains(&node.kind()) { - return; - } - match node.kind() { - "type_identifier" => { - let text = read_text_owned(node, source); - if !text.is_empty() { - out.push((text, role_of(generic))); - } - } - "qualified_identifier" => { - if let Some(name_node) = node.child_by_field_name("name") { - cpp_collect_type_refs(name_node, source, generic, out); - } - } - "template_type" => { - if let Some(name_node) = node.child_by_field_name("name") { - let text = read_text_owned(name_node, source); - if !text.is_empty() { - out.push((text, role_of(generic))); - } - } - if let Some(args_node) = node.child_by_field_name("arguments") { - recurse_named_refs(args_node, source, true, out, cpp_collect_type_refs); - } - } - "type_descriptor" - | "pointer_declarator" - | "reference_declarator" - | "array_declarator" - | "type_qualifier" - | "abstract_pointer_declarator" - | "abstract_reference_declarator" - | "abstract_array_declarator" => { - recurse_named_refs(node, source, generic, out, cpp_collect_type_refs); - } - _ => {} 
- } -} diff --git a/crates/graphify-extract/src/generic/references/c_cpp.rs b/crates/graphify-extract/src/generic/references/c_cpp.rs new file mode 100644 index 0000000..9569911 --- /dev/null +++ b/crates/graphify-extract/src/generic/references/c_cpp.rs @@ -0,0 +1,93 @@ +//! C / C++ type-reference collectors. + +use tree_sitter::Node; + +use super::{RefRole, recurse_named_refs, role_of}; +use crate::generic::names::read_text_owned; + +/// Node kinds that are C/C++ primitive types and never yield a type reference. +const C_PRIMITIVE_TYPE_NODES: &[&str] = &[ + "primitive_type", + "sized_type_specifier", + "auto", + "placeholder_type_specifier", +]; + +/// Walk a C type expression; append `(name, role)` tuples for user-defined types. +pub(crate) fn c_collect_type_refs( + node: Node<'_>, + source: &[u8], + generic: bool, + out: &mut Vec<(String, RefRole)>, +) { + if C_PRIMITIVE_TYPE_NODES.contains(&node.kind()) { + return; + } + match node.kind() { + "type_identifier" => { + let text = read_text_owned(node, source); + if !text.is_empty() { + out.push((text, role_of(generic))); + } + } + "pointer_declarator" + | "reference_declarator" + | "array_declarator" + | "type_qualifier" + | "type_descriptor" + | "abstract_pointer_declarator" + | "abstract_reference_declarator" + | "abstract_array_declarator" => { + recurse_named_refs(node, source, generic, out, c_collect_type_refs); + } + _ => {} + } +} + +/// Walk a C++ type expression; append `(name, role)` tuples. Resolves +/// `qualified_identifier` tails and `template_type` base + arguments. 
+pub(crate) fn cpp_collect_type_refs( + node: Node<'_>, + source: &[u8], + generic: bool, + out: &mut Vec<(String, RefRole)>, +) { + if C_PRIMITIVE_TYPE_NODES.contains(&node.kind()) { + return; + } + match node.kind() { + "type_identifier" => { + let text = read_text_owned(node, source); + if !text.is_empty() { + out.push((text, role_of(generic))); + } + } + "qualified_identifier" => { + if let Some(name_node) = node.child_by_field_name("name") { + cpp_collect_type_refs(name_node, source, generic, out); + } + } + "template_type" => { + if let Some(name_node) = node.child_by_field_name("name") { + let text = read_text_owned(name_node, source); + if !text.is_empty() { + out.push((text, role_of(generic))); + } + } + if let Some(args_node) = node.child_by_field_name("arguments") { + recurse_named_refs(args_node, source, true, out, cpp_collect_type_refs); + } + } + "type_descriptor" + | "pointer_declarator" + | "reference_declarator" + | "array_declarator" + | "type_qualifier" + | "abstract_pointer_declarator" + | "abstract_reference_declarator" + | "abstract_array_declarator" => { + recurse_named_refs(node, source, generic, out, cpp_collect_type_refs); + } + _ => {} + } +} diff --git a/crates/graphify-extract/src/generic/references/csharp.rs b/crates/graphify-extract/src/generic/references/csharp.rs new file mode 100644 index 0000000..eec3f71 --- /dev/null +++ b/crates/graphify-extract/src/generic/references/csharp.rs @@ -0,0 +1,176 @@ +//! C# type-reference and attribute collectors. 
+ +use tree_sitter::Node; + +use super::RefRole; +use crate::generic::names::read_text_owned; + +#[allow(clippy::too_many_lines)] // single recursive dispatch over tree-sitter C# type kinds +pub(crate) fn csharp_collect_type_refs( + node: Node<'_>, + source: &[u8], + generic: bool, + out: &mut Vec<(String, RefRole)>, +) { + let t = node.kind(); + if t == "predefined_type" { + return; + } + if t == "identifier" { + let name = read_text_owned(node, source); + if !name.is_empty() { + let role = if generic { + RefRole::Generic + } else { + RefRole::Direct + }; + out.push((name, role)); + } + return; + } + if t == "qualified_name" { + let full = read_text_owned(node, source); + let tail = full.rsplit('.').next().unwrap_or(&full); + if !tail.is_empty() { + let role = if generic { + RefRole::Generic + } else { + RefRole::Direct + }; + out.push((tail.to_string(), role)); + } + return; + } + if t == "generic_name" { + let name_node = node.child_by_field_name("name").or_else(|| { + let mut sc = node.walk(); + if sc.goto_first_child() { + loop { + if sc.node().kind() == "identifier" { + return Some(sc.node()); + } + if !sc.goto_next_sibling() { + break; + } + } + } + None + }); + if let Some(nn) = name_node { + let name = read_text_owned(nn, source); + if !name.is_empty() { + let role = if generic { + RefRole::Generic + } else { + RefRole::Direct + }; + out.push((name, role)); + } + } + let mut sc = node.walk(); + if sc.goto_first_child() { + loop { + if sc.node().kind() == "type_argument_list" { + let mut acur = sc.node().walk(); + if acur.goto_first_child() { + loop { + if acur.node().is_named() { + csharp_collect_type_refs(acur.node(), source, true, out); + } + if !acur.goto_next_sibling() { + break; + } + } + } + } + if !sc.goto_next_sibling() { + break; + } + } + } + return; + } + if matches!( + t, + "nullable_type" | "array_type" | "pointer_type" | "ref_type" + ) { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + if cur.node().is_named() { + 
csharp_collect_type_refs(cur.node(), source, generic, out); + } + if !cur.goto_next_sibling() { + break; + } + } + } + return; + } + if node.is_named() { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + if cur.node().is_named() { + csharp_collect_type_refs(cur.node(), source, generic, out); + } + if !cur.goto_next_sibling() { + break; + } + } + } + } +} + +/// Collect attribute names from a C# method's `attribute_list` children. +/// +/// `[Authorize, Route("/api")]` on a method produces `["Authorize", "Route"]`. +#[must_use] +pub(crate) fn csharp_attribute_names(method_node: Node<'_>, source: &[u8]) -> Vec { + let mut names = Vec::new(); + let mut cur = method_node.walk(); + if !cur.goto_first_child() { + return names; + } + loop { + let child = cur.node(); + if child.kind() == "attribute_list" { + let mut acur = child.walk(); + if acur.goto_first_child() { + loop { + let attr = acur.node(); + if attr.kind() == "attribute" { + let name_node = attr.child_by_field_name("name").or_else(|| { + let mut sc = attr.walk(); + if sc.goto_first_child() { + loop { + if matches!(sc.node().kind(), "identifier" | "qualified_name") { + return Some(sc.node()); + } + if !sc.goto_next_sibling() { + break; + } + } + } + None + }); + if let Some(nn) = name_node { + let text = read_text_owned(nn, source); + let tail = text.rsplit('.').next().unwrap_or(&text); + if !tail.is_empty() { + names.push(tail.to_string()); + } + } + } + if !acur.goto_next_sibling() { + break; + } + } + } + } + if !cur.goto_next_sibling() { + break; + } + } + names +} diff --git a/crates/graphify-extract/src/generic/references/java.rs b/crates/graphify-extract/src/generic/references/java.rs new file mode 100644 index 0000000..c16dc78 --- /dev/null +++ b/crates/graphify-extract/src/generic/references/java.rs @@ -0,0 +1,173 @@ +//! Java type-reference and annotation collectors. 
+ +use tree_sitter::Node; + +use super::{RefRole, role_of}; +use crate::generic::names::read_text_owned; + +#[allow(clippy::too_many_lines)] // single recursive dispatch over tree-sitter Java type kinds +pub(crate) fn java_collect_type_refs( + node: Node<'_>, + source: &[u8], + generic: bool, + out: &mut Vec<(String, RefRole)>, +) { + let t = node.kind(); + if matches!( + t, + "integral_type" | "floating_point_type" | "boolean_type" | "void_type" + ) { + return; + } + if t == "type_identifier" { + let name = read_text_owned(node, source); + if !name.is_empty() { + let role = role_of(generic); + out.push((name, role)); + } + return; + } + if t == "scoped_type_identifier" { + let text = read_text_owned(node, source); + let tail = text.rsplit('.').next().unwrap_or(&text); + if !tail.is_empty() { + let role = role_of(generic); + out.push((tail.to_string(), role)); + } + return; + } + if t == "generic_type" { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + let child = cur.node(); + if matches!(child.kind(), "type_identifier" | "scoped_type_identifier") { + let text = read_text_owned(child, source); + let tail = text.rsplit('.').next().unwrap_or(&text); + if !tail.is_empty() { + let role = role_of(generic); + out.push((tail.to_string(), role)); + } + break; + } + if !cur.goto_next_sibling() { + break; + } + } + } + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + let child = cur.node(); + if child.kind() == "type_arguments" { + let mut acur = child.walk(); + if acur.goto_first_child() { + loop { + if acur.node().is_named() { + java_collect_type_refs(acur.node(), source, true, out); + } + if !acur.goto_next_sibling() { + break; + } + } + } + } + if !cur.goto_next_sibling() { + break; + } + } + } + return; + } + if t == "array_type" { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + if cur.node().is_named() { + java_collect_type_refs(cur.node(), source, generic, out); + } + if !cur.goto_next_sibling() { + break; 
+ } + } + } + return; + } + if node.is_named() { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + if cur.node().is_named() { + java_collect_type_refs(cur.node(), source, generic, out); + } + if !cur.goto_next_sibling() { + break; + } + } + } + } +} + +/// Find the `modifiers` child of a Java method declaration, if any. +fn find_modifiers(method_node: Node<'_>) -> Option> { + let mut cur = method_node.walk(); + if !cur.goto_first_child() { + return None; + } + loop { + if cur.node().kind() == "modifiers" { + return Some(cur.node()); + } + if !cur.goto_next_sibling() { + return None; + } + } +} + +/// Collect annotation names from a Java method's `modifiers` child. +/// +/// `@Override @Deprecated public void foo()` yields `["Override", "Deprecated"]`. +#[must_use] +pub(crate) fn java_method_annotation_names(method_node: Node<'_>, source: &[u8]) -> Vec { + let mut names = Vec::new(); + let Some(modifiers) = find_modifiers(method_node) else { + return names; + }; + let mut acur = modifiers.walk(); + if !acur.goto_first_child() { + return names; + } + loop { + let anno = acur.node(); + if matches!(anno.kind(), "marker_annotation" | "annotation") { + let name_node = anno.child_by_field_name("name").or_else(|| { + let mut sc = anno.walk(); + if sc.goto_first_child() { + loop { + if matches!( + sc.node().kind(), + "identifier" | "scoped_identifier" | "type_identifier" + ) { + return Some(sc.node()); + } + if !sc.goto_next_sibling() { + break; + } + } + } + None + }); + if let Some(nn) = name_node { + let text = read_text_owned(nn, source); + let tail = text.rsplit('.').next().unwrap_or(&text); + if !tail.is_empty() { + names.push(tail.to_string()); + } + } + } + if !acur.goto_next_sibling() { + break; + } + } + names +} diff --git a/crates/graphify-extract/src/generic/references/kotlin.rs b/crates/graphify-extract/src/generic/references/kotlin.rs new file mode 100644 index 0000000..8432245 --- /dev/null +++ 
b/crates/graphify-extract/src/generic/references/kotlin.rs @@ -0,0 +1,199 @@ +//! Kotlin type-reference helpers. + +use tree_sitter::Node; + +use super::{RefRole, recurse_named_refs, role_of}; +use crate::generic::names::read_text_owned; + +/// Return the head identifier text from a Kotlin `user_type` node. +#[must_use] +pub(crate) fn kotlin_user_type_name(user_type_node: Node<'_>, source: &[u8]) -> Option { + let mut cur = user_type_node.walk(); + if cur.goto_first_child() { + loop { + let c = cur.node(); + match c.kind() { + "type_identifier" | "identifier" => { + let text = read_text_owned(c, source); + return if text.is_empty() { None } else { Some(text) }; + } + "simple_user_type" => { + if let Some(sub) = first_named_identifier(c) { + let text = read_text_owned(sub, source); + return if text.is_empty() { None } else { Some(text) }; + } + } + _ => {} + } + if !cur.goto_next_sibling() { + break; + } + } + } + None +} + +/// Return the first `identifier` / `type_identifier` child of `node`. +fn first_named_identifier(node: Node<'_>) -> Option> { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + if matches!(cur.node().kind(), "identifier" | "type_identifier") { + return Some(cur.node()); + } + if !cur.goto_next_sibling() { + break; + } + } + } + None +} + +/// Find the type node within a Kotlin `property_declaration`. 
+#[must_use] +pub(crate) fn kotlin_property_type_node(property_node: Node<'_>) -> Option> { + let mut cur = property_node.walk(); + if cur.goto_first_child() { + loop { + let c = cur.node(); + if c.kind() == "variable_declaration" + && let Some(sub) = kotlin_type_child(c) + { + return Some(sub); + } + if matches!(c.kind(), "user_type" | "nullable_type" | "type_reference") { + return Some(c); + } + if !cur.goto_next_sibling() { + break; + } + } + } + None +} + +fn kotlin_type_child(node: Node<'_>) -> Option> { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + if matches!( + cur.node().kind(), + "user_type" | "nullable_type" | "type_reference" + ) { + return Some(cur.node()); + } + if !cur.goto_next_sibling() { + break; + } + } + } + None +} + +/// Find the return-type node of a Kotlin `function_declaration`. +#[must_use] +pub(crate) fn kotlin_function_return_type_node(func_node: Node<'_>) -> Option> { + let mut saw_params = false; + let mut saw_colon = false; + let mut cur = func_node.walk(); + if cur.goto_first_child() { + loop { + let c = cur.node(); + if c.kind() == "function_value_parameters" { + saw_params = true; + } else if saw_params && c.kind() == ":" { + saw_colon = true; + } else if saw_colon && c.is_named() { + return Some(c); + } + if !cur.goto_next_sibling() { + break; + } + } + } + None +} + +/// Walk a Kotlin type expression; append `(name, role)` tuples. Mirrors +/// Python `_kotlin_collect_type_refs`. 
+pub(crate) fn kotlin_collect_type_refs( + node: Node<'_>, + source: &[u8], + generic: bool, + out: &mut Vec<(String, RefRole)>, +) { + match node.kind() { + "integral_literal" | "boolean_literal" => {} + "user_type" => { + if let Some(head) = kotlin_user_type_head(node) { + let text = read_text_owned(head, source); + if !text.is_empty() { + out.push((text, role_of(generic))); + } + } + kotlin_collect_type_arguments(node, source, out); + } + "identifier" | "type_identifier" => { + let text = read_text_owned(node, source); + if !text.is_empty() { + out.push((text, role_of(generic))); + } + } + // `nullable_type` / `parenthesized_type` / `type_reference` are named + // wrappers handled identically by the fallback below. + _ if node.is_named() => { + recurse_named_refs(node, source, generic, out, kotlin_collect_type_refs); + } + _ => {} + } +} + +/// Return the head `identifier`/`type_identifier` node of a Kotlin `user_type`, +/// drilling through a `simple_user_type` wrapper. +fn kotlin_user_type_head(node: Node<'_>) -> Option> { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + let c = cur.node(); + if matches!(c.kind(), "identifier" | "type_identifier") { + return Some(c); + } + if c.kind() == "simple_user_type" { + return first_named_identifier(c); + } + if !cur.goto_next_sibling() { + break; + } + } + } + None +} + +/// Recurse into a Kotlin `user_type`'s `type_arguments`, marking refs generic. 
+fn kotlin_collect_type_arguments(node: Node<'_>, source: &[u8], out: &mut Vec<(String, RefRole)>) { + let mut cur = node.walk(); + if !cur.goto_first_child() { + return; + } + loop { + if cur.node().kind() == "type_arguments" { + let mut acur = cur.node().walk(); + if acur.goto_first_child() { + loop { + let arg = acur.node(); + if arg.kind() == "type_projection" { + recurse_named_refs(arg, source, true, out, kotlin_collect_type_refs); + } else if arg.is_named() { + kotlin_collect_type_refs(arg, source, true, out); + } + if !acur.goto_next_sibling() { + break; + } + } + } + } + if !cur.goto_next_sibling() { + break; + } + } +} diff --git a/crates/graphify-extract/src/generic/references/mod.rs b/crates/graphify-extract/src/generic/references/mod.rs new file mode 100644 index 0000000..1fb74ac --- /dev/null +++ b/crates/graphify-extract/src/generic/references/mod.rs @@ -0,0 +1,88 @@ +//! Per-language type-reference emitters for function/method declarations. +//! +//! These helpers walk parameter lists, return types, and annotations on +//! function nodes to emit `references` edges with a `context` set to +//! `parameter_type`, `return_type`, `generic_arg`, or `attribute`. +//! +//! Mirrors the `_python_*` / `_csharp_*` / `_java_*` helpers added to +//! `graphify-py/graphify/extract.py` in ab4e542. +//! +//! One submodule per language; this file holds the shared `RefRole`, +//! `RefCollector`, and recursion helpers used across them. + +use tree_sitter::Node; + +mod c_cpp; +mod csharp; +mod java; +mod kotlin; +mod php; +mod python; +mod scala; +mod swift; +mod ts; + +pub(crate) use c_cpp::*; +pub(crate) use csharp::*; +pub(crate) use java::*; +pub(crate) use kotlin::*; +pub(crate) use php::*; +pub(crate) use python::*; +pub(crate) use scala::*; +pub(crate) use swift::*; +pub(crate) use ts::*; + +/// Role of a collected type reference. `Direct` = used as the type itself +/// (e.g. `def f(x: Foo)`), `Generic` = used as a type argument to a generic +/// (e.g. 
`def f(x: list[Foo])`). +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub(crate) enum RefRole { + Direct, + Generic, +} + +impl RefRole { + /// Map a role into the canonical `context` string used on the emitted edge. + /// `Direct` becomes the supplied `direct_ctx` (e.g. `"parameter_type"` or + /// `"return_type"`); `Generic` always becomes `"generic_arg"`. + pub(super) fn into_context(self, direct_ctx: &'static str) -> &'static str { + match self { + Self::Direct => direct_ctx, + Self::Generic => "generic_arg", + } + } +} + +/// Map a `generic` flag to the corresponding [`RefRole`]. +fn role_of(generic: bool) -> RefRole { + if generic { + RefRole::Generic + } else { + RefRole::Direct + } +} + +/// A language type-reference collector: walks a type node, appending +/// `(name, role)` tuples for each referenced user type. +pub(super) type RefCollector = fn(Node<'_>, &[u8], bool, &mut Vec<(String, RefRole)>); + +/// Recurse `collect` over every named child of `node`, preserving `generic`. +fn recurse_named_refs( + node: Node<'_>, + source: &[u8], + generic: bool, + out: &mut Vec<(String, RefRole)>, + collect: RefCollector, +) { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + if cur.node().is_named() { + collect(cur.node(), source, generic, out); + } + if !cur.goto_next_sibling() { + break; + } + } + } +} diff --git a/crates/graphify-extract/src/generic/references/php.rs b/crates/graphify-extract/src/generic/references/php.rs new file mode 100644 index 0000000..1c5524a --- /dev/null +++ b/crates/graphify-extract/src/generic/references/php.rs @@ -0,0 +1,93 @@ +//! PHP type-reference helpers. + +use tree_sitter::Node; + +use super::{RefRole, recurse_named_refs, role_of}; +use crate::generic::names::read_text_owned; + +/// Return the unqualified tail of a PHP `name` / `qualified_name` node. 
+#[must_use] +pub(crate) fn php_name_text(node: Node<'_>, source: &[u8]) -> Option { + let full = read_text_owned(node, source); + let tail = full.rsplit('\\').next().unwrap_or(&full); + if tail.is_empty() { + None + } else { + Some(tail.to_string()) + } +} + +/// PHP type-node kinds that count as a type annotation on params/properties. +pub(crate) const PHP_TYPE_NODE_KINDS: &[&str] = &[ + "named_type", + "primitive_type", + "nullable_type", + "union_type", + "intersection_type", + "optional_type", +]; + +/// Return the return-type node following `formal_parameters` on a PHP method. +#[must_use] +pub(crate) fn php_method_return_type_node(method_node: Node<'_>) -> Option> { + let mut saw_params = false; + let mut cur = method_node.walk(); + if cur.goto_first_child() { + loop { + let c = cur.node(); + if c.kind() == "formal_parameters" { + saw_params = true; + } else if saw_params + && c.is_named() + && c.kind() != "compound_statement" + && PHP_TYPE_NODE_KINDS.contains(&c.kind()) + { + return Some(c); + } + if !cur.goto_next_sibling() { + break; + } + } + } + None +} + +/// Walk a PHP type expression; append `(name, role)` tuples. Mirrors +/// Python `_php_collect_type_refs`. +pub(crate) fn php_collect_type_refs( + node: Node<'_>, + source: &[u8], + generic: bool, + out: &mut Vec<(String, RefRole)>, +) { + match node.kind() { + "primitive_type" => {} + "named_type" => { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + if matches!(cur.node().kind(), "name" | "qualified_name") { + if let Some(text) = php_name_text(cur.node(), source) { + out.push((text, role_of(generic))); + } + return; + } + if !cur.goto_next_sibling() { + break; + } + } + } + } + "name" | "qualified_name" => { + if let Some(text) = php_name_text(node, source) { + out.push((text, role_of(generic))); + } + } + // `nullable_type` / `union_type` / `intersection_type` / `optional_type` + // are named wrappers handled identically by the fallback below. 
+ _ if node.is_named() => { + recurse_named_refs(node, source, generic, out, php_collect_type_refs); + } + _ => {} + } +} diff --git a/crates/graphify-extract/src/generic/references/python.rs b/crates/graphify-extract/src/generic/references/python.rs new file mode 100644 index 0000000..0adbef2 --- /dev/null +++ b/crates/graphify-extract/src/generic/references/python.rs @@ -0,0 +1,226 @@ +//! Python type-reference collectors. + +use tree_sitter::Node; + +use super::{RefRole, role_of}; +use crate::generic::names::read_text_owned; + +/// Python `typing` containers that are not themselves user-defined types and +/// must therefore be skipped when collecting reference names — but their +/// nested arguments still count as `generic_arg`. +/// +/// Mirrors `_PYTHON_TYPE_CONTAINERS` in `extract.py`. +const PYTHON_TYPE_CONTAINERS: &[&str] = &[ + "list", + "dict", + "set", + "tuple", + "frozenset", + "type", + "List", + "Dict", + "Set", + "Tuple", + "FrozenSet", + "Type", + "Optional", + "Union", + "Sequence", + "Iterable", + "Mapping", + "MutableMapping", + "Iterator", + "Callable", + "Awaitable", + "AsyncIterable", + "AsyncIterator", + "Coroutine", + "Generator", + "AsyncGenerator", + "ContextManager", + "AsyncContextManager", + "Annotated", + "ClassVar", + "Final", + "Literal", + "Concatenate", + "ParamSpec", + "TypeVar", + "None", + "Ellipsis", +]; + +/// Scalar builtins and `unittest.mock` names that appear as type annotations but +/// carry no useful semantic meaning as graph nodes (#1147). Suppressed at the +/// annotation walker level so they are never created as nodes or emitted as +/// edges. Mirrors `_PYTHON_ANNOTATION_NOISE` in `extract.py`. 
+const PYTHON_ANNOTATION_NOISE: &[&str] = &[ + // scalar builtins + "str", + "int", + "float", + "bool", + "bytes", + "bytearray", + "complex", + "object", + "True", + "False", + // unittest.mock + "MagicMock", + "Mock", + "AsyncMock", + "NonCallableMock", + "NonCallableMagicMock", + "PropertyMock", + "patch", + "sentinel", +]; + +fn is_python_container(name: &str) -> bool { + PYTHON_TYPE_CONTAINERS.contains(&name) +} + +fn is_python_annotation_noise(name: &str) -> bool { + PYTHON_ANNOTATION_NOISE.contains(&name) +} + +/// Walk a Python type annotation tree and append `(name, role)` pairs. +/// +/// `generic = true` means we entered the function from a `subscript` value or +/// `type_arguments` child, so every emitted name takes the `Generic` role. +#[allow(clippy::too_many_lines)] // single recursive dispatch over tree-sitter Python type kinds; splitting would fragment the per-kind branches +pub(crate) fn python_collect_type_refs( + node: Node<'_>, + source: &[u8], + generic: bool, + out: &mut Vec<(String, RefRole)>, +) { + let t = node.kind(); + if t == "type" { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + let child = cur.node(); + if child.is_named() { + python_collect_type_refs(child, source, generic, out); + } + if !cur.goto_next_sibling() { + break; + } + } + } + return; + } + if t == "identifier" { + let name = read_text_owned(node, source); + if !name.is_empty() && !is_python_container(&name) && !is_python_annotation_noise(&name) { + let role = role_of(generic); + out.push((name, role)); + } + return; + } + if t == "attribute" { + let text = read_text_owned(node, source); + let tail = text.rsplit('.').next().unwrap_or(&text); + if !tail.is_empty() && !is_python_container(tail) && !is_python_annotation_noise(tail) { + let role = role_of(generic); + out.push((tail.to_string(), role)); + } + return; + } + if t == "generic_type" { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + let child = cur.node(); + if 
child.kind() == "identifier" { + let container = read_text_owned(child, source); + if !container.is_empty() + && !is_python_container(&container) + && !is_python_annotation_noise(&container) + { + let role = role_of(generic); + out.push((container, role)); + } + } else if child.kind() == "type_parameter" { + let mut sc = child.walk(); + if sc.goto_first_child() { + loop { + if sc.node().is_named() { + python_collect_type_refs(sc.node(), source, true, out); + } + if !sc.goto_next_sibling() { + break; + } + } + } + } + if !cur.goto_next_sibling() { + break; + } + } + } + return; + } + if t == "subscript" { + let value = node.child_by_field_name("value"); + if let Some(v) = value { + python_collect_type_refs(v, source, generic, out); + } + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + let child = cur.node(); + if Some(child) != value && child.is_named() { + python_collect_type_refs(child, source, true, out); + } + if !cur.goto_next_sibling() { + break; + } + } + } + return; + } + if node.is_named() { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + if cur.node().is_named() { + python_collect_type_refs(cur.node(), source, generic, out); + } + if !cur.goto_next_sibling() { + break; + } + } + } + } +} + +/// Collect type references from each typed parameter under a `parameters` node. 
+#[must_use] +pub(crate) fn python_collect_param_refs( + params_node: Option>, + source: &[u8], +) -> Vec<(String, RefRole)> { + let mut out = Vec::new(); + let Some(params) = params_node else { + return out; + }; + let mut cur = params.walk(); + if !cur.goto_first_child() { + return out; + } + loop { + let child = cur.node(); + if matches!(child.kind(), "typed_parameter" | "typed_default_parameter") + && let Some(type_node) = child.child_by_field_name("type") + { + python_collect_type_refs(type_node, source, false, &mut out); + } + if !cur.goto_next_sibling() { + break; + } + } + out +} diff --git a/crates/graphify-extract/src/generic/references/scala.rs b/crates/graphify-extract/src/generic/references/scala.rs new file mode 100644 index 0000000..e402c5d --- /dev/null +++ b/crates/graphify-extract/src/generic/references/scala.rs @@ -0,0 +1,58 @@ +//! Scala type-reference collector. + +use tree_sitter::Node; + +use super::{RefRole, recurse_named_refs, role_of}; +use crate::generic::names::read_text_owned; +use crate::generic::walk::first_child_kind; + +/// Walk a Scala type expression; append `(name, role)` tuples. Mirrors +/// Python `_scala_collect_type_refs`. 
+pub(crate) fn scala_collect_type_refs( + node: Node<'_>, + source: &[u8], + generic: bool, + out: &mut Vec<(String, RefRole)>, +) { + match node.kind() { + "type_identifier" => { + let text = read_text_owned(node, source); + if !text.is_empty() { + out.push((text, role_of(generic))); + } + } + "generic_type" => { + let base = node + .child_by_field_name("type") + .or_else(|| first_child_kind(node, "type_identifier")); + if let Some(base) = base + && base.kind() == "type_identifier" + { + let text = read_text_owned(base, source); + if !text.is_empty() { + out.push((text, role_of(generic))); + } + } + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + if cur.node().kind() == "type_arguments" { + recurse_named_refs(cur.node(), source, true, out, scala_collect_type_refs); + } + if !cur.goto_next_sibling() { + break; + } + } + } + } + "compound_type" | "infix_type" | "function_type" | "tuple_type" | "annotated_type" + | "projected_type" => { + recurse_named_refs(node, source, generic, out, scala_collect_type_refs); + } + // No catch-all recurse: graphify-py's `_scala_collect_type_refs` + // (extract.py) handles only `type_identifier`, `generic_type`, and the + // wrapper kinds above, so other named nodes are intentionally ignored to + // preserve parity. + _ => {} + } +} diff --git a/crates/graphify-extract/src/generic/references/swift.rs b/crates/graphify-extract/src/generic/references/swift.rs new file mode 100644 index 0000000..f91306d --- /dev/null +++ b/crates/graphify-extract/src/generic/references/swift.rs @@ -0,0 +1,104 @@ +//! Swift type-reference and property/constructor helpers. + +use tree_sitter::Node; + +use super::{RefRole, recurse_named_refs, role_of}; +use crate::generic::names::read_text_owned; +use crate::generic::walk::first_child_kind; + +/// Return the head `type_identifier` text from a Swift `user_type` node. 
+#[must_use] +pub(crate) fn swift_user_type_name(user_type_node: Node<'_>, source: &[u8]) -> Option { + first_child_kind(user_type_node, "type_identifier") + .map(|n| read_text_owned(n, source)) + .filter(|t| !t.is_empty()) +} + +/// Return the `type_annotation` child of a Swift `property_declaration`, if any. +#[must_use] +pub(crate) fn swift_property_type_node(property_node: Node<'_>) -> Option> { + first_child_kind(property_node, "type_annotation") +} + +/// Return the bound name of a Swift property (`let x` / `var x = ...`). Mirrors +/// `_swift_property_name`. +#[must_use] +pub(crate) fn swift_property_name(property_node: Node<'_>, source: &[u8]) -> Option { + let mut cur = property_node.walk(); + if cur.goto_first_child() { + loop { + let c = cur.node(); + if c.kind() == "pattern" + && let Some(id) = first_child_kind(c, "simple_identifier") + { + return Some(read_text_owned(id, source)); + } + if c.kind() == "simple_identifier" { + return Some(read_text_owned(c, source)); + } + if !cur.goto_next_sibling() { + break; + } + } + } + None +} + +/// If a Swift call expression is a constructor (`Foo()`), return the type name. +/// Only upper-cased callees are treated as types so a free-function call like +/// `configure()` in an initializer is not mistaken for a constructor. Mirrors +/// `_swift_constructor_type`. +#[must_use] +pub(crate) fn swift_constructor_type(call_node: Node<'_>, source: &[u8]) -> Option { + let first = call_node.child(0)?; + if first.kind() == "simple_identifier" { + let text = read_text_owned(first, source); + if text.chars().next().is_some_and(char::is_uppercase) { + return Some(text); + } + } + None +} + +/// Walk a Swift type expression; append `(name, role)` tuples. Mirrors +/// Python `_swift_collect_type_refs`. 
+pub(crate) fn swift_collect_type_refs( + node: Node<'_>, + source: &[u8], + generic: bool, + out: &mut Vec<(String, RefRole)>, +) { + match node.kind() { + "user_type" => { + if let Some(head) = first_child_kind(node, "type_identifier") { + let text = read_text_owned(head, source); + if !text.is_empty() { + out.push((text, role_of(generic))); + } + } + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + if cur.node().kind() == "type_arguments" { + recurse_named_refs(cur.node(), source, true, out, swift_collect_type_refs); + } + if !cur.goto_next_sibling() { + break; + } + } + } + } + "type_identifier" => { + let text = read_text_owned(node, source); + if !text.is_empty() { + out.push((text, role_of(generic))); + } + } + // `optional_type`, `array_type`, `dictionary_type`, `tuple_type`, etc. + // are all named wrappers handled identically by the fallback below. + _ if node.is_named() => { + recurse_named_refs(node, source, generic, out, swift_collect_type_refs); + } + _ => {} + } +} diff --git a/crates/graphify-extract/src/generic/references/ts.rs b/crates/graphify-extract/src/generic/references/ts.rs new file mode 100644 index 0000000..701aba4 --- /dev/null +++ b/crates/graphify-extract/src/generic/references/ts.rs @@ -0,0 +1,208 @@ +//! TypeScript / JavaScript type-reference and heritage collectors. + +use tree_sitter::Node; + +use super::{RefRole, role_of}; +use crate::generic::names::read_text_owned; + +/// TS/JS primitive type names that are emitted by tree-sitter as `identifier` +/// or `type_identifier` but do not represent user-defined types. We skip them +/// when collecting reference names to avoid noise like `string` / `number`. +/// +/// Mirrors `_JS_PRIMITIVE_TYPES` in `extract.py`. 
+const JS_PRIMITIVE_TYPES: &[&str] = &[ + "string", + "number", + "boolean", + "any", + "unknown", + "void", + "never", + "object", + "null", + "undefined", + "bigint", + "symbol", + "this", +]; + +fn is_js_primitive(name: &str) -> bool { + JS_PRIMITIVE_TYPES.contains(&name) +} + +/// Walk a TypeScript type annotation tree and append `(name, role)` tuples. +/// +/// Mirrors Python `_ts_collect_type_refs`. +#[allow(clippy::too_many_lines)] // single recursive dispatch over tree-sitter TypeScript type kinds +pub(crate) fn ts_collect_type_refs( + node: Node<'_>, + source: &[u8], + generic: bool, + out: &mut Vec<(String, RefRole)>, +) { + let t = node.kind(); + if t == "type_annotation" { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + if cur.node().is_named() { + ts_collect_type_refs(cur.node(), source, generic, out); + } + if !cur.goto_next_sibling() { + break; + } + } + } + return; + } + if matches!(t, "type_identifier" | "identifier") { + let name = read_text_owned(node, source); + if !name.is_empty() && !is_js_primitive(&name) { + let role = role_of(generic); + out.push((name, role)); + } + return; + } + if t == "nested_type_identifier" { + let text = read_text_owned(node, source); + let tail = text.rsplit('.').next().unwrap_or(&text); + if !tail.is_empty() && !is_js_primitive(tail) { + let role = role_of(generic); + out.push((tail.to_string(), role)); + } + return; + } + if t == "generic_type" { + let name_node = node.child_by_field_name("name"); + if let Some(nn) = name_node { + let text = read_text_owned(nn, source); + let tail = text.rsplit('.').next().unwrap_or(&text); + if !tail.is_empty() && !is_js_primitive(tail) { + let role = role_of(generic); + out.push((tail.to_string(), role)); + } + } else { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + if matches!( + cur.node().kind(), + "type_identifier" | "nested_type_identifier" + ) { + let text = read_text_owned(cur.node(), source); + let tail = 
text.rsplit('.').next().unwrap_or(&text); + if !tail.is_empty() && !is_js_primitive(tail) { + let role = role_of(generic); + out.push((tail.to_string(), role)); + } + break; + } + if !cur.goto_next_sibling() { + break; + } + } + } + } + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + if cur.node().kind() == "type_arguments" { + let mut acur = cur.node().walk(); + if acur.goto_first_child() { + loop { + if acur.node().is_named() { + ts_collect_type_refs(acur.node(), source, true, out); + } + if !acur.goto_next_sibling() { + break; + } + } + } + } + if !cur.goto_next_sibling() { + break; + } + } + } + return; + } + if node.is_named() { + let mut cur = node.walk(); + if cur.goto_first_child() { + loop { + if cur.node().is_named() { + ts_collect_type_refs(cur.node(), source, generic, out); + } + if !cur.goto_next_sibling() { + break; + } + } + } + } +} + +/// Return the type-identifier names extracted from an `extends_clause` or +/// `implements_clause`. Both clauses can list multiple types (e.g. +/// `implements A, B`); each name is returned as the tail of any +/// qualified path (`Foo.Bar` → `"Bar"`). +/// +/// Mirrors Python `_ts_heritage_clause_entries`. 
+#[must_use] +pub(crate) fn ts_heritage_clause_entries(clause: Node<'_>, source: &[u8]) -> Vec { + let mut out = Vec::new(); + let mut cur = clause.walk(); + if !cur.goto_first_child() { + return out; + } + loop { + let child = cur.node(); + if child.is_named() { + match child.kind() { + "identifier" | "type_identifier" => { + let name = read_text_owned(child, source); + if !name.is_empty() { + out.push(name); + } + } + "generic_type" => { + let name_node = child.child_by_field_name("name").or_else(|| { + let mut sc = child.walk(); + if sc.goto_first_child() { + loop { + if matches!( + sc.node().kind(), + "type_identifier" | "nested_type_identifier" | "identifier" + ) { + return Some(sc.node()); + } + if !sc.goto_next_sibling() { + break; + } + } + } + None + }); + if let Some(nn) = name_node { + let text = read_text_owned(nn, source); + let tail = text.rsplit('.').next().unwrap_or(&text); + if !tail.is_empty() { + out.push(tail.to_string()); + } + } + } + "nested_type_identifier" => { + let text = read_text_owned(child, source); + let tail = text.rsplit('.').next().unwrap_or(&text); + if !tail.is_empty() { + out.push(tail.to_string()); + } + } + _ => {} + } + } + if !cur.goto_next_sibling() { + break; + } + } + out +} diff --git a/crates/graphify-extract/src/generic/walk.rs b/crates/graphify-extract/src/generic/walk.rs index 99801b8..121b05d 100644 --- a/crates/graphify-extract/src/generic/walk.rs +++ b/crates/graphify-extract/src/generic/walk.rs @@ -28,168 +28,10 @@ use super::js_extra::{ }; use super::names::{get_cpp_func_name, read_csharp_type_name, read_text_owned}; -// ── Graph helpers ───────────────────────────────────────────────────────────── - -/// Insert a new graph node if `nid` has not been seen before. -/// -/// The `seen_ids` set is the deduplication gate — a second call with the same -/// `nid` is silently dropped so that multiple structural passes (e.g. -/// file-level node + function-level) cannot produce duplicate node entries. 
-pub(super) fn add_node( - nid: &str, - label: &str, - line: u32, - str_path: &str, - nodes: &mut Vec, - seen_ids: &mut HashSet, -) { - if seen_ids.insert(nid.to_string()) { - nodes.push(GNode { - id: nid.to_string(), - label: label.to_string(), - file_type: "code".to_string(), - source_file: str_path.to_string(), - source_location: Some(format!("L{line}")), - metadata: None, - }); - } -} - -/// Append an edge to the edge list. -/// -/// Unlike nodes, edges are not deduplicated here — the caller is responsible -/// for deduplication via `seen_call_pairs` or the final clean pass in -/// [`extract_generic`]. -pub(super) fn add_edge( - src: &str, - tgt: &str, - relation: &str, - line: u32, - str_path: &str, - context: Option<&str>, - edges: &mut Vec, -) { - edges.push(Edge { - external: false, - source: src.to_string(), - target: tgt.to_string(), - relation: relation.to_string(), - confidence: "EXTRACTED".to_string(), - source_file: str_path.to_string(), - source_location: Some(format!("L{line}")), - weight: 1.0, - context: context.map(str::to_string), - confidence_score: None, - }); -} - -// ── Small AST helpers ────────────────────────────────────────────────────────── - -/// Collect the named children of `node` into a `Vec`. -#[must_use] -pub(crate) fn named_children(node: Node<'_>) -> Vec> { - let mut out = Vec::new(); - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - if cur.node().is_named() { - out.push(cur.node()); - } - if !cur.goto_next_sibling() { - break; - } - } - } - out -} - -/// Return the first child of `node` whose kind is `kind`. -#[must_use] -pub(crate) fn first_child_kind<'tree>(node: Node<'tree>, kind: &str) -> Option> { - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - if cur.node().kind() == kind { - return Some(cur.node()); - } - if !cur.goto_next_sibling() { - break; - } - } - } - None -} - -/// `true` if any child of `node` has the given `kind` (allocation-free). 
-#[must_use] -pub(super) fn any_child_kind(node: Node<'_>, kind: &str) -> bool { - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - if cur.node().kind() == kind { - return true; - } - if !cur.goto_next_sibling() { - break; - } - } - } - false -} - -// ── Body finder ─────────────────────────────────────────────────────────────── - -/// Locate the body child of a class or function node. -/// -/// First tries the grammar's `body` field; falls back to scanning for a child -/// whose kind appears in `config.body_fallback_child_types`. The fallback is -/// needed for languages like Kotlin whose grammar uses `class_body` or -/// `function_body` node types rather than a named field. -pub(super) fn find_body<'tree>(node: Node<'tree>, config: &LangConfig) -> Option> { - if let Some(b) = node.child_by_field_name(config.body_field) { - return Some(b); - } - let mut cur = node.walk(); - if cur.goto_first_child() { - loop { - let child = cur.node(); - if config.body_fallback_child_types.contains(&child.kind()) { - return Some(child); - } - if !cur.goto_next_sibling() { - break; - } - } - } - None -} - -// ── ensure_named_node ───────────────────────────────────────────────────────── - -/// Return the NID for a named entity, creating a placeholder node if needed. -/// -/// First checks for a file-qualified ID (`_`); if already seen, -/// returns that ID. Otherwise ensures the bare-name node exists (creating it -/// when absent) and returns the bare NID. Used for cross-file type references -/// in C# `field_declaration` processing. 
-pub(super) fn ensure_named_node( - name: &str, - line: u32, - stem: &str, - str_path: &str, - nodes: &mut Vec, - seen_ids: &mut HashSet, -) -> String { - let nid1 = make_id(&[stem, name]); - if seen_ids.contains(&nid1) { - return nid1; - } - let nid2 = make_id1(name); - if !seen_ids.contains(&nid2) { - add_node(&nid2, name, line, str_path, nodes, seen_ids); - } - nid2 -} +pub(crate) use super::graph::{ + add_edge, add_node, any_child_kind, ensure_named_node, find_body, first_child_kind, + named_children, +}; // ── Function-level reference edges ──────────────────────────────────────────── diff --git a/crates/graphify-extract/tests/coverage_collectors.rs b/crates/graphify-extract/tests/coverage_collectors.rs index 8cb9685..a9cbdfa 100644 --- a/crates/graphify-extract/tests/coverage_collectors.rs +++ b/crates/graphify-extract/tests/coverage_collectors.rs @@ -10,8 +10,8 @@ #![allow(clippy::expect_used)] use graphify_extract::{ - FileResult, extract_c, extract_cpp, extract_go, extract_kotlin, extract_php, extract_rust, - extract_scala, extract_swift, + FileResult, extract_c, extract_cpp, extract_go, extract_java, extract_kotlin, extract_php, + extract_rust, extract_scala, extract_swift, }; mod common; @@ -453,3 +453,48 @@ fn go_forward_reference_binds_to_declaration_not_placeholder() { ); assert!(has_edge(&r, "references", Some("field"), "Store", "Item")); } + +// ── Java: qualified / generic inheritance bases ──────────────────────────────── + +/// Java inheritance must follow qualified (`scoped_type_identifier`) and generic +/// (`generic_type`) bases, not only a plain `type_identifier`. Divergence from +/// graphify-py `_extract_java` (extract.py:2777-2799), which drops them. 
#[test]
fn java_inheritance_handles_scoped_and_generic_bases() {
    // NOTE(review): the doc comment mentions generic bases, but the snippet
    // below shows none. Any `<…>` type arguments may have been lost from this
    // copy of the source; confirm against the repository.
    let src = "package app;\n\
public class Sub extends pkg.Base implements Iface {}\n";
    let (_t, r) = extract_snippet("Sub.java", src, extract_java);

    // The qualified base must resolve to its tail name, the interface to its own name.
    for (relation, base) in [("inherits", "Base"), ("implements", "Iface")] {
        assert!(has_edge(&r, relation, None, "Sub", base), "{:?}", r.edges);
    }
}

// ── Rust: `use` alias stripping ────────────────────────────────────────────────

/// `use foo::bar as baz` must import `bar`, not `bar as baz`. Divergence from
/// graphify-py (extract.py:6813), which keeps the alias in the node id.
#[test]
fn rust_use_strips_as_alias() {
    /// Target of the first `imports_from` edge in `result`, if any.
    fn import_target(result: &FileResult) -> Option<String> {
        result
            .edges
            .iter()
            .find(|edge| edge.relation == "imports_from")
            .map(|edge| edge.target.clone())
    }

    let (_t1, aliased) = extract_snippet("aliased.rs", "use foo::bar as baz;\n", extract_rust);
    let (_t2, plain) = extract_snippet("plain.rs", "use foo::bar;\n", extract_rust);

    let aliased_tgt = import_target(&aliased);
    assert!(aliased_tgt.is_some(), "expected an imports_from edge");
    // Both spellings must resolve to the same import target.
    assert_eq!(
        aliased_tgt,
        import_target(&plain),
        "`as` alias must not leak into the import target"
    );
}