From 7501a7fdc8bef9b24a19ed4d86eec7ea8cc44051 Mon Sep 17 00:00:00 2001 From: David Spencer <1526975+DecisionNerd@users.noreply.github.com> Date: Tue, 23 Jun 2026 10:20:15 -0600 Subject: [PATCH 1/4] feat(ir): startNode(r)/endNode(r) return the endpoint node value (#753) `startNode(r)` / `endNode(r)` over a matched fixed-hop relationship now return the start/end node with identity, labels, and readable properties (reusing #785's node-value materialization), not a bare UUID. The binder records each fixed hop's `(src, dst)` node vars at `Expand` build (`edge_vars`) and rewrites the calls onto them: - general/property-access context (`startNode(r).name`) -> a node VarRef, so the access resolves against the endpoint node's columns; - terminal RETURN (`RETURN endNode(r)`) -> a whole node value via the shared `_node_struct` helper. Variable-length edge vars bind to a relationship *list*, not one relationship, so they are deliberately not recorded; any unrecognized argument falls through to the usual `UnknownFunction` error rather than silently returning a UUID (which would fail TCK). Closes the #743 deferral. Validated end-to-end: `start_end_node_return_whole_node` and `start_end_node_property_access` in gf-api e2e_baseline (73 pass); binder + lowerer goldens unchanged. Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 11 +++ crates/gf-api/tests/e2e_baseline.rs | 80 +++++++++++++++++++++ crates/gf-ir/src/binder.rs | 105 +++++++++++++++++++++++----- 3 files changed, 179 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 37cddae8..6fa36378 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### In Development — v0.5.0 (Rust Core, `rust-core` branch) +- `startNode(r)` / `endNode(r)` return the endpoint **node value** (#753) — over + a matched fixed-hop relationship these now return the start/end node with + identity, labels, and readable properties (reusing the #785 node-value + materialization), not a bare UUID. The binder records each fixed hop's + `(src, dst)` node vars at `Expand` and rewrites the calls onto them: as a node + reference for property access (`startNode(r).name`) and, in a terminal RETURN, + as a whole node value (`RETURN endNode(r)`). Variable-length edge vars bind to + a relationship *list*, not one relationship, so they are deliberately not + resolved here; an unrecognized argument falls through to the usual + `UnknownFunction` error rather than silently returning a UUID (which would fail + TCK). Closes the #743 deferral. - Neighborhood-proportional traversal latency (#838) — variable-length and single-hop expansion now read only the **reached destination** node records (`read_nodes_filtered`, an `node_id`-filtered read mirroring the #830 edge diff --git a/crates/gf-api/tests/e2e_baseline.rs b/crates/gf-api/tests/e2e_baseline.rs index 353ba066..7530606c 100644 --- a/crates/gf-api/tests/e2e_baseline.rs +++ b/crates/gf-api/tests/e2e_baseline.rs @@ -142,6 +142,86 @@ fn match_returns_whole_node_value() { assert_eq!(name_str.value(0), "Alice"); } +#[test] +fn start_end_node_return_whole_node() { + // #753: startNode(r) / endNode(r) over a matched relationship return the + // endpoint node's whole value (identity + labels + properties), reusing the + // #785 materialization — not the raw UUID. + let gf = forge(); + let r = rows( + &gf, + "MATCH (a:Person {name:'Alice'})-[r:KNOWS]->(b:Person {name:'Bob'}) RETURN startNode(r)", + ); + assert_eq!(r.stats.rows_produced, 1, "Alice KNOWS Bob is one edge"); + let batch = r + .batches + .iter() + .find(|b| b.num_rows() > 0) + .expect("a non-empty result batch"); + // The column name is the expression display string (no friendly alias for a + // function call), so address it positionally. + let col = batch.column(0); + let node = col + .as_any() + .downcast_ref::() + .unwrap_or_else(|| { + panic!( + "startNode(r) should be a node-value Struct, got {:?}", + col.data_type() + ) + }); + // The start node is Alice — label + property both readable off the value. + let labels = node.column_by_name("labels").expect("labels field"); + let list = labels + .as_any() + .downcast_ref::() + .expect("labels is a List"); + let label_vals = list.value(0); + let label_strs = label_vals + .as_any() + .downcast_ref::() + .expect("labels are Utf8"); + assert_eq!(label_strs.value(0), "Person"); + let name = node.column_by_name("name").expect("name property"); + let name_str = name + .as_any() + .downcast_ref::() + .expect("name is Utf8"); + assert_eq!(name_str.value(0), "Alice", "startNode(r) is the start node"); +} + +#[test] +fn start_end_node_property_access() { + // #753: property access on startNode(r) / endNode(r) resolves against the + // src / dst node's columns — `startNode(r).name` == the start node's name. + let gf = forge(); + let r = rows( + &gf, + "MATCH (a:Person {name:'Alice'})-[r:KNOWS]->(b:Person {name:'Bob'}) \ + RETURN startNode(r).name AS s, endNode(r).name AS e", + ); + assert_eq!(r.stats.rows_produced, 1); + let batch = r + .batches + .iter() + .find(|b| b.num_rows() > 0) + .expect("a non-empty result batch"); + let s = batch + .column_by_name("s") + .expect("s") + .as_any() + .downcast_ref::() + .expect("Utf8"); + let e = batch + .column_by_name("e") + .expect("e") + .as_any() + .downcast_ref::() + .expect("Utf8"); + assert_eq!(s.value(0), "Alice", "startNode(r).name"); + assert_eq!(e.value(0), "Bob", "endNode(r).name"); +} + #[test] fn count_aggregate() { let gf = forge(); diff --git a/crates/gf-ir/src/binder.rs b/crates/gf-ir/src/binder.rs index bb22a713..2c7aee82 100644 --- a/crates/gf-ir/src/binder.rs +++ b/crates/gf-ir/src/binder.rs @@ -145,6 +145,7 @@ impl Binder { vars: HashMap::new(), path_vars: HashMap::new(), node_vars: HashMap::new(), + edge_vars: HashMap::new(), next_var: 0, builder, errors: Vec::new(), @@ -216,6 +217,7 @@ impl Binder { vars: s.vars.clone(), path_vars: s.path_vars.clone(), node_vars: s.node_vars.clone(), + edge_vars: s.edge_vars.clone(), next_var: s.next_var, builder: GraphPlan::builder("openCypher").ontology_mode(self.mode), errors: Vec::new(), @@ -239,6 +241,9 @@ impl Binder { for (v, label) in sub_state.node_vars { s.node_vars.entry(v).or_insert(label); } + for (edge, endpoints) in sub_state.edge_vars { + s.edge_vars.entry(edge).or_insert(endpoints); + } s.errors.extend(sub_state.errors); s.warnings.extend(sub_state.warnings); let child = sub_state.builder.build(); @@ -325,6 +330,15 @@ impl Binder { max_hops, }); + // Record the relationship's endpoints so `startNode(r)` / + // `endNode(r)` can return the src/dst node value (#753). + // Only fixed single hops: a variable-length edge var binds + // to a list, not a single relationship, so the endpoint + // functions have no single node to resolve to. + if !is_var_hop { + s.edge_vars.insert(edge_var, (src_var, dst_var)); + } + if path_nodes.is_empty() { path_nodes.push(src_var); } @@ -840,24 +854,22 @@ impl Binder { s: &mut BinderState, materialize_nodes: bool, ) -> ExprId { - if materialize_nodes - && let Expr::Var(VarRef { name, .. }) = expr - && let Some(&v) = s.vars.get(name) - && let Some(label_opt) = s.node_vars.get(&v).cloned() - { - let var_ref = s.builder.push_expr(IrExpr::VarRef(v)); - let mut args = vec![var_ref]; - // Pass the pattern's label name through: the lowerer's ontology map - // is empty in exploratory mode, so the label can only come from the - // bind-time pattern. Absent (one arg) for an unlabelled match. - if let Some(label) = label_opt { - let lit = s.builder.push_expr(IrExpr::Literal(IrLiteral::Str(label))); - args.push(lit); + if materialize_nodes { + // A bare node var (`RETURN n`, #785) or a relationship endpoint + // (`RETURN startNode(r)` / `endNode(r)`, #753) materializes a whole + // node value. Both resolve to a node var bound to a `NodeScan`. + let node_var = match expr { + Expr::Var(VarRef { name, .. }) => s + .vars + .get(name) + .copied() + .filter(|v| s.node_vars.contains_key(v)), + Expr::FunctionCall(call) => Self::resolve_endpoint_node(call, s), + _ => None, + }; + if let Some(v) = node_var { + return self.node_struct_expr(v, s); } - return s.builder.push_expr(IrExpr::FunctionCall { - name: "_node_struct".to_string(), - args, - }); } self.lower_expr(expr, span, s) } @@ -1067,6 +1079,13 @@ impl Binder { Expr::FunctionCall(call) => { if let Some(id) = Self::lower_path_function(call, s) { id + } else if let Some(node_var) = Self::resolve_endpoint_node(call, s) { + // `startNode(r)` / `endNode(r)` over a matched relationship + // is the src / dst node var (#753). As a bare value it is a + // node reference; property access (`startNode(r).name`) + // resolves against that var's columns. A terminal RETURN + // upgrades it to a whole node value in `lower_return_item_expr`. + s.builder.push_expr(IrExpr::VarRef(node_var)) } else { let fn_name = call.name.join("."); let ir_args: Vec = call @@ -1174,6 +1193,51 @@ impl Binder { Some(id) } + /// Resolve `startNode(r)` / `endNode(r)` over a fixed-hop matched + /// relationship to the relationship's src / dst node variable (#753). + /// + /// Returns `None` for anything else — a non-matching name, a non-variable + /// argument, an unbound name, or an edge var that is not a recorded + /// fixed-hop endpoint (variable-length edges bind to a list, not one + /// relationship) — so the caller falls through to generic lowering, where + /// the function name surfaces as the usual `UnknownFunction` error. + fn resolve_endpoint_node(call: &FunctionCall, s: &BinderState) -> Option { + let [fn_name] = call.name.as_slice() else { + return None; + }; + let start = match fn_name.to_ascii_lowercase().as_str() { + "startnode" => true, + "endnode" => false, + _ => return None, + }; + let [Expr::Var(VarRef { name, .. })] = call.args.as_slice() else { + return None; + }; + let edge_var = s.vars.get(name)?; + let (src, dst) = s.edge_vars.get(edge_var)?; + Some(if start { *src } else { *dst }) + } + + /// Build a `_node_struct` call that materializes the given node var as a + /// whole node value — identity + labels + properties (#785). Shared by a + /// bare `RETURN n` and by `RETURN startNode(r)` / `endNode(r)` (#753). + fn node_struct_expr(&self, var: VarId, s: &mut BinderState) -> ExprId { + // The pattern's label name, passed through because the lowerer's + // ontology map is empty in exploratory mode; absent for an unlabelled + // match (one arg). + let label_opt = s.node_vars.get(&var).cloned().flatten(); + let var_ref = s.builder.push_expr(IrExpr::VarRef(var)); + let mut args = vec![var_ref]; + if let Some(label) = label_opt { + let lit = s.builder.push_expr(IrExpr::Literal(IrLiteral::Str(label))); + args.push(lit); + } + s.builder.push_expr(IrExpr::FunctionCall { + name: "_node_struct".to_string(), + args, + }) + } + /// IR for `nodes(p)`: the traversal node sequence as a /// `List` value. /// @@ -1419,6 +1483,12 @@ struct BinderState { /// of these materializes a whole node value (#785); the label is passed /// through to lowering since the ontology map is empty in exploratory mode. node_vars: HashMap>, + /// Fixed-hop relationship variables mapped to their `(src, dst)` node vars, + /// recorded at `Expand` build. `startNode(r)` / `endNode(r)` over a matched + /// relationship rewrite onto these endpoints (#753) — so they return the + /// node value (reusing #785), not the raw UUID. Variable-length edges bind + /// to a list, not one relationship, so they are deliberately not recorded. + edge_vars: HashMap, next_var: u32, builder: GraphPlanBuilder, errors: Vec, @@ -1825,6 +1895,7 @@ mod tests { vars: HashMap::new(), path_vars: HashMap::new(), node_vars: HashMap::new(), + edge_vars: HashMap::new(), next_var: 0, builder: GraphPlan::builder("openCypher").ontology_mode(OntologyMode::Advisory), errors: Vec::new(), From 5e8a695ac43bbed44901d1be86333f9ded0db050 Mon Sep 17 00:00:00 2001 From: David Spencer <1526975+DecisionNerd@users.noreply.github.com> Date: Tue, 23 Jun 2026 10:46:40 -0600 Subject: [PATCH 2/4] fix(ir): node_struct_expr is an associated fn (clippy::unused_self) (#753) The shared helper never reads `self` (it works through the `BinderState` builder), so `cargo clippy --workspace -- -D warnings` rejected the `&self` receiver. Make it an associated function and call it as `Self::node_struct_expr`. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/gf-ir/src/binder.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/gf-ir/src/binder.rs b/crates/gf-ir/src/binder.rs index 2c7aee82..d3949528 100644 --- a/crates/gf-ir/src/binder.rs +++ b/crates/gf-ir/src/binder.rs @@ -868,7 +868,7 @@ impl Binder { _ => None, }; if let Some(v) = node_var { - return self.node_struct_expr(v, s); + return Self::node_struct_expr(v, s); } } self.lower_expr(expr, span, s) @@ -1221,7 +1221,7 @@ impl Binder { /// Build a `_node_struct` call that materializes the given node var as a /// whole node value — identity + labels + properties (#785). Shared by a /// bare `RETURN n` and by `RETURN startNode(r)` / `endNode(r)` (#753). - fn node_struct_expr(&self, var: VarId, s: &mut BinderState) -> ExprId { + fn node_struct_expr(var: VarId, s: &mut BinderState) -> ExprId { // The pattern's label name, passed through because the lowerer's // ontology map is empty in exploratory mode; absent for an unlabelled // match (one arg). From 8c048472d4d8a6d9d3a068cd93f2d110412a9ce4 Mon Sep 17 00:00:00 2001 From: David Spencer <1526975+DecisionNerd@users.noreply.github.com> Date: Tue, 23 Jun 2026 11:05:13 -0600 Subject: [PATCH 3/4] fix(ir): direction-aware + optional-safe startNode/endNode (#753, CodeRabbit) Two correctness fixes from review: - Direction: `src_var`/`dst_var` are the pattern's left/right vars, but the relationship's true start/end follow its direction. Record `(src, dst)` for outgoing, swap to `(dst, src)` for incoming, and skip undirected edges (their orientation is per matched row, so a static rewrite could pick the wrong endpoint). Validated: `(Bob)<-[r:KNOWS]-(b)` -> startNode = Alice (source). - OPTIONAL MATCH: do not propagate `edge_vars` out of an optional sub-state. After an unmatched optional, `r` is null and `startNode(r)` must be null, but the rewrite would resolve to the outer non-null endpoint var. Without an edge-uuid null gate (#889) that yields a wrong value, so optional edges stay unresolved (startNode/endNode -> UnknownFunction). Edges still resolve inside the optional's own WHERE, where matched rows have a non-null `r`. New e2e: `start_end_node_respects_incoming_direction`, `start_node_over_optional_edge_is_deferred`. clippy --workspace -D warnings clean; gf-ir goldens (92) and e2e (75) green. Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 13 ++++--- crates/gf-api/tests/e2e_baseline.rs | 55 +++++++++++++++++++++++++++++ crates/gf-ir/src/binder.rs | 37 ++++++++++++++----- 3 files changed, 92 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6fa36378..e0e7ab2f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,11 +14,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 materialization), not a bare UUID. The binder records each fixed hop's `(src, dst)` node vars at `Expand` and rewrites the calls onto them: as a node reference for property access (`startNode(r).name`) and, in a terminal RETURN, - as a whole node value (`RETURN endNode(r)`). Variable-length edge vars bind to - a relationship *list*, not one relationship, so they are deliberately not - resolved here; an unrecognized argument falls through to the usual - `UnknownFunction` error rather than silently returning a UUID (which would fail - TCK). Closes the #743 deferral. + as a whole node value (`RETURN endNode(r)`). Endpoints follow the edge's + **direction** — an incoming `(a)<-[r]-(b)` resolves `startNode(r)` to `b` — not + the pattern's left/right order. Deliberately deferred (fall through to the + usual `UnknownFunction` error rather than risk a wrong answer): variable-length + edges (the var binds to a relationship *list*), undirected edges (orientation + is per matched row), and `OPTIONAL MATCH` edges (an unmatched `r` is null; + honoring `startNode(null) = null` needs an edge-uuid null gate — #889). Closes + the #743 deferral. - Neighborhood-proportional traversal latency (#838) — variable-length and single-hop expansion now read only the **reached destination** node records (`read_nodes_filtered`, an `node_id`-filtered read mirroring the #830 edge diff --git a/crates/gf-api/tests/e2e_baseline.rs b/crates/gf-api/tests/e2e_baseline.rs index 7530606c..bddfc057 100644 --- a/crates/gf-api/tests/e2e_baseline.rs +++ b/crates/gf-api/tests/e2e_baseline.rs @@ -222,6 +222,61 @@ fn start_end_node_property_access() { assert_eq!(e.value(0), "Bob", "endNode(r).name"); } +#[test] +fn start_end_node_respects_incoming_direction() { + // #753: the relationship's start/end follow the edge *direction*, not the + // pattern's left/right order. For `(Bob)<-[r:KNOWS]-(b)`, the only match is + // Alice KNOWS Bob, so startNode(r) is Alice (the edge source) and endNode(r) + // is Bob — even though Bob is the pattern's left-hand node. + let gf = forge(); + let r = rows( + &gf, + "MATCH (a:Person {name:'Bob'})<-[r:KNOWS]-(b:Person) \ + RETURN startNode(r).name AS s, endNode(r).name AS e", + ); + assert_eq!(r.stats.rows_produced, 1, "only Alice KNOWS Bob"); + let batch = r + .batches + .iter() + .find(|b| b.num_rows() > 0) + .expect("a non-empty result batch"); + let s = batch + .column_by_name("s") + .expect("s") + .as_any() + .downcast_ref::() + .expect("Utf8"); + let e = batch + .column_by_name("e") + .expect("e") + .as_any() + .downcast_ref::() + .expect("Utf8"); + assert_eq!( + s.value(0), + "Alice", + "startNode = edge source, not pattern-left" + ); + assert_eq!(e.value(0), "Bob", "endNode = edge target"); +} + +#[test] +fn start_node_over_optional_edge_is_deferred() { + // #753 deferral: over an OPTIONAL-matched relationship, `r` is null on an + // unmatched row and Cypher requires `startNode(r)` to be null. Honoring that + // needs an edge-uuid null gate on endpoint materialization (#889), so the + // rewrite is suppressed for optional edges — the call errors rather than + // mis-resolving to the outer, non-null endpoint var. Pins that intent. + let gf = forge(); + let res = gf.execute( + "MATCH (a:Person {name:'Alice'}) OPTIONAL MATCH (a)-[r:KNOWS]->(b) RETURN startNode(r)", + ); + assert!( + res.is_err(), + "optional-edge startNode is deferred; expected an error, got {res:?}" + ); +} + #[test] fn count_aggregate() { let gf = forge(); diff --git a/crates/gf-ir/src/binder.rs b/crates/gf-ir/src/binder.rs index d3949528..f81665ff 100644 --- a/crates/gf-ir/src/binder.rs +++ b/crates/gf-ir/src/binder.rs @@ -241,9 +241,17 @@ impl Binder { for (v, label) in sub_state.node_vars { s.node_vars.entry(v).or_insert(label); } - for (edge, endpoints) in sub_state.edge_vars { - s.edge_vars.entry(edge).or_insert(endpoints); - } + // Deliberately do NOT propagate `edge_vars` out of an OPTIONAL MATCH. + // After an unmatched optional row `r` is null, and Cypher requires + // `startNode(r)`/`endNode(r)` to be null — but the rewrite resolves + // to the endpoint node var, which on such a row is the *outer*, + // non-null var (e.g. `a` in `MATCH (a) OPTIONAL MATCH (a)-[r]->(b)`). + // Without an edge-uuid null gate on endpoint materialization that + // would return a wrong non-null value, so optional edges stay + // unresolved here and `startNode`/`endNode` fall through to + // UnknownFunction. (Edges still resolve inside the optional's own + // WHERE, where matched rows have a non-null `r`.) Null-gated endpoint + // values are a node-value-completeness follow-up (#889). s.errors.extend(sub_state.errors); s.warnings.extend(sub_state.warnings); let child = sub_state.builder.build(); @@ -331,12 +339,25 @@ impl Binder { }); // Record the relationship's endpoints so `startNode(r)` / - // `endNode(r)` can return the src/dst node value (#753). - // Only fixed single hops: a variable-length edge var binds - // to a list, not a single relationship, so the endpoint - // functions have no single node to resolve to. + // `endNode(r)` can return the start/end node value (#753). + // `src_var`/`dst_var` are the pattern's traversal-left/right + // vars; the relationship's true start/end follow its + // *direction*: an outgoing `(a)-[r]->(b)` starts at `a`, an + // incoming `(a)<-[r]-(b)` starts at `b`. An undirected edge's + // orientation is per matched row, so a static left/right + // rewrite could pick the wrong endpoint — skip it (startNode/ + // endNode then fall through to UnknownFunction). Only fixed + // single hops qualify: a variable-length edge var binds to a + // *list*, not one relationship. if !is_var_hop { - s.edge_vars.insert(edge_var, (src_var, dst_var)); + let endpoints = match dir { + Direction::Out => Some((src_var, dst_var)), + Direction::In => Some((dst_var, src_var)), + Direction::Undirected => None, + }; + if let Some(endpoints) = endpoints { + s.edge_vars.insert(edge_var, endpoints); + } } if path_nodes.is_empty() { From 48d1e95996250707fb044d7155f7f7bd2fea8187 Mon Sep 17 00:00:00 2001 From: David Spencer <1526975+DecisionNerd@users.noreply.github.com> Date: Tue, 23 Jun 2026 11:20:07 -0600 Subject: [PATCH 4/4] test(api): optional-edge deferral test uses an unmatched node (#753, CodeRabbit) Switch the OPTIONAL-MATCH deferral test from Alice (who has KNOWS edges) to Eve (no outgoing edges), so the represented scenario is a genuinely unmatched optional. The deferral itself surfaces as a bind-time UnknownFunction (data-independent), but the data now matches the intent. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/gf-api/tests/e2e_baseline.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/crates/gf-api/tests/e2e_baseline.rs b/crates/gf-api/tests/e2e_baseline.rs index bddfc057..6464dd8b 100644 --- a/crates/gf-api/tests/e2e_baseline.rs +++ b/crates/gf-api/tests/e2e_baseline.rs @@ -265,11 +265,13 @@ fn start_node_over_optional_edge_is_deferred() { // #753 deferral: over an OPTIONAL-matched relationship, `r` is null on an // unmatched row and Cypher requires `startNode(r)` to be null. Honoring that // needs an edge-uuid null gate on endpoint materialization (#889), so the - // rewrite is suppressed for optional edges — the call errors rather than - // mis-resolving to the outer, non-null endpoint var. Pins that intent. + // rewrite is suppressed for optional edges. The suppression surfaces as a + // bind-time `UnknownFunction` (data-independent), but the query uses Eve — + // who has no outgoing edges — so the represented scenario is a genuinely + // unmatched optional, not one papered over by a row that happens to match. let gf = forge(); let res = gf.execute( - "MATCH (a:Person {name:'Alice'}) OPTIONAL MATCH (a)-[r:KNOWS]->(b) RETURN startNode(r)", + "MATCH (a:Person {name:'Eve'}) OPTIONAL MATCH (a)-[r:KNOWS]->(b) RETURN startNode(r)", ); assert!( res.is_err(),