From 5f86443d0eb84f7c21c72047da7a13e5b0097d40 Mon Sep 17 00:00:00 2001 From: John Hopper Date: Wed, 8 Apr 2026 19:16:33 -0700 Subject: [PATCH] feat (pg): optimize graph schema for larger datasets --- drivers/pg/query/definitions.go | 13 +++++----- drivers/pg/query/format.go | 2 +- drivers/pg/query/query.go | 4 +-- drivers/pg/query/sql/schema_up.sql | 39 +++++++++++++++--------------- drivers/pg/statements.go | 2 +- 5 files changed, 30 insertions(+), 30 deletions(-) diff --git a/drivers/pg/query/definitions.go b/drivers/pg/query/definitions.go index 27bfc1a..19c8291 100644 --- a/drivers/pg/query/definitions.go +++ b/drivers/pg/query/definitions.go @@ -4,15 +4,16 @@ import "regexp" var ( pgPropertyIndexRegex = regexp.MustCompile(`(?i)^create\s+(unique)?(?:\s+)?index\s+([^ ]+)\s+on\s+\S+\s+using\s+([^ ]+)\s+\(+properties\s+->>\s+'([^:]+)::.+$`) - pgColumnIndexRegex = regexp.MustCompile(`(?i)^create\s+(unique)?(?:\s+)?index\s+([^ ]+)\s+on\s+\S+\s+using\s+([^ ]+)\s+\(([^)]+)\)$`) + pgColumnIndexRegex = regexp.MustCompile(`(?i)^create\s+(unique)?(?:\s+)?index\s+([^ ]+)\s+on\s+\S+\s+using\s+([^ ]+)\s+\(([^)]+)\)(?:\s+include\s+\(([^)]+)\))?$`) ) const ( - pgIndexRegexGroupUnique = 1 - pgIndexRegexGroupName = 2 - pgIndexRegexGroupIndexType = 3 - pgIndexRegexGroupFields = 4 - pgIndexRegexNumExpectedGroups = 5 + pgIndexRegexGroupUnique = 1 + pgIndexRegexGroupName = 2 + pgIndexRegexGroupIndexType = 3 + pgIndexRegexGroupUsingFields = 4 + pgIndexRegexGroupIncludeFields = 5 + pgIndexRegexNumExpectedGroups = 6 pgIndexTypeBTree = "btree" pgIndexTypeGIN = "gin" diff --git a/drivers/pg/query/format.go b/drivers/pg/query/format.go index bc181e9..bcf9d87 100644 --- a/drivers/pg/query/format.go +++ b/drivers/pg/query/format.go @@ -161,7 +161,7 @@ func FormatRelationshipPartitionUpsert(graphTarget model.Graph, identityProperti return join("insert into ", graphTarget.Partitions.Edge.Name, " as e ", "(graph_id, start_id, end_id, kind_id, properties) ", "select $1, unnest($2::int8[]), unnest($3::int8[]), unnest($4::int2[]), unnest($5::jsonb[]) ", - formatConflictMatcher(identityProperties, "graph_id, start_id, end_id, kind_id"), + formatConflictMatcher(identityProperties, "start_id, end_id, kind_id, graph_id"), "do update set properties = e.properties || excluded.properties;", ) } diff --git a/drivers/pg/query/query.go b/drivers/pg/query/query.go index 85d3ac5..c04dc9a 100644 --- a/drivers/pg/query/query.go +++ b/drivers/pg/query/query.go @@ -45,13 +45,13 @@ func (s Query) describeGraphPartition(name string) (model.GraphPartition, error) if captureGroups[pgIndexRegexGroupUnique] == pgIndexUniqueStr { graphPartition.Constraints[indexName] = graph.Constraint{ Name: indexName, - Field: captureGroups[pgIndexRegexGroupFields], + Field: captureGroups[pgIndexRegexGroupUsingFields], Type: parsePostgresIndexType(captureGroups[pgIndexRegexGroupIndexType]), } } else { graphPartition.Indexes[indexName] = graph.Index{ Name: indexName, - Field: captureGroups[pgIndexRegexGroupFields], + Field: captureGroups[pgIndexRegexGroupUsingFields], Type: parsePostgresIndexType(captureGroups[pgIndexRegexGroupIndexType]), } } diff --git a/drivers/pg/query/sql/schema_up.sql b/drivers/pg/query/sql/schema_up.sql index 4e0196f..1cf8a38 100644 --- a/drivers/pg/query/sql/schema_up.sql +++ b/drivers/pg/query/sql/schema_up.sql @@ -143,7 +143,7 @@ create table if not exists edge primary key (id, graph_id), foreign key (graph_id) references graph (id) on delete cascade, - unique (graph_id, start_id, end_id, kind_id) + unique (start_id, end_id, kind_id, graph_id) ) partition by list (graph_id); -- delete_node_edges is a trigger and associated plpgsql function to cascade delete edges when attached nodes are @@ -176,22 +176,20 @@ execute procedure delete_node_edges(); alter table edge alter column properties set storage main; --- Remove the old graph ID index. +-- Remove old indexes that are now redundant or superseded. drop index if exists edge_graph_id_index; - --- Index on the start vertex of each edge. -create index if not exists edge_start_id_index on edge using btree (start_id); - --- Index on the start vertex of each edge. -create index if not exists edge_end_id_index on edge using btree (end_id); - --- Index on the kind of each edge. -create index if not exists edge_kind_index on edge using btree (kind_id); - --- Index lookups that include the edge's start or end id along with a filter for the edge type. This is the most --- common join filter during traversal. -create index if not exists edge_start_kind_index on edge using btree (start_id, kind_id); -create index if not exists edge_end_kind_index on edge using btree (end_id, kind_id); +drop index if exists edge_start_id_index; +drop index if exists edge_end_id_index; +drop index if exists edge_kind_index; +drop index if exists edge_start_kind_index; +drop index if exists edge_end_kind_index; + +-- Covering indexes for traversal joins. The INCLUDE columns allow index-only scans for the common case where +-- the join needs (id, start_id, end_id, kind_id) without fetching from the heap. The standalone start_id, +-- end_id, and kind_id indexes are intentionally omitted: the composite indexes satisfy left-prefix lookups +-- on start_id or end_id alone, and kind_id is never queried in isolation during traversal. +create index if not exists edge_start_id_kind_id_id_end_id_index on edge using btree (start_id, kind_id) include (id, end_id); +create index if not exists edge_end_id_kind_id_id_start_id_index on edge using btree (end_id, kind_id) include (id, start_id); -- Path composite type do @@ -365,6 +363,9 @@ create or replace function public.create_unidirectional_pathspace_tables() returns void as $$ begin + -- The path column is not used as a primary key. Deduplication is handled by DISTINCT ON clauses in the + -- harness functions. Removing the PK on the variable-length int8[] array eliminates O(n)-key B-tree + -- maintenance that grows with traversal depth. create temporary table forward_front ( root_id int8 not null, @@ -372,8 +373,7 @@ begin depth int4 not null, satisfied bool, is_cycle bool not null, - path int8[] not null, - primary key (path) + path int8[] not null ) on commit drop; create temporary table next_front @@ -383,8 +383,7 @@ begin depth int4 not null, satisfied bool, is_cycle bool not null, - path int8[] not null, - primary key (path) + path int8[] not null ) on commit drop; create index forward_front_next_id_index on forward_front using btree (next_id); diff --git a/drivers/pg/statements.go b/drivers/pg/statements.go index a0fdcee..c1b39f6 100644 --- a/drivers/pg/statements.go +++ b/drivers/pg/statements.go @@ -12,7 +12,7 @@ const ( // Azure post-processing. This was done because Azure post will submit the same creation request hundreds of // times for the same edge. In PostgreSQL this results in a constraint violation. For now this is best-effort // until Azure post-processing can be refactored. - createEdgeBatchStatement = `insert into edge as e (graph_id, start_id, end_id, kind_id, properties) select $1, unnest($2::int8[]), unnest($3::int8[]), unnest($4::int2[]), unnest($5::jsonb[]) on conflict (graph_id, start_id, end_id, kind_id) do update set properties = e.properties || excluded.properties;` + createEdgeBatchStatement = `insert into edge as e (graph_id, start_id, end_id, kind_id, properties) select $1, unnest($2::int8[]), unnest($3::int8[]), unnest($4::int2[]), unnest($5::jsonb[]) on conflict (start_id, end_id, kind_id, graph_id) do update set properties = e.properties || excluded.properties;` deleteEdgeWithIDStatement = `delete from edge as e where e.id = any($1)` edgePropertySetOnlyStatement = `update edge set properties = properties || $1::jsonb where edge.id = $2`