feat (pg): optimize graph schema for larger datasets

zinic · zinic · commit 5f86443d0eb8 · 2026-04-09T08:41:09.000-07:00
diff --git a/drivers/pg/query/definitions.go b/drivers/pg/query/definitions.go
@@ -4,15 +4,16 @@ import "regexp"
 
 var (
 	pgPropertyIndexRegex = regexp.MustCompile(`(?i)^create\s+(unique)?(?:\s+)?index\s+([^ ]+)\s+on\s+\S+\s+using\s+([^ ]+)\s+\(+properties\s+->>\s+'([^:]+)::.+$`)
-	pgColumnIndexRegex   = regexp.MustCompile(`(?i)^create\s+(unique)?(?:\s+)?index\s+([^ ]+)\s+on\s+\S+\s+using\s+([^ ]+)\s+\(([^)]+)\)$`)
+	pgColumnIndexRegex   = regexp.MustCompile(`(?i)^create\s+(unique)?(?:\s+)?index\s+([^ ]+)\s+on\s+\S+\s+using\s+([^ ]+)\s+\(([^)]+)\)(?:\s+include\s+\(([^)]+)\))?$`)
 )
 
 const (
-	pgIndexRegexGroupUnique       = 1
-	pgIndexRegexGroupName         = 2
-	pgIndexRegexGroupIndexType    = 3
-	pgIndexRegexGroupFields       = 4
-	pgIndexRegexNumExpectedGroups = 5
+	pgIndexRegexGroupUnique        = 1
+	pgIndexRegexGroupName          = 2
+	pgIndexRegexGroupIndexType     = 3
+	pgIndexRegexGroupUsingFields   = 4
+	pgIndexRegexGroupIncludeFields = 5
+	pgIndexRegexNumExpectedGroups  = 6
 
 	pgIndexTypeBTree   = "btree"
 	pgIndexTypeGIN     = "gin"
diff --git a/drivers/pg/query/format.go b/drivers/pg/query/format.go
@@ -161,7 +161,7 @@ func FormatRelationshipPartitionUpsert(graphTarget model.Graph, identityProperti
 	return join("insert into ", graphTarget.Partitions.Edge.Name, " as e ",
 		"(graph_id, start_id, end_id, kind_id, properties) ",
 		"select $1, unnest($2::int8[]), unnest($3::int8[]), unnest($4::int2[]), unnest($5::jsonb[]) ",
-		formatConflictMatcher(identityProperties, "graph_id, start_id, end_id, kind_id"),
+		formatConflictMatcher(identityProperties, "start_id, end_id, kind_id, graph_id"),
 		"do update set properties = e.properties || excluded.properties;",
 	)
 }
diff --git a/drivers/pg/query/query.go b/drivers/pg/query/query.go
@@ -45,13 +45,13 @@ func (s Query) describeGraphPartition(name string) (model.GraphPartition, error)
 				if captureGroups[pgIndexRegexGroupUnique] == pgIndexUniqueStr {
 					graphPartition.Constraints[indexName] = graph.Constraint{
 						Name:  indexName,
-						Field: captureGroups[pgIndexRegexGroupFields],
+						Field: captureGroups[pgIndexRegexGroupUsingFields],
 						Type:  parsePostgresIndexType(captureGroups[pgIndexRegexGroupIndexType]),
 					}
 				} else {
 					graphPartition.Indexes[indexName] = graph.Index{
 						Name:  indexName,
-						Field: captureGroups[pgIndexRegexGroupFields],
+						Field: captureGroups[pgIndexRegexGroupUsingFields],
 						Type:  parsePostgresIndexType(captureGroups[pgIndexRegexGroupIndexType]),
 					}
 				}
diff --git a/drivers/pg/query/sql/schema_up.sql b/drivers/pg/query/sql/schema_up.sql
@@ -143,7 +143,7 @@ create table if not exists edge
   primary key (id, graph_id),
   foreign key (graph_id) references graph (id) on delete cascade,
 
-  unique (graph_id, start_id, end_id, kind_id)
+  unique (start_id, end_id, kind_id, graph_id)
 ) partition by list (graph_id);
 
 -- delete_node_edges is a trigger and associated plpgsql function to cascade delete edges when attached nodes are
@@ -176,22 +176,20 @@ execute procedure delete_node_edges();
 alter table edge
   alter column properties set storage main;
 
--- Remove the old graph ID index.
+-- Remove old indexes that are now redundant or superseded.
 drop index if exists edge_graph_id_index;
-
--- Index on the start vertex of each edge.
-create index if not exists edge_start_id_index on edge using btree (start_id);
-
--- Index on the start vertex of each edge.
-create index if not exists edge_end_id_index on edge using btree (end_id);
-
--- Index on the kind of each edge.
-create index if not exists edge_kind_index on edge using btree (kind_id);
-
--- Index lookups that include the edge's start or end id along with a filter for the edge type. This is the most
--- common join filter during traversal.
-create index if not exists edge_start_kind_index on edge using btree (start_id, kind_id);
-create index if not exists edge_end_kind_index on edge using btree (end_id, kind_id);
+drop index if exists edge_start_id_index;
+drop index if exists edge_end_id_index;
+drop index if exists edge_kind_index;
+drop index if exists edge_start_kind_index;
+drop index if exists edge_end_kind_index;
+
+-- Covering indexes for traversal joins. The INCLUDE columns allow index-only scans for the common case where
+-- the join needs (id, start_id, end_id, kind_id) without fetching from the heap. The standalone start_id,
+-- end_id, and kind_id indexes are intentionally omitted: the composite indexes satisfy left-prefix lookups
+-- on start_id or end_id alone, and kind_id is never queried in isolation during traversal.
+create index if not exists edge_start_id_kind_id_id_end_id_index on edge using btree (start_id, kind_id) include (id, end_id);
+create index if not exists edge_end_id_kind_id_id_start_id_index on edge using btree (end_id, kind_id) include (id, start_id);
 
 -- Path composite type
 do
@@ -365,15 +363,17 @@ create or replace function public.create_unidirectional_pathspace_tables()
   returns void as
 $$
 begin
+  -- The path column is intentionally not a primary key. Deduplication is handled by DISTINCT ON clauses in the
+  -- harness functions. Dropping the PK on the variable-length int8[] column avoids maintaining a B-tree index
+  -- whose key size is O(n) in the path length, which grows with traversal depth.
   create temporary table forward_front
   (
     root_id   int8   not null,
     next_id   int8   not null,
     depth     int4   not null,
     satisfied bool,
     is_cycle  bool   not null,
-    path      int8[] not null,
-    primary key (path)
+    path      int8[] not null
   ) on commit drop;
 
   create temporary table next_front
@@ -383,8 +383,7 @@ begin
     depth     int4   not null,
     satisfied bool,
     is_cycle  bool   not null,
-    path      int8[] not null,
-    primary key (path)
+    path      int8[] not null
   ) on commit drop;
 
   create index forward_front_next_id_index on forward_front using btree (next_id);
diff --git a/drivers/pg/statements.go b/drivers/pg/statements.go
@@ -12,7 +12,7 @@ const (
 	//	     Azure post-processing. This was done because Azure post will submit the same creation request hundreds of
 	// 		 times for the same edge. In PostgreSQL this results in a constraint violation. For now this is best-effort
 	//		 until Azure post-processing can be refactored.
-	createEdgeBatchStatement  = `insert into edge as e (graph_id, start_id, end_id, kind_id, properties) select $1, unnest($2::int8[]), unnest($3::int8[]), unnest($4::int2[]), unnest($5::jsonb[]) on conflict (graph_id, start_id, end_id, kind_id) do update set properties = e.properties || excluded.properties;`
+	createEdgeBatchStatement  = `insert into edge as e (graph_id, start_id, end_id, kind_id, properties) select $1, unnest($2::int8[]), unnest($3::int8[]), unnest($4::int2[]), unnest($5::jsonb[]) on conflict (start_id, end_id, kind_id, graph_id) do update set properties = e.properties || excluded.properties;`
 	deleteEdgeWithIDStatement = `delete from edge as e where e.id = any($1)`
 
 	edgePropertySetOnlyStatement      = `update edge set properties = properties || $1::jsonb where edge.id = $2`

Original file line number	Diff line number	Diff line change
`@@ -161,7 +161,7 @@ func FormatRelationshipPartitionUpsert(graphTarget model.Graph, identityProperti`
`161`	`161`	`return join("insert into ", graphTarget.Partitions.Edge.Name, " as e ",`
`162`	`162`	`"(graph_id, start_id, end_id, kind_id, properties) ",`
`163`	`163`	`"select $1, unnest($2::int8[]), unnest($3::int8[]), unnest($4::int2[]), unnest($5::jsonb[]) ",`
`164`		`- formatConflictMatcher(identityProperties, "graph_id, start_id, end_id, kind_id"),`
	`164`	`+ formatConflictMatcher(identityProperties, "start_id, end_id, kind_id, graph_id"),`
`165`	`165`	`"do update set properties = e.properties \|\| excluded.properties;",`
`166`	`166`	`)`
`167`	`167`	`}`
Original file line number	Diff line number	Diff line change
`@@ -45,13 +45,13 @@ func (s Query) describeGraphPartition(name string) (model.GraphPartition, error)`
`45`	`45`	`if captureGroups[pgIndexRegexGroupUnique] == pgIndexUniqueStr {`
`46`	`46`	`graphPartition.Constraints[indexName] = graph.Constraint{`
`47`	`47`	`Name: indexName,`
`48`		`- Field: captureGroups[pgIndexRegexGroupFields],`
	`48`	`+ Field: captureGroups[pgIndexRegexGroupUsingFields],`
`49`	`49`	`Type: parsePostgresIndexType(captureGroups[pgIndexRegexGroupIndexType]),`
`50`	`50`	`}`
`51`	`51`	`} else {`
`52`	`52`	`graphPartition.Indexes[indexName] = graph.Index{`
`53`	`53`	`Name: indexName,`
`54`		`- Field: captureGroups[pgIndexRegexGroupFields],`
	`54`	`+ Field: captureGroups[pgIndexRegexGroupUsingFields],`
`55`	`55`	`Type: parsePostgresIndexType(captureGroups[pgIndexRegexGroupIndexType]),`
`56`	`56`	`}`
`57`	`57`	`}`