From 5f86443d0eb84f7c21c72047da7a13e5b0097d40 Mon Sep 17 00:00:00 2001
From: John Hopper <jhopper@specterops.io>
Date: Wed, 8 Apr 2026 19:16:33 -0700
Subject: [PATCH] feat (pg): optimize graph schema for larger datasets

---
 drivers/pg/query/definitions.go    | 13 +++++-----
 drivers/pg/query/format.go         |  2 +-
 drivers/pg/query/query.go          |  4 +--
 drivers/pg/query/sql/schema_up.sql | 39 +++++++++++++++---------------
 drivers/pg/statements.go           |  2 +-
 5 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/drivers/pg/query/definitions.go b/drivers/pg/query/definitions.go
index 27bfc1a..19c8291 100644
--- a/drivers/pg/query/definitions.go
+++ b/drivers/pg/query/definitions.go
@@ -4,15 +4,16 @@ import "regexp"
 
 var (
 	pgPropertyIndexRegex = regexp.MustCompile(`(?i)^create\s+(unique)?(?:\s+)?index\s+([^ ]+)\s+on\s+\S+\s+using\s+([^ ]+)\s+\(+properties\s+->>\s+'([^:]+)::.+$`)
-	pgColumnIndexRegex   = regexp.MustCompile(`(?i)^create\s+(unique)?(?:\s+)?index\s+([^ ]+)\s+on\s+\S+\s+using\s+([^ ]+)\s+\(([^)]+)\)$`)
+	pgColumnIndexRegex   = regexp.MustCompile(`(?i)^create\s+(unique)?(?:\s+)?index\s+([^ ]+)\s+on\s+\S+\s+using\s+([^ ]+)\s+\(([^)]+)\)(?:\s+include\s+\(([^)]+)\))?$`)
 )
 
 const (
-	pgIndexRegexGroupUnique       = 1
-	pgIndexRegexGroupName         = 2
-	pgIndexRegexGroupIndexType    = 3
-	pgIndexRegexGroupFields       = 4
-	pgIndexRegexNumExpectedGroups = 5
+	pgIndexRegexGroupUnique        = 1
+	pgIndexRegexGroupName          = 2
+	pgIndexRegexGroupIndexType     = 3
+	pgIndexRegexGroupUsingFields   = 4
+	pgIndexRegexGroupIncludeFields = 5
+	pgIndexRegexNumExpectedGroups  = 6
 
 	pgIndexTypeBTree   = "btree"
 	pgIndexTypeGIN     = "gin"
diff --git a/drivers/pg/query/format.go b/drivers/pg/query/format.go
index bc181e9..bcf9d87 100644
--- a/drivers/pg/query/format.go
+++ b/drivers/pg/query/format.go
@@ -161,7 +161,7 @@ func FormatRelationshipPartitionUpsert(graphTarget model.Graph, identityProperti
 	return join("insert into ", graphTarget.Partitions.Edge.Name, " as e ",
 		"(graph_id, start_id, end_id, kind_id, properties) ",
 		"select $1, unnest($2::int8[]), unnest($3::int8[]), unnest($4::int2[]), unnest($5::jsonb[]) ",
-		formatConflictMatcher(identityProperties, "graph_id, start_id, end_id, kind_id"),
+		formatConflictMatcher(identityProperties, "start_id, end_id, kind_id, graph_id"),
 		"do update set properties = e.properties || excluded.properties;",
 	)
 }
diff --git a/drivers/pg/query/query.go b/drivers/pg/query/query.go
index 85d3ac5..c04dc9a 100644
--- a/drivers/pg/query/query.go
+++ b/drivers/pg/query/query.go
@@ -45,13 +45,13 @@ func (s Query) describeGraphPartition(name string) (model.GraphPartition, error)
 				if captureGroups[pgIndexRegexGroupUnique] == pgIndexUniqueStr {
 					graphPartition.Constraints[indexName] = graph.Constraint{
 						Name:  indexName,
-						Field: captureGroups[pgIndexRegexGroupFields],
+						Field: captureGroups[pgIndexRegexGroupUsingFields],
 						Type:  parsePostgresIndexType(captureGroups[pgIndexRegexGroupIndexType]),
 					}
 				} else {
 					graphPartition.Indexes[indexName] = graph.Index{
 						Name:  indexName,
-						Field: captureGroups[pgIndexRegexGroupFields],
+						Field: captureGroups[pgIndexRegexGroupUsingFields],
 						Type:  parsePostgresIndexType(captureGroups[pgIndexRegexGroupIndexType]),
 					}
 				}
diff --git a/drivers/pg/query/sql/schema_up.sql b/drivers/pg/query/sql/schema_up.sql
index 4e0196f..1cf8a38 100644
--- a/drivers/pg/query/sql/schema_up.sql
+++ b/drivers/pg/query/sql/schema_up.sql
@@ -143,7 +143,7 @@ create table if not exists edge
   primary key (id, graph_id),
   foreign key (graph_id) references graph (id) on delete cascade,
 
-  unique (graph_id, start_id, end_id, kind_id)
+  unique (start_id, end_id, kind_id, graph_id)
 ) partition by list (graph_id);
 
 -- delete_node_edges is a trigger and associated plpgsql function to cascade delete edges when attached nodes are
@@ -176,22 +176,20 @@ execute procedure delete_node_edges();
 alter table edge
   alter column properties set storage main;
 
--- Remove the old graph ID index.
+-- Remove old indexes that are now redundant or superseded.
 drop index if exists edge_graph_id_index;
-
--- Index on the start vertex of each edge.
-create index if not exists edge_start_id_index on edge using btree (start_id);
-
--- Index on the start vertex of each edge.
-create index if not exists edge_end_id_index on edge using btree (end_id);
-
--- Index on the kind of each edge.
-create index if not exists edge_kind_index on edge using btree (kind_id);
-
--- Index lookups that include the edge's start or end id along with a filter for the edge type. This is the most
--- common join filter during traversal.
-create index if not exists edge_start_kind_index on edge using btree (start_id, kind_id);
-create index if not exists edge_end_kind_index on edge using btree (end_id, kind_id);
+drop index if exists edge_start_id_index;
+drop index if exists edge_end_id_index;
+drop index if exists edge_kind_index;
+drop index if exists edge_start_kind_index;
+drop index if exists edge_end_kind_index;
+
+-- Covering indexes for traversal joins. The INCLUDE columns allow index-only scans (for all-visible pages)
+-- when the join needs only (id, start_id, end_id, kind_id). The standalone start_id, end_id, and kind_id
+-- indexes are intentionally omitted: the composite indexes satisfy left-prefix lookups on start_id or
+-- end_id alone, and kind_id is never queried in isolation during traversal.
+create index if not exists edge_start_id_kind_id_id_end_id_index on edge using btree (start_id, kind_id) include (id, end_id);
+create index if not exists edge_end_id_kind_id_id_start_id_index on edge using btree (end_id, kind_id) include (id, start_id);
 
 -- Path composite type
 do
@@ -365,6 +363,9 @@ create or replace function public.create_unidirectional_pathspace_tables()
   returns void as
 $$
 begin
+  -- The path column is intentionally not a primary key. Deduplication is handled by DISTINCT ON clauses in
+  -- the harness functions. Omitting the PK on the variable-length int8[] array avoids maintaining a B-tree
+  -- whose keys grow with traversal depth.
   create temporary table forward_front
   (
     root_id   int8   not null,
@@ -372,8 +373,7 @@ begin
     depth     int4   not null,
     satisfied bool,
     is_cycle  bool   not null,
-    path      int8[] not null,
-    primary key (path)
+    path      int8[] not null
   ) on commit drop;
 
   create temporary table next_front
@@ -383,8 +383,7 @@ begin
     depth     int4   not null,
     satisfied bool,
     is_cycle  bool   not null,
-    path      int8[] not null,
-    primary key (path)
+    path      int8[] not null
   ) on commit drop;
 
   create index forward_front_next_id_index on forward_front using btree (next_id);
diff --git a/drivers/pg/statements.go b/drivers/pg/statements.go
index a0fdcee..c1b39f6 100644
--- a/drivers/pg/statements.go
+++ b/drivers/pg/statements.go
@@ -12,7 +12,7 @@ const (
 	//	     Azure post-processing. This was done because Azure post will submit the same creation request hundreds of
 	// 		 times for the same edge. In PostgreSQL this results in a constraint violation. For now this is best-effort
 	//		 until Azure post-processing can be refactored.
-	createEdgeBatchStatement  = `insert into edge as e (graph_id, start_id, end_id, kind_id, properties) select $1, unnest($2::int8[]), unnest($3::int8[]), unnest($4::int2[]), unnest($5::jsonb[]) on conflict (graph_id, start_id, end_id, kind_id) do update set properties = e.properties || excluded.properties;`
+	createEdgeBatchStatement  = `insert into edge as e (graph_id, start_id, end_id, kind_id, properties) select $1, unnest($2::int8[]), unnest($3::int8[]), unnest($4::int2[]), unnest($5::jsonb[]) on conflict (start_id, end_id, kind_id, graph_id) do update set properties = e.properties || excluded.properties;`
 	deleteEdgeWithIDStatement = `delete from edge as e where e.id = any($1)`
 
 	edgePropertySetOnlyStatement      = `update edge set properties = properties || $1::jsonb where edge.id = $2`