Skip to content

Commit d90d19d

Browse files
authored
[DCP] Adds Terraform deployment of ingestion workflow. (#27)
* Adds support for Workflow deployment, Terraform Trigger, and the GCS bucket with permissions for reading and writing for Dataflow.
* Also deploys the GCS bucket.
* Addresses Gemini comments, mostly deletion protection + permissions fixes.
* Round 2 review cleanup.
* Passes in the ingestion service account in the Dataflow trigger.
* Fixes IAM issue for the Dataflow worker.
1 parent 6368fb3 commit d90d19d

7 files changed

Lines changed: 166 additions & 1 deletion

File tree

infra/dcp/main.tf

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,13 @@ provider "google" {
1818
billing_project = var.billing_project_id != null ? var.billing_project_id : var.project_id
1919
}
2020

21+
provider "google-beta" {
22+
project = var.project_id
23+
region = var.region
24+
user_project_override = var.user_project_override
25+
billing_project = var.billing_project_id != null ? var.billing_project_id : var.project_id
26+
}
27+
2128
# Enable required APIs for both stacks
2229
resource "google_project_service" "apis" {
2330
for_each = toset(concat([
@@ -30,7 +37,10 @@ resource "google_project_service" "apis" {
3037
"vpcaccess.googleapis.com",
3138
"artifactregistry.googleapis.com",
3239
"compute.googleapis.com"
33-
], var.enable_dcp ? ["spanner.googleapis.com"] : []))
40+
], var.enable_dcp ? ["spanner.googleapis.com"] : [], var.dcp_deploy_data_ingestion_workflow ? [
41+
"workflows.googleapis.com",
42+
"workflowexecutions.googleapis.com"
43+
] : []))
3444

3545
service = each.key
3646
disable_on_destroy = false
@@ -71,6 +81,9 @@ module "dcp" {
7181
make_service_public = var.make_services_public
7282
deletion_protection = var.deletion_protection
7383

84+
deploy_data_ingestion_workflow = var.dcp_deploy_data_ingestion_workflow
85+
86+
7487
depends_on = [google_project_service.apis]
7588
}
7689

infra/dcp/modules/dcp/dataflow.tf

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
resource "google_workflows_workflow" "ingestion_orchestrator" {
2+
count = var.deploy_data_ingestion_workflow ? 1 : 0
3+
name = "${var.namespace}-ingestion-orchestrator"
4+
region = var.region
5+
description = "Triggers the Dataflow Flex Template Graph Ingestion Pipeline with runtime parameters"
6+
service_account = google_service_account.dcp_ingestion_runner[0].id
7+
deletion_protection = var.deletion_protection
8+
9+
source_contents = <<-EOF
10+
main:
11+
params: [input]
12+
steps:
13+
- init:
14+
assign:
15+
- project_id: '${var.project_id}'
16+
- launch_params:
17+
projectId: '$${project_id}'
18+
spannerInstanceId: '$${input.spannerInstanceId}'
19+
spannerDatabaseId: '$${input.spannerDatabaseId}'
20+
importList: '$${input.importList}'
21+
tempLocation: '$${input.tempLocation}'
22+
- run_flex_template:
23+
call: googleapis.dataflow.v1b3.projects.locations.flexTemplates.launch
24+
args:
25+
projectId: '$${project_id}'
26+
location: '$${input.region}'
27+
body:
28+
launchParameter:
29+
jobName: '$${"ingestion-job-" + string(int(sys.now()))}'
30+
containerSpecGcsPath: 'gs://datcom-templates/templates/flex/ingestion.json'
31+
parameters: '$${launch_params}'
32+
environment:
33+
serviceAccountEmail: '${google_service_account.dcp_ingestion_runner[0].email}'
34+
result: launch_result
35+
- return_result:
36+
return: '$${launch_result}'
37+
EOF
38+
}
39+
40+
# Automatically provision a GCS bucket for the customer's custom graph ingestion files, if enabled
41+
resource "google_storage_bucket" "data_ingestion_bucket" {
42+
count = var.deploy_data_ingestion_workflow && var.create_ingestion_bucket ? 1 : 0
43+
name = "${var.namespace}-ingestion-bucket-${var.project_id}"
44+
location = var.region
45+
uniform_bucket_level_access = true
46+
force_destroy = !var.deletion_protection
47+
}

infra/dcp/modules/dcp/iam.tf

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,61 @@ resource "google_cloud_run_service_iam_binding" "public_invoker" {
2121
"allUsers"
2222
]
2323
}
24+
25+
# Dedicated Service Account for running the Dataflow Ingestion pipeline
26+
resource "google_service_account" "dcp_ingestion_runner" {
27+
count = var.deploy_data_ingestion_workflow ? 1 : 0
28+
account_id = "${local.name_prefix}dcp-ingestion-sa"
29+
display_name = "Data Commons Platform Ingestion Runner"
30+
}
31+
32+
# Grant Spanner Database User access to the Ingestion runner
33+
resource "google_project_iam_member" "ingestion_spanner_user" {
34+
count = var.deploy_data_ingestion_workflow ? 1 : 0
35+
project = var.project_id
36+
role = "roles/spanner.databaseUser"
37+
member = "serviceAccount:${google_service_account.dcp_ingestion_runner[0].email}"
38+
}
39+
40+
# Grant Dataflow orchestration and Storage permissions exclusively to the new Ingestion runner
41+
resource "google_project_iam_member" "dataflow_admin" {
42+
count = var.deploy_data_ingestion_workflow ? 1 : 0
43+
project = var.project_id
44+
role = "roles/dataflow.admin"
45+
member = "serviceAccount:${google_service_account.dcp_ingestion_runner[0].email}"
46+
}
47+
48+
resource "google_project_iam_member" "dataflow_worker" {
49+
count = var.deploy_data_ingestion_workflow ? 1 : 0
50+
project = var.project_id
51+
role = "roles/dataflow.worker"
52+
member = "serviceAccount:${google_service_account.dcp_ingestion_runner[0].email}"
53+
}
54+
55+
56+
resource "google_service_account_iam_member" "service_account_user" {
57+
count = var.deploy_data_ingestion_workflow ? 1 : 0
58+
service_account_id = google_service_account.dcp_ingestion_runner[0].name
59+
role = "roles/iam.serviceAccountUser"
60+
member = "serviceAccount:${google_service_account.dcp_ingestion_runner[0].email}"
61+
}
62+
63+
# Fetch project number to reference the Workflows background Service Agent
64+
data "google_project" "project" {
65+
project_id = var.project_id
66+
}
67+
68+
resource "google_service_account_iam_member" "workflows_token_creator" {
69+
count = var.deploy_data_ingestion_workflow ? 1 : 0
70+
service_account_id = google_service_account.dcp_ingestion_runner[0].name
71+
role = "roles/iam.serviceAccountTokenCreator"
72+
member = "serviceAccount:service-${data.google_project.project.number}@gcp-sa-workflows.iam.gserviceaccount.com"
73+
}
74+
75+
# Bind Object Admin access to either the newly created bucket or an explicitly reused external one
76+
resource "google_storage_bucket_iam_member" "dynamic_ingestion_bucket_access" {
77+
count = var.deploy_data_ingestion_workflow ? 1 : 0
78+
bucket = var.create_ingestion_bucket ? google_storage_bucket.data_ingestion_bucket[0].name : var.external_ingestion_bucket_name
79+
role = "roles/storage.objectAdmin"
80+
member = "serviceAccount:${google_service_account.dcp_ingestion_runner[0].email}"
81+
}

infra/dcp/modules/dcp/outputs.tf

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,12 @@ output "spanner_instance_id" {
1313
output "spanner_database_id" {
1414
value = var.create_spanner_db ? (var.spanner_database_id != "" ? "${local.name_prefix}${var.spanner_database_id}" : "${local.name_prefix}dcp-db") : var.spanner_database_id
1515
}
16+
17+
output "ingestion_orchestrator_id" {
18+
description = "Fully qualified ID of the Cloud Workflows ingestion orchestrator"
19+
value = var.deploy_data_ingestion_workflow ? google_workflows_workflow.ingestion_orchestrator[0].id : null
20+
}
21+
output "data_ingestion_bucket_url" {
22+
description = "GCS path to the dynamically provisioned bucket for customer custom MCF datasets"
23+
value = var.deploy_data_ingestion_workflow && var.create_ingestion_bucket ? google_storage_bucket.data_ingestion_bucket[0].url : null
24+
}

infra/dcp/modules/dcp/variables.tf

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,3 +92,23 @@ variable "make_service_public" {
9292
description = "Whether to allow unauthenticated invocations to the service"
9393
type = bool
9494
}
95+
96+
# --- Ingestion Pipeline Config ---
97+
variable "deploy_data_ingestion_workflow" {
98+
description = "Deploy the complete end-to-end Data Commons Ingestion workflow stack"
99+
type = bool
100+
}
101+
102+
103+
104+
variable "create_ingestion_bucket" {
105+
description = "Controls whether Terraform automatically provisions a dedicated staging GCS bucket for uploading graph dataset (.mcf) files"
106+
type = bool
107+
default = true
108+
}
109+
110+
variable "external_ingestion_bucket_name" {
111+
description = "Specifies an existing, external GCS bucket name to use for datasets if automatic provisioning is disabled"
112+
type = string
113+
default = ""
114+
}

infra/dcp/outputs.tf

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,12 @@ output "cdc_service_url" {
1515
output "cdc_mysql_instance_connection_name" {
1616
value = var.enable_cdc ? module.cdc[0].mysql_instance_connection_name : null
1717
}
18+
19+
output "dcp_ingestion_orchestrator_id" {
20+
description = "ID of the ingestion Cloud Workflows orchestrator"
21+
value = var.enable_dcp && var.dcp_deploy_data_ingestion_workflow ? module.dcp[0].ingestion_orchestrator_id : null
22+
}
23+
output "dcp_data_ingestion_bucket_url" {
24+
description = "GCS URL pointing directly to the dynamically provisioned bucket for your input graph MCF files"
25+
value = var.enable_dcp && var.dcp_deploy_data_ingestion_workflow ? module.dcp[0].data_ingestion_bucket_url : null
26+
}

infra/dcp/variables.tf

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -355,3 +355,12 @@ variable "cdc_redis_replica_count" {
355355
type = number
356356
default = 1
357357
}
358+
359+
# --- Ingestion Pipeline Config ---
360+
variable "dcp_deploy_data_ingestion_workflow" {
361+
description = "Deploy the complete end-to-end Data Commons Ingestion workflow stack"
362+
type = bool
363+
default = false
364+
}
365+
366+

0 commit comments

Comments (0)