
Commit 57d525a

Add batch mode processing for entities
1 parent 587bf09 commit 57d525a

10 files changed

Lines changed: 271 additions & 164 deletions


import-automation/executor/app/configs.py

Lines changed: 2 additions & 0 deletions
@@ -169,6 +169,8 @@ class ExecutorConfig:
     disable_email_notifications: bool = True
     # Skip uploading the data to GCS (for local testing).
     skip_gcs_upload: bool = False
+    # Skip uploading input files to GCS.
+    skip_input_upload: bool = False
     # Maximum time a blocking call to the importer to
     # perform an import can take in seconds.
     importer_import_timeout: float = 20 * 60
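A minimal sketch of how the new flag might gate uploads. The helper and call site below are hypothetical; the real wiring lives in import_executor.py, whose diff is not rendered here.

from dataclasses import dataclass

@dataclass
class ExecutorConfig:
    skip_gcs_upload: bool = False    # skip all data uploads (local testing)
    skip_input_upload: bool = False  # skip only input-file uploads

def maybe_upload_inputs(config: ExecutorConfig, input_files: list) -> None:
    # Hypothetical gate: either flag suppresses the input upload.
    if config.skip_gcs_upload or config.skip_input_upload:
        print(f'Skipping upload of {len(input_files)} input file(s)')
        return
    for path in input_files:
        print(f'Uploading {path}')  # a real executor would call the GCS client here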

import-automation/executor/app/executor/import_executor.py

Lines changed: 131 additions & 127 deletions
Large diffs are not rendered by default.

import-automation/workflow/import-automation-workflow.yaml

Lines changed: 11 additions & 0 deletions
@@ -21,6 +21,7 @@ main:
               memory: 32768
               disk: 100
         - resources: ${default(map.get(args, "resources"), defaultResources)}
+        - runIngestion: ${default(map.get(args, "runIngestion"), false)}
     - runImportJob:
         try:
           call: googleapis.batch.v1.projects.locations.jobs.create
@@ -99,6 +100,16 @@ main:
             override: false
             comment: '${"import-workflow:" + sys.get_env("GOOGLE_CLOUD_WORKFLOW_EXECUTION_ID")}'
         result: functionResponse
+    - runIngestion:
+        switch:
+          - condition: ${runIngestion}
+            steps:
+              - runSpannerIngestion:
+                  call: googleapis.workflowexecutions.v1.projects.locations.workflows.executions.create
+                  args:
+                    parent: ${"projects/" + projectId + "/locations/" + region + "/workflows/spanner-ingestion-workflow"}
+                    body:
+                      argument: ${json.encode_to_string({"importList": [text.split(importName, ":")[1]]})}
     - returnResult:
         return:
           jobId: ${jobId}
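The runIngestion step splits the workflow's importName on ":" and forwards the second token to spanner-ingestion-workflow. A small Python sketch of the same argument construction; the example import name is an assumption based on the manifest further below.

import json

# Mirrors: json.encode_to_string({"importList": [text.split(importName, ":")[1]]})
import_name = "scripts/entities:Schema"  # assumed "<directory>:<import>" form
argument = json.dumps({"importList": [import_name.split(":")[1]]})
print(argument)  # {"importList": ["Schema"]}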

import-automation/workflow/import-helper/import_helper.py

Lines changed: 30 additions & 0 deletions
@@ -31,6 +31,8 @@
 GCS_BUCKET_ID = os.environ.get('GCS_BUCKET_ID')
 INGESTION_HELPER_URL = f"https://{LOCATION}-{PROJECT_ID}.cloudfunctions.net/spanner-ingestion-helper"
 WORKFLOW_ID = 'spanner-ingestion-workflow'
+IMPORT_AUTOMATION_WORKFLOW_ID = 'import-automation-workflow'
+

 def invoke_ingestion_workflow(import_name: str):
     """Triggers the graph ingestion workflows.
@@ -51,6 +53,34 @@ def invoke_ingestion_workflow(import_name: str):
     )


+def invoke_import_workflow(import_name: str,
+                           latest_version: str,
+                           run_ingestion: bool = False):
+    """Triggers the import automation workflow.
+
+    Args:
+        import_name: The name of the import.
+        latest_version: The version of the import.
+        run_ingestion: Whether to run the ingestion workflow after the import.
+    """
+    import_config = {"user_script_args": [f"--version={latest_version}"]}
+    workflow_args = {
+        "importName": import_name,
+        "importConfig": json.dumps(import_config),
+        "runIngestion": run_ingestion
+    }
+
+    logging.info(f"Invoking {IMPORT_AUTOMATION_WORKFLOW_ID} for {import_name}")
+    execution_client = executions_v1.ExecutionsClient()
+    parent = f"projects/{PROJECT_ID}/locations/{LOCATION}/workflows/{IMPORT_AUTOMATION_WORKFLOW_ID}"
+    execution_req = executions_v1.Execution(argument=json.dumps(workflow_args))
+    response = execution_client.create_execution(parent=parent,
+                                                 execution=execution_req)
+    logging.info(
+        f"Triggered workflow {IMPORT_AUTOMATION_WORKFLOW_ID} for {import_name}. Execution ID: {response.name}"
+    )
+
+
 def update_import_status(import_name,
                          import_status,
                          import_version,
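A hypothetical call to the new helper; the import name and version values are illustrative.

invoke_import_workflow(import_name="scripts/entities:Schema",
                       latest_version="2025-08-01",
                       run_ingestion=True)
# Creates a workflow execution whose argument is equivalent to:
# {"importName": "scripts/entities:Schema",
#  "importConfig": "{\"user_script_args\": [\"--version=2025-08-01\"]}",
#  "runIngestion": true}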

import-automation/workflow/import-helper/main.py

Lines changed: 5 additions & 5 deletions
@@ -47,12 +47,12 @@ def handle_feed_event(request):
     graph_path = attributes.get('graph_path', "/**/*.mcf*")
     job_id = attributes.get('feed_name', 'cda_feed')
     cron_schedule = attributes.get('cron_schedule', '')
-    post_process = attributes.get('post_process', '')
+    run_ingestion = 'Schema' in import_name or 'Place' in import_name
+
     # Update import status in spanner
     helper.update_import_status(import_name, import_status, latest_version,
-                                graph_path, job_id, cron_schedule)
+                                graph_path, job_id, cron_schedule)

-    # Invoke ingestion workflow to trigger dataflow job
-    if post_process == 'spanner_ingestion_workflow':
-        helper.invoke_ingestion_workflow(import_name)
+    # Invoke import job and ingestion workflow to trigger dataflow job
+    helper.invoke_import_workflow(import_name, latest_version, run_ingestion)
     return 'OK', 200
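The substring heuristic enables ingestion only for schema and place imports. A quick check, with assumed import names:

for name in ("scripts/entities:Schema", "scripts/entities:Place",
             "scripts/entities:Provenance"):
    print(name, 'Schema' in name or 'Place' in name)
# scripts/entities:Schema True
# scripts/entities:Place True
# scripts/entities:Provenance False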

scripts/entities/download.sh

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+#!/bin/bash
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+# Parse arguments
+for i in "$@"; do
+  case $i in
+    --entity=*)
+      ENTITY="${i#*=}"
+      shift
+      ;;
+    --version=*)
+      VERSION="${i#*=}"
+      shift
+      ;;
+    *)
+      # Skip unknown options
+      ;;
+  esac
+done
+
+BUCKET_NAME="datcom-prod-imports"
+DIR_NAME=$(basename "$(pwd)")
+GCS_FOLDER_PREFIX="scripts/${DIR_NAME}/${ENTITY}/${VERSION}"
+GCS_PATH="gs://${BUCKET_NAME}/${GCS_FOLDER_PREFIX}"
+
+echo "Downloading import ${ENTITY} for version ${VERSION} from ${GCS_PATH} to $(pwd)"
+mkdir -p "${ENTITY}"
+gcloud storage cp -r "${GCS_PATH}" "${ENTITY}/"
+echo "Successfully downloaded ${ENTITY} version ${VERSION}"

scripts/entities/manifest.json

Lines changed: 40 additions & 6 deletions
@@ -8,15 +8,21 @@
       "provenance_url": "https://datacommons.org",
       "provenance_description": "Schema nodes for Data Commons",
       "scripts": [
-        "process.py --entity=Schema"
+        "./download.sh --entity=Schema",
+        "./process.py --entity=Schema"
+      ],
+      "import_inputs": [
+        {
+          "node_mcf": "**/*.mcf"
+        }
       ],
-      "import_inputs": [],
       "source_files": [],
       "cron_schedule": "15 3 * * *",
       "config_override": {
-        "invoke_import_validation": false,
-        "invoke_import_tool": false,
-        "invoke_differ_tool": false
+        "invoke_import_validation": true,
+        "invoke_import_tool": true,
+        "invoke_differ_tool": true,
+        "skip_input_upload": true
       }
     },
     {
@@ -27,11 +33,39 @@
       "provenance_url": "https://datacommons.org",
       "provenance_description": "Place nodes for Data Commons",
       "scripts": [
-        "process.py --entity=Place"
+        "./download.sh --entity=Place"
       ],
       "import_inputs": [],
       "source_files": [],
       "cron_schedule": "15 3 * * 1",
+      "resource_limits": {
+        "cpu": 8,
+        "memory": 128,
+        "disk": 100
+      },
+      "config_override": {
+        "invoke_import_validation": false,
+        "invoke_import_tool": true,
+        "invoke_differ_tool": false
+      }
+    },
+    {
+      "import_name": "Provenance",
+      "curator_emails": [
+        "support@datacommons.org"
+      ],
+      "provenance_url": "https://datacommons.org",
+      "provenance_description": "Provenance nodes for Data Commons",
+      "scripts": [
+        "./download.sh --entity=Provenance"
+      ],
+      "import_inputs": [
+        {
+          "node_mcf": "**/*.mcf"
+        }
+      ],
+      "source_files": [],
+      "cron_schedule": "15 3 * * 1",
       "config_override": {
         "invoke_import_validation": false,
         "invoke_import_tool": false,

scripts/entities/process.py

Lines changed: 6 additions & 24 deletions
@@ -25,35 +25,17 @@

 FLAGS = flags.FLAGS
 flags.DEFINE_string("entity", "Schema", "Entity type (Schema/Place).")
+flags.DEFINE_string("version", "", "Import version.")

-BUCKET_NAME = 'datcom-prod-imports'
-FILE_NAME = 'staging_version.txt'
-
-
-def process(entity_type: str):
-    # Ensure the import data is available in GCS.
-    current_date = datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d")
-    logging.info(f'Checking import {entity_type} for date {current_date}')
-    file_path = os.path.join('scripts', os.path.basename(os.getcwd()),
-                             entity_type, FILE_NAME)
-    storage_client = storage.Client()
-    bucket = storage_client.bucket(BUCKET_NAME)
-    blob = bucket.blob(file_path)
-    version = blob.download_as_text()
-    if version == current_date:
-        logging.info(
-            f'Successfully validated import {entity_type} for date {current_date}'
-        )
-        return 0
-    else:
-        raise RuntimeError(
-            f'{entity_type} data not present in GCS bucket {BUCKET_NAME} for date {current_date}'
-        )
+
+def process(entity_type: str, version: str):
+    logging.info(f'Processing import {entity_type} for version {version}')
+    # TODO: add processing logic


 def main(_):
     """Runs the code."""
-    process(FLAGS.entity)
+    process(FLAGS.entity, FLAGS.version)


 if __name__ == "__main__":

tools/import_differ/import_differ.py

Lines changed: 2 additions & 1 deletion
@@ -347,7 +347,7 @@ def run_dataflow_job(self, project: str, job: str, current_data: str,
         )
         return status

-    def run_differ(self):
+    def run_differ(self) -> dict:
         os.makedirs(self.output_path, exist_ok=True)
         tmp_path = os.path.join(self.output_path, self.job_name)
         os.makedirs(tmp_path, exist_ok=True)
@@ -424,6 +424,7 @@ def run_differ(self):
         differ_utils.write_csv_data(obs_diff_samples, self.output_path,
                                     'obs_diff_samples.csv', tmp_path)
         logging.info(f'Differ output written to {self.output_path}')
+        return differ_summary


 def main(_):
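With run_differ now returning the summary, callers can act on the result directly. A sketch; the summary's structure is not shown in this diff, so the emptiness check is an assumption.

# `differ` stands for an already-constructed differ instance from this module.
differ_summary = differ.run_differ()
if differ_summary:  # assumed: a non-empty summary means differences were found
    print(f'Differ reported changes: {differ_summary}')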

util/file_util.py

Lines changed: 1 addition & 1 deletion
@@ -348,7 +348,7 @@ def file_get_matching(filepat: Union[str, list]) -> list:
     for file in input_files:
         if file_is_local(file):
             # Expand local file pattern.
-            for f in glob.glob(file):
+            for f in glob.glob(file, recursive=True):
                 files.add(f)
         elif file_is_gcs(file):
             bucket = file_get_gcs_bucket(file)
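This fix matters because without recursive=True, Python's glob treats "**" like a single "*" and never descends into subdirectories, so manifest patterns such as "**/*.mcf" would miss nested files:

import glob

pattern = 'Schema/**/*.mcf'                 # e.g. the manifest's node_mcf pattern
shallow = glob.glob(pattern)                # '**' behaves like '*': one level only
deep = glob.glob(pattern, recursive=True)   # walks every subdirectory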
