Fixes for review comments

vish-cs · vish-cs · commit 40f1d63308af · 2026-04-15T07:30:10.000Z
diff --git a/import-automation/executor/app/executor/import_executor.py b/import-automation/executor/app/executor/import_executor.py
@@ -555,7 +555,8 @@ def _invoke_import_differ(self, genmcf_output_path: str,
                               absolute_import_dir: str) -> Tuple[str, str]:
         """Invokes the differ tool to compare current data with previous data."""
         current_data_path = os.path.join(genmcf_output_path, '*.mcf')
-        previous_data_path = latest_version + f'/{import_prefix}/genmcf/*.mcf'
+        previous_data_path = os.path.join(latest_version, import_prefix,
+                                          'genmcf', '*.mcf')
         diff_found = True
         # TODO: remove fallback logic once all imports move to new path.
         if not file_util.file_get_matching(previous_data_path):
@@ -715,17 +716,7 @@ def _invoke_import_validation(self, repo_dir: str, relative_import_dir: str,
             import_summary.import_stats.get('validation_execution_time', 0),
             import_summary.import_stats.get('validation_data_size',
                                             0), validation_message)
-        if not self.config.ignore_validation_status and not validation_status:
-            logging.error(
-                "Marking import as VALIDATION due to validation failure.")
-            import_summary.status = ImportStatus.VALIDATION
-        elif not differ_status:
-            logging.info("Marking import as SKIP due to no data diff.")
-            import_summary.status = ImportStatus.SKIP
-        else:
-            import_summary.status = ImportStatus.STAGING
-
-        return validation_status
+        return validation_status, differ_status
 
     def _get_validation_message(
             self, validation_results: List[ValidationResult]) -> str:
@@ -916,9 +907,10 @@ def _import_one_helper(
                     import_summary=import_summary)
 
             validation_status = True
+            differ_status = True
             if self.config.invoke_import_validation:
                 logging.info("Invoking import validations")
-                validation_status = self._invoke_import_validation(
+                validation_status, differ_status = self._invoke_import_validation(
                     repo_dir=repo_dir,
                     relative_import_dir=relative_import_dir,
                     absolute_import_dir=absolute_import_dir,
@@ -932,6 +924,16 @@ def _import_one_helper(
                 logging.info(
                     'Skipping import validations as per import config.')
 
+            if not self.config.ignore_validation_status and not validation_status:
+                logging.error(
+                    "Marking import as VALIDATION due to validation failure.")
+                import_summary.status = ImportStatus.VALIDATION
+            elif not differ_status:
+                logging.info("Marking import as SKIP due to no data diff.")
+                import_summary.status = ImportStatus.SKIP
+            else:
+                import_summary.status = ImportStatus.STAGING
+
             import_summary.execution_time = int(time.time() - start_time)
             import_summary.data_volume = int(
                 import_summary.import_stats.get('source_data_size', 0) +
@@ -987,6 +989,13 @@ def _upload_import_inputs(self, import_dir: str, output_dir: str,
         import_inputs: Specification of the import as a dict.
 
     """
+        # Copy manifest file
+        manifest_file = os.path.join(import_dir, 'manifest.json')
+        dest = f'{output_dir}/{version}/{os.path.basename(manifest_file)}'
+        self._upload_file_helper(
+            src=manifest_file,
+            dest=dest,
+        )
         import_inputs = import_spec.get('import_inputs', [])
         errors = []
         data_size = 0
@@ -996,7 +1005,7 @@ def _upload_import_inputs(self, import_dir: str, output_dir: str,
             import_files = self._get_import_input_files(import_input,
                                                         import_dir)
             for file in import_files:
-                dest = f'{output_dir}/{version}/input{input_index}/{os.path.basename(file)}'
+                dest = f'{output_dir}/{version}/{os.path.basename(file)}'
                 data_size += os.path.getsize(file)
                 if not self.config.skip_input_upload:
                     self._upload_file_helper(
@@ -1012,11 +1021,10 @@ def _upload_import_inputs(self, import_dir: str, output_dir: str,
         for file in source_files:
             dest = f'{output_dir}/{version}/source_files/{os.path.relpath(file, import_dir)}'
             data_size += os.path.getsize(file)
-            if not self.config.skip_input_upload:
-                self._upload_file_helper(
-                    src=file,
-                    dest=dest,
-                )
+            self._upload_file_helper(
+                src=file,
+                dest=dest,
+            )
 
         import_summary.import_stats['source_data_size'] = data_size
         if errors:
diff --git a/import-automation/executor/cloudbuild.yaml b/import-automation/executor/cloudbuild.yaml
@@ -46,7 +46,6 @@ steps:
         python import_test.py
     env:
       - 'PROJECT_ID=$PROJECT_ID'
-      - 'LOCATION=$LOCATION'
       - 'GCS_BUCKET=${_GCS_BUCKET}'
       - 'IMAGE_URI=${_DOCKER_IMAGE}:${COMMIT_SHA}'
     dir: 'import-automation/executor'
diff --git a/import-automation/workflow/import-automation-workflow.yaml b/import-automation/workflow/import-automation-workflow.yaml
@@ -22,6 +22,9 @@ main:
               disk: 100
           - resources: ${default(map.get(args, "resources"), defaultResources)}
           - runIngestion: ${default(map.get(args, "runIngestion"), false)}
+          - ingestionArgs:
+              importList:
+                - ${text.split(importName, ":")[1]}
     - runImportJob:
         try:
           call: googleapis.batch.v1.projects.locations.jobs.create
@@ -109,7 +112,7 @@ main:
                   args:
                     parent: ${"projects/" + projectId + "/locations/" + region + "/workflows/spanner-ingestion-workflow"}
                     body:
-                      argument: ${json.encode_to_string({"importList": [text.split(importName, ":")[1]]})}
+                      argument: ${json.encode_to_string(ingestionArgs)}
     - returnResult:
         return:
           jobId: ${jobId}
diff --git a/import-automation/workflow/import-helper/main.py b/import-automation/workflow/import-helper/main.py
@@ -40,18 +40,10 @@ def handle_feed_event(request):
         return 'OK', 200
 
     import_name = attributes.get('import_name')
-    import_status = 'STAGING'
     latest_version = attributes.get(
         'import_version',
         datetime.now(timezone.utc).strftime("%Y-%m-%d"))
-    graph_path = attributes.get('graph_path', "/**/*.mcf*")
-    job_id = attributes.get('feed_name', 'cda_feed')
-    cron_schedule = attributes.get('cron_schedule', '')
-    run_ingestion = 'Schema' in import_name or 'Place' in import_name
-
-    # Update import status in spanner
-    helper.update_import_status(import_name, import_status, latest_version,
-                                graph_path, job_id, cron_schedule)
+    run_ingestion = True 
 
     # Invoke import job and ingestion workflow to trigger dataflow job
     helper.invoke_import_workflow(import_name, latest_version, run_ingestion)
diff --git a/scripts/entities/download.sh b/scripts/entities/download.sh
@@ -20,11 +20,9 @@ for i in "$@"; do
   case $i in
     --entity=*)
       ENTITY="${i#*=}"
-      shift
       ;;
     --version=*)
       VERSION="${i#*=}"
-      shift
       ;;
     *)
       # Skip unknown options
@@ -34,10 +32,21 @@ done
 
 BUCKET_NAME="datcom-prod-imports"
 DIR_NAME=$(basename "$(pwd)")
-GCS_FOLDER_PREFIX="scripts/${DIR_NAME}/${ENTITY}/${VERSION}"
-GCS_PATH="gs://${BUCKET_NAME}/${GCS_FOLDER_PREFIX}"
+GCS_FOLDER_PREFIX="scripts/${DIR_NAME}/${ENTITY}"
+GCS_PATH="gs://${BUCKET_NAME}/${GCS_FOLDER_PREFIX}/${VERSION}"
 
 echo "Downloading import ${ENTITY} for version ${VERSION} from ${GCS_PATH} to $(pwd)"
 mkdir -p "${ENTITY}"
 gcloud storage cp -r "${GCS_PATH}" "${ENTITY}/"
 echo "Successfully downloaded ${ENTITY} version ${VERSION}"
+
+# TODO: remove after scripts are checked in
+# Download scripts from GCS
+SCRIPTS_GCS_PATH="gs://${BUCKET_NAME}/scripts/${DIR_NAME}/process/*"
+SCRIPTS_LOCAL_PATH="../../import-automation/executor/scripts"
+echo "Downloading scripts from ${SCRIPTS_GCS_PATH} to ${SCRIPTS_LOCAL_PATH}"
+mkdir -p "${SCRIPTS_LOCAL_PATH}"
+gcloud storage cp -r "${SCRIPTS_GCS_PATH}" "${SCRIPTS_LOCAL_PATH}/"
+
+
+
diff --git a/scripts/entities/manifest.json b/scripts/entities/manifest.json
@@ -22,7 +22,7 @@
                 "invoke_import_validation": true,
                 "invoke_import_tool": true,
                 "invoke_differ_tool": true,
-                "skip_input_upload": true 
+                "skip_input_upload": true
             }
         },
         {
@@ -33,9 +33,14 @@
             "provenance_url": "https://datacommons.org",
             "provenance_description": "Place nodes for Data Commons",
             "scripts": [
-                "./download.sh --entity=Place"
+                "./download.sh --entity=Place",
+                "./process.py --entity=Place"
+            ],
+            "import_inputs": [
+                {
+                    "node_mcf": "**/*.mcf"
+                }
             ],
-            "import_inputs": [],
             "source_files": [],
             "cron_schedule": "15 3 * * 1",
             "resource_limits": {
@@ -45,8 +50,9 @@
             },
             "config_override": {
                 "invoke_import_validation": false,
-                "invoke_import_tool": true,
-                "invoke_differ_tool": false
+                "invoke_import_tool": false,
+                "invoke_differ_tool": false,
+                "skip_input_upload": true
             }
         },
         {
@@ -57,7 +63,8 @@
             "provenance_url": "https://datacommons.org",
             "provenance_description": "Provenance nodes for Data Commons",
             "scripts": [
-                "./download.sh --entity=Provenance"
+                "./download.sh --entity=Provenance",
+                "./process.py --entity=Provenance"
             ],
             "import_inputs": [
                 {
@@ -69,7 +76,38 @@
             "config_override": {
                 "invoke_import_validation": false,
                 "invoke_import_tool": false,
-                "invoke_differ_tool": false
+                "invoke_differ_tool": false,
+                "skip_input_upload": true
+            }
+        },
+        {
+            "import_name": "Event",
+            "curator_emails": [
+                "support@datacommons.org"
+            ],
+            "provenance_url": "https://datacommons.org",
+            "provenance_description": "Event nodes for Data Commons",
+            "scripts": [
+                "./download.sh --entity=Event",
+                "./process.py --entity=Event"
+            ],
+            "import_inputs": [
+                {
+                    "node_mcf": "**/*.mcf"
+                }
+            ],
+            "source_files": [],
+            "cron_schedule": "15 3 * * 1",
+            "resource_limits": {
+                "cpu": 8,
+                "memory": 128,
+                "disk": 100
+            },
+            "config_override": {
+                "invoke_import_validation": false,
+                "invoke_import_tool": false,
+                "invoke_differ_tool": false,
+                "skip_input_upload": true
             }
         }
     ]
diff --git a/scripts/entities/process.py b/scripts/entities/process.py
@@ -19,18 +19,37 @@
 from absl import app
 from absl import flags
 from absl import logging
-import datetime
-from google.cloud import storage
 import os
+import sys
+
+# Add the scripts directory to sys.path
+script_dir = os.path.abspath(
+    os.path.join(os.path.dirname(__file__), '..', '..', 'import-automation',
+                 'executor', 'scripts'))
+sys.path.append(script_dir)
+import generate_provisional_nodes
+import convert_dc_manifest
 
 FLAGS = flags.FLAGS
-flags.DEFINE_string("entity", "Schema", "Entity type (Schema/Place).")
+flags.DEFINE_string("entity", "", "Entity type (Schema/Place).")
 flags.DEFINE_string("version", "", "Import version.")
 
 
 def process(entity_type: str, version: str):
     logging.info(f'Processing import {entity_type} for version {version}')
-    # TODO: add processing logic
+    local_path = os.path.abspath(
+        os.path.join(os.path.dirname(__file__), entity_type, version))
+
+    if entity_type == 'Provenance':
+        # Convert the DC manifest files found under the Provenance data path
+        logging.info(f'Processing DC manifest files in {local_path}')
+        convert_dc_manifest.process_directory(local_path)
+
+    # Generate provisional nodes for the data under local_path
+    logging.info(
+        f'Generating provisional nodes for {entity_type} in {local_path}')
+    generate_provisional_nodes.generate_provisional_nodes(local_path)
+    return 0
 
 
 def main(_):
diff --git a/tools/import_differ/import_differ.py b/tools/import_differ/import_differ.py
@@ -368,6 +368,8 @@ def run_differ(self) -> dict:
             diff_path = os.path.join(self.output_path, 'schema-diff*')
             logging.info("Loading schema diff data from: %s", diff_path)
             schema_diff = differ_utils.load_csv_data(diff_path, tmp_path)
+            # TODO: populate summary for cloud mode
+            differ_summary = {}
         else:
             # Runs local Python differ.
             current_dir = os.path.join(tmp_path, 'current')