datacommonsorg
diff --git a/import-automation/executor/app/executor/import_executor.py b/import-automation/executor/app/executor/import_executor.py
Lines changed: 42 additions & 41 deletions
diff --git a/import-automation/executor/cloudbuild.yaml b/import-automation/executor/cloudbuild.yaml
Lines changed: 0 additions & 1 deletion
diff --git a/import-automation/workflow/import-automation-workflow.yaml b/import-automation/workflow/import-automation-workflow.yaml
Lines changed: 11 additions & 0 deletions
diff --git a/import-automation/workflow/import-helper/import_helper.py b/import-automation/workflow/import-helper/import_helper.py
Lines changed: 30 additions & 0 deletions
diff --git a/import-automation/workflow/import-helper/main.py b/import-automation/workflow/import-helper/main.py
Lines changed: 5 additions & 5 deletions
diff --git a/scripts/entities/manifest.json b/scripts/entities/manifest.json
Lines changed: 37 additions & 5 deletions
@@ -422,22 +422,19 @@ def _get_import_input_files(self, import_input, absolute_import_dir):
             for pattern in patterns:
                 if pattern:
                     files = glob.glob(os.path.join(absolute_import_dir,
-                                                   pattern))
+                                                   pattern),
+                                      recursive=True)
                     if not files and not glob.has_magic(pattern):
                         errors.append(
                             f'No matching files for {file_type}:{pattern}')
                     else:
                         input_files.extend(sorted(files))
-        import_prefix = ''
-        if input_files:
-            import_prefix = os.path.splitext(os.path.basename(
-                input_files[0]))[0]
         if errors:
             logging.fatal(
                 f'Missing import files in {absolute_import_dir}: {errors}')
             raise RuntimeError(
                 'Import job failed due to missing user script output files.')
-        return input_files, import_prefix
+        return input_files
 
     @log_function_call
     def _invoke_import_tool(self, absolute_import_dir: str,
@@ -453,17 +450,12 @@ def _invoke_import_tool(self, absolute_import_dir: str,
         import_stage = ImportStage.GENMCF
         import_name = import_spec['import_name']
         import_inputs = import_spec.get('import_inputs', [])
-        import_prefix_list = []
         input_index = -1
         for import_input in import_inputs:
             input_index += 1
-            input_files, import_prefix = self._get_import_input_files(
-                import_input, absolute_import_dir)
-            import_prefix_list.append(import_prefix)
-            if not import_prefix:
-                logging.error(
-                    'Skipping genmcf due to missing import input spec.')
-                continue
+            input_files = self._get_import_input_files(import_input,
+                                                       absolute_import_dir)
+            import_prefix = f'input{input_index}'
             output_path = os.path.join(absolute_import_dir, import_name,
                                        version, import_prefix, 'genmcf')
 
@@ -521,7 +513,6 @@ def _invoke_import_tool(self, absolute_import_dir: str,
             import_name, import_stage, ImportStatus.SUCCESS,
             import_summary.import_stats.get('genmcf_execution_time', 0),
             import_summary.import_stats.get('mcf_data_size', 0))
-        return import_prefix_list
 
     def _get_validation_config_file(self, repo_dir: str,
                                     absolute_import_dir: str, import_spec: dict,
@@ -559,7 +550,7 @@ def _get_validation_config_file(self, repo_dir: str,
     @log_function_call
     def _invoke_import_validation(self, repo_dir: str, relative_import_dir: str,
                                   absolute_import_dir: str, import_spec: dict,
-                                  version: str, import_prefix_list: list,
+                                  version: str,
                                   import_summary: ImportStatusSummary) -> bool:
         """ 
         Performs validations on import data.
@@ -577,12 +568,11 @@ def _invoke_import_validation(self, repo_dir: str, relative_import_dir: str,
         differ_job_name = 'differ'
 
         # Trigger validations for each tmcf/csv under import_inputs.
+        import_inputs = import_spec.get('import_inputs', [])
         input_index = -1
-        for import_prefix in import_prefix_list:
+        for import_input in import_inputs:
             input_index += 1
-            if not import_prefix:
-                logging.error('Skipping validation due to missing import spec.')
-                continue
+            import_prefix = f'input{input_index}'
 
             genmcf_output_path = os.path.join(absolute_import_dir, import_name,
                                               version, import_prefix, 'genmcf')
@@ -593,17 +583,20 @@ def _invoke_import_validation(self, repo_dir: str, relative_import_dir: str,
             current_data_path = os.path.join(genmcf_output_path, '*.mcf')
             previous_data_path = latest_version + f'/{import_prefix}/genmcf/*.mcf'
             # TODO: remove fallback logic once all imports move to new path.
-            if latest_version and not file_util.file_get_matching(
-                    previous_data_path):
+            if not file_util.file_get_matching(previous_data_path):
+                input_files = self._get_import_input_files(
+                    import_input, absolute_import_dir)
+                import_prefix = os.path.splitext(
+                    os.path.basename(input_files[0]))[0]
+                previous_data_path = latest_version + f'/{import_prefix}/genmcf/*.mcf'
+            if not file_util.file_get_matching(previous_data_path):
                 previous_data_path = latest_version + f'/{import_prefix}/validation/*.mcf'
             # END
             summary_stats = os.path.join(genmcf_output_path,
                                          'summary_report.csv')
             report_json = os.path.join(genmcf_output_path, 'report.json')
             validation_output_file = os.path.join(validation_output_path,
                                                   'validation_output.csv')
-            differ_output = os.path.join(validation_output_path,
-                                         'obs_diff_summary.csv')
 
             # Invoke differ and validation scripts.
             differ_output_file = ''
@@ -620,7 +613,7 @@ def _invoke_import_validation(self, repo_dir: str, relative_import_dir: str,
                                       job_name=differ_job_name,
                                       file_format='mcf',
                                       runner_mode='local')
-                differ.run_differ()
+                differ_summary = differ.run_differ()
                 log_metric(
                     AUTO_IMPORT_JOB_STAGE, "INFO",
                     f"Import: {import_name}, differ for {import_prefix} {latest_version} vs {version}",
@@ -633,6 +626,11 @@ def _invoke_import_validation(self, repo_dir: str, relative_import_dir: str,
                         "current_version": version
                     })
                 differ_output_file = validation_output_path
+                if differ_summary.get('obs_diff_size',
+                                      '0') == 0 and differ_summary.get(
+                                          'schema_diff_size', '0') == 0:
+                    import_summary.status = ImportStatus.SKIP
+                    logging.info("Marking import as SKIP due to empty diff.")
             else:
                 differ_output_file = ''
                 logging.error(
@@ -693,6 +691,13 @@ def _invoke_import_validation(self, repo_dir: str, relative_import_dir: str,
             import_summary.import_stats.get('validation_execution_time', 0),
             import_summary.import_stats.get('validation_data_size',
                                             0), validation_message)
+        if self.config.ignore_validation_status or validation_status:
+            import_summary.status = ImportStatus.STAGING
+        else:
+            logging.error(
+                "Marking import as VALIDATION due to validation failure.")
+            import_summary.status = ImportStatus.VALIDATION
+
         return validation_status
 
     def _get_validation_message(
@@ -853,11 +858,13 @@ def _import_one_helper(
                 repo_dir, 'import-automation', 'executor',
                 self.config.requirements_filename)
             timer = Timer()
-            interpreter_path, process = _create_venv(
-                (central_requirements_path, requirements_path),
-                tmpdir,
-                timeout=self.config.venv_create_timeout,
-            )
+            interpreter_path = sys.executable
+            process = subprocess.CompletedProcess(args=[], returncode=0)
+            # interpreter_path, process = _create_venv(
+            #     [requirements_path],
+            #     tmpdir,
+            #     timeout=self.config.venv_create_timeout,
+            # )
 
             _log_process(process=process,
                          import_name=import_name,
@@ -878,7 +885,7 @@ def _import_one_helper(
 
             if self.config.invoke_import_tool:
                 logging.info("Invoking import tool genmcf")
-                import_prefix_list = self._invoke_import_tool(
+                self._invoke_import_tool(
                     absolute_import_dir=absolute_import_dir,
                     relative_import_dir=relative_import_dir,
                     version=version,
@@ -894,7 +901,6 @@ def _import_one_helper(
                     absolute_import_dir=absolute_import_dir,
                     import_spec=import_spec,
                     version=version,
-                    import_prefix_list=import_prefix_list,
                     import_summary=import_summary)
                 logging.info(
                     f'Validations for version {version} completed with status: {validation_status}'
@@ -910,13 +916,6 @@ def _import_one_helper(
                 import_summary.import_stats.get('validation_data_size', 0))
             logging.info(import_summary)
 
-            if self.config.ignore_validation_status or validation_status:
-                import_summary.status = ImportStatus.STAGING
-            else:
-                logging.error(
-                    "Staging latest version update due to validation failure.")
-                import_summary.status = ImportStatus.VALIDATION
-
             self._update_latest_version(version, output_dir, import_spec,
                                         import_summary)
 
@@ -970,7 +969,9 @@ def _upload_import_inputs(
         import_inputs = import_spec.get('import_inputs', [])
         errors = []
         data_size = 0
+        input_index = -1
         for import_input in import_inputs:
+            input_index += 1
             for input_type in self.config.import_input_types:
                 path = import_input.get(input_type)
                 if not path:
@@ -984,13 +985,13 @@ def _upload_import_inputs(
                     if import_files:
                         for file in import_files:
                             if file:
-                                dest = f'{output_dir}/{version}/{os.path.basename(file)}'
+                                dest = f'{output_dir}/{version}/input{input_index}/{os.path.basename(file)}'
                                 data_size += os.path.getsize(file)
                                 self._upload_file_helper(
                                     src=file,
                                     dest=dest,
                                 )
-                        uploaded_dest = f'{output_dir}/{version}/{os.path.basename(path)}'
+                        uploaded_dest = f'{output_dir}/{version}/input{input_index}/{os.path.basename(path)}'
                         setattr(uploaded, input_type, uploaded_dest)
                     elif not glob.has_magic(path):
                         errors.append(
 
@@ -46,7 +46,6 @@ steps:
         python import_test.py
     env:
       - 'PROJECT_ID=$PROJECT_ID'
-      - 'LOCATION=$LOCATION'
       - 'GCS_BUCKET=${_GCS_BUCKET}'
       - 'IMAGE_URI=${_DOCKER_IMAGE}:${COMMIT_SHA}'
     dir: 'import-automation/executor'
 
@@ -21,6 +21,7 @@ main:
               memory: 32768
               disk: 100
           - resources: ${default(map.get(args, "resources"), defaultResources)}
+          - runIngestion: ${default(map.get(args, "runIngestion"), false)}
     - runImportJob:
         try:
           call: googleapis.batch.v1.projects.locations.jobs.create
@@ -99,6 +100,16 @@ main:
             override: false
             comment: '${"import-workflow:" + sys.get_env("GOOGLE_CLOUD_WORKFLOW_EXECUTION_ID")}'
         result: functionResponse
+    - runIngestion:
+        switch:
+          - condition: ${runIngestion}
+            steps:
+              - runSpannerIngestion:
+                  call: googleapis.workflowexecutions.v1.projects.locations.workflows.executions.create
+                  args:
+                    parent: ${"projects/" + projectId + "/locations/" + region + "/workflows/spanner-ingestion-workflow"}
+                    body:
+                      argument: ${json.encode_to_string({"importList": [text.split(importName, ":")[1]]})}
     - returnResult:
         return:
           jobId: ${jobId}
 
@@ -31,6 +31,8 @@
 GCS_BUCKET_ID = os.environ.get('GCS_BUCKET_ID')
 INGESTION_HELPER_URL = f"https://{LOCATION}-{PROJECT_ID}.cloudfunctions.net/spanner-ingestion-helper"
 WORKFLOW_ID = 'spanner-ingestion-workflow'
+IMPORT_AUTOMATION_WORKFLOW_ID = 'import-automation-workflow'
+
 
 def invoke_ingestion_workflow(import_name: str):
     """Triggers the graph ingestion workflows.
@@ -51,6 +53,34 @@ def invoke_ingestion_workflow(import_name: str):
     )
 
 
+def invoke_import_workflow(import_name: str,
+                           latest_version: str,
+                           run_ingestion: bool = False):
+    """Starts an execution of the import automation workflow.
+
+    Args:
+        import_name: The name of the import; sent as "importName".
+        latest_version: The import version; added to the import config's user_script_args as --version=<value>.
+        run_ingestion: Whether to run the ingestion workflow after the import; sent as "runIngestion".
+    """
+    import_config = {"user_script_args": [f"--version={latest_version}"]}
+    workflow_args = {
+        "importName": import_name,
+        "importConfig": json.dumps(import_config),
+        "runIngestion": run_ingestion
+    }
+
+    logging.info(f"Invoking {IMPORT_AUTOMATION_WORKFLOW_ID} for {import_name}")
+    execution_client = executions_v1.ExecutionsClient()
+    parent = f"projects/{PROJECT_ID}/locations/{LOCATION}/workflows/{IMPORT_AUTOMATION_WORKFLOW_ID}"
+    execution_req = executions_v1.Execution(argument=json.dumps(workflow_args))
+    response = execution_client.create_execution(parent=parent,
+                                                 execution=execution_req)
+    logging.info(
+        f"Triggered workflow {IMPORT_AUTOMATION_WORKFLOW_ID} for {import_name}. Execution ID: {response.name}"
+    )
+
+
 def update_import_status(import_name,
                          import_status,
                          import_version,
 
@@ -47,12 +47,12 @@ def handle_feed_event(request):
     graph_path = attributes.get('graph_path', "/**/*.mcf*")
     job_id = attributes.get('feed_name', 'cda_feed')
     cron_schedule = attributes.get('cron_schedule', '')
-    post_process = attributes.get('post_process', '')
+    run_ingestion = 'Schema' in import_name or 'Place' in import_name
+
     # Update import status in spanner
     helper.update_import_status(import_name, import_status, latest_version,
-                                  graph_path, job_id, cron_schedule)
+                                graph_path, job_id, cron_schedule)
 
-    # Invoke ingestion workflow to trigger dataflow job
-    if post_process == 'spanner_ingestion_workflow':
-        helper.invoke_ingestion_workflow(import_name)
+    # Invoke import job and ingestion workflow to trigger dataflow job
+    helper.invoke_import_workflow(import_name, latest_version, run_ingestion)
     return 'OK', 200
@@ -8,14 +8,18 @@
             "provenance_url": "https://datacommons.org",
             "provenance_description": "Schema nodes for Data Commons",
             "scripts": [
-                "process.py --entity=Schema"
+                "./process.sh --entity=Schema"
+            ],
+            "import_inputs": [
+                {
+                    "node_mcf": "**/*.mcf"
+                }
             ],
-            "import_inputs": [],
             "source_files": [],
             "cron_schedule": "15 3 * * *",
             "config_override": {
-                "invoke_import_validation": false,
-                "invoke_import_tool": false,
+                "invoke_import_validation": true,
+                "invoke_import_tool": true,
                 "invoke_differ_tool": false
             }
         },
@@ -27,11 +31,39 @@
             "provenance_url": "https://datacommons.org",
             "provenance_description": "Place nodes for Data Commons",
             "scripts": [
-                "process.py --entity=Place"
+                "./process.sh --entity=Place"
             ],
             "import_inputs": [],
             "source_files": [],
             "cron_schedule": "15 3 * * 1",
+            "resource_limits": {
+                "cpu": 8,
+                "memory": 128,
+                "disk": 100
+            },
+            "config_override": {
+                "invoke_import_validation": false,
+                "invoke_import_tool": true,
+                "invoke_differ_tool": false
+            }
+        },
+        {
+            "import_name": "Provenance",
+            "curator_emails": [
+                "support@datacommons.org"
+            ],
+            "provenance_url": "https://datacommons.org",
+            "provenance_description": "Provenance nodes for Data Commons",
+            "scripts": [
+                "./process.sh --entity=Provenance"
+            ],
+            "import_inputs": [
+                {
+                    "node_mcf": "**/*.mcf"
+                }
+            ],
+            "source_files": [],
+            "cron_schedule": "15 3 * * 1",
             "config_override": {
                 "invoke_import_validation": false,
                 "invoke_import_tool": false,