Skip to content

Commit 4316c38

Browse files
committed
Added testing to Streamflow/RO-Crate
1 parent 59775c2 commit 4316c38

2 files changed

Lines changed: 33 additions & 58 deletions

File tree

tests/translators_loggers/test_translators_loggers.py

Lines changed: 11 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -268,18 +268,18 @@ class TestTranslators:
268268
@pytest.mark.parametrize(
269269
"backend",
270270
[
271-
# "swiftt",
272-
# "dask",
273-
# "parsl",
274-
# "nextflow",
275-
# "nextflow_subworkflow",
276-
# "airflow",
277-
# "bash",
278-
# "taskvine",
279-
# "makeflow",
280-
# "cwl",
271+
"swiftt",
272+
"dask",
273+
"parsl",
274+
"nextflow",
275+
"nextflow_subworkflow",
276+
"airflow",
277+
"bash",
278+
"taskvine",
279+
"makeflow",
280+
"cwl",
281281
"streamflow",
282-
# "pegasus",
282+
"pegasus",
283283
])
284284
@pytest.mark.unit
285285
# @pytest.mark.skip(reason="tmp")
@@ -346,18 +346,9 @@ def test_translator(self, backend) -> None:
346346

347347
original_workflow : Workflow = benchmark.workflow
348348

349-
# print(original_workflow.tasks)
350-
# print("======")
351-
# print(reconstructed_workflow.tasks)
352349
for task_name in original_workflow.tasks.keys():
353350
original_task = original_workflow.tasks[task_name]
354351
reconstructed_task = reconstructed_workflow.tasks["main.cwl#" + task_name]
355-
print("ORIGINAL:", original_task.task_id, "RECONSTRUCTED:", reconstructed_task.task_id)
356-
print(" NUM_INPUT_FILES: ", len(original_task.input_files), len(reconstructed_task.input_files))
357-
print(" NUM_OUTPUT_FILES: ", len(original_task.output_files), len(reconstructed_task.output_files))
358-
print(" INPUT FILES: ", [f.file_id for f in original_task.input_files], [f.file_id for f in reconstructed_task.input_files])
359-
print(" OUTPUT FILES: ", [f.file_id for f in original_task.output_files], [f.file_id for f in reconstructed_task.output_files])
360-
361352
_compare_workflows(original_workflow, reconstructed_workflow)
362353

363354
# Shutdown the container (weirdly, container is already shutdown by now... not sure how)

wfcommons/wfinstances/logs/ro_crate.py

Lines changed: 22 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -122,9 +122,6 @@ def _construct_data_file_id_name_map(self):
122122
continue
123123
alternate_name = item["alternateName"]
124124
self.data_file_id_name_map[id] = alternate_name
125-
print("=== FILE MAP ===")
126-
print(self.data_file_id_name_map)
127-
print("==== END FILE MAP ===")
128125

129126

130127
def _create_tasks(self, create_actions, main_workflow_id):
@@ -141,8 +138,8 @@ def _create_tasks(self, create_actions, main_workflow_id):
141138
continue
142139

143140
create_action['name'] = create_action['name'].removeprefix("Run of workflow/")
144-
print("***************************************")
145-
print("DEALING WITH TASK:", create_action['name'])
141+
# print("***************************************")
142+
# print("DEALING WITH TASK:", create_action['name'])
146143

147144
# Below would remove the "file.cwl#" tag, which runs the risk
148145
# of non-uniqueness of action names perhaps
@@ -155,19 +152,11 @@ def _create_tasks(self, create_actions, main_workflow_id):
155152
# Get all input & output for the create_action
156153
input = [obj['@id'] for obj in create_action['object']]
157154
output = [obj['@id'] for obj in create_action['result']]
158-
# print("RAW INPUT FILES: ", input)
159-
# print("RAW OUTPUT FILES: ", output)
160155

161156
# Filter for actual files
162157
input_files = self._filter_file_ids(input)
163-
print("GOT THESE IDS FOR INPUT FILES: ", input_files)
164-
print("TRANSLATED TO REAL FILE NAMES: ", [self.data_file_id_name_map[f] for f in input_files])
165158
output_files = self._filter_file_ids(output)
166-
print("GOT THESE IDS FOR OUTPUT FILES: ", output_files)
167-
print("TRANSLATED TO REAL FILE NAMES: ", [self.data_file_id_name_map[f] for f in output_files])
168159

169-
print("FILTERED INPUT FILES: ", input_files)
170-
print("FILTERED OUTPUT FILES: ", output_files)
171160

172161
task = Task(name=create_action['name'],
173162
task_id=create_action['name'],
@@ -208,31 +197,31 @@ def _create_tasks(self, create_actions, main_workflow_id):
208197
self._add_dependencies(files, instruments)
209198

210199
def _add_dependencies(self, files, instruments):
200+
201+
# File dependencies
211202
for file in files.values():
212203
for parent in file.get('out', []):
213204
for child in file.get('in', []):
214-
# self.workflow.add_dependency(parent, child)
215205
self.workflow.add_dependency(self.task_id_name_map[parent], self.task_id_name_map[child])
216206

217-
# Assumes
218-
parameter_connections = list(filter((lambda x: x.get('@type') == "ParameterConnection"), self.graph_data))
219-
for parameter_connection in parameter_connections:
220-
# parameter_connection["sourceParameter"] is either a single dict or a list of dicts,
221-
# which is bad design but whatever
222-
source_parameters = parameter_connection["sourceParameter"]
223-
if not isinstance(source_parameters, list):
224-
source_parameters = [source_parameters]
225-
226-
for item in source_parameters:
227-
source = item["@id"]
228-
source = source.rsplit("#", 1)[0] # Trim to get instrument
229-
230-
target = parameter_connection["targetParameter"]["@id"]
231-
target = target.rsplit("#", 1)[0] # Trim to get instrument
232-
233-
for parent in instruments.get(source, []):
234-
for child in instruments.get(target, []):
235-
self.workflow.add_dependency(self.task_id_name_map[parent], self.task_id_name_map[child])
207+
# THIS IS COMMENTED OUT AS IT SEEMS TO ADD TONS OF NON-EXISTING DEPENDENCIES ON WORKFLOW BENCHMARKS
208+
# (FOR INSTANCE, IT TOTALLY BREAKS THE BENCHMARK WORKFLOW DUE TO ALL OF THEM USING shell.cwl#output_files)
209+
# parameter_connections = list(filter((lambda x: x.get('@type') == "ParameterConnection"), self.graph_data))
210+
# for parameter_connection in parameter_connections:
211+
# # parameter_connection["sourceParameter"] is either a single dict or a list of dicts,
212+
# # which is bad design but whatever
213+
# source_parameters = parameter_connection["sourceParameter"]
214+
# if not isinstance(source_parameters, list):
215+
# source_parameters = [source_parameters]
216+
# source = item["@id"]
217+
# source = source.rsplit("#", 1)[0] # Trim to get instrument
218+
#
219+
# target = parameter_connection["targetParameter"]["@id"]
220+
# target = target.rsplit("#", 1)[0] # Trim to get instrument
221+
#
222+
# for parent in instruments.get(source, []):
223+
# for child in instruments.get(target, []):
224+
# self.workflow.add_dependency(self.task_id_name_map[parent], self.task_id_name_map[child])
236225

237226

238227
def _time_diff(self, start_time, end_time):
@@ -254,19 +243,14 @@ def _filter_file_ids(self, ids):
254243

255244
file_ids = list(filter(lambda x: self.lookup.get(x)['@type'] == 'File', ids))
256245
property_value_ids = list(filter(lambda x: self.lookup.get(x)['@type'] == 'PropertyValue', ids))
257-
# print("FILE_IDS =", file_ids)
258-
# print("PROPERTY_VALUE_IDS =", property_value_ids)
259246
for property_value_id in property_value_ids:
260247
property_values = self.lookup.get(property_value_id)['value']
261-
# print("PROPERTY_VALUES =", property_values)
262248
if isinstance(property_values, dict):
263249
property_values = [property_values]
264250

265251
# Filter out values without "@id"s (i.e. int values, etc.)
266252
pv_contained_ids = list(filter(lambda x: isinstance(x, dict) and "@id" in x, property_values))
267-
# print("PV_CONTAINED_IDS.1 = ", pv_contained_ids)
268253
pv_contained_ids = [obj["@id"] for obj in pv_contained_ids]
269-
# print("PV_CONTAINED_IDS.2 = ", pv_contained_ids)
270254

271255
# Recurse to verify everything's a file
272256
pv_filtered_ids = self._filter_file_ids(pv_contained_ids)

0 commit comments

Comments
 (0)