@@ -44,13 +44,16 @@ class ROCrateLogsParser(LogsParser):
4444 def __init__ (self ,
4545 crate_dir : pathlib .Path ,
4646 description : Optional [str ] = None ,
47- logger : Optional [Logger ] = None ) -> None :
47+ logger : Optional [Logger ] = None ,
48+ steps_to_ignore : Optional [list [str ]]= None ) -> None :
4849 """Create an object of the RO crate parser."""
4950
5051 # TODO: Decide whether these should identify RO-Crate, Streamflow, or something else
5152 super ().__init__ ('Streamflow-ROCrate' , 'https://w3id.org/workflowhub/workflow-ro-crate/1.0' , description , logger )
5253
5354 # Sanity check
55+ if steps_to_ignore is None :
56+ steps_to_ignore = []
5457 if not crate_dir .is_dir ():
5558 raise OSError (f'The provided path does not exist or is not a folder: { crate_dir } ' )
5659
@@ -63,6 +66,10 @@ def __init__(self,
6366
6467 self .file_objects = {}
6568
69+ self .task_id_name_map : dict [str , str ] = {}
70+
71+ self .steps_to_ignore = steps_to_ignore
72+
6673
6774 def build_workflow (self , workflow_name : Optional [str ] = None ) -> Workflow :
6875 """
@@ -113,6 +120,16 @@ def _create_tasks(self, create_actions, main_workflow_id):
113120 self ._process_main_workflow (create_action )
114121 continue
115122
123+ create_action ['name' ] = create_action ['name' ].removeprefix ("Run of workflow/" )
124+
125+ # The line below would strip the "file.cwl#" prefix, but doing so risks
126+ # making action names non-unique.
127+ # create_action['name'] = create_action['name'].split('#', 1)[-1]
128+
129+ # Check if we should ignore this step
130+ if create_action ["name" ] in self .steps_to_ignore :
131+ continue
132+
116133 # Get all input & output for the create_action
117134 input = [obj ['@id' ] for obj in create_action ['object' ]]
118135 output = [obj ['@id' ] for obj in create_action ['result' ]]
@@ -121,17 +138,19 @@ def _create_tasks(self, create_actions, main_workflow_id):
121138 input_files = self ._filter_file_ids (input )
122139 output_files = self ._filter_file_ids (output )
123140
124- create_action ['name' ] = create_action ['name' ].removeprefix ("Run of workflow/" )
125141
126142 task = Task (name = create_action ['name' ],
127- task_id = create_action ['@id' ],
143+ task_id = create_action ['name' ],
144+ # task_id=create_action['name'] + "_" + create_action['@id'],
128145 task_type = TaskType .COMPUTE ,
129146 runtime = self ._time_diff (create_action ['startTime' ], create_action ['endTime' ]),
130147 executed_at = create_action ['startTime' ],
131148 input_files = self ._get_file_objects (input_files ),
132149 output_files = self ._get_file_objects (output_files ),
133150 logger = self .logger )
134151 self .workflow .add_task (task )
152+ self .task_id_name_map [create_action ['@id' ]] = create_action ['name' ]
153+ # self.task_id_name_map[create_action['@id']] = create_action['name'] + "_" + create_action['@id']
135154
136155 # For each file, track which task(s) it is in/output for
137156 for infile in input_files :
@@ -162,20 +181,29 @@ def _add_dependencies(self, files, instruments):
162181 for file in files .values ():
163182 for parent in file .get ('out' , []):
164183 for child in file .get ('in' , []):
165- self .workflow .add_dependency (parent , child )
184+ # self.workflow.add_dependency(parent, child)
185+ self .workflow .add_dependency (self .task_id_name_map [parent ], self .task_id_name_map [child ])
166186
167187 # Assumes
168188 parameter_connections = list (filter ((lambda x : x .get ('@type' ) == "ParameterConnection" ), self .graph_data ))
169189 for parameter_connection in parameter_connections :
170- source = parameter_connection ["sourceParameter" ]["@id" ]
171- source = source .rsplit ("#" , 1 )[0 ] # Trim to get instrument
190+ # parameter_connection["sourceParameter"] may be either a single dict or a list of dicts,
191+ # so normalize it to a list before iterating.
192+ source_parameters = parameter_connection ["sourceParameter" ]
193+ if not isinstance (source_parameters , list ):
194+ source_parameters = [source_parameters ]
195+
196+ for item in source_parameters :
197+ source = item ["@id" ]
198+ source = source .rsplit ("#" , 1 )[0 ] # Trim to get instrument
199+
200+ target = parameter_connection ["targetParameter" ]["@id" ]
201+ target = target .rsplit ("#" , 1 )[0 ] # Trim to get instrument
172202
173- target = parameter_connection ["targetParameter" ]["@id" ]
174- target = target .rsplit ("#" , 1 )[0 ] # Trim to get instrument
203+ for parent in instruments .get (source , []):
204+ for child in instruments .get (target , []):
205+ self .workflow .add_dependency (self .task_id_name_map [parent ], self .task_id_name_map [child ])
175206
176- for parent in instruments .get (source , []):
177- for child in instruments .get (target , []):
178- self .workflow .add_dependency (parent , child )
179207
180208 def _time_diff (self , start_time , end_time ):
181209 diff = datetime .fromisoformat (end_time ) - datetime .fromisoformat (start_time )
0 commit comments