Merge pull request #83 from wfcommons/nextflow_improvements

rafaelfsilva · web-flow · commit bb2e267a8b2f · 2025-07-03T20:36:39.000-04:00
Nextflow with Flowcept
diff --git a/wfcommons/wfbench/translator/__init__.py b/wfcommons/wfbench/translator/__init__.py
@@ -1,20 +1,20 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 #
-# Copyright (c) 2021-2024 The WfCommons Team.
+# Copyright (c) 2021-2025 The WfCommons Team.
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 
 from .airflow import AirflowTranslator
+from .bash import BashTranslator
+from .cwl import CWLTranslator
 from .dask import DaskTranslator
 from .nextflow import NextflowTranslator
 from .parsl import ParslTranslator
 from .pegasus import PegasusTranslator
+from .pycompss import PyCompssTranslator
 from .swift_t import SwiftTTranslator
 from .taskvine import TaskVineTranslator
-from .cwl import CWLTranslator
-from .bash import BashTranslator
-from .pycompss import PyCompssTranslator
diff --git a/wfcommons/wfbench/translator/cwl.py b/wfcommons/wfbench/translator/cwl.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 #
-# Copyright (c) 2024 The WfCommons Team.
+# Copyright (c) 2024-2025 The WfCommons Team.
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -21,6 +21,7 @@
 
 this_dir = pathlib.Path(__file__).resolve().parent
 
+
 class CWLTranslator(Translator):
     """
     A WfFormat parser for creating CWL workflow benchmarks.
@@ -222,9 +223,9 @@ def _write_cwl_files(self, output_folder: pathlib.Path) -> None:
 
         clt_folder = cwl_folder.joinpath("clt")
         clt_folder.mkdir(exist_ok=True)
-        shutil.copy(this_dir.joinpath("templates/cwl_templates/wfbench.cwl"), clt_folder)
-        shutil.copy(this_dir.joinpath("templates/cwl_templates/folder.cwl"), clt_folder)
-        shutil.copy(this_dir.joinpath("templates/cwl_templates/shell.cwl"), clt_folder)
+        shutil.copy(this_dir.joinpath("templates/cwl/wfbench.cwl"), clt_folder)
+        shutil.copy(this_dir.joinpath("templates/cwl/folder.cwl"), clt_folder)
+        shutil.copy(this_dir.joinpath("templates/cwl/shell.cwl"), clt_folder)
 
         with open(cwl_folder.joinpath("main.cwl"), "w", encoding="utf-8") as f:
             f.write("\n".join(self.cwl_script))
diff --git a/wfcommons/wfbench/translator/nextflow.py b/wfcommons/wfbench/translator/nextflow.py
@@ -9,6 +9,7 @@
 # (at your option) any later version.
 
 import pathlib
+import shutil
 
 from logging import Logger
 from typing import List, Optional, Union
@@ -17,6 +18,8 @@
 from ...common import Workflow
 from ...common.task import Task
 
+this_dir = pathlib.Path(__file__).resolve().parent
+
 
 class NextflowTranslator(Translator):
     """
@@ -27,26 +30,14 @@ class NextflowTranslator(Translator):
     :param logger: The logger where to log information/warning or errors (optional).
     :type logger: Logger
     """
-
     def __init__(self,
                  workflow: Union[Workflow, pathlib.Path],
                  logger: Optional[Logger] = None) -> None:
         """Create an object of the translator."""
         super().__init__(workflow, logger)
 
         self.script = ""
-
-        self._usage_string = """
-Usage: nextflow run workflow.nf --pwd /path/to/directory [--simulate] [--help]
-
-    Required parameters:
-      --pwd         Working directory (where the workflow.nf file is located)
-
-    Optional parameters:
-      --help        Show this message and exit.
-      --simulate    Use a "sleep 1" for all tasks instead of the WfBench benchmark.
-"""
-
+        self.out_files = set()
 
     def translate(self, output_folder: pathlib.Path) -> None:
         """
@@ -55,25 +46,29 @@ def translate(self, output_folder: pathlib.Path) -> None:
         :param output_folder: The path to the folder in which the workflow benchmark will be generated.
         :type output_folder: pathlib.Path
         """
-
         # Create the output folder
-        output_folder.mkdir(parents=True)
+        self.output_folder = output_folder
+        self.output_folder.mkdir(parents=True)
 
         # Create benchmark files
         self._copy_binary_files(output_folder)
         self._generate_input_files(output_folder)
+        
+        if self.workflow.workflow_id:
+            shutil.copy(this_dir.joinpath("templates/flowcept_agent.py"), output_folder.joinpath("bin"))
 
         # Create a topological order of the tasks
         sorted_tasks = self._get_tasks_in_topological_order()
         # print([t.task_id for t in sorted_tasks])
 
         # Create the bash script for each task
         for task in sorted_tasks:
-            self._create_task_script(output_folder, task)
+            self._create_task_script(task)
 
         # Create the Nextflow workflow script and file
         self._create_workflow_script(sorted_tasks)
-        self._write_output_file(self.script, output_folder.joinpath("workflow.nf"))
+        run_workflow_code = self._merge_codelines("templates/nextflow/workflow.nf", self.script)
+        self._write_output_file(run_workflow_code, output_folder.joinpath("workflow.nf"))
 
         # Create the README file
         self._write_readme_file(output_folder)
@@ -86,10 +81,10 @@ def _create_workflow_script(self, tasks: list[Task]):
 
         :param tasks: The (sorted) list of tasks.
         :type tasks: list[Task]
-        """
-
-        # Output the code for command-line argument processing
-        self.script += self._generate_arg_parsing_code()
+        """       
+        # Add Flowcept code if enabled
+        if self.workflow.workflow_id:
+            self.script += self._generate_flowcept_code()
 
         # Output the code for each task
         for task in tasks:
@@ -100,60 +95,29 @@ def _create_workflow_script(self, tasks: list[Task]):
 
         return
 
-    def _generate_arg_parsing_code(self):
+    def _generate_flowcept_code(self) -> str:
         """
-        Generate the code to parse command-line argument.
 
         :return: The code.
         :rtype: str
         """
-
-        code = r'''
-params.simulate = false
-params.pwd = null
-params.help = null
-pwd = null
-
-def printUsage(error_msg, exit_code) {
-
-    def usage_string = """
-'''
-        code += self._usage_string
-
-        code += r'''
-"""
-    if (error_msg) {
-        def RED = '\u001B[31m'
-        def RESET = '\u001B[0m'
-        System.err.println "${RED}Error: ${RESET}" + error_msg
-    }
-    System.err.println usage_string
-    exit exit_code
-}
-
-def validateParams() {
-    if (params.help) {
-        printUsage(msg = "", exit_code=0)
-    }
-    if (params.pwd == null) {
-        printUsage(msg = "Missing required parameter: --pwd", exit_code=1)
-    }
-    pwd = file(params.pwd).toAbsolutePath().toString()
-    if (!file(pwd).exists()) {
-        printUsage(msg = "Directory not found: ${pwd}", exit_code=1)
-    } 
-}
-
-// Call validation at the start
-validateParams()
-
-'''
-        return code
+        out_files = ", ".join(f"\"{item}\"" for item in sorted(self.out_files))  # sorted: reproducible output
+        return "process flowcept(){\n" \
+               "    input:\n" \
+               "    output:\n" \
+               "    script:\n" \
+               "        \"\"\"\n" \
+               "        ${pwd}/bin/flowcept_agent.py " \
+               f"{self.workflow.name} {self.workflow.workflow_id} '[{out_files}]' \n" \
+               "        \"\"\"\n" \
+               "}\n\n"
 
     def _get_tasks_in_topological_order(self) -> List[Task]:
         """
         Sort the workflow tasks in topological order.
 
+        Also adds the first output file of each visited task without children to ``self.out_files``.
+
         :return: A sorted list of tasks.
         :rtype: List[Task]
         """
@@ -168,21 +132,20 @@ def _get_tasks_in_topological_order(self) -> List[Task]:
             if not all_children:
                 break
             for potential_task in all_children:
+                is_leaf = not self.task_children[potential_task.task_id]  # leaf outputs mark workflow completion
+                if is_leaf and potential_task.output_files:
+                    self.out_files.add(f"{self.output_folder.resolve()}/data/{potential_task.output_files[0].file_id}")
                 if all(parent in sorted_tasks for parent in self._find_parents(potential_task.task_id)):
                     tasks_in_current_level.append(potential_task)
             levels[current_level] = tasks_in_current_level
             sorted_tasks += tasks_in_current_level
             current_level += 1
         return sorted_tasks
 
-
-    @staticmethod
-    def _create_task_script(output_folder: pathlib.Path, task: Task):
+    def _create_task_script(self, task: Task):
         """
         Generate the bash script for invoking a task.
 
-        :param output_folder: The path to the output folder.
-        :type output_folder: pathlib.Path
         :param task: The task.
         :type task: Task
         :return: The code.
@@ -194,16 +157,16 @@ def _create_task_script(output_folder: pathlib.Path, task: Task):
         # Generate input spec
         input_spec = "'\\["
         for f in task.input_files:
-            input_spec += f"\"{output_folder.resolve()}/data/{f.file_id}\","
+            input_spec += f"\"{self.output_folder.resolve()}/data/{f.file_id}\","
         input_spec = input_spec[:-1] + "\\]'"
 
         # Generate output spec
         output_spec = "'\\{"
         for f in task.output_files:
-            output_spec += f"\"{output_folder.resolve()}/data/{f.file_id}\":{str(f.size)},"
+            output_spec += f"\"{self.output_folder.resolve()}/data/{f.file_id}\":{str(f.size)},"
         output_spec = output_spec[:-1] + "\\}'"
 
-        code += f"{output_folder.resolve()}/bin/{task.program} "
+        code += f"{self.output_folder.resolve()}/bin/{task.program} "
 
         for a in task.args:
             if "--output-files" in a:
@@ -214,7 +177,7 @@ def _create_task_script(output_folder: pathlib.Path, task: Task):
                 code += f"{a} "
         code += "\n"
 
-        script_file_path = output_folder.joinpath(f"bin/script_{task.task_id}.sh")
+        script_file_path = self.output_folder.joinpath(f"bin/script_{task.task_id}.sh")
         with open(script_file_path, "w") as out:
             out.write(code)
 
@@ -296,6 +259,8 @@ def _generate_workflow_code(self, sorted_tasks: List[Task]) -> str:
 
         # Generate workflow function
         code += "workflow {\n"
+        if self.workflow.workflow_id:
+            code += "\tflowcept()\n"
         code += "\tresults = bootstrap()\n"
         for task in sorted_tasks:
             function_name = task.task_id.replace(".", "_")
@@ -353,6 +318,3 @@ def _write_readme_file(self, output_folder: pathlib.Path) -> None:
             out.write(f"Run the workflow in directory {str(output_folder)} using the following command:\n")
 
             out.write(f"\tnextflow run ./workflow.nf --pwd `pwd`\n")
-            out.write("\n")
-            out.write(self._usage_string)
-
diff --git a/wfcommons/wfbench/translator/swift_t.py b/wfcommons/wfbench/translator/swift_t.py
@@ -105,7 +105,7 @@ def translate(self, output_folder: pathlib.Path) -> None:
             self.script += f"string fc = sprintf(flowcept, \"{self.workflow.workflow_id}\", \"{self.workflow.name}\", \"{out_files}\");\n" \
                             "python_persist(fc);\n"
 
-        run_workflow_code = self._merge_codelines("templates/swift_t_templates/workflow.swift", self.script)
+        run_workflow_code = self._merge_codelines("templates/swift_t/workflow.swift", self.script)
 
         # write benchmark files
         output_folder.mkdir(parents=True)
diff --git a/wfcommons/wfbench/translator/templates/flowcept_agent.py b/wfcommons/wfbench/translator/templates/flowcept_agent.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2025 The WfCommons Team.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+"""
+Start a Flowcept agent, wait for the expected output files, then stop it.
+
+Usage: flowcept_agent.py <workflow_name> <workflow_id> <out_files>
+
+``out_files`` is a Python list literal of file paths, e.g. '["/a/b", "/c/d"]'.
+"""
+
+import ast
+import logging
+import pathlib
+import sys
+import time
+
+from flowcept.flowcept_api.flowcept_controller import Flowcept
+
+# Placeholders use a single "%": this file is executed as-is (the Nextflow
+# translator copies it with shutil.copy), so "%%" would be logged literally.
+logging.basicConfig(
+    level=logging.INFO,
+    format="[WfBench][%(asctime)s][%(levelname)s] %(message)s",
+    datefmt="%H:%M:%S",
+    handlers=[logging.StreamHandler()]
+)
+
+if len(sys.argv) != 4:
+    sys.exit(f"Usage: {sys.argv[0]} <workflow_name> <workflow_id> <out_files>")
+
+workflow_name = sys.argv[1]
+workflow_id = sys.argv[2]
+# literal_eval only accepts Python literals, so the argument cannot run code.
+out_files = ast.literal_eval(sys.argv[3])
+
+logging.info("Flowcept Starting")
+flowcept_agent = Flowcept(
+    workflow_id=workflow_id,
+    workflow_name=workflow_name,
+    bundle_exec_id=workflow_id,
+    start_persistence=False,
+    save_workflow=True,
+)
+
+# Best effort: a Flowcept failure is logged but does not abort the agent.
+try:
+    flowcept_agent.start()
+except Exception:
+    logging.exception("Flowcept failed to start")
+
+# Poll once per second until every expected output file exists.
+remaining_files = set(out_files)
+while remaining_files:
+    remaining_files = {f for f in remaining_files if not pathlib.Path(f).exists()}
+    if remaining_files:
+        time.sleep(1)
+
+try:
+    flowcept_agent.stop()
+except Exception:
+    logging.exception("Flowcept failed to stop")
+
+logging.info("Flowcept Completed")
diff --git a/wfcommons/wfbench/translator/templates/nextflow/workflow.nf b/wfcommons/wfbench/translator/templates/nextflow/workflow.nf
diff --git a/wfcommons/wfbench/translator/templates/swift_t/workflow.swift b/wfcommons/wfbench/translator/templates/swift_t/workflow.swift