improving Dask translator

rafaelfsilva · rafaelfsilva · commit 1d9843a8c07c · 2023-07-21T09:01:04.000-07:00
diff --git a/wfcommons/common/workflow.py b/wfcommons/common/workflow.py
@@ -60,7 +60,7 @@ def __init__(self,
         self.wms_name: Optional[str] = "WfCommons" if not wms_name else wms_name
         self.wms_version: Optional[str] = str(__version__) if not wms_version else wms_version
         self.wms_url: Optional[str] = f"https://docs.wfcommons.org/en/v{__version__}/" if not wms_url else wms_url
-        self.executed_at: Optional[str] = datetime.now().astimezone().isoformat()) if not executed_at else executed_at
+        self.executed_at: Optional[str] = datetime.now().astimezone().isoformat() if not executed_at else executed_at
         self.makespan: Optional[int] = makespan
         self.tasks = {}
         self.tasks_parents = {}
diff --git a/wfcommons/wfbench/bench.py b/wfcommons/wfbench/bench.py
@@ -152,7 +152,7 @@ def create_benchmark(self,
 
             task.runtime = 0
             task.files = []
-            task.program = f"{this_dir.joinpath('wfbench.py')}"
+            task.program = "wfbench.py"
             task.args = [task.name]
             task.args.extend(params)
 
diff --git a/wfcommons/wfbench/translator/abstract_translator.py b/wfcommons/wfbench/translator/abstract_translator.py
@@ -1,23 +1,27 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 #
-# Copyright (c) 2021-2022 The WfCommons Team.
+# Copyright (c) 2021-2023 The WfCommons Team.
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 
 import logging
+import os
 import pathlib
+import shutil
 
 from abc import ABC, abstractmethod
-from typing import Dict, List, Optional, Union
+from typing import Optional, Union
 
-from ...common import Task, Workflow
+from ...common import FileLink, Task, Workflow
 from ...wfinstances.instance import Instance
 
 
+this_dir = pathlib.Path(__file__).resolve().parent
+
 class Translator(ABC):
     """
     An abstract class of WfFormat parser for creating workflow benchmark applications.
@@ -69,13 +73,43 @@ def __init__(self,
                     self.task_children[task['name']].append(child)
 
     @abstractmethod
-    def translate(self, output_file_path: pathlib.Path) -> None:
+    def translate(self, output_folder: pathlib.Path) -> None:
         """
         Translate a workflow benchmark description (WfFormat) into an actual workflow application.
 
-        :param output_file_path: The path of the output file.
-        :type output_file_path: pathlib.Path
+        :param output_folder: The path to the folder in which the workflow benchmark will be generated.
+        :type output_folder: pathlib.Path
+        """
+
+    def _copy_binary_files(self, output_folder: pathlib.Path) -> None:
+        """
+        Copy binary files to workflow benchmark's bin folder.
+
+        :param output_folder: The path to the folder in which the workflow benchmark will be generated.
+        :type output_folder: pathlib.Path
+        """
+        bin_folder = output_folder.joinpath("bin")
+        bin_folder.mkdir()
+        shutil.copy(this_dir.joinpath("../wfbench.py"), bin_folder)
+        shutil.copy(shutil.which("cpu-benchmark"), bin_folder)
+
+    def _generate_input_files(self, output_folder: pathlib.Path) -> None:
+        """
+        Generate workflow input files into workflow benchmark's data folder.
+
+        :param output_folder: The path to the folder in which the workflow benchmark will be generated.
+        :type output_folder: pathlib.Path
         """
+        generated_files = []
+        data_folder = output_folder.joinpath("data")
+        data_folder.mkdir()
+        for task_name in self.root_task_names:
+            task = self.tasks[task_name]
+            for file in task.files:
+                if file.name not in generated_files and file.link == FileLink.INPUT:
+                    generated_files.append(file.name)
+                    with open(data_folder.joinpath(file.name), "wb") as fp:
+                        fp.write(os.urandom(int(file.size)))
 
     def _write_output_file(self, contents: str, output_file_path: pathlib.Path) -> None:
         """
@@ -91,7 +125,7 @@ def _write_output_file(self, contents: str, output_file_path: pathlib.Path) -> N
             out.write(contents)
         self.logger.info(f"Translated content written to '{output_file_path}'")
 
-    def _find_children(self, task_name: str) -> List[Task]:
+    def _find_children(self, task_name: str) -> list[Task]:
         """
         Find the children for a specific task.
 
@@ -108,7 +142,7 @@ def _find_children(self, task_name: str) -> List[Task]:
 
         return children
 
-    def _find_parents(self, task_name: str) -> List[Task]:
+    def _find_parents(self, task_name: str) -> list[Task]:
         """
         Find the parents for a specific task.
 
diff --git a/wfcommons/wfbench/translator/dask.py b/wfcommons/wfbench/translator/dask.py
@@ -38,14 +38,14 @@ def __init__(self,
         self.tasks_futures = {}
         self.task_id = 0
 
-    def translate(self, output_file_name: pathlib.Path) -> None:
+    def translate(self, output_folder: pathlib.Path) -> None:
         """
-        Translate a workflow benchmark description (WfFormat) into a Dask workflow application.
+        Translate a workflow benchmark description (WfFormat) into an actual workflow application.
 
-        :param output_file_name: The name of the output file (e.g., workflow.py).
-        :type output_file_name: pathlib.Path
+        :param output_folder: The path to the folder in which the workflow benchmark will be generated.
+        :type output_folder: pathlib.Path
         """
-        noindent_python_codelines = self._dask_wftasks_codelines("randomizer")
+        noindent_python_codelines = self._dask_wftasks_codelines("randomizer", output_folder)
         
         for task_name in self.root_task_names:
             noindent_python_codelines.extend(self._parse_tasks(task_name))
@@ -61,30 +61,53 @@ def translate(self, output_file_name: pathlib.Path) -> None:
         with open(this_dir.joinpath("templates/dask_template.py")) as fp:
             run_workflow_code = fp.read()
         run_workflow_code = run_workflow_code.replace("# Generated code goes here", wf_codelines)
-        with open("dask_workflow.py", "w") as fp:
+
+        # write benchmark files
+        output_folder.mkdir(parents=True)
+        with open(output_folder.joinpath("dask_workflow.py"), "w") as fp:
             fp.write(run_workflow_code)
+
+        # additional files
+        self._copy_binary_files(output_folder)
+        self._generate_input_files(output_folder)
         
     def _dask_wftasks_codelines(self, 
                                 randomizer_varname: str, 
+                                output_folder: pathlib.Path,
                                 simulate_minimum_execution_time: float = 0.1,
                                 simulate_maximum_execution_time: float = 1.1) -> list[str]:
         """
         Build the code definining all tasks in the workflow, i.e. WorkflowTask instances.
         
         :param randomizer_varname: The name of the randomizer.
         :type randomizer_varname: str
+        :param output_folder: The path to the folder in which the workflow benchmark will be generated.
+        :type output_folder: pathlib.Path
 
         :return: The non-indented Python lines of code used to instantiate the WorkflowTask instances.
         :rtype: list[str]
         """
         codelines = ["randomizer = random.Random(seed)",
                      "TASKS = {}"]
         for task in self.tasks.values():
-            input_files = [f.name for f in task.files if f.link == FileLink.INPUT]
-            output_files = [f.name for f in task.files if f.link == FileLink.OUTPUT]
+            input_files = [str(output_folder.joinpath(f"data/{f.name}")) for f in task.files if f.link == FileLink.INPUT]
+            output_files = [str(output_folder.joinpath(f"data/{f.name}")) for f in task.files if f.link == FileLink.OUTPUT]
+            program = output_folder.joinpath(f'bin/{task.program}')
+            args = []
+            print(task.args)
+            for a in task.args:
+                if "--out" in a:
+                    a = a.replace("{", "\"{").replace("}", "}\"").replace(".txt'", ".txt\\\\\"").replace("'", "\\\\\"" + str(output_folder.joinpath("data")) + "/").replace(": ", ":")
+                elif "--" not in a: 
+                    a = str(output_folder.joinpath("data", a))
+                else: 
+                    a = a.replace("'", "\"") 
+                args.append(a)
+            print(args)
+            print("")
             code = [f"WorkflowTask(dag_id = '{task.name}',",
                     f"             name = '{task.name}',",
-                    f"             command_arguments = {[task.program] + task.args},",
+                    f"             command_arguments = {[str(program)] + args},",
                     f"             inputs = {input_files},",
                     f"             outputs = {output_files},",
                     "             simulate = simulate,",
@@ -94,6 +117,7 @@ def _dask_wftasks_codelines(self,
                     "             )"]
             codelines.append(f"TASKS['{task.name}'] = {code[0]}")
             codelines.extend([codeline for codeline in code[1:]])
+        # exit(1)
         return codelines
 
     def _parse_tasks(self, task_name: str) -> list[str]:
@@ -115,7 +139,7 @@ def _parse_tasks(self, task_name: str) -> list[str]:
             self.parsed_tasks.append(task_name)
             self.tasks_futures[task_name] = f"fut_dv_{self.task_id}"
             self.task_id += 1
-            noindent_python_codelines = [f"{self.tasks_futures[task_name]} = client.submit(execute_task, TASKS['{task_name}'], [])"]
+            noindent_python_codelines = [f"{self.tasks_futures[task_name]} = client.submit(execute_task, TASKS['{task_name}'], {self.task_parents[task_name]})"]
             
             # parse children
             for child in self.task_children[task_name]:
diff --git a/wfcommons/wfbench/wfbench.py b/wfcommons/wfbench/wfbench.py
@@ -118,9 +118,9 @@ def cpu_mem_benchmark(cpu_threads: Optional[int] = 5,
             os.sched_setaffinity(cpu_proc.pid, {core})
         cpu_procs.append(cpu_proc)
 
-    mem_proc = subprocess.Popen(mem_prog)
-    if core:
-        os.sched_setaffinity(mem_proc.pid, {core})
+    # mem_proc = subprocess.Popen(mem_prog)
+    # if core:
+    #     os.sched_setaffinity(mem_proc.pid, {core})
 
     return cpu_procs
 
@@ -148,19 +148,19 @@ def get_parser() -> argparse.ArgumentParser:
     return parser
 
 
-def io_read_benchmark_user_input_data_size(other):
+def io_read_benchmark_user_input_data_size(inputs):
     print("[WfBench] Starting IO Read Benchmark...")
-    for file in other:
-        with open(this_dir.joinpath(file), "rb") as fp:
+    for file in inputs:
+        with open(file, "rb") as fp:
             print(f"[WfBench]   Reading '{file}'")
             fp.readlines()
     print("[WfBench] Completed IO Read Benchmark!\n")
 
 
 def io_write_benchmark_user_input_data_size(outputs):
-    for task_name, file_size in outputs.items():
-        print(f"[WfBench] Writing output file '{task_name}'\n")
-        with open(this_dir.joinpath(task_name), "wb") as fp:
+    for file_name, file_size in outputs.items():
+        print(f"[WfBench] Writing output file '{file_name}'\n")
+        with open(file_name, "wb") as fp:
             fp.write(os.urandom(int(file_size)))