improving Dask translator

rafaelfsilva · rafaelfsilva · commit 1ec84b4bc83c · 2023-07-04T18:01:21.000-04:00
diff --git a/wfcommons/wfbench/__init__.py b/wfcommons/wfbench/__init__.py
@@ -9,4 +9,4 @@
 # (at your option) any later version.
 
 from .bench import WorkflowBenchmark
-from .translator import PegasusTranslator, SwiftTTranslator
+from .translator import DaskTranslator, PegasusTranslator, SwiftTTranslator
diff --git a/wfcommons/wfbench/translator/__init__.py b/wfcommons/wfbench/translator/__init__.py
@@ -1,12 +1,13 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 #
-# Copyright (c) 2021-2022 The WfCommons Team.
+# Copyright (c) 2021-2023 The WfCommons Team.
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 
+from .dask import DaskTranslator
 from .pegasus import PegasusTranslator
-from.swift_t import SwiftTTranslator
+from .swift_t import SwiftTTranslator
diff --git a/wfcommons/wfbench/translator/dask.py b/wfcommons/wfbench/translator/dask.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2023 The WfCommons Team.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+import pathlib
+
+from logging import Logger
+from typing import Optional, Union
+
+from .abstract_translator import Translator
+from ...common import FileLink, Workflow
+
+# Directory containing this module; used to locate the bundled code templates.
+this_dir = pathlib.Path(__file__).resolve().parent
+
+
+class DaskTranslator(Translator):
+    """
+    A WfFormat parser for creating Dask workflow applications.
+
+    :param workflow: Workflow benchmark object or path to the workflow benchmark JSON instance.
+    :type workflow: Union[Workflow, pathlib.Path],
+    :param logger: The logger where to log information/warning or errors (optional).
+    :type logger: Logger
+    """
+
+    def __init__(self,
+                 workflow: Union[Workflow, pathlib.Path],
+                 logger: Optional[Logger] = None) -> None:
+        """Create an object of the translator."""
+        super().__init__(workflow, logger)
+        self.parsed_tasks = []
+        self.tasks_futures = {}
+        self.task_id = 0
+
+    def translate(self, output_file_name: pathlib.Path) -> None:
+        """
+        Translate a workflow benchmark description (WfFormat) into a Dask workflow application.
+
+        :param output_file_name: The name of the output file (e.g., workflow.py).
+        :type output_file_name: pathlib.Path
+        """
+        noindent_python_codelines = self._dask_wftasks_codelines("randomizer")
+        
+        for task_name in self.root_task_names:
+            noindent_python_codelines.extend(self._parse_tasks(task_name))
+        
+        # generate results
+        while self.task_id > 0:
+            self.task_id -= 1
+            noindent_python_codelines.append(f"TASKS['{self.parsed_tasks[self.task_id]}'] = fut_dv_{self.task_id}.result()")
+
+        # generate code
+        INDENT = "    "
+        wf_codelines = "\n".join(["%s%s" % (INDENT, codeline) for codeline in noindent_python_codelines])
+        with open(this_dir.joinpath("templates/dask_template.py")) as fp:
+            run_workflow_code = fp.read()
+        run_workflow_code = run_workflow_code.replace("# Generated code goes here", wf_codelines)
+        with open("dask_workflow.py", "w") as fp:
+            fp.write(run_workflow_code)
+        
+    def _dask_wftasks_codelines(self, 
+                                randomizer_varname: str, 
+                                simulate_minimum_execution_time: float = 0.1,
+                                simulate_maximum_execution_time: float = 1.1) -> list[str]:
+        """
+        Build the code definining all tasks in the workflow, i.e. WorkflowTask instances.
+        
+        :param randomizer_varname: The name of the randomizer.
+        :type randomizer_varname: str
+
+        :return: The non-indented Python lines of code used to instantiate the WorkflowTask instances.
+        :rtype: list[str]
+        """
+        codelines = ["randomizer = random.Random(seed)",
+                     "TASKS = {}"]
+        for task in self.tasks.values():
+            input_files = [f.name for f in task.files if f.link == FileLink.INPUT]
+            output_files = [f.name for f in task.files if f.link == FileLink.OUTPUT]
+            code = [f"WorkflowTask(dag_id = '{task.name}',",
+                    f"             name = '{task.name}',",
+                    f"             command_arguments = {[task.program] + task.args},",
+                    f"             inputs = {input_files},",
+                    f"             outputs = {output_files},",
+                    "             simulate = simulate,",
+                    f"             randomizer = {randomizer_varname},",
+                    f"             simulate_minimum_execution_time = {simulate_minimum_execution_time},",
+                    f"             simulate_maximum_execution_time = {simulate_maximum_execution_time},",
+                    "             )"]
+            codelines.append(f"TASKS['{task.name}'] = {code[0]}")
+            codelines.extend([codeline for codeline in code[1:]])
+        return codelines
+
+    def _parse_tasks(self, task_name: str) -> list[str]:
+        """
+        Recursively iterates over workflow tasks to generate submit command.
+        
+        :param task_name: The name of a task.
+        :type task_name: str
+
+        :return: The 
+        :rtype: list[str]
+        """
+        if task_name not in self.parsed_tasks:
+            # check for dependencies
+            for parent in self.task_parents[task_name]:
+                if parent not in self.parsed_tasks:
+                    return []
+            
+            self.parsed_tasks.append(task_name)
+            self.tasks_futures[task_name] = f"fut_dv_{self.task_id}"
+            self.task_id += 1
+            noindent_python_codelines = [f"{self.tasks_futures[task_name]} = client.submit(execute_task, TASKS['{task_name}'], [])"]
+            
+            # parse children
+            for child in self.task_children[task_name]:
+                noindent_python_codelines.extend(self._parse_tasks(child))
+        
+        return noindent_python_codelines
diff --git a/wfcommons/wfbench/translator/templates/dask_template.py b/wfcommons/wfbench/translator/templates/dask_template.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2023 The WfCommons Team.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+import argparse
+import json
+import logging
+import os
+import pathlib
+import random
+import sys
+import time
+
+from dask.distributed import Client
+
+
+# Emit records from DEBUG level upward, each prefixed with timestamp, logger name and level.
+logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+# Module-level logger used by the functions below.
+logger = logging.getLogger(__name__)
+
+
+def build_dask_client():
+    """
+    Feel free to modify this to target your local dask configuration
+
+    Lots of info there:
+    https://docs.dask.org/en/stable/configuration.html
+    https://dask.pydata.org/en/latest/scheduling.html
+    """
+    cpu_count = 2  
+    threads_per_cpu = 2
+    return Client(n_workers=cpu_count, threads_per_worker=threads_per_cpu)
+
+
+class WorkflowTask:
+    def __init__(self,
+                 dag_id: str = None,
+                 name: str = None,
+                 command_arguments: list[str] = None,
+                 inputs: list[str] = None,
+                 outputs: list[str] = None,
+                 simulate: bool = False,
+                 randomizer: random.Random = random.Random(),
+                 simulate_minimum_execution_time: float = 0.1,
+                 simulate_maximum_execution_time: float = 1.1,
+                 execution_time: float = None,  # This is an execution output
+                 ):
+        self.dag_id = dag_id
+        self.name = name
+        self.command_arguments = command_arguments
+        self.inputs = inputs
+        self.outputs = outputs
+        self.simulate = simulate
+        self.randomizer = randomizer
+        self.simulate_minimum_execution_time = simulate_minimum_execution_time
+        self.simulate_maximum_execution_time = simulate_maximum_execution_time
+        self.execution_time = execution_time
+
+    def simulate_execution(self):
+        time.sleep(self.randomizer.uniform(self.simulate_minimum_execution_time,
+                                           self.simulate_maximum_execution_time))
+
+
+def execute_task(task: WorkflowTask, fut_inputs_list) -> WorkflowTask:
+    """
+    :param task: The task to be executed (it holds all relevant information)
+    :param fut_inputs_list: Unused here but necessary for dask to build its own DAG
+    :return:
+    """
+    logger.info("Executing task %s/%s: %s / in=%s / out=%s" % (task.name, task.dag_id, task.command_arguments, task.inputs, task.outputs))
+    start = time.time()
+    if task.simulate or task.command_arguments is None or len(task.command_arguments) == 0:
+        logger.info("Simulating execution of task %s" % task.name)
+        # Pretend we do something/Wait some time
+        task.simulate_execution()
+        for output in task.outputs:
+            logger.debug("Simulating %s => %s" % (task.command_arguments, output))
+            pathlib.Path(output).touch()
+    else:
+        command = " ".join(task.command_arguments)
+        logger.info("Running command for task %s/%s: %s" % (task.name, task.dag_id, command))
+        os.system(command)  # TODO Use subprocess?
+    task.execution_time = time.time()-start
+    logger.info("End of task %s/%s (%f)" % (task.name, task.dag_id, task.execution_time))
+    return task
+
+
+def run_workflow(client, simulate: bool, seed: int=42) -> dict[str, WorkflowTask]:
+    """
+    Run the workflow on the given Dask client.
+
+    The body of this function is a template: the marker comment right after
+    this docstring is replaced by the Dask translator with the generated code,
+    which builds the ``TASKS`` dictionary and submits each task to ``client``.
+
+    :param client: The Dask client used to submit the tasks.
+    :param simulate: Passed to every generated task as its ``simulate`` flag.
+    :param seed: Seed of the randomizer created by the generated code.
+    :return: The ``TASKS`` dictionary populated by the generated code.
+    """
+# Generated code goes here
+    return TASKS
+
+
+def process_arguments():
+    parser = argparse.ArgumentParser(prog=sys.argv[0],
+                                     description='Runs a workflow through dask')  # TODO
+    parser.add_argument("-nosim", "--do-not-simulate",
+                        help="Do not simulate all tasks (default: do simulate all tasks)", action="store_false")
+    parser.add_argument("-s", "--seed", help="Randomizer seed (used when simulating)")
+    return parser.parse_args()
+
+
+def to_json(obj):
+    return json.dumps(obj, indent=2, default=lambda o: o.__dict__)
+
+
+if __name__ == '__main__':
+    args = process_arguments()
+    with build_dask_client() as client:
+        tasks = run_workflow(client, args.do_not_simulate, seed=int(args.seed))
+    with open("run.json", "w") as fp:
+        fp.write(to_json(tasks))
diff --git a/wfcommons/wfbench/wfbench.py b/wfcommons/wfbench/wfbench.py