|
| 1 | +#!/usr/bin/env python |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | +# |
| 4 | +# Copyright (c) 2021 The WorkflowHub Team. |
| 5 | +# |
| 6 | +# This program is free software: you can redistribute it and/or modify |
| 7 | +# it under the terms of the GNU General Public License as published by |
| 8 | +# the Free Software Foundation, either version 3 of the License, or |
| 9 | +# (at your option) any later version. |
| 10 | + |
| 11 | +from typing import Dict, Optional |
| 12 | + |
| 13 | +from .abstract_recipe import WorkflowRecipe |
| 14 | +from ...common.file import FileLink |
| 15 | +from ...common.workflow import Workflow |
| 16 | + |
| 17 | + |
class BLASTRecipe(WorkflowRecipe):
    """A BLAST workflow recipe class for creating synthetic workflow traces.

    The generated workflow has one ``split_fasta`` task, one ``blastall`` task
    per output file of ``split_fasta``, and two final tasks (``cat_blast`` and
    ``cat``) that each depend on every ``blastall`` task.

    :param num_subsample: The number of subsamples the reference file will be split into.
    :type num_subsample: int
    :param data_footprint: The upper bound for the workflow total data footprint (in bytes).
    :type data_footprint: int
    :param num_tasks: The upper bound for the total number of tasks in the workflow.
    :type num_tasks: int
    :param runtime_factor: The factor of which tasks runtime will be increased/decreased.
    :type runtime_factor: float
    :param input_file_size_factor: The factor of which tasks input files size will be increased/decreased.
    :type input_file_size_factor: float
    :param output_file_size_factor: The factor of which tasks output files size will be increased/decreased.
    :type output_file_size_factor: float
    """

    def __init__(self,
                 num_subsample: Optional[int] = 2,
                 data_footprint: Optional[int] = 0,
                 num_tasks: Optional[int] = 5,
                 runtime_factor: Optional[float] = 1.0,
                 input_file_size_factor: Optional[float] = 1.0,
                 output_file_size_factor: Optional[float] = 1.0
                 ) -> None:
        """Create an object of the BLAST workflow recipe."""
        super().__init__("BLAST",
                         data_footprint,
                         num_tasks,
                         runtime_factor,
                         input_file_size_factor,
                         output_file_size_factor)

        # Number of '.fasta' output files requested from the split_fasta task.
        self.num_subsample: int = num_subsample

    @classmethod
    def from_num_tasks(cls,
                       num_tasks: int,
                       runtime_factor: Optional[float] = 1.0,
                       input_file_size_factor: Optional[float] = 1.0,
                       output_file_size_factor: Optional[float] = 1.0
                       ) -> 'BLASTRecipe':
        """
        Instantiate a BLAST workflow recipe that will generate synthetic workflows up to
        the total number of tasks provided.

        :param num_tasks: The upper bound for the total number of tasks in the workflow (at least 5).
        :type num_tasks: int
        :param runtime_factor: The factor of which tasks runtime will be increased/decreased.
        :type runtime_factor: float
        :param input_file_size_factor: The factor of which tasks input files size will be increased/decreased.
        :type input_file_size_factor: float
        :param output_file_size_factor: The factor of which tasks output files size will be increased/decreased.
        :type output_file_size_factor: float

        :return: A BLAST workflow recipe object that will generate synthetic workflows up
                 to the total number of tasks provided.
        :rtype: BLASTRecipe

        :raises ValueError: If ``num_tasks`` is lower than 5.
        """
        # Three tasks are always present (split_fasta, cat_blast and cat); the
        # rest of the budget goes to blastall subsamples. At least 2 subsamples
        # are required (see from_num_subsample), hence the minimum of 5 tasks,
        # which is also the constructor default.
        if num_tasks < 5:
            raise ValueError("The upper bound for the number of tasks should be at least 5.")

        return cls(num_subsample=int(num_tasks - 3),
                   data_footprint=None,
                   num_tasks=num_tasks,
                   runtime_factor=runtime_factor,
                   input_file_size_factor=input_file_size_factor,
                   output_file_size_factor=output_file_size_factor)

    @classmethod
    def from_num_subsample(cls,
                           num_subsample: int,
                           runtime_factor: Optional[float] = 1.0,
                           input_file_size_factor: Optional[float] = 1.0,
                           output_file_size_factor: Optional[float] = 1.0
                           ) -> 'BLASTRecipe':
        """
        Instantiate a BLAST workflow recipe that will generate synthetic workflows using
        the defined number of subsamples.

        :param num_subsample: The number of subsamples the reference file will be split into (at least 2).
        :type num_subsample: int
        :param runtime_factor: The factor of which tasks runtime will be increased/decreased.
        :type runtime_factor: float
        :param input_file_size_factor: The factor of which tasks input files size will be increased/decreased.
        :type input_file_size_factor: float
        :param output_file_size_factor: The factor of which tasks output files size will be increased/decreased.
        :type output_file_size_factor: float

        :return: A BLAST workflow recipe object that will generate synthetic workflows
                 using the defined number of subsamples.
        :rtype: BLASTRecipe

        :raises ValueError: If ``num_subsample`` is lower than 2.
        """
        if num_subsample < 2:
            raise ValueError("The number of subsample should be at least 2.")

        # No explicit bounds on data footprint or task count are given here.
        return cls(num_subsample=num_subsample,
                   data_footprint=None,
                   num_tasks=None,
                   runtime_factor=runtime_factor,
                   input_file_size_factor=input_file_size_factor,
                   output_file_size_factor=output_file_size_factor)

    def build_workflow(self, workflow_name: Optional[str] = None) -> Workflow:
        """
        Generate a synthetic workflow trace of a BLAST workflow.

        :param workflow_name: The workflow name. When empty or ``None``, the recipe
                              name followed by ``-synthetic-trace`` is used.
        :type workflow_name: str

        :return: A synthetic workflow trace object.
        :rtype: Workflow
        """
        workflow = Workflow(name=workflow_name if workflow_name else self.name + '-synthetic-trace',
                            makespan=None)
        # Reset the task ID counter for every build.
        # NOTE(review): presumably consumed by _generate_task_name -- confirm in WorkflowRecipe.
        self.task_id_counter: int = 1

        # split_fasta task: entry task, asked for num_subsample '.fasta' outputs
        task_name = self._generate_task_name('split_fasta')
        split_fasta_task = self._generate_task('split_fasta', task_name,
                                               files_recipe={FileLink.OUTPUT: {'.fasta': self.num_subsample}})
        workflow.add_node(task_name, task=split_fasta_task)

        out_files = []
        err_files = []
        blastall_tasks = []

        # blastall tasks: one per output file of split_fasta
        for fasta_file in split_fasta_task.files:
            if fasta_file.link != FileLink.OUTPUT:
                continue

            task_name = self._generate_task_name('blastall')
            blastall_task = self._generate_task('blastall', task_name, [fasta_file])
            workflow.add_node(task_name, task=blastall_task)
            workflow.add_edge(split_fasta_task.name, task_name)
            blastall_tasks.append(task_name)

            # Collect the blastall outputs that feed the two final tasks.
            for blast_file in blastall_task.files:
                if blast_file.link != FileLink.OUTPUT:
                    continue
                if '.out' in blast_file.name:
                    out_files.append(blast_file)
                if '.err' in blast_file.name:
                    err_files.append(blast_file)

        # cat_blast task: takes every '.out' file produced by the blastall tasks
        task_name = self._generate_task_name('cat_blast')
        cat_blast_task = self._generate_task('cat_blast', task_name,
                                             input_files=out_files,
                                             files_recipe={FileLink.OUTPUT: {'.out': self.num_subsample}})
        workflow.add_node(task_name, task=cat_blast_task)

        # cat task: takes every '.err' file produced by the blastall tasks
        task_name = self._generate_task_name('cat')
        cat_task = self._generate_task('cat', task_name,
                                       input_files=err_files,
                                       files_recipe={FileLink.OUTPUT: {'.err': self.num_subsample}})
        workflow.add_node(task_name, task=cat_task)

        # Both final tasks depend on every blastall task.
        for blastall_name in blastall_tasks:
            workflow.add_edge(blastall_name, cat_blast_task.name)
            workflow.add_edge(blastall_name, cat_task.name)

        return workflow

    def _workflow_recipe(self) -> Dict:
        """
        Recipe for generating synthetic traces of the BLAST workflow. Recipes can be
        generated by using the :class:`~workflowhub.trace.trace_analyzer.TraceAnalyzer`.

        Each task prefix maps to a ``runtime`` entry and to ``input`` and ``output``
        entries keyed by file name pattern; every entry holds ``min``, ``max`` and a
        ``distribution`` (a name with parameters, or the string ``"None"``).

        :return: A recipe in the form of a dictionary in which keys are task prefixes.
        :rtype: Dict[str, Any]
        """
        return {
            "split_fasta": {
                "runtime": {
                    "min": 0.051992,
                    "max": 3.160018,
                    "distribution": {
                        "name": "arcsine",
                        "params": [
                            -0.2258070520586602,
                            1.2258070520586604
                        ]
                    }
                },
                "input": {
                    ".fasta": {
                        "distribution": {
                            "name": "arcsine",
                            "params": [
                                -0.2258070520586602,
                                1.2258070520586604
                            ]
                        },
                        "min": 203,
                        "max": 201389
                    },
                    "split_fasta": {
                        "distribution": "None",
                        "min": 1,
                        "max": 1
                    }
                },
                "output": {
                    ".fasta": {
                        "distribution": {
                            "name": "trapz",
                            "params": [
                                0.9999999999999999,
                                1.0,
                                -0.10500000000000001,
                                1.1999999999999997
                            ]
                        },
                        "min": 6,
                        "max": 2015
                    }
                }
            },
            "blastall": {
                "runtime": {
                    "min": 8.116334,
                    "max": 1799.556624,
                    "distribution": {
                        "name": "trapz",
                        "params": [
                            1.0,
                            1.0,
                            -0.10500000000000001,
                            1.2
                        ]
                    }
                },
                "input": {
                    "blastall": {
                        "distribution": "None",
                        "min": 7688,
                        "max": 7688
                    },
                    "nt": {
                        "distribution": {
                            "name": "trapz",
                            "params": [
                                0.9999999999999999,
                                1.0,
                                -0.10500000000000001,
                                1.1999999999999997
                            ]
                        },
                        "min": 6927,
                        "max": 2152118512
                    }
                },
                "output": {
                    ".out": {
                        "distribution": {
                            "name": "argus",
                            "params": [
                                2.465535551931572e-05,
                                -0.7452662890705088,
                                1.7477663092998088
                            ]
                        },
                        "min": 5,
                        "max": 17952
                    },
                    ".err": {
                        "distribution": "None",
                        "min": 0,
                        "max": 0
                    },
                    ".fasta": {
                        "distribution": {
                            "name": "trapz",
                            "params": [
                                0.9999999999999999,
                                1.0,
                                -0.10500000000000001,
                                1.1999999999999997
                            ]
                        },
                        "min": 6,
                        "max": 2015
                    }
                }
            },
            "cat_blast": {
                "runtime": {
                    "min": 0.034811,
                    "max": 16.689957,
                    "distribution": {
                        "name": "arcsine",
                        "params": [
                            -0.2258070520586602,
                            1.2258070520586604
                        ]
                    }
                },
                "input": {
                    "cat_blast": {
                        "distribution": "None",
                        "min": 1,
                        "max": 1
                    }
                },
                "output": {
                    "None": {
                        "distribution": {
                            "name": "arcsine",
                            "params": [
                                -0.2258070520586602,
                                1.2258070520586604
                            ]
                        },
                        "min": 454,
                        "max": 565948
                    },
                    ".out": {
                        "distribution": {
                            "name": "argus",
                            "params": [
                                2.465535551931572e-05,
                                -0.7452662890705088,
                                1.7477663092998088
                            ]
                        },
                        "min": 5,
                        "max": 17952
                    }
                }
            },
            "cat": {
                "runtime": {
                    "min": 0.009596,
                    "max": 0.021895,
                    "distribution": {
                        "name": "arcsine",
                        "params": [
                            -0.2258070520586602,
                            1.2258070520586604
                        ]
                    }
                },
                "input": {

                },
                "output": {
                    ".err": {
                        "distribution": "None",
                        "min": 0,
                        "max": 0
                    }
                }
            }
        }
0 commit comments