Skip to content

Commit 4db8015

Browse files
committed
(#12) adding generator for BLAST workflow
1 parent fd73cbf commit 4db8015

3 files changed

Lines changed: 370 additions & 2 deletions

File tree

workflowhub/generator/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,5 @@
99
# (at your option) any later version.
1010

1111
from .generator import WorkflowGenerator
12-
from .workflow import CyclesRecipe, EpigenomicsRecipe, GenomeRecipe, MontageRecipe, MontageDataset, SeismologyRecipe, \
13-
SoyKBRecipe, SRASearchRecipe
12+
from .workflow import BLASTRecipe, CyclesRecipe, EpigenomicsRecipe, GenomeRecipe, MontageRecipe, MontageDataset, \
13+
SeismologyRecipe, SoyKBRecipe, SRASearchRecipe

workflowhub/generator/workflow/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
# the Free Software Foundation, either version 3 of the License, or
99
# (at your option) any later version.
1010

11+
from .blast_recipe import BLASTRecipe
1112
from .cycles_recipe import CyclesRecipe
1213
from .epigenomics_recipe import EpigenomicsRecipe
1314
from .genome_recipe import GenomeRecipe
Lines changed: 367 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,367 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
#
4+
# Copyright (c) 2021 The WorkflowHub Team.
5+
#
6+
# This program is free software: you can redistribute it and/or modify
7+
# it under the terms of the GNU General Public License as published by
8+
# the Free Software Foundation, either version 3 of the License, or
9+
# (at your option) any later version.
10+
11+
from typing import Dict, Optional
12+
13+
from .abstract_recipe import WorkflowRecipe
14+
from ...common.file import FileLink
15+
from ...common.workflow import Workflow
16+
17+
18+
class BLASTRecipe(WorkflowRecipe):
    """A BLAST workflow recipe class for creating synthetic workflow traces.

    The workflow built by :meth:`build_workflow` has one ``split_fasta`` task,
    one ``blastall`` task per output file of ``split_fasta``, and two final
    tasks (``cat_blast`` and ``cat``) that depend on every ``blastall`` task.

    :param num_subsample: The number of subsamples into which the reference file will be split.
    :type num_subsample: int
    :param data_footprint: The upper bound for the workflow total data footprint (in bytes).
    :type data_footprint: int
    :param num_tasks: The upper bound for the total number of tasks in the workflow.
    :type num_tasks: int
    :param runtime_factor: The factor by which task runtimes will be increased/decreased.
    :type runtime_factor: float
    :param input_file_size_factor: The factor by which task input file sizes will be increased/decreased.
    :type input_file_size_factor: float
    :param output_file_size_factor: The factor by which task output file sizes will be increased/decreased.
    :type output_file_size_factor: float
    """

    def __init__(self,
                 num_subsample: Optional[int] = 2,
                 data_footprint: Optional[int] = 0,
                 num_tasks: Optional[int] = 5,
                 runtime_factor: Optional[float] = 1.0,
                 input_file_size_factor: Optional[float] = 1.0,
                 output_file_size_factor: Optional[float] = 1.0
                 ) -> None:
        """Create an object of the BLAST workflow recipe."""
        super().__init__("BLAST",
                         data_footprint,
                         num_tasks,
                         runtime_factor,
                         input_file_size_factor,
                         output_file_size_factor)

        # Number of '.fasta' output files requested from the split_fasta task.
        self.num_subsample: int = num_subsample

    @classmethod
    def from_num_tasks(cls,
                       num_tasks: int,
                       runtime_factor: Optional[float] = 1.0,
                       input_file_size_factor: Optional[float] = 1.0,
                       output_file_size_factor: Optional[float] = 1.0
                       ) -> 'BLASTRecipe':
        """
        Instantiate a BLAST workflow recipe that will generate synthetic workflows up to
        the total number of tasks provided.

        :param num_tasks: The upper bound for the total number of tasks in the workflow (at least 5).
        :type num_tasks: int
        :param runtime_factor: The factor by which task runtimes will be increased/decreased.
        :type runtime_factor: float
        :param input_file_size_factor: The factor by which task input file sizes will be increased/decreased.
        :type input_file_size_factor: float
        :param output_file_size_factor: The factor by which task output file sizes will be increased/decreased.
        :type output_file_size_factor: float

        :raises ValueError: If ``num_tasks`` is lower than 5.

        :return: A BLAST workflow recipe object that will generate synthetic workflows up
                 to the total number of tasks provided.
        :rtype: BLASTRecipe
        """
        # Three tasks are always present (split_fasta, cat_blast and cat); the
        # remaining ones are blastall tasks, one per subsample. Since at least
        # 2 subsamples are required (see from_num_subsample), the smallest
        # valid workflow has 5 tasks.
        if num_tasks < 5:
            raise ValueError("The upper bound for the number of tasks should be at least 5.")

        return cls(num_subsample=int(num_tasks - 3),
                   data_footprint=None,
                   num_tasks=num_tasks,
                   runtime_factor=runtime_factor,
                   input_file_size_factor=input_file_size_factor,
                   output_file_size_factor=output_file_size_factor)

    @classmethod
    def from_num_subsample(cls,
                           num_subsample: int,
                           runtime_factor: Optional[float] = 1.0,
                           input_file_size_factor: Optional[float] = 1.0,
                           output_file_size_factor: Optional[float] = 1.0
                           ) -> 'BLASTRecipe':
        """
        Instantiate a BLAST workflow recipe that will generate synthetic workflows using
        the defined number of subsamples.

        :param num_subsample: The number of subsamples into which the reference file will be split (at least 2).
        :type num_subsample: int
        :param runtime_factor: The factor by which task runtimes will be increased/decreased.
        :type runtime_factor: float
        :param input_file_size_factor: The factor by which task input file sizes will be increased/decreased.
        :type input_file_size_factor: float
        :param output_file_size_factor: The factor by which task output file sizes will be increased/decreased.
        :type output_file_size_factor: float

        :raises ValueError: If ``num_subsample`` is lower than 2.

        :return: A BLAST workflow recipe object that will generate synthetic workflows
                 using the defined number of subsamples.
        :rtype: BLASTRecipe
        """
        if num_subsample < 2:
            raise ValueError("The number of subsample should be at least 2.")

        return cls(num_subsample=num_subsample,
                   data_footprint=None,
                   num_tasks=None,
                   runtime_factor=runtime_factor,
                   input_file_size_factor=input_file_size_factor,
                   output_file_size_factor=output_file_size_factor)

    def build_workflow(self, workflow_name: Optional[str] = None) -> Workflow:
        """
        Generate a synthetic workflow trace of a BLAST workflow.

        :param workflow_name: The workflow name. When empty or ``None``, the
                              recipe name followed by ``-synthetic-trace`` is used.
        :type workflow_name: str

        :return: A synthetic workflow trace object.
        :rtype: Workflow
        """
        workflow = Workflow(name=self.name + '-synthetic-trace' if not workflow_name else workflow_name, makespan=None)
        # Reset the counter so every build starts from 1.
        # NOTE(review): presumably consumed by the inherited
        # _generate_task_name helper -- confirm in WorkflowRecipe.
        self.task_id_counter: int = 1

        # split_fasta task
        task_name = self._generate_task_name('split_fasta')
        split_fasta_task = self._generate_task('split_fasta', task_name,
                                               files_recipe={FileLink.OUTPUT: {'.fasta': self.num_subsample}})
        workflow.add_node(task_name, task=split_fasta_task)

        out_files = []
        err_files = []
        blastall_tasks = []

        # blastall tasks: one per output file of the split_fasta task
        for f in split_fasta_task.files:
            if f.link == FileLink.OUTPUT:
                task_name = self._generate_task_name('blastall')
                blastall_task = self._generate_task('blastall', task_name, [f])
                workflow.add_node(task_name, task=blastall_task)
                workflow.add_edge(split_fasta_task.name, task_name)
                blastall_tasks.append(task_name)
                # Collect the '.out' / '.err' outputs; they become the inputs
                # of the cat_blast and cat tasks, respectively.
                for file in blastall_task.files:
                    if file.link == FileLink.OUTPUT and '.out' in file.name:
                        out_files.append(file)
                    if file.link == FileLink.OUTPUT and '.err' in file.name:
                        err_files.append(file)

        # cat_blast task
        task_name = self._generate_task_name('cat_blast')
        cat_blast_task = self._generate_task('cat_blast', task_name,
                                             input_files=out_files,
                                             files_recipe={FileLink.OUTPUT: {'.out': self.num_subsample}})
        workflow.add_node(task_name, task=cat_blast_task)

        # cat task
        task_name = self._generate_task_name('cat')
        cat_task = self._generate_task('cat', task_name,
                                       input_files=err_files,
                                       files_recipe={FileLink.OUTPUT: {'.err': self.num_subsample}})
        workflow.add_node(task_name, task=cat_task)

        # Both final tasks depend on every blastall task.
        for t in blastall_tasks:
            workflow.add_edge(t, cat_blast_task.name)
            workflow.add_edge(t, cat_task.name)

        return workflow

    def _workflow_recipe(self) -> Dict:
        """
        Recipe for generating synthetic traces of the BLAST workflow. Recipes can be
        generated by using the :class:`~workflowhub.trace.trace_analyzer.TraceAnalyzer`.

        :return: A recipe in the form of a dictionary in which keys are task prefixes.
        :rtype: Dict[str, Any]
        """
        return {
            "split_fasta": {
                "runtime": {
                    "min": 0.051992,
                    "max": 3.160018,
                    "distribution": {
                        "name": "arcsine",
                        "params": [
                            -0.2258070520586602,
                            1.2258070520586604
                        ]
                    }
                },
                "input": {
                    ".fasta": {
                        "distribution": {
                            "name": "arcsine",
                            "params": [
                                -0.2258070520586602,
                                1.2258070520586604
                            ]
                        },
                        "min": 203,
                        "max": 201389
                    },
                    "split_fasta": {
                        "distribution": "None",
                        "min": 1,
                        "max": 1
                    }
                },
                "output": {
                    ".fasta": {
                        "distribution": {
                            "name": "trapz",
                            "params": [
                                0.9999999999999999,
                                1.0,
                                -0.10500000000000001,
                                1.1999999999999997
                            ]
                        },
                        "min": 6,
                        "max": 2015
                    }
                }
            },
            "blastall": {
                "runtime": {
                    "min": 8.116334,
                    "max": 1799.556624,
                    "distribution": {
                        "name": "trapz",
                        "params": [
                            1.0,
                            1.0,
                            -0.10500000000000001,
                            1.2
                        ]
                    }
                },
                "input": {
                    "blastall": {
                        "distribution": "None",
                        "min": 7688,
                        "max": 7688
                    },
                    "nt": {
                        "distribution": {
                            "name": "trapz",
                            "params": [
                                0.9999999999999999,
                                1.0,
                                -0.10500000000000001,
                                1.1999999999999997
                            ]
                        },
                        "min": 6927,
                        "max": 2152118512
                    }
                },
                "output": {
                    ".out": {
                        "distribution": {
                            "name": "argus",
                            "params": [
                                2.465535551931572e-05,
                                -0.7452662890705088,
                                1.7477663092998088
                            ]
                        },
                        "min": 5,
                        "max": 17952
                    },
                    ".err": {
                        "distribution": "None",
                        "min": 0,
                        "max": 0
                    },
                    ".fasta": {
                        "distribution": {
                            "name": "trapz",
                            "params": [
                                0.9999999999999999,
                                1.0,
                                -0.10500000000000001,
                                1.1999999999999997
                            ]
                        },
                        "min": 6,
                        "max": 2015
                    }
                }
            },
            "cat_blast": {
                "runtime": {
                    "min": 0.034811,
                    "max": 16.689957,
                    "distribution": {
                        "name": "arcsine",
                        "params": [
                            -0.2258070520586602,
                            1.2258070520586604
                        ]
                    }
                },
                "input": {
                    "cat_blast": {
                        "distribution": "None",
                        "min": 1,
                        "max": 1
                    }
                },
                "output": {
                    "None": {
                        "distribution": {
                            "name": "arcsine",
                            "params": [
                                -0.2258070520586602,
                                1.2258070520586604
                            ]
                        },
                        "min": 454,
                        "max": 565948
                    },
                    ".out": {
                        "distribution": {
                            "name": "argus",
                            "params": [
                                2.465535551931572e-05,
                                -0.7452662890705088,
                                1.7477663092998088
                            ]
                        },
                        "min": 5,
                        "max": 17952
                    }
                }
            },
            "cat": {
                "runtime": {
                    "min": 0.009596,
                    "max": 0.021895,
                    "distribution": {
                        "name": "arcsine",
                        "params": [
                            -0.2258070520586602,
                            1.2258070520586604
                        ]
                    }
                },
                "input": {

                },
                "output": {
                    ".err": {
                        "distribution": "None",
                        "min": 0,
                        "max": 0
                    }
                }
            }
        }

0 commit comments

Comments
 (0)