Skip to content

Commit a1e5b5c

Browse files
Added code for montage validation
1 parent a957995 commit a1e5b5c

10 files changed

Lines changed: 455 additions & 21 deletions

File tree

wfcommons/wfperf/bench_plot.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
from argparse import ArgumentParser
2+
from datetime import timedelta
3+
import plotly.graph_objects as go
4+
import pathlib
5+
import pandas as pd
6+
from typing import Dict, Tuple
7+
import argparse
8+
import re
9+
import plotly.express as px
10+
import plotly.graph_objects as go
11+
12+
13+
this_dir = pathlib.Path(__file__).resolve().parent
14+
15+
def convert_time(text: str) -> float:
    """Convert a colon-separated clock string ("H:M:S", "M:S", or plain "S") to seconds.

    Fields are weighted by successive powers of 60, so "1:30" -> 90.0.
    """
    total = 0.0
    for field in str(text).split(":"):
        total = total * 60 + float(field)
    return total
17+
18+
def get_parser() -> argparse.ArgumentParser:
    """Build the CLI parser: a positional csv directory and an optional machine label."""
    cli = argparse.ArgumentParser()
    cli.add_argument("path", help="Path to the csv")
    cli.add_argument("-m", "--machine", help="Machine used")
    return cli
23+
24+
# Palette keyed by run label: "<cpu>_<mem>" encodes the cpu/memory thread
# split of a synthetic sysbench run, while "real" marks timings of the
# actual (non-benchmark) task execution.
colors = {
    "1_9": "#e69f00",
    "2_8": "#56b4e9",
    "3_7": "#009e73",
    "4_6": "#f0e442",
    "5_5": "#0072b2",
    "6_4": "#d55e00",
    "7_3": "#cc79a7",
    "8_2": "#000000",
    "9_1": "#aa3377",
    "real": "#332288"
}

# Candidate plotly marker symbols; currently unused (see the commented-out
# symbol/symbol_sequence arguments in main()).
symbols = ["circle", "square", "diamond", "cross", "x", "triangle-up", "triangle-down", "star", "hexagon", "pentagon"]
38+
def main():
    """Strip-plot benchmark vs. real task times from a directory of CSVs.

    Reads every ``*.csv`` under ``args.path`` (each must have ``time`` as a
    clock string and a ``type`` column), derives the task name from the file
    name, and writes ``<machine>_time_plot.png`` under ``new_test/<machine>/``.
    """
    parser = get_parser()
    args = parser.parse_args()
    path = pathlib.Path(args.path)
    machine = args.machine
    files = []

    for file in path.glob("*.csv"):
        # NOTE(review): "(^.+)?" can only match at the very start of the
        # string, so that group is effectively always empty and only the
        # leading "(.+)" determines `task` — confirm against real file names.
        task, _ = re.match(r"^(.+)_(^.+)?.*?_.+?.*?.*?$", str(file)).groups()
        task = pathlib.Path(task).stem
        df = pd.read_csv(str(file), index_col=0)
        df["task"] = task
        files.append(df)

    df_all = pd.concat(files)
    # Everything that isn't a "real" run was produced by sysbench.
    df_all["tool"] = df_all["type"]
    df_all.loc[df_all["tool"] != "real", "tool"] = "sysbench"

    df_all["time"] = df_all["time"].apply(convert_time)
    df_all["label"] = df_all["tool"]  # + "_" + df_all["server"]

    fig = px.strip(
        df_all,
        x="task", y="time",
        color="type",
        width=1500,
        height=750,
        color_discrete_map=colors,
        title=machine,
        category_orders={
            "task": sorted(df_all["task"].unique())
        }
        # symbol="tool",
        # symbol_sequence=symbols
    ).update_traces(
        marker={
            "size": 15,
            "line": {
                "width": 2,
                "color": "DarkSlateGrey"
            },
            # "symbol": symbols
        },
        jitter=1,
    )

    fig.update_layout(
        legend=dict(
            font_size=20
        ),
        font_size=30,
        yaxis_title="Time (s)",
        xaxis_title="Task"
    )

    fig.update_xaxes(
        tickangle=45
    )
    savedir = this_dir.joinpath(f"new_test/{machine}")
    # Bug fix: write_image fails when the target directory does not exist yet.
    savedir.mkdir(parents=True, exist_ok=True)
    fig.write_image(savedir.joinpath(f"{machine}_time_plot.png"))

if __name__ == '__main__':
    main()

wfcommons/wfperf/mean.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
import pandas as pd
2+
import pathlib
3+
import argparse
4+
import re
5+
6+
this_dir = pathlib.Path(__file__).resolve().parent
7+
8+
def find_mean(df: pd.Series) -> float:
    """Return the arithmetic mean of *df*.

    Delegates to pandas, so NaN values are skipped. The original annotation
    claimed a DataFrame return, but for a Series input this is a scalar.
    """
    return df.mean()
10+
11+
def convert_time(text: str) -> float:
    """Translate a "H:M:S"/"M:S"/"S" style string into a number of seconds."""
    seconds = 0.0
    weight = 1
    # Walk the fields right-to-left: seconds, then minutes, then hours.
    for chunk in reversed(str(text).split(":")):
        seconds += float(chunk) * weight
        weight *= 60
    return seconds
13+
14+
def percent_error(real_mean: float, bm_mean: float) -> float:
    """Signed percent error of the benchmark mean relative to the real mean."""
    delta = bm_mean - real_mean
    return (delta / real_mean) * 100
16+
17+
def get_parser() -> argparse.ArgumentParser:
    """CLI: positional csv path plus --ratio (cpu fraction) and --task options."""
    cmdline = argparse.ArgumentParser()
    cmdline.add_argument("path", help="Path to the csv")
    cmdline.add_argument("-r", "--ratio", help="Correct CPU percentage for task")
    cmdline.add_argument("-t", "--task", help="Task")

    return cmdline
24+
25+
def main():
    """Summarise benchmark error against the "real" runs for one task.

    For every csv under ``path``, compute the mean "real" time and the mean
    benchmark time for the requested cpu/mem thread split, then write one
    "<machine> <cpu> <mem> <real_mean> <bm_mean> <error%>" line per file to
    ``<path>/error/<task>_error.txt``.
    """
    parser = get_parser()
    args = parser.parse_args()
    path = pathlib.Path(args.path)
    # --ratio is the fraction of 10 worker threads that are cpu-bound,
    # e.g. 0.7 -> 7 cpu threads / 3 memory threads, labelled "7_3".
    ratio = float(args.ratio)
    cpu_thread = int(ratio*10)
    mem_thread = 10 - cpu_thread
    label = f'{cpu_thread}_{mem_thread}'

    lines = []
    for file in path.glob("*.csv"):
        df = pd.read_csv(file, index_col=0)
        df["time"] = df["time"].apply(convert_time)
        # Split rows into real executions vs. benchmark runs, indexed by type.
        real = df[df["type"] == "real"]
        real = real.set_index("type")
        bm = df[~(df["type"] == "real")]
        bm = bm.set_index("type")


        real_mean = round(float(find_mean(real["time"])), 3)
        # NOTE(review): bm.loc[label] selects whole rows; if the csv has more
        # numeric columns than "time", find_mean() averages across them and
        # float() would fail on a multi-element result — presumably "time" is
        # the only numeric column here. TODO confirm against the csv schema.
        bm_thread = bm.loc[label]
        bm_mean = round(float(find_mean(bm_thread)), 3)

        error = round(percent_error(real_mean, bm_mean), 3)

        savedir = file.parent.joinpath(f"error")
        savedir.mkdir(exist_ok=True, parents=True)

        # NOTE(review): "(^.+)?" cannot match mid-string without re.MULTILINE,
        # so group 1 is always None; only group 2 (machine) is actually used.
        _, machine = re.match(r"^.+_(^.+)?.*?_(.+).*?.*?$", str(file.stem), re.DOTALL).groups()
        lines.append(f'{machine} {cpu_thread} {mem_thread} {real_mean} {bm_mean} {error} \n')

    # Uses `savedir` from the last loop iteration; all files share the same
    # parent directory, so this is the common <path>/error folder.
    with savedir.joinpath(f"{args.task}_error.txt").open("w+") as fp:
        fp.writelines(lines)


if __name__ == "__main__":
    main()
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
from wfcommons.wfperf.montage_validation.montage_perf import WorkflowBenchmark
2+
from wfcommons.wfchef.recipes import MontageRecipe
3+
import pathlib
4+
5+
this_dir = pathlib.Path(__file__).resolve().parent
6+
7+
def total_tasks():
    """Total task count for a tripled Montage workflow.

    Every task type's single-workflow count is multiplied by three; mViewer
    gets one extra task on top — presumably a final combining step (TODO
    confirm intent with the workflow authors).
    """
    per_workflow = {'mProject': 64,
                    'mDiffFit': 2016,
                    'mConcatFit': 1,
                    'mBgModel': 1,
                    'mBackground': 64,
                    'mImgtbl': 1,
                    'mAdd': 1,
                    'mViewer': 1}

    total = 0
    for name, count in per_workflow.items():
        total += count * 3 + 1 if name == 'mViewer' else count * 3

    return total
27+
28+
def main():
    """Generate a Montage benchmark workflow sized by total_tasks()."""
    # sysbench parameters per task type: (cpu-max-prime, cpu-thread count
    # out of 10, run time in seconds).
    tasks = {'mProject': (12800000, 7, 120),
             'mDiffFit': (24900000, 7, 1),
             'mConcatFit': (24900000, 7, 5),
             'mBgModel': (1910000, 7, 120),
             'mBackground': (24900000, 7, 1),
             'mImgtbl': (24900000, 7, 2),
             'mAdd': (1050000, 6, 120),
             'mViewer': (7400000, 6, 120)}

    benchmark = WorkflowBenchmark(MontageRecipe, total_tasks())
    benchmark.create(this_dir.joinpath("Montage"), tasks, verbose=True)


if __name__ == "__main__":
    main()
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
from fractions import Fraction
2+
from wfcommons.wfgen.abstract_recipe import WorkflowRecipe
3+
from wfcommons import WorkflowGenerator
4+
from typing import Dict, Union, List, Type, Tuple
5+
from numpy.random import choice
6+
from wfcommons.wfperf.data_gen import generate_sys_data, cleanup_sys_files
7+
import pathlib
8+
import json
9+
import subprocess
10+
11+
class WorkflowBenchmark():
    """Builds a synthetic benchmark workflow whose jobs invoke sysbench.

    A workflow graph is generated from the given recipe; every job's command
    is rewritten to run ``sys_test.py`` with sysbench cpu/memory arguments
    taken from a per-task-type parameter table.
    """

    def __init__(self, Recipe: Type[WorkflowRecipe], num_tasks: int) -> None:
        # Recipe: WorkflowRecipe subclass used to synthesize the task graph.
        # num_tasks: total number of tasks the generated workflow should contain.
        self.Recipe = Recipe
        self.num_tasks = num_tasks

    def create(self,
               save_dir: pathlib.Path,
               tasks: Dict[str, Tuple[int, int, int]],
               mem_total_size: str = "1000T",
               block_size: str = "4096",
               verbose: bool = False) -> None:
        """Write the benchmark workflow JSON into *save_dir*.

        Args:
            save_dir: output directory (created if absent).
            tasks: maps a job-type name to ``(cpu_max_prime, percent_cpu, time)``
                — the original annotation said a 2-tuple, but three values are
                unpacked below.
            mem_total_size: sysbench ``--memory-total-size`` value.
            block_size: sysbench ``--memory-block-size`` value.
            verbose: print progress messages.

        Raises:
            FileNotFoundError: if the sysbench binary is not on PATH.
        """
        if verbose:
            print("Checking if the sysbench is installed.")
        self._check_sysbench()
        if verbose:
            print("Creating directory.")
        save_dir = pathlib.Path(save_dir).resolve()
        save_dir.mkdir(exist_ok=True, parents=True)

        if verbose:
            print("Generating workflow")
        generator = WorkflowGenerator(self.Recipe.from_num_tasks(self.num_tasks))
        workflow = generator.build_workflow()
        workflow.write_json(f'{save_dir.joinpath(workflow.name)}.json')

        with open(f'{save_dir.joinpath(workflow.name)}.json') as json_file:
            wf = json.load(json_file)

        # Pre-build the sysbench argument list for every task type.
        params = {
            job: [
                f"--memory-block-size={block_size}",
                f"--memory-total-size={mem_total_size}",
                f"--cpu-max-prime={cpu_max_prime}",
                f"--percent_cpu={percent_cpu}",
                "--forced-shutdown=0",
                f"--time={time}"
            ]
            for job, (cpu_max_prime, percent_cpu, time) in tasks.items()
        }

        # Strip each generated job of its file list and replace its command
        # with a sys_test.py invocation; the job-type key is the job name
        # minus the trailing "_<index>" suffix.
        for job in wf["workflow"]["jobs"]:
            job["files"] = []
            job.setdefault("command", {})
            job["command"]["program"] = "sys_test.py"
            job_name = job["name"].rsplit("_", 1)[0]
            job["command"]["arguments"] = params[job_name]

        with open(f'{save_dir.joinpath(workflow.name)}.json', 'w') as fp:
            json.dump(wf, fp, indent=4)

    def _check_sysbench(self):
        """Raise FileNotFoundError unless the sysbench binary is on PATH."""
        proc = subprocess.Popen(["which", "sysbench"], stdout=subprocess.PIPE)
        out, _ = proc.communicate()
        if not out:
            # Bug fix: corrected the upstream URL (the author is "akopytov").
            raise FileNotFoundError("Sysbench not found. Please install sysbench: https://github.com/akopytov/sysbench")

wfcommons/wfperf/real_run.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
#!/bin/bash
# Baseline timing: run the real 1000genome "individuals" task nine times,
# saving traces under new_test/real_individuals_<i>.
# NOTE(review): paths are hard-coded to tgcoleman's home directory.
for i in {1..9}; do echo "======= $i"; time /home/tgcoleman/1000genome-sequential/bin/individuals.py ALL.chr1.250000.vcf 1 1 1001 3000 -s /home/tgcoleman/wfcommons/wfcommons/wfperf/new_test/real_individuals_$i; done

wfcommons/wfperf/run.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
#!/bin/bash
# Benchmark sweep: run ten_samples.py "frequency" with cpu fractions 0.1..0.9,
# saving results under new_test/frequency.
# NOTE(review): the output path is hard-coded to tgcoleman's checkout.
for i in {1..9}; do echo "======= $i"; python ten_samples.py frequency -m 14990000 -p 0.$i -s /home/tgcoleman/wfcommons/wfcommons/wfperf/new_test/frequency; done

wfcommons/wfperf/sys_test.py

Lines changed: 18 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import subprocess
44
import os
55
import time
6+
from typing import List
67

78
def get_parser() -> argparse.ArgumentParser:
89
parser = argparse.ArgumentParser()
@@ -11,6 +12,7 @@ def get_parser() -> argparse.ArgumentParser:
1112
parser.add_argument("--percent-cpu", type=float, help="percent of threads which will be cpu heavy")
1213
parser.add_argument("--save", type=pathlib.Path, help="directory to save to.")
1314

15+
1416
return parser
1517

1618
def main():
@@ -27,49 +29,45 @@ def main():
2729

2830
with save_dir.joinpath(f"{name}_cpu.txt").open("w+") as fp_cpu, save_dir.joinpath(f"{name}_memory.txt").open("w+") as fp_mem, save_dir.joinpath(f"{name}_ps.txt").open("w+") as fp_ps:
2931
num_cores = 1 #os.cpu_count()
30-
cpu_threads = 1 #int(args.percent_cpu*10)
31-
mem_threads = 1 #int(10 - cpu_threads)
32-
32+
cpu_threads = int(args.percent_cpu*10)
33+
mem_threads = int(10 - cpu_threads)
34+
print(f"cpu_threads={cpu_threads}, mem_threads={mem_threads}")
3335

3436
print("Starting CPU benchmark...")
3537
sysbench_cpu_args = [arg for arg in other if arg.startswith("--cpu")] + [f"--threads={cpu_threads}"]
3638

39+
proc_cpus: List[subprocess.Popen] = []
40+
proc_mems: List[subprocess.Popen] = []
3741
for i in range(num_cores):
38-
39-
proc_cpu = subprocess.Popen(
42+
proc_cpus.append(subprocess.Popen(
4043
[
4144
"sysbench", "cpu",
4245
*sysbench_cpu_args, "run"
4346
],
4447
stdout=fp_cpu, stderr=fp_cpu,
45-
)
46-
47-
pid_1 = proc_cpu.pid
48-
print(pid_1)
49-
os.sched_setaffinity(pid_1, {i})
50-
48+
))
49+
os.sched_setaffinity(proc_cpus[-1].pid, {13})
5150

5251
print("Starting Memory benchmark...")
5352
sysbench_mem_args = [arg for arg in other if arg.startswith("--memory")] + [f"--time={args.time}", f"--threads={mem_threads}"]
54-
proc_mem = subprocess.Popen(
53+
proc_mems.append(subprocess.Popen(
5554
[
5655
"sysbench", "memory","run",
5756
*sysbench_mem_args
5857
],
5958
stdout=fp_mem, stderr=fp_mem
60-
)
59+
))
6160

62-
pid_2 = proc_mem.pid
63-
print(pid_2)
64-
os.sched_setaffinity(pid_2, {i})
61+
os.sched_setaffinity(proc_mems[-1].pid, {13})
6562

6663
proc = subprocess.Popen(["ps", "-o","pid,psr,comm,lstart"], stdout=fp_ps)
6764
proc.wait()
68-
proc_cpu.wait()
69-
print(time.time)
70-
subprocess.Popen(["killall", "sysbench"])
65+
for proc_cpu in proc_cpus:
66+
proc_cpu.wait()
67+
for proc_mem in proc_mems:
68+
proc_mem.kill()
7169

7270

7371

7472
if __name__ == "__main__":
75-
main()
73+
main()

0 commit comments

Comments
 (0)