Skip to content

Commit 07f12ad

Browse files
committed
analysis: magma
Signed-off-by: vsoch <vsoch@users.noreply.github.com>
1 parent efb5ea8 commit 07f12ad

17 files changed

Lines changed: 368342 additions & 21 deletions

analysis/magma/1-run-analysis.py

Lines changed: 272 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,272 @@
1+
#!/usr/bin/env python3
2+
3+
import argparse
4+
import collections
5+
import os
6+
import re
7+
import sys
8+
9+
import seaborn as sns
10+
import matplotlib.pylab as plt
11+
import numpy as np
12+
13+
here = os.path.dirname(os.path.abspath(__file__))
14+
analysis_root = os.path.dirname(here)
15+
root = os.path.dirname(analysis_root)
16+
sys.path.insert(0, analysis_root)
17+
18+
import performance_study as ps
19+
20+
sns.set_theme(style="whitegrid", palette="pastel")
21+
22+
23+
def get_parser():
    """Construct the command line parser for this analysis script."""
    p = argparse.ArgumentParser(
        description="Run analysis",
        formatter_class=argparse.RawTextHelpFormatter,
    )
    # Experiment inputs live under <repo-root>/experiments unless overridden
    p.add_argument(
        "--root",
        help="root directory with experiments",
        default=os.path.join(root, "experiments"),
    )
    # Parsed CSV/JSON data and generated images are written here
    p.add_argument(
        "--out",
        help="directory to save parsed results",
        default=os.path.join(here, "data"),
    )
    return p
39+
40+
41+
def main():
    """
    Find application result files to parse.
    """
    args, _ = get_parser().parse_known_args()

    # Output images and data
    outdir = os.path.abspath(args.out)
    indir = os.path.abspath(args.root)
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # Find input directories
    files = ps.find_inputs(indir, "magma")
    if not files:
        raise ValueError(f"There are no input files in {indir}")

    # Saves raw data to files (json has parsed, and df csv has just duration / wrapper)
    df, results = parse_data(indir, outdir, files)
    plot_results(results, df, outdir)
62+
63+
64+
def parse_magma(item, filename, exp):
    """
    Parse rows of results from magma output.

    Result lines look like (CPU columns are "---" because the CPU
    comparison was not run):

        % BatchCount M N K MAGMA Gflop/s (ms) CPU Gflop/s (ms) MAGMA error
          300        32 32 31  0.53 ( 5.11)   ---  ( --- )    ---

    Parameters:
        item (str): full text content of one result file
        filename (str): path of the result file (used for progress reporting)
        exp: experiment name parser exposing .prefix and .size

    Returns:
        list[dict]: one record per result line with the problem size,
        gflops/second, milliseconds, and experiment metadata.

    Raises:
        ValueError: if a result line has a batch count other than 300.
    """
    results = []
    for line in item.split("\n"):
        # Use the empty cpu metadata as a marker of a result line
        if "( --- )" not in line:
            continue
        # Strip parentheses so the (ms) value tokenizes on whitespace
        parts = [x for x in re.sub("([(]|[)])", "", line).split(" ") if x]
        if parts[0] != "300":
            raise ValueError(f"Found unexpected batch count {parts[0]}, should be 300.")
        problem_size = f"{parts[1]}x{parts[2]}x{parts[3]}"
        results.append(
            {
                "problem_size": problem_size,
                "gflops_per_second": float(parts[4]),
                "ms": float(parts[5]),
                "exp": exp.prefix,
                "size": exp.size,
            }
        )
    # Bug fix: report the file actually parsed; the filename parameter was
    # previously unused and the message printed the literal "(unknown)".
    print(f"File {filename} has {len(results)} results.")
    return results
90+
91+
92+
def parse_data(indir, outdir, files):
    """
    Parse filepaths for environment, etc., and results files for data.
    """
    # metrics here will be wall time and wrapped time
    parser = ps.ProblemSizeParser("magma")

    # For flux we can save jobspecs and other event data
    data = {}

    # This data is HUGE so we will organize by environment, size, then metric
    results = {}

    # It's important to just parse raw data once, and then use intermediate
    for filename in files:
        # Underscore means skip, also skip configs and runs without efa
        # runs with google and shared memory were actually slower...
        if ps.skip_result(os.path.basename(filename), filename):
            continue

        # Note that aws eks has kripke-8gpu directories, that just
        # distinguishes when we ran a first set of runs just with 8 and
        # then had the larger cluster working. Both data are good.
        # All of these are consistent across studies
        exp = ps.ExperimentNameParser(filename, indir)
        data.setdefault(exp.prefix, [])

        # Size 2 was typically testing
        if exp.size == 2:
            continue
        results.setdefault(exp.size, {})

        # Set the parsing context for the result data frame
        parser.set_context(exp.cloud, exp.env, exp.env_type, exp.size)
        exp.show()

        # Now we can read each result file to get metrics.
        for result_file in list(ps.get_outfiles(filename)):
            # Basename that start with underscore are test or otherwise should not be included
            if os.path.basename(result_file).startswith("_"):
                continue

            # If we are running in an environment that had two jobs, check for result file name.
            # the vbatched one is what we want!
            gpu_paths = ("eks/gpu", "aks/gpu", "gke/gpu", "compute-engine/gpu")
            if any(path in result_file for path in gpu_paths):
                if "vbatched" not in result_file:
                    continue
            content = ps.read_file(result_file)

            # If this is a flux run, we have a jobspec and events here
            if "JOBSPEC" in content:
                content, duration, metadata = ps.parse_flux_metadata(content)
                data[exp.prefix].append(metadata)

            # Slurm has the item output, and then just the start/end of the job
            else:
                metadata = {}
                duration = ps.parse_slurm_duration(content)
                content = ps.remove_slurm_duration(content)

            # Accumulate metric lists keyed size -> problem_size -> experiment
            for row in parse_magma(content, result_file, exp):
                by_problem = results[row["size"]].setdefault(row["problem_size"], {})
                metrics = by_problem.setdefault(
                    row["exp"], {"gflops_per_second": [], "ms": []}
                )
                metrics["gflops_per_second"].append(row["gflops_per_second"])
                metrics["ms"].append(row["ms"])
            parser.add_result("workload_manager_wrapper_seconds", duration, "all")

    print("Done parsing magma results!")
    # This just has the magma durations
    parser.df.to_csv(os.path.join(outdir, "magma-durations.csv"))
    ps.write_json(data, os.path.join(outdir, "magma-parsed.json"))
    ps.write_json(results, os.path.join(outdir, "magma-data-parsed.json"))
    return parser.df, results
181+
182+
183+
def _boxplot_by_environment(vectors, colors, size_list, title, save_path):
    """
    Draw grouped boxplots (problem sizes across x, one offset group of boxes
    per cloud environment) and save the figure to save_path.
    """
    plt.figure(figsize=(10, 6))
    # Fixed horizontal offsets, one slot per environment group
    offsets = [-0.75, -0.5, -0.25, 0.25, 0.5, 0.75]
    for experiment, values in vectors.items():
        positions = np.array(np.arange(len(values))) * 2.0 + offsets.pop(0)
        plot = plt.boxplot(
            values,
            positions=positions,
            widths=0.3,
            patch_artist=True,
            showfliers=False,
        )
        ps.set_group_color_properties(plot, colors.pop(0), experiment)

    # set the x label values, the sizes
    plt.xticks(
        np.arange(0, len(size_list) * 2, 2), size_list, rotation=45, fontsize=6
    )
    plt.title(title)
    plt.tight_layout()
    plt.savefig(save_path)
    plt.close()


def plot_results(results, df, outdir):
    """
    Plot analysis results
    """
    # Make an image outdir
    img_outdir = os.path.join(outdir, "img")
    if not os.path.exists(img_outdir):
        os.makedirs(img_outdir)

    # Within a setup, compare between experiments for GPU and cpu
    for nodes, problem_sizes in results.items():
        size_list = list(problem_sizes.keys())

        # Parse gflops/s and ms at the same time.
        # For each size, we want a plot that has problem sizes on x, and the
        # metric on y, colored by the full environment prefix.
        vectors = {}
        ms_vectors = {}
        for size in size_list:
            for cloud_env, metrics in problem_sizes[size].items():
                # Truncate the size - we just need <cloud>/<env>/<env_type>
                experiment = os.path.dirname(cloud_env)
                vectors.setdefault(experiment, []).append(metrics["gflops_per_second"])
                ms_vectors.setdefault(experiment, []).append(metrics["ms"])

        # Gflop/s, one boxplot group per environment, colored by environment
        _boxplot_by_environment(
            vectors,
            sns.color_palette("hls", 6).as_hex(),
            size_list,
            f"Magma Gflop/s Size {nodes}",
            os.path.join(img_outdir, f"magma-gflop-per-second-size-{nodes}.png"),
        )

        # Milliseconds, same layout with a hand-picked palette
        _boxplot_by_environment(
            ms_vectors,
            ["#003f5c", "#58508d", "#bc5090", "#de5a79", "#ff6361", "#ffa600"],
            size_list,
            f"Magma Milliseconds Size {nodes}",
            os.path.join(img_outdir, f"magma-milliseconds-size-{nodes}.png"),
        )
269+
270+
271+
# Script entry point
if __name__ == "__main__":
    main()

analysis/magma/README.md

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# Magma Analysis
2+
3+
For each run we are interested in parsing the vbatched result, which was run across environments with `--ngpus 1`. Although the runs were slightly different (on CycleCloud, etc., the `CUDA_VISIBLE_DEVICES` was set to all of them, and on different Kubernetes environments it was set by the workload manager, meaning in practice we see device 1 for the latter and across devices for the former) we should still be able to compare the performance of individual GPUs. Given a result that looks like this:
4+
5+
```console
6+
% transA = No transpose, transB = No transpose
7+
% max max max
8+
% BatchCount M N K MAGMA Gflop/s (ms) CPU Gflop/s (ms) MAGMA error
9+
%===================================================================================
10+
300 64 64 63 40.78 ( 0.46) --- ( --- ) ---
11+
300 32 32 31 0.00 ( 627.04) --- ( --- ) ---
12+
300 64 64 63 61.24 ( 0.31) --- ( --- ) ---
13+
300 32 32 31 0.00 ( 628.92) --- ( --- ) ---
14+
300 64 64 63 43.20 ( 0.44) --- ( --- ) ---
15+
300 32 32 31 0.00 ( 587.99) --- ( --- ) ---
16+
300 64 64 63 57.02 ( 0.33) --- ( --- ) ---
17+
```
18+
19+
We are going to plot the ms and Gflops/s separately. Each "chunk size" (the group of 3 including M, N, and K) will be assembled into a boxplot. While we shouldn't see variation across cluster sizes (the GPUs are not communicating) we will first plot them separately to see if there are any differences.
20+
21+
```bash
22+
pip install -r requirements.txt
23+
```
24+
25+
Then:
26+
27+
```bash
28+
python 1-run-analysis.py
29+
```
30+
31+
## Results
32+
33+
These are currently split up by size, because that gives somewhat more granularity. They don't need to be.
34+
35+
### Gflops/Second
36+
37+
This seems to be the metric of interest. Azure (for both AKS and CycleCloud) has higher values, and greater variability. But also, CycleCloud was run differently (across devices) and the others on just one device, and it's not clear what kind of impact that might have. Separation seems to start at the 224x problem size.
38+
39+
#### Size 4
40+
41+
![data/img/magma-gflop-per-second-size-4.png](data/img/magma-gflop-per-second-size-4.png)
42+
43+
#### Size 8
44+
45+
![data/img/magma-gflop-per-second-size-8.png](data/img/magma-gflop-per-second-size-8.png)
46+
47+
#### Size 16
48+
49+
![data/img/magma-gflop-per-second-size-16.png](data/img/magma-gflop-per-second-size-16.png)
50+
51+
#### Size 32
52+
53+
Note that we could not do any runs on EKS size 32, as we couldn't get the nodes.
54+
55+
![data/img/magma-gflop-per-second-size-32.png](data/img/magma-gflop-per-second-size-32.png)
56+
57+
### Milliseconds
58+
59+
I'm not actually sure what this is measuring - it seems to only spike up at the smallest "problem size" chunk, and only for a few environments. It's not clear if we removed these outliers if there would be meaningful differences down in the squashed data.
60+
61+
#### Size 4
62+
63+
![data/img/magma-milliseconds-size-4.png](data/img/magma-milliseconds-size-4.png)
64+
65+
#### Size 8
66+
67+
![data/img/magma-milliseconds-size-8.png](data/img/magma-milliseconds-size-8.png)
68+
69+
#### Size 16
70+
71+
![data/img/magma-milliseconds-size-16.png](data/img/magma-milliseconds-size-16.png)
72+
73+
#### Size 32
74+
75+
![data/img/magma-milliseconds-size-32.png](data/img/magma-milliseconds-size-32.png)
76+
59.7 KB
Loading
57.1 KB
Loading
57.9 KB
Loading
58.1 KB
Loading
47.8 KB
Loading
46.6 KB
Loading
49.5 KB
Loading
51.9 KB
Loading

0 commit comments

Comments
 (0)