Skip to content

Commit 07f12ad

Browse files
committed
analysis: magma
Signed-off-by: vsoch <vsoch@users.noreply.github.com>
1 parent efb5ea8 commit 07f12ad

17 files changed

Lines changed: 368342 additions & 21 deletions

analysis/magma/1-run-analysis.py

Lines changed: 272 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,272 @@
1+
#!/usr/bin/env python3
2+
3+
import argparse
4+
import collections
5+
import os
6+
import re
7+
import sys
8+
9+
import seaborn as sns
10+
import matplotlib.pylab as plt
11+
import numpy as np
12+
13+
here = os.path.dirname(os.path.abspath(__file__))
14+
analysis_root = os.path.dirname(here)
15+
root = os.path.dirname(analysis_root)
16+
sys.path.insert(0, analysis_root)
17+
18+
import performance_study as ps
19+
20+
sns.set_theme(style="whitegrid", palette="pastel")
21+
22+
23+
def get_parser():
    """Construct the command line parser for this analysis script."""
    p = argparse.ArgumentParser(
        description="Run analysis",
        formatter_class=argparse.RawTextHelpFormatter,
    )
    # Experiment inputs live under <repo-root>/experiments unless overridden
    p.add_argument(
        "--root",
        help="root directory with experiments",
        default=os.path.join(root, "experiments"),
    )
    # Parsed CSV/JSON data and generated images are written here
    p.add_argument(
        "--out",
        help="directory to save parsed results",
        default=os.path.join(here, "data"),
    )
    return p
39+
40+
41+
def main():
    """
    Find application result files to parse.
    """
    args, _ = get_parser().parse_known_args()

    # Output images and data
    outdir = os.path.abspath(args.out)
    indir = os.path.abspath(args.root)
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # Find input directories
    files = ps.find_inputs(indir, "magma")
    if not files:
        raise ValueError(f"There are no input files in {indir}")

    # Saves raw data to files (json has parsed, and df csv has just duration / wrapper)
    df, results = parse_data(indir, outdir, files)
    plot_results(results, df, outdir)
62+
63+
64+
def parse_magma(item, filename, exp):
    """
    Parse rows of results from magma output.

    Result lines look like (CPU columns are "---" because the CPU
    comparison was not run):

        % BatchCount M N K MAGMA Gflop/s (ms) CPU Gflop/s (ms) MAGMA error
          300        32 32 31  0.53 ( 5.11)   ---  ( --- )    ---

    Parameters:
        item (str): full text content of one result file
        filename (str): path of the result file (used for progress reporting)
        exp: experiment name parser exposing .prefix and .size

    Returns:
        list[dict]: one record per result line with the problem size,
        gflops/second, milliseconds, and experiment metadata.

    Raises:
        ValueError: if a result line has a batch count other than 300.
    """
    results = []
    for line in item.split("\n"):
        # Use the empty cpu metadata as a marker of a result line
        if "( --- )" not in line:
            continue
        # Strip parentheses so the (ms) value tokenizes on whitespace
        parts = [x for x in re.sub("([(]|[)])", "", line).split(" ") if x]
        if parts[0] != "300":
            raise ValueError(f"Found unexpected batch count {parts[0]}, should be 300.")
        problem_size = f"{parts[1]}x{parts[2]}x{parts[3]}"
        results.append(
            {
                "problem_size": problem_size,
                "gflops_per_second": float(parts[4]),
                "ms": float(parts[5]),
                "exp": exp.prefix,
                "size": exp.size,
            }
        )
    # Bug fix: report the file actually parsed; the filename parameter was
    # previously unused and the message printed the literal "(unknown)".
    print(f"File {filename} has {len(results)} results.")
    return results
90+
91+
92+
def parse_data(indir, outdir, files):
    """
    Parse filepaths for environment, etc., and results files for data.
    """
    # metrics here will be wall time and wrapped time
    parser = ps.ProblemSizeParser("magma")

    # For flux we can save jobspecs and other event data
    data = {}

    # This data is HUGE so we will organize by environment, size, then metric
    results = {}

    # It's important to just parse raw data once, and then use intermediate
    for filename in files:
        # Underscore means skip, also skip configs and runs without efa
        # runs with google and shared memory were actually slower...
        if ps.skip_result(os.path.basename(filename), filename):
            continue

        # Note that aws eks has kripke-8gpu directories, that just
        # distinguishes when we ran a first set of runs just with 8 and
        # then had the larger cluster working. Both data are good.
        # All of these are consistent across studies
        exp = ps.ExperimentNameParser(filename, indir)
        data.setdefault(exp.prefix, [])

        # Size 2 was typically testing
        if exp.size == 2:
            continue
        results.setdefault(exp.size, {})

        # Set the parsing context for the result data frame
        parser.set_context(exp.cloud, exp.env, exp.env_type, exp.size)
        exp.show()

        # Now we can read each result file to get metrics.
        for result_file in list(ps.get_outfiles(filename)):
            # Basename that start with underscore are test or otherwise should not be included
            if os.path.basename(result_file).startswith("_"):
                continue

            # If we are running in an environment that had two jobs, check for result file name.
            # the vbatched one is what we want!
            gpu_paths = ("eks/gpu", "aks/gpu", "gke/gpu", "compute-engine/gpu")
            if any(path in result_file for path in gpu_paths):
                if "vbatched" not in result_file:
                    continue
            content = ps.read_file(result_file)

            # If this is a flux run, we have a jobspec and events here
            if "JOBSPEC" in content:
                content, duration, metadata = ps.parse_flux_metadata(content)
                data[exp.prefix].append(metadata)

            # Slurm has the item output, and then just the start/end of the job
            else:
                metadata = {}
                duration = ps.parse_slurm_duration(content)
                content = ps.remove_slurm_duration(content)

            # Accumulate metric lists keyed size -> problem_size -> experiment
            for row in parse_magma(content, result_file, exp):
                by_problem = results[row["size"]].setdefault(row["problem_size"], {})
                metrics = by_problem.setdefault(
                    row["exp"], {"gflops_per_second": [], "ms": []}
                )
                metrics["gflops_per_second"].append(row["gflops_per_second"])
                metrics["ms"].append(row["ms"])
            parser.add_result("workload_manager_wrapper_seconds", duration, "all")

    print("Done parsing magma results!")
    # This just has the magma durations
    parser.df.to_csv(os.path.join(outdir, "magma-durations.csv"))
    ps.write_json(data, os.path.join(outdir, "magma-parsed.json"))
    ps.write_json(results, os.path.join(outdir, "magma-data-parsed.json"))
    return parser.df, results
181+
182+
183+
def _boxplot_by_environment(vectors, colors, size_list, title, save_path):
    """
    Draw grouped boxplots (problem sizes across x, one offset group of boxes
    per cloud environment) and save the figure to save_path.
    """
    plt.figure(figsize=(10, 6))
    # Fixed horizontal offsets, one slot per environment group
    offsets = [-0.75, -0.5, -0.25, 0.25, 0.5, 0.75]
    for experiment, values in vectors.items():
        positions = np.array(np.arange(len(values))) * 2.0 + offsets.pop(0)
        plot = plt.boxplot(
            values,
            positions=positions,
            widths=0.3,
            patch_artist=True,
            showfliers=False,
        )
        ps.set_group_color_properties(plot, colors.pop(0), experiment)

    # set the x label values, the sizes
    plt.xticks(
        np.arange(0, len(size_list) * 2, 2), size_list, rotation=45, fontsize=6
    )
    plt.title(title)
    plt.tight_layout()
    plt.savefig(save_path)
    plt.close()


def plot_results(results, df, outdir):
    """
    Plot analysis results
    """
    # Make an image outdir
    img_outdir = os.path.join(outdir, "img")
    if not os.path.exists(img_outdir):
        os.makedirs(img_outdir)

    # Within a setup, compare between experiments for GPU and cpu
    for nodes, problem_sizes in results.items():
        size_list = list(problem_sizes.keys())

        # Parse gflops/s and ms at the same time.
        # For each size, we want a plot that has problem sizes on x, and the
        # metric on y, colored by the full environment prefix.
        vectors = {}
        ms_vectors = {}
        for size in size_list:
            for cloud_env, metrics in problem_sizes[size].items():
                # Truncate the size - we just need <cloud>/<env>/<env_type>
                experiment = os.path.dirname(cloud_env)
                vectors.setdefault(experiment, []).append(metrics["gflops_per_second"])
                ms_vectors.setdefault(experiment, []).append(metrics["ms"])

        # Gflop/s, one boxplot group per environment, colored by environment
        _boxplot_by_environment(
            vectors,
            sns.color_palette("hls", 6).as_hex(),
            size_list,
            f"Magma Gflop/s Size {nodes}",
            os.path.join(img_outdir, f"magma-gflop-per-second-size-{nodes}.png"),
        )

        # Milliseconds, same layout with a hand-picked palette
        _boxplot_by_environment(
            ms_vectors,
            ["#003f5c", "#58508d", "#bc5090", "#de5a79", "#ff6361", "#ffa600"],
            size_list,
            f"Magma Milliseconds Size {nodes}",
            os.path.join(img_outdir, f"magma-milliseconds-size-{nodes}.png"),
        )
269+
270+
271+
# Script entry point
if __name__ == "__main__":
    main()

analysis/magma/README.md

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# Magma Analysis
2+
3+
For each run we are interested in parsing the vbatched result, which was run across environments with `--ngpus 1`. Although the runs were slightly different (on CycleCloud, etc., the `CUDA_VISIBLE_DEVICES` was set to all of them, and on different Kubernetes environments it was set by the workload manager, meaning in practice we see device 1 for the latter and across devices for the former) we should still be able to compare the performance of individual GPUs. Given a result that looks like this:
4+
5+
```console
6+
% transA = No transpose, transB = No transpose
7+
% max max max
8+
% BatchCount M N K MAGMA Gflop/s (ms) CPU Gflop/s (ms) MAGMA error
9+
%===================================================================================
10+
300 64 64 63 40.78 ( 0.46) --- ( --- ) ---
11+
300 32 32 31 0.00 ( 627.04) --- ( --- ) ---
12+
300 64 64 63 61.24 ( 0.31) --- ( --- ) ---
13+
300 32 32 31 0.00 ( 628.92) --- ( --- ) ---
14+
300 64 64 63 43.20 ( 0.44) --- ( --- ) ---
15+
300 32 32 31 0.00 ( 587.99) --- ( --- ) ---
16+
300 64 64 63 57.02 ( 0.33) --- ( --- ) ---
17+
```
18+
19+
We are going to plot the ms and Gflops/s separately. Each "chunk size" (the group of 3 including M, N, and K) will be assembled into a boxplot. While we shouldn't see variation across cluster sizes (the GPUs are not communicating) we will first plot them separately to see if there are any differences.
20+
21+
```bash
22+
pip install -r requirements.txt
23+
```
24+
25+
Then:
26+
27+
```bash
28+
python 1-run-analysis.py
29+
```
30+
31+
## Results
32+
33+
These are currently split up by size, because that gives somewhat more granularity. They don't need to be.
34+
35+
### Gflops/Second
36+
37+
This seems to be the metric of interest. Azure (for both AKS and CycleCloud) has higher values, and greater variability. But also, CycleCloud was run differently (across devices) and the others on just one device, and it's not clear what kind of impact that might have. Separation seems to start at the 224x problem size.
38+
39+
#### Size 4
40+
41+
![data/img/magma-gflop-per-second-size-4.png](data/img/magma-gflop-per-second-size-4.png)
42+
43+
#### Size 8
44+
45+
![data/img/magma-gflop-per-second-size-8.png](data/img/magma-gflop-per-second-size-8.png)
46+
47+
#### Size 16
48+
49+
![data/img/magma-gflop-per-second-size-16.png](data/img/magma-gflop-per-second-size-16.png)
50+
51+
#### Size 32
52+
53+
Note that we could not do any runs on EKS size 32, as we couldn't get the nodes.
54+
55+
![data/img/magma-gflop-per-second-size-32.png](data/img/magma-gflop-per-second-size-32.png)
56+
57+
### Milliseconds
58+
59+
I'm not actually sure what this is measuring - it seems to only spike up at the smallest "problem size" chunk, and only for a few environments. It's not clear if we removed these outliers if there would be meaningful differences down in the squashed data.
60+
61+
#### Size 4
62+
63+
![data/img/magma-milliseconds-size-4.png](data/img/magma-milliseconds-size-4.png)
64+
65+
#### Size 8
66+
67+
![data/img/magma-milliseconds-size-8.png](data/img/magma-milliseconds-size-8.png)
68+
69+
#### Size 16
70+
71+
![data/img/magma-milliseconds-size-16.png](data/img/magma-milliseconds-size-16.png)
72+
73+
#### Size 32
74+
75+
![data/img/magma-milliseconds-size-32.png](data/img/magma-milliseconds-size-32.png)
76+
59.7 KB
Loading
57.1 KB
Loading
57.9 KB
Loading
58.1 KB
Loading
47.8 KB
Loading
46.6 KB
Loading
49.5 KB
Loading
51.9 KB
Loading

0 commit comments

Comments
 (0)