Skip to content

Commit e865425

Browse files
Dando18 and jhdavis8 authored
Update outputs with new prompts (#15)
* update the list of prompts * add checkpointing * add throughput * update generation * add stack analysis script * update throughput scripts * Update gitignore * add sum of prefix sum gpu driver * Fixes to scan 28 gpu driver, add test outputs, subprocess import * add implementations for drivers that have been tested * update analysis scripts * updated set of currently working drivers * Fix indent in scan-28 gpu.cu * Change scan 28 gpu to use copy macros instead of memcpy symbol call * fix kernel call in gpu.cu scan 28, update test output for same * Add scan 27 * Update scan benchmark names to match current ids * Some fixes for scan 31 * Add scan 30 drivers * Add MPI support for cpu.cc in scan 30, 31, 32 * Update gpu.cu for scan 30-32 * Small updates to scan 30-32 kokkos * Complete scan 33 * Add scan 34, small changes to template generator * Update test outputs with right numbers for scan prompts, add script to generate simple test outputs for driver * Various updates to scan drivers fixing minor bugs * Make create driver template and run all executable scripts * update template, add reduce 25 drivers * Add reduce 27 drivers * Add reduce 26 drivers * Add reduce 28 * Add reduce 29 * Add stencil 50 drivers * Add stencil 51 driver * Add stencil 53 driver * update formatting and problem sizes for scan and reduce * Add stencil 52 drivers * Add stencil 54 drivers * bug fixes for scan * reduce bug fixes * fix bugs in stencil drivers and set problem sizes * some minor updates after prompt changes * newest updates to prompts * update prompts json * update generate scripts * add run scripts * add updated model outputs * update runs scripts * add openai outputs * update generation script * update generate * update how scripts are run * outputs * update generation * add more outputs * update gpt-4 outputs * update search benchmarks * update analysis scripts * update model outputs * update computed results * update model collection * update gpt-4 results * update gpt 3 and 4 
outputs * update metric defaults * update some driver utility scripts * update result data * update all.json files * update non-openai outputs * update hip results * update some scripts to handle hip better * update 59 * updated results * add helper script * run scripts changes --------- Co-authored-by: Josh Davis <jhdavis@umd.edu>
1 parent 05af9f8 commit e865425

176 files changed

Lines changed: 5985 additions & 474 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,7 @@ __pycache__
1010
*.out
1111
*.exe
1212
a.out
13-
*.o
13+
*.o
14+
15+
# editor buffers
16+
*~

analysis/create-dataframe.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def main():
6060
"top_p": prompt["top_p"],
6161
"do_sample": prompt["do_sample"],
6262
"max_new_tokens": prompt["max_new_tokens"],
63-
"prompted": prompt["prompted"],
63+
"prompted": prompt.get("prompted", False),
6464
"generated_output": output["generated_output"],
6565
"did_build": output["did_build"],
6666
"is_source_valid": output["is_source_valid"],
@@ -81,7 +81,7 @@ def main():
8181
"top_p": prompt["top_p"],
8282
"do_sample": prompt["do_sample"],
8383
"max_new_tokens": prompt["max_new_tokens"],
84-
"prompted": prompt["prompted"],
84+
"prompted": prompt.get("prompted", False),
8585
"generated_output": output["generated_output"],
8686
"did_build": output["did_build"],
8787
"is_source_valid": output["is_source_valid"],

analysis/metrics.py

Lines changed: 145 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
"""
33
# std imports
44
import argparse
5+
import json
56
from math import comb
67
from typing import Union
78

@@ -13,24 +14,29 @@
1314
def get_args():
1415
parser = argparse.ArgumentParser(description=__doc__)
1516
parser.add_argument("input_csv", type=str, help="Input CSV file containing the test cases.")
16-
parser.add_argument("-m", "--metric", default="pass", choices=["build", "pass", "speedup"], help="Metric to compute.")
17-
parser.add_argument("-k", "--k", type=int, default=1, help="K value for pass@k, build@k, and speedup@k.")
17+
parser.add_argument("-k", "--k", type=int, nargs='+', default=[1,5,10,20], help="K value for pass@k, build@k, and speedup@k.")
1818
parser.add_argument("-n", "--n", type=int, default=1, help="N value for speedup@k.")
1919
parser.add_argument("-o", "--output", type=str, help="Output csv file containing the results.")
20+
parser.add_argument("--problem-sizes", type=str, default='../drivers/problem-sizes.json', help="Json with problem sizes. Used for calculating GPU efficiency.")
21+
parser.add_argument("--model-name", type=str, help="Add model name column with this value")
2022
return parser.parse_args()
2123

2224
def get_correctness_df(df: pd.DataFrame) -> pd.DataFrame:
2325
""" Group by name, parallelism_model, and output_idx, and set is_valid to true only if all rows in the group have is_valid = true.
2426
Set it to false otherwise.
2527
"""
2628
# group all the runs for this LLM output
29+
df = df.copy()
2730
agg = df.groupby(["name", "parallelism_model", "output_idx"]).agg({"is_valid": ["count", "sum"]})
2831
agg.columns = ["count", "sum"]
2932

3033
# mark as valid only if all runs are valid
3134
agg["is_valid"] = agg["count"] == agg["sum"]
3235
agg = agg.reset_index()
3336
agg = agg.drop(columns=["count", "sum"])
37+
38+
# add problem_type column from df
39+
agg = agg.merge(df[["name", "problem_type"]].drop_duplicates(), on="name", how="left")
3440

3541
return agg
3642

@@ -39,18 +45,26 @@ def nCr(n: int, r: int) -> int:
3945
return 1
4046
return comb(n, r)
4147

48+
def buildk(df: pd.DataFrame, k: int) -> pd.DataFrame:
49+
""" Compute the build@k metric """
50+
agg = df.groupby(["name", "parallelism_model", "problem_type"]).agg({"did_build": ["count", "sum"]})
51+
agg.columns = ["total_build_attempts", "successful_builds"]
52+
agg = agg.reset_index()
53+
agg[f"build@{k}"] = agg.apply(lambda x: _passk(x["total_build_attempts"], x["successful_builds"], k), axis=1)
54+
return agg.groupby(["parallelism_model", "problem_type"]).agg({f"build@{k}": "mean"})
55+
4256
def _passk(num_samples: int, num_correct: int, k: int) -> float:
4357
if num_samples - num_correct < k:
4458
return 1.0
4559
return 1.0 - np.prod(1.0 - k / np.arange(num_samples - num_correct + 1, num_samples + 1))
4660

4761
def passk(df: pd.DataFrame, k: int) -> pd.DataFrame:
4862
""" Compute the pass@k metric """
49-
agg = df.groupby(["name", "parallelism_model"]).agg({"is_valid": ["count", "sum"]})
63+
agg = df.groupby(["name", "parallelism_model", "problem_type"]).agg({"is_valid": ["count", "sum"]})
5064
agg.columns = ["total_runs", "valid_count"]
5165
agg = agg.reset_index()
52-
agg["pass@k"] = agg.apply(lambda x: _passk(x["total_runs"], x["valid_count"], k), axis=1)
53-
return agg.groupby(["parallelism_model"]).agg({"pass@k": "mean"})
66+
agg[f"pass@{k}"] = agg.apply(lambda x: _passk(x["total_runs"], x["valid_count"], k), axis=1)
67+
return agg.groupby(["parallelism_model", "problem_type"]).agg({f"pass@{k}": "mean"})
5468

5569
def _speedupk(runtimes: Union[pd.Series, np.ndarray], baseline_runtime: float, k: int) -> float:
5670
""" Compute the speedup@k metric """
@@ -68,12 +82,14 @@ def _speedupk(runtimes: Union[pd.Series, np.ndarray], baseline_runtime: float, k
6882
num_samples = runtimes.shape[0]
6983
for j in range(1, num_samples+1):
7084
num = nCr(j-1, k-1) * baseline_runtime
71-
den = nCr(num_samples, k) * runtimes[j-1]
85+
den = nCr(num_samples, k) * max(runtimes[j-1], 1e-8)
7286
sum += num / den
7387
return pd.Series({f"speedup@{k}": sum})
7488

7589
def speedupk(df: pd.DataFrame, k: int, n: int) -> pd.DataFrame:
7690
""" Compute the speedup@k metric """
91+
df = df.copy()
92+
7793
# get all runs where is_valid is true
7894
df = df[df["is_valid"] == True]
7995

@@ -84,45 +100,155 @@ def speedupk(df: pd.DataFrame, k: int, n: int) -> pd.DataFrame:
84100
((df["parallelism_model"] == "kokkos") & (df["num_threads"] == 32)) |
85101
((df["parallelism_model"] == "omp") & (df["num_threads"] == 64)) |
86102
((df["parallelism_model"] == "mpi") & (df["num_procs"] == 512)) |
87-
((df["parallelism_model"] == "mpi+omp") & (df["num_procs"] == 4) & (df["num_threads"] == 8))]
103+
((df["parallelism_model"] == "mpi+omp") & (df["num_procs"] == 4) & (df["num_threads"] == 64))]
88104
df = df.copy()
89105

90106
# use min best_sequential_runtime
91107
df["best_sequential_runtime"] = df.groupby(["name", "parallelism_model", "output_idx"])["best_sequential_runtime"].transform("min")
92108

93109
# group by name, parallelism_model, and output_idx and call _speedupk
94-
df = df.groupby(["name", "parallelism_model", "output_idx"]).apply(
110+
df = df.groupby(["name", "parallelism_model", "problem_type"]).apply(
95111
lambda row: _speedupk(row["runtime"], np.min(row["best_sequential_runtime"]), k)
96112
).reset_index()
97113

98114
# compute the mean speedup@k
99-
df = df.groupby(["parallelism_model"]).agg({f"speedup@{k}": "mean"})
115+
df = df.groupby(["parallelism_model", "problem_type"]).agg({f"speedup@{k}": "mean"})
116+
117+
return df
118+
119+
def _efficiencyk(runtimes: Union[pd.Series, np.ndarray], baseline_runtime: float, k: int, n_resources: Union[pd.Series, np.ndarray]) -> float:
120+
""" Compute the efficiency@k metric """
121+
# create a copy of the runtimes
122+
if isinstance(runtimes, pd.Series):
123+
runtimes = runtimes.values.copy()
124+
else:
125+
runtimes = runtimes.copy()
126+
127+
if isinstance(n_resources, pd.Series):
128+
n_resources = n_resources.values.copy()
129+
else:
130+
n_resources = n_resources.copy()
131+
132+
# sort the runtimes
133+
runtimes.sort()
134+
135+
# compute expected value
136+
sum = 0.0
137+
num_samples = runtimes.shape[0]
138+
for j in range(1, num_samples+1):
139+
num = nCr(j-1, k-1) * baseline_runtime
140+
den = nCr(num_samples, k) * max(runtimes[j-1], 1e-8) * n_resources[j-1]
141+
sum += num / den
142+
return pd.Series({f"efficiency@{k}": sum})
143+
144+
def efficiencyk(df: pd.DataFrame, k: int, n: int) -> pd.DataFrame:
145+
""" Compute the efficiency@k metric """
146+
df = df.copy()
147+
148+
# get all runs where is_valid is true
149+
df = df[df["is_valid"] == True]
150+
151+
# choose processor count; hardcoded right now
152+
df = df[(df["parallelism_model"] == "serial") |
153+
(df["parallelism_model"] == "cuda") |
154+
(df["parallelism_model"] == "hip") |
155+
((df["parallelism_model"] == "kokkos") & (df["num_threads"] == 32)) |
156+
((df["parallelism_model"] == "omp") & (df["num_threads"] == 64)) |
157+
((df["parallelism_model"] == "mpi") & (df["num_procs"] == 512)) |
158+
((df["parallelism_model"] == "mpi+omp") & (df["num_procs"] == 4) & (df["num_threads"] == 64))]
159+
160+
# set n_resources column to 1 for serial; 32 for kokkos; 64 for omp; 512 for mpi; 4*64 for mpi+omp;
161+
# set it to problem_size for cuda and hip
162+
df["n_resources"] = 1
163+
df.loc[df["parallelism_model"] == "cuda", "n_resources"] = df["problem_size"]
164+
df.loc[df["parallelism_model"] == "hip", "n_resources"] = df["problem_size"]
165+
df.loc[df["parallelism_model"] == "kokkos", "n_resources"] = 32
166+
df.loc[df["parallelism_model"] == "omp", "n_resources"] = 64
167+
df.loc[df["parallelism_model"] == "mpi", "n_resources"] = 512
168+
df.loc[df["parallelism_model"] == "mpi+omp", "n_resources"] = 4*64
169+
170+
df = df.copy()
171+
172+
# use min best_sequential_runtime
173+
df["best_sequential_runtime"] = df.groupby(["name", "parallelism_model", "output_idx"])["best_sequential_runtime"].transform("min")
174+
175+
# group by name, parallelism_model, and output_idx and call _efficiencyk
176+
df = df.groupby(["name", "parallelism_model", "problem_type"]).apply(
177+
lambda row: _efficiencyk(row["runtime"], np.min(row["best_sequential_runtime"]), k, row["n_resources"])
178+
).reset_index()
179+
180+
# compute the mean efficiency@k
181+
df = df.groupby(["parallelism_model", "problem_type"]).agg({f"efficiency@{k}": "mean"})
100182

101183
return df
102184

185+
def parse_problem_size(problem_size: str) -> int:
186+
""" problem size is of format '(1<<n)' """
187+
num = problem_size.split("<<")[1][:-1]
188+
return 2 ** int(num)
189+
103190
def main():
104191
args = get_args()
105192

106193
# read in input
107194
df = pd.read_csv(args.input_csv)
108195

196+
# read in problem sizes
197+
with open(args.problem_sizes, "r") as f:
198+
problem_sizes = json.load(f)
199+
for problem in problem_sizes:
200+
for parallelism_model, problem_size in problem_sizes[problem].items():
201+
df.loc[(df["name"] == problem) & (df["parallelism_model"] == parallelism_model), "problem_size"] = parse_problem_size(problem_size)
202+
109203
# remove rows where parallelism_model is kokkos and num_threads is 64
110204
df = df[~((df["parallelism_model"] == "kokkos") & (df["num_threads"] == 64))]
111205

112206
# filter/aggregate
113207
df["did_run"] = df["did_run"].fillna(False) # if it didn't build, then this will be nan; overwrite
114208
df["is_valid"] = df["is_valid"].fillna(False) # if it didn't build, then this will be nan; overwrite
115209

116-
# compute metric
117-
if args.metric == "build":
118-
pass
119-
elif args.metric == "pass":
120-
df = get_correctness_df(df)
121-
result = passk(df, args.k)
122-
print(result)
123-
elif args.metric == "speedup":
124-
result = speedupk(df, args.k, args.n)
125-
print(result)
210+
# get only valid runs
211+
valid_runs = get_correctness_df(df)
212+
213+
# get values for each k
214+
all_results = []
215+
for k in args.k:
216+
build_values = buildk(df, k)
217+
pass_values = passk(valid_runs, k)
218+
speedup_values = speedupk(df, k, args.n)
219+
efficiency_values = efficiencyk(df, k, args.n)
220+
all_results.extend([build_values, pass_values, speedup_values, efficiency_values])
221+
222+
# merge all_results; each df has one column and the same index
223+
# build a new df with all the columns and the same index
224+
merged_df = pd.concat(all_results, axis=1).reset_index()
225+
226+
# if there were no successfull builds or runs, then speedup@k will be nan after merging
227+
# replace NaN speedup@k values with 0.0
228+
for k in args.k:
229+
merged_df[f"speedup@{k}"] = merged_df[f"speedup@{k}"].fillna(0.0)
230+
merged_df[f"efficiency@{k}"] = merged_df[f"efficiency@{k}"].fillna(0.0)
231+
232+
# add model name column
233+
if args.model_name:
234+
merged_df.insert(0, "model_name", args.model_name)
235+
236+
# clean up column names
237+
column_name_map = {
238+
"model_name": "model",
239+
"parallelism_model": "execution model",
240+
"problem_type": "problem type",
241+
}
242+
merged_df = merged_df.rename(columns=column_name_map)
243+
244+
# write to csv
245+
if args.output:
246+
merged_df.to_csv(args.output, index=False)
247+
else:
248+
pd.set_option('display.max_columns', merged_df.shape[1]+1)
249+
pd.set_option('display.max_rows', merged_df.shape[0]+1)
250+
print(merged_df)
251+
126252

127253

128254
if __name__ == "__main__":

drivers/cpp/Makefile

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,19 @@ MPICXX = mpicxx
33
CXX_FLAGS = -std=c++17 -O3
44
Kokkos_PATH ?= ../../tpl/kokkos/build
55

6+
HAS_NVCC := $(shell command -v nvcc 2> /dev/null)
7+
HAS_HIPCC := $(shell command -v hipcc 2> /dev/null)
8+
HAS_KOKKOS := $(shell test -d $(Kokkos_PATH)/lib64/ && echo "true")
9+
610
SERIAL_DRIVERS = $(patsubst %.cc,%.o, $(wildcard */*serial-driver.cc))
711
OMP_DRIVERS = $(patsubst %.cc,%.o, $(wildcard */*omp-driver.cc))
812
MPI_DRIVERS = $(patsubst %.cc,%.o, $(wildcard */*mpi-driver.cc))
913
MPI_OMP_DRIVERS = $(patsubst %.cc,%.o, $(wildcard */*mpi-omp-driver.cc))
1014
CUDA_DRIVERS = $(patsubst %.cu,%.o, $(wildcard */*cuda-driver.cu))
15+
HIP_DRIVERS = $(patsubst %.cu,%.o, $(wildcard */*hip-driver.cu))
1116
KOKKOS_DRIVERS = $(patsubst %.cc,%.o, $(wildcard */*kokkos-driver.cc))
1217

13-
ALL_DRIVERS = $(SERIAL_DRIVERS) $(OMP_DRIVERS) $(MPI_DRIVERS) $(MPI_OMP_DRIVERS) $(CUDA_DRIVERS) $(KOKKOS_DRIVERS)
18+
ALL_DRIVERS = $(SERIAL_DRIVERS) $(OMP_DRIVERS) $(MPI_DRIVERS) $(MPI_OMP_DRIVERS) $(CUDA_DRIVERS) $(HIP_DRIVERS) $(KOKKOS_DRIVERS)
1419

1520
all: $(ALL_DRIVERS)
1621

@@ -30,10 +35,26 @@ all: $(ALL_DRIVERS)
3035
$(MPICXX) $(CXX_FLAGS) -fopenmp -o $@ -c $<
3136

3237
%cuda-driver.o: %cuda-driver.cu
38+
ifdef HAS_NVCC
39+
@echo "nvcc found, compiling $(HAS_NVCC)"
3340
nvcc -std=c++17 -O3 -o $@ -c $<
41+
else
42+
@echo "nvcc not found, skipping $@"
43+
endif
44+
45+
%hip-driver.o: %hip-driver.cu
46+
ifdef HAS_HIPCC
47+
hipcc -std=c++17 -O3 -o $@ -c $< -Wno-unused-result
48+
else
49+
@echo "hipcc not found, skipping $@"
50+
endif
3451

3552
%kokkos-driver.o: %kokkos-driver.cc
53+
ifdef HAS_KOKKOS
3654
$(CXX) $(CXX_FLAGS) -I$(Kokkos_PATH)/include -L$(Kokkos_PATH)/lib64 -fopenmp -o $@ -c $<
55+
else
56+
@echo "Kokkos not found, skipping $@"
57+
endif
3758

3859
clean:
3960
rm -f $(ALL_DRIVERS)

drivers/cpp/benchmarks/fft/07_fft_fft_conjugate/cpu.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
// input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]
66
// output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]
77
// */
8-
// void fft(std::vector<std::complex<double>> &x) {
8+
// void fftConjugate(std::vector<std::complex<double>> &x) {
99

1010
#include <algorithm>
1111
#include <cmath>
@@ -45,7 +45,7 @@ Context *init() {
4545
}
4646

4747
void NO_OPTIMIZE compute(Context *ctx) {
48-
fft(ctx->x);
48+
fftConjugate(ctx->x);
4949
}
5050

5151
void NO_OPTIMIZE best(Context *ctx) {
@@ -79,7 +79,7 @@ bool validate(Context *ctx) {
7979

8080
// compute test result
8181
std::vector<std::complex<double>> test = x;
82-
fft(test);
82+
fftConjugate(test);
8383
SYNC();
8484

8585
bool isCorrect = true;

drivers/cpp/benchmarks/fft/07_fft_fft_conjugate/gpu.cu

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
// input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]
77
// output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]
88
// */
9-
// __global__ void fft(cuDoubleComplex *x, size_t N) {
9+
// __global__ void fftConjugate(cuDoubleComplex *x, size_t N) {
1010

1111
#include <algorithm>
1212
#include <numeric>
@@ -20,14 +20,6 @@
2020
#include "generated-code.cuh" // code generated by LLM
2121

2222

23-
#if defined(USE_CUDA)
24-
#include <thrust/device_vector.h>
25-
#include <thrust/copy.h>
26-
#include <thrust/sort.h>
27-
#include <thrust/iterator/counting_iterator.h>
28-
#include <thrust/iterator/permutation_iterator.h>
29-
#endif
30-
3123
struct Context {
3224
DOUBLE_COMPLEX_T *d_x;
3325
std::vector<DOUBLE_COMPLEX_T> tmp_x;
@@ -66,7 +58,7 @@ Context *init() {
6658
}
6759

6860
void NO_OPTIMIZE compute(Context *ctx) {
69-
fft<<<ctx->gridSize, ctx->blockSize>>>(ctx->d_x, ctx->N);
61+
fftConjugate<<<ctx->gridSize, ctx->blockSize>>>(ctx->d_x, ctx->N);
7062
}
7163

7264
void NO_OPTIMIZE best(Context *ctx) {
@@ -101,7 +93,7 @@ bool validate(Context *ctx) {
10193
correctFft(h_x);
10294

10395
// compute test result
104-
fft<<<gridSize, blockSize>>>(d_x, TEST_SIZE);
96+
fftConjugate<<<gridSize, blockSize>>>(d_x, TEST_SIZE);
10597
SYNC();
10698

10799
// copy back

0 commit comments

Comments
 (0)