Skip to content

Commit e865425

Browse files
Dando18 and jhdavis8 authored
Update outputs with new prompts (#15)
* update the list of prompts * add checkpointing * add throughput * update generation * add stack analysis script * update throughput scripts * Update gitignore * add sum of prefix sum gpu driver * Fixes to scan 28 gpu driver, add test outputs, subprocess import * add implementations for drivers that have been tested * update analysis scripts * updated set of currently working drivers * Fix indent in scan-28 gpu.cu * Change scan 28 gpu to use copy macros instead of memcpy symbol call * fix kernel call in gpu.cu scan 28, update test output for same * Add scan 27 * Update scan benchmark names to match current ids * Some fixes for scan 31 * Add scan 30 drivers * Add MPI support for cpu.cc in scan 30, 31, 32 * Update gpu.cu for scan 30-32 * Small updates to scan 30-32 kokkos * Complete scan 33 * Add scan 34, small changes to template generator * Update test outputs with right numbers for scan prompts, add script to generate simple test outputs for driver * Various updates to scan drivers fixing minor bugs * Make create driver template and run all executable scripts * update template, add reduce 25 drivers * Add reduce 27 drivers * Add reduce 26 drivers * Add reduce 28 * Add reduce 29 * Add stencil 50 drivers * Add stencil 51 driver * Add stencil 53 driver * update formatting and problem sizes for scan and reduce * Add stencil 52 drivers * Add stencil 54 drivers * bug fixes for scan * reduce bug fixes * fix bugs in stencil drivers and set problem sizes * some minor updates after prompt changes * newest updates to prompts * update prompts json * update generate scripts * add run scripts * add updated model outputs * update runs scripts * add openai outputs * update generation script * update generate * update how scripts are run * outputs * update generation * add more outputs * update gpt-4 outputs * update search benchmarks * update analysis scripts * update model outputs * update computed results * update model collection * update gpt-4 results * update gpt 3 and 4 
outputs * update metric defaults * update some driver utility scripts * update result data * update all.json files * update non-openai outputs * update hip results * update some scripts to handle hip better * update 59 * updated results * add helper script * run scripts changes --------- Co-authored-by: Josh Davis <jhdavis@umd.edu>
1 parent 05af9f8 commit e865425

176 files changed

Lines changed: 5985 additions & 474 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,7 @@ __pycache__
1010
*.out
1111
*.exe
1212
a.out
13-
*.o
13+
*.o
14+
15+
# editor buffers
16+
*~

analysis/create-dataframe.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def main():
6060
"top_p": prompt["top_p"],
6161
"do_sample": prompt["do_sample"],
6262
"max_new_tokens": prompt["max_new_tokens"],
63-
"prompted": prompt["prompted"],
63+
"prompted": prompt.get("prompted", False),
6464
"generated_output": output["generated_output"],
6565
"did_build": output["did_build"],
6666
"is_source_valid": output["is_source_valid"],
@@ -81,7 +81,7 @@ def main():
8181
"top_p": prompt["top_p"],
8282
"do_sample": prompt["do_sample"],
8383
"max_new_tokens": prompt["max_new_tokens"],
84-
"prompted": prompt["prompted"],
84+
"prompted": prompt.get("prompted", False),
8585
"generated_output": output["generated_output"],
8686
"did_build": output["did_build"],
8787
"is_source_valid": output["is_source_valid"],

analysis/metrics.py

Lines changed: 145 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
"""
33
# std imports
44
import argparse
5+
import json
56
from math import comb
67
from typing import Union
78

@@ -13,24 +14,29 @@
1314
def get_args():
1415
parser = argparse.ArgumentParser(description=__doc__)
1516
parser.add_argument("input_csv", type=str, help="Input CSV file containing the test cases.")
16-
parser.add_argument("-m", "--metric", default="pass", choices=["build", "pass", "speedup"], help="Metric to compute.")
17-
parser.add_argument("-k", "--k", type=int, default=1, help="K value for pass@k, build@k, and speedup@k.")
17+
parser.add_argument("-k", "--k", type=int, nargs='+', default=[1,5,10,20], help="K value for pass@k, build@k, and speedup@k.")
1818
parser.add_argument("-n", "--n", type=int, default=1, help="N value for speedup@k.")
1919
parser.add_argument("-o", "--output", type=str, help="Output csv file containing the results.")
20+
parser.add_argument("--problem-sizes", type=str, default='../drivers/problem-sizes.json', help="Json with problem sizes. Used for calculating GPU efficiency.")
21+
parser.add_argument("--model-name", type=str, help="Add model name column with this value")
2022
return parser.parse_args()
2123

2224
def get_correctness_df(df: pd.DataFrame) -> pd.DataFrame:
2325
""" Group by name, parallelism_model, and output_idx, and set is_valid to true only if all rows in the group have is_valid = true.
2426
Set it to false otherwise.
2527
"""
2628
# group all the runs for this LLM output
29+
df = df.copy()
2730
agg = df.groupby(["name", "parallelism_model", "output_idx"]).agg({"is_valid": ["count", "sum"]})
2831
agg.columns = ["count", "sum"]
2932

3033
# mark as valid only if all runs are valid
3134
agg["is_valid"] = agg["count"] == agg["sum"]
3235
agg = agg.reset_index()
3336
agg = agg.drop(columns=["count", "sum"])
37+
38+
# add problem_type column from df
39+
agg = agg.merge(df[["name", "problem_type"]].drop_duplicates(), on="name", how="left")
3440

3541
return agg
3642

@@ -39,18 +45,26 @@ def nCr(n: int, r: int) -> int:
3945
return 1
4046
return comb(n, r)
4147

48+
def buildk(df: pd.DataFrame, k: int) -> pd.DataFrame:
49+
""" Compute the build@k metric """
50+
agg = df.groupby(["name", "parallelism_model", "problem_type"]).agg({"did_build": ["count", "sum"]})
51+
agg.columns = ["total_build_attempts", "successful_builds"]
52+
agg = agg.reset_index()
53+
agg[f"build@{k}"] = agg.apply(lambda x: _passk(x["total_build_attempts"], x["successful_builds"], k), axis=1)
54+
return agg.groupby(["parallelism_model", "problem_type"]).agg({f"build@{k}": "mean"})
55+
4256
def _passk(num_samples: int, num_correct: int, k: int) -> float:
4357
if num_samples - num_correct < k:
4458
return 1.0
4559
return 1.0 - np.prod(1.0 - k / np.arange(num_samples - num_correct + 1, num_samples + 1))
4660

4761
def passk(df: pd.DataFrame, k: int) -> pd.DataFrame:
4862
""" Compute the pass@k metric """
49-
agg = df.groupby(["name", "parallelism_model"]).agg({"is_valid": ["count", "sum"]})
63+
agg = df.groupby(["name", "parallelism_model", "problem_type"]).agg({"is_valid": ["count", "sum"]})
5064
agg.columns = ["total_runs", "valid_count"]
5165
agg = agg.reset_index()
52-
agg["pass@k"] = agg.apply(lambda x: _passk(x["total_runs"], x["valid_count"], k), axis=1)
53-
return agg.groupby(["parallelism_model"]).agg({"pass@k": "mean"})
66+
agg[f"pass@{k}"] = agg.apply(lambda x: _passk(x["total_runs"], x["valid_count"], k), axis=1)
67+
return agg.groupby(["parallelism_model", "problem_type"]).agg({f"pass@{k}": "mean"})
5468

5569
def _speedupk(runtimes: Union[pd.Series, np.ndarray], baseline_runtime: float, k: int) -> float:
5670
""" Compute the speedup@k metric """
@@ -68,12 +82,14 @@ def _speedupk(runtimes: Union[pd.Series, np.ndarray], baseline_runtime: float, k
6882
num_samples = runtimes.shape[0]
6983
for j in range(1, num_samples+1):
7084
num = nCr(j-1, k-1) * baseline_runtime
71-
den = nCr(num_samples, k) * runtimes[j-1]
85+
den = nCr(num_samples, k) * max(runtimes[j-1], 1e-8)
7286
sum += num / den
7387
return pd.Series({f"speedup@{k}": sum})
7488

7589
def speedupk(df: pd.DataFrame, k: int, n: int) -> pd.DataFrame:
7690
""" Compute the speedup@k metric """
91+
df = df.copy()
92+
7793
# get all runs where is_valid is true
7894
df = df[df["is_valid"] == True]
7995

@@ -84,45 +100,155 @@ def speedupk(df: pd.DataFrame, k: int, n: int) -> pd.DataFrame:
84100
((df["parallelism_model"] == "kokkos") & (df["num_threads"] == 32)) |
85101
((df["parallelism_model"] == "omp") & (df["num_threads"] == 64)) |
86102
((df["parallelism_model"] == "mpi") & (df["num_procs"] == 512)) |
87-
((df["parallelism_model"] == "mpi+omp") & (df["num_procs"] == 4) & (df["num_threads"] == 8))]
103+
((df["parallelism_model"] == "mpi+omp") & (df["num_procs"] == 4) & (df["num_threads"] == 64))]
88104
df = df.copy()
89105

90106
# use min best_sequential_runtime
91107
df["best_sequential_runtime"] = df.groupby(["name", "parallelism_model", "output_idx"])["best_sequential_runtime"].transform("min")
92108

93109
# group by name, parallelism_model, and output_idx and call _speedupk
94-
df = df.groupby(["name", "parallelism_model", "output_idx"]).apply(
110+
df = df.groupby(["name", "parallelism_model", "problem_type"]).apply(
95111
lambda row: _speedupk(row["runtime"], np.min(row["best_sequential_runtime"]), k)
96112
).reset_index()
97113

98114
# compute the mean speedup@k
99-
df = df.groupby(["parallelism_model"]).agg({f"speedup@{k}": "mean"})
115+
df = df.groupby(["parallelism_model", "problem_type"]).agg({f"speedup@{k}": "mean"})
116+
117+
return df
118+
119+
def _efficiencyk(runtimes: Union[pd.Series, np.ndarray], baseline_runtime: float, k: int, n_resources: Union[pd.Series, np.ndarray]) -> float:
120+
""" Compute the efficiency@k metric """
121+
# create a copy of the runtimes
122+
if isinstance(runtimes, pd.Series):
123+
runtimes = runtimes.values.copy()
124+
else:
125+
runtimes = runtimes.copy()
126+
127+
if isinstance(n_resources, pd.Series):
128+
n_resources = n_resources.values.copy()
129+
else:
130+
n_resources = n_resources.copy()
131+
132+
# sort the runtimes
133+
runtimes.sort()
134+
135+
# compute expected value
136+
sum = 0.0
137+
num_samples = runtimes.shape[0]
138+
for j in range(1, num_samples+1):
139+
num = nCr(j-1, k-1) * baseline_runtime
140+
den = nCr(num_samples, k) * max(runtimes[j-1], 1e-8) * n_resources[j-1]
141+
sum += num / den
142+
return pd.Series({f"efficiency@{k}": sum})
143+
144+
def efficiencyk(df: pd.DataFrame, k: int, n: int) -> pd.DataFrame:
145+
""" Compute the efficiency@k metric """
146+
df = df.copy()
147+
148+
# get all runs where is_valid is true
149+
df = df[df["is_valid"] == True]
150+
151+
# choose processor count; hardcoded right now
152+
df = df[(df["parallelism_model"] == "serial") |
153+
(df["parallelism_model"] == "cuda") |
154+
(df["parallelism_model"] == "hip") |
155+
((df["parallelism_model"] == "kokkos") & (df["num_threads"] == 32)) |
156+
((df["parallelism_model"] == "omp") & (df["num_threads"] == 64)) |
157+
((df["parallelism_model"] == "mpi") & (df["num_procs"] == 512)) |
158+
((df["parallelism_model"] == "mpi+omp") & (df["num_procs"] == 4) & (df["num_threads"] == 64))]
159+
160+
# set n_resources column to 1 for serial; 32 for kokkos; 64 for omp; 512 for mpi; 4*64 for mpi+omp;
161+
# set it to problem_size for cuda and hip
162+
df["n_resources"] = 1
163+
df.loc[df["parallelism_model"] == "cuda", "n_resources"] = df["problem_size"]
164+
df.loc[df["parallelism_model"] == "hip", "n_resources"] = df["problem_size"]
165+
df.loc[df["parallelism_model"] == "kokkos", "n_resources"] = 32
166+
df.loc[df["parallelism_model"] == "omp", "n_resources"] = 64
167+
df.loc[df["parallelism_model"] == "mpi", "n_resources"] = 512
168+
df.loc[df["parallelism_model"] == "mpi+omp", "n_resources"] = 4*64
169+
170+
df = df.copy()
171+
172+
# use min best_sequential_runtime
173+
df["best_sequential_runtime"] = df.groupby(["name", "parallelism_model", "output_idx"])["best_sequential_runtime"].transform("min")
174+
175+
# group by name, parallelism_model, and output_idx and call _efficiencyk
176+
df = df.groupby(["name", "parallelism_model", "problem_type"]).apply(
177+
lambda row: _efficiencyk(row["runtime"], np.min(row["best_sequential_runtime"]), k, row["n_resources"])
178+
).reset_index()
179+
180+
# compute the mean efficiency@k
181+
df = df.groupby(["parallelism_model", "problem_type"]).agg({f"efficiency@{k}": "mean"})
100182

101183
return df
102184

185+
def parse_problem_size(problem_size: str) -> int:
186+
""" problem size is of format '(1<<n)' """
187+
num = problem_size.split("<<")[1][:-1]
188+
return 2 ** int(num)
189+
103190
def main():
104191
args = get_args()
105192

106193
# read in input
107194
df = pd.read_csv(args.input_csv)
108195

196+
# read in problem sizes
197+
with open(args.problem_sizes, "r") as f:
198+
problem_sizes = json.load(f)
199+
for problem in problem_sizes:
200+
for parallelism_model, problem_size in problem_sizes[problem].items():
201+
df.loc[(df["name"] == problem) & (df["parallelism_model"] == parallelism_model), "problem_size"] = parse_problem_size(problem_size)
202+
109203
# remove rows where parallelism_model is kokkos and num_threads is 64
110204
df = df[~((df["parallelism_model"] == "kokkos") & (df["num_threads"] == 64))]
111205

112206
# filter/aggregate
113207
df["did_run"] = df["did_run"].fillna(False) # if it didn't build, then this will be nan; overwrite
114208
df["is_valid"] = df["is_valid"].fillna(False) # if it didn't build, then this will be nan; overwrite
115209

116-
# compute metric
117-
if args.metric == "build":
118-
pass
119-
elif args.metric == "pass":
120-
df = get_correctness_df(df)
121-
result = passk(df, args.k)
122-
print(result)
123-
elif args.metric == "speedup":
124-
result = speedupk(df, args.k, args.n)
125-
print(result)
210+
# get only valid runs
211+
valid_runs = get_correctness_df(df)
212+
213+
# get values for each k
214+
all_results = []
215+
for k in args.k:
216+
build_values = buildk(df, k)
217+
pass_values = passk(valid_runs, k)
218+
speedup_values = speedupk(df, k, args.n)
219+
efficiency_values = efficiencyk(df, k, args.n)
220+
all_results.extend([build_values, pass_values, speedup_values, efficiency_values])
221+
222+
# merge all_results; each df has one column and the same index
223+
# build a new df with all the columns and the same index
224+
merged_df = pd.concat(all_results, axis=1).reset_index()
225+
226+
# if there were no successfull builds or runs, then speedup@k will be nan after merging
227+
# replace NaN speedup@k values with 0.0
228+
for k in args.k:
229+
merged_df[f"speedup@{k}"] = merged_df[f"speedup@{k}"].fillna(0.0)
230+
merged_df[f"efficiency@{k}"] = merged_df[f"efficiency@{k}"].fillna(0.0)
231+
232+
# add model name column
233+
if args.model_name:
234+
merged_df.insert(0, "model_name", args.model_name)
235+
236+
# clean up column names
237+
column_name_map = {
238+
"model_name": "model",
239+
"parallelism_model": "execution model",
240+
"problem_type": "problem type",
241+
}
242+
merged_df = merged_df.rename(columns=column_name_map)
243+
244+
# write to csv
245+
if args.output:
246+
merged_df.to_csv(args.output, index=False)
247+
else:
248+
pd.set_option('display.max_columns', merged_df.shape[1]+1)
249+
pd.set_option('display.max_rows', merged_df.shape[0]+1)
250+
print(merged_df)
251+
126252

127253

128254
if __name__ == "__main__":

drivers/cpp/Makefile

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,19 @@ MPICXX = mpicxx
33
CXX_FLAGS = -std=c++17 -O3
44
Kokkos_PATH ?= ../../tpl/kokkos/build
55

6+
HAS_NVCC := $(shell command -v nvcc 2> /dev/null)
7+
HAS_HIPCC := $(shell command -v hipcc 2> /dev/null)
8+
HAS_KOKKOS := $(shell test -d $(Kokkos_PATH)/lib64/ && echo "true")
9+
610
SERIAL_DRIVERS = $(patsubst %.cc,%.o, $(wildcard */*serial-driver.cc))
711
OMP_DRIVERS = $(patsubst %.cc,%.o, $(wildcard */*omp-driver.cc))
812
MPI_DRIVERS = $(patsubst %.cc,%.o, $(wildcard */*mpi-driver.cc))
913
MPI_OMP_DRIVERS = $(patsubst %.cc,%.o, $(wildcard */*mpi-omp-driver.cc))
1014
CUDA_DRIVERS = $(patsubst %.cu,%.o, $(wildcard */*cuda-driver.cu))
15+
HIP_DRIVERS = $(patsubst %.cu,%.o, $(wildcard */*hip-driver.cu))
1116
KOKKOS_DRIVERS = $(patsubst %.cc,%.o, $(wildcard */*kokkos-driver.cc))
1217

13-
ALL_DRIVERS = $(SERIAL_DRIVERS) $(OMP_DRIVERS) $(MPI_DRIVERS) $(MPI_OMP_DRIVERS) $(CUDA_DRIVERS) $(KOKKOS_DRIVERS)
18+
ALL_DRIVERS = $(SERIAL_DRIVERS) $(OMP_DRIVERS) $(MPI_DRIVERS) $(MPI_OMP_DRIVERS) $(CUDA_DRIVERS) $(HIP_DRIVERS) $(KOKKOS_DRIVERS)
1419

1520
all: $(ALL_DRIVERS)
1621

@@ -30,10 +35,26 @@ all: $(ALL_DRIVERS)
3035
$(MPICXX) $(CXX_FLAGS) -fopenmp -o $@ -c $<
3136

3237
%cuda-driver.o: %cuda-driver.cu
38+
ifdef HAS_NVCC
39+
@echo "nvcc found, compiling $(HAS_NVCC)"
3340
nvcc -std=c++17 -O3 -o $@ -c $<
41+
else
42+
@echo "nvcc not found, skipping $@"
43+
endif
44+
45+
%hip-driver.o: %hip-driver.cu
46+
ifdef HAS_HIPCC
47+
hipcc -std=c++17 -O3 -o $@ -c $< -Wno-unused-result
48+
else
49+
@echo "hipcc not found, skipping $@"
50+
endif
3451

3552
%kokkos-driver.o: %kokkos-driver.cc
53+
ifdef HAS_KOKKOS
3654
$(CXX) $(CXX_FLAGS) -I$(Kokkos_PATH)/include -L$(Kokkos_PATH)/lib64 -fopenmp -o $@ -c $<
55+
else
56+
@echo "Kokkos not found, skipping $@"
57+
endif
3758

3859
clean:
3960
rm -f $(ALL_DRIVERS)

drivers/cpp/benchmarks/fft/07_fft_fft_conjugate/cpu.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
// input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]
66
// output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]
77
// */
8-
// void fft(std::vector<std::complex<double>> &x) {
8+
// void fftConjugate(std::vector<std::complex<double>> &x) {
99

1010
#include <algorithm>
1111
#include <cmath>
@@ -45,7 +45,7 @@ Context *init() {
4545
}
4646

4747
void NO_OPTIMIZE compute(Context *ctx) {
48-
fft(ctx->x);
48+
fftConjugate(ctx->x);
4949
}
5050

5151
void NO_OPTIMIZE best(Context *ctx) {
@@ -79,7 +79,7 @@ bool validate(Context *ctx) {
7979

8080
// compute test result
8181
std::vector<std::complex<double>> test = x;
82-
fft(test);
82+
fftConjugate(test);
8383
SYNC();
8484

8585
bool isCorrect = true;

drivers/cpp/benchmarks/fft/07_fft_fft_conjugate/gpu.cu

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
// input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]
77
// output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]
88
// */
9-
// __global__ void fft(cuDoubleComplex *x, size_t N) {
9+
// __global__ void fftConjugate(cuDoubleComplex *x, size_t N) {
1010

1111
#include <algorithm>
1212
#include <numeric>
@@ -20,14 +20,6 @@
2020
#include "generated-code.cuh" // code generated by LLM
2121

2222

23-
#if defined(USE_CUDA)
24-
#include <thrust/device_vector.h>
25-
#include <thrust/copy.h>
26-
#include <thrust/sort.h>
27-
#include <thrust/iterator/counting_iterator.h>
28-
#include <thrust/iterator/permutation_iterator.h>
29-
#endif
30-
3123
struct Context {
3224
DOUBLE_COMPLEX_T *d_x;
3325
std::vector<DOUBLE_COMPLEX_T> tmp_x;
@@ -66,7 +58,7 @@ Context *init() {
6658
}
6759

6860
void NO_OPTIMIZE compute(Context *ctx) {
69-
fft<<<ctx->gridSize, ctx->blockSize>>>(ctx->d_x, ctx->N);
61+
fftConjugate<<<ctx->gridSize, ctx->blockSize>>>(ctx->d_x, ctx->N);
7062
}
7163

7264
void NO_OPTIMIZE best(Context *ctx) {
@@ -101,7 +93,7 @@ bool validate(Context *ctx) {
10193
correctFft(h_x);
10294

10395
// compute test result
104-
fft<<<gridSize, blockSize>>>(d_x, TEST_SIZE);
96+
fftConjugate<<<gridSize, blockSize>>>(d_x, TEST_SIZE);
10597
SYNC();
10698

10799
// copy back

0 commit comments

Comments
 (0)