Skip to content

Commit cce5c0e

Browse files
committed
Modified graph cast and benchmark code to log performance benchmarks
1 parent 82b48e9 commit cce5c0e

11 files changed

Lines changed: 181 additions & 92 deletions

File tree

experiments/Benchmarks/TestNCCL.py

Lines changed: 91 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ def run_scatter_benchmark(
169169

170170
def main():
171171
parser = argparse.ArgumentParser()
172-
parser.add_argument("--message_size", type=int, default=128)
172+
parser.add_argument("--message_size", type=int, default=2)
173173
parser.add_argument("--benchmark_cache", action="store_true")
174174
parser.add_argument("--num_iters", type=int, default=1000)
175175
parser.add_argument("--log_dir", type=str, default="logs")
@@ -196,92 +196,114 @@ def main():
196196
benchmark.print(f"Running NCCL Benchmark on {world_size} ranks")
197197

198198
# Built-in small-message benchmarks; in the future we can add more
199-
gather_graph_data = get_nccl_gather_benchmark_data(message_size, world_size, device)
200199

201-
benchmark.print("*" * 50)
202-
benchmark.print("Running Gather Benchmark")
203-
times = run_gather_benchmark(benchmark, num_iters, gather_graph_data, cache=None)
204-
205-
benchmark.print("Saving Gather Benchmark Times")
206-
207-
for i in range(world_size):
208-
benchmark.save_np(times, f"{log_dir}/NCCL_gather_times_{i}.npy", rank_to_save=i)
200+
for i in range(1, 20):
201+
message_size *= 2
202+
benchmark.print("*" * 50)
203+
benchmark.print(f"Running NCCL Benchmark for message size {message_size}")
204+
gather_graph_data = get_nccl_gather_benchmark_data(
205+
message_size, world_size, device
206+
)
207+
dist.barrier()
209208

210-
benchmark.print("Gather Benchmark Complete")
211-
benchmark.print("*" * 50)
209+
benchmark.print("Running Gather Benchmark")
210+
times = run_gather_benchmark(
211+
benchmark, num_iters, gather_graph_data, cache=None
212+
)
212213

213-
if benchmark_cache:
214-
edge_placement = gather_graph_data.edge_rank_placement
215-
edge_src_rank = gather_graph_data.edge_src_rank
216-
indices = gather_graph_data.edge_indices
214+
benchmark.print("Saving Gather Benchmark Times")
217215

218-
gather_cache = NCCLGatherCacheGenerator(
219-
indices,
220-
edge_placement.view(-1),
221-
edge_src_rank.view(-1),
222-
1,
223-
rank,
224-
world_size,
216+
benchmark.save_np(
217+
times,
218+
f"{log_dir}/NCCL_gather_times_message_size_{message_size}"
219+
+ f"_world_size_{world_size}.npy",
220+
rank_to_save=0,
225221
)
222+
223+
benchmark.print("Gather Benchmark Complete")
226224
benchmark.print("*" * 50)
227-
benchmark.print("Running Gather Benchmark with Cache")
228-
times = run_gather_benchmark(
229-
benchmark, num_iters, gather_graph_data, cache=gather_cache
230-
)
231225

232-
benchmark.print("Saving Gather Benchmark with Cache Times")
233-
for i in range(world_size):
226+
if benchmark_cache:
227+
edge_placement = gather_graph_data.edge_rank_placement
228+
edge_src_rank = gather_graph_data.edge_src_rank
229+
indices = gather_graph_data.edge_indices
230+
231+
gather_cache = NCCLGatherCacheGenerator(
232+
indices,
233+
edge_placement.view(-1),
234+
edge_src_rank.view(-1),
235+
1,
236+
rank,
237+
world_size,
238+
)
239+
benchmark.print("*" * 50)
240+
benchmark.print("Running Gather Benchmark with Cache")
241+
times = run_gather_benchmark(
242+
benchmark, num_iters, gather_graph_data, cache=gather_cache
243+
)
244+
245+
benchmark.print("Saving Gather Benchmark with Cache Times")
234246
benchmark.save_np(
235-
times, f"{log_dir}/NCCL_gather_with_cache_times_{i}.npy", rank_to_save=i
247+
times,
248+
f"{log_dir}/NCCL_gather_with_cache_message_size_{message_size}"
249+
+ f"_world_size_{world_size}.npy",
250+
rank_to_save=0,
236251
)
237252

238-
benchmark.print("Gather Benchmark with Cache Complete")
239-
benchmark.print("*" * 50)
253+
benchmark.print("Gather Benchmark with Cache Complete")
254+
benchmark.print("*" * 50)
240255

241-
scatter_graph_data = get_nccl_scatter_benchmark_data(
242-
message_size, world_size, device
243-
)
244-
benchmark.print("*" * 50)
245-
benchmark.print("Running Scatter Benchmark")
246-
times = run_scatter_benchmark(benchmark, num_iters, scatter_graph_data, cache=None)
256+
scatter_graph_data = get_nccl_scatter_benchmark_data(
257+
message_size, world_size, device
258+
)
247259

248-
benchmark.print("Saving Scatter Benchmark Times")
249-
for i in range(world_size):
250-
benchmark.save_np(
251-
times, f"{log_dir}/NCCL_scatter_times_{i}.npy", rank_to_save=i
252-
)
260+
benchmark.print("*" * 50)
261+
benchmark.print("Running Scatter Benchmark")
262+
times = run_scatter_benchmark(
263+
benchmark, num_iters, scatter_graph_data, cache=None
264+
)
253265

254-
benchmark.print("Scatter Benchmark Complete")
255-
benchmark.print("*" * 50)
256-
if benchmark_cache:
257-
edge_placement = scatter_graph_data.edge_rank_placement
258-
edge_dest_rank = scatter_graph_data.edge_dest_rank
259-
indices = scatter_graph_data.edge_indices
260-
261-
scatter_cache = NCCLScatterCacheGenerator(
262-
indices,
263-
edge_placement.view(-1),
264-
edge_dest_rank.view(-1),
265-
1,
266-
rank,
267-
world_size,
268-
)
269-
benchmark.print("*" * 50)
270-
benchmark.print("Running Scatter Benchmark with Cache")
271-
times = run_scatter_benchmark(
272-
benchmark, num_iters, scatter_graph_data, cache=scatter_cache
273-
)
266+
benchmark.print("Saving Scatter Benchmark Times")
274267

275-
benchmark.print("Saving Scatter Benchmark with Cache Times")
276-
for i in range(world_size):
277268
benchmark.save_np(
278269
times,
279-
f"{log_dir}/NCCL_scatter_with_cache_times_{i}.npy",
280-
rank_to_save=i,
270+
f"{log_dir}/NCCL_scatter_times_message_size_{message_size}"
271+
+ f"_world_size_{world_size}.npy",
272+
rank_to_save=0,
281273
)
282274

283-
benchmark.print("Scatter Benchmark with Cache Complete")
284-
benchmark.print("*" * 50)
275+
benchmark.print("Scatter Benchmark Complete")
276+
benchmark.print("*" * 50)
277+
if benchmark_cache:
278+
edge_placement = scatter_graph_data.edge_rank_placement
279+
edge_dest_rank = scatter_graph_data.edge_dest_rank
280+
indices = scatter_graph_data.edge_indices
281+
282+
scatter_cache = NCCLScatterCacheGenerator(
283+
indices,
284+
edge_placement.view(-1),
285+
edge_dest_rank.view(-1),
286+
1,
287+
rank,
288+
world_size,
289+
)
290+
benchmark.print("*" * 50)
291+
benchmark.print("Running Scatter Benchmark with Cache")
292+
times = run_scatter_benchmark(
293+
benchmark, num_iters, scatter_graph_data, cache=scatter_cache
294+
)
295+
296+
benchmark.print("Saving Scatter Benchmark with Cache Times")
297+
298+
benchmark.save_np(
299+
times,
300+
f"{log_dir}/NCCL_scatter_with_cache_message_size_{message_size}"
301+
+ f"_world_size_{world_size}.npy",
302+
rank_to_save=0,
303+
)
304+
305+
benchmark.print("Scatter Benchmark with Cache Complete")
306+
benchmark.print("*" * 50)
285307

286308
dist.destroy_process_group()
287309

experiments/Benchmarks/generate_plots.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,5 +76,5 @@ def generate_cache_comparison_plot():
7676

7777
if __name__ == "__main__":
7878
generate_plots("nccl")
79-
generate_plots("nvshmem")
79+
# generate_plots("nvshmem")
8080
generate_cache_comparison_plot()

experiments/GraphCast/README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,8 @@ Run with benchmarking with the following command:
3838
python main.py --benchmark
3939
```
4040
**Note:** The graph requires a large amount of memory, so it is better to run on the CPU and on a machine with a large amount of memory.
41+
42+
Run with multiple processes per GPU with the following command:
43+
```bash
44+
torchrun-hpc --xargs=--mpibind=off --xargs=--gpu-bind=none train_graphcast.py --is_distributed True --procs_per_gpu 4
45+
```

experiments/GraphCast/data_utils/graphcast_graph.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,8 @@ def get_grid2mesh_graph(self, mesh_graph_dict: dict):
219219
contigous_edge_mapping, renumbered_edges = torch.sort(meshtogrid_edge_placement)
220220

221221
src_grid_indices = src_grid_indices[renumbered_edges]
222-
grid_vertex_rank_placement = torch.zeros_like(lat_lon_grid_flat)
222+
grid_vertex_rank_placement = torch.zeros_like(lat_lon_grid_flat[:, 0])
223+
223224
for i, rank in enumerate(meshtogrid_edge_placement):
224225
loc = src_grid_indices[i]
225226
grid_vertex_rank_placement[loc] = rank
@@ -254,6 +255,7 @@ def get_mesh2grid_graph(
254255
)
255256

256257
edge_features, src_mesh_indices, dst_grid_indices = m2g_graph
258+
breakpoint()
257259
src_mesh_indices = renumbered_vertices[src_mesh_indices]
258260
dst_grid_indices = renumbered_grid[dst_grid_indices]
259261

experiments/GraphCast/dataset.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ def __init__(
8787
self.lat_lon_grid = torch.stack(
8888
torch.meshgrid(self.latitudes, self.longitudes, indexing="ij"), dim=-1
8989
)
90+
9091
self.graph_cast_graph = DistributedGraphCastGraphGenerator(
9192
self.lat_lon_grid,
9293
mesh_level=self.mesh_level,

experiments/GraphCast/layers.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
from DGraph.Communicator import Communicator
2121
from dist_utils import SingleProcessDummyCommunicator
2222

23+
# class MLPSiLuWithRecompute(nn.Module):
24+
2325

2426
class MeshGraphMLP(nn.Module):
2527
"""MLP for graph processing"""

experiments/GraphCast/model.py

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -330,7 +330,10 @@ def __init__(self, cfg: Config, comm, *args, **kwargs):
330330
)
331331

332332
def forward(
333-
self, input_grid_features: Tensor, static_graph: DistributedGraphCastGraph
333+
self,
334+
input_grid_features: Tensor,
335+
static_graph: DistributedGraphCastGraph,
336+
device: Optional[torch.device] = None,
334337
) -> Tensor:
335338
"""
336339
Args:
@@ -340,18 +343,19 @@ def forward(
340343
Returns:
341344
(Tensor): The predicted output grid
342345
"""
343-
344-
input_grid_features = input_grid_features.squeeze(0)
345-
input_mesh_features = static_graph.mesh_graph_node_features
346-
mesh2mesh_edge_features = static_graph.mesh_graph_edge_features
347-
grid2mesh_edge_features = static_graph.grid2mesh_graph_edge_features
348-
mesh2grid_edge_features = static_graph.mesh2grid_graph_edge_features
349-
mesh2mesh_edge_indices_src = static_graph.mesh_graph_src_indices
350-
mesh2mesh_edge_indices_dst = static_graph.mesh_graph_dst_indices
351-
mesh2grid_edge_indices_src = static_graph.mesh2grid_graph_src_indices
352-
mesh2grid_edge_indices_dst = static_graph.mesh2grid_graph_dst_indices
353-
grid2mesh_edge_indices_src = static_graph.grid2mesh_graph_src_indices
354-
grid2mesh_edge_indices_dst = static_graph.grid2mesh_graph_dst_indices
346+
if device is None:
347+
device = input_grid_features.device
348+
input_grid_features = input_grid_features.squeeze(0).to(device)
349+
input_mesh_features = static_graph.mesh_graph_node_features.to(device)
350+
mesh2mesh_edge_features = static_graph.mesh_graph_edge_features.to(device)
351+
grid2mesh_edge_features = static_graph.grid2mesh_graph_edge_features.to(device)
352+
mesh2grid_edge_features = static_graph.mesh2grid_graph_edge_features.to(device)
353+
mesh2mesh_edge_indices_src = static_graph.mesh_graph_src_indices.to(device)
354+
mesh2mesh_edge_indices_dst = static_graph.mesh_graph_dst_indices.to(device)
355+
mesh2grid_edge_indices_src = static_graph.mesh2grid_graph_src_indices.to(device)
356+
mesh2grid_edge_indices_dst = static_graph.mesh2grid_graph_dst_indices.to(device)
357+
grid2mesh_edge_indices_src = static_graph.grid2mesh_graph_src_indices.to(device)
358+
grid2mesh_edge_indices_dst = static_graph.grid2mesh_graph_dst_indices.to(device)
355359

356360
out = self.embedder(
357361
input_grid_features,

experiments/GraphCast/train_graphcast.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,10 @@ def main(
6060
comm = Communicator.init_process_group(
6161
_communicator, ranks_per_graph=procs_per_graph
6262
)
63+
mesh_graph_placement = torch.load("mesh_vertex_rank_placement_4.pt")
6364
else:
6465
comm = SingleProcessDummyCommunicator()
66+
mesh_graph_placement = torch.zeros(40962, dtype=torch.int64)
6567
if not use_synthetic_data:
6668
raise NotImplementedError("Real data is not yet supported yet.")
6769

@@ -106,6 +108,7 @@ def main(
106108
dataset = SyntheticWeatherDataset(
107109
channels=[x for x in range(cfg.data.num_channels_climate)],
108110
num_samples_per_year=cfg.data.num_samples_per_year_train,
111+
mesh_vertex_placement=mesh_graph_placement,
109112
num_steps=cfg.data.num_history,
110113
device=torch.device("cpu"),
111114
)
@@ -127,12 +130,12 @@ def main(
127130
break_training = False
128131

129132
for data in dataloader:
130-
in_data = data["invar"]
131-
ground_truth = data["outvar"]
133+
in_data = data["invar"].to(device)
134+
ground_truth = data["outvar"].to(device)
132135

133136
model.train()
134137
optimizer.zero_grad()
135-
predicted_grid = model(in_data, static_graph)
138+
predicted_grid = model(in_data, static_graph, device=device)
136139
loss = compute_loss(ground_truth, predicted_grid, comm)
137140
loss.backward()
138141
optimizer.step()

experiments/OGB/main.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,7 @@ def _run_experiment(
163163
gather_cache_file = f"{cache_prefix}_gather_cache_{world_size}_{rank}.pt"
164164

165165
if os.path.exists(gather_cache_file):
166+
print(f"Rank: {rank} Loading gather cache from {gather_cache_file}")
166167
gather_cache = torch.load(gather_cache_file, weights_only=False)
167168

168169
if os.path.exists(scatter_cache_file):
@@ -379,6 +380,11 @@ def main(
379380
validation_trajectores = np.zeros((runs, epochs))
380381
validation_accuracies = np.zeros((runs, epochs))
381382
world_size = comm.get_world_size()
383+
384+
dist.barrier()
385+
print(f"Running experiment with {world_size} processes on dataset {dataset}")
386+
print(f"Using cache: {use_cache}")
387+
382388
for i in range(runs):
383389
log_prefix = f"{log_dir}/{dataset}_{world_size}_cache={use_cache}_run_{i}"
384390
training_traj, val_traj, val_accuracy = _run_experiment(

0 commit comments

Comments
 (0)