Skip to content

Commit a18faef

Browse files
committed
Optimized cache generation with scatter call and inverse_indices
Add a standalone cache-generation script to asynchronously generate and save caches; update the run code to load the pre-saved cache files.
1 parent fd78e19 commit a18faef

3 files changed

Lines changed: 227 additions & 49 deletions

File tree

DGraph/distributed/RankLocalOps.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -137,13 +137,13 @@ def RankLocalRenumberingWithMapping(_indices, rank_mapping):
137137
"""
138138
This function removes duplicates from the indices tensor.
139139
"""
140-
unique_indices = torch.unique(_indices).to(_indices.device)
140+
unique_indices, inverse_indices = torch.unique(_indices, return_inverse=True)
141141
rank_mapping = rank_mapping.to(_indices.device)
142-
renumbered_indices = torch.zeros_like(_indices)
143-
unique_rank_mapping = torch.zeros_like(unique_indices)
144-
for i, idx in enumerate(unique_indices):
145-
renumbered_indices[_indices == idx] = i
146-
unique_rank_mapping[i] = rank_mapping[_indices == idx][0]
142+
renumbered_indices = inverse_indices
143+
unique_rank_mapping = torch.zeros_like(
144+
unique_indices, dtype=rank_mapping.dtype, device=rank_mapping.device
145+
)
146+
unique_rank_mapping.scatter_(0, inverse_indices, rank_mapping)
147147

148148
return renumbered_indices, unique_indices, unique_rank_mapping
149149

experiments/OGB/GenerateCache.py

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
# Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC.
2+
# Produced at the Lawrence Livermore National Laboratory.
3+
# Written by the LBANN Research Team (B. Van Essen, et al.) listed in
4+
# the CONTRIBUTORS file. See the top-level LICENSE file for details.
5+
#
6+
# LLNL-CODE-697807.
7+
# All rights reserved.
8+
#
9+
# This file is part of LBANN: Livermore Big Artificial Neural Network
10+
# Toolkit. For details, see http://software.llnl.gov/LBANN or
11+
# https://github.com/LBANN and https://github.com/LLNL/LBANN.
12+
#
13+
# SPDX-License-Identifier: (Apache-2.0)
14+
15+
from DGraph.data.ogbn_datasets import process_homogenous_data
16+
from ogb.nodeproppred import NodePropPredDataset
17+
from fire import Fire
18+
import os
19+
import torch
20+
from DGraph.distributed.nccl._nccl_cache import (
21+
NCCLGatherCacheGenerator,
22+
NCCLScatterCacheGenerator,
23+
)
24+
from time import perf_counter
25+
from tqdm import tqdm
26+
from multiprocessing import get_context
27+
28+
29+
# Short per-dataset prefixes used when naming cache files on disk
# (the part of the OGB dataset name after "ogbn-").
cache_prefix = {
    name: name.split("-", 1)[1]
    for name in ("ogbn-arxiv", "ogbn-products", "ogbn-proteins")
}
34+
35+
36+
def generate_cache_file(
    dist_graph,
    src_indices,
    dst_indices,
    edge_placement,
    edge_src_placement,
    edge_dest_placement,
    cache_prefix_str: str,
    rank: int,
    world_size: int,
):
    """Generate and save the NCCL gather/scatter caches for one rank.

    Intended to run inside a multiprocessing pool worker, one call per rank.

    Args:
        dist_graph: Distributed graph object (project type) providing
            per-rank node features and per-rank node counts.
        src_indices: Global edge source-node indices.
        dst_indices: Global edge destination-node indices.
        edge_placement: Rank owning each edge.
        edge_src_placement: Rank owning each edge's source node.
        edge_dest_placement: Rank owning each edge's destination node.
        cache_prefix_str: Output path prefix, e.g. "cache/arxiv".
        rank: Rank whose caches are generated.
        world_size: Total number of ranks.

    Returns:
        0 on success (``pool.starmap`` collects these as status codes).
    """
    print(f"Generating cache for rank {rank}...")
    local_node_features = dist_graph.get_local_node_features(rank).unsqueeze(0)
    num_input_rows = local_node_features.size(1)

    print(
        f"Rank {rank} has {num_input_rows} input rows with shape {local_node_features.shape}"
    )
    gather_cache = NCCLGatherCacheGenerator(
        dst_indices,
        edge_placement,
        edge_dest_placement,
        num_input_rows,
        rank,
        world_size,
    )

    nodes_per_rank = dist_graph.get_nodes_per_rank()
    nodes_per_rank = int(nodes_per_rank[rank].item())

    scatter_cache = NCCLScatterCacheGenerator(
        src_indices,
        edge_placement,
        edge_src_placement,
        nodes_per_rank,
        rank,
        world_size,
    )
    print(f"Rank {rank} completed cache generation")

    # Make sure the output directory exists; open(..., "wb") will not create it.
    out_dir = os.path.dirname(cache_prefix_str)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # BUGFIX: filenames previously contained a stray "_rank" infix
    # (e.g. "..._gather_cache_rank_{world_size}_{rank}.pt"), which the
    # training script never looks for — it loads
    # f"{prefix}_gather_cache_{world_size}_{rank}.pt". Use the loader's
    # naming so the pre-generated caches are actually picked up.
    with open(
        f"{cache_prefix_str}_gather_cache_{world_size}_{rank}.pt", "wb"
    ) as f:
        torch.save(gather_cache, f)

    with open(
        f"{cache_prefix_str}_scatter_cache_{world_size}_{rank}.pt", "wb"
    ) as f:
        torch.save(scatter_cache, f)
    return 0
85+
86+
87+
def main(dset: str, world_size: int, node_rank_placement_file: str):
    """Pre-generate the NCCL gather/scatter cache files for every rank.

    Loads the OGB dataset, partitions it using the supplied node->rank
    placement, then generates and saves each rank's caches in parallel via
    a spawned process pool.

    Args:
        dset: One of "ogbn-arxiv", "ogbn-products", "ogbn-proteins".
        world_size: Number of ranks to generate caches for (> 0).
        node_rank_placement_file: Path to a torch-saved node->rank placement.
    """
    assert dset in ["ogbn-arxiv", "ogbn-products", "ogbn-proteins"]
    assert world_size > 0
    assert os.path.exists(
        node_rank_placement_file
    ), "Node rank placement file does not exist."

    node_rank_placement = torch.load(node_rank_placement_file)

    dataset = NodePropPredDataset(
        dset,
    )

    split_index = dataset.get_idx_split()
    assert split_index is not None, "Split index is None."

    graph, labels = dataset[0]

    # (num_dims, num_edges) — printed for a quick sanity check.
    print(graph["edge_index"].shape)

    # Build the distributed view once on rank 0; the pool workers only read it.
    dist_graph = process_homogenous_data(
        graph_data=graph,
        labels=labels,
        world_Size=world_size,  # NOTE(review): odd capitalization is the API's
        split_idx=split_index,
        node_rank_placement=node_rank_placement,
        rank=0,
    )

    edge_indices = dist_graph.get_global_edge_indices()
    rank_mappings = dist_graph.get_global_rank_mappings()

    print("Edge indices shape:", edge_indices.shape)
    print("Rank mappings shape:", rank_mappings.shape)

    edge_indices = edge_indices.unsqueeze(0)
    src_indices = edge_indices[:, 0, :]
    dst_indices = edge_indices[:, 1, :]

    # Row 0 of rank_mappings places each edge (source placement is the same
    # by construction — kept separate for clarity); row 1 places each edge's
    # destination node.
    edge_placement = rank_mappings[0]
    edge_src_placement = rank_mappings[0]
    edge_dest_placement = rank_mappings[1]

    start_time = perf_counter()
    cache_prefix_str = f"cache/{cache_prefix[dset]}"
    # The workers write into cache/; create it up front so they don't race
    # or fail on a missing directory.
    os.makedirs("cache", exist_ok=True)

    # Spawn (not fork) so each worker starts with a clean CUDA/NCCL state;
    # cap the pool size to avoid oversubscribing the host.
    with get_context("spawn").Pool(min(world_size, 8)) as pool:
        args = [
            (
                dist_graph,
                src_indices,
                dst_indices,
                edge_placement,
                edge_src_placement,
                edge_dest_placement,
                cache_prefix_str,
                rank,
                world_size,
            )
            for rank in range(world_size)
        ]
        statuses = pool.starmap(generate_cache_file, args)
        # Surface worker failures instead of silently discarding them.
        assert all(s == 0 for s in statuses), "Cache generation failed."

    end_time = perf_counter()
    print(f"Cache generation time: {end_time - start_time:.4f} seconds")
    print("Cache files generated successfully.")
    # Advertise the names the training script actually loads
    # (no "_rank" infix — see generate_cache_file).
    print(
        f"Gather cache file: {cache_prefix_str}_gather_cache_{world_size}_<rank>.pt"
    )
    print(
        f"Scatter cache file: {cache_prefix_str}_scatter_cache_{world_size}_<rank>.pt"
    )


if __name__ == "__main__":
    Fire(main)

experiments/OGB/main.py

Lines changed: 57 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ def _run_experiment(
9191
hidden_dims: int = 128,
9292
num_classes: int = 40,
9393
use_cache: bool = False,
94+
dset_name: str = "arxiv",
9495
):
9596
local_rank = comm.get_rank() % torch.cuda.device_count()
9697
print(f"Rank: {local_rank} Local Rank: {local_rank}")
@@ -114,9 +115,9 @@ def _run_experiment(
114115
node_features, edge_indices, rank_mappings, labels = dataset[0]
115116

116117
node_features = node_features.to(device).unsqueeze(0)
117-
edge_indices = edge_indices.to(device)[:, :-1].unsqueeze(0)
118+
edge_indices = edge_indices.to(device).unsqueeze(0)
118119
labels = labels.to(device).unsqueeze(0)
119-
rank_mappings = rank_mappings[:, :-1]
120+
rank_mappings = rank_mappings
120121

121122
if rank == 0:
122123
print("*" * 80)
@@ -144,42 +145,55 @@ def _run_experiment(
144145

145146
if use_cache:
146147
print(f"Rank: {rank} Using Cache. Generating Cache")
147-
start_time = perf_counter()
148-
src_indices = edge_indices[:, 0, :]
149-
dst_indices = edge_indices[:, 1, :]
150-
151-
# This says where the edges are located
152-
edge_placement = rank_mappings[0]
153-
154-
# These say where the source and destination nodes are located
155-
edge_src_placement = rank_mappings[
156-
0
157-
] # Redundant but making explicit for clarity
158-
edge_dest_placement = rank_mappings[1]
159-
160-
num_input_rows = node_features.size(1)
161-
local_num_edges = (edge_placement == rank).sum().item()
162-
163-
if gather_cache is None:
164-
gather_cache = NCCLGatherCacheGenerator(
165-
dst_indices,
166-
edge_placement,
167-
edge_dest_placement,
168-
num_input_rows,
169-
rank,
170-
world_size,
171-
)
172-
if scatter_cache is None:
173-
nodes_per_rank = dataset.graph_obj.get_nodes_per_rank()
174-
175-
scatter_cache = NCCLScatterCacheGenerator(
176-
src_indices,
177-
edge_placement,
178-
edge_src_placement,
179-
nodes_per_rank[rank],
180-
rank,
181-
world_size,
182-
)
148+
149+
# Check if the cache files already exist
150+
cache_prefix = f"cache/{dset_name}"
151+
152+
scatter_cache_file = f"{cache_prefix}_scatter_cache_{world_size}_{rank}.pt"
153+
gather_cache_file = f"{cache_prefix}_gather_cache_{world_size}_{rank}.pt"
154+
155+
if os.path.exists(scatter_cache_file):
156+
scatter_cache = torch.load(scatter_cache_file, weights_only=False)
157+
if os.path.exists(gather_cache_file):
158+
gather_cache = torch.load(gather_cache_file, weights_only=False)
159+
160+
if gather_cache is None or scatter_cache is None:
161+
start_time = perf_counter()
162+
src_indices = edge_indices[:, 0, :]
163+
dst_indices = edge_indices[:, 1, :]
164+
165+
# This says where the edges are located
166+
edge_placement = rank_mappings[0]
167+
168+
# These say where the source and destination nodes are located
169+
edge_src_placement = rank_mappings[
170+
0
171+
] # Redundant but making explicit for clarity
172+
edge_dest_placement = rank_mappings[1]
173+
174+
num_input_rows = node_features.size(1)
175+
local_num_edges = (edge_placement == rank).sum().item()
176+
177+
if gather_cache is None:
178+
gather_cache = NCCLGatherCacheGenerator(
179+
dst_indices,
180+
edge_placement,
181+
edge_dest_placement,
182+
num_input_rows,
183+
rank,
184+
world_size,
185+
)
186+
if scatter_cache is None:
187+
nodes_per_rank = dataset.graph_obj.get_nodes_per_rank()
188+
189+
scatter_cache = NCCLScatterCacheGenerator(
190+
src_indices,
191+
edge_placement,
192+
edge_src_placement,
193+
nodes_per_rank[rank],
194+
rank,
195+
world_size,
196+
)
183197

184198
# Sanity checks for the cache
185199
for key, value in gather_cache.gather_send_local_placement.items():
@@ -208,11 +222,10 @@ def _run_experiment(
208222
end_time = perf_counter()
209223
print(f"Rank: {rank} Cache Generation Time: {end_time - start_time:.4f} s")
210224

211-
if rank == 0:
212-
with open(f"{log_prefix}_gather_cache_{world_size}.pt", "wb") as f:
213-
torch.save(gather_cache, f)
214-
with open(f"{log_prefix}_scatter_cache_{world_size}.pt", "wb") as f:
215-
torch.save(scatter_cache, f)
225+
with open(f"{log_prefix}_gather_cache_{world_size}_{rank}.pt", "wb") as f:
226+
torch.save(gather_cache, f)
227+
with open(f"{log_prefix}_scatter_cache_{world_size}_{rank}.pt", "wb") as f:
228+
torch.save(scatter_cache, f)
216229
print(f"Rank: {rank} Cache Generated")
217230

218231
training_times = []
@@ -366,6 +379,7 @@ def main(
366379
log_prefix,
367380
use_cache=use_cache,
368381
num_classes=num_classes,
382+
dset_name=dataset,
369383
)
370384
training_trajectores[i] = training_traj
371385
validation_trajectores[i] = val_traj

0 commit comments

Comments
 (0)