
Commit b8a1ce6

Add optimized masked local Gather kernel (#12)
* Setting up individual benchmarks for Graphcast data and layers
* Updated local kernel API
* Add local kernel impl
* Add missing header file and unit test
* Add python side bindings
* Add unit test for opt gather
* Updated build script to add pyproject (intermediate fix before moving to scikit-build)
  - Change Macro to be MPI-free
  - Passing tests
* Add support for older compute arches
* Patch to allow half-precision operators in extensions
1 parent 38ee37b commit b8a1ce6

16 files changed

Lines changed: 822 additions & 84 deletions

DGraph/distributed/RankLocalOps.py

Lines changed: 50 additions & 0 deletions
@@ -17,6 +17,14 @@
 
 import torch
 
+try:
+    from torch_local import local_masked_gather, local_masked_scatter
+
+    _LOCAL_OPT_KERNELS_AVAILABLE = True
+except ImportError:
+    _LOCAL_OPT_KERNELS_AVAILABLE = False
+    import warnings
+
 
 def RankLocalMaskedGather(
     _src: torch.Tensor, indices: torch.Tensor, rank_mapping: torch.Tensor, rank: int
@@ -31,6 +39,48 @@ def RankLocalMaskedGather(
     return local_gathered_data
 
 
+def __Local_Gather_impl(_src_tensor, local_indices):
+    num_features = _src_tensor.shape[-1]
+    bs = _src_tensor.shape[0]
+    local_indices = local_indices.view(bs, -1, 1).expand(bs, -1, num_features)
+    local_gathered_data = torch.gather(_src_tensor, 1, local_indices)
+    return local_gathered_data
+
+
+def OptimizedRankLocalMaskedGather(
+    src: torch.Tensor,
+    indices: torch.Tensor,
+    rank_mapping: torch.Tensor,
+    output: torch.Tensor,
+    rank: int,
+) -> torch.Tensor:
+    """
+    This function gathers the indices from the source rank to the destination rank.
+    """
+    if not _LOCAL_OPT_KERNELS_AVAILABLE:
+        warnings.warn(
+            "Optimized local kernels are not available. Falling back to the default implementation."
+        )
+        return RankLocalMaskedGather(src, indices, rank_mapping, rank)
+    bs = src.shape[0]
+    indices = indices.view(bs, -1, 1)
+    num_output_rows = indices.shape[1]
+    num_src_rows = src.shape[1]
+    num_features = src.shape[-1]
+    local_masked_gather(
+        src,
+        indices,
+        rank_mapping,
+        output,
+        bs,
+        num_src_rows,
+        num_features,
+        num_output_rows,
+        rank,
+    )
+    return output
+
+
 def OutOfPlaceRankLocalMaskedGather(
     _src: torch.Tensor, indices: torch.Tensor, rank_mapping: torch.Tensor, rank: int
 ) -> torch.Tensor:
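Note (not part of the commit): a rough usage sketch of the new entry point. The tensor shapes and the rank-mapping layout below are assumptions inferred from the kernel arguments; the sketch also assumes the torch_local extension is built and a CUDA device is available.

# Hypothetical usage of OptimizedRankLocalMaskedGather; shapes are assumptions.
import torch
from DGraph.distributed.RankLocalOps import OptimizedRankLocalMaskedGather

bs, num_src_rows, num_output_rows, num_features = 2, 1024, 256, 64
rank = 0

src = torch.randn(bs, num_src_rows, num_features, device="cuda")
indices = torch.randint(0, num_src_rows, (bs, num_output_rows), device="cuda")
# Assumed layout: one owning rank per output row; rows owned by other ranks are left untouched.
rank_mapping = torch.zeros(bs, num_output_rows, dtype=torch.long, device="cuda")
# The optimized path writes into a caller-provided buffer instead of allocating one.
output = torch.zeros(bs, num_output_rows, num_features, device="cuda")

gathered = OptimizedRankLocalMaskedGather(src, indices, rank_mapping, output, rank)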

DGraph/distributed/csrc/local_data_kernels.cuh

Lines changed: 102 additions & 11 deletions
@@ -15,16 +15,13 @@
  */
 #pragma once
 #include <cuda.h>
-#include <thrust/pair.h>
-#include <cub/cub.cuh>
-
 
 /**
  *
  * This file houses all the kernels that we use for local data communication.
  * Currently all the kernels are in the Local namespace and in the same file, but
  * we can split this up in the future if needed for better organization.
  *
  */
 namespace Local
 {
@@ -36,7 +33,7 @@ namespace Local
 
     __global__ void Fused_ReLU_Scatter_Kernel(
         const float *__restrict__ values,
-        const float *__restrict__ indices,
+        const long *__restrict__ indices,
         float *__restrict__ output,
         const int mini_batch_size,
         const int num_values_rows,
@@ -60,7 +57,7 @@ namespace Local
 
             for (size_t row = gidy; row < num_values_rows; row += nthreadsy)
             {
-                const int ind = __float2int_rd(indices[ind_offset + row]);
+                const int ind = indices[ind_offset + row];
 
                 for (size_t i = gidx; i < num_cols; i += nthreadsx)
                 {
@@ -79,7 +76,7 @@ namespace Local
         const float *__restrict__ values_2,
         const float *__restrict__ means,
         const float *__restrict__ inv_var,
-        const float *__restrict__ indices,
+        const long *__restrict__ indices,
         float *__restrict__ output,
         const int mini_batch_size,
         const int num_values_rows,
@@ -103,7 +100,7 @@ namespace Local
 
             for (size_t row = gidy; row < num_values_rows; row += nthreadsy)
            {
-                const int ind = __float2int_rd(indices[ind_offset + row]);
+                const int ind = indices[ind_offset + row];
 
                 for (size_t i = gidx; i < num_cols; i += nthreadsx)
                 {
@@ -119,7 +116,7 @@ namespace Local
 
     __global__ void Sparse_Scatter_Kernel(
         const float *__restrict__ values,
-        const float *__restrict__ indices,
+        const long *__restrict__ indices,
         float *__restrict__ output,
         const int mini_batch_size,
         const int num_values_rows,
@@ -143,7 +140,7 @@ namespace Local
 
            for (size_t row = gidy; row < num_values_rows; row += nthreadsy)
            {
-                const int ind = __float2int_rd(indices[ind_offset + row]);
+                const int ind = indices[ind_offset + row];
 
                 for (size_t i = gidx; i < num_cols; i += nthreadsx)
                 {
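Note (not part of the commit): because the kernels now read the indices directly as long * instead of converting from float, index tensors passed from the Python side presumably have to be int64. A minimal sketch of that assumption:

# Assumption based on the `const long *__restrict__ indices` signatures above:
# index tensors handed to these kernels should be int64 (torch.long), not float.
import torch

indices = torch.tensor([[3.0, 0.0, 2.0]], device="cuda")  # e.g. produced as float upstream
indices = indices.to(torch.long)                           # matches data_ptr<long>() on the C++ side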
@@ -160,4 +157,98 @@ namespace Local
             }
         }
     }
 
+    __global__ void Rank_Local_Gather_Kernel(
+        const float *__restrict__ values,
+        const long *__restrict__ indices,
+        const long *__restrict__ rank_placement,
+        float *__restrict__ output,
+        const int mini_batch_size,
+        const int num_values_rows,
+        const int num_cols,
+        const int num_output_rows,
+        const int local_rank)
+    {
+        const size_t gidx = threadIdx.x + blockIdx.x * blockDim.x;
+        const size_t gidy = threadIdx.y + blockIdx.y * blockDim.y;
+        const size_t gidz = threadIdx.z + blockIdx.z * blockDim.z;
+
+        const size_t nthreadsx = gridDim.x * blockDim.x;
+        const size_t nthreadsy = gridDim.y * blockDim.y;
+        const size_t nthreadsz = gridDim.z * blockDim.z;
+
+        for (size_t mb_i = gidz; mb_i < mini_batch_size; mb_i += nthreadsz)
+        {
+            const auto values_offset = mb_i * num_cols * num_values_rows;
+            const auto output_offset = mb_i * num_cols * num_output_rows;
+            const auto ind_offset = mb_i * num_output_rows;
+            const auto rank_placement_offset = mb_i * num_output_rows;
+
+            for (size_t row = gidy; row < num_output_rows; row += nthreadsy)
+            {
+                const int ind = indices[ind_offset + row];
+                const int row_rank = rank_placement[rank_placement_offset + row];
+                // Only gather the values if the rank is the same as the local rank
+                if (row_rank == local_rank)
+                {
+                    // Probably not needed, but just in case
+                    if (ind > -1 && ind < num_values_rows)
+                    {
+                        for (size_t i = gidx; i < num_cols; i += nthreadsx)
+                        {
+                            const auto val = values[values_offset + ind * num_cols + i];
+                            output[output_offset + row * num_cols + i] = val;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    __global__ void Rank_Local_Scatter_Kernel(
+        const float *__restrict__ values,
+        const long *__restrict__ indices,
+        const long *__restrict__ rank_placement,
+        float *__restrict__ output,
+        const int mini_batch_size,
+        const int num_values_rows,
+        const int num_cols,
+        const int num_output_rows,
+        const int local_rank)
+    {
+        const size_t gidx = threadIdx.x + blockIdx.x * blockDim.x;
+        const size_t gidy = threadIdx.y + blockIdx.y * blockDim.y;
+        const size_t gidz = threadIdx.z + blockIdx.z * blockDim.z;
+
+        const size_t nthreadsx = gridDim.x * blockDim.x;
+        const size_t nthreadsy = gridDim.y * blockDim.y;
+        const size_t nthreadsz = gridDim.z * blockDim.z;
+
+        for (size_t mb_i = gidz; mb_i < mini_batch_size; mb_i += nthreadsz)
+        {
+            const auto values_offset = mb_i * num_cols * num_values_rows;
+            const auto output_offset = mb_i * num_cols * num_output_rows;
+            const auto ind_offset = mb_i * num_values_rows;
+            const auto rank_placement_offset = mb_i * num_output_rows;
+
+            for (size_t row = gidy; row < num_values_rows; row += nthreadsy)
+            {
+                const int ind = indices[ind_offset + row];
+                const int row_rank = rank_placement[rank_placement_offset + row];
+                // Only scatter the values if the rank is the same as the local rank
+                if (row_rank == local_rank)
+                {
+                    // Probably not needed, but just in case
+                    if (ind > -1 && ind < num_output_rows)
+                    {
+                        for (size_t i = gidx; i < num_cols; i += nthreadsx)
+                        {
+                            const auto val = values[values_offset + row * num_cols + i];
+                            atomicAdd(&output[output_offset + ind * num_cols + i], Max(val, 0.0));
+                        }
+                    }
+                }
+            }
+        }
+    }
 } // namespace Local
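Note (not part of the commit): for intuition, and as a possible cross-check against the CUDA path, the same masked-gather semantics can be sketched in plain PyTorch. The function name masked_gather_reference is hypothetical, not something this commit adds.

import torch

def masked_gather_reference(values, indices, rank_placement, output, local_rank):
    # Pure-PyTorch mirror of Rank_Local_Gather_Kernel (reference only, not performant).
    #   values:         (bs, num_values_rows, num_cols)
    #   indices:        (bs, num_output_rows), int64
    #   rank_placement: (bs, num_output_rows), int64 rank owning each output row
    #   output:         (bs, num_output_rows, num_cols), written in place
    bs, num_values_rows, _ = values.shape
    for mb in range(bs):
        for row in range(indices.shape[1]):
            ind = int(indices[mb, row])
            # Same guards as the kernel: only rows owned by this rank, with an in-range index.
            if rank_placement[mb, row] == local_rank and 0 <= ind < num_values_rows:
                output[mb, row] = values[mb, ind]
    return output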
Lines changed: 24 additions & 0 deletions (new file)
@@ -0,0 +1,24 @@
+/**
+ * Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+ * the CONTRIBUTORS file. See the top-level LICENSE file for details.
+ *
+ * LLNL-CODE-697807.
+ * All rights reserved.
+ *
+ * This file is part of LBANN: Livermore Big Artificial Neural Network
+ * Toolkit. For details, see http://software.llnl.gov/LBANN or
+ * https://github.com/LBANN and https://github.com/LLNL/LBANN.
+ *
+ * SPDX-License-Identifier: (Apache-2.0)
+ */
+
+#include <torch/extension.h>
+#include "torch_local.hpp"
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+    m.def("local_masked_gather", &local_masked_gather, "Masked Gather");
+    m.def("local_masked_scatter", &local_masked_scatter, "Masked Scatter");
+}
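Note (not part of the commit): the module above has to be compiled into a torch_local extension before the Python-side import succeeds. The commit message mentions a pyproject-based build script; as an ad hoc alternative, torch.utils.cpp_extension.load could be used. The source file names below are assumptions, not paths taken from this commit.

# Sketch only: source names/paths are hypothetical.
from torch.utils.cpp_extension import load

torch_local = load(
    name="torch_local",
    sources=[
        "DGraph/distributed/csrc/torch_local_bindings.cpp",  # hypothetical path to the PYBIND11_MODULE file
        "DGraph/distributed/csrc/torch_local.cu",            # hypothetical path to the launcher file below
    ],
    extra_include_paths=["DGraph/distributed/csrc"],
    verbose=True,
)
# After loading, the functions registered via m.def(...) are available:
# torch_local.local_masked_gather, torch_local.local_masked_scatter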
Lines changed: 117 additions & 0 deletions (new file)
@@ -0,0 +1,117 @@
+/**
+ * Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+ * the CONTRIBUTORS file. See the top-level LICENSE file for details.
+ *
+ * LLNL-CODE-697807.
+ * All rights reserved.
+ *
+ * This file is part of LBANN: Livermore Big Artificial Neural Network
+ * Toolkit. For details, see http://software.llnl.gov/LBANN or
+ * https://github.com/LBANN and https://github.com/LLNL/LBANN.
+ *
+ * SPDX-License-Identifier: (Apache-2.0)
+ */
+#include <torch/extension.h>
+#include <c10/cuda/CUDAStream.h>
+#include "torch_local.hpp"
+#include "local_data_kernels.cuh"
+#include "macros.hpp"
+
+#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x) \
+    CHECK_CUDA(x);     \
+    CHECK_CONTIGUOUS(x)
+
+torch::Tensor local_masked_gather(torch::Tensor input,
+                                  torch::Tensor indices,
+                                  torch::Tensor rank_local_placement,
+                                  torch::Tensor output,
+                                  const int num_batches,
+                                  const int num_values_rows,
+                                  const int num_cols,
+                                  const int num_output_rows,
+                                  const int local_rank)
+{
+    CHECK_INPUT(input);
+    CHECK_INPUT(indices);
+    CHECK_INPUT(rank_local_placement);
+    CHECK_INPUT(output);
+
+    const float *input_ptr = input.data_ptr<float>();
+    const long *indices_ptr = indices.data_ptr<long>();
+    const long *rank_local_placement_ptr = rank_local_placement.data_ptr<long>();
+    float *output_ptr = output.data_ptr<float>();
+
+    dim3 block_dims, grid_dims;
+    block_dims.x = 32;
+    block_dims.y = 32;
+    block_dims.z = 1;
+
+    const auto num_grids_needed = (num_output_rows + block_dims.y - 1) / block_dims.y;
+    const auto num_col_grids_needed = (num_cols + block_dims.x - 1) / block_dims.x;
+    grid_dims.x = num_col_grids_needed < 65535 ? num_col_grids_needed : 65535;
+    grid_dims.y = num_grids_needed < 65535 ? num_grids_needed : 65535;
+    grid_dims.z = 1;
+
+    // Get the default stream for the current device
+    at::cuda::CUDAStream defaultStream = at::cuda::getDefaultCUDAStream(input.device().index());
+    Local::Rank_Local_Gather_Kernel<<<grid_dims, block_dims>>>(input_ptr,
+                                                               indices_ptr,
+                                                               rank_local_placement_ptr,
+                                                               output_ptr,
+                                                               num_batches,
+                                                               num_values_rows,
+                                                               num_cols,
+                                                               num_output_rows,
+                                                               local_rank);
+    CUDACHECK(cudaGetLastError());
+    return output;
+}
+
+torch::Tensor local_masked_scatter(torch::Tensor input,
+                                   torch::Tensor indices,
+                                   torch::Tensor rank_local_placement,
+                                   torch::Tensor output,
+                                   const int num_batches,
+                                   const int num_values_rows,
+                                   const int num_cols,
+                                   const int num_output_rows,
+                                   const int rank)
+{
+    CHECK_INPUT(input);
+    CHECK_INPUT(indices);
+    CHECK_INPUT(rank_local_placement);
+    CHECK_INPUT(output);
+
+    const float *input_ptr = input.data_ptr<float>();
+    const long *indices_ptr = indices.data_ptr<long>();
+    const long *rank_local_placement_ptr = rank_local_placement.data_ptr<long>();
+    float *output_ptr = output.data_ptr<float>();
+
+    dim3 block_dims, grid_dims;
+    block_dims.x = 32;
+    block_dims.y = 32;
+    block_dims.z = 1;
+
+    const auto num_grids_needed = (num_output_rows + block_dims.y - 1) / block_dims.y;
+    const auto num_col_grids_needed = (num_cols + block_dims.x - 1) / block_dims.x;
+    grid_dims.x = num_col_grids_needed < 65535 ? num_col_grids_needed : 65535;
+    grid_dims.y = num_grids_needed < 65535 ? num_grids_needed : 65535;
+    grid_dims.z = 1;
+    // Get the default stream for the current device
+    at::cuda::CUDAStream defaultStream = at::cuda::getDefaultCUDAStream(input.device().index());
+    Local::Rank_Local_Scatter_Kernel<<<grid_dims, block_dims>>>(input_ptr,
+                                                                indices_ptr,
+                                                                rank_local_placement_ptr,
+                                                                output_ptr,
+                                                                num_batches,
+                                                                num_values_rows,
+                                                                num_cols,
+                                                                num_output_rows,
+                                                                rank);
+    CUDACHECK(cudaGetLastError());
+    return output;
+}
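Note (not part of the commit): both launchers use a 32x32 thread block and cap each grid dimension at 65535; the grid-stride loops in the kernels keep the results correct even when the cap is hit. A small worked example of the ceil-division grid sizing:

# Worked example of the grid sizing used above (ceil division, capped at 65535).
block_x, block_y = 32, 32
num_cols, num_output_rows = 100, 70000

num_col_grids_needed = (num_cols + block_x - 1) // block_x          # ceil(100 / 32)   = 4
num_row_grids_needed = (num_output_rows + block_y - 1) // block_y   # ceil(70000 / 32) = 2188
grid_x = min(num_col_grids_needed, 65535)                           # 4
grid_y = min(num_row_grids_needed, 65535)                           # 2188
print(grid_x, grid_y)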
