Commit ee196dc

refactor(maca): adapt MACA kernels to new dtype dispatch and Scalar APIs
Port MACA backend to master's backend-explicit dtype registration:

- Add src/core/runtime/maca/maca_dispatch.h: register __half / __maca_bfloat16 via
  BackendTypeMap<kMACA, kFLOAT16/kBFLOAT16>, declare
  INFINI_REGISTER_STANDARD_BACKEND_TYPES(kMACA), and expose DispatchMacaFunc /
  MacaTypeMap mirroring the CUDA side.
- Replace every DispatchFunc<...>/WidestType_t/DataTypeMap_v site across 18 MACA
  kernels with DispatchMacaFunc / PromoteDataTypes.
- Replace Tensor::Fill<T>(0) template calls with Fill(0) to match the new
  Scalar-taking Tensor::Fill API.
- fill.maca: route Scalar::to<T> through common::maca::Cast<T>(scalar.to<float>())
  for __maca_bfloat16/__half to avoid ambiguous static_cast from integer Scalar
  kinds (see scalar.h TODO).
1 parent 91d6ad3 commit ee196dc

24 files changed

Lines changed: 265 additions & 108 deletions
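
Note: the fill.maca hunk referenced in the last bullet of the message is not among the files excerpted below. As a rough illustration only, a conversion helper along these lines would match the described routing; the function name and surrounding structure here are hypothetical, with only common::maca::Cast<T> and Scalar::to<float>() taken from the commit message.

// Hypothetical sketch, not code from this commit: route half/bfloat16 fills
// through float so integer Scalar kinds do not hit an ambiguous static_cast.
template <typename T>
T ScalarToMacaValue(const Scalar &scalar) {
    if constexpr (std::is_same_v<T, __half> || std::is_same_v<T, __maca_bfloat16>) {
        // Convert Scalar -> float on the host, then use the MACA cast helper.
        return common::maca::Cast<T>(scalar.to<float>());
    } else {
        // Other element types can take the Scalar's value directly.
        return scalar.to<T>();
    }
}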

CMakeLists.txt

Lines changed: 46 additions & 5 deletions
@@ -9,19 +9,23 @@ option(USE_OMP "Use OpenMP as backend for Eigen" ON)
 option(USE_NCCL "Build project for distributed running on CUDA using NCCL" ON)
 option(USE_MCCL "Build project for distributed running on MACA using MCCL" ON)
 
-project(infini_train VERSION 0.5.0 LANGUAGES CXX)
-
-# Switch to mxcc after project() so that third-party libs (glog, gflags) are
-# configured with the host compiler and their feature-detection checks pass.
+# ------------------------------------------------------------------------------
+# MACA toolchain override (must happen before project())
+# ------------------------------------------------------------------------------
+# When targeting MetaX MACA, the C/C++ compiler must be mxcc so that .maca
+# sources and device code can be compiled by the MACA toolchain.
 if(USE_MACA)
     set(MACA_PATH $ENV{MACA_PATH})
     if(NOT MACA_PATH)
-        message(FATAL_ERROR "USE_MACA=ON but environment variable MACA_PATH is not set.")
+        message(FATAL_ERROR "USE_MACA=ON but environment variable MACA_PATH is not set. "
+                "Please export MACA_PATH (e.g. /opt/maca) before configuring.")
     endif()
     set(CMAKE_C_COMPILER "${MACA_PATH}/mxgpu_llvm/bin/mxcc")
     set(CMAKE_CXX_COMPILER "${MACA_PATH}/mxgpu_llvm/bin/mxcc")
 endif()
 
+project(infini_train VERSION 0.5.0 LANGUAGES CXX)
+
 set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
@@ -41,8 +45,45 @@ include_directories(${gflags_SOURCE_DIR}/include)
 set(WITH_GFLAGS OFF CACHE BOOL "Disable glog finding system gflags" FORCE)
 set(WITH_GTEST OFF CACHE BOOL "Disable glog finding system gtest" FORCE)
 set(BUILD_TESTING OFF CACHE BOOL "Disable glog unit tests" FORCE)
+# Build glog as a static lib so its symbols are always visible at link time.
+# Under mxcc the default symbol visibility is hidden, which causes the shared
+# libglog.so to export no symbols and produces "undefined reference" errors.
 set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build glog as static library" FORCE)
 
+# Under MACA/mxcc, cmake's feature-detection test compilations do not find
+# standard POSIX system headers (mxcc has a non-standard sysroot probe path).
+# Pre-set glog's HAVE_* cache variables so that glog skips its fallback type /
+# symbol definitions, which would otherwise conflict with the real system
+# headers during the actual build.
+if(USE_MACA)
+    set(HAVE_SYS_TYPES_H 1 CACHE INTERNAL "")
+    set(HAVE_UNISTD_H 1 CACHE INTERNAL "")
+    set(HAVE_DLFCN_H 1 CACHE INTERNAL "")
+    set(HAVE_GLOB_H 1 CACHE INTERNAL "")
+    set(HAVE_PWD_H 1 CACHE INTERNAL "")
+    set(HAVE_SYS_TIME_H 1 CACHE INTERNAL "")
+    set(HAVE_SYS_UTSNAME_H 1 CACHE INTERNAL "")
+    set(HAVE_SYS_WAIT_H 1 CACHE INTERNAL "")
+    set(HAVE_SYS_SYSCALL_H 1 CACHE INTERNAL "")
+    set(HAVE_SYSLOG_H 1 CACHE INTERNAL "")
+    set(HAVE_UCONTEXT_H 1 CACHE INTERNAL "")
+    # check_type_size() uses two internal variables: the size value and a sentinel
+    # "HAVE_HAVE_<VAR>" that marks the check as done. Pre-setting only the value
+    # is insufficient — the sentinel must also be set so the check skips entirely.
+    set(HAVE_MODE_T 4 CACHE INTERNAL "") # 4 bytes on Linux
+    set(HAVE_HAVE_MODE_T TRUE CACHE INTERNAL "")
+    set(HAVE_SSIZE_T 8 CACHE INTERNAL "") # 8 bytes on 64-bit Linux
+    set(HAVE_HAVE_SSIZE_T TRUE CACHE INTERNAL "")
+    set(HAVE_PREAD 1 CACHE INTERNAL "")
+    set(HAVE_PWRITE 1 CACHE INTERNAL "")
+    set(HAVE_POSIX_FADVISE 1 CACHE INTERNAL "")
+    set(HAVE_SIGACTION 1 CACHE INTERNAL "")
+    set(HAVE_SIGALTSTACK 1 CACHE INTERNAL "")
+    set(HAVE_FCNTL 1 CACHE INTERNAL "")
+    set(HAVE_DLADDR 1 CACHE INTERNAL "")
+    set(HAVE___CXA_DEMANGLE 1 CACHE INTERNAL "")
+endif()
+
 add_subdirectory(third_party/glog)
 include_directories(${glog_SOURCE_DIR}/src)

example/gpt2/main.cc

Lines changed: 19 additions & 0 deletions
@@ -29,6 +29,9 @@
 #ifdef PROFILE_MODE
 #include "infini_train/include/profiler.h"
 #endif
+#ifdef USE_MACA
+#include "infini_train/src/core/runtime/maca/maca_guard_impl.h"
+#endif
 #include "infini_train/include/nn/parallel/utils.h"
 #include "infini_train/include/utils/global_module_hook_registry.h"
 #include "infini_train/include/utils/precision_check_config.h"
@@ -452,12 +455,28 @@ void Train(const nn::parallel::Rank &rank) {
     Profiler::Instance().Report("gpt2.report", Profiler::SortBy::DeviceTimePercentage);
     Profiler::Instance().PrintRecords("gpt2.records.log");
 #endif
+
+    // On MACA, flush all pending mcFreeAsync operations so that ATU entries for
+    // activation/gradient tensors from this step are released before the next
+    // forward pass begins. Without this, the ATU (address-translation unit)
+    // accumulates deferred frees across steps and becomes full, causing
+    // xnack(0x8) ATU-fault crashes in CastKernel and other large-tensor kernels.
+    if (device.type() == Device::DeviceType::kMACA) {
+        impl->SynchronizeDevice(device);
+    }
 }
 
 int main(int argc, char *argv[]) {
     gflags::ParseCommandLineFlags(&argc, &argv, true);
     google::InitGoogleLogging(argv[0]);
 
+    // On MACA, when TP > 1 disable P2P to prevent MCCL communication-ordering
+    // deadlocks and P2P teardown crashes. Must be set before any mcclCommInitAll
+    // call (i.e. before threads that create ProcessGroups are spawned).
+    if (FLAGS_device == kDeviceMACA && FLAGS_tensor_parallel > 1) {
+        setenv("MACA_P2P_DISABLE", "1", 1);
+    }
+
     auto precision_config = utils::PrecisionCheckConfig::Parse(FLAGS_precision_check);
     nn::parallel::global::InitAllEnv(FLAGS_nthread_per_process, FLAGS_tensor_parallel, FLAGS_sequence_parallel,
                                      FLAGS_pipeline_parallel, FLAGS_virtual_pipeline_parallel);

example/llama3/main.cc

Lines changed: 16 additions & 0 deletions
@@ -427,12 +427,28 @@ void Train(const nn::parallel::Rank &rank) {
     Profiler::Instance().Report("llama3.report", Profiler::SortBy::DeviceTimePercentage);
     Profiler::Instance().PrintRecords("llama3.records.log");
 #endif
+
+    // On MACA, flush all pending mcFreeAsync operations so that ATU entries for
+    // activation/gradient tensors from this step are released before the next
+    // forward pass begins. Without this, the ATU (address-translation unit)
+    // accumulates deferred frees across steps and becomes full, causing
+    // xnack(0x8) ATU-fault crashes in CastKernel and other large-tensor kernels.
+    if (device.type() == Device::DeviceType::kMACA) {
+        impl->SynchronizeDevice(device);
+    }
 }
 
 int main(int argc, char *argv[]) {
     gflags::ParseCommandLineFlags(&argc, &argv, true);
     google::InitGoogleLogging(argv[0]);
 
+    // On MACA, when TP > 1 disable P2P to prevent MCCL communication-ordering
+    // deadlocks and P2P teardown crashes. Must be set before any mcclCommInitAll
+    // call (i.e. before threads that create ProcessGroups are spawned).
+    if (FLAGS_device == kDeviceMACA && FLAGS_tensor_parallel > 1) {
+        setenv("MACA_P2P_DISABLE", "1", 1);
+    }
+
     auto precision_config = utils::PrecisionCheckConfig::Parse(FLAGS_precision_check);
     nn::parallel::global::InitAllEnv(FLAGS_nthread_per_process, FLAGS_tensor_parallel, FLAGS_sequence_parallel,
                                      FLAGS_pipeline_parallel, FLAGS_virtual_pipeline_parallel);

infini_train/src/core/runtime/maca/maca_dispatch.h

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
+#pragma once
+
+#include <utility>
+#include <vector>
+
+#include <common/maca_bfloat16.h>
+#include <common/maca_fp16.h>
+
+#include "infini_train/include/core/backend_type_map.h"
+#include "infini_train/include/dtype_dispatch.h"
+
+// -----------------------------------------------------------------------------
+// MACA low-precision BackendTypeMap specializations:
+// FP16 -> __half, BF16 -> __maca_bfloat16
+// -----------------------------------------------------------------------------
+namespace infini_train::core {
+template <> struct BackendTypeMap<Device::DeviceType::kMACA, DataType::kFLOAT16> {
+    using type = __half;
+};
+
+template <> struct BackendTypeMap<Device::DeviceType::kMACA, DataType::kBFLOAT16> {
+    using type = __maca_bfloat16;
+};
+} // namespace infini_train::core
+
+// Register all standard (non-low-precision) dtypes for the MACA backend.
+// FP16/BF16 are registered explicitly above with their MACA-native scalar types.
+INFINI_REGISTER_STANDARD_BACKEND_TYPES(infini_train::Device::DeviceType::kMACA)
+
+namespace infini_train::core::maca {
+
+template <DataType DType> struct MacaTypeMap : BackendTypeMap<Device::DeviceType::kMACA, DType> {};
+
+// -----------------------------------------------------------------------------
+// MACA dispatch helpers
+// -----------------------------------------------------------------------------
+
+template <DataType... AllowedDTypes, typename Functor, typename... Args>
+auto DispatchMacaFunc(DataType dtype, Functor &&func, std::string_view context_identifier = "", Args &&...args) {
+    return infini_train::DispatchByTypeMap<MacaTypeMap, AllowedDTypes...>(
+        dtype, std::forward<Functor>(func), context_identifier, std::forward<Args>(args)...);
+}
+
+template <typename... AllowedTypeLists, typename Functor, typename... Args>
+auto DispatchMacaFunc(const std::vector<DataType> &dtypes, Functor &&func, std::string_view context_identifier = "",
+                      Args &&...args) {
+    return infini_train::DispatchByTypeMap<MacaTypeMap, AllowedTypeLists...>(
+        dtypes, std::forward<Functor>(func), context_identifier, std::forward<Args>(args)...);
+}
+
+} // namespace infini_train::core::maca
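
Not part of the diff: a hedged usage sketch of the single-dtype DispatchMacaFunc overload, mirroring the call sites changed in the kernels below. The helper name and kernel body are hypothetical; only the call shape (dtype, templated lambda, context string) follows the header above.

// Hypothetical call site, for illustration only.
void ScaleInPlace(const std::shared_ptr<Tensor> &tensor, float factor) {
    core::maca::DispatchMacaFunc<INFINI_ALL_FLOATING_TYPES>(
        tensor->Dtype(),
        [=]<typename T>() {
            // T resolves through MacaTypeMap: kFLOAT16 -> __half, kBFLOAT16 -> __maca_bfloat16,
            // so the lambda body sees the MACA-native scalar type.
            auto *data = static_cast<T *>(tensor->DataPtr());
            // ... launch a scaling kernel over `data` here ...
        },
        "MACA ScaleInPlace");
}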

infini_train/src/kernels/maca/accumulate_grad.maca

Lines changed: 3 additions & 2 deletions
@@ -6,6 +6,7 @@
 #include "infini_train/include/dispatcher.h"
 #include "infini_train/include/tensor.h"
 
+#include "infini_train/src/core/runtime/maca/maca_dispatch.h"
 #include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
 
 namespace infini_train::kernels::maca {
@@ -29,7 +30,7 @@ void AccumulateGrad(const std::shared_ptr<Tensor> &gradient, float rate, const s
             infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
             ->maca_stream();
 
-    DispatchFunc<INFINI_ALL_FLOATING_TYPES>(
+    core::maca::DispatchMacaFunc<INFINI_ALL_FLOATING_TYPES>(
         gradient->Dtype(),
         [=]<typename T>() {
            AccumulateGradKernel<<<num_blocks, threads_per_block, 0, maca_stream>>>(
@@ -73,7 +74,7 @@ void AdamAccumulateGrad(const std::shared_ptr<Tensor> &grad, const std::shared_p
             infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
             ->maca_stream();
 
-    DispatchFunc<INFINI_ALL_FLOATING_TYPES>(
+    core::maca::DispatchMacaFunc<INFINI_ALL_FLOATING_TYPES>(
         grad->Dtype(),
         [=]<typename T>() {
            AdamAccumulateGradKernel<<<num_blocks, threads_per_block, 0, maca_stream>>>(

infini_train/src/kernels/maca/cast.maca

Lines changed: 2 additions & 1 deletion
@@ -8,6 +8,7 @@
 #include "infini_train/include/dispatcher.h"
 #include "infini_train/include/tensor.h"
 
+#include "infini_train/src/core/runtime/maca/maca_dispatch.h"
 #include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
 
 namespace infini_train::kernels::maca {
@@ -33,7 +34,7 @@ std::shared_ptr<Tensor> Cast(std::shared_ptr<Tensor> input, DataType dtype) {
     dim3 grid_dims(CEIL_DIV(num_elements, block_dims.x));
     const size_t step = grid_dims.x * block_dims.x;
 
-    DispatchFunc<DataTypeList<INFINI_ALL_TYPES>, DataTypeList<INFINI_ALL_TYPES>>(
+    core::maca::DispatchMacaFunc<DataTypeList<INFINI_ALL_TYPES>, DataTypeList<INFINI_ALL_TYPES>>(
         {dtype, input->Dtype()},
         [=]<typename Tdst, typename Tsrc>() {
            auto dst = static_cast<Tdst *>(dst_tensor->DataPtr());

infini_train/src/kernels/maca/comm.maca

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ std::vector<std::shared_ptr<Tensor>> ReduceAddCoalesced(const std::vector<std::v
     std::vector<std::vector<std::shared_ptr<Tensor>>> to_destination_grads;
     for (int i = 0; i < grads[0].size(); ++i) {
         outputs.emplace_back(std::make_shared<Tensor>(grads[0][i]->Dims(), grads[0][i]->Dtype(), destination));
-        outputs[i]->Fill<float>(0.0);
+        outputs[i]->Fill(0.0);
     }
     for (int i = 0; i < grads.size(); ++i) {
         to_destination_grads.push_back(std::vector<std::shared_ptr<Tensor>>());

infini_train/src/kernels/maca/concat.maca

Lines changed: 5 additions & 4 deletions
@@ -11,6 +11,7 @@
 #include "infini_train/include/dispatcher.h"
 #include "infini_train/include/tensor.h"
 
+#include "infini_train/src/core/runtime/maca/maca_dispatch.h"
 #include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
 
 namespace infini_train::kernels::maca {
@@ -102,7 +103,7 @@ std::shared_ptr<Tensor> ConcatForward(const std::vector<std::shared_ptr<Tensor>>
     int threads_per_block = 256;
     int num_blocks = static_cast<int>((total + threads_per_block - 1) / threads_per_block);
 
-    DispatchFunc<INFINI_ALL_TYPES>(
+    core::maca::DispatchMacaFunc<INFINI_ALL_TYPES>(
         dtype,
         [=, &inputs, &host_offsets]<typename T>() {
            std::vector<const T *> host_input_ptrs;
@@ -185,8 +186,8 @@ std::vector<std::shared_ptr<Tensor>> ConcatBackward(const std::shared_ptr<Tensor
     grads.reserve(input_dims_list.size());
     for (const auto &dvec : input_dims_list) {
         auto t = std::make_shared<Tensor>(dvec, dtype, device);
-        DispatchFunc<INFINI_ALL_TYPES>(
-            dtype, [=]<typename T>() { t->Fill<T>(0); }, "MACA ConcatBackward");
+        core::maca::DispatchMacaFunc<INFINI_ALL_TYPES>(
+            dtype, [=]<typename T>() { t->Fill(0); }, "MACA ConcatBackward");
         grads.push_back(t);
     }
 
@@ -208,7 +209,7 @@ std::vector<std::shared_ptr<Tensor>> ConcatBackward(const std::shared_ptr<Tensor
     int threads_per_block = 256;
     int num_blocks = static_cast<int>((total + threads_per_block - 1) / threads_per_block);
 
-    DispatchFunc<INFINI_ALL_TYPES>(
+    core::maca::DispatchMacaFunc<INFINI_ALL_TYPES>(
         dtype,
         [=, &grads, &host_offsets]<typename T>() {
            std::vector<T *> host_ptrs;

infini_train/src/kernels/maca/cross_entropy.maca

Lines changed: 4 additions & 3 deletions
@@ -12,6 +12,7 @@
 #include "infini_train/include/dispatcher.h"
 #include "infini_train/include/tensor.h"
 
+#include "infini_train/src/core/runtime/maca/maca_dispatch.h"
 #include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
 
 namespace infini_train::kernels::maca {
@@ -91,7 +92,7 @@ std::shared_ptr<Tensor> CrossEntropyForward(const std::shared_ptr<Tensor> &input
             infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
             ->maca_stream();
 
-    return DispatchFunc<DataTypeList<DataType::kUINT8, DataType::kINT64>, DataTypeList<INFINI_ALL_FLOATING_TYPES>>(
+    return core::maca::DispatchMacaFunc<DataTypeList<DataType::kUINT8, DataType::kINT64>, DataTypeList<INFINI_ALL_FLOATING_TYPES>>(
         {target->Dtype(), input->Dtype()},
         [=]<typename Ttarget, typename Tinput>() {
            const Ttarget *target_ptr = static_cast<const Ttarget *>(target->DataPtr());
@@ -198,10 +199,10 @@ std::shared_ptr<Tensor> CrossEntropyBackward(const std::shared_ptr<Tensor> &inpu
             infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
             ->maca_stream();
 
-    DispatchFunc<DataTypeList<DataType::kUINT8, DataType::kINT64>, DataTypeList<INFINI_ALL_FLOATING_TYPES>>(
+    core::maca::DispatchMacaFunc<DataTypeList<DataType::kUINT8, DataType::kINT64>, DataTypeList<INFINI_ALL_FLOATING_TYPES>>(
         {target->Dtype(), input_casted->Dtype()},
         [=]<typename Ttarget, typename Tinput>() {
-            grad_input->Fill<Tinput>(0);
+            grad_input->Fill(0);
             const Tinput *output_grad_ptr = static_cast<const Tinput *>(grad_output->DataPtr());
             const Ttarget *target_ptr = static_cast<const Ttarget *>(target->DataPtr());
             const Tinput *input_ptr = static_cast<const Tinput *>(input_casted->DataPtr());

infini_train/src/kernels/maca/elementwise.maca

Lines changed: 10 additions & 15 deletions
@@ -8,6 +8,7 @@
 #include "infini_train/include/dispatcher.h"
 #include "infini_train/include/tensor.h"
 
+#include "infini_train/src/core/runtime/maca/maca_dispatch.h"
 #include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
 
 namespace infini_train::kernels::maca {
@@ -766,9 +767,7 @@ std::shared_ptr<Tensor> UnaryBackward(const std::shared_ptr<Tensor> &grad_output
                                       Func unary_fn) {
     auto dtype = grad_output->Dtype();
     auto a_dtype = a ? a->Dtype() : dtype;
-    DataType promoted_type = DispatchFunc<DataTypeList<INFINI_ALL_TYPES>, DataTypeList<INFINI_ALL_TYPES>>(
-        {dtype, a_dtype}, [=]<typename Tgrad, typename Ta>() { return DataTypeMap_v<WidestType_t<Tgrad, Ta>>; },
-        "MACA UnaryBackward");
+    DataType promoted_type = PromoteDataTypes(dtype, a_dtype);
 
     auto grad_output_promoted
         = dtype == promoted_type ? grad_output : std::make_shared<Tensor>(grad_output->To(promoted_type));
@@ -795,9 +794,7 @@ std::shared_ptr<Tensor> BinaryForward(const std::shared_ptr<Tensor> &a, const st
     auto a_dtype = a->Dtype();
     auto b_dtype = b->Dtype();
 
-    DataType promoted_type = DispatchFunc<DataTypeList<INFINI_ALL_TYPES>, DataTypeList<INFINI_ALL_TYPES>>(
-        {a_dtype, b_dtype}, [=]<typename Ta, typename Tb>() { return DataTypeMap_v<WidestType_t<Ta, Tb>>; },
-        "MACA BinaryForward");
+    DataType promoted_type = PromoteDataTypes(a_dtype, b_dtype);
 
     auto a_promoted = a_dtype == promoted_type ? a : std::make_shared<Tensor>(a->To(promoted_type));
     auto b_promoted = b_dtype == promoted_type ? b : std::make_shared<Tensor>(b->To(promoted_type));
@@ -837,9 +834,7 @@ BinaryBackward(const std::shared_ptr<Tensor> &grad_output, const std::shared_ptr
     auto a_dtype = a_promoted ? a_promoted->Dtype() : dtype;
     auto b_dtype = b_promoted ? b_promoted->Dtype() : dtype;
     // Compute dtype determined by saved tensors (forward compute dtype), not grad_output
-    DataType promoted_type = DispatchFunc<DataTypeList<INFINI_ALL_TYPES>, DataTypeList<INFINI_ALL_TYPES>>(
-        {a_dtype, b_dtype}, [=]<typename Ta, typename Tb>() { return DataTypeMap_v<WidestType_t<Ta, Tb>>; },
-        "MACA BinaryBackward");
+    DataType promoted_type = PromoteDataTypes(a_dtype, b_dtype);
 
     CHECK(a_num_elements >= b_num_elements && a_num_elements % b_num_elements == 0);
 
@@ -867,26 +862,26 @@ BinaryBackward(const std::shared_ptr<Tensor> &grad_output, const std::shared_ptr
     switch (promoted_type) {
         DISPATCH_CASE(WRAP({
             if (needs_broadcast) {
-                grad_a->Fill<float>(0.0f);
-                grad_b->Fill<float>(0.0f);
+                grad_a->Fill(0.0f);
+                grad_b->Fill(0.0f);
             }
             LaunchBackward<256, float>(fn_a, fn_b, grad_a, grad_b, a_dims, b_dims, grad_output_promoted,
                                        a_promoted, b_promoted);
         }),
         DataType::kFLOAT32)
         DISPATCH_CASE(WRAP({
             if (needs_broadcast) {
-                grad_a->Fill<__maca_bfloat16>(0);
-                grad_b->Fill<__maca_bfloat16>(0);
+                grad_a->Fill(0);
+                grad_b->Fill(0);
             }
             LaunchBackward<256, __maca_bfloat16>(fn_a, fn_b, grad_a, grad_b, a_dims, b_dims,
                                                  grad_output_promoted, a_promoted, b_promoted);
         }),
         DataType::kBFLOAT16)
         // FIXME(zbl): AtomicAdd does not support int64_t
         // DISPATCH_CASE(WRAP({
-        //     grad_a->Fill<int64_t>(0);
-        //     grad_b->Fill<int64_t>(0);
+        //     grad_a->Fill(0);
+        //     grad_b->Fill(0);
        //      LaunchBackward<256, int64_t>(fn_a, fn_b, grad_a, grad_b, a_dims, b_dims, grad_output, a,
        //                                   b);
        //  }),
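
A note on the PromoteDataTypes substitution above, stated as an assumption rather than taken from the repository: the removed dispatch computed the widest of the two runtime dtypes via WidestType_t / DataTypeMap_v, and the new one-liner is expected to return that same promoted DataType. Illustratively:

// Old pattern (removed above): dispatch over both dtypes and map the widest C++ type back to a DataType.
DataType promoted_old = DispatchFunc<DataTypeList<INFINI_ALL_TYPES>, DataTypeList<INFINI_ALL_TYPES>>(
    {a_dtype, b_dtype}, [=]<typename Ta, typename Tb>() { return DataTypeMap_v<WidestType_t<Ta, Tb>>; },
    "example");
// New pattern: assumed-equivalent direct promotion, e.g. (kBFLOAT16, kFLOAT32) -> kFLOAT32.
DataType promoted_new = PromoteDataTypes(a_dtype, b_dtype);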
