Skip to content

Commit 4d26e25

Browse files
committed
sync baseline
1 parent 903d5c8 commit 4d26e25

3 files changed

Lines changed: 32 additions & 20 deletions

File tree

sim/simx/execute.cpp

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1569,22 +1569,18 @@ instr_trace_t* Emulator::execute(const Instr &instr, uint32_t wid) {
15691569
} break;
15701570
case TcuType::DTENSOR_START: { // Disaggregated tensor core command
15711571
//rs1 holds descriptor address
1572-
if (core_->id() == 0) { // only core 0 can issue dtensor commands
1573-
auto cluster = core_->socket()->cluster();
1574-
if (cluster->dtensor()) {
1572+
auto cluster = core_->socket()->cluster();
1573+
if (cluster->dtensor()) {
15751574
uint64_t desc_addr = rs1_data[0].u64;
15761575
cluster->dtensor()->start(desc_addr);
1577-
}
15781576
}
15791577
rd_write = false;
15801578
} break;
15811579
case TcuType::DTENSOR_POLL: {
15821580
uint32_t done = 0;
1583-
if (core_->id() == 0) { // only core 0 can poll dtensor
1584-
auto cluster = core_->socket()->cluster();
1585-
if (cluster->dtensor()) {
1586-
done = cluster->dtensor()->poll();
1587-
}
1581+
auto cluster = core_->socket()->cluster();
1582+
if (cluster->dtensor()) {
1583+
done = cluster->dtensor()->poll();
15881584
}
15891585
for (uint32_t t = 0; t < num_threads; ++t) {
15901586
rd_data[t].u32 = done;

tests/regression/dtcu_basic/kernel.cpp

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,31 @@ namespace vt = vortex::tensor;
66
using ctx = vt::wmma_context<NUM_THREADS, vt::ITYPE, vt::OTYPE>;
77

88
void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
9-
// DTCU only works on core 0
10-
// Issue the start command from the first thread of the first warp, and wait until completion
11-
if (vx_warp_id() == 0 && vx_thread_id() == 0) {
12-
vt::dtensor_start(arg->desc_addr);
13-
while (0 == vt::dtensor_poll()) {
14-
// busy wait
9+
// Virgo-style synchronization
10+
Still assumes that only core 0 can issue commands
11+
auto num_cores = vx_num_cores();
12+
bool is_leader = (vx_core_id() == 0) && (vx_warp_id() == 0) && (vx_thread_id() == 0);
13+
14+
// memory fence
15+
vx_fence();
16+
17+
// global barrier
18+
vx_barrier(0x80000000, num_cores);
19+
20+
if (is_leader) {
21+
vt::dtensor_start(arg->desc_addr);
22+
while (0 == vt::dtensor_poll()) {
23+
// busy wait
24+
}
1525
}
16-
}
26+
27+
// Commit before moving on
28+
vx_fence();
29+
vx_barrier(0x80000000, num_cores);
1730
}
1831

1932
int main() {
20-
auto arg = (kernel_arg_t *)csr_read(VX_CSR_MSCRATCH);
21-
// 1 warp w/ grid=1x1, block=NUM_THREADS x 1
22-
return vx_spawn_threads(1, arg->grid_dim, arg->block_dim, (vx_kernel_func_cb)kernel_body, arg);
33+
auto arg = (kernel_arg_t *)csr_read(VX_CSR_MSCRATCH);
34+
// 1 warp w/ grid=1x1, block=NUM_THREADS x 1
35+
return vx_spawn_threads(1, arg->grid_dim, arg->block_dim, (vx_kernel_func_cb)kernel_body, arg);
2336
}

tests/regression/dtcu_basic/main.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,8 +162,11 @@ int main(int argc, char** argv) {
162162

163163
// ---- alloc device buffers ----
164164
// Equivalent to setting the block size to the warp size, but here we only use 1 tile
165+
uint64_t num_cores = 0;
166+
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
167+
165168
kernel_arg_t karg{};
166-
karg.grid_dim[0] = 1;
169+
karg.grid_dim[0] = num_cores;
167170
karg.grid_dim[1] = 1;
168171
karg.block_dim[0] = NUM_THREADS;
169172
karg.block_dim[1] = 1;

0 commit comments

Comments
 (0)