Skip to content

Commit 4d26e25

Browse files
committed
sync baseline
1 parent 903d5c8 commit 4d26e25

3 files changed

Lines changed: 32 additions & 20 deletions

File tree

sim/simx/execute.cpp

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1569,22 +1569,18 @@ instr_trace_t* Emulator::execute(const Instr &instr, uint32_t wid) {
15691569
} break;
15701570
case TcuType::DTENSOR_START: { // Disaggregated tensor core command
15711571
//rs1 holds descriptor address
1572-
if (core_->id() == 0) { // only core 0 can issue dtensor commands
1573-
auto cluster = core_->socket()->cluster();
1574-
if (cluster->dtensor()) {
1572+
auto cluster = core_->socket()->cluster();
1573+
if (cluster->dtensor()) {
15751574
uint64_t desc_addr = rs1_data[0].u64;
15761575
cluster->dtensor()->start(desc_addr);
1577-
}
15781576
}
15791577
rd_write = false;
15801578
} break;
15811579
case TcuType::DTENSOR_POLL: {
15821580
uint32_t done = 0;
1583-
if (core_->id() == 0) { // only core 0 can poll dtensor
1584-
auto cluster = core_->socket()->cluster();
1585-
if (cluster->dtensor()) {
1586-
done = cluster->dtensor()->poll();
1587-
}
1581+
auto cluster = core_->socket()->cluster();
1582+
if (cluster->dtensor()) {
1583+
done = cluster->dtensor()->poll();
15881584
}
15891585
for (uint32_t t = 0; t < num_threads; ++t) {
15901586
rd_data[t].u32 = done;

tests/regression/dtcu_basic/kernel.cpp

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,31 @@ namespace vt = vortex::tensor;
66
using ctx = vt::wmma_context<NUM_THREADS, vt::ITYPE, vt::OTYPE>;
77

88
void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
9-
// DTCU only works on core 0
10-
// Issue the start command from the first thread of the first warp, and wait until completion
11-
if (vx_warp_id() == 0 && vx_thread_id() == 0) {
12-
vt::dtensor_start(arg->desc_addr);
13-
while (0 == vt::dtensor_poll()) {
14-
// busy wait
9+
// Virgo-style synchronization
10+
Still assumes that only core 0 can issue commands
11+
auto num_cores = vx_num_cores();
12+
bool is_leader = (vx_core_id() == 0) && (vx_warp_id() == 0) && (vx_thread_id() == 0);
13+
14+
// memory fence
15+
vx_fence();
16+
17+
// global barrier
18+
vx_barrier(0x80000000, num_cores);
19+
20+
if (is_leader) {
21+
vt::dtensor_start(arg->desc_addr);
22+
while (0 == vt::dtensor_poll()) {
23+
// busy wait
24+
}
1525
}
16-
}
26+
27+
// Commit before moving on
28+
vx_fence();
29+
vx_barrier(0x80000000, num_cores);
1730
}
1831

1932
int main() {
20-
auto arg = (kernel_arg_t *)csr_read(VX_CSR_MSCRATCH);
21-
// 1 warp w/ grid=1x1, block=NUM_THREADS x 1
22-
return vx_spawn_threads(1, arg->grid_dim, arg->block_dim, (vx_kernel_func_cb)kernel_body, arg);
33+
auto arg = (kernel_arg_t *)csr_read(VX_CSR_MSCRATCH);
34+
// 1 warp w/ grid=1x1, block=NUM_THREADS x 1
35+
return vx_spawn_threads(1, arg->grid_dim, arg->block_dim, (vx_kernel_func_cb)kernel_body, arg);
2336
}

tests/regression/dtcu_basic/main.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,8 +162,11 @@ int main(int argc, char** argv) {
162162

163163
// ---- alloc device buffers ----
164164
// Equivalent to setting the block size to the warp size, but here we only use 1 tile
165+
uint64_t num_cores = 0;
166+
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
167+
165168
kernel_arg_t karg{};
166-
karg.grid_dim[0] = 1;
169+
karg.grid_dim[0] = num_cores;
167170
karg.grid_dim[1] = 1;
168171
karg.block_dim[0] = NUM_THREADS;
169172
karg.block_dim[1] = 1;

0 commit comments

Comments
 (0)