Skip to content

Commit 79083de

Browse files
author
Randy L
committed
Fix merge fallout: KMU DCR paths, CSR unit, runtime start_wg
- Remove bad merge residue in VX_csr_unit (sched_csr_if) and widen CTA TID math for Verilator
- Restore vx_intrinsics.h from the bug_fixes line to avoid spawn/intrinsics redeclaration
- Declare vx_start_wg in vortex.h; fix DBGPRINT dim argument in callbacks.inc
- stub: profiling via getenv + VX_DCR_BASE_MPM_VALUE
- rtlsim/simx/opae/xrt: program VX_DCR_KMU_* DCRs in start_wg, mirroring vx_start_g
- vecadd_v2: KERNEL_LIB=vortex2; kernel uses vx_spawn2/kernel_main like vecadd

Made-with: Cursor
1 parent eed3157 commit 79083de

11 files changed

Lines changed: 152 additions & 272 deletions

File tree

hw/rtl/core/VX_csr_unit.sv

Lines changed: 2 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -84,10 +84,6 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
8484

8585
.sched_csr_if (sched_csr_if),
8686

87-
.cta_csr_valid (sched_csr_if.cta_csr_valid),
88-
.cta_csr_wid (sched_csr_if.cta_csr_wid),
89-
.cta_csr_data (sched_csr_if.cta_csr_data),
90-
9187
`ifdef EXT_F_ENABLE
9288
.fpu_csr_if (fpu_csr_if),
9389
`endif
@@ -131,8 +127,8 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
131127
wire [CTA_TID_WIDTH:0] cx = tx / sched_csr_if.cta_csrs.block_dim[0];
132128
wire [CTA_TID_WIDTH:0] ty = (CTA_TID_WIDTH+1)'(sched_csr_if.cta_csrs.thread_idx[1]) + cx;
133129
wire [CTA_TID_WIDTH:0] cy = ty / sched_csr_if.cta_csrs.block_dim[1];
134-
assign cta_tid_x[i] = `XLEN'(tx - cx * sched_csr_if.cta_csrs.block_dim[0]);
135-
assign cta_tid_y[i] = `XLEN'(ty - cy * sched_csr_if.cta_csrs.block_dim[1]);
130+
assign cta_tid_x[i] = `XLEN'(32'(tx) - 32'(cx) * 32'(sched_csr_if.cta_csrs.block_dim[0]));
131+
assign cta_tid_y[i] = `XLEN'(32'(ty) - 32'(cy) * 32'(sched_csr_if.cta_csrs.block_dim[1]));
136132
assign cta_tid_z[i] = `XLEN'(sched_csr_if.cta_csrs.thread_idx[2]) + `XLEN'(cy);
137133
end
138134

kernel/include/vx_intrinsics.h

Lines changed: 0 additions & 126 deletions
Original file line number | Diff line number | Diff line change
@@ -548,132 +548,6 @@ inline float vx_packlh_f(const void* base, uint32_t stride) {
548548

549549
#ifdef __cplusplus
550550
}
551-
552-
// CTA Block Index Proxy Structures
553-
// These allow blockIdx.x, blockIdx.y, blockIdx.z to be used directly
554-
// without function call syntax, reading from RISC-V CSRs automatically
555-
556-
#ifndef VX_CSR_CTA_X
557-
#define VX_CSR_CTA_X 0xCC6
558-
#endif
559-
560-
#ifndef VX_CSR_CTA_Y
561-
#define VX_CSR_CTA_Y 0xCC7
562-
#endif
563-
564-
#ifndef VX_CSR_CTA_Z
565-
#define VX_CSR_CTA_Z 0xCC8
566-
#endif
567-
568-
#ifndef VX_CSR_BLOCK_DIM_X
569-
#define VX_CSR_BLOCK_DIM_X 0xCCA
570551
#endif
571552

572-
#ifndef VX_CSR_BLOCK_DIM_Y
573-
#define VX_CSR_BLOCK_DIM_Y 0xCCB
574-
#endif
575-
576-
#ifndef VX_CSR_BLOCK_DIM_Z
577-
#define VX_CSR_BLOCK_DIM_Z 0xCCC
578-
#endif
579-
580-
#ifndef VX_CSR_CTA_WARP_ID
581-
#define VX_CSR_CTA_WARP_ID 0xCCD
582-
#endif
583-
584-
// Proxy structure for blockIdx with x, y, z members
585-
struct BlockIdx {
586-
struct X {
587-
// Implicit conversion to unsigned int triggers the CSR read
588-
inline operator unsigned int() const {
589-
unsigned int val;
590-
__asm__ __volatile__ ("csrr %0, %1" : "=r"(val) : "i"(VX_CSR_CTA_X));
591-
return val;
592-
}
593-
} x;
594-
595-
struct Y {
596-
inline operator unsigned int() const {
597-
unsigned int val;
598-
__asm__ __volatile__ ("csrr %0, %1" : "=r"(val) : "i"(VX_CSR_CTA_Y));
599-
return val;
600-
}
601-
} y;
602-
603-
struct Z {
604-
inline operator unsigned int() const {
605-
unsigned int val;
606-
__asm__ __volatile__ ("csrr %0, %1" : "=r"(val) : "i"(VX_CSR_CTA_Z));
607-
return val;
608-
}
609-
} z;
610-
};
611-
612-
// Create a global instance of blockIdx
613-
// Marking it static ensures no linker errors if included in multiple files.
614-
// The struct holds no actual data, so the compiler will optimize it away.
615-
static const BlockIdx blockIdx;
616-
617-
// Proxy structure for blockDim with x, y, z members
618-
struct BlockDim {
619-
struct X {
620-
// Implicit conversion to unsigned int triggers the CSR read
621-
inline operator unsigned int() const {
622-
unsigned int val;
623-
__asm__ __volatile__ ("csrr %0, %1" : "=r"(val) : "i"(VX_CSR_BLOCK_DIM_X));
624-
return val;
625-
}
626-
} x;
627-
628-
struct Y {
629-
inline operator unsigned int() const {
630-
unsigned int val;
631-
__asm__ __volatile__ ("csrr %0, %1" : "=r"(val) : "i"(VX_CSR_BLOCK_DIM_Y));
632-
return val;
633-
}
634-
} y;
635-
636-
struct Z {
637-
inline operator unsigned int() const {
638-
unsigned int val;
639-
__asm__ __volatile__ ("csrr %0, %1" : "=r"(val) : "i"(VX_CSR_BLOCK_DIM_Z));
640-
return val;
641-
}
642-
} z;
643-
};
644-
645-
// Create a global instance of blockDim
646-
// Marking it static ensures no linker errors if included in multiple files.
647-
// The struct holds no actual data, so the compiler will optimize it away.
648-
static const BlockDim blockDim;
649-
650-
// Proxy structure for threadIdx with x, y, z members
651-
// threadIdx.x gives the flat thread index within the CTA:
652-
// warp_local_id * NUM_THREADS + thread_id_within_warp
653-
struct ThreadIdx {
654-
struct X {
655-
inline operator unsigned int() const {
656-
unsigned int warp_local_id;
657-
__asm__ __volatile__ ("csrr %0, %1" : "=r"(warp_local_id) : "i"(VX_CSR_CTA_WARP_ID));
658-
return warp_local_id * vx_num_threads() + vx_thread_id();
659-
}
660-
} x;
661-
662-
struct Y {
663-
inline operator unsigned int() const {
664-
return 0;
665-
}
666-
} y;
667-
668-
struct Z {
669-
inline operator unsigned int() const {
670-
return 0;
671-
}
672-
} z;
673-
};
674-
675-
static const ThreadIdx threadIdx;
676-
677-
#endif // __cplusplus
678-
679553
#endif // __VX_INTRINSICS_H__

runtime/common/callbacks.inc

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -180,7 +180,7 @@ extern int vx_dev_init(callbacks_t* callbacks) {
180180
uint32_t dim, const uint32_t* grid_dim, const uint32_t * block_dim, uint32_t lmem_size) {
181181
if (nullptr == hdevice || nullptr == hkernel || nullptr == harguments)
182182
return -1;
183-
DBGPRINT("START_WG: hdevice=%p, hkernel=%p, harguments=%p, dimension=%d", hdevice, hkernel, harguments, dimension);
183+
DBGPRINT("START_WG: hdevice=%p, hkernel=%p, harguments=%p, dimension=%d", hdevice, hkernel, harguments, dim);
184184
for (uint32_t i = 0; i < dim; ++i) {
185185
DBGPRINT(", grid_dim[%d]=%d", i, grid_dim[i]);
186186
}

runtime/include/vortex.h

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -117,6 +117,10 @@ int vx_start(vx_device_h hdevice, vx_buffer_h hkernel, vx_buffer_h harguments);
117117
int vx_start_g(vx_device_h hdevice, vx_buffer_h hkernel, vx_buffer_h harguments,
118118
uint32_t ndim, const uint32_t* grid_dim, const uint32_t* block_dim, uint32_t lmem_size);
119119

120+
// Start device execution with work-group dimensions (KMU path)
121+
int vx_start_wg(vx_device_h hdevice, vx_buffer_h hkernel, vx_buffer_h harguments,
122+
uint32_t dim, const uint32_t* grid_dim, const uint32_t * block_dim, uint32_t lmem_size);
123+
120124
// Return optimal grid/block dimensions for maximum occupancy given global work size
121125
int vx_max_occupancy_grid(vx_device_h hdevice, uint32_t ndim, const uint32_t* global_dim,
122126
uint32_t* grid_dim, uint32_t* block_dim);

runtime/opae/vortex.cpp

Lines changed: 32 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -422,57 +422,47 @@ class vx_device {
422422
return 0;
423423
}
424424

425-
int start_wg(uint64_t krnl_addr, uint64_t args_addr, uint32_t dim, const uint32_t *grid_dim, const uint32_t *block_dim, uint32_t lmem_size) {
426-
// set kernel info
427-
CHECK_ERR(this->dcr_write(VX_DCR_BASE_STARTUP_ADDR0, krnl_addr & 0xffffffff), {
428-
return err;
429-
});
430-
CHECK_ERR(this->dcr_write(VX_DCR_BASE_STARTUP_ADDR1, krnl_addr >> 32), {
431-
return err;
432-
});
433-
CHECK_ERR(this->dcr_write(VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff), {
434-
return err;
435-
});
436-
CHECK_ERR(this->dcr_write(VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32), {
437-
return err;
438-
});
425+
int start_wg(uint64_t krnl_addr, uint64_t args_addr, uint32_t ndim, const uint32_t *grid_dim, const uint32_t *block_dim, uint32_t lmem_size) {
426+
uint32_t eff_block_dim[3], block_size, warp_step_x, warp_step_y, warp_step_z;
427+
prepare_kernel_launch_params(NUM_THREADS, NUM_WARPS, ndim, block_dim,
428+
eff_block_dim, &block_size, &warp_step_x, &warp_step_y, &warp_step_z);
429+
uint32_t _lmem_size = lmem_size;
439430

440-
if (dim > 0) {
441-
CHECK_ERR(this->dcr_write(VX_DCR_BASE_GRID_DIM0, grid_dim[0]), {
442-
return err;
443-
});
444-
CHECK_ERR(this->dcr_write(VX_DCR_BASE_BLOCK_DIM0, block_dim[0]), {
445-
return err;
446-
});
447-
if (dim > 1) {
448-
CHECK_ERR(this->dcr_write(VX_DCR_BASE_GRID_DIM1, grid_dim[1]), {
449-
return err;
450-
});
451-
CHECK_ERR(this->dcr_write(VX_DCR_BASE_BLOCK_DIM1, block_dim[1]), {
452-
return err;
453-
});
454-
if (dim > 2) {
455-
CHECK_ERR(this->dcr_write(VX_DCR_BASE_GRID_DIM2, grid_dim[2]), {
456-
return err;
457-
});
458-
CHECK_ERR(this->dcr_write(VX_DCR_BASE_BLOCK_DIM2, block_dim[2]), {
459-
return err; });
460-
}
431+
{
432+
uint32_t threads_per_core = NUM_WARPS * NUM_THREADS;
433+
if (block_size > threads_per_core) {
434+
std::cerr << "Error: cannot schedule kernel with block_size > threads_per_core ("
435+
<< block_size << "," << threads_per_core << ")\n";
436+
return -1;
437+
}
438+
int warps_per_block = (block_size + NUM_THREADS - 1) / NUM_THREADS;
439+
int blocks_per_core = NUM_WARPS / warps_per_block;
440+
if (_lmem_size == 0) {
441+
uint64_t local_mem_size = (1ull << LMEM_LOG_SIZE);
442+
_lmem_size = static_cast<uint32_t>(local_mem_size / blocks_per_core);
461443
}
462444
}
463445

464-
CHECK_ERR(this->dcr_write(VX_DCR_BASE_LMEM_SIZE, lmem_size), {
465-
return err;
466-
});
446+
CHECK_ERR(this->dcr_write(VX_DCR_KMU_STARTUP_ADDR0, krnl_addr & 0xffffffff), { return err; });
447+
CHECK_ERR(this->dcr_write(VX_DCR_KMU_STARTUP_ADDR1, static_cast<uint32_t>(krnl_addr >> 32)), { return err; });
448+
CHECK_ERR(this->dcr_write(VX_DCR_KMU_STARTUP_ARG0, args_addr & 0xffffffff), { return err; });
449+
CHECK_ERR(this->dcr_write(VX_DCR_KMU_STARTUP_ARG1, static_cast<uint32_t>(args_addr >> 32)), { return err; });
450+
static const uint32_t grid_regs[3] = {VX_DCR_KMU_GRID_DIM_X, VX_DCR_KMU_GRID_DIM_Y, VX_DCR_KMU_GRID_DIM_Z};
451+
static const uint32_t block_regs[3] = {VX_DCR_KMU_BLOCK_DIM_X, VX_DCR_KMU_BLOCK_DIM_Y, VX_DCR_KMU_BLOCK_DIM_Z};
452+
for (uint32_t i = 0; i < 3; ++i) {
453+
CHECK_ERR(this->dcr_write(grid_regs[i], (i < ndim) ? grid_dim[i] : 1), { return err; });
454+
CHECK_ERR(this->dcr_write(block_regs[i], eff_block_dim[i]), { return err; });
455+
}
456+
CHECK_ERR(this->dcr_write(VX_DCR_KMU_LMEM_SIZE, _lmem_size), { return err; });
457+
CHECK_ERR(this->dcr_write(VX_DCR_KMU_BLOCK_SIZE, block_size), { return err; });
458+
CHECK_ERR(this->dcr_write(VX_DCR_KMU_WARP_STEP_X, warp_step_x), { return err; });
459+
CHECK_ERR(this->dcr_write(VX_DCR_KMU_WARP_STEP_Y, warp_step_y), { return err; });
460+
CHECK_ERR(this->dcr_write(VX_DCR_KMU_WARP_STEP_Z, warp_step_z), { return err; });
467461

468-
// start execution
469462
CHECK_FPGA_ERR(api_.fpgaWriteMMIO64(fpga_, 0, MMIO_CMD_TYPE, CMD_RUN), {
470463
return -1;
471464
});
472465

473-
// clear mpm cache
474-
mpm_cache_.clear();
475-
476466
return 0;
477467
}
478468

runtime/rtlsim/vortex.cpp

Lines changed: 34 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -215,40 +215,51 @@ class vx_device {
215215
return 0;
216216
}
217217

218-
int start_wg(uint64_t krnl_addr, uint64_t args_addr, uint32_t dim, const uint32_t* grid_dim, const uint32_t* block_dim, uint32_t lmem_size) {
219-
// ensure prior run completed
218+
int start_wg(uint64_t krnl_addr, uint64_t args_addr, uint32_t ndim, const uint32_t* grid_dim, const uint32_t* block_dim, uint32_t lmem_size) {
220219
if (future_.valid()) {
221220
future_.wait();
222221
}
223222

224-
// set kernel info
225-
this->dcr_write(VX_DCR_BASE_STARTUP_ADDR0, krnl_addr & 0xffffffff);
226-
this->dcr_write(VX_DCR_BASE_STARTUP_ADDR1, krnl_addr >> 32);
227-
this->dcr_write(VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff);
228-
this->dcr_write(VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32);
229-
230-
if (dim > 0) {
231-
this->dcr_write(VX_DCR_BASE_GRID_DIM0, grid_dim[0]);
232-
this->dcr_write(VX_DCR_BASE_BLOCK_DIM0, block_dim[0]);
233-
if (dim > 1) {
234-
this->dcr_write(VX_DCR_BASE_GRID_DIM1, grid_dim[1]);
235-
this->dcr_write(VX_DCR_BASE_BLOCK_DIM1, block_dim[1]);
236-
if (dim > 2) {
237-
this->dcr_write(VX_DCR_BASE_GRID_DIM2, grid_dim[2]);
238-
this->dcr_write(VX_DCR_BASE_BLOCK_DIM2, block_dim[2]);
239-
}
223+
uint32_t eff_block_dim[3], block_size, warp_step_x, warp_step_y, warp_step_z;
224+
prepare_kernel_launch_params(NUM_THREADS, NUM_WARPS, ndim, block_dim,
225+
eff_block_dim, &block_size, &warp_step_x, &warp_step_y, &warp_step_z);
226+
uint32_t _lmem_size = lmem_size;
227+
228+
{
229+
uint32_t threads_per_core = NUM_WARPS * NUM_THREADS;
230+
if (block_size > threads_per_core) {
231+
std::cerr << "Error: cannot schedule kernel with block_size > threads_per_core ("
232+
<< block_size << "," << threads_per_core << ")\n";
233+
return -1;
234+
}
235+
int warps_per_block = (block_size + NUM_THREADS - 1) / NUM_THREADS;
236+
int blocks_per_core = NUM_WARPS / warps_per_block;
237+
if (_lmem_size == 0) {
238+
uint64_t local_mem_size = (1ull << LMEM_LOG_SIZE);
239+
_lmem_size = static_cast<uint32_t>(local_mem_size / blocks_per_core);
240240
}
241241
}
242-
this->dcr_write(VX_DCR_BASE_LMEM_SIZE, lmem_size);
243242

244-
// start new run
243+
this->dcr_write(VX_DCR_KMU_STARTUP_ADDR0, krnl_addr & 0xffffffff);
244+
this->dcr_write(VX_DCR_KMU_STARTUP_ADDR1, static_cast<uint32_t>(krnl_addr >> 32));
245+
this->dcr_write(VX_DCR_KMU_STARTUP_ARG0, args_addr & 0xffffffff);
246+
this->dcr_write(VX_DCR_KMU_STARTUP_ARG1, static_cast<uint32_t>(args_addr >> 32));
247+
static const uint32_t grid_regs[3] = {VX_DCR_KMU_GRID_DIM_X, VX_DCR_KMU_GRID_DIM_Y, VX_DCR_KMU_GRID_DIM_Z};
248+
static const uint32_t block_regs[3] = {VX_DCR_KMU_BLOCK_DIM_X, VX_DCR_KMU_BLOCK_DIM_Y, VX_DCR_KMU_BLOCK_DIM_Z};
249+
for (uint32_t i = 0; i < 3; ++i) {
250+
this->dcr_write(grid_regs[i], (i < ndim) ? grid_dim[i] : 1);
251+
this->dcr_write(block_regs[i], eff_block_dim[i]);
252+
}
253+
this->dcr_write(VX_DCR_KMU_LMEM_SIZE, _lmem_size);
254+
this->dcr_write(VX_DCR_KMU_BLOCK_SIZE, block_size);
255+
this->dcr_write(VX_DCR_KMU_WARP_STEP_X, warp_step_x);
256+
this->dcr_write(VX_DCR_KMU_WARP_STEP_Y, warp_step_y);
257+
this->dcr_write(VX_DCR_KMU_WARP_STEP_Z, warp_step_z);
258+
245259
future_ = std::async(std::launch::async, [&]{
246260
processor_.run();
247261
});
248262

249-
// clear mpm cache
250-
mpm_cache_.clear();
251-
252263
return 0;
253264
}
254265

0 commit comments

Comments (0)