Skip to content

Commit 79083de

Browse files
author
Randy L
committed
Fix merge fallout: KMU DCR paths, CSR unit, runtime start_wg
- Remove bad merge residue in VX_csr_unit (sched_csr_if) and widen CTA TID math for Verilator
- Restore vx_intrinsics.h from the bug_fixes line to avoid spawn/intrinsics redeclaration
- Declare vx_start_wg in vortex.h; fix DBGPRINT dim argument in callbacks.inc
- stub: profiling via getenv + VX_DCR_BASE_MPM_VALUE
- rtlsim/simx/opae/xrt: program VX_DCR_KMU_* DCRs in start_wg, mirroring vx_start_g
- vecadd_v2: KERNEL_LIB=vortex2; kernel uses vx_spawn2/kernel_main like vecadd

Made-with: Cursor
1 parent eed3157 commit 79083de

11 files changed

Lines changed: 152 additions & 272 deletions

File tree

hw/rtl/core/VX_csr_unit.sv

Lines changed: 2 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -84,10 +84,6 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
8484

8585
.sched_csr_if (sched_csr_if),
8686

87-
.cta_csr_valid (sched_csr_if.cta_csr_valid),
88-
.cta_csr_wid (sched_csr_if.cta_csr_wid),
89-
.cta_csr_data (sched_csr_if.cta_csr_data),
90-
9187
`ifdef EXT_F_ENABLE
9288
.fpu_csr_if (fpu_csr_if),
9389
`endif
@@ -131,8 +127,8 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
131127
wire [CTA_TID_WIDTH:0] cx = tx / sched_csr_if.cta_csrs.block_dim[0];
132128
wire [CTA_TID_WIDTH:0] ty = (CTA_TID_WIDTH+1)'(sched_csr_if.cta_csrs.thread_idx[1]) + cx;
133129
wire [CTA_TID_WIDTH:0] cy = ty / sched_csr_if.cta_csrs.block_dim[1];
134-
assign cta_tid_x[i] = `XLEN'(tx - cx * sched_csr_if.cta_csrs.block_dim[0]);
135-
assign cta_tid_y[i] = `XLEN'(ty - cy * sched_csr_if.cta_csrs.block_dim[1]);
130+
assign cta_tid_x[i] = `XLEN'(32'(tx) - 32'(cx) * 32'(sched_csr_if.cta_csrs.block_dim[0]));
131+
assign cta_tid_y[i] = `XLEN'(32'(ty) - 32'(cy) * 32'(sched_csr_if.cta_csrs.block_dim[1]));
136132
assign cta_tid_z[i] = `XLEN'(sched_csr_if.cta_csrs.thread_idx[2]) + `XLEN'(cy);
137133
end
138134

kernel/include/vx_intrinsics.h

Lines changed: 0 additions & 126 deletions
Original file line number | Diff line number | Diff line change
@@ -548,132 +548,6 @@ inline float vx_packlh_f(const void* base, uint32_t stride) {
548548

549549
#ifdef __cplusplus
550550
}
551-
552-
// CTA Block Index Proxy Structures
553-
// These allow blockIdx.x, blockIdx.y, blockIdx.z to be used directly
554-
// without function call syntax, reading from RISC-V CSRs automatically
555-
556-
#ifndef VX_CSR_CTA_X
557-
#define VX_CSR_CTA_X 0xCC6
558-
#endif
559-
560-
#ifndef VX_CSR_CTA_Y
561-
#define VX_CSR_CTA_Y 0xCC7
562-
#endif
563-
564-
#ifndef VX_CSR_CTA_Z
565-
#define VX_CSR_CTA_Z 0xCC8
566-
#endif
567-
568-
#ifndef VX_CSR_BLOCK_DIM_X
569-
#define VX_CSR_BLOCK_DIM_X 0xCCA
570551
#endif
571552

572-
#ifndef VX_CSR_BLOCK_DIM_Y
573-
#define VX_CSR_BLOCK_DIM_Y 0xCCB
574-
#endif
575-
576-
#ifndef VX_CSR_BLOCK_DIM_Z
577-
#define VX_CSR_BLOCK_DIM_Z 0xCCC
578-
#endif
579-
580-
#ifndef VX_CSR_CTA_WARP_ID
581-
#define VX_CSR_CTA_WARP_ID 0xCCD
582-
#endif
583-
584-
// Proxy structure for blockIdx with x, y, z members
585-
struct BlockIdx {
586-
struct X {
587-
// Implicit conversion to unsigned int triggers the CSR read
588-
inline operator unsigned int() const {
589-
unsigned int val;
590-
__asm__ __volatile__ ("csrr %0, %1" : "=r"(val) : "i"(VX_CSR_CTA_X));
591-
return val;
592-
}
593-
} x;
594-
595-
struct Y {
596-
inline operator unsigned int() const {
597-
unsigned int val;
598-
__asm__ __volatile__ ("csrr %0, %1" : "=r"(val) : "i"(VX_CSR_CTA_Y));
599-
return val;
600-
}
601-
} y;
602-
603-
struct Z {
604-
inline operator unsigned int() const {
605-
unsigned int val;
606-
__asm__ __volatile__ ("csrr %0, %1" : "=r"(val) : "i"(VX_CSR_CTA_Z));
607-
return val;
608-
}
609-
} z;
610-
};
611-
612-
// Create a global instance of blockIdx
613-
// Marking it static ensures no linker errors if included in multiple files.
614-
// The struct holds no actual data, so the compiler will optimize it away.
615-
static const BlockIdx blockIdx;
616-
617-
// Proxy structure for blockDim with x, y, z members
618-
struct BlockDim {
619-
struct X {
620-
// Implicit conversion to unsigned int triggers the CSR read
621-
inline operator unsigned int() const {
622-
unsigned int val;
623-
__asm__ __volatile__ ("csrr %0, %1" : "=r"(val) : "i"(VX_CSR_BLOCK_DIM_X));
624-
return val;
625-
}
626-
} x;
627-
628-
struct Y {
629-
inline operator unsigned int() const {
630-
unsigned int val;
631-
__asm__ __volatile__ ("csrr %0, %1" : "=r"(val) : "i"(VX_CSR_BLOCK_DIM_Y));
632-
return val;
633-
}
634-
} y;
635-
636-
struct Z {
637-
inline operator unsigned int() const {
638-
unsigned int val;
639-
__asm__ __volatile__ ("csrr %0, %1" : "=r"(val) : "i"(VX_CSR_BLOCK_DIM_Z));
640-
return val;
641-
}
642-
} z;
643-
};
644-
645-
// Create a global instance of blockDim
646-
// Marking it static ensures no linker errors if included in multiple files.
647-
// The struct holds no actual data, so the compiler will optimize it away.
648-
static const BlockDim blockDim;
649-
650-
// Proxy structure for threadIdx with x, y, z members
651-
// threadIdx.x gives the flat thread index within the CTA:
652-
// warp_local_id * NUM_THREADS + thread_id_within_warp
653-
struct ThreadIdx {
654-
struct X {
655-
inline operator unsigned int() const {
656-
unsigned int warp_local_id;
657-
__asm__ __volatile__ ("csrr %0, %1" : "=r"(warp_local_id) : "i"(VX_CSR_CTA_WARP_ID));
658-
return warp_local_id * vx_num_threads() + vx_thread_id();
659-
}
660-
} x;
661-
662-
struct Y {
663-
inline operator unsigned int() const {
664-
return 0;
665-
}
666-
} y;
667-
668-
struct Z {
669-
inline operator unsigned int() const {
670-
return 0;
671-
}
672-
} z;
673-
};
674-
675-
static const ThreadIdx threadIdx;
676-
677-
#endif // __cplusplus
678-
679553
#endif // __VX_INTRINSICS_H__

runtime/common/callbacks.inc

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -180,7 +180,7 @@ extern int vx_dev_init(callbacks_t* callbacks) {
180180
uint32_t dim, const uint32_t* grid_dim, const uint32_t * block_dim, uint32_t lmem_size) {
181181
if (nullptr == hdevice || nullptr == hkernel || nullptr == harguments)
182182
return -1;
183-
DBGPRINT("START_WG: hdevice=%p, hkernel=%p, harguments=%p, dimension=%d", hdevice, hkernel, harguments, dimension);
183+
DBGPRINT("START_WG: hdevice=%p, hkernel=%p, harguments=%p, dimension=%d", hdevice, hkernel, harguments, dim);
184184
for (uint32_t i = 0; i < dim; ++i) {
185185
DBGPRINT(", grid_dim[%d]=%d", i, grid_dim[i]);
186186
}

runtime/include/vortex.h

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -117,6 +117,10 @@ int vx_start(vx_device_h hdevice, vx_buffer_h hkernel, vx_buffer_h harguments);
117117
int vx_start_g(vx_device_h hdevice, vx_buffer_h hkernel, vx_buffer_h harguments,
118118
uint32_t ndim, const uint32_t* grid_dim, const uint32_t* block_dim, uint32_t lmem_size);
119119

120+
// Start device execution with work-group dimensions (KMU path)
121+
int vx_start_wg(vx_device_h hdevice, vx_buffer_h hkernel, vx_buffer_h harguments,
122+
uint32_t dim, const uint32_t* grid_dim, const uint32_t * block_dim, uint32_t lmem_size);
123+
120124
// Return optimal grid/block dimensions for maximum occupancy given global work size
121125
int vx_max_occupancy_grid(vx_device_h hdevice, uint32_t ndim, const uint32_t* global_dim,
122126
uint32_t* grid_dim, uint32_t* block_dim);

runtime/opae/vortex.cpp

Lines changed: 32 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -422,57 +422,47 @@ class vx_device {
422422
return 0;
423423
}
424424

425-
int start_wg(uint64_t krnl_addr, uint64_t args_addr, uint32_t dim, const uint32_t *grid_dim, const uint32_t *block_dim, uint32_t lmem_size) {
426-
// set kernel info
427-
CHECK_ERR(this->dcr_write(VX_DCR_BASE_STARTUP_ADDR0, krnl_addr & 0xffffffff), {
428-
return err;
429-
});
430-
CHECK_ERR(this->dcr_write(VX_DCR_BASE_STARTUP_ADDR1, krnl_addr >> 32), {
431-
return err;
432-
});
433-
CHECK_ERR(this->dcr_write(VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff), {
434-
return err;
435-
});
436-
CHECK_ERR(this->dcr_write(VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32), {
437-
return err;
438-
});
425+
int start_wg(uint64_t krnl_addr, uint64_t args_addr, uint32_t ndim, const uint32_t *grid_dim, const uint32_t *block_dim, uint32_t lmem_size) {
426+
uint32_t eff_block_dim[3], block_size, warp_step_x, warp_step_y, warp_step_z;
427+
prepare_kernel_launch_params(NUM_THREADS, NUM_WARPS, ndim, block_dim,
428+
eff_block_dim, &block_size, &warp_step_x, &warp_step_y, &warp_step_z);
429+
uint32_t _lmem_size = lmem_size;
439430

440-
if (dim > 0) {
441-
CHECK_ERR(this->dcr_write(VX_DCR_BASE_GRID_DIM0, grid_dim[0]), {
442-
return err;
443-
});
444-
CHECK_ERR(this->dcr_write(VX_DCR_BASE_BLOCK_DIM0, block_dim[0]), {
445-
return err;
446-
});
447-
if (dim > 1) {
448-
CHECK_ERR(this->dcr_write(VX_DCR_BASE_GRID_DIM1, grid_dim[1]), {
449-
return err;
450-
});
451-
CHECK_ERR(this->dcr_write(VX_DCR_BASE_BLOCK_DIM1, block_dim[1]), {
452-
return err;
453-
});
454-
if (dim > 2) {
455-
CHECK_ERR(this->dcr_write(VX_DCR_BASE_GRID_DIM2, grid_dim[2]), {
456-
return err;
457-
});
458-
CHECK_ERR(this->dcr_write(VX_DCR_BASE_BLOCK_DIM2, block_dim[2]), {
459-
return err; });
460-
}
431+
{
432+
uint32_t threads_per_core = NUM_WARPS * NUM_THREADS;
433+
if (block_size > threads_per_core) {
434+
std::cerr << "Error: cannot schedule kernel with block_size > threads_per_core ("
435+
<< block_size << "," << threads_per_core << ")\n";
436+
return -1;
437+
}
438+
int warps_per_block = (block_size + NUM_THREADS - 1) / NUM_THREADS;
439+
int blocks_per_core = NUM_WARPS / warps_per_block;
440+
if (_lmem_size == 0) {
441+
uint64_t local_mem_size = (1ull << LMEM_LOG_SIZE);
442+
_lmem_size = static_cast<uint32_t>(local_mem_size / blocks_per_core);
461443
}
462444
}
463445

464-
CHECK_ERR(this->dcr_write(VX_DCR_BASE_LMEM_SIZE, lmem_size), {
465-
return err;
466-
});
446+
CHECK_ERR(this->dcr_write(VX_DCR_KMU_STARTUP_ADDR0, krnl_addr & 0xffffffff), { return err; });
447+
CHECK_ERR(this->dcr_write(VX_DCR_KMU_STARTUP_ADDR1, static_cast<uint32_t>(krnl_addr >> 32)), { return err; });
448+
CHECK_ERR(this->dcr_write(VX_DCR_KMU_STARTUP_ARG0, args_addr & 0xffffffff), { return err; });
449+
CHECK_ERR(this->dcr_write(VX_DCR_KMU_STARTUP_ARG1, static_cast<uint32_t>(args_addr >> 32)), { return err; });
450+
static const uint32_t grid_regs[3] = {VX_DCR_KMU_GRID_DIM_X, VX_DCR_KMU_GRID_DIM_Y, VX_DCR_KMU_GRID_DIM_Z};
451+
static const uint32_t block_regs[3] = {VX_DCR_KMU_BLOCK_DIM_X, VX_DCR_KMU_BLOCK_DIM_Y, VX_DCR_KMU_BLOCK_DIM_Z};
452+
for (uint32_t i = 0; i < 3; ++i) {
453+
CHECK_ERR(this->dcr_write(grid_regs[i], (i < ndim) ? grid_dim[i] : 1), { return err; });
454+
CHECK_ERR(this->dcr_write(block_regs[i], eff_block_dim[i]), { return err; });
455+
}
456+
CHECK_ERR(this->dcr_write(VX_DCR_KMU_LMEM_SIZE, _lmem_size), { return err; });
457+
CHECK_ERR(this->dcr_write(VX_DCR_KMU_BLOCK_SIZE, block_size), { return err; });
458+
CHECK_ERR(this->dcr_write(VX_DCR_KMU_WARP_STEP_X, warp_step_x), { return err; });
459+
CHECK_ERR(this->dcr_write(VX_DCR_KMU_WARP_STEP_Y, warp_step_y), { return err; });
460+
CHECK_ERR(this->dcr_write(VX_DCR_KMU_WARP_STEP_Z, warp_step_z), { return err; });
467461

468-
// start execution
469462
CHECK_FPGA_ERR(api_.fpgaWriteMMIO64(fpga_, 0, MMIO_CMD_TYPE, CMD_RUN), {
470463
return -1;
471464
});
472465

473-
// clear mpm cache
474-
mpm_cache_.clear();
475-
476466
return 0;
477467
}
478468

runtime/rtlsim/vortex.cpp

Lines changed: 34 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -215,40 +215,51 @@ class vx_device {
215215
return 0;
216216
}
217217

218-
int start_wg(uint64_t krnl_addr, uint64_t args_addr, uint32_t dim, const uint32_t* grid_dim, const uint32_t* block_dim, uint32_t lmem_size) {
219-
// ensure prior run completed
218+
int start_wg(uint64_t krnl_addr, uint64_t args_addr, uint32_t ndim, const uint32_t* grid_dim, const uint32_t* block_dim, uint32_t lmem_size) {
220219
if (future_.valid()) {
221220
future_.wait();
222221
}
223222

224-
// set kernel info
225-
this->dcr_write(VX_DCR_BASE_STARTUP_ADDR0, krnl_addr & 0xffffffff);
226-
this->dcr_write(VX_DCR_BASE_STARTUP_ADDR1, krnl_addr >> 32);
227-
this->dcr_write(VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff);
228-
this->dcr_write(VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32);
229-
230-
if (dim > 0) {
231-
this->dcr_write(VX_DCR_BASE_GRID_DIM0, grid_dim[0]);
232-
this->dcr_write(VX_DCR_BASE_BLOCK_DIM0, block_dim[0]);
233-
if (dim > 1) {
234-
this->dcr_write(VX_DCR_BASE_GRID_DIM1, grid_dim[1]);
235-
this->dcr_write(VX_DCR_BASE_BLOCK_DIM1, block_dim[1]);
236-
if (dim > 2) {
237-
this->dcr_write(VX_DCR_BASE_GRID_DIM2, grid_dim[2]);
238-
this->dcr_write(VX_DCR_BASE_BLOCK_DIM2, block_dim[2]);
239-
}
223+
uint32_t eff_block_dim[3], block_size, warp_step_x, warp_step_y, warp_step_z;
224+
prepare_kernel_launch_params(NUM_THREADS, NUM_WARPS, ndim, block_dim,
225+
eff_block_dim, &block_size, &warp_step_x, &warp_step_y, &warp_step_z);
226+
uint32_t _lmem_size = lmem_size;
227+
228+
{
229+
uint32_t threads_per_core = NUM_WARPS * NUM_THREADS;
230+
if (block_size > threads_per_core) {
231+
std::cerr << "Error: cannot schedule kernel with block_size > threads_per_core ("
232+
<< block_size << "," << threads_per_core << ")\n";
233+
return -1;
234+
}
235+
int warps_per_block = (block_size + NUM_THREADS - 1) / NUM_THREADS;
236+
int blocks_per_core = NUM_WARPS / warps_per_block;
237+
if (_lmem_size == 0) {
238+
uint64_t local_mem_size = (1ull << LMEM_LOG_SIZE);
239+
_lmem_size = static_cast<uint32_t>(local_mem_size / blocks_per_core);
240240
}
241241
}
242-
this->dcr_write(VX_DCR_BASE_LMEM_SIZE, lmem_size);
243242

244-
// start new run
243+
this->dcr_write(VX_DCR_KMU_STARTUP_ADDR0, krnl_addr & 0xffffffff);
244+
this->dcr_write(VX_DCR_KMU_STARTUP_ADDR1, static_cast<uint32_t>(krnl_addr >> 32));
245+
this->dcr_write(VX_DCR_KMU_STARTUP_ARG0, args_addr & 0xffffffff);
246+
this->dcr_write(VX_DCR_KMU_STARTUP_ARG1, static_cast<uint32_t>(args_addr >> 32));
247+
static const uint32_t grid_regs[3] = {VX_DCR_KMU_GRID_DIM_X, VX_DCR_KMU_GRID_DIM_Y, VX_DCR_KMU_GRID_DIM_Z};
248+
static const uint32_t block_regs[3] = {VX_DCR_KMU_BLOCK_DIM_X, VX_DCR_KMU_BLOCK_DIM_Y, VX_DCR_KMU_BLOCK_DIM_Z};
249+
for (uint32_t i = 0; i < 3; ++i) {
250+
this->dcr_write(grid_regs[i], (i < ndim) ? grid_dim[i] : 1);
251+
this->dcr_write(block_regs[i], eff_block_dim[i]);
252+
}
253+
this->dcr_write(VX_DCR_KMU_LMEM_SIZE, _lmem_size);
254+
this->dcr_write(VX_DCR_KMU_BLOCK_SIZE, block_size);
255+
this->dcr_write(VX_DCR_KMU_WARP_STEP_X, warp_step_x);
256+
this->dcr_write(VX_DCR_KMU_WARP_STEP_Y, warp_step_y);
257+
this->dcr_write(VX_DCR_KMU_WARP_STEP_Z, warp_step_z);
258+
245259
future_ = std::async(std::launch::async, [&]{
246260
processor_.run();
247261
});
248262

249-
// clear mpm cache
250-
mpm_cache_.clear();
251-
252263
return 0;
253264
}
254265

0 commit comments

Comments (0)