// =============================================================================
// Compile-time profiling configuration
// =============================================================================

// Master profiling switch (default: enabled).
#ifndef PTO2_PROFILING
# define PTO2_PROFILING 1
#endif

// Orchestrator-side profiling (default: disabled).
#ifndef PTO2_ORCH_PROFILING
# define PTO2_ORCH_PROFILING 0
#endif

// Scheduler-side profiling (default: disabled).
#ifndef PTO2_SCHED_PROFILING
# define PTO2_SCHED_PROFILING 0
#endif

// The per-component switches refine the master switch; enabling either one
// while PTO2_PROFILING is off is a configuration error, caught at compile time.
#if PTO2_ORCH_PROFILING && !PTO2_PROFILING
# error "PTO2_ORCH_PROFILING requires PTO2_PROFILING=1"
#endif

#if PTO2_SCHED_PROFILING && !PTO2_PROFILING
# error "PTO2_SCHED_PROFILING requires PTO2_PROFILING=1"
#endif

// =============================================================================
/**
 * Task identifier packed into a single 64-bit word:
 *   bits [32..39] = ring id, bits [0..31] = ring-local id.
 * Kept at exactly 8 bytes — this layout is part of the shared-memory ABI.
 */
struct PTO2TaskId {
    uint64_t raw;

    constexpr PTO2TaskId() :
        raw(0) {}
    constexpr explicit PTO2TaskId(uint64_t v) :
        raw(v) {}

    /// Ring id stored in the upper half of `raw`.
    constexpr uint8_t ring() const { return static_cast<uint8_t>(raw >> 32); }
    /// Ring-local id stored in the lower 32 bits of `raw`.
    constexpr uint32_t local() const { return static_cast<uint32_t>(raw & 0xFFFFFFFFu); }

    constexpr bool operator==(const PTO2TaskId & other) const { return raw == other.raw; }
    constexpr bool operator!=(const PTO2TaskId & other) const { return raw != other.raw; }
};

static_assert(sizeof(PTO2TaskId) == 8, "PTO2TaskId must stay 8 bytes (shared memory ABI)");

/// Builds a PTO2TaskId from its two components (ring id in bits 32+, local id
/// in the low 32 bits — the inverse of ring()/local()).
static inline PTO2TaskId
pto2_make_task_id(uint8_t ring_id, uint32_t local_id) { // NOLINT(bugprone-easily-swappable-parameters)
    return PTO2TaskId{(static_cast<uint64_t>(ring_id) << 32) | static_cast<uint64_t>(local_id)};
}
139142
@@ -203,8 +206,8 @@ typedef enum {
203206 */
struct PTO2TaskSlotState; // Forward declaration

/**
 * Singly-linked fanout list node: one entry per consumer of a producer task.
 */
struct PTO2DepListEntry {
    PTO2TaskSlotState * slot_state; // Consumer slot state (direct pointer)
    PTO2DepListEntry * next;        // next entry (nullptr = end of list)
};
209212
210213// =============================================================================
@@ -228,8 +231,8 @@ struct PTO2TaskDescriptor {
228231 int32_t kernel_id[PTO2_SUBTASK_SLOT_COUNT];
229232
230233 // Packed output buffer (all outputs packed into single contiguous buffer)
231- void * packed_buffer_base; // Start of packed buffer in GM Heap
232- void * packed_buffer_end; // End of packed buffer (for heap reclamation)
234+ void * packed_buffer_base; // Start of packed buffer in GM Heap
235+ void * packed_buffer_end; // End of packed buffer (for heap reclamation)
233236};
234237
235238// =============================================================================
@@ -250,18 +253,18 @@ struct PTO2TaskPayload {
250253 int32_t scalar_count{0 };
251254 int32_t fanin_actual_count{0 }; // Actual fanin count (without the +1 redundance)
252255 int32_t _reserved{0 }; // Reserved (dep_pool_mark moved to SlotState for local access)
253- PTO2TaskSlotState* fanin_slot_states[PTO2_MAX_INPUTS]; // Producer slot states (used by on_task_release)
256+ PTO2TaskSlotState * fanin_slot_states[PTO2_MAX_INPUTS]; // Producer slot states (used by on_task_release)
254257 // === Cache lines 3-34 (2048B) — tensors (alignas(64) forces alignment) ===
255258 Tensor tensors[MAX_TENSOR_ARGS];
256259 // === Cache lines 35-50 (1024B) — scalars ===
257260 uint64_t scalars[MAX_SCALAR_ARGS];
258261
259- void init (const Arg& args, const TaskOutputTensors& materialized_outputs) {
262+ void init (const Arg & args, const TaskOutputTensors & materialized_outputs) {
260263 tensor_count = args.tensor_count ();
261264 scalar_count = args.scalar_count ();
262265 int32_t out_idx = 0 ;
263266 for (int32_t i = 0 ; i < args.tensor_count (); i++) {
264- const Tensor* src;
267+ const Tensor * src;
265268 if (args.tag (i) == TensorArgType::OUTPUT) {
266269 src = materialized_outputs.output_ptr (out_idx++);
267270 } else {
@@ -292,7 +295,7 @@ struct alignas(64) PTO2TaskSlotState {
292295 std::atomic<int32_t > fanout_lock; // Per-task spinlock (0=unlocked, 1=locked)
293296 int32_t fanout_count; // 1 (owning scope) + number of consumers
294297
295- PTO2DepListEntry* fanout_head; // Pointer to first fanout entry (nullptr = empty)
298+ PTO2DepListEntry * fanout_head; // Pointer to first fanout entry (nullptr = empty)
296299
297300 // Task state (completion, consumed check, ready check)
298301 std::atomic<PTO2TaskState> task_state; // PENDING/READY/RUNNING/COMPLETED/CONSUMED
@@ -304,9 +307,9 @@ struct alignas(64) PTO2TaskSlotState {
304307 // Fanout refcount (accessed with fanout_count in check_and_handle_consumed)
305308 std::atomic<int32_t > fanout_refcount; // Dynamic: counts released references
306309
307- PTO2TaskPayload* payload;
310+ PTO2TaskPayload * payload;
308311
309- PTO2TaskDescriptor* task;
312+ PTO2TaskDescriptor * task;
310313
311314 // Hot-path completion fields (moved from TaskDescriptor to avoid cross-struct access)
312315 uint8_t active_mask; // Bitmask of active subtask slots (set once)
@@ -325,7 +328,7 @@ static_assert(sizeof(PTO2TaskSlotState) == 64);
325328 * Cycle cost function pointer type
326329 * Returns estimated cycle count for the InCore function
327330 */
/// Cycle cost function pointer: returns the estimated cycle count for an
/// InCore function given its argument array.
typedef int64_t (*PTO2CycleCostFunc)(void ** args, int32_t num_args);
329332
330333// =============================================================================
331334// InCore Function Type
@@ -335,7 +338,7 @@ typedef int64_t (*PTO2CycleCostFunc)(void** args, int32_t num_args);
335338 * InCore function signature
336339 * All InCore functions must match this signature
337340 */
/// InCore function signature: all InCore kernels must match this shape
/// (opaque argument array + argument count).
typedef void (*PTO2InCoreFunc)(void ** args, int32_t num_args);
339342
340343// =============================================================================
341344// Utility Macros
@@ -345,11 +348,11 @@ typedef void (*PTO2InCoreFunc)(void** args, int32_t num_args);
345348 * Memory barrier macros for different architectures
346349 */
// Full memory barrier, selected per target architecture:
//   aarch64 -> dmb sy, x86_64 -> mfence, otherwise the GCC builtin.
#if defined(__aarch64__)
# define PTO2_MEMORY_BARRIER() __asm__ __volatile__("dmb sy" ::: "memory")
#elif defined(__x86_64__)
# define PTO2_MEMORY_BARRIER() __asm__ __volatile__("mfence" ::: "memory")
#else
# define PTO2_MEMORY_BARRIER() __sync_synchronize()
#endif
354357
355358// Spin-wait hint for AICPU threads. On real hardware the AICPU has dedicated
@@ -358,9 +361,9 @@ typedef void (*PTO2InCoreFunc)(void** args, int32_t num_args);
358361// This header is also compiled into the Host .so (for struct definitions only),
359362// where the hint is never called — the fallback no-op keeps Host builds clean.
// Pull in the real spin-wait hint when the device header is available;
// otherwise (e.g. Host builds that only need the struct definitions)
// fall back to a no-op so the macro is always defined.
#if __has_include("spin_hint.h")
# include "spin_hint.h" // NOLINT(build/include_subdir)
#else
# define SPIN_WAIT_HINT() ((void)0)
#endif
365368
366369// =============================================================================
@@ -376,11 +379,11 @@ typedef void (*PTO2InCoreFunc)(void** args, int32_t num_args);
376379// =============================================================================
377380
// Device timestamp helpers are only needed when profiling is compiled in.
#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
# include "aicpu/device_time.h"
#endif
381384
382385#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
383- static inline void pto2_fanout_lock (PTO2TaskSlotState& slot_state, uint64_t & atomic_count, uint64_t & wait_cycle) {
386+ static inline void pto2_fanout_lock (PTO2TaskSlotState & slot_state, uint64_t & atomic_count, uint64_t & wait_cycle) {
384387 uint64_t t0 = get_sys_cnt_aicpu ();
385388 bool contended = false ;
386389 uint32_t atomic_ops = 0 ;
@@ -393,7 +396,8 @@ static inline void pto2_fanout_lock(PTO2TaskSlotState& slot_state, uint64_t& ato
393396 }
394397 int32_t expected = 0 ;
395398 if (slot_state.fanout_lock .compare_exchange_weak (
396- expected, 1 , std::memory_order_acquire, std::memory_order_relaxed)) {
399+ expected, 1 , std::memory_order_acquire, std::memory_order_relaxed
400+ )) {
397401 atomic_ops++; // successful CAS = 1 atomic
398402 atomic_count += atomic_ops;
399403 if (contended) {
@@ -407,20 +411,21 @@ static inline void pto2_fanout_lock(PTO2TaskSlotState& slot_state, uint64_t& ato
407411}
408412#endif
409413
410- static inline void pto2_fanout_lock (PTO2TaskSlotState& slot_state) {
414+ static inline void pto2_fanout_lock (PTO2TaskSlotState & slot_state) {
411415 for (;;) {
412416 while (slot_state.fanout_lock .load (std::memory_order_acquire) != 0 ) {
413417 SPIN_WAIT_HINT ();
414418 }
415419 int32_t expected = 0 ;
416420 if (slot_state.fanout_lock .compare_exchange_weak (
417- expected, 1 , std::memory_order_acquire, std::memory_order_relaxed)) {
421+ expected, 1 , std::memory_order_acquire, std::memory_order_relaxed
422+ )) {
418423 return ;
419424 }
420425 }
421426}
422427
423- static inline void pto2_fanout_unlock (PTO2TaskSlotState& slot_state) {
428+ static inline void pto2_fanout_unlock (PTO2TaskSlotState & slot_state) {
424429 slot_state.fanout_lock .store (0 , std::memory_order_release);
425430}
426431