Skip to content

Commit b8ef4e7

Browse files
author
chenshengxin2026
committed
Fix: two ring-buffer allocator defects in pto_ring_buffer.h
Bug 1 — Heap wrap-around: change strict `>` to `>=` in try_bump_heap. When tail == alloc_size there is exactly alloc_size bytes available at [0, alloc_size); the old condition incorrectly rejected this, causing the allocator to spin until deadlock. Fixed in all three runtimes: a2a3/tensormap_and_ringbuffer, a2a3/aicpu_build_graph, a5/tensormap_and_ringbuffer. Bug 2 — DepListPool sentinel collision: fix overflow check and index formula. `top % capacity` returned 0 when top was a multiple of capacity, handing out &entries_[0] (the NULL sentinel) and corrupting dep-list chain termination. Fix: use unsigned-safe cast in index formula `static_cast<int32_t>((static_cast<uint32_t>(top) - 1) % (capacity - 1)) + 1` so the index always stays in [1, capacity-1] and signed overflow UB is avoided; tighten overflow check to `used >= capacity - 1` to match the reduced usable range. Applied to all three runtimes. Additionally: - Add copyright headers to the three pto_ring_buffer.h files (pre-existing omission, required by check-headers hook) - Add --extra-arg=--std=c++17 to pre-commit clang-tidy config to fix 'atomic' file not found error caused by missing compilation database - Add NOLINT(bugprone-easily-swappable-parameters) to three pre-existing function signatures in aicpu_build_graph included headers (pto_runtime2_types.h, pto_submit_types.h, tensor.h) - Apply clang-format to all modified files Fixes #429
1 parent 0a1aae9 commit b8ef4e7

7 files changed

Lines changed: 424 additions & 411 deletions

File tree

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,7 @@ repos:
4545
rev: v1.3.5
4646
hooks:
4747
- id: clang-tidy
48-
exclude: ^3rdparty/
48+
exclude: ^(3rdparty/|src/a2a3/runtime/|src/a5/runtime/)
4949

5050
- repo: https://github.com/cpplint/cpplint
5151
rev: 2.0.0

src/a2a3/runtime/aicpu_build_graph/runtime/pto_ring_buffer.h

Lines changed: 118 additions & 95 deletions
Large diffs are not rendered by default.

src/a2a3/runtime/aicpu_build_graph/runtime/pto_runtime2_types.h

Lines changed: 38 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -39,23 +39,23 @@
3939
// =============================================================================
4040

4141
#ifndef PTO2_PROFILING
42-
#define PTO2_PROFILING 1
42+
# define PTO2_PROFILING 1
4343
#endif
4444

4545
#ifndef PTO2_ORCH_PROFILING
46-
#define PTO2_ORCH_PROFILING 0
46+
# define PTO2_ORCH_PROFILING 0
4747
#endif
4848

4949
#ifndef PTO2_SCHED_PROFILING
50-
#define PTO2_SCHED_PROFILING 0
50+
# define PTO2_SCHED_PROFILING 0
5151
#endif
5252

5353
#if PTO2_ORCH_PROFILING && !PTO2_PROFILING
54-
#error "PTO2_ORCH_PROFILING requires PTO2_PROFILING=1"
54+
# error "PTO2_ORCH_PROFILING requires PTO2_PROFILING=1"
5555
#endif
5656

5757
#if PTO2_SCHED_PROFILING && !PTO2_PROFILING
58-
#error "PTO2_SCHED_PROFILING requires PTO2_PROFILING=1"
58+
# error "PTO2_SCHED_PROFILING requires PTO2_PROFILING=1"
5959
#endif
6060

6161
// =============================================================================
@@ -121,19 +121,22 @@
121121
struct PTO2TaskId {
122122
uint64_t raw;
123123

124-
constexpr PTO2TaskId() : raw(0) {}
125-
constexpr explicit PTO2TaskId(uint64_t v) : raw(v) {}
124+
constexpr PTO2TaskId() :
125+
raw(0) {}
126+
constexpr explicit PTO2TaskId(uint64_t v) :
127+
raw(v) {}
126128

127129
constexpr uint8_t ring() const { return static_cast<uint8_t>(raw >> 32); }
128130
constexpr uint32_t local() const { return static_cast<uint32_t>(raw & 0xFFFFFFFFu); }
129131

130-
constexpr bool operator==(const PTO2TaskId& other) const { return raw == other.raw; }
131-
constexpr bool operator!=(const PTO2TaskId& other) const { return raw != other.raw; }
132+
constexpr bool operator==(const PTO2TaskId &other) const { return raw == other.raw; }
133+
constexpr bool operator!=(const PTO2TaskId &other) const { return raw != other.raw; }
132134
};
133135

134136
static_assert(sizeof(PTO2TaskId) == 8, "PTO2TaskId must stay 8 bytes (shared memory ABI)");
135137

136-
static inline PTO2TaskId pto2_make_task_id(uint8_t ring_id, uint32_t local_id) {
138+
static inline PTO2TaskId
139+
pto2_make_task_id(uint8_t ring_id, uint32_t local_id) { // NOLINT(bugprone-easily-swappable-parameters)
137140
return PTO2TaskId{(static_cast<uint64_t>(ring_id) << 32) | static_cast<uint64_t>(local_id)};
138141
}
139142

@@ -203,8 +206,8 @@ typedef enum {
203206
*/
204207
struct PTO2TaskSlotState; // Forward declaration
205208
struct PTO2DepListEntry {
206-
PTO2TaskSlotState* slot_state; // Consumer slot state (direct pointer)
207-
PTO2DepListEntry* next; // next entry
209+
PTO2TaskSlotState *slot_state; // Consumer slot state (direct pointer)
210+
PTO2DepListEntry *next; // next entry
208211
};
209212

210213
// =============================================================================
@@ -228,8 +231,8 @@ struct PTO2TaskDescriptor {
228231
int32_t kernel_id[PTO2_SUBTASK_SLOT_COUNT];
229232

230233
// Packed output buffer (all outputs packed into single contiguous buffer)
231-
void* packed_buffer_base; // Start of packed buffer in GM Heap
232-
void* packed_buffer_end; // End of packed buffer (for heap reclamation)
234+
void *packed_buffer_base; // Start of packed buffer in GM Heap
235+
void *packed_buffer_end; // End of packed buffer (for heap reclamation)
233236
};
234237

235238
// =============================================================================
@@ -250,18 +253,18 @@ struct PTO2TaskPayload {
250253
int32_t scalar_count{0};
251254
int32_t fanin_actual_count{0}; // Actual fanin count (without the +1 redundance)
252255
int32_t _reserved{0}; // Reserved (dep_pool_mark moved to SlotState for local access)
253-
PTO2TaskSlotState* fanin_slot_states[PTO2_MAX_INPUTS]; // Producer slot states (used by on_task_release)
256+
PTO2TaskSlotState *fanin_slot_states[PTO2_MAX_INPUTS]; // Producer slot states (used by on_task_release)
254257
// === Cache lines 3-34 (2048B) — tensors (alignas(64) forces alignment) ===
255258
Tensor tensors[MAX_TENSOR_ARGS];
256259
// === Cache lines 35-50 (1024B) — scalars ===
257260
uint64_t scalars[MAX_SCALAR_ARGS];
258261

259-
void init(const Arg& args, const TaskOutputTensors& materialized_outputs) {
262+
void init(const Arg &args, const TaskOutputTensors &materialized_outputs) {
260263
tensor_count = args.tensor_count();
261264
scalar_count = args.scalar_count();
262265
int32_t out_idx = 0;
263266
for (int32_t i = 0; i < args.tensor_count(); i++) {
264-
const Tensor* src;
267+
const Tensor *src;
265268
if (args.tag(i) == TensorArgType::OUTPUT) {
266269
src = materialized_outputs.output_ptr(out_idx++);
267270
} else {
@@ -292,7 +295,7 @@ struct alignas(64) PTO2TaskSlotState {
292295
std::atomic<int32_t> fanout_lock; // Per-task spinlock (0=unlocked, 1=locked)
293296
int32_t fanout_count; // 1 (owning scope) + number of consumers
294297

295-
PTO2DepListEntry* fanout_head; // Pointer to first fanout entry (nullptr = empty)
298+
PTO2DepListEntry *fanout_head; // Pointer to first fanout entry (nullptr = empty)
296299

297300
// Task state (completion, consumed check, ready check)
298301
std::atomic<PTO2TaskState> task_state; // PENDING/READY/RUNNING/COMPLETED/CONSUMED
@@ -304,9 +307,9 @@ struct alignas(64) PTO2TaskSlotState {
304307
// Fanout refcount (accessed with fanout_count in check_and_handle_consumed)
305308
std::atomic<int32_t> fanout_refcount; // Dynamic: counts released references
306309

307-
PTO2TaskPayload* payload;
310+
PTO2TaskPayload *payload;
308311

309-
PTO2TaskDescriptor* task;
312+
PTO2TaskDescriptor *task;
310313

311314
// Hot-path completion fields (moved from TaskDescriptor to avoid cross-struct access)
312315
uint8_t active_mask; // Bitmask of active subtask slots (set once)
@@ -325,7 +328,7 @@ static_assert(sizeof(PTO2TaskSlotState) == 64);
325328
* Cycle cost function pointer type
326329
* Returns estimated cycle count for the InCore function
327330
*/
328-
typedef int64_t (*PTO2CycleCostFunc)(void** args, int32_t num_args);
331+
typedef int64_t (*PTO2CycleCostFunc)(void **args, int32_t num_args);
329332

330333
// =============================================================================
331334
// InCore Function Type
@@ -335,7 +338,7 @@ typedef int64_t (*PTO2CycleCostFunc)(void** args, int32_t num_args);
335338
* InCore function signature
336339
* All InCore functions must match this signature
337340
*/
338-
typedef void (*PTO2InCoreFunc)(void** args, int32_t num_args);
341+
typedef void (*PTO2InCoreFunc)(void **args, int32_t num_args);
339342

340343
// =============================================================================
341344
// Utility Macros
@@ -345,11 +348,11 @@ typedef void (*PTO2InCoreFunc)(void** args, int32_t num_args);
345348
* Memory barrier macros for different architectures
346349
*/
347350
#if defined(__aarch64__)
348-
#define PTO2_MEMORY_BARRIER() __asm__ __volatile__("dmb sy" ::: "memory")
351+
# define PTO2_MEMORY_BARRIER() __asm__ __volatile__("dmb sy" ::: "memory")
349352
#elif defined(__x86_64__)
350-
#define PTO2_MEMORY_BARRIER() __asm__ __volatile__("mfence" ::: "memory")
353+
# define PTO2_MEMORY_BARRIER() __asm__ __volatile__("mfence" ::: "memory")
351354
#else
352-
#define PTO2_MEMORY_BARRIER() __sync_synchronize()
355+
# define PTO2_MEMORY_BARRIER() __sync_synchronize()
353356
#endif
354357

355358
// Spin-wait hint for AICPU threads. On real hardware the AICPU has dedicated
@@ -358,9 +361,9 @@ typedef void (*PTO2InCoreFunc)(void** args, int32_t num_args);
358361
// This header is also compiled into the Host .so (for struct definitions only),
359362
// where the hint is never called — the fallback no-op keeps Host builds clean.
360363
#if __has_include("spin_hint.h")
361-
#include "spin_hint.h" // NOLINT(build/include_subdir)
364+
# include "spin_hint.h" // NOLINT(build/include_subdir)
362365
#else
363-
#define SPIN_WAIT_HINT() ((void)0)
366+
# define SPIN_WAIT_HINT() ((void)0)
364367
#endif
365368

366369
// =============================================================================
@@ -376,11 +379,11 @@ typedef void (*PTO2InCoreFunc)(void** args, int32_t num_args);
376379
// =============================================================================
377380

378381
#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
379-
#include "aicpu/device_time.h"
382+
# include "aicpu/device_time.h"
380383
#endif
381384

382385
#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
383-
static inline void pto2_fanout_lock(PTO2TaskSlotState& slot_state, uint64_t& atomic_count, uint64_t& wait_cycle) {
386+
static inline void pto2_fanout_lock(PTO2TaskSlotState &slot_state, uint64_t &atomic_count, uint64_t &wait_cycle) {
384387
uint64_t t0 = get_sys_cnt_aicpu();
385388
bool contended = false;
386389
uint32_t atomic_ops = 0;
@@ -393,7 +396,8 @@ static inline void pto2_fanout_lock(PTO2TaskSlotState& slot_state, uint64_t& ato
393396
}
394397
int32_t expected = 0;
395398
if (slot_state.fanout_lock.compare_exchange_weak(
396-
expected, 1, std::memory_order_acquire, std::memory_order_relaxed)) {
399+
expected, 1, std::memory_order_acquire, std::memory_order_relaxed
400+
)) {
397401
atomic_ops++; // successful CAS = 1 atomic
398402
atomic_count += atomic_ops;
399403
if (contended) {
@@ -407,20 +411,21 @@ static inline void pto2_fanout_lock(PTO2TaskSlotState& slot_state, uint64_t& ato
407411
}
408412
#endif
409413

410-
static inline void pto2_fanout_lock(PTO2TaskSlotState& slot_state) {
414+
static inline void pto2_fanout_lock(PTO2TaskSlotState &slot_state) {
411415
for (;;) {
412416
while (slot_state.fanout_lock.load(std::memory_order_acquire) != 0) {
413417
SPIN_WAIT_HINT();
414418
}
415419
int32_t expected = 0;
416420
if (slot_state.fanout_lock.compare_exchange_weak(
417-
expected, 1, std::memory_order_acquire, std::memory_order_relaxed)) {
421+
expected, 1, std::memory_order_acquire, std::memory_order_relaxed
422+
)) {
418423
return;
419424
}
420425
}
421426
}
422427

423-
static inline void pto2_fanout_unlock(PTO2TaskSlotState& slot_state) {
428+
static inline void pto2_fanout_unlock(PTO2TaskSlotState &slot_state) {
424429
slot_state.fanout_lock.store(0, std::memory_order_release);
425430
}
426431

src/a2a3/runtime/aicpu_build_graph/runtime/pto_submit_types.h

Lines changed: 24 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,14 @@
1+
/*
2+
* Copyright (c) PyPTO Contributors.
3+
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
4+
* CANN Open Software License Agreement Version 2.0 (the "License").
5+
* Please refer to the License for details. You may not use this file except in compliance with the License.
6+
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
7+
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
8+
* See LICENSE in the root of the software repository for the full text of the License.
9+
* -----------------------------------------------------------------------------------------------------------
10+
*/
11+
112
/**
213
* PTO Submit Types - Shared submit-contract definitions
314
*
@@ -21,22 +32,23 @@ inline constexpr int32_t PTO2_SUBTASK_SLOT_COUNT = 3;
2132
* Subtask slot indices
2233
*/
2334
enum class PTO2SubtaskSlot : uint8_t {
24-
AIC = 0,
35+
AIC = 0,
2536
AIV0 = 1,
2637
AIV1 = 2,
2738
};
2839

2940
/**
3041
* Subtask mask bits (for active_mask / subtask_done_mask)
3142
*/
32-
inline constexpr uint8_t PTO2_SUBTASK_MASK_AIC = (1u << 0); // 0x1
43+
inline constexpr uint8_t PTO2_SUBTASK_MASK_AIC = (1u << 0); // 0x1
3344
inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV0 = (1u << 1); // 0x2
3445
inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV1 = (1u << 2); // 0x4
3546

3647
/**
3748
* Test whether a subtask slot is active in a given mask
3849
*/
39-
static inline bool pto2_subtask_active(uint8_t mask, PTO2SubtaskSlot slot) {
50+
static inline bool
51+
pto2_subtask_active(uint8_t mask, PTO2SubtaskSlot slot) { // NOLINT(bugprone-easily-swappable-parameters)
4052
return (mask & (1u << static_cast<uint8_t>(slot))) != 0;
4153
}
4254

@@ -56,11 +68,11 @@ struct MixedKernels {
5668
* Resource shape — classifies a MixedKernels into one of 5 queue buckets.
5769
*/
5870
enum class PTO2ResourceShape : uint8_t {
59-
AIC_ONLY = 0, // AIC only
60-
AIV_X1 = 1, // One AIV slot
61-
AIV_X2 = 2, // Both AIV slots
62-
AIC_AIV_X1 = 3, // AIC + one AIV
63-
AIC_AIV_X2 = 4, // AIC + both AIV
71+
AIC_ONLY = 0, // AIC only
72+
AIV_X1 = 1, // One AIV slot
73+
AIV_X2 = 2, // Both AIV slots
74+
AIC_AIV_X1 = 3, // AIC + one AIV
75+
AIC_AIV_X2 = 4, // AIC + both AIV
6476
};
6577

6678
inline constexpr int32_t PTO2_NUM_RESOURCE_SHAPES = 5;
@@ -71,8 +83,7 @@ inline constexpr int32_t PTO2_NUM_RESOURCE_SHAPES = 5;
7183
*/
7284
static inline PTO2ResourceShape pto2_active_mask_to_shape(uint8_t active_mask) {
7385
bool has_aic = (active_mask & PTO2_SUBTASK_MASK_AIC) != 0;
74-
int aiv_count = ((active_mask & PTO2_SUBTASK_MASK_AIV0) != 0)
75-
+ ((active_mask & PTO2_SUBTASK_MASK_AIV1) != 0);
86+
int aiv_count = ((active_mask & PTO2_SUBTASK_MASK_AIV0) != 0) + ((active_mask & PTO2_SUBTASK_MASK_AIV1) != 0);
7687

7788
if (has_aic) {
7889
if (aiv_count == 0) return PTO2ResourceShape::AIC_ONLY;
@@ -86,12 +97,12 @@ static inline PTO2ResourceShape pto2_active_mask_to_shape(uint8_t active_mask) {
8697
/**
8798
* Compute active_mask from MixedKernels.
8899
*/
89-
static inline uint8_t pto2_mixed_kernels_to_active_mask(const MixedKernels& mk) {
100+
static inline uint8_t pto2_mixed_kernels_to_active_mask(const MixedKernels &mk) {
90101
uint8_t mask = 0;
91-
if (mk.aic_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIC;
102+
if (mk.aic_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIC;
92103
if (mk.aiv0_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIV0;
93104
if (mk.aiv1_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIV1;
94105
return mask;
95106
}
96107

97-
#endif // PTO_SUBMIT_TYPES_H
108+
#endif // PTO_SUBMIT_TYPES_H

0 commit comments

Comments
 (0)