Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/platform/a2a3/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -427,8 +427,9 @@ int DeviceRunner::run(Runtime& runtime,
return rc;
}

// Print collected performance data (after stream sync)
// Collect phase data and print performance data (after stream sync)
if (runtime.enable_profiling) {
perf_collector_.collect_phase_data();
export_swimlane_json();
}

Expand Down
2 changes: 2 additions & 0 deletions src/platform/a2a3sim/aicpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ target_compile_options(aicpu_kernel
PRIVATE
-Wall
-Wextra
-Werror
-Wno-error=class-memaccess
-fPIC
-O3
-g
Expand Down
3 changes: 2 additions & 1 deletion src/platform/a2a3sim/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -297,8 +297,9 @@ int DeviceRunner::run(Runtime& runtime,

LOG_INFO("All threads completed");

// Print performance data after execution completes
// Collect AICPU phase data and print performance data after execution completes
if (runtime.enable_profiling) {
perf_collector_.collect_phase_data();
export_swimlane_json();
}

Expand Down
64 changes: 64 additions & 0 deletions src/platform/include/aicpu/performance_collector_aicpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,4 +78,68 @@ void perf_aicpu_flush_buffers(Runtime* runtime,
*/
void perf_aicpu_update_total_tasks(Runtime* runtime, uint32_t total_tasks);

/**
 * Initialize AICPU phase profiling.
 *
 * Sets up AicpuPhaseHeader and clears per-thread phase record buffers.
 * Must be called once from thread 0 after perf_aicpu_init_profiling().
 * Returns void, so the caller receives no success/failure indication.
 *
 * @param runtime Runtime instance pointer
 * @param num_sched_threads Number of scheduler threads
 *        (NOTE(review): presumably stored in AicpuPhaseHeader::num_sched_threads
 *        and expected to be <= PLATFORM_MAX_AICPU_THREADS -- confirm in the implementation)
 */
void perf_aicpu_init_phase_profiling(Runtime* runtime, int num_sched_threads);

/**
 * Record a single scheduler phase.
 *
 * Appends an AicpuPhaseRecord to the specified thread's buffer.
 * Silently drops records when the buffer is full; since the function returns
 * void, a dropped record is not reported to the caller.
 *
 * @param thread_idx Scheduler thread index
 * @param phase_id Phase identifier (the scheduler phases are
 *        AicpuPhaseId::SCHED_COMPLETE .. AicpuPhaseId::SCHED_EARLY_READY)
 * @param start_time Phase start timestamp
 * @param end_time Phase end timestamp
 * @param loop_iter Current loop iteration number
 * @param tasks_processed Number of tasks processed in this phase
 */
void perf_aicpu_record_phase(int thread_idx,
AicpuPhaseId phase_id,
uint64_t start_time, uint64_t end_time,
uint32_t loop_iter, uint32_t tasks_processed);

/**
 * Write orchestrator cumulative summary.
 *
 * Writes the orchestrator's accumulated profiling data to shared memory
 * for host-side collection.
 * NOTE(review): the destination is presumably AicpuPhaseHeader::orch_summary --
 * confirm in the implementation.
 *
 * @param src Pointer to populated AicpuOrchSummary (magic field is set internally,
 *            so the caller does not need to fill it in)
 */
void perf_aicpu_write_orch_summary(const AicpuOrchSummary* src);

/**
 * Set orchestrator thread index for per-task phase recording.
 *
 * Must be called once from the orchestrator thread before any
 * perf_aicpu_record_orch_phase() calls.
 *
 * NOTE(review): calc_perf_data_size_with_phases() sizes the record region as
 * num_sched_threads buffers. If thread_idx == num_sched_threads is used here,
 * confirm that the thread count passed when sizing the region also covers the
 * orchestrator's slot.
 *
 * @param thread_idx Thread index for the orchestrator (typically num_sched_threads)
 */
void perf_aicpu_set_orch_thread_idx(int thread_idx);

/**
 * Record a single orchestrator phase.
 *
 * Appends an AicpuPhaseRecord for one sub-step of pto2_submit_task().
 * Uses the orchestrator's dedicated buffer slot (set via set_orch_thread_idx).
 * Unlike perf_aicpu_record_phase(), this takes no thread index and no
 * tasks_processed argument.
 *
 * @param phase_id Orchestrator phase identifier (ORCH_SYNC..ORCH_SCOPE_END,
 *                 i.e. AicpuPhaseId values 16..24)
 * @param start_time Phase start timestamp
 * @param end_time Phase end timestamp
 * @param submit_idx Task submission index (acts as loop_iter)
 */
void perf_aicpu_record_orch_phase(AicpuPhaseId phase_id,
uint64_t start_time, uint64_t end_time,
uint32_t submit_idx);

#endif // PLATFORM_AICPU_PERFORMANCE_COLLECTOR_AICPU_H_
137 changes: 135 additions & 2 deletions src/platform/include/common/perf_profiling.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* @file perf_profiling.h
* @brief Performance profiling data structures
*
* Architecture: Fixed header + dynamic tail
* Architecture: Fixed header + dynamic tail + optional phase profiling region
*
* Memory layout:
* ┌─────────────────────────────────────────────────────────────┐
Expand All @@ -19,9 +19,21 @@
* │ ... │
* ├─────────────────────────────────────────────────────────────┤
* │ DoubleBuffer[num_cores-1] │
* ├─────────────────────────────────────────────────────────────┤
* │ AicpuPhaseHeader (optional, present when phase profiling) │
* │ - magic, num_sched_threads, records_per_thread │
* │ - buffer_counts[PLATFORM_MAX_AICPU_THREADS] │
* │ - orch_summary │
* ├─────────────────────────────────────────────────────────────┤
* │ AicpuPhaseRecord[thread0][0..records_per_thread-1] │
* ├─────────────────────────────────────────────────────────────┤
* │ AicpuPhaseRecord[thread1][0..records_per_thread-1] │
* ├─────────────────────────────────────────────────────────────┤
* │ ... │
* └─────────────────────────────────────────────────────────────┘
*
* Total size = sizeof(PerfDataHeader) + num_cores * sizeof(DoubleBuffer)
* Base size = sizeof(PerfDataHeader) + num_cores * sizeof(DoubleBuffer)
* With phases = Base + sizeof(AicpuPhaseHeader) + num_sched_threads * records_per_thread * sizeof(AicpuPhaseRecord)
*/

#ifndef PLATFORM_COMMON_PERF_PROFILING_H_
Expand Down Expand Up @@ -178,6 +190,90 @@ struct PerfDataHeader {
volatile uint32_t total_tasks; // Total tasks (AICPU writes after orchestration)
} __attribute__((aligned(64)));

// =============================================================================
// AICPU Phase Profiling - Scheduler and Orchestrator Records
// =============================================================================

/**
 * Identifies which phase an AicpuPhaseRecord describes.
 *
 * Values 0-3 are the scheduler phases: the four phases of each scheduler
 * loop iteration. Values 16-24 are the orchestrator phases: the sub-steps
 * of each pto2_submit_task() call. Values 4-15 are not assigned.
 */
enum class AicpuPhaseId : uint32_t {
    // ---- Scheduler phases (0-3) ----
    SCHED_COMPLETE    = 0,   // Process completed tasks (fanout traversal)
    SCHED_DISPATCH    = 1,   // Dispatch ready tasks to idle cores
    SCHED_SCAN        = 2,   // Incremental scan for root tasks
    SCHED_EARLY_READY = 3,   // Drain orchestrator's early-ready queue

    // ---- Orchestrator phases (16-24) ----
    ORCH_SYNC         = 16,  // tensormap sync
    ORCH_ALLOC        = 17,  // task_ring_alloc
    ORCH_PARAMS       = 18,  // param copy
    ORCH_LOOKUP       = 19,  // tensormap lookup + dep
    ORCH_HEAP         = 20,  // heap alloc
    ORCH_INSERT       = 21,  // tensormap insert
    ORCH_FANIN        = 22,  // fanin + early-ready
    ORCH_FINALIZE     = 23,  // scheduler init + SM
    ORCH_SCOPE_END    = 24   // scope_end
};

/**
 * Single AICPU phase record (32 bytes)
 *
 * Records one phase within one loop iteration of a scheduler thread, or one
 * sub-step of a pto2_submit_task() call for the orchestrator (in which case
 * loop_iter carries the task submission index).
 * No thread_id field: identity is derived from array index (position = identity).
 */
struct AicpuPhaseRecord {
    uint64_t start_time;       // Phase start timestamp
    uint64_t end_time;         // Phase end timestamp
    uint32_t loop_iter;        // Loop iteration number
    AicpuPhaseId phase_id;     // Phase type
    uint32_t tasks_processed;  // Tasks processed in this phase
    uint32_t padding;          // Alignment padding
};

// Records are laid out back-to-back in shared memory and located purely by
// index * sizeof(AicpuPhaseRecord), so the documented size must not drift.
static_assert(sizeof(AicpuPhaseRecord) == 32,
              "AicpuPhaseRecord must be exactly 32 bytes");

/**
 * Cumulative profiling summary for the AICPU orchestrator thread.
 *
 * Holds the orchestrator's start/end timestamps, one accumulated cycle
 * counter per orchestrator phase, and the number of submitted tasks.
 * Written once after orchestration completes.
 *
 * The struct is 64-byte aligned; field order defines the layout, so fields
 * must not be reordered.
 */
struct AicpuOrchSummary {
    // Overall orchestrator time span
    uint64_t start_time;       // Orchestrator start timestamp
    uint64_t end_time;         // Orchestrator end timestamp

    // Accumulated cycles, one counter per orchestrator phase
    uint64_t sync_cycle;       // sync_tensormap phase
    uint64_t alloc_cycle;      // task_ring_alloc phase
    uint64_t params_cycle;     // param_copy phase
    uint64_t lookup_cycle;     // lookup+dep phase
    uint64_t heap_cycle;       // heap_alloc phase
    uint64_t insert_cycle;     // tensormap_insert phase
    uint64_t fanin_cycle;      // fanin+ready phase
    uint64_t finalize_cycle;   // finalize+SM phase
    uint64_t scope_end_cycle;  // scope_end phase

    int64_t submit_count;      // Total tasks submitted
    uint32_t magic;            // Validation magic (AICPU_PHASE_MAGIC)
    uint32_t padding;          // Alignment padding
} __attribute__((aligned(64)));

// Validation magic for phase-profiling data; the bytes spell "ACPH" in ASCII.
constexpr uint32_t AICPU_PHASE_MAGIC = 0x41435048u;
// Capacity of each per-thread record buffer: 2^14 records
// (512 KiB per thread at 32 bytes per AicpuPhaseRecord).
constexpr int PLATFORM_PHASE_RECORDS_PER_THREAD = 1 << 14;

/**
 * Header of the AICPU phase profiling region.
 *
 * Placed in shared memory directly after the DoubleBuffer array. Carries the
 * region's metadata, one record counter per AICPU thread slot, and the
 * orchestrator's cumulative summary.
 *
 * The struct is 64-byte aligned; field order defines the layout, so fields
 * must not be reordered.
 */
struct AicpuPhaseHeader {
    // Region metadata
    uint32_t magic;               // Validation magic (AICPU_PHASE_MAGIC)
    uint32_t num_sched_threads;   // Number of scheduler threads
    uint32_t records_per_thread;  // Max records per thread
    uint32_t padding;             // Alignment padding

    // Per-thread record counts, one slot per possible AICPU thread
    volatile uint32_t buffer_counts[PLATFORM_MAX_AICPU_THREADS];

    // Orchestrator cumulative data
    AicpuOrchSummary orch_summary;
} __attribute__((aligned(64)));

// =============================================================================
// Helper Functions - Memory Layout
// =============================================================================
Expand Down Expand Up @@ -250,6 +346,43 @@ inline void get_buffer_and_status(DoubleBuffer* db, uint32_t buffer_id,
}
}

/**
 * Calculate total memory size including phase profiling region
 *
 * Result = calc_perf_data_size(num_cores)
 *        + sizeof(AicpuPhaseHeader)
 *        + num_sched_threads * PLATFORM_PHASE_RECORDS_PER_THREAD * sizeof(AicpuPhaseRecord)
 *
 * num_sched_threads is expected to be non-negative; it is not validated here.
 *
 * NOTE(review): only num_sched_threads record buffers are counted. The
 * orchestrator is documented as recording into its own slot (thread index
 * "typically num_sched_threads"), so confirm that callers pass a count that
 * also covers that slot.
 *
 * @param num_cores Number of AICore instances
 * @param num_sched_threads Number of scheduler threads (typically 3)
 * @return Total bytes needed
 */
inline size_t calc_perf_data_size_with_phases(int num_cores, int num_sched_threads) {
    // Widen to size_t before multiplying so that no part of the product is
    // evaluated in (overflow-prone) int arithmetic.
    const size_t phase_record_bytes = static_cast<size_t>(num_sched_threads)
        * static_cast<size_t>(PLATFORM_PHASE_RECORDS_PER_THREAD)
        * sizeof(AicpuPhaseRecord);

    return calc_perf_data_size(num_cores) + sizeof(AicpuPhaseHeader) + phase_record_bytes;
}

/**
 * Locate the AicpuPhaseHeader inside the shared memory region.
 *
 * The header sits calc_perf_data_size(num_cores) bytes past base_ptr, i.e.
 * right after the DoubleBuffer array. base_ptr is not checked for null.
 *
 * @param base_ptr Shared memory base address
 * @param num_cores Number of AICore instances
 * @return AicpuPhaseHeader pointer
 */
inline AicpuPhaseHeader* get_phase_header(void* base_ptr, int num_cores) {
    char* const region_begin = static_cast<char*>(base_ptr);
    char* const phase_begin = region_begin + calc_perf_data_size(num_cores);
    return reinterpret_cast<AicpuPhaseHeader*>(phase_begin);
}

/**
 * Get AicpuPhaseRecord array for specified thread
 *
 * Record buffers start immediately after the AicpuPhaseHeader; each thread
 * owns PLATFORM_PHASE_RECORDS_PER_THREAD consecutive records. thread_idx is
 * not bounds-checked here, so the caller must pass an index that lies inside
 * the allocated region.
 *
 * @param base_ptr Shared memory base address
 * @param num_cores Number of AICore instances
 * @param thread_idx Scheduler thread index
 * @return AicpuPhaseRecord array pointer
 */
inline AicpuPhaseRecord* get_phase_records(void* base_ptr, int num_cores, int thread_idx) {
    char* const records_base =
        reinterpret_cast<char*>(get_phase_header(base_ptr, num_cores)) + sizeof(AicpuPhaseHeader);

    // Widen to size_t before multiplying so that no part of the byte offset is
    // evaluated in (overflow-prone) int arithmetic.
    const size_t thread_offset = static_cast<size_t>(thread_idx)
        * static_cast<size_t>(PLATFORM_PHASE_RECORDS_PER_THREAD)
        * sizeof(AicpuPhaseRecord);

    return reinterpret_cast<AicpuPhaseRecord*>(records_base + thread_offset);
}

#ifdef __cplusplus
}
#endif
Expand Down
14 changes: 14 additions & 0 deletions src/platform/include/host/performance_collector.h
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,14 @@ class PerformanceCollector {
*/
bool is_initialized() const { return perf_shared_mem_host_ != nullptr; }

/**
 * Collect AICPU phase profiling data from shared memory.
 *
 * Reads scheduler phase records and orchestrator summary from the
 * phase profiling region. Must be called after AICPU threads have joined.
 * Returns void, so callers receive no indication of whether any data was found.
 *
 * NOTE(review): results presumably land in the collected_phase_records_ /
 * collected_orch_phase_records_ / collected_orch_summary_ members and set
 * has_phase_data_ -- confirm in the implementation.
 */
void collect_phase_data();

/**
* Get collected records (for testing)
*/
Expand All @@ -152,6 +160,12 @@ class PerformanceCollector {

// Collected data (per-core vectors, indexed by core_index)
std::vector<std::vector<PerfRecord>> collected_perf_records_;

// AICPU phase profiling data
// NOTE(review): the per-member notes below are inferred from names and types;
// confirm against the collect_phase_data() implementation.
// Scheduler phase records; the outer vector is presumably indexed by scheduler thread.
std::vector<std::vector<AicpuPhaseRecord>> collected_phase_records_;
// Phase records presumably belonging to the orchestrator thread.
std::vector<AicpuPhaseRecord> collected_orch_phase_records_;
// Orchestrator cumulative summary; value-initialized (all fields zero) by default.
AicpuOrchSummary collected_orch_summary_{};
// Defaults to false; presumably set once valid phase data has been collected.
bool has_phase_data_{false};
};

#endif // PLATFORM_HOST_PERFORMANCE_COLLECTOR_H_
Loading
Loading