hw-native-sys · ChaoWao · Mar 2, 2026 · Mar 2, 2026
diff --git a/src/platform/a2a3/host/device_runner.cpp b/src/platform/a2a3/host/device_runner.cpp
@@ -427,8 +427,9 @@ int DeviceRunner::run(Runtime& runtime,
         return rc;
     }
 
-    // Print collected performance data (after stream sync)
+    // Collect phase data and print performance data (after stream sync)
     if (runtime.enable_profiling) {
+        perf_collector_.collect_phase_data();
         export_swimlane_json();
     }
 

diff --git a/src/platform/a2a3sim/aicpu/CMakeLists.txt b/src/platform/a2a3sim/aicpu/CMakeLists.txt
@@ -50,6 +50,8 @@ target_compile_options(aicpu_kernel
     PRIVATE
         -Wall
         -Wextra
+        -Werror
+        -Wno-error=class-memaccess
         -fPIC
         -O3
         -g

diff --git a/src/platform/a2a3sim/host/device_runner.cpp b/src/platform/a2a3sim/host/device_runner.cpp
@@ -297,8 +297,9 @@ int DeviceRunner::run(Runtime& runtime,
 
     LOG_INFO("All threads completed");
 
-    // Print performance data after execution completes
+    // Collect AICPU phase data and print performance data after execution completes
     if (runtime.enable_profiling) {
+        perf_collector_.collect_phase_data();
         export_swimlane_json();
     }
 

diff --git a/src/platform/include/aicpu/performance_collector_aicpu.h b/src/platform/include/aicpu/performance_collector_aicpu.h
@@ -78,4 +78,68 @@ void perf_aicpu_flush_buffers(Runtime* runtime,
  */
 void perf_aicpu_update_total_tasks(Runtime* runtime, uint32_t total_tasks);
 
+/**
+ * Initialize AICPU phase profiling
+ *
+ * Sets up AicpuPhaseHeader and clears per-thread phase record buffers.
+ * Must be called once from thread 0 after perf_aicpu_init_profiling().
+ *
+ * @param runtime Runtime instance pointer
+ * @param num_sched_threads Number of scheduler threads
+ */
+void perf_aicpu_init_phase_profiling(Runtime* runtime, int num_sched_threads);
+
+/**
+ * Record a single scheduler phase
+ *
+ * Appends an AicpuPhaseRecord to the specified thread's buffer.
+ * Silently drops records when the buffer is full.
+ *
+ * @param thread_idx Scheduler thread index
+ * @param phase_id Phase identifier
+ * @param start_time Phase start timestamp
+ * @param end_time Phase end timestamp
+ * @param loop_iter Current loop iteration number
+ * @param tasks_processed Number of tasks processed in this phase
+ */
+void perf_aicpu_record_phase(int thread_idx,
+                              AicpuPhaseId phase_id,
+                              uint64_t start_time, uint64_t end_time,
+                              uint32_t loop_iter, uint32_t tasks_processed);
+
+/**
+ * Write orchestrator cumulative summary
+ *
+ * Writes the orchestrator's accumulated profiling data to shared memory
+ * for host-side collection.
+ *
+ * @param src Pointer to populated AicpuOrchSummary (magic field is set internally)
+ */
+void perf_aicpu_write_orch_summary(const AicpuOrchSummary* src);
+
+/**
+ * Set orchestrator thread index for per-task phase recording
+ *
+ * Must be called once from the orchestrator thread before any
+ * perf_aicpu_record_orch_phase() calls.
+ *
+ * @param thread_idx Thread index for the orchestrator (typically num_sched_threads)
+ */
+void perf_aicpu_set_orch_thread_idx(int thread_idx);
+
+/**
+ * Record a single orchestrator phase
+ *
+ * Appends an AicpuPhaseRecord for one sub-step of pto2_submit_task().
+ * Uses the orchestrator's dedicated buffer slot (set via set_orch_thread_idx).
+ *
+ * @param phase_id Orchestrator phase identifier (ORCH_SYNC..ORCH_SCOPE_END)
+ * @param start_time Phase start timestamp
+ * @param end_time Phase end timestamp
+ * @param submit_idx Task submission index (acts as loop_iter)
+ */
+void perf_aicpu_record_orch_phase(AicpuPhaseId phase_id,
+                                   uint64_t start_time, uint64_t end_time,
+                                   uint32_t submit_idx);
+
 #endif  // PLATFORM_AICPU_PERFORMANCE_COLLECTOR_AICPU_H_
diff --git a/src/platform/include/common/perf_profiling.h b/src/platform/include/common/perf_profiling.h
@@ -2,7 +2,7 @@
  * @file perf_profiling.h
  * @brief Performance profiling data structures
  *
- * Architecture: Fixed header + dynamic tail
+ * Architecture: Fixed header + dynamic tail + optional phase profiling region
  *
  * Memory layout:
  * ┌─────────────────────────────────────────────────────────────┐
@@ -19,9 +19,21 @@
  * │ ...                                                         │
  * ├─────────────────────────────────────────────────────────────┤
  * │ DoubleBuffer[num_cores-1]                                   │
+ * ├─────────────────────────────────────────────────────────────┤
+ * │ AicpuPhaseHeader (optional, present when phase profiling)   │
+ * │  - magic, num_sched_threads, records_per_thread             │
+ * │  - buffer_counts[PLATFORM_MAX_AICPU_THREADS]                │
+ * │  - orch_summary                                             │
+ * ├─────────────────────────────────────────────────────────────┤
+ * │ AicpuPhaseRecord[thread0][0..records_per_thread-1]          │
+ * ├─────────────────────────────────────────────────────────────┤
+ * │ AicpuPhaseRecord[thread1][0..records_per_thread-1]          │
+ * ├─────────────────────────────────────────────────────────────┤
+ * │ ...                                                         │
  * └─────────────────────────────────────────────────────────────┘
  *
- * Total size = sizeof(PerfDataHeader) + num_cores * sizeof(DoubleBuffer)
+ * Base size = sizeof(PerfDataHeader) + num_cores * sizeof(DoubleBuffer)
+ * With phases = Base + sizeof(AicpuPhaseHeader) + num_sched_threads * records_per_thread * sizeof(AicpuPhaseRecord)
  */
 
 #ifndef PLATFORM_COMMON_PERF_PROFILING_H_
@@ -178,6 +190,90 @@ struct PerfDataHeader {
     volatile uint32_t total_tasks;                   // Total tasks (AICPU writes after orchestration)
 } __attribute__((aligned(64)));
 
+// =============================================================================
+// AICPU Phase Profiling - Scheduler and Orchestrator Records
+// =============================================================================
+
+/**
+ * AICPU phase identifier
+ *
+ * Scheduler phases (0-3): four phases in each scheduler loop iteration.
+ * Orchestrator phases (16-24): sub-steps within each pto2_submit_task() call.
+ * Values 4-15 are currently unassigned.
+ *
+ * The underlying type is fixed to uint32_t, so the value occupies exactly
+ * 4 bytes when stored in AicpuPhaseRecord::phase_id.
+ */
+enum class AicpuPhaseId : uint32_t {
+    // Scheduler phases (0-3)
+    SCHED_COMPLETE    = 0,  // Process completed tasks (fanout traversal)
+    SCHED_DISPATCH    = 1,  // Dispatch ready tasks to idle cores
+    SCHED_SCAN        = 2,  // Incremental scan for root tasks
+    SCHED_EARLY_READY = 3,  // Drain orchestrator's early-ready queue
+    // Orchestrator phases (16-24)
+    ORCH_SYNC      = 16,  // tensormap sync
+    ORCH_ALLOC     = 17,  // task_ring_alloc
+    ORCH_PARAMS    = 18,  // param copy
+    ORCH_LOOKUP    = 19,  // tensormap lookup + dep
+    ORCH_HEAP      = 20,  // heap alloc
+    ORCH_INSERT    = 21,  // tensormap insert
+    ORCH_FANIN     = 22,  // fanin + early-ready
+    ORCH_FINALIZE  = 23,  // scheduler init + SM
+    ORCH_SCOPE_END = 24   // scope_end
+};
+
+/**
+ * Single AICPU phase record (32 bytes)
+ *
+ * Records one phase within one loop iteration of a scheduler thread. The same
+ * record type is also appended by perf_aicpu_record_orch_phase() for each
+ * orchestrator sub-step; in that case loop_iter carries the task submission
+ * index instead of a loop iteration number.
+ *
+ * Layout: 2 x uint64_t + 4 x 4-byte fields = 32 bytes.
+ * No thread_id field: identity is derived from array index (position = identity).
+ */
+struct AicpuPhaseRecord {
+    uint64_t start_time;       // Phase start timestamp
+    uint64_t end_time;         // Phase end timestamp
+    uint32_t loop_iter;        // Loop iteration number (submit index for orchestrator records)
+    AicpuPhaseId phase_id;     // Phase type
+    uint32_t tasks_processed;  // Tasks processed in this phase
+    uint32_t padding;          // Alignment padding (rounds the record up to 32 bytes)
+};
+
+/**
+ * AICPU orchestrator cumulative summary
+ *
+ * Contains accumulated cycle counts from the orchestrator thread.
+ * Written once after orchestration completes.
+ *
+ * The magic field is documented as being set inside
+ * perf_aicpu_write_orch_summary(), so producers only fill the other fields.
+ *
+ * The fields add up to 104 bytes; the aligned(64) attribute rounds the
+ * struct size up to the next 64-byte multiple (128 bytes).
+ */
+struct AicpuOrchSummary {
+    uint64_t start_time;       // Orchestrator start timestamp
+    uint64_t end_time;         // Orchestrator end timestamp
+    uint64_t sync_cycle;       // sync_tensormap phase
+    uint64_t alloc_cycle;      // task_ring_alloc phase
+    uint64_t params_cycle;     // param_copy phase
+    uint64_t lookup_cycle;     // lookup+dep phase
+    uint64_t heap_cycle;       // heap_alloc phase
+    uint64_t insert_cycle;     // tensormap_insert phase
+    uint64_t fanin_cycle;      // fanin+ready phase
+    uint64_t finalize_cycle;   // finalize+SM phase
+    uint64_t scope_end_cycle;  // scope_end phase
+    int64_t  submit_count;     // Total tasks submitted
+    uint32_t magic;            // Validation magic (AICPU_PHASE_MAGIC)
+    uint32_t padding;          // Alignment padding
+} __attribute__((aligned(64)));
+
+constexpr uint32_t AICPU_PHASE_MAGIC = 0x41435048;  // ASCII "ACPH"; used by AicpuPhaseHeader::magic and AicpuOrchSummary::magic
+constexpr int PLATFORM_PHASE_RECORDS_PER_THREAD = 16384;  // 16384 records x 32 B = 512 KiB per thread
+
+/**
+ * AICPU phase profiling header
+ *
+ * Located after the DoubleBuffer array in shared memory.
+ * Contains metadata and per-thread record counts. The per-thread
+ * AicpuPhaseRecord buffers start immediately after this header
+ * (see get_phase_records()).
+ *
+ * buffer_counts is sized by PLATFORM_MAX_AICPU_THREADS, not by
+ * num_sched_threads, so it has an entry for every possible thread slot.
+ *
+ * NOTE(review): the orchestrator's record slot is selected through
+ * perf_aicpu_set_orch_thread_idx() (documented as typically
+ * num_sched_threads); presumably its count lives in buffer_counts at that
+ * index - confirm against the AICPU implementation.
+ */
+struct AicpuPhaseHeader {
+    uint32_t magic;                  // Validation magic (AICPU_PHASE_MAGIC)
+    uint32_t num_sched_threads;      // Number of scheduler threads
+    uint32_t records_per_thread;     // Max records per thread
+    uint32_t padding;                // Alignment padding
+    volatile uint32_t buffer_counts[PLATFORM_MAX_AICPU_THREADS];  // Per-thread record counts
+    AicpuOrchSummary orch_summary;   // Orchestrator cumulative data
+} __attribute__((aligned(64)));
+
 // =============================================================================
 // Helper Functions - Memory Layout
 // =============================================================================
@@ -250,6 +346,43 @@ inline void get_buffer_and_status(DoubleBuffer* db, uint32_t buffer_id,
     }
 }
 
+/**
+ * Calculate total memory size including phase profiling region
+ *
+ * Total = calc_perf_data_size(num_cores)
+ *       + sizeof(AicpuPhaseHeader)
+ *       + num_sched_threads * PLATFORM_PHASE_RECORDS_PER_THREAD * sizeof(AicpuPhaseRecord)
+ *
+ * NOTE(review): perf_aicpu_set_orch_thread_idx() documents the orchestrator
+ * slot as typically num_sched_threads, i.e. one slot past the scheduler
+ * buffers. If the orchestrator records phases, the count passed here
+ * presumably has to include that extra slot - confirm at the call site.
+ *
+ * @param num_cores Number of AICore instances
+ * @param num_sched_threads Number of scheduler threads (typically 3)
+ * @return Total bytes needed
+ */
+inline size_t calc_perf_data_size_with_phases(int num_cores, int num_sched_threads) {
+    // Widen to size_t before multiplying so the record count is never computed in int.
+    const size_t total_records = (size_t)num_sched_threads * PLATFORM_PHASE_RECORDS_PER_THREAD;
+    return calc_perf_data_size(num_cores)
+         + sizeof(AicpuPhaseHeader)
+         + total_records * sizeof(AicpuPhaseRecord);
+}
+
+/**
+ * Locate the AicpuPhaseHeader inside the shared memory block
+ *
+ * The header sits at byte offset calc_perf_data_size(num_cores) from
+ * base_ptr, i.e. directly after the DoubleBuffer array.
+ *
+ * @param base_ptr Shared memory base address
+ * @param num_cores Number of AICore instances
+ * @return AicpuPhaseHeader pointer
+ */
+inline AicpuPhaseHeader* get_phase_header(void* base_ptr, int num_cores) {
+    char* const region_begin = (char*)base_ptr;
+    char* const header_addr = region_begin + calc_perf_data_size(num_cores);
+    return (AicpuPhaseHeader*)header_addr;
+}
+
+/**
+ * Get AicpuPhaseRecord array for specified thread
+ *
+ * Record buffers start right after the AicpuPhaseHeader; each slot holds
+ * PLATFORM_PHASE_RECORDS_PER_THREAD records. No bounds check is performed
+ * on thread_idx.
+ *
+ * @param base_ptr Shared memory base address
+ * @param num_cores Number of AICore instances
+ * @param thread_idx Scheduler thread index (buffer slot index)
+ * @return AicpuPhaseRecord array pointer
+ */
+inline AicpuPhaseRecord* get_phase_records(void* base_ptr, int num_cores, int thread_idx) {
+    char* phase_start = (char*)get_phase_header(base_ptr, num_cores) + sizeof(AicpuPhaseHeader);
+    // Widen to size_t before multiplying so the offset is never computed in int.
+    const size_t first_record = (size_t)thread_idx * PLATFORM_PHASE_RECORDS_PER_THREAD;
+    return (AicpuPhaseRecord*)(phase_start + first_record * sizeof(AicpuPhaseRecord));
+}
+
 #ifdef __cplusplus
 }
 #endif

diff --git a/src/platform/include/host/performance_collector.h b/src/platform/include/host/performance_collector.h
@@ -135,6 +135,14 @@ class PerformanceCollector {
      */
     bool is_initialized() const { return perf_shared_mem_host_ != nullptr; }
 
+    /**
+     * Collect AICPU phase profiling data from shared memory
+     *
+     * Reads scheduler phase records and orchestrator summary from the
+     * phase profiling region. Must be called after AICPU threads have joined.
+     */
+    void collect_phase_data();
+
     /**
      * Get collected records (for testing)
      */
@@ -152,6 +160,12 @@ class PerformanceCollector {
 
     // Collected data (per-core vectors, indexed by core_index)
     std::vector<std::vector<PerfRecord>> collected_perf_records_;
+
+    // AICPU phase profiling data
+    std::vector<std::vector<AicpuPhaseRecord>> collected_phase_records_;
+    std::vector<AicpuPhaseRecord> collected_orch_phase_records_;
+    AicpuOrchSummary collected_orch_summary_{};
+    bool has_phase_data_{false};
 };
 
 #endif  // PLATFORM_HOST_PERFORMANCE_COLLECTOR_H_
-Original file line number
+Diff line change
@@ Expand Up / @@ -50,6 +50,8 @@ target_compile_options(aicpu_kernel @@
         PRIVATE
             -Wall
             -Wextra
+            -Werror
+            -Wno-error=class-memaccess
             -fPIC
             -O3
             -g
@@ Expand Down @@