From 044e40c99db379d2c2f6293b07f80909fff9640a Mon Sep 17 00:00:00 2001 From: puddingfjz <2811443837@qq.com> Date: Mon, 13 Apr 2026 17:29:47 +0800 Subject: [PATCH 1/6] Fix: Migrate AICPU launch to new rtsLaunchCpuKernel interface Implements Issue #356: Migrate from legacy rtAicpuKernelLaunchExWithArgs to the new rtsLaunchCpuKernel / rtsBinaryLoadFromFile / rtsFuncGetByName interface available in CANN 7.0+. Changes: - Add AicpuLoader abstraction class supporting both legacy and new interfaces - New interface: Use rtsBinaryLoadFromFile with JSON descriptor (pypto approach) - JSON contains only filename (libaicpu_kernel.so), runtime finds via library path - No temporary .so file creation needed - cpuKernelMode=0 (JSON only mode) - Legacy interface: Unchanged behavior when BUILD_WITH_NEW_CANN=OFF - BUILD_WITH_NEW_CANN compile flag controls which interface is used The new interface provides forward compatibility and aligns with pypto's approach. The legacy interface remains as fallback for older CANN versions. 
Co-Authored-By: Claude Opus 4.6 --- .gitignore | 2 + src/a2a3/platform/onboard/host/CMakeLists.txt | 21 +- .../platform/onboard/host/aicpu_loader.cpp | 274 ++++++++++++++++++ src/a2a3/platform/onboard/host/aicpu_loader.h | 110 +++++++ .../platform/onboard/host/device_runner.cpp | 43 ++- .../platform/onboard/host/device_runner.h | 6 +- src/a5/platform/onboard/host/CMakeLists.txt | 21 +- src/a5/platform/onboard/host/aicpu_loader.cpp | 203 +++++++++++++ src/a5/platform/onboard/host/aicpu_loader.h | 108 +++++++ .../platform/onboard/host/device_runner.cpp | 43 ++- src/a5/platform/onboard/host/device_runner.h | 4 + 11 files changed, 780 insertions(+), 55 deletions(-) create mode 100644 src/a2a3/platform/onboard/host/aicpu_loader.cpp create mode 100644 src/a2a3/platform/onboard/host/aicpu_loader.h create mode 100644 src/a5/platform/onboard/host/aicpu_loader.cpp create mode 100644 src/a5/platform/onboard/host/aicpu_loader.h diff --git a/.gitignore b/.gitignore index 37d5e142b..ddba9caf7 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,8 @@ examples/scripts/_deps/ # Profiling files outputs tmp +profiling_logs_* +*.log # Mid-work documentation .docs diff --git a/src/a2a3/platform/onboard/host/CMakeLists.txt b/src/a2a3/platform/onboard/host/CMakeLists.txt index 12c86f4fd..ac31fd9c9 100644 --- a/src/a2a3/platform/onboard/host/CMakeLists.txt +++ b/src/a2a3/platform/onboard/host/CMakeLists.txt @@ -35,6 +35,7 @@ list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/pto_runtime_c_api.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/platform_compile_info.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/host_regs.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/aicpu_loader.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/host_log.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/unified_log_host.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/performance_collector.cpp" @@ -84,11 +85,16 @@ target_include_directories(host_runtime ${ASCEND_HOME_PATH}/${CMAKE_SYSTEM_PROCESSOR}-linux/include/driver ) 
-target_link_directories(host_runtime - PRIVATE - ${ASCEND_HOME_PATH}/lib64 - ${ASCEND_HOME_PATH}/runtime/lib64 -) +# Conditional compilation for new CANN interface +option(BUILD_WITH_NEW_CANN "Use new rtsLaunchCpuKernel interface (CANN 7.0+)" ON) +if(BUILD_WITH_NEW_CANN) + target_compile_definitions(host_runtime PRIVATE BUILD_WITH_NEW_CANN) + # Add additional include path for new RTS headers (CANN 7.0+) + target_include_directories(host_runtime PRIVATE + ${ASCEND_HOME_PATH}/pkg_inc/runtime/runtime + ) + message(STATUS "Building with new CANN rtsLaunchCpuKernel interface") +endif() # Link against CANN runtime libraries # ascend_hal is dynamically loaded at runtime via dlopen in device_runner @@ -100,4 +106,9 @@ target_link_libraries(host_runtime dl ) +target_link_directories(host_runtime + PRIVATE + ${ASCEND_HOME_PATH}/${CMAKE_SYSTEM_PROCESSOR}-linux/lib64 +) + set_target_properties(host_runtime PROPERTIES OUTPUT_NAME "host_runtime") diff --git a/src/a2a3/platform/onboard/host/aicpu_loader.cpp b/src/a2a3/platform/onboard/host/aicpu_loader.cpp new file mode 100644 index 000000000..0fb559ea2 --- /dev/null +++ b/src/a2a3/platform/onboard/host/aicpu_loader.cpp @@ -0,0 +1,274 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +/** + * AICPU Loader Implementation + */ + +#include "aicpu_loader.h" + +#include +#include +#include +#include +#include +#include + +#include "common/unified_log.h" +#include "common/kernel_args.h" + +#ifdef BUILD_WITH_NEW_CANN +// New CANN RTS header for rtsLaunchCpuKernel interface (CANN 7.0+) +#include "runtime/runtime/rts/rts_kernel.h" +#include "runtime/runtime/kernel.h" + +// Forward declarations for JSON structures +struct AicpuOpConfig { + std::string functionName; + std::string kernelSo; + std::string opKernelLib; + std::string computeCost = "100"; + std::string engine = "DNN_VM_AICPU"; + std::string flagAsync = "False"; + std::string flagPartial = "False"; + std::string userDefined = "False"; + std::string opType; +}; + +// Generate AICPU op info JSON file +static bool GenerateAicpuOpJson(const std::string& json_path, const std::vector& op_configs) { + std::ofstream json_file(json_path); + if (!json_file.is_open()) { + LOG_ERROR("Failed to open JSON file for writing: %s", json_path.c_str()); + return false; + } + + json_file << "{\n"; + for (size_t i = 0; i < op_configs.size(); ++i) { + const auto& config = op_configs[i]; + json_file << " \"" << config.opType << "\": {\n"; + json_file << " \"opInfo\": {\n"; + json_file << " \"functionName\": \"" << config.functionName << "\",\n"; + json_file << " \"kernelSo\": \"" << config.kernelSo << "\",\n"; + json_file << " \"opKernelLib\": \"" << config.opKernelLib << "\",\n"; + json_file << " \"computeCost\": \"" << config.computeCost << "\",\n"; + json_file << " \"engine\": \"" << config.engine << "\",\n"; + json_file << " \"flagAsync\": \"" << config.flagAsync << "\",\n"; + json_file << " \"flagPartial\": \"" << config.flagPartial << "\",\n"; + json_file << " \"userDefined\": \"" << config.userDefined << "\"\n"; + json_file << " }\n"; + json_file << " }" << (i < op_configs.size() - 1 ? 
"," : "") << "\n"; + } + json_file << "}\n"; + json_file.close(); + + LOG_INFO("Generated AICPU op info JSON: %s", json_path.c_str()); + return true; +} + +#endif + +int AicpuLoader::init_with_binary(const std::vector& aicpu_binary, const std::vector& kernel_names) { +#ifdef BUILD_WITH_NEW_CANN + // New interface: Load binary using JSON descriptor (pypto approach) + LOG_INFO("AicpuLoader: Using new rtsBinaryLoadFromFile + rtsLaunchCpuKernel interface"); + LOG_INFO("AicpuLoader: Binary size=%zu bytes", aicpu_binary.size()); + + // Step 1: Generate op info JSON at runtime (using only filename, not full path) + const char* tmp_dir = std::getenv("TMPDIR") ? std::getenv("TMPDIR") : "/tmp"; + std::string json_path_template = std::string(tmp_dir) + "/simpler_aicpu_op_info_XXXXXX.json"; + std::vector json_path_buffer(json_path_template.begin(), json_path_template.end()); + json_path_buffer.push_back('\0'); + + int json_fd = mkstemps(json_path_buffer.data(), 5); + if (json_fd == -1) { + LOG_ERROR("Failed to create temporary JSON file"); + return -1; + } + close(json_fd); + json_file_path_ = json_path_buffer.data(); + + // Map opType (used for rtsFuncGetByName) to functionName (actual symbol in .so) + std::unordered_map name_mapping = { + {"DynTileFwkKernelServerInit", "DynTileFwkBackendKernelServerInit"}, + {"DynTileFwkKernelServer", "DynTileFwkBackendKernelServer"} + }; + + // Create op configs for JSON generation + // kernelSo uses only filename - runtime will find it via library search path + std::vector op_configs; + for (const auto& name : kernel_names) { + AicpuOpConfig config; + config.opType = name; + config.functionName = name_mapping[name]; + config.kernelSo = "libaicpu_kernel.so"; // Filename only, runtime searches library path + config.opKernelLib = "KFCKernel"; + op_configs.push_back(config); + } + + // Generate JSON file + if (!GenerateAicpuOpJson(json_file_path_, op_configs)) { + return -1; + } + + // Step 2: Load binary handle from JSON: 
rtsBinaryLoadFromFile + // cpuKernelMode=0: JSON only mode, runtime finds .so via library search path + rtLoadBinaryOption_t option = {}; + option.optionId = RT_LOAD_BINARY_OPT_CPU_KERNEL_MODE; + option.value.cpuKernelMode = 0; + + rtLoadBinaryConfig_t load_config = {}; + load_config.options = &option; + load_config.numOpt = 1; + + rtError_t rc = rtsBinaryLoadFromFile(json_file_path_.c_str(), &load_config, &binary_handle_); + if (rc != RT_ERROR_NONE) { + LOG_ERROR("rtsBinaryLoadFromFile failed for %s: %d", json_file_path_.c_str(), rc); + return rc; + } + LOG_INFO("AicpuLoader: Loaded binary from JSON, handle=%p", binary_handle_); + + // Step 3: Resolve function handles: rtsFuncGetByName + for (const auto& name : kernel_names) { + rtFuncHandle func_handle = nullptr; + rc = rtsFuncGetByName(binary_handle_, name.c_str(), &func_handle); + if (rc != RT_ERROR_NONE) { + LOG_ERROR("rtsFuncGetByName failed for %s: %d", name.c_str(), rc); + return rc; + } + func_handles_[name] = func_handle; + LOG_INFO("AicpuLoader: Resolved function handle for %s: %p", name.c_str(), func_handle); + } + + return 0; + +#else + // Legacy interface: No pre-loading needed; both parameters are intentionally unused + (void)aicpu_binary; + (void)kernel_names; + LOG_INFO("AicpuLoader: Using legacy rtAicpuKernelLaunchExWithArgs interface"); + return 0; +#endif +} + +int AicpuLoader::init(const std::string& so_path, const std::vector& kernel_names) { +#ifdef BUILD_WITH_NEW_CANN + // New interface: Use init_with_binary() instead + // This init() is kept for backward compatibility but does nothing + (void)so_path; + (void)kernel_names; + LOG_INFO("AicpuLoader: Use init_with_binary() for new interface"); + return 0; +#else + // Legacy interface: No pre-loading needed + (void)so_path; + (void)kernel_names; + LOG_INFO("AicpuLoader: Using legacy rtAicpuKernelLaunchExWithArgs interface"); + return 0; +#endif +} + +int AicpuLoader::launch(rtStream_t stream, KernelArgs* k_args, const char* kernel_name, int aicpu_num) { +#ifdef BUILD_WITH_NEW_CANN + // New 
interface: rtsLaunchCpuKernel + auto it = func_handles_.find(kernel_name); + if (it == func_handles_.end()) { + LOG_ERROR("Kernel not found: %s", kernel_name); + return -1; + } + + rtFuncHandle func_handle = it->second; + + // Prepare args for new interface + struct Args { + KernelArgs k_args; + char kernel_name[64]; + char so_name[64]; + } args; + + args.k_args = *k_args; + std::strncpy(args.kernel_name, kernel_name, sizeof(args.kernel_name) - 1); + args.kernel_name[sizeof(args.kernel_name) - 1] = '\0'; + std::strncpy(args.so_name, "libaicpu_extend_kernels.so", sizeof(args.so_name) - 1); + args.so_name[sizeof(args.so_name) - 1] = '\0'; + + rtCpuKernelArgs_t cpu_args = {}; + cpu_args.baseArgs.args = &args; + cpu_args.baseArgs.argsSize = sizeof(args); + cpu_args.baseArgs.kernelNameAddrOffset = offsetof(struct Args, kernel_name); + cpu_args.baseArgs.soNameAddrOffset = offsetof(struct Args, so_name); + cpu_args.baseArgs.hostInputInfoPtr = nullptr; + cpu_args.baseArgs.kernelOffsetInfoPtr = nullptr; + cpu_args.baseArgs.hostInputInfoNum = 0; + cpu_args.baseArgs.kernelOffsetInfoNum = 0; + cpu_args.baseArgs.isNoNeedH2DCopy = 0; + cpu_args.baseArgs.timeout = 0; + cpu_args.cpuParamHeadOffset = 0; + + // Launch: rtsLaunchCpuKernel + rtError_t rc = rtsLaunchCpuKernel(func_handle, static_cast(aicpu_num), stream, nullptr, &cpu_args); + if (rc != RT_ERROR_NONE) { + LOG_ERROR("rtsLaunchCpuKernel failed for %s: %d", kernel_name, rc); + return rc; + } + + return 0; + +#else + // Legacy interface: rtAicpuKernelLaunchExWithArgs + struct Args { + KernelArgs k_args; + char kernel_name[32]; + const char so_name[32] = {"libaicpu_extend_kernels.so"}; + const char op_name[32] = {""}; + } args; + + args.k_args = *k_args; + std::strncpy(args.kernel_name, kernel_name, sizeof(args.kernel_name) - 1); + args.kernel_name[sizeof(args.kernel_name) - 1] = '\0'; + + rtAicpuArgsEx_t rt_args; + std::memset(&rt_args, 0, sizeof(rt_args)); + rt_args.args = &args; + rt_args.argsSize = sizeof(args); + 
rt_args.kernelNameAddrOffset = offsetof(struct Args, kernel_name); + rt_args.soNameAddrOffset = offsetof(struct Args, so_name); + + return rtAicpuKernelLaunchExWithArgs( + rtKernelType_t::KERNEL_TYPE_AICPU_KFC, "AST_DYN_AICPU", aicpu_num, &rt_args, nullptr, stream, 0 + ); +#endif +} + +void AicpuLoader::finalize() { +#ifdef BUILD_WITH_NEW_CANN + // New interface: Unload binary and clear handles + if (binary_handle_ != nullptr) { + rtError_t rc = rtsBinaryUnload(binary_handle_); + if (rc != RT_ERROR_NONE) { + LOG_WARN("rtsBinaryUnload failed: %d", rc); + } + binary_handle_ = nullptr; + } + func_handles_.clear(); + + // Delete temporary JSON file if it was created + if (!json_file_path_.empty()) { + std::remove(json_file_path_.c_str()); + LOG_INFO("AicpuLoader: Deleted temporary JSON file: %s", json_file_path_.c_str()); + json_file_path_.clear(); + } + + LOG_INFO("AicpuLoader: Finalized new interface"); +#else + // Legacy interface: No-op + (void)this; // Suppress unused warning +#endif +} diff --git a/src/a2a3/platform/onboard/host/aicpu_loader.h b/src/a2a3/platform/onboard/host/aicpu_loader.h new file mode 100644 index 000000000..de67f567d --- /dev/null +++ b/src/a2a3/platform/onboard/host/aicpu_loader.h @@ -0,0 +1,110 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +/** + * AICPU Loader Abstraction + * + * This file provides an abstraction layer for AICPU kernel launching that supports + * both the legacy rtAicpuKernelLaunchExWithArgs API and the new rtsLaunchCpuKernel + * interface available in newer CANN versions. + * + * The interface used is controlled by the BUILD_WITH_NEW_CANN compile flag: + * - When undefined or OFF: Uses legacy rtAicpuKernelLaunchExWithArgs + * - When ON: Uses new rtsLaunchCpuKernel / rtsBinaryLoadFromFile / rtsFuncGetByName + */ + +#ifndef A2A3_PLATFORM_ONBOARD_HOST_AICPU_LOADER_H_ +#define A2A3_PLATFORM_ONBOARD_HOST_AICPU_LOADER_H_ + +#include +#include +#include +#include + +#include + +// Forward declarations +struct KernelArgs; + +/** + * @brief AICPU kernel loader abstraction + * + * Supports both legacy and new CANN AICPU launch interfaces through conditional compilation. + */ +class AicpuLoader { +public: + AicpuLoader() = default; + ~AicpuLoader() = default; + + /** + * @brief Initialize the AICPU loader with binary data + * + * For the new interface (BUILD_WITH_NEW_CANN=ON), this generates a JSON descriptor + * and loads the binary using rtsBinaryLoadFromFile. The .so file is referenced by + * filename only (libaicpu_kernel.so) and must be findable via library search path. + * For the legacy interface (BUILD_WITH_NEW_CANN=OFF), this is a no-op. + * + * @param aicpu_binary Binary data of the AICPU shared library (contents unused; only its size is logged by the new interface) + * @param kernel_names List of kernel function names to resolve + * @return 0 on success, error code on failure + */ + int init_with_binary(const std::vector& aicpu_binary, const std::vector& kernel_names); + + /** + * @brief Initialize the AICPU loader (legacy interface compatibility) + * + * For the new interface (BUILD_WITH_NEW_CANN=ON), this only logs a notice; call init_with_binary() instead. 
+ * For the legacy interface (BUILD_WITH_NEW_CANN=OFF), this is a no-op. + * + * @param so_path Path to the AICPU shared library (currently ignored by both interfaces) + * @param kernel_names List of kernel function names (currently ignored by both interfaces) + * @return 0 on success, error code on failure + */ + int init(const std::string& so_path, const std::vector& kernel_names); + + /** + * @brief Launch an AICPU kernel + * + * Unified interface that delegates to either legacy or new implementation. + * + * @param stream Runtime stream (rtStream_t) the kernel is launched on + * @param k_args Kernel arguments + * @param kernel_name Name of the kernel to launch + * @param aicpu_num Number of AICPU instances to launch + * @return 0 on success, error code on failure + */ + int launch(rtStream_t stream, KernelArgs* k_args, const char* kernel_name, int aicpu_num); + + /** + * @brief Cleanup resources + * + * For the new interface, this unloads the binary, clears handles and deletes the temporary JSON file. + * For the legacy interface, this is a no-op. + */ + void finalize(); + + // Disable copy and move + AicpuLoader(const AicpuLoader&) = delete; + AicpuLoader& operator=(const AicpuLoader&) = delete; + AicpuLoader(AicpuLoader&&) = delete; + AicpuLoader& operator=(AicpuLoader&&) = delete; + +private: +#ifdef BUILD_WITH_NEW_CANN + // New interface members + void* binary_handle_ = nullptr; // Binary handle from rtsBinaryLoadFromFile + std::unordered_map func_handles_; // Function handles (kernel_name -> func_handle) + std::string json_file_path_; // Path to temporary JSON descriptor file +#else + // Legacy interface - no state needed +#endif +}; + +#endif // A2A3_PLATFORM_ONBOARD_HOST_AICPU_LOADER_H_ diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index 41d2235c8..3551c856b 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -313,8 +313,24 @@ int DeviceRunner::ensure_binaries_loaded( aicore_kernel_binary_ = aicore_kernel_binary; 
+#ifdef BUILD_WITH_NEW_CANN + // New interface: Initialize AICPU loader with binary data + const std::vector kernel_names = { + "DynTileFwkKernelServerInit", + "DynTileFwkKernelServer" + }; + int rc = aicpu_loader_.init_with_binary(aicpu_so_binary, kernel_names); + if (rc != 0) { + LOG_ERROR("AicpuLoader init_with_binary failed: %d", rc); + return rc; + } + LOG_INFO("DeviceRunner: AICPU loader initialized with %zu bytes of binary data", aicpu_so_binary.size()); +#else + int rc = 0; +#endif + // Load AICPU SO - int rc = so_info_.init(aicpu_so_binary, mem_alloc_); + rc = so_info_.init(aicpu_so_binary, mem_alloc_); if (rc != 0) { LOG_ERROR("AicpuSoInfo::init failed: %d", rc); return rc; @@ -607,6 +623,9 @@ int DeviceRunner::finalize() { // Cleanup AICPU SO so_info_.finalize(); + // Cleanup AICPU loader + aicpu_loader_.finalize(); + // Kernel binaries should have been removed by validate_runtime_impl() if (!func_id_to_addr_.empty()) { LOG_ERROR("finalize() called with %zu kernel binaries still cached (memory leak)", func_id_to_addr_.size()); @@ -659,27 +678,7 @@ int DeviceRunner::finalize() { } int DeviceRunner::launch_aicpu_kernel(rtStream_t stream, KernelArgs *k_args, const char *kernel_name, int aicpu_num) { - struct Args { - KernelArgs k_args; - char kernel_name[32]; - const char so_name[32] = {"libaicpu_extend_kernels.so"}; - const char op_name[32] = {""}; - } args; - - args.k_args = *k_args; - std::strncpy(args.kernel_name, kernel_name, sizeof(args.kernel_name) - 1); - args.kernel_name[sizeof(args.kernel_name) - 1] = '\0'; - - rtAicpuArgsEx_t rt_args; - std::memset(&rt_args, 0, sizeof(rt_args)); - rt_args.args = &args; - rt_args.argsSize = sizeof(args); - rt_args.kernelNameAddrOffset = offsetof(struct Args, kernel_name); - rt_args.soNameAddrOffset = offsetof(struct Args, so_name); - - return rtAicpuKernelLaunchExWithArgs( - rtKernelType_t::KERNEL_TYPE_AICPU_KFC, "AST_DYN_AICPU", aicpu_num, &rt_args, nullptr, stream, 0 - ); + return aicpu_loader_.launch(stream, 
k_args, kernel_name, aicpu_num); } int DeviceRunner::launch_aicore_kernel(rtStream_t stream, KernelArgs *k_args) { diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index 0c7598363..cfafff928 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -43,6 +43,7 @@ #include "host/function_cache.h" #include "host/memory_allocator.h" #include "host/performance_collector.h" +#include "aicpu_loader.h" #include "runtime.h" /** @@ -252,7 +253,7 @@ class DeviceRunner { */ int run(Runtime &runtime, int block_dim, int device_id, const std::vector &aicpu_so_binary, - const std::vector &aicore_kernel_binary, int launch_aicpu_num = 1); + const std::vector &aicore_kernel_binary, int launch_aicpu_num); /** * Print handshake results from device @@ -380,6 +381,9 @@ class DeviceRunner { int worker_count_{0}; // Stored for print_handshake_results in destructor std::vector aicore_kernel_binary_; + // AICPU loader abstraction (supports both legacy and new CANN interfaces) + AicpuLoader aicpu_loader_; + // Memory management MemoryAllocator mem_alloc_; diff --git a/src/a5/platform/onboard/host/CMakeLists.txt b/src/a5/platform/onboard/host/CMakeLists.txt index 12c86f4fd..ac31fd9c9 100644 --- a/src/a5/platform/onboard/host/CMakeLists.txt +++ b/src/a5/platform/onboard/host/CMakeLists.txt @@ -35,6 +35,7 @@ list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/pto_runtime_c_api.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/platform_compile_info.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/host_regs.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/aicpu_loader.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/host_log.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/unified_log_host.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/performance_collector.cpp" @@ -84,11 +85,16 @@ target_include_directories(host_runtime ${ASCEND_HOME_PATH}/${CMAKE_SYSTEM_PROCESSOR}-linux/include/driver ) 
-target_link_directories(host_runtime - PRIVATE - ${ASCEND_HOME_PATH}/lib64 - ${ASCEND_HOME_PATH}/runtime/lib64 -) +# Conditional compilation for new CANN interface +option(BUILD_WITH_NEW_CANN "Use new rtsLaunchCpuKernel interface (CANN 7.0+)" ON) +if(BUILD_WITH_NEW_CANN) + target_compile_definitions(host_runtime PRIVATE BUILD_WITH_NEW_CANN) + # Add additional include path for new RTS headers (CANN 7.0+) + target_include_directories(host_runtime PRIVATE + ${ASCEND_HOME_PATH}/pkg_inc/runtime/runtime + ) + message(STATUS "Building with new CANN rtsLaunchCpuKernel interface") +endif() # Link against CANN runtime libraries # ascend_hal is dynamically loaded at runtime via dlopen in device_runner @@ -100,4 +106,9 @@ target_link_libraries(host_runtime dl ) +target_link_directories(host_runtime + PRIVATE + ${ASCEND_HOME_PATH}/${CMAKE_SYSTEM_PROCESSOR}-linux/lib64 +) + set_target_properties(host_runtime PROPERTIES OUTPUT_NAME "host_runtime") diff --git a/src/a5/platform/onboard/host/aicpu_loader.cpp b/src/a5/platform/onboard/host/aicpu_loader.cpp new file mode 100644 index 000000000..5c0470169 --- /dev/null +++ b/src/a5/platform/onboard/host/aicpu_loader.cpp @@ -0,0 +1,203 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +/** + * AICPU Loader Implementation + */ + +#include "aicpu_loader.h" + +#include +#include +#include + +#include "common/unified_log.h" +#include "common/kernel_args.h" + +#ifdef BUILD_WITH_NEW_CANN +// New CANN RTS header for rtsLaunchCpuKernel interface (CANN 7.0+) +#include "runtime/runtime/rts/rts_kernel.h" +#include "runtime/runtime/kernel.h" +#endif + +int AicpuLoader::init_with_binary(const std::vector& aicpu_so_binary, const std::vector& kernel_names) { +#ifdef BUILD_WITH_NEW_CANN + // New interface: Load binary from memory and resolve function handles + LOG_INFO("AicpuLoader: Using new rtsBinaryLoadFromData + rtsLaunchCpuKernel interface"); + + if (aicpu_so_binary.empty()) { + LOG_ERROR("AicpuLoader: AICPU binary is empty"); + return -1; + } + + // 1. Load binary from memory: rtsBinaryLoadFromData + // Try with CPU kernel mode configuration + rtLoadBinaryOption_t option = {}; + option.optionId = RT_LOAD_BINARY_OPT_CPU_KERNEL_MODE; + option.value.cpuKernelMode = 1; // Load CPU so & json + + rtLoadBinaryConfig_t load_config = {}; + load_config.options = &option; + load_config.numOpt = 1; + + rtError_t rc = rtsBinaryLoadFromData( + aicpu_so_binary.data(), + aicpu_so_binary.size(), + &load_config, + &binary_handle_ + ); + if (rc != RT_ERROR_NONE) { + LOG_ERROR("rtsBinaryLoadFromData failed: %d (binary size=%zu)", rc, aicpu_so_binary.size()); + return rc; + } + LOG_INFO("AicpuLoader: Loaded binary from memory, handle=%p, size=%zu", binary_handle_, aicpu_so_binary.size()); + + // Map kernel names to backend versions (actual symbol names in the .so) + std::unordered_map name_mapping = { + {"DynTileFwkKernelServerInit", "DynTileFwkBackendKernelServerInit"}, + {"DynTileFwkKernelServer", "DynTileFwkBackendKernelServer"} + }; + + // 2. 
Resolve function handles: rtsFuncGetByName + for (const auto& name : kernel_names) { + // Map to the actual symbol name + std::string actual_name = name; + auto it = name_mapping.find(name); + if (it != name_mapping.end()) { + actual_name = it->second; + } + + rtFuncHandle func_handle = nullptr; + rc = rtsFuncGetByName(binary_handle_, actual_name.c_str(), &func_handle); + if (rc != RT_ERROR_NONE) { + LOG_ERROR("rtsFuncGetByName failed for %s (mapped from %s): %d", actual_name.c_str(), name.c_str(), rc); + return rc; + } + func_handles_[name] = func_handle; // Store with original name for lookup + LOG_INFO("AicpuLoader: Resolved function handle for %s -> %s: %p", name.c_str(), actual_name.c_str(), func_handle); + } + + return 0; + +#else + // Legacy interface: No pre-loading needed + (void)aicpu_so_binary; + (void)kernel_names; + LOG_INFO("AicpuLoader: Using legacy rtAicpuKernelLaunchExWithArgs interface"); + return 0; +#endif +} + +int AicpuLoader::init(const std::string& so_path, const std::vector& kernel_names) { +#ifdef BUILD_WITH_NEW_CANN + // New interface: Store kernel names for later resolution + // Binary will be loaded via init_with_binary() + LOG_INFO("AicpuLoader: Using new rtsLaunchCpuKernel interface (binary not loaded yet)"); + (void)so_path; + (void)kernel_names; + return 0; // Binary will be loaded separately via init_with_binary() +#else + // Legacy interface: No pre-loading needed + (void)so_path; + (void)kernel_names; + LOG_INFO("AicpuLoader: Using legacy rtAicpuKernelLaunchExWithArgs interface"); + return 0; +#endif +} + +int AicpuLoader::launch(rtStream_t stream, KernelArgs* k_args, const char* kernel_name, int aicpu_num) { +#ifdef BUILD_WITH_NEW_CANN + // New interface: rtsLaunchCpuKernel + auto it = func_handles_.find(kernel_name); + if (it == func_handles_.end()) { + LOG_ERROR("Kernel not found: %s", kernel_name); + return -1; + } + + rtFuncHandle func_handle = it->second; + + // Prepare args for new interface + struct Args { + KernelArgs 
k_args; + char kernel_name[64]; + char so_name[64]; + } args; + + args.k_args = *k_args; + std::strncpy(args.kernel_name, kernel_name, sizeof(args.kernel_name) - 1); + args.kernel_name[sizeof(args.kernel_name) - 1] = '\0'; + std::strncpy(args.so_name, "libaicpu_extend_kernels.so", sizeof(args.so_name) - 1); + args.so_name[sizeof(args.so_name) - 1] = '\0'; + + rtCpuKernelArgs_t cpu_args = {}; + cpu_args.baseArgs.args = &args; + cpu_args.baseArgs.argsSize = sizeof(args); + cpu_args.baseArgs.kernelNameAddrOffset = offsetof(struct Args, kernel_name); + cpu_args.baseArgs.soNameAddrOffset = offsetof(struct Args, so_name); + cpu_args.baseArgs.hostInputInfoPtr = nullptr; + cpu_args.baseArgs.kernelOffsetInfoPtr = nullptr; + cpu_args.baseArgs.hostInputInfoNum = 0; + cpu_args.baseArgs.kernelOffsetInfoNum = 0; + cpu_args.baseArgs.isNoNeedH2DCopy = 0; + cpu_args.baseArgs.timeout = 0; + cpu_args.cpuParamHeadOffset = 0; + + // Launch: rtsLaunchCpuKernel + rtError_t rc = rtsLaunchCpuKernel(func_handle, static_cast(aicpu_num), stream, nullptr, &cpu_args); + if (rc != RT_ERROR_NONE) { + LOG_ERROR("rtsLaunchCpuKernel failed for %s: %d", kernel_name, rc); + return rc; + } + + return 0; + +#else + // Legacy interface: rtAicpuKernelLaunchExWithArgs + struct Args { + KernelArgs k_args; + char kernel_name[32]; + const char so_name[32] = {"libaicpu_extend_kernels.so"}; + const char op_name[32] = {""}; + } args; + + args.k_args = *k_args; + std::strncpy(args.kernel_name, kernel_name, sizeof(args.kernel_name) - 1); + args.kernel_name[sizeof(args.kernel_name) - 1] = '\0'; + + rtAicpuArgsEx_t rt_args; + std::memset(&rt_args, 0, sizeof(rt_args)); + rt_args.args = &args; + rt_args.argsSize = sizeof(args); + rt_args.kernelNameAddrOffset = offsetof(struct Args, kernel_name); + rt_args.soNameAddrOffset = offsetof(struct Args, so_name); + + return rtAicpuKernelLaunchExWithArgs( + rtKernelType_t::KERNEL_TYPE_AICPU_KFC, "AST_DYN_AICPU", aicpu_num, &rt_args, nullptr, stream, 0 + ); +#endif +} + +void 
AicpuLoader::finalize() { +#ifdef BUILD_WITH_NEW_CANN + // New interface: Unload binary and clear handles + if (binary_handle_ != nullptr) { + rtError_t rc = rtsBinaryUnload(binary_handle_); + if (rc != RT_ERROR_NONE) { + LOG_WARN("rtsBinaryUnload failed: %d", rc); + } + binary_handle_ = nullptr; + } + func_handles_.clear(); + LOG_INFO("AicpuLoader: Finalized new interface"); +#else + // Legacy interface: No-op + (void)this; // Suppress unused warning +#endif +} diff --git a/src/a5/platform/onboard/host/aicpu_loader.h b/src/a5/platform/onboard/host/aicpu_loader.h new file mode 100644 index 000000000..c2929b6d7 --- /dev/null +++ b/src/a5/platform/onboard/host/aicpu_loader.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * AICPU Loader Abstraction + * + * This file provides an abstraction layer for AICPU kernel launching that supports + * both the legacy rtAicpuKernelLaunchExWithArgs API and the new rtsLaunchCpuKernel + * interface available in newer CANN versions. 
+ * + * The interface used is controlled by the BUILD_WITH_NEW_CANN compile flag: + * - When undefined or OFF: Uses legacy rtAicpuKernelLaunchExWithArgs + * - When ON: Uses new rtsLaunchCpuKernel / rtsBinaryLoadFromData / rtsFuncGetByName + */ + +#ifndef A5_PLATFORM_ONBOARD_HOST_AICPU_LOADER_H_ +#define A5_PLATFORM_ONBOARD_HOST_AICPU_LOADER_H_ + +#include +#include +#include +#include + +#include + +// Forward declarations +struct KernelArgs; + +/** + * @brief AICPU kernel loader abstraction + * + * Supports both legacy and new CANN AICPU launch interfaces through conditional compilation. + */ +class AicpuLoader { +public: + AicpuLoader() = default; + ~AicpuLoader() = default; + + /** + * @brief Initialize the AICPU loader with binary data + * + * For the new interface (BUILD_WITH_NEW_CANN=ON), this loads the AICPU binary from memory + * and resolves function handles using rtsBinaryLoadFromData and rtsFuncGetByName. + * For the legacy interface (BUILD_WITH_NEW_CANN=OFF), this is a no-op. + * + * @param aicpu_so_binary Binary data of the AICPU shared object + * @param kernel_names List of kernel function names to resolve (used for mapping) + * @return 0 on success, error code on failure + */ + int init_with_binary(const std::vector& aicpu_so_binary, const std::vector& kernel_names); + + /** + * @brief Initialize the AICPU loader (legacy interface compatibility) + * + * For the new interface (BUILD_WITH_NEW_CANN=ON), this only logs a notice; call init_with_binary() instead. + * For the legacy interface (BUILD_WITH_NEW_CANN=OFF), this is a no-op. + * + * @param so_path Path to the AICPU shared library (currently ignored by both interfaces) + * @param kernel_names List of kernel function names (currently ignored by both interfaces) + * @return 0 on success, error code on failure + */ + int init(const std::string& so_path, const std::vector& kernel_names); + + /** + * @brief Launch an AICPU kernel + * + * Unified interface that delegates to either legacy or new implementation. 
+ * + * @param stream Runtime stream (rtStream_t) the kernel is launched on + * @param k_args Kernel arguments + * @param kernel_name Name of the kernel to launch + * @param aicpu_num Number of AICPU instances to launch + * @return 0 on success, error code on failure + */ + int launch(rtStream_t stream, KernelArgs* k_args, const char* kernel_name, int aicpu_num); + + /** + * @brief Cleanup resources + * + * For the new interface, this unloads the binary and clears handles. + * For the legacy interface, this is a no-op. + */ + void finalize(); + + // Disable copy and move + AicpuLoader(const AicpuLoader&) = delete; + AicpuLoader& operator=(const AicpuLoader&) = delete; + AicpuLoader(AicpuLoader&&) = delete; + AicpuLoader& operator=(AicpuLoader&&) = delete; + +private: +#ifdef BUILD_WITH_NEW_CANN + // New interface members + void* binary_handle_ = nullptr; // Binary handle from rtsBinaryLoadFromData + std::unordered_map func_handles_; // Function handles +#else + // Legacy interface - no state needed +#endif +}; + +#endif // A5_PLATFORM_ONBOARD_HOST_AICPU_LOADER_H_ diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp index e451a5efa..d4cfc7f80 100644 --- a/src/a5/platform/onboard/host/device_runner.cpp +++ b/src/a5/platform/onboard/host/device_runner.cpp @@ -231,8 +231,24 @@ int DeviceRunner::ensure_binaries_loaded( aicore_kernel_binary_ = aicore_kernel_binary; +#ifdef BUILD_WITH_NEW_CANN + // New interface: Initialize AICPU loader with binary data + const std::vector kernel_names = { + "DynTileFwkKernelServerInit", + "DynTileFwkKernelServer" + }; + int rc = aicpu_loader_.init_with_binary(aicpu_so_binary, kernel_names); + if (rc != 0) { + LOG_ERROR("AicpuLoader init_with_binary failed: %d", rc); + return rc; + } + LOG_INFO("DeviceRunner: AICPU loader initialized with binary data"); +#else + int rc = 0; +#endif + // Load AICPU SO - int rc = so_info_.init(aicpu_so_binary, mem_alloc_); + rc = so_info_.init(aicpu_so_binary, mem_alloc_); 
if (rc != 0) { LOG_ERROR("AicpuSoInfo::init failed: %d", rc); return rc; @@ -480,6 +496,9 @@ int DeviceRunner::finalize() { // Cleanup AICPU SO so_info_.finalize(); + // Cleanup AICPU loader + aicpu_loader_.finalize(); + // Kernel binaries should have been removed by validate_runtime_impl() if (!func_id_to_addr_.empty()) { LOG_ERROR("finalize() called with %zu kernel binaries still cached (memory leak)", func_id_to_addr_.size()); @@ -520,27 +539,7 @@ int DeviceRunner::finalize() { } int DeviceRunner::launch_aicpu_kernel(rtStream_t stream, KernelArgs *k_args, const char *kernel_name, int aicpu_num) { - struct Args { - KernelArgs k_args; - char kernel_name[32]; - const char so_name[32] = {"libaicpu_extend_kernels.so"}; - const char op_name[32] = {""}; - } args; - - args.k_args = *k_args; - std::strncpy(args.kernel_name, kernel_name, sizeof(args.kernel_name) - 1); - args.kernel_name[sizeof(args.kernel_name) - 1] = '\0'; - - rtAicpuArgsEx_t rt_args; - std::memset(&rt_args, 0, sizeof(rt_args)); - rt_args.args = &args; - rt_args.argsSize = sizeof(args); - rt_args.kernelNameAddrOffset = offsetof(struct Args, kernel_name); - rt_args.soNameAddrOffset = offsetof(struct Args, so_name); - - return rtAicpuKernelLaunchExWithArgs( - rtKernelType_t::KERNEL_TYPE_AICPU_KFC, "AST_DYN_AICPU", aicpu_num, &rt_args, nullptr, stream, 0 - ); + return aicpu_loader_.launch(stream, k_args, kernel_name, aicpu_num); } int DeviceRunner::launch_aicore_kernel(rtStream_t stream, Runtime *runtime) { diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h index 6658f7221..c94595e8f 100644 --- a/src/a5/platform/onboard/host/device_runner.h +++ b/src/a5/platform/onboard/host/device_runner.h @@ -43,6 +43,7 @@ #include "host/function_cache.h" #include "host/memory_allocator.h" #include "host/performance_collector.h" +#include "aicpu_loader.h" #include "runtime.h" /** @@ -342,6 +343,9 @@ class DeviceRunner { int worker_count_{0}; // Stored for 
print_handshake_results in destructor std::vector aicore_kernel_binary_; + // AICPU loader abstraction (supports both legacy and new CANN interfaces) + AicpuLoader aicpu_loader_; + // Memory management MemoryAllocator mem_alloc_; From d1f3f9cd879a9553383ec202294f918c4d800f6e Mon Sep 17 00:00:00 2001 From: puddingfjz <2811443837@qq.com> Date: Mon, 13 Apr 2026 18:02:10 +0800 Subject: [PATCH 2/6] Fix: migrate a5 AICPU launch to use JSON-based rtsBinaryLoadFromFile Align a5 implementation with a2a3 by using rtsBinaryLoadFromFile with cpuKernelMode=0 instead of rtsBinaryLoadFromData. This matches the pypto approach and ensures consistency across platforms. Changes: - Use rtsBinaryLoadFromFile + JSON descriptor (cpuKernelMode=0) - Generate temporary JSON file with filename-only .so reference - Add json_file_path_ member for cleanup Fixes #356 Co-Authored-By: Claude Opus 4.6 --- src/a5/platform/onboard/host/aicpu_loader.cpp | 151 +++++++++++++----- src/a5/platform/onboard/host/aicpu_loader.h | 24 +-- 2 files changed, 124 insertions(+), 51 deletions(-) diff --git a/src/a5/platform/onboard/host/aicpu_loader.cpp b/src/a5/platform/onboard/host/aicpu_loader.cpp index 5c0470169..0fb559ea2 100644 --- a/src/a5/platform/onboard/host/aicpu_loader.cpp +++ b/src/a5/platform/onboard/host/aicpu_loader.cpp @@ -3,7 +3,7 @@ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of * CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR ANY KIND, EITHER EXPRESS OR IMPLIED, * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
* See LICENSE in the root of the software repository for the full text of the License. * ----------------------------------------------------------------------------------------------------------- @@ -16,7 +16,10 @@ #include #include +#include #include +#include +#include #include "common/unified_log.h" #include "common/kernel_args.h" @@ -25,70 +28,130 @@ // New CANN RTS header for rtsLaunchCpuKernel interface (CANN 7.0+) #include "runtime/runtime/rts/rts_kernel.h" #include "runtime/runtime/kernel.h" + +// Forward declarations for JSON structures +struct AicpuOpConfig { + std::string functionName; + std::string kernelSo; + std::string opKernelLib; + std::string computeCost = "100"; + std::string engine = "DNN_VM_AICPU"; + std::string flagAsync = "False"; + std::string flagPartial = "False"; + std::string userDefined = "False"; + std::string opType; +}; + +// Generate AICPU op info JSON file +static bool GenerateAicpuOpJson(const std::string& json_path, const std::vector& op_configs) { + std::ofstream json_file(json_path); + if (!json_file.is_open()) { + LOG_ERROR("Failed to open JSON file for writing: %s", json_path.c_str()); + return false; + } + + json_file << "{\n"; + for (size_t i = 0; i < op_configs.size(); ++i) { + const auto& config = op_configs[i]; + json_file << " \"" << config.opType << "\": {\n"; + json_file << " \"opInfo\": {\n"; + json_file << " \"functionName\": \"" << config.functionName << "\",\n"; + json_file << " \"kernelSo\": \"" << config.kernelSo << "\",\n"; + json_file << " \"opKernelLib\": \"" << config.opKernelLib << "\",\n"; + json_file << " \"computeCost\": \"" << config.computeCost << "\",\n"; + json_file << " \"engine\": \"" << config.engine << "\",\n"; + json_file << " \"flagAsync\": \"" << config.flagAsync << "\",\n"; + json_file << " \"flagPartial\": \"" << config.flagPartial << "\",\n"; + json_file << " \"userDefined\": \"" << config.userDefined << "\"\n"; + json_file << " }\n"; + json_file << " }" << (i < op_configs.size() - 1 ? 
"," : "") << "\n"; + } + json_file << "}\n"; + json_file.close(); + + LOG_INFO("Generated AICPU op info JSON: %s", json_path.c_str()); + return true; +} + #endif -int AicpuLoader::init_with_binary(const std::vector& aicpu_so_binary, const std::vector& kernel_names) { +int AicpuLoader::init_with_binary(const std::vector& aicpu_binary, const std::vector& kernel_names) { #ifdef BUILD_WITH_NEW_CANN - // New interface: Load binary from memory and resolve function handles - LOG_INFO("AicpuLoader: Using new rtsBinaryLoadFromData + rtsLaunchCpuKernel interface"); + // New interface: Load binary using JSON descriptor (pypto approach) + LOG_INFO("AicpuLoader: Using new rtsBinaryLoadFromFile + rtsLaunchCpuKernel interface"); + LOG_INFO("AicpuLoader: Binary size=%zu bytes", aicpu_binary.size()); + + // Step 1: Generate op info JSON at runtime (using only filename, not full path) + const char* tmp_dir = std::getenv("TMPDIR") ? std::getenv("TMPDIR") : "/tmp"; + std::string json_path_template = std::string(tmp_dir) + "/simpler_aicpu_op_info_XXXXXX.json"; + std::vector json_path_buffer(json_path_template.begin(), json_path_template.end()); + json_path_buffer.push_back('\0'); - if (aicpu_so_binary.empty()) { - LOG_ERROR("AicpuLoader: AICPU binary is empty"); + int json_fd = mkstemps(json_path_buffer.data(), 5); + if (json_fd == -1) { + LOG_ERROR("Failed to create temporary JSON file"); return -1; } + close(json_fd); + json_file_path_ = json_path_buffer.data(); + + // Map opType (used for rtsFuncGetByName) to functionName (actual symbol in .so) + std::unordered_map name_mapping = { + {"DynTileFwkKernelServerInit", "DynTileFwkBackendKernelServerInit"}, + {"DynTileFwkKernelServer", "DynTileFwkBackendKernelServer"} + }; + + // Create op configs for JSON generation + // kernelSo uses only filename - runtime will find it via library search path + std::vector op_configs; + for (const auto& name : kernel_names) { + AicpuOpConfig config; + config.opType = name; + config.functionName = 
name_mapping[name]; + config.kernelSo = "libaicpu_kernel.so"; // Filename only, runtime searches library path + config.opKernelLib = "KFCKernel"; + op_configs.push_back(config); + } - // 1. Load binary from memory: rtsBinaryLoadFromData - // Try with CPU kernel mode configuration + // Generate JSON file + if (!GenerateAicpuOpJson(json_file_path_, op_configs)) { + return -1; + } + + // Step 2: Load binary handle from JSON: rtsBinaryLoadFromFile + // cpuKernelMode=0: JSON only mode, runtime finds .so via library search path rtLoadBinaryOption_t option = {}; option.optionId = RT_LOAD_BINARY_OPT_CPU_KERNEL_MODE; - option.value.cpuKernelMode = 1; // Load CPU so & json + option.value.cpuKernelMode = 0; rtLoadBinaryConfig_t load_config = {}; load_config.options = &option; load_config.numOpt = 1; - rtError_t rc = rtsBinaryLoadFromData( - aicpu_so_binary.data(), - aicpu_so_binary.size(), - &load_config, - &binary_handle_ - ); + rtError_t rc = rtsBinaryLoadFromFile(json_file_path_.c_str(), &load_config, &binary_handle_); if (rc != RT_ERROR_NONE) { - LOG_ERROR("rtsBinaryLoadFromData failed: %d (binary size=%zu)", rc, aicpu_so_binary.size()); + LOG_ERROR("rtsBinaryLoadFromFile failed for %s: %d", json_file_path_.c_str(), rc); return rc; } - LOG_INFO("AicpuLoader: Loaded binary from memory, handle=%p, size=%zu", binary_handle_, aicpu_so_binary.size()); - - // Map kernel names to backend versions (actual symbol names in the .so) - std::unordered_map name_mapping = { - {"DynTileFwkKernelServerInit", "DynTileFwkBackendKernelServerInit"}, - {"DynTileFwkKernelServer", "DynTileFwkBackendKernelServer"} - }; + LOG_INFO("AicpuLoader: Loaded binary from JSON, handle=%p", binary_handle_); - // 2. 
Resolve function handles: rtsFuncGetByName + // Step 3: Resolve function handles: rtsFuncGetByName for (const auto& name : kernel_names) { - // Map to the actual symbol name - std::string actual_name = name; - auto it = name_mapping.find(name); - if (it != name_mapping.end()) { - actual_name = it->second; - } - rtFuncHandle func_handle = nullptr; - rc = rtsFuncGetByName(binary_handle_, actual_name.c_str(), &func_handle); + rc = rtsFuncGetByName(binary_handle_, name.c_str(), &func_handle); if (rc != RT_ERROR_NONE) { - LOG_ERROR("rtsFuncGetByName failed for %s (mapped from %s): %d", actual_name.c_str(), name.c_str(), rc); + LOG_ERROR("rtsFuncGetByName failed for %s: %d", name.c_str(), rc); return rc; } - func_handles_[name] = func_handle; // Store with original name for lookup - LOG_INFO("AicpuLoader: Resolved function handle for %s -> %s: %p", name.c_str(), actual_name.c_str(), func_handle); + func_handles_[name] = func_handle; + LOG_INFO("AicpuLoader: Resolved function handle for %s: %p", name.c_str(), func_handle); } return 0; #else // Legacy interface: No pre-loading needed - (void)aicpu_so_binary; + (void)so_path; (void)kernel_names; LOG_INFO("AicpuLoader: Using legacy rtAicpuKernelLaunchExWithArgs interface"); return 0; @@ -97,12 +160,12 @@ int AicpuLoader::init_with_binary(const std::vector& aicpu_so_binary, c int AicpuLoader::init(const std::string& so_path, const std::vector& kernel_names) { #ifdef BUILD_WITH_NEW_CANN - // New interface: Store kernel names for later resolution - // Binary will be loaded via init_with_binary() - LOG_INFO("AicpuLoader: Using new rtsLaunchCpuKernel interface (binary not loaded yet)"); + // New interface: Use init_with_binary() instead + // This init() is kept for backward compatibility but does nothing (void)so_path; (void)kernel_names; - return 0; // Binary will be loaded separately via init_with_binary() + LOG_INFO("AicpuLoader: Use init_with_binary() for new interface"); + return 0; #else // Legacy interface: No pre-loading 
needed (void)so_path; @@ -195,6 +258,14 @@ void AicpuLoader::finalize() { binary_handle_ = nullptr; } func_handles_.clear(); + + // Delete temporary JSON file if it was created + if (!json_file_path_.empty()) { + std::remove(json_file_path_.c_str()); + LOG_INFO("AicpuLoader: Deleted temporary JSON file: %s", json_file_path_.c_str()); + json_file_path_.clear(); + } + LOG_INFO("AicpuLoader: Finalized new interface"); #else // Legacy interface: No-op diff --git a/src/a5/platform/onboard/host/aicpu_loader.h b/src/a5/platform/onboard/host/aicpu_loader.h index c2929b6d7..0be15b68f 100644 --- a/src/a5/platform/onboard/host/aicpu_loader.h +++ b/src/a5/platform/onboard/host/aicpu_loader.h @@ -3,7 +3,7 @@ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of * CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR ANY KIND, EITHER EXPRESS OR IMPLIED, * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. * ----------------------------------------------------------------------------------------------------------- @@ -46,24 +46,25 @@ class AicpuLoader { /** * @brief Initialize the AICPU loader with binary data * - * For the new interface (BUILD_WITH_NEW_CANN=ON), this loads the AICPU binary from memory - * and resolves function handles using rtsBinaryLoadFromData and rtsFuncGetByName. + * For the new interface (BUILD_WITH_NEW_CANN=ON), this generates a JSON descriptor + * and loads the binary using rtsBinaryLoadFromFile. 
The .so file is referenced by + * filename only (libaicpu_kernel.so) and must be findable via library search path. * For the legacy interface (BUILD_WITH_NEW_CANN=OFF), this is a no-op. * - * @param aicpu_so_binary Binary data of the AICPU shared object - * @param kernel_names List of kernel function names to resolve (used for mapping) + * @param aicpu_binary Binary data of the AICPU shared library (not used, kept for API compatibility) + * @param kernel_names List of kernel function names to resolve * @return 0 on success, error code on failure */ - int init_with_binary(const std::vector& aicpu_so_binary, const std::vector& kernel_names); + int init_with_binary(const std::vector& aicpu_binary, const std::vector& kernel_names); /** * @brief Initialize the AICPU loader (legacy interface compatibility) * - * For the new interface (BUILD_WITH_NEW_CANN=ON), this stores kernel names for later use. + * For the new interface (BUILD_WITH_NEW_CANN=ON), this does nothing. * For the legacy interface (BUILD_WITH_NEW_CANN=OFF), this is a no-op. 
* - * @param so_path Path to the AICPU shared library (not used in new interface) - * @param kernel_names List of kernel function names to resolve + * @param so_path Path to the AICPU shared library (not used) + * @param kernel_names List of kernel function names (not used) * @return 0 on success, error code on failure */ int init(const std::string& so_path, const std::vector& kernel_names); @@ -98,8 +99,9 @@ class AicpuLoader { private: #ifdef BUILD_WITH_NEW_CANN // New interface members - void* binary_handle_ = nullptr; // Binary handle from rtsBinaryLoadFromData - std::unordered_map func_handles_; // Function handles + void* binary_handle_ = nullptr; // Binary handle from rtsBinaryLoadFromFile + std::unordered_map func_handles_; // Function handles (kernel_name -> func_handle) + std::string json_file_path_; // Path to temporary JSON descriptor file #else // Legacy interface - no state needed #endif From 69ae837c833b484c872bc7649fdd72ee96c9aac4 Mon Sep 17 00:00:00 2001 From: puddingfjz <2811443837@qq.com> Date: Mon, 13 Apr 2026 18:14:25 +0800 Subject: [PATCH 3/6] Chore: remove test-related .gitignore entries Remove profiling_logs_* and *.log entries that were added during development and testing. These are not needed for the production codebase. 
Co-Authored-By: Claude Opus 4.6 --- .gitignore | 2 -- 1 file changed, 2 deletions(-) diff --git a/.gitignore b/.gitignore index ddba9caf7..37d5e142b 100644 --- a/.gitignore +++ b/.gitignore @@ -28,8 +28,6 @@ examples/scripts/_deps/ # Profiling files outputs tmp -profiling_logs_* -*.log # Mid-work documentation .docs From d904625998ffefade1fb91fad1a947b64b79956b Mon Sep 17 00:00:00 2001 From: puddingfjz <2811443837@qq.com> Date: Mon, 13 Apr 2026 18:29:01 +0800 Subject: [PATCH 4/6] Style: apply clang-format and fix copyright headers - Apply clang-format pointer/reference spacing - Fix copyright header: "WITHOUT WARRANTIES OF ANY KIND" (not "OR") Co-Authored-By: Claude Opus 4.6 --- .../platform/onboard/host/aicpu_loader.cpp | 20 +++++++++-------- src/a2a3/platform/onboard/host/aicpu_loader.h | 20 ++++++++--------- .../platform/onboard/host/device_runner.cpp | 5 +---- src/a5/platform/onboard/host/aicpu_loader.cpp | 20 +++++++++-------- src/a5/platform/onboard/host/aicpu_loader.h | 22 +++++++++---------- .../platform/onboard/host/device_runner.cpp | 5 +---- 6 files changed, 45 insertions(+), 47 deletions(-) diff --git a/src/a2a3/platform/onboard/host/aicpu_loader.cpp b/src/a2a3/platform/onboard/host/aicpu_loader.cpp index 0fb559ea2..a1bab7bfe 100644 --- a/src/a2a3/platform/onboard/host/aicpu_loader.cpp +++ b/src/a2a3/platform/onboard/host/aicpu_loader.cpp @@ -3,7 +3,7 @@ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of * CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR ANY KIND, EITHER EXPRESS OR IMPLIED, + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
* See LICENSE in the root of the software repository for the full text of the License. * ----------------------------------------------------------------------------------------------------------- @@ -43,7 +43,7 @@ struct AicpuOpConfig { }; // Generate AICPU op info JSON file -static bool GenerateAicpuOpJson(const std::string& json_path, const std::vector& op_configs) { +static bool GenerateAicpuOpJson(const std::string &json_path, const std::vector &op_configs) { std::ofstream json_file(json_path); if (!json_file.is_open()) { LOG_ERROR("Failed to open JSON file for writing: %s", json_path.c_str()); @@ -52,7 +52,7 @@ static bool GenerateAicpuOpJson(const std::string& json_path, const std::vector< json_file << "{\n"; for (size_t i = 0; i < op_configs.size(); ++i) { - const auto& config = op_configs[i]; + const auto &config = op_configs[i]; json_file << " \"" << config.opType << "\": {\n"; json_file << " \"opInfo\": {\n"; json_file << " \"functionName\": \"" << config.functionName << "\",\n"; @@ -75,14 +75,16 @@ static bool GenerateAicpuOpJson(const std::string& json_path, const std::vector< #endif -int AicpuLoader::init_with_binary(const std::vector& aicpu_binary, const std::vector& kernel_names) { +int AicpuLoader::init_with_binary( + const std::vector &aicpu_binary, const std::vector &kernel_names +) { #ifdef BUILD_WITH_NEW_CANN // New interface: Load binary using JSON descriptor (pypto approach) LOG_INFO("AicpuLoader: Using new rtsBinaryLoadFromFile + rtsLaunchCpuKernel interface"); LOG_INFO("AicpuLoader: Binary size=%zu bytes", aicpu_binary.size()); // Step 1: Generate op info JSON at runtime (using only filename, not full path) - const char* tmp_dir = std::getenv("TMPDIR") ? std::getenv("TMPDIR") : "/tmp"; + const char *tmp_dir = std::getenv("TMPDIR") ? 
std::getenv("TMPDIR") : "/tmp"; std::string json_path_template = std::string(tmp_dir) + "/simpler_aicpu_op_info_XXXXXX.json"; std::vector json_path_buffer(json_path_template.begin(), json_path_template.end()); json_path_buffer.push_back('\0'); @@ -104,7 +106,7 @@ int AicpuLoader::init_with_binary(const std::vector& aicpu_binary, cons // Create op configs for JSON generation // kernelSo uses only filename - runtime will find it via library search path std::vector op_configs; - for (const auto& name : kernel_names) { + for (const auto &name : kernel_names) { AicpuOpConfig config; config.opType = name; config.functionName = name_mapping[name]; @@ -136,7 +138,7 @@ int AicpuLoader::init_with_binary(const std::vector& aicpu_binary, cons LOG_INFO("AicpuLoader: Loaded binary from JSON, handle=%p", binary_handle_); // Step 3: Resolve function handles: rtsFuncGetByName - for (const auto& name : kernel_names) { + for (const auto &name : kernel_names) { rtFuncHandle func_handle = nullptr; rc = rtsFuncGetByName(binary_handle_, name.c_str(), &func_handle); if (rc != RT_ERROR_NONE) { @@ -158,7 +160,7 @@ int AicpuLoader::init_with_binary(const std::vector& aicpu_binary, cons #endif } -int AicpuLoader::init(const std::string& so_path, const std::vector& kernel_names) { +int AicpuLoader::init(const std::string &so_path, const std::vector &kernel_names) { #ifdef BUILD_WITH_NEW_CANN // New interface: Use init_with_binary() instead // This init() is kept for backward compatibility but does nothing @@ -175,7 +177,7 @@ int AicpuLoader::init(const std::string& so_path, const std::vector #endif } -int AicpuLoader::launch(rtStream_t stream, KernelArgs* k_args, const char* kernel_name, int aicpu_num) { +int AicpuLoader::launch(rtStream_t stream, KernelArgs *k_args, const char *kernel_name, int aicpu_num) { #ifdef BUILD_WITH_NEW_CANN // New interface: rtsLaunchCpuKernel auto it = func_handles_.find(kernel_name); diff --git a/src/a2a3/platform/onboard/host/aicpu_loader.h 
b/src/a2a3/platform/onboard/host/aicpu_loader.h index de67f567d..1ec949227 100644 --- a/src/a2a3/platform/onboard/host/aicpu_loader.h +++ b/src/a2a3/platform/onboard/host/aicpu_loader.h @@ -55,7 +55,7 @@ class AicpuLoader { * @param kernel_names List of kernel function names to resolve * @return 0 on success, error code on failure */ - int init_with_binary(const std::vector& aicpu_binary, const std::vector& kernel_names); + int init_with_binary(const std::vector &aicpu_binary, const std::vector &kernel_names); /** * @brief Initialize the AICPU loader (legacy interface compatibility) @@ -67,7 +67,7 @@ class AicpuLoader { * @param kernel_names List of kernel function names to resolve * @return 0 on success, error code on failure */ - int init(const std::string& so_path, const std::vector& kernel_names); + int init(const std::string &so_path, const std::vector &kernel_names); /** * @brief Launch an AICPU kernel @@ -80,7 +80,7 @@ class AicpuLoader { * @param aicpu_num Number of AICPU instances to launch * @return 0 on success, error code on failure */ - int launch(rtStream_t stream, KernelArgs* k_args, const char* kernel_name, int aicpu_num); + int launch(rtStream_t stream, KernelArgs *k_args, const char *kernel_name, int aicpu_num); /** * @brief Cleanup resources @@ -91,17 +91,17 @@ class AicpuLoader { void finalize(); // Disable copy and move - AicpuLoader(const AicpuLoader&) = delete; - AicpuLoader& operator=(const AicpuLoader&) = delete; - AicpuLoader(AicpuLoader&&) = delete; - AicpuLoader& operator=(AicpuLoader&&) = delete; + AicpuLoader(const AicpuLoader &) = delete; + AicpuLoader &operator=(const AicpuLoader &) = delete; + AicpuLoader(AicpuLoader &&) = delete; + AicpuLoader &operator=(AicpuLoader &&) = delete; private: #ifdef BUILD_WITH_NEW_CANN // New interface members - void* binary_handle_ = nullptr; // Binary handle from rtsBinaryLoadFromFile - std::unordered_map func_handles_; // Function handles (kernel_name -> func_handle) - std::string json_file_path_; 
// Path to temporary JSON descriptor file + void *binary_handle_ = nullptr; // Binary handle from rtsBinaryLoadFromFile + std::unordered_map func_handles_; // Function handles (kernel_name -> func_handle) + std::string json_file_path_; // Path to temporary JSON descriptor file #else // Legacy interface - no state needed #endif diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index 3551c856b..d5106e532 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -315,10 +315,7 @@ int DeviceRunner::ensure_binaries_loaded( #ifdef BUILD_WITH_NEW_CANN // New interface: Initialize AICPU loader with binary data - const std::vector kernel_names = { - "DynTileFwkKernelServerInit", - "DynTileFwkKernelServer" - }; + const std::vector kernel_names = {"DynTileFwkKernelServerInit", "DynTileFwkKernelServer"}; int rc = aicpu_loader_.init_with_binary(aicpu_so_binary, kernel_names); if (rc != 0) { LOG_ERROR("AicpuLoader init_with_binary failed: %d", rc); diff --git a/src/a5/platform/onboard/host/aicpu_loader.cpp b/src/a5/platform/onboard/host/aicpu_loader.cpp index 0fb559ea2..a1bab7bfe 100644 --- a/src/a5/platform/onboard/host/aicpu_loader.cpp +++ b/src/a5/platform/onboard/host/aicpu_loader.cpp @@ -3,7 +3,7 @@ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of * CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR ANY KIND, EITHER EXPRESS OR IMPLIED, + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
* See LICENSE in the root of the software repository for the full text of the License. * ----------------------------------------------------------------------------------------------------------- @@ -43,7 +43,7 @@ struct AicpuOpConfig { }; // Generate AICPU op info JSON file -static bool GenerateAicpuOpJson(const std::string& json_path, const std::vector& op_configs) { +static bool GenerateAicpuOpJson(const std::string &json_path, const std::vector &op_configs) { std::ofstream json_file(json_path); if (!json_file.is_open()) { LOG_ERROR("Failed to open JSON file for writing: %s", json_path.c_str()); @@ -52,7 +52,7 @@ static bool GenerateAicpuOpJson(const std::string& json_path, const std::vector< json_file << "{\n"; for (size_t i = 0; i < op_configs.size(); ++i) { - const auto& config = op_configs[i]; + const auto &config = op_configs[i]; json_file << " \"" << config.opType << "\": {\n"; json_file << " \"opInfo\": {\n"; json_file << " \"functionName\": \"" << config.functionName << "\",\n"; @@ -75,14 +75,16 @@ static bool GenerateAicpuOpJson(const std::string& json_path, const std::vector< #endif -int AicpuLoader::init_with_binary(const std::vector& aicpu_binary, const std::vector& kernel_names) { +int AicpuLoader::init_with_binary( + const std::vector &aicpu_binary, const std::vector &kernel_names +) { #ifdef BUILD_WITH_NEW_CANN // New interface: Load binary using JSON descriptor (pypto approach) LOG_INFO("AicpuLoader: Using new rtsBinaryLoadFromFile + rtsLaunchCpuKernel interface"); LOG_INFO("AicpuLoader: Binary size=%zu bytes", aicpu_binary.size()); // Step 1: Generate op info JSON at runtime (using only filename, not full path) - const char* tmp_dir = std::getenv("TMPDIR") ? std::getenv("TMPDIR") : "/tmp"; + const char *tmp_dir = std::getenv("TMPDIR") ? 
std::getenv("TMPDIR") : "/tmp"; std::string json_path_template = std::string(tmp_dir) + "/simpler_aicpu_op_info_XXXXXX.json"; std::vector json_path_buffer(json_path_template.begin(), json_path_template.end()); json_path_buffer.push_back('\0'); @@ -104,7 +106,7 @@ int AicpuLoader::init_with_binary(const std::vector& aicpu_binary, cons // Create op configs for JSON generation // kernelSo uses only filename - runtime will find it via library search path std::vector op_configs; - for (const auto& name : kernel_names) { + for (const auto &name : kernel_names) { AicpuOpConfig config; config.opType = name; config.functionName = name_mapping[name]; @@ -136,7 +138,7 @@ int AicpuLoader::init_with_binary(const std::vector& aicpu_binary, cons LOG_INFO("AicpuLoader: Loaded binary from JSON, handle=%p", binary_handle_); // Step 3: Resolve function handles: rtsFuncGetByName - for (const auto& name : kernel_names) { + for (const auto &name : kernel_names) { rtFuncHandle func_handle = nullptr; rc = rtsFuncGetByName(binary_handle_, name.c_str(), &func_handle); if (rc != RT_ERROR_NONE) { @@ -158,7 +160,7 @@ int AicpuLoader::init_with_binary(const std::vector& aicpu_binary, cons #endif } -int AicpuLoader::init(const std::string& so_path, const std::vector& kernel_names) { +int AicpuLoader::init(const std::string &so_path, const std::vector &kernel_names) { #ifdef BUILD_WITH_NEW_CANN // New interface: Use init_with_binary() instead // This init() is kept for backward compatibility but does nothing @@ -175,7 +177,7 @@ int AicpuLoader::init(const std::string& so_path, const std::vector #endif } -int AicpuLoader::launch(rtStream_t stream, KernelArgs* k_args, const char* kernel_name, int aicpu_num) { +int AicpuLoader::launch(rtStream_t stream, KernelArgs *k_args, const char *kernel_name, int aicpu_num) { #ifdef BUILD_WITH_NEW_CANN // New interface: rtsLaunchCpuKernel auto it = func_handles_.find(kernel_name); diff --git a/src/a5/platform/onboard/host/aicpu_loader.h 
b/src/a5/platform/onboard/host/aicpu_loader.h index 0be15b68f..1b37afcde 100644 --- a/src/a5/platform/onboard/host/aicpu_loader.h +++ b/src/a5/platform/onboard/host/aicpu_loader.h @@ -3,7 +3,7 @@ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of * CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR ANY KIND, EITHER EXPRESS OR IMPLIED, + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. * ----------------------------------------------------------------------------------------------------------- @@ -55,7 +55,7 @@ class AicpuLoader { * @param kernel_names List of kernel function names to resolve * @return 0 on success, error code on failure */ - int init_with_binary(const std::vector& aicpu_binary, const std::vector& kernel_names); + int init_with_binary(const std::vector &aicpu_binary, const std::vector &kernel_names); /** * @brief Initialize the AICPU loader (legacy interface compatibility) @@ -67,7 +67,7 @@ class AicpuLoader { * @param kernel_names List of kernel function names (not used) * @return 0 on success, error code on failure */ - int init(const std::string& so_path, const std::vector& kernel_names); + int init(const std::string &so_path, const std::vector &kernel_names); /** * @brief Launch an AICPU kernel @@ -80,7 +80,7 @@ class AicpuLoader { * @param aicpu_num Number of AICPU instances to launch * @return 0 on success, error code on failure */ - int launch(rtStream_t stream, KernelArgs* k_args, const char* kernel_name, int aicpu_num); + int launch(rtStream_t 
stream, KernelArgs *k_args, const char *kernel_name, int aicpu_num); /** * @brief Cleanup resources @@ -91,17 +91,17 @@ class AicpuLoader { void finalize(); // Disable copy and move - AicpuLoader(const AicpuLoader&) = delete; - AicpuLoader& operator=(const AicpuLoader&) = delete; - AicpuLoader(AicpuLoader&&) = delete; - AicpuLoader& operator=(AicpuLoader&&) = delete; + AicpuLoader(const AicpuLoader &) = delete; + AicpuLoader &operator=(const AicpuLoader &) = delete; + AicpuLoader(AicpuLoader &&) = delete; + AicpuLoader &operator=(AicpuLoader &&) = delete; private: #ifdef BUILD_WITH_NEW_CANN // New interface members - void* binary_handle_ = nullptr; // Binary handle from rtsBinaryLoadFromFile - std::unordered_map func_handles_; // Function handles (kernel_name -> func_handle) - std::string json_file_path_; // Path to temporary JSON descriptor file + void *binary_handle_ = nullptr; // Binary handle from rtsBinaryLoadFromFile + std::unordered_map func_handles_; // Function handles (kernel_name -> func_handle) + std::string json_file_path_; // Path to temporary JSON descriptor file #else // Legacy interface - no state needed #endif diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp index d4cfc7f80..a92703210 100644 --- a/src/a5/platform/onboard/host/device_runner.cpp +++ b/src/a5/platform/onboard/host/device_runner.cpp @@ -233,10 +233,7 @@ int DeviceRunner::ensure_binaries_loaded( #ifdef BUILD_WITH_NEW_CANN // New interface: Initialize AICPU loader with binary data - const std::vector kernel_names = { - "DynTileFwkKernelServerInit", - "DynTileFwkKernelServer" - }; + const std::vector kernel_names = {"DynTileFwkKernelServerInit", "DynTileFwkKernelServer"}; int rc = aicpu_loader_.init_with_binary(aicpu_so_binary, kernel_names); if (rc != 0) { LOG_ERROR("AicpuLoader init_with_binary failed: %d", rc); From 8b60921ea9d033dbef36bf6a812e6206ddccc802 Mon Sep 17 00:00:00 2001 From: puddingfjz <2811443837@qq.com> Date: 
Mon, 13 Apr 2026 18:43:56 +0800 Subject: [PATCH 5/6] Fix: address review comments from PR #537 - Revert hardcoded aarch64-linux path in CMakeLists.txt, use portable paths - Restore default parameter for launch_aicpu_num in device_runner.h - Add documentation explaining JSON construction and name_mapping design The JSON construction uses manual string concatenation without a library. This is safe because kernel names are controlled strings without special characters, matching pypto's approach for similar AICPU op descriptors. The name_mapping from opType to functionName is specific to the Ascend tile framework kernels and is unlikely to change. Co-Authored-By: Claude Opus 4.6 --- src/a2a3/platform/onboard/host/CMakeLists.txt | 3 ++- .../platform/onboard/host/aicpu_loader.cpp | 20 ++++++++++++++++++- .../platform/onboard/host/device_runner.h | 2 +- src/a5/platform/onboard/host/CMakeLists.txt | 3 ++- src/a5/platform/onboard/host/aicpu_loader.cpp | 20 ++++++++++++++++++- 5 files changed, 43 insertions(+), 5 deletions(-) diff --git a/src/a2a3/platform/onboard/host/CMakeLists.txt b/src/a2a3/platform/onboard/host/CMakeLists.txt index ac31fd9c9..75f348a88 100644 --- a/src/a2a3/platform/onboard/host/CMakeLists.txt +++ b/src/a2a3/platform/onboard/host/CMakeLists.txt @@ -108,7 +108,8 @@ target_link_libraries(host_runtime target_link_directories(host_runtime PRIVATE - ${ASCEND_HOME_PATH}/aarch64-linux/lib64 + ${ASCEND_HOME_PATH}/lib64 + ${ASCEND_HOME_PATH}/runtime/lib64 ) set_target_properties(host_runtime PROPERTIES OUTPUT_NAME "host_runtime") diff --git a/src/a2a3/platform/onboard/host/aicpu_loader.cpp b/src/a2a3/platform/onboard/host/aicpu_loader.cpp index a1bab7bfe..72a9a8dcc 100644 --- a/src/a2a3/platform/onboard/host/aicpu_loader.cpp +++ b/src/a2a3/platform/onboard/host/aicpu_loader.cpp @@ -43,6 +43,14 @@ struct AicpuOpConfig { }; // Generate AICPU op info JSON file +// +// Note: This function manually constructs JSON without using a library. 
+// The kernel names and configuration values are controlled strings that do not +// contain special characters (quotes, backslashes, control characters). This +// matches the approach used in pypto's GenerateAicpuOpJson for similar AICPU +// op descriptors. If new kernels are added that may contain special characters, +// consider adding a JSON library dependency (e.g., nlohmann/json) or implementing +// proper string escaping. static bool GenerateAicpuOpJson(const std::string &json_path, const std::vector &op_configs) { std::ofstream json_file(json_path); if (!json_file.is_open()) { @@ -97,7 +105,17 @@ int AicpuLoader::init_with_binary( close(json_fd); json_file_path_ = json_path_buffer.data(); - // Map opType (used for rtsFuncGetByName) to functionName (actual symbol in .so) + // Map opType (external kernel name used by rtsFuncGetByName) to functionName + // (actual C++ symbol name in the .so file). + // + // This mapping is specific to the Ascend tile framework kernels: + // - DynTileFwkKernelServerInit -> DynTileFwkBackendKernelServerInit + // - DynTileFwkKernelServer -> DynTileFwkBackendKernelServer + // + // The opType names are used by the CANN runtime to look up kernels, while the + // functionName names are the actual symbols exported by the shared library. + // This mapping is defined here as it's specific to the tile framework's + // naming convention and is unlikely to change. 
std::unordered_map name_mapping = { {"DynTileFwkKernelServerInit", "DynTileFwkBackendKernelServerInit"}, {"DynTileFwkKernelServer", "DynTileFwkBackendKernelServer"} diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index cfafff928..79bbc2bf6 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -253,7 +253,7 @@ class DeviceRunner { */ int run(Runtime &runtime, int block_dim, int device_id, const std::vector &aicpu_so_binary, - const std::vector &aicore_kernel_binary, int launch_aicpu_num); + const std::vector &aicore_kernel_binary, int launch_aicpu_num = 1); /** * Print handshake results from device diff --git a/src/a5/platform/onboard/host/CMakeLists.txt b/src/a5/platform/onboard/host/CMakeLists.txt index ac31fd9c9..75f348a88 100644 --- a/src/a5/platform/onboard/host/CMakeLists.txt +++ b/src/a5/platform/onboard/host/CMakeLists.txt @@ -108,7 +108,8 @@ target_link_libraries(host_runtime target_link_directories(host_runtime PRIVATE - ${ASCEND_HOME_PATH}/aarch64-linux/lib64 + ${ASCEND_HOME_PATH}/lib64 + ${ASCEND_HOME_PATH}/runtime/lib64 ) set_target_properties(host_runtime PROPERTIES OUTPUT_NAME "host_runtime") diff --git a/src/a5/platform/onboard/host/aicpu_loader.cpp b/src/a5/platform/onboard/host/aicpu_loader.cpp index a1bab7bfe..72a9a8dcc 100644 --- a/src/a5/platform/onboard/host/aicpu_loader.cpp +++ b/src/a5/platform/onboard/host/aicpu_loader.cpp @@ -43,6 +43,14 @@ struct AicpuOpConfig { }; // Generate AICPU op info JSON file +// +// Note: This function manually constructs JSON without using a library. +// The kernel names and configuration values are controlled strings that do not +// contain special characters (quotes, backslashes, control characters). This +// matches the approach used in pypto's GenerateAicpuOpJson for similar AICPU +// op descriptors. 
If new kernels are added that may contain special characters, +// consider adding a JSON library dependency (e.g., nlohmann/json) or implementing +// proper string escaping. static bool GenerateAicpuOpJson(const std::string &json_path, const std::vector &op_configs) { std::ofstream json_file(json_path); if (!json_file.is_open()) { @@ -97,7 +105,17 @@ int AicpuLoader::init_with_binary( close(json_fd); json_file_path_ = json_path_buffer.data(); - // Map opType (used for rtsFuncGetByName) to functionName (actual symbol in .so) + // Map opType (external kernel name used by rtsFuncGetByName) to functionName + // (actual C++ symbol name in the .so file). + // + // This mapping is specific to the Ascend tile framework kernels: + // - DynTileFwkKernelServerInit -> DynTileFwkBackendKernelServerInit + // - DynTileFwkKernelServer -> DynTileFwkBackendKernelServer + // + // The opType names are used by the CANN runtime to look up kernels, while the + // functionName names are the actual symbols exported by the shared library. + // This mapping is defined here as it's specific to the tile framework's + // naming convention and is unlikely to change. std::unordered_map name_mapping = { {"DynTileFwkKernelServerInit", "DynTileFwkBackendKernelServerInit"}, {"DynTileFwkKernelServer", "DynTileFwkBackendKernelServer"} From ddfa274a2f222299ab6f1cd987e26b766e4a313f Mon Sep 17 00:00:00 2001 From: puddingfjz <2811443837@qq.com> Date: Wed, 15 Apr 2026 09:23:38 +0800 Subject: [PATCH 6/6] Add: migrate AICPU launch to new rts API with two-layer dispatcher - Replace rtAicpuKernelLaunchExWithArgs with rtsBinaryLoadFromFile/ rtsFuncGetByName/rtsLaunchCpuKernel (BUILD_WITH_NEW_CANN=ON path) - Add two-layer architecture: outer libaicpu_dispatcher.so + inner runtime SO. 
Dispatcher saves inner SO to /tmp/aicpu_kernels/ and forwards kernel calls via dlopen/dlsym - Add LoadAicpuOp class to manage JSON generation and function handles - Generate JSON in SO directory with matching basename (cpuKernelMode=1) - Remove static linking (-static-libstdc++ -static-libgcc), add -Wl,-Bsymbolic,--build-id, strip debug info to match CANN built-ins - Update a2a3 and a5 platforms identically Known issue: scheduler fails with 507018 (symbol lookup failure for DynTileFwkKernelServerNull in libaicpu_dispatcher.so). Host-side API calls succeed, scheduler-side dlsym fails despite symbol being present in SO binary. Investigation ongoing. --- .gitignore | 4 + python/simpler/runtime_compiler.py | 6 + .../platform/onboard/aicpu/CMakeLists.txt | 36 ++ src/a2a3/platform/onboard/host/CMakeLists.txt | 6 + .../platform/onboard/host/aicpu_loader.cpp | 235 +------------ src/a2a3/platform/onboard/host/aicpu_loader.h | 63 +--- .../platform/onboard/host/device_runner.cpp | 75 ++++- .../platform/onboard/host/device_runner.h | 8 + src/a5/platform/onboard/aicpu/CMakeLists.txt | 36 ++ src/a5/platform/onboard/host/CMakeLists.txt | 6 + src/a5/platform/onboard/host/aicpu_loader.cpp | 235 +------------ src/a5/platform/onboard/host/aicpu_loader.h | 63 +--- .../platform/onboard/host/device_runner.cpp | 75 ++++- src/a5/platform/onboard/host/device_runner.h | 8 + src/common/aicpu_dispatcher/CMakeLists.txt | 52 +++ src/common/aicpu_dispatcher/README.md | 38 +++ .../aicpu_dispatcher/aicpu_dispatcher.cpp | 313 ++++++++++++++++++ .../aicpu_dispatcher/aicpu_dispatcher.h | 128 +++++++ src/common/host/CMakeLists.txt | 20 ++ src/common/host/load_aicpu_op.cpp | 203 ++++++++++++ src/common/host/load_aicpu_op.h | 156 +++++++++ 21 files changed, 1194 insertions(+), 572 deletions(-) create mode 100644 src/common/aicpu_dispatcher/CMakeLists.txt create mode 100644 src/common/aicpu_dispatcher/README.md create mode 100644 src/common/aicpu_dispatcher/aicpu_dispatcher.cpp create mode 100644 
src/common/aicpu_dispatcher/aicpu_dispatcher.h create mode 100644 src/common/host/CMakeLists.txt create mode 100644 src/common/host/load_aicpu_op.cpp create mode 100644 src/common/host/load_aicpu_op.h diff --git a/.gitignore b/.gitignore index 37d5e142b..e3c04a989 100644 --- a/.gitignore +++ b/.gitignore @@ -41,3 +41,7 @@ compile_commands.json # Built nanobind extensions python/_task_interface*.so python/_task_interface*.dylib + +# Log files +*.log +profiling_logs_*/ diff --git a/python/simpler/runtime_compiler.py b/python/simpler/runtime_compiler.py index ce159d422..57cba0db3 100644 --- a/python/simpler/runtime_compiler.py +++ b/python/simpler/runtime_compiler.py @@ -233,6 +233,12 @@ def _build(actual_build_dir: str) -> Union[bytes, Path]: od.mkdir(parents=True, exist_ok=True) dest = od / binary_name shutil.copy2(binary_path, dest) + dispatcher_so = Path(actual_build_dir) / "libaicpu_dispatcher.so" + if dispatcher_so.is_file(): + dest_dispatcher = od / "libaicpu_dispatcher.so" + shutil.copy2(dispatcher_so, dest_dispatcher) + # Strip debug info to match CANN built-in SO format + subprocess.run(["strip", "-s", str(dest_dispatcher)], check=True) return dest else: with open(binary_path, "rb") as f: diff --git a/src/a2a3/platform/onboard/aicpu/CMakeLists.txt b/src/a2a3/platform/onboard/aicpu/CMakeLists.txt index e1fb32d2c..00e041094 100644 --- a/src/a2a3/platform/onboard/aicpu/CMakeLists.txt +++ b/src/a2a3/platform/onboard/aicpu/CMakeLists.txt @@ -10,6 +10,7 @@ project(aicpu_kernel LANGUAGES C CXX) set(CMAKE_CUSTOM_INCLUDE_DIRS "") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../include") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/task_interface") +list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher") if(DEFINED CUSTOM_INCLUDE_DIRS) foreach(INC_DIR ${CUSTOM_INCLUDE_DIRS}) list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${INC_DIR}") @@ -75,3 +76,38 @@ 
target_link_directories(aicpu_kernel # Output name set_target_properties(aicpu_kernel PROPERTIES OUTPUT_NAME aicpu_kernel) +# Build dispatcher SO (two-layer architecture) +set(AICPU_DISPATCHER_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher/aicpu_dispatcher.cpp" +) +add_library(aicpu_dispatcher SHARED ${AICPU_DISPATCHER_SOURCES}) + +# Compiler options for dispatcher (same as AICPU kernel) +target_compile_options(aicpu_dispatcher + PRIVATE + -Wall + -Wextra + -rdynamic + -O3 + -fPIC + -g + $<$:-std=gnu++17> +) + +# Include directories for dispatcher +target_include_directories(aicpu_dispatcher + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CUSTOM_INCLUDE_DIRS} + ${ASCEND_HOME_PATH}/include +) + +# Link against dl for dlopen/dlsym +target_link_libraries(aicpu_dispatcher PRIVATE dl) + +# Match CANN built-in SO properties: SYMBOLIC flag, build-id, stripped +set_target_properties(aicpu_dispatcher PROPERTIES + LINK_FLAGS "-Wl,-Bsymbolic,--build-id" + OUTPUT_NAME "aicpu_dispatcher" +) + diff --git a/src/a2a3/platform/onboard/host/CMakeLists.txt b/src/a2a3/platform/onboard/host/CMakeLists.txt index 75f348a88..b7dbf4d2f 100644 --- a/src/a2a3/platform/onboard/host/CMakeLists.txt +++ b/src/a2a3/platform/onboard/host/CMakeLists.txt @@ -19,6 +19,8 @@ set(CMAKE_CUSTOM_INCLUDE_DIRS "") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../include") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/task_interface") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/worker") +list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host") +list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher") if(DEFINED CUSTOM_INCLUDE_DIRS) foreach(INC_DIR ${CUSTOM_INCLUDE_DIRS}) list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${INC_DIR}") @@ -40,6 +42,10 @@ list(APPEND HOST_RUNTIME_SOURCES 
"${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/unified_log_host.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/performance_collector.cpp" ) +# Add common/host sources (LoadAicpuOp) +list(APPEND HOST_RUNTIME_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host/load_aicpu_op.cpp" +) if(DEFINED CUSTOM_SOURCE_DIRS) foreach(SRC_DIR ${CUSTOM_SOURCE_DIRS}) file(GLOB DIR_SOURCES "${SRC_DIR}/*.cpp" "${SRC_DIR}/*.c") diff --git a/src/a2a3/platform/onboard/host/aicpu_loader.cpp b/src/a2a3/platform/onboard/host/aicpu_loader.cpp index 72a9a8dcc..f99450cf9 100644 --- a/src/a2a3/platform/onboard/host/aicpu_loader.cpp +++ b/src/a2a3/platform/onboard/host/aicpu_loader.cpp @@ -9,239 +9,39 @@ * ----------------------------------------------------------------------------------------------------------- */ /** - * AICPU Loader Implementation + * AICPU Loader Implementation (Legacy Interface) + * + * Provides AICPU kernel launching via the legacy rtAicpuKernelLaunchExWithArgs API. + * Used when BUILD_WITH_NEW_CANN is OFF. When BUILD_WITH_NEW_CANN is ON, + * device_runner uses LoadAicpuOp (src/common/host/load_aicpu_op.h) instead. */ #include "aicpu_loader.h" #include -#include -#include -#include -#include -#include #include "common/unified_log.h" #include "common/kernel_args.h" -#ifdef BUILD_WITH_NEW_CANN -// New CANN RTS header for rtsLaunchCpuKernel interface (CANN 7.0+) -#include "runtime/runtime/rts/rts_kernel.h" -#include "runtime/runtime/kernel.h" - -// Forward declarations for JSON structures -struct AicpuOpConfig { - std::string functionName; - std::string kernelSo; - std::string opKernelLib; - std::string computeCost = "100"; - std::string engine = "DNN_VM_AICPU"; - std::string flagAsync = "False"; - std::string flagPartial = "False"; - std::string userDefined = "False"; - std::string opType; -}; - -// Generate AICPU op info JSON file -// -// Note: This function manually constructs JSON without using a library. 
-// The kernel names and configuration values are controlled strings that do not -// contain special characters (quotes, backslashes, control characters). This -// matches the approach used in pypto's GenerateAicpuOpJson for similar AICPU -// op descriptors. If new kernels are added that may contain special characters, -// consider adding a JSON library dependency (e.g., nlohmann/json) or implementing -// proper string escaping. -static bool GenerateAicpuOpJson(const std::string &json_path, const std::vector &op_configs) { - std::ofstream json_file(json_path); - if (!json_file.is_open()) { - LOG_ERROR("Failed to open JSON file for writing: %s", json_path.c_str()); - return false; - } - - json_file << "{\n"; - for (size_t i = 0; i < op_configs.size(); ++i) { - const auto &config = op_configs[i]; - json_file << " \"" << config.opType << "\": {\n"; - json_file << " \"opInfo\": {\n"; - json_file << " \"functionName\": \"" << config.functionName << "\",\n"; - json_file << " \"kernelSo\": \"" << config.kernelSo << "\",\n"; - json_file << " \"opKernelLib\": \"" << config.opKernelLib << "\",\n"; - json_file << " \"computeCost\": \"" << config.computeCost << "\",\n"; - json_file << " \"engine\": \"" << config.engine << "\",\n"; - json_file << " \"flagAsync\": \"" << config.flagAsync << "\",\n"; - json_file << " \"flagPartial\": \"" << config.flagPartial << "\",\n"; - json_file << " \"userDefined\": \"" << config.userDefined << "\"\n"; - json_file << " }\n"; - json_file << " }" << (i < op_configs.size() - 1 ? 
"," : "") << "\n"; - } - json_file << "}\n"; - json_file.close(); - - LOG_INFO("Generated AICPU op info JSON: %s", json_path.c_str()); - return true; -} - -#endif - int AicpuLoader::init_with_binary( const std::vector &aicpu_binary, const std::vector &kernel_names ) { -#ifdef BUILD_WITH_NEW_CANN - // New interface: Load binary using JSON descriptor (pypto approach) - LOG_INFO("AicpuLoader: Using new rtsBinaryLoadFromFile + rtsLaunchCpuKernel interface"); - LOG_INFO("AicpuLoader: Binary size=%zu bytes", aicpu_binary.size()); - - // Step 1: Generate op info JSON at runtime (using only filename, not full path) - const char *tmp_dir = std::getenv("TMPDIR") ? std::getenv("TMPDIR") : "/tmp"; - std::string json_path_template = std::string(tmp_dir) + "/simpler_aicpu_op_info_XXXXXX.json"; - std::vector json_path_buffer(json_path_template.begin(), json_path_template.end()); - json_path_buffer.push_back('\0'); - - int json_fd = mkstemps(json_path_buffer.data(), 5); - if (json_fd == -1) { - LOG_ERROR("Failed to create temporary JSON file"); - return -1; - } - close(json_fd); - json_file_path_ = json_path_buffer.data(); - - // Map opType (external kernel name used by rtsFuncGetByName) to functionName - // (actual C++ symbol name in the .so file). - // - // This mapping is specific to the Ascend tile framework kernels: - // - DynTileFwkKernelServerInit -> DynTileFwkBackendKernelServerInit - // - DynTileFwkKernelServer -> DynTileFwkBackendKernelServer - // - // The opType names are used by the CANN runtime to look up kernels, while the - // functionName names are the actual symbols exported by the shared library. - // This mapping is defined here as it's specific to the tile framework's - // naming convention and is unlikely to change. 
- std::unordered_map name_mapping = { - {"DynTileFwkKernelServerInit", "DynTileFwkBackendKernelServerInit"}, - {"DynTileFwkKernelServer", "DynTileFwkBackendKernelServer"} - }; - - // Create op configs for JSON generation - // kernelSo uses only filename - runtime will find it via library search path - std::vector op_configs; - for (const auto &name : kernel_names) { - AicpuOpConfig config; - config.opType = name; - config.functionName = name_mapping[name]; - config.kernelSo = "libaicpu_kernel.so"; // Filename only, runtime searches library path - config.opKernelLib = "KFCKernel"; - op_configs.push_back(config); - } - - // Generate JSON file - if (!GenerateAicpuOpJson(json_file_path_, op_configs)) { - return -1; - } - - // Step 2: Load binary handle from JSON: rtsBinaryLoadFromFile - // cpuKernelMode=0: JSON only mode, runtime finds .so via library search path - rtLoadBinaryOption_t option = {}; - option.optionId = RT_LOAD_BINARY_OPT_CPU_KERNEL_MODE; - option.value.cpuKernelMode = 0; - - rtLoadBinaryConfig_t load_config = {}; - load_config.options = &option; - load_config.numOpt = 1; - - rtError_t rc = rtsBinaryLoadFromFile(json_file_path_.c_str(), &load_config, &binary_handle_); - if (rc != RT_ERROR_NONE) { - LOG_ERROR("rtsBinaryLoadFromFile failed for %s: %d", json_file_path_.c_str(), rc); - return rc; - } - LOG_INFO("AicpuLoader: Loaded binary from JSON, handle=%p", binary_handle_); - - // Step 3: Resolve function handles: rtsFuncGetByName - for (const auto &name : kernel_names) { - rtFuncHandle func_handle = nullptr; - rc = rtsFuncGetByName(binary_handle_, name.c_str(), &func_handle); - if (rc != RT_ERROR_NONE) { - LOG_ERROR("rtsFuncGetByName failed for %s: %d", name.c_str(), rc); - return rc; - } - func_handles_[name] = func_handle; - LOG_INFO("AicpuLoader: Resolved function handle for %s: %p", name.c_str(), func_handle); - } - - return 0; - -#else // Legacy interface: No pre-loading needed - (void)so_path; + (void)aicpu_binary; (void)kernel_names; 
LOG_INFO("AicpuLoader: Using legacy rtAicpuKernelLaunchExWithArgs interface"); return 0; -#endif } int AicpuLoader::init(const std::string &so_path, const std::vector &kernel_names) { -#ifdef BUILD_WITH_NEW_CANN - // New interface: Use init_with_binary() instead - // This init() is kept for backward compatibility but does nothing - (void)so_path; - (void)kernel_names; - LOG_INFO("AicpuLoader: Use init_with_binary() for new interface"); - return 0; -#else // Legacy interface: No pre-loading needed (void)so_path; (void)kernel_names; LOG_INFO("AicpuLoader: Using legacy rtAicpuKernelLaunchExWithArgs interface"); return 0; -#endif } int AicpuLoader::launch(rtStream_t stream, KernelArgs *k_args, const char *kernel_name, int aicpu_num) { -#ifdef BUILD_WITH_NEW_CANN - // New interface: rtsLaunchCpuKernel - auto it = func_handles_.find(kernel_name); - if (it == func_handles_.end()) { - LOG_ERROR("Kernel not found: %s", kernel_name); - return -1; - } - - rtFuncHandle func_handle = it->second; - - // Prepare args for new interface - struct Args { - KernelArgs k_args; - char kernel_name[64]; - char so_name[64]; - } args; - - args.k_args = *k_args; - std::strncpy(args.kernel_name, kernel_name, sizeof(args.kernel_name) - 1); - args.kernel_name[sizeof(args.kernel_name) - 1] = '\0'; - std::strncpy(args.so_name, "libaicpu_extend_kernels.so", sizeof(args.so_name) - 1); - args.so_name[sizeof(args.so_name) - 1] = '\0'; - - rtCpuKernelArgs_t cpu_args = {}; - cpu_args.baseArgs.args = &args; - cpu_args.baseArgs.argsSize = sizeof(args); - cpu_args.baseArgs.kernelNameAddrOffset = offsetof(struct Args, kernel_name); - cpu_args.baseArgs.soNameAddrOffset = offsetof(struct Args, so_name); - cpu_args.baseArgs.hostInputInfoPtr = nullptr; - cpu_args.baseArgs.kernelOffsetInfoPtr = nullptr; - cpu_args.baseArgs.hostInputInfoNum = 0; - cpu_args.baseArgs.kernelOffsetInfoNum = 0; - cpu_args.baseArgs.isNoNeedH2DCopy = 0; - cpu_args.baseArgs.timeout = 0; - cpu_args.cpuParamHeadOffset = 0; - - // Launch: 
rtsLaunchCpuKernel - rtError_t rc = rtsLaunchCpuKernel(func_handle, static_cast(aicpu_num), stream, nullptr, &cpu_args); - if (rc != RT_ERROR_NONE) { - LOG_ERROR("rtsLaunchCpuKernel failed for %s: %d", kernel_name, rc); - return rc; - } - - return 0; - -#else // Legacy interface: rtAicpuKernelLaunchExWithArgs struct Args { KernelArgs k_args; @@ -264,31 +64,8 @@ int AicpuLoader::launch(rtStream_t stream, KernelArgs *k_args, const char *kerne return rtAicpuKernelLaunchExWithArgs( rtKernelType_t::KERNEL_TYPE_AICPU_KFC, "AST_DYN_AICPU", aicpu_num, &rt_args, nullptr, stream, 0 ); -#endif } void AicpuLoader::finalize() { -#ifdef BUILD_WITH_NEW_CANN - // New interface: Unload binary and clear handles - if (binary_handle_ != nullptr) { - rtError_t rc = rtsBinaryUnload(binary_handle_); - if (rc != RT_ERROR_NONE) { - LOG_WARN("rtsBinaryUnload failed: %d", rc); - } - binary_handle_ = nullptr; - } - func_handles_.clear(); - - // Delete temporary JSON file if it was created - if (!json_file_path_.empty()) { - std::remove(json_file_path_.c_str()); - LOG_INFO("AicpuLoader: Deleted temporary JSON file: %s", json_file_path_.c_str()); - json_file_path_.clear(); - } - - LOG_INFO("AicpuLoader: Finalized new interface"); -#else // Legacy interface: No-op - (void)this; // Suppress unused warning -#endif } diff --git a/src/a2a3/platform/onboard/host/aicpu_loader.h b/src/a2a3/platform/onboard/host/aicpu_loader.h index 1ec949227..e4d72ad4f 100644 --- a/src/a2a3/platform/onboard/host/aicpu_loader.h +++ b/src/a2a3/platform/onboard/host/aicpu_loader.h @@ -9,15 +9,11 @@ * ----------------------------------------------------------------------------------------------------------- */ /** - * AICPU Loader Abstraction + * AICPU Loader Abstraction (Legacy Interface) * - * This file provides an abstraction layer for AICPU kernel launching that supports - * both the legacy rtAicpuKernelLaunchExWithArgs API and the new rtsLaunchCpuKernel - * interface available in newer CANN versions. 
- * - * The interface used is controlled by the BUILD_WITH_NEW_CANN compile flag: - * - When undefined or OFF: Uses legacy rtAicpuKernelLaunchExWithArgs - * - When ON: Uses new rtsLaunchCpuKernel / rtsBinaryLoadFromFile / rtsFuncGetByName + * Provides AICPU kernel launching via the legacy rtAicpuKernelLaunchExWithArgs API. + * Used when BUILD_WITH_NEW_CANN is OFF. When BUILD_WITH_NEW_CANN is ON, + * device_runner uses LoadAicpuOp (src/common/host/load_aicpu_op.h) instead. */ #ifndef A2A3_PLATFORM_ONBOARD_HOST_AICPU_LOADER_H_ @@ -25,7 +21,6 @@ #include #include -#include #include #include @@ -34,9 +29,10 @@ struct KernelArgs; /** - * @brief AICPU kernel loader abstraction + * @brief AICPU kernel loader (legacy interface) * - * Supports both legacy and new CANN AICPU launch interfaces through conditional compilation. + * Launches AICPU kernels via the legacy rtAicpuKernelLaunchExWithArgs API. + * Used as the fallback when BUILD_WITH_NEW_CANN is OFF. */ class AicpuLoader { public: @@ -44,49 +40,22 @@ class AicpuLoader { ~AicpuLoader() = default; /** - * @brief Initialize the AICPU loader with binary data - * - * For the new interface (BUILD_WITH_NEW_CANN=ON), this generates a JSON descriptor - * and loads the binary using rtsBinaryLoadFromFile. The .so file is referenced by - * filename only (libaicpu_kernel.so) and must be findable via library search path. - * For the legacy interface (BUILD_WITH_NEW_CANN=OFF), this is a no-op. 
- * - * @param aicpu_binary Binary data of the AICPU shared library (not used, kept for API compatibility) - * @param kernel_names List of kernel function names to resolve - * @return 0 on success, error code on failure + * @brief Initialize the AICPU loader with binary data (no-op for legacy interface) */ int init_with_binary(const std::vector &aicpu_binary, const std::vector &kernel_names); /** - * @brief Initialize the AICPU loader (legacy interface compatibility) - * - * For the new interface (BUILD_WITH_NEW_CANN=ON), this stores kernel names for later use. - * For the legacy interface (BUILD_WITH_NEW_CANN=OFF), this is a no-op. - * - * @param so_path Path to the AICPU shared library (not used in new interface) - * @param kernel_names List of kernel function names to resolve - * @return 0 on success, error code on failure + * @brief Initialize the AICPU loader (no-op for legacy interface) */ int init(const std::string &so_path, const std::vector &kernel_names); /** - * @brief Launch an AICPU kernel - * - * Unified interface that delegates to either legacy or new implementation. - * - * @param stream CUDA-style stream for execution - * @param k_args Kernel arguments - * @param kernel_name Name of the kernel to launch - * @param aicpu_num Number of AICPU instances to launch - * @return 0 on success, error code on failure + * @brief Launch an AICPU kernel via legacy rtAicpuKernelLaunchExWithArgs */ int launch(rtStream_t stream, KernelArgs *k_args, const char *kernel_name, int aicpu_num); /** - * @brief Cleanup resources - * - * For the new interface, this unloads the binary and clears handles. - * For the legacy interface, this is a no-op. 
+ * @brief Cleanup resources (no-op for legacy interface) */ void finalize(); @@ -95,16 +64,6 @@ class AicpuLoader { AicpuLoader &operator=(const AicpuLoader &) = delete; AicpuLoader(AicpuLoader &&) = delete; AicpuLoader &operator=(AicpuLoader &&) = delete; - -private: -#ifdef BUILD_WITH_NEW_CANN - // New interface members - void *binary_handle_ = nullptr; // Binary handle from rtsBinaryLoadFromFile - std::unordered_map func_handles_; // Function handles (kernel_name -> func_handle) - std::string json_file_path_; // Path to temporary JSON descriptor file -#else - // Legacy interface - no state needed -#endif }; #endif // A2A3_PLATFORM_ONBOARD_HOST_AICPU_LOADER_H_ diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index d5106e532..4d88ec01a 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -17,6 +17,10 @@ #include "device_runner.h" +#ifdef BUILD_WITH_NEW_CANN +#include "load_aicpu_op.h" +#endif + #include #include @@ -24,6 +28,24 @@ #include #include +#ifdef BUILD_WITH_NEW_CANN + +static std::string resolve_dispatcher_so_path() { + Dl_info info; + if (dladdr(reinterpret_cast(resolve_dispatcher_so_path), &info) == 0 || info.dli_fname == nullptr) { + return ""; + } + // info.dli_fname is the path to host_runtime.so + std::string so_dir = info.dli_fname; + size_t pos = so_dir.rfind('/'); + if (pos == std::string::npos) { + return "libaicpu_dispatcher.so"; + } + so_dir = so_dir.substr(0, pos + 1); + return so_dir + "libaicpu_dispatcher.so"; +} +#endif + // Include HAL constants from CANN (header only, library loaded dynamically) #include "ascend_hal.h" #include "callable.h" @@ -314,14 +336,14 @@ int DeviceRunner::ensure_binaries_loaded( aicore_kernel_binary_ = aicore_kernel_binary; #ifdef BUILD_WITH_NEW_CANN - // New interface: Initialize AICPU loader with binary data - const std::vector kernel_names = {"DynTileFwkKernelServerInit", 
"DynTileFwkKernelServer"}; - int rc = aicpu_loader_.init_with_binary(aicpu_so_binary, kernel_names); + // New interface: Initialize LoadAicpuOp (loads dispatcher SO) + std::string dispatcher_so_path = resolve_dispatcher_so_path(); + int rc = load_aicpu_op_.Init(dispatcher_so_path); if (rc != 0) { - LOG_ERROR("AicpuLoader init_with_binary failed: %d", rc); + LOG_ERROR("LoadAicpuOp::Init failed: %d", rc); return rc; } - LOG_INFO("DeviceRunner: AICPU loader initialized with %zu bytes of binary data", aicpu_so_binary.size()); + LOG_INFO("DeviceRunner: LoadAicpuOp initialized"); #else int rc = 0; #endif @@ -514,6 +536,28 @@ int DeviceRunner::run( return rc; } +#ifdef BUILD_WITH_NEW_CANN + // Three-phase launch pattern with dispatcher: + // 1. Load (Null) - Pass inner SO binary to dispatcher + // 2. Init - Initialize inner SO + // 3. Run - Execute actual kernel + std::cout << "\n=== launch_aicpu_kernel DynTileFwkKernelServerNull (Load) ===" << '\n'; + rc = launch_aicpu_kernel(stream_aicpu_, &kernel_args_.args, "DynTileFwkKernelServerNull", 1); + if (rc != 0) { + LOG_ERROR("launch_aicpu_kernel (load/null) failed: %d", rc); + return rc; + } + + std::cout << "\n=== launch_aicpu_kernel DynTileFwkKernelServerInit (Init) ===" << '\n'; + // Launch AICPU init kernel + rc = launch_aicpu_kernel(stream_aicpu_, &kernel_args_.args, "DynTileFwkKernelServerInit", 1); + if (rc != 0) { + LOG_ERROR("launch_aicpu_kernel (init) failed: %d", rc); + return rc; + } + + std::cout << "\n=== launch_aicpu_kernel DynTileFwkKernelServer (Run) ===" << '\n'; +#else std::cout << "\n=== launch_aicpu_kernel DynTileFwkKernelServerInit===" << '\n'; // Launch AICPU init kernel rc = launch_aicpu_kernel(stream_aicpu_, &kernel_args_.args, "DynTileFwkKernelServerInit", 1); @@ -523,6 +567,7 @@ int DeviceRunner::run( } std::cout << "\n=== launch_aicpu_kernel DynTileFwkKernelServer===" << '\n'; +#endif // Launch AICPU main kernel (over-launch for affinity gate) rc = launch_aicpu_kernel( stream_aicpu_, 
&kernel_args_.args, "DynTileFwkKernelServer", PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH @@ -620,8 +665,12 @@ int DeviceRunner::finalize() { // Cleanup AICPU SO so_info_.finalize(); +#ifdef BUILD_WITH_NEW_CANN + // LoadAicpuOp cleanup happens automatically in destructor +#else // Cleanup AICPU loader aicpu_loader_.finalize(); +#endif // Kernel binaries should have been removed by validate_runtime_impl() if (!func_id_to_addr_.empty()) { @@ -675,7 +724,23 @@ int DeviceRunner::finalize() { } int DeviceRunner::launch_aicpu_kernel(rtStream_t stream, KernelArgs *k_args, const char *kernel_name, int aicpu_num) { +#ifdef BUILD_WITH_NEW_CANN + // Map kernel name to LoadAicpuOp function name + std::string func_name; + if (std::strcmp(kernel_name, "DynTileFwkKernelServerInit") == 0) { + func_name = host::KernelNames::InitName; + } else if (std::strcmp(kernel_name, "DynTileFwkKernelServer") == 0) { + func_name = host::KernelNames::RunName; + } else if (std::strcmp(kernel_name, "DynTileFwkKernelServerNull") == 0) { + func_name = host::KernelNames::NullName; + } else { + LOG_ERROR("Unknown kernel name: %s", kernel_name); + return -1; + } + return load_aicpu_op_.LaunchBuiltInOp(stream, k_args, aicpu_num, func_name, kernel_name); +#else return aicpu_loader_.launch(stream, k_args, kernel_name, aicpu_num); +#endif } int DeviceRunner::launch_aicore_kernel(rtStream_t stream, KernelArgs *k_args) { diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index 79bbc2bf6..19aab8a51 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -43,7 +43,11 @@ #include "host/function_cache.h" #include "host/memory_allocator.h" #include "host/performance_collector.h" +#ifdef BUILD_WITH_NEW_CANN +#include "load_aicpu_op.h" +#else #include "aicpu_loader.h" +#endif #include "runtime.h" /** @@ -382,7 +386,11 @@ class DeviceRunner { std::vector aicore_kernel_binary_; // AICPU loader 
abstraction (supports both legacy and new CANN interfaces) +#ifdef BUILD_WITH_NEW_CANN + host::LoadAicpuOp load_aicpu_op_; +#else AicpuLoader aicpu_loader_; +#endif // Memory management MemoryAllocator mem_alloc_; diff --git a/src/a5/platform/onboard/aicpu/CMakeLists.txt b/src/a5/platform/onboard/aicpu/CMakeLists.txt index e1fb32d2c..00e041094 100644 --- a/src/a5/platform/onboard/aicpu/CMakeLists.txt +++ b/src/a5/platform/onboard/aicpu/CMakeLists.txt @@ -10,6 +10,7 @@ project(aicpu_kernel LANGUAGES C CXX) set(CMAKE_CUSTOM_INCLUDE_DIRS "") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../include") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/task_interface") +list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher") if(DEFINED CUSTOM_INCLUDE_DIRS) foreach(INC_DIR ${CUSTOM_INCLUDE_DIRS}) list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${INC_DIR}") @@ -75,3 +76,38 @@ target_link_directories(aicpu_kernel # Output name set_target_properties(aicpu_kernel PROPERTIES OUTPUT_NAME aicpu_kernel) +# Build dispatcher SO (two-layer architecture) +set(AICPU_DISPATCHER_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher/aicpu_dispatcher.cpp" +) +add_library(aicpu_dispatcher SHARED ${AICPU_DISPATCHER_SOURCES}) + +# Compiler options for dispatcher (same as AICPU kernel) +target_compile_options(aicpu_dispatcher + PRIVATE + -Wall + -Wextra + -rdynamic + -O3 + -fPIC + -g + $<$:-std=gnu++17> +) + +# Include directories for dispatcher +target_include_directories(aicpu_dispatcher + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CUSTOM_INCLUDE_DIRS} + ${ASCEND_HOME_PATH}/include +) + +# Link against dl for dlopen/dlsym +target_link_libraries(aicpu_dispatcher PRIVATE dl) + +# Match CANN built-in SO properties: SYMBOLIC flag, build-id, stripped +set_target_properties(aicpu_dispatcher PROPERTIES + LINK_FLAGS "-Wl,-Bsymbolic,--build-id" + OUTPUT_NAME "aicpu_dispatcher" 
+) + diff --git a/src/a5/platform/onboard/host/CMakeLists.txt b/src/a5/platform/onboard/host/CMakeLists.txt index 75f348a88..b7dbf4d2f 100644 --- a/src/a5/platform/onboard/host/CMakeLists.txt +++ b/src/a5/platform/onboard/host/CMakeLists.txt @@ -19,6 +19,8 @@ set(CMAKE_CUSTOM_INCLUDE_DIRS "") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../include") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/task_interface") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/worker") +list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host") +list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher") if(DEFINED CUSTOM_INCLUDE_DIRS) foreach(INC_DIR ${CUSTOM_INCLUDE_DIRS}) list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${INC_DIR}") @@ -40,6 +42,10 @@ list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/unified_log_host.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/performance_collector.cpp" ) +# Add common/host sources (LoadAicpuOp) +list(APPEND HOST_RUNTIME_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host/load_aicpu_op.cpp" +) if(DEFINED CUSTOM_SOURCE_DIRS) foreach(SRC_DIR ${CUSTOM_SOURCE_DIRS}) file(GLOB DIR_SOURCES "${SRC_DIR}/*.cpp" "${SRC_DIR}/*.c") diff --git a/src/a5/platform/onboard/host/aicpu_loader.cpp b/src/a5/platform/onboard/host/aicpu_loader.cpp index 72a9a8dcc..f99450cf9 100644 --- a/src/a5/platform/onboard/host/aicpu_loader.cpp +++ b/src/a5/platform/onboard/host/aicpu_loader.cpp @@ -9,239 +9,39 @@ * ----------------------------------------------------------------------------------------------------------- */ /** - * AICPU Loader Implementation + * AICPU Loader Implementation (Legacy Interface) + * + * Provides AICPU kernel launching via the legacy rtAicpuKernelLaunchExWithArgs API. + * Used when BUILD_WITH_NEW_CANN is OFF. 
When BUILD_WITH_NEW_CANN is ON, + * device_runner uses LoadAicpuOp (src/common/host/load_aicpu_op.h) instead. */ #include "aicpu_loader.h" #include -#include -#include -#include -#include -#include #include "common/unified_log.h" #include "common/kernel_args.h" -#ifdef BUILD_WITH_NEW_CANN -// New CANN RTS header for rtsLaunchCpuKernel interface (CANN 7.0+) -#include "runtime/runtime/rts/rts_kernel.h" -#include "runtime/runtime/kernel.h" - -// Forward declarations for JSON structures -struct AicpuOpConfig { - std::string functionName; - std::string kernelSo; - std::string opKernelLib; - std::string computeCost = "100"; - std::string engine = "DNN_VM_AICPU"; - std::string flagAsync = "False"; - std::string flagPartial = "False"; - std::string userDefined = "False"; - std::string opType; -}; - -// Generate AICPU op info JSON file -// -// Note: This function manually constructs JSON without using a library. -// The kernel names and configuration values are controlled strings that do not -// contain special characters (quotes, backslashes, control characters). This -// matches the approach used in pypto's GenerateAicpuOpJson for similar AICPU -// op descriptors. If new kernels are added that may contain special characters, -// consider adding a JSON library dependency (e.g., nlohmann/json) or implementing -// proper string escaping. 
-static bool GenerateAicpuOpJson(const std::string &json_path, const std::vector &op_configs) { - std::ofstream json_file(json_path); - if (!json_file.is_open()) { - LOG_ERROR("Failed to open JSON file for writing: %s", json_path.c_str()); - return false; - } - - json_file << "{\n"; - for (size_t i = 0; i < op_configs.size(); ++i) { - const auto &config = op_configs[i]; - json_file << " \"" << config.opType << "\": {\n"; - json_file << " \"opInfo\": {\n"; - json_file << " \"functionName\": \"" << config.functionName << "\",\n"; - json_file << " \"kernelSo\": \"" << config.kernelSo << "\",\n"; - json_file << " \"opKernelLib\": \"" << config.opKernelLib << "\",\n"; - json_file << " \"computeCost\": \"" << config.computeCost << "\",\n"; - json_file << " \"engine\": \"" << config.engine << "\",\n"; - json_file << " \"flagAsync\": \"" << config.flagAsync << "\",\n"; - json_file << " \"flagPartial\": \"" << config.flagPartial << "\",\n"; - json_file << " \"userDefined\": \"" << config.userDefined << "\"\n"; - json_file << " }\n"; - json_file << " }" << (i < op_configs.size() - 1 ? "," : "") << "\n"; - } - json_file << "}\n"; - json_file.close(); - - LOG_INFO("Generated AICPU op info JSON: %s", json_path.c_str()); - return true; -} - -#endif - int AicpuLoader::init_with_binary( const std::vector &aicpu_binary, const std::vector &kernel_names ) { -#ifdef BUILD_WITH_NEW_CANN - // New interface: Load binary using JSON descriptor (pypto approach) - LOG_INFO("AicpuLoader: Using new rtsBinaryLoadFromFile + rtsLaunchCpuKernel interface"); - LOG_INFO("AicpuLoader: Binary size=%zu bytes", aicpu_binary.size()); - - // Step 1: Generate op info JSON at runtime (using only filename, not full path) - const char *tmp_dir = std::getenv("TMPDIR") ? 
std::getenv("TMPDIR") : "/tmp"; - std::string json_path_template = std::string(tmp_dir) + "/simpler_aicpu_op_info_XXXXXX.json"; - std::vector json_path_buffer(json_path_template.begin(), json_path_template.end()); - json_path_buffer.push_back('\0'); - - int json_fd = mkstemps(json_path_buffer.data(), 5); - if (json_fd == -1) { - LOG_ERROR("Failed to create temporary JSON file"); - return -1; - } - close(json_fd); - json_file_path_ = json_path_buffer.data(); - - // Map opType (external kernel name used by rtsFuncGetByName) to functionName - // (actual C++ symbol name in the .so file). - // - // This mapping is specific to the Ascend tile framework kernels: - // - DynTileFwkKernelServerInit -> DynTileFwkBackendKernelServerInit - // - DynTileFwkKernelServer -> DynTileFwkBackendKernelServer - // - // The opType names are used by the CANN runtime to look up kernels, while the - // functionName names are the actual symbols exported by the shared library. - // This mapping is defined here as it's specific to the tile framework's - // naming convention and is unlikely to change. 
- std::unordered_map name_mapping = { - {"DynTileFwkKernelServerInit", "DynTileFwkBackendKernelServerInit"}, - {"DynTileFwkKernelServer", "DynTileFwkBackendKernelServer"} - }; - - // Create op configs for JSON generation - // kernelSo uses only filename - runtime will find it via library search path - std::vector op_configs; - for (const auto &name : kernel_names) { - AicpuOpConfig config; - config.opType = name; - config.functionName = name_mapping[name]; - config.kernelSo = "libaicpu_kernel.so"; // Filename only, runtime searches library path - config.opKernelLib = "KFCKernel"; - op_configs.push_back(config); - } - - // Generate JSON file - if (!GenerateAicpuOpJson(json_file_path_, op_configs)) { - return -1; - } - - // Step 2: Load binary handle from JSON: rtsBinaryLoadFromFile - // cpuKernelMode=0: JSON only mode, runtime finds .so via library search path - rtLoadBinaryOption_t option = {}; - option.optionId = RT_LOAD_BINARY_OPT_CPU_KERNEL_MODE; - option.value.cpuKernelMode = 0; - - rtLoadBinaryConfig_t load_config = {}; - load_config.options = &option; - load_config.numOpt = 1; - - rtError_t rc = rtsBinaryLoadFromFile(json_file_path_.c_str(), &load_config, &binary_handle_); - if (rc != RT_ERROR_NONE) { - LOG_ERROR("rtsBinaryLoadFromFile failed for %s: %d", json_file_path_.c_str(), rc); - return rc; - } - LOG_INFO("AicpuLoader: Loaded binary from JSON, handle=%p", binary_handle_); - - // Step 3: Resolve function handles: rtsFuncGetByName - for (const auto &name : kernel_names) { - rtFuncHandle func_handle = nullptr; - rc = rtsFuncGetByName(binary_handle_, name.c_str(), &func_handle); - if (rc != RT_ERROR_NONE) { - LOG_ERROR("rtsFuncGetByName failed for %s: %d", name.c_str(), rc); - return rc; - } - func_handles_[name] = func_handle; - LOG_INFO("AicpuLoader: Resolved function handle for %s: %p", name.c_str(), func_handle); - } - - return 0; - -#else // Legacy interface: No pre-loading needed - (void)so_path; + (void)aicpu_binary; (void)kernel_names; 
LOG_INFO("AicpuLoader: Using legacy rtAicpuKernelLaunchExWithArgs interface"); return 0; -#endif } int AicpuLoader::init(const std::string &so_path, const std::vector &kernel_names) { -#ifdef BUILD_WITH_NEW_CANN - // New interface: Use init_with_binary() instead - // This init() is kept for backward compatibility but does nothing - (void)so_path; - (void)kernel_names; - LOG_INFO("AicpuLoader: Use init_with_binary() for new interface"); - return 0; -#else // Legacy interface: No pre-loading needed (void)so_path; (void)kernel_names; LOG_INFO("AicpuLoader: Using legacy rtAicpuKernelLaunchExWithArgs interface"); return 0; -#endif } int AicpuLoader::launch(rtStream_t stream, KernelArgs *k_args, const char *kernel_name, int aicpu_num) { -#ifdef BUILD_WITH_NEW_CANN - // New interface: rtsLaunchCpuKernel - auto it = func_handles_.find(kernel_name); - if (it == func_handles_.end()) { - LOG_ERROR("Kernel not found: %s", kernel_name); - return -1; - } - - rtFuncHandle func_handle = it->second; - - // Prepare args for new interface - struct Args { - KernelArgs k_args; - char kernel_name[64]; - char so_name[64]; - } args; - - args.k_args = *k_args; - std::strncpy(args.kernel_name, kernel_name, sizeof(args.kernel_name) - 1); - args.kernel_name[sizeof(args.kernel_name) - 1] = '\0'; - std::strncpy(args.so_name, "libaicpu_extend_kernels.so", sizeof(args.so_name) - 1); - args.so_name[sizeof(args.so_name) - 1] = '\0'; - - rtCpuKernelArgs_t cpu_args = {}; - cpu_args.baseArgs.args = &args; - cpu_args.baseArgs.argsSize = sizeof(args); - cpu_args.baseArgs.kernelNameAddrOffset = offsetof(struct Args, kernel_name); - cpu_args.baseArgs.soNameAddrOffset = offsetof(struct Args, so_name); - cpu_args.baseArgs.hostInputInfoPtr = nullptr; - cpu_args.baseArgs.kernelOffsetInfoPtr = nullptr; - cpu_args.baseArgs.hostInputInfoNum = 0; - cpu_args.baseArgs.kernelOffsetInfoNum = 0; - cpu_args.baseArgs.isNoNeedH2DCopy = 0; - cpu_args.baseArgs.timeout = 0; - cpu_args.cpuParamHeadOffset = 0; - - // Launch: 
rtsLaunchCpuKernel - rtError_t rc = rtsLaunchCpuKernel(func_handle, static_cast(aicpu_num), stream, nullptr, &cpu_args); - if (rc != RT_ERROR_NONE) { - LOG_ERROR("rtsLaunchCpuKernel failed for %s: %d", kernel_name, rc); - return rc; - } - - return 0; - -#else // Legacy interface: rtAicpuKernelLaunchExWithArgs struct Args { KernelArgs k_args; @@ -264,31 +64,8 @@ int AicpuLoader::launch(rtStream_t stream, KernelArgs *k_args, const char *kerne return rtAicpuKernelLaunchExWithArgs( rtKernelType_t::KERNEL_TYPE_AICPU_KFC, "AST_DYN_AICPU", aicpu_num, &rt_args, nullptr, stream, 0 ); -#endif } void AicpuLoader::finalize() { -#ifdef BUILD_WITH_NEW_CANN - // New interface: Unload binary and clear handles - if (binary_handle_ != nullptr) { - rtError_t rc = rtsBinaryUnload(binary_handle_); - if (rc != RT_ERROR_NONE) { - LOG_WARN("rtsBinaryUnload failed: %d", rc); - } - binary_handle_ = nullptr; - } - func_handles_.clear(); - - // Delete temporary JSON file if it was created - if (!json_file_path_.empty()) { - std::remove(json_file_path_.c_str()); - LOG_INFO("AicpuLoader: Deleted temporary JSON file: %s", json_file_path_.c_str()); - json_file_path_.clear(); - } - - LOG_INFO("AicpuLoader: Finalized new interface"); -#else // Legacy interface: No-op - (void)this; // Suppress unused warning -#endif } diff --git a/src/a5/platform/onboard/host/aicpu_loader.h b/src/a5/platform/onboard/host/aicpu_loader.h index 1b37afcde..3dd1390af 100644 --- a/src/a5/platform/onboard/host/aicpu_loader.h +++ b/src/a5/platform/onboard/host/aicpu_loader.h @@ -9,15 +9,11 @@ * ----------------------------------------------------------------------------------------------------------- */ /** - * AICPU Loader Abstraction + * AICPU Loader Abstraction (Legacy Interface) * - * This file provides an abstraction layer for AICPU kernel launching that supports - * both the legacy rtAicpuKernelLaunchExWithArgs API and the new rtsLaunchCpuKernel - * interface available in newer CANN versions. 
- * - * The interface used is controlled by the BUILD_WITH_NEW_CANN compile flag: - * - When undefined or OFF: Uses legacy rtAicpuKernelLaunchExWithArgs - * - When ON: Uses new rtsLaunchCpuKernel / rtsBinaryLoadFromFile / rtsFuncGetByName + * Provides AICPU kernel launching via the legacy rtAicpuKernelLaunchExWithArgs API. + * Used when BUILD_WITH_NEW_CANN is OFF. When BUILD_WITH_NEW_CANN is ON, + * device_runner uses LoadAicpuOp (src/common/host/load_aicpu_op.h) instead. */ #ifndef A5_PLATFORM_ONBOARD_HOST_AICPU_LOADER_H_ @@ -25,7 +21,6 @@ #include #include -#include #include #include @@ -34,9 +29,10 @@ struct KernelArgs; /** - * @brief AICPU kernel loader abstraction + * @brief AICPU kernel loader (legacy interface) * - * Supports both legacy and new CANN AICPU launch interfaces through conditional compilation. + * Launches AICPU kernels via the legacy rtAicpuKernelLaunchExWithArgs API. + * Used as the fallback when BUILD_WITH_NEW_CANN is OFF. */ class AicpuLoader { public: @@ -44,49 +40,22 @@ class AicpuLoader { ~AicpuLoader() = default; /** - * @brief Initialize the AICPU loader with binary data - * - * For the new interface (BUILD_WITH_NEW_CANN=ON), this generates a JSON descriptor - * and loads the binary using rtsBinaryLoadFromFile. The .so file is referenced by - * filename only (libaicpu_kernel.so) and must be findable via library search path. - * For the legacy interface (BUILD_WITH_NEW_CANN=OFF), this is a no-op. - * - * @param aicpu_binary Binary data of the AICPU shared library (not used, kept for API compatibility) - * @param kernel_names List of kernel function names to resolve - * @return 0 on success, error code on failure + * @brief Initialize the AICPU loader with binary data (no-op for legacy interface) */ int init_with_binary(const std::vector &aicpu_binary, const std::vector &kernel_names); /** - * @brief Initialize the AICPU loader (legacy interface compatibility) - * - * For the new interface (BUILD_WITH_NEW_CANN=ON), this does nothing. 
- * For the legacy interface (BUILD_WITH_NEW_CANN=OFF), this is a no-op. - * - * @param so_path Path to the AICPU shared library (not used) - * @param kernel_names List of kernel function names (not used) - * @return 0 on success, error code on failure + * @brief Initialize the AICPU loader (no-op for legacy interface) */ int init(const std::string &so_path, const std::vector &kernel_names); /** - * @brief Launch an AICPU kernel - * - * Unified interface that delegates to either legacy or new implementation. - * - * @param stream CUDA-style stream for execution - * @param k_args Kernel arguments - * @param kernel_name Name of the kernel to launch - * @param aicpu_num Number of AICPU instances to launch - * @return 0 on success, error code on failure + * @brief Launch an AICPU kernel via legacy rtAicpuKernelLaunchExWithArgs */ int launch(rtStream_t stream, KernelArgs *k_args, const char *kernel_name, int aicpu_num); /** - * @brief Cleanup resources - * - * For the new interface, this unloads the binary and clears handles. - * For the legacy interface, this is a no-op. 
+ * @brief Cleanup resources (no-op for legacy interface) */ void finalize(); @@ -95,16 +64,6 @@ class AicpuLoader { AicpuLoader &operator=(const AicpuLoader &) = delete; AicpuLoader(AicpuLoader &&) = delete; AicpuLoader &operator=(AicpuLoader &&) = delete; - -private: -#ifdef BUILD_WITH_NEW_CANN - // New interface members - void *binary_handle_ = nullptr; // Binary handle from rtsBinaryLoadFromFile - std::unordered_map func_handles_; // Function handles (kernel_name -> func_handle) - std::string json_file_path_; // Path to temporary JSON descriptor file -#else - // Legacy interface - no state needed -#endif }; #endif // A5_PLATFORM_ONBOARD_HOST_AICPU_LOADER_H_ diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp index a92703210..c0cc23d80 100644 --- a/src/a5/platform/onboard/host/device_runner.cpp +++ b/src/a5/platform/onboard/host/device_runner.cpp @@ -17,12 +17,34 @@ #include "device_runner.h" +#ifdef BUILD_WITH_NEW_CANN +#include "load_aicpu_op.h" +#endif + #include #include +#include #include #include #include +#ifdef BUILD_WITH_NEW_CANN + +static std::string resolve_dispatcher_so_path() { + Dl_info info; + if (dladdr(reinterpret_cast(resolve_dispatcher_so_path), &info) == 0 || info.dli_fname == nullptr) { + return ""; + } + std::string so_dir = info.dli_fname; + size_t pos = so_dir.rfind('/'); + if (pos == std::string::npos) { + return "libaicpu_dispatcher.so"; + } + so_dir = so_dir.substr(0, pos + 1); + return so_dir + "libaicpu_dispatcher.so"; +} +#endif + #include "callable.h" #include "host/host_regs.h" // Register address retrieval #include "host/raii_scope_guard.h" @@ -232,14 +254,14 @@ int DeviceRunner::ensure_binaries_loaded( aicore_kernel_binary_ = aicore_kernel_binary; #ifdef BUILD_WITH_NEW_CANN - // New interface: Initialize AICPU loader with binary data - const std::vector kernel_names = {"DynTileFwkKernelServerInit", "DynTileFwkKernelServer"}; - int rc = 
aicpu_loader_.init_with_binary(aicpu_so_binary, kernel_names); + // New interface: Initialize LoadAicpuOp (loads dispatcher SO) + std::string dispatcher_so_path = resolve_dispatcher_so_path(); + int rc = load_aicpu_op_.Init(dispatcher_so_path); if (rc != 0) { - LOG_ERROR("AicpuLoader init_with_binary failed: %d", rc); + LOG_ERROR("LoadAicpuOp::Init failed: %d", rc); return rc; } - LOG_INFO("DeviceRunner: AICPU loader initialized with binary data"); + LOG_INFO("DeviceRunner: LoadAicpuOp initialized"); #else int rc = 0; #endif @@ -407,6 +429,28 @@ int DeviceRunner::run( return rc; } +#ifdef BUILD_WITH_NEW_CANN + // Three-phase launch pattern with dispatcher: + // 1. Load (Null) - Pass inner SO binary to dispatcher + // 2. Init - Initialize inner SO + // 3. Run - Execute actual kernel + std::cout << "\n=== launch_aicpu_kernel DynTileFwkKernelServerNull (Load) ===" << '\n'; + rc = launch_aicpu_kernel(stream_aicpu_, &kernel_args_.args, "DynTileFwkKernelServerNull", 1); + if (rc != 0) { + LOG_ERROR("launch_aicpu_kernel (load/null) failed: %d", rc); + return rc; + } + + std::cout << "\n=== launch_aicpu_kernel DynTileFwkKernelServerInit (Init) ===" << '\n'; + // Launch AICPU init kernel + rc = launch_aicpu_kernel(stream_aicpu_, &kernel_args_.args, "DynTileFwkKernelServerInit", 1); + if (rc != 0) { + LOG_ERROR("launch_aicpu_kernel (init) failed: %d", rc); + return rc; + } + + std::cout << "\n=== launch_aicpu_kernel DynTileFwkKernelServer (Run) ===" << '\n'; +#else std::cout << "\n=== launch_aicpu_kernel DynTileFwkKernelServerInit===" << '\n'; // Launch AICPU init kernel rc = launch_aicpu_kernel(stream_aicpu_, &kernel_args_.args, "DynTileFwkKernelServerInit", 1); @@ -416,6 +460,7 @@ int DeviceRunner::run( } std::cout << "\n=== launch_aicpu_kernel DynTileFwkKernelServer===" << '\n'; +#endif // Launch AICPU main kernel (over-launch for affinity gate) rc = launch_aicpu_kernel( stream_aicpu_, &kernel_args_.args, "DynTileFwkKernelServer", 
PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH @@ -493,8 +538,12 @@ int DeviceRunner::finalize() { // Cleanup AICPU SO so_info_.finalize(); +#ifdef BUILD_WITH_NEW_CANN + // LoadAicpuOp cleanup happens automatically in destructor +#else // Cleanup AICPU loader aicpu_loader_.finalize(); +#endif // Kernel binaries should have been removed by validate_runtime_impl() if (!func_id_to_addr_.empty()) { @@ -536,7 +585,23 @@ int DeviceRunner::finalize() { } int DeviceRunner::launch_aicpu_kernel(rtStream_t stream, KernelArgs *k_args, const char *kernel_name, int aicpu_num) { +#ifdef BUILD_WITH_NEW_CANN + // Map kernel name to LoadAicpuOp function name + std::string func_name; + if (std::strcmp(kernel_name, "DynTileFwkKernelServerInit") == 0) { + func_name = host::KernelNames::InitName; + } else if (std::strcmp(kernel_name, "DynTileFwkKernelServer") == 0) { + func_name = host::KernelNames::RunName; + } else if (std::strcmp(kernel_name, "DynTileFwkKernelServerNull") == 0) { + func_name = host::KernelNames::NullName; + } else { + LOG_ERROR("Unknown kernel name: %s", kernel_name); + return -1; + } + return load_aicpu_op_.LaunchBuiltInOp(stream, k_args, aicpu_num, func_name, kernel_name); +#else return aicpu_loader_.launch(stream, k_args, kernel_name, aicpu_num); +#endif } int DeviceRunner::launch_aicore_kernel(rtStream_t stream, Runtime *runtime) { diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h index c94595e8f..e9347b5fb 100644 --- a/src/a5/platform/onboard/host/device_runner.h +++ b/src/a5/platform/onboard/host/device_runner.h @@ -43,7 +43,11 @@ #include "host/function_cache.h" #include "host/memory_allocator.h" #include "host/performance_collector.h" +#ifdef BUILD_WITH_NEW_CANN +#include "load_aicpu_op.h" +#else #include "aicpu_loader.h" +#endif #include "runtime.h" /** @@ -344,7 +348,11 @@ class DeviceRunner { std::vector aicore_kernel_binary_; // AICPU loader abstraction (supports both legacy and new CANN interfaces) 
+#ifdef BUILD_WITH_NEW_CANN + host::LoadAicpuOp load_aicpu_op_; +#else AicpuLoader aicpu_loader_; +#endif // Memory management MemoryAllocator mem_alloc_; diff --git a/src/common/aicpu_dispatcher/CMakeLists.txt b/src/common/aicpu_dispatcher/CMakeLists.txt new file mode 100644 index 000000000..833459afb --- /dev/null +++ b/src/common/aicpu_dispatcher/CMakeLists.txt @@ -0,0 +1,52 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +# Build AICPU Dispatcher SO - Two-layer architecture for runtime-specific AICPU kernels +cmake_minimum_required(VERSION 3.16.3) + +project(aicpu_dispatcher LANGUAGES C CXX) + +# Dispatcher SO sources +set(AICPU_DISPATCHER_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/aicpu_dispatcher.cpp" +) + +# Create shared library +add_library(aicpu_dispatcher SHARED ${AICPU_DISPATCHER_SOURCES}) + +# C++ standard +set_target_properties(aicpu_dispatcher PROPERTIES + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON +) + +# Compile options (matching AICPU pattern) +target_compile_options(aicpu_dispatcher + PRIVATE + -Wall + -Wextra + -fPIC + -O3 + -g +) + +# Include directories +target_include_directories(aicpu_dispatcher + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + "${CMAKE_CURRENT_SOURCE_DIR}/../../../.." 
# For common/unified_log.h +) + +# Link against dl for dlopen/dlsym +target_link_libraries(aicpu_dispatcher + PRIVATE + dl +) + +# Set output name +set_target_properties(aicpu_dispatcher PROPERTIES OUTPUT_NAME "aicpu_dispatcher") diff --git a/src/common/aicpu_dispatcher/README.md b/src/common/aicpu_dispatcher/README.md new file mode 100644 index 000000000..2f462139e --- /dev/null +++ b/src/common/aicpu_dispatcher/README.md @@ -0,0 +1,38 @@ +# AICPU Dispatcher SO + +Two-layer architecture for runtime-specific AICPU kernels. + +## Architecture + +The dispatcher SO provides a two-layer architecture where: + +- **Outer layer (this SO)** is fixed and handles dynamic SO loading +- **Inner layer (runtime-specific SO)** can be different for each runtime + +This allows different runtimes (tensormap, ringbuffer, etc.) to load their own AICPU kernel implementations at runtime without recompiling the dispatcher. + +## Exported Functions + +Three C-style exported functions (AICPU entry points): + +1. `DynTileFwkKernelServerNull` - Load phase: receives inner SO binary, saves to filesystem +2. `DynTileFwkKernelServerInit` - Init phase: delegates to inner SO's initialization +3. 
`DynTileFwkKernelServer` - Run phase: delegates to inner SO's execution + +## BackendServerHandleManager + +Internal class that manages the lifecycle of the inner SO: + +- `SaveSoFile()` - Saves inner SO binary to `/tmp/aicpu_kernels/` +- `SetTileFwkKernelMap()` - Loads init and run functions from inner SO using dlopen/dlsym +- `ExecuteFunc()` - Executes inner SO functions with provided arguments + +## Function Key Mapping + +- `dyInitFuncKey = 2` - Initialization function +- `dyExecFuncKey = 3` - Execution function + +## Reference + +Based on pypto's implementation: +`/data/fangjingzhi/pypto/framework/src/machine/device/machine_interface/pypto_aicpu_interface.{h,cpp}` diff --git a/src/common/aicpu_dispatcher/aicpu_dispatcher.cpp b/src/common/aicpu_dispatcher/aicpu_dispatcher.cpp new file mode 100644 index 000000000..1825f042f --- /dev/null +++ b/src/common/aicpu_dispatcher/aicpu_dispatcher.cpp @@ -0,0 +1,313 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * AICPU Dispatcher SO Implementation + */ + +#include "aicpu_dispatcher.h" + +#include +#include +#include +#include +#include +#include +#include + +// Weak symbol fallback implementations for unified_log_* functions. 
+// When dispatcher SO is loaded independently by the AICPU scheduler daemon +// (via dlopen), these weak symbols provide a minimal stderr-based logger. +// When linked into host_runtime.so, the strong symbols from unified_log_host.cpp +// take precedence. +extern "C" { + +__attribute__((weak)) void unified_log_error(const char* func, const char* fmt, ...) { + va_list args; + va_start(args, fmt); + fprintf(stderr, "[ERROR] [%s] ", func); + vfprintf(stderr, fmt, args); + fprintf(stderr, "\n"); + va_end(args); +} + +__attribute__((weak)) void unified_log_warn(const char* func, const char* fmt, ...) { + va_list args; + va_start(args, fmt); + fprintf(stderr, "[WARN] [%s] ", func); + vfprintf(stderr, fmt, args); + fprintf(stderr, "\n"); + va_end(args); +} + +__attribute__((weak)) void unified_log_info(const char* func, const char* fmt, ...) { + va_list args; + va_start(args, fmt); + fprintf(stderr, "[INFO] [%s] ", func); + vfprintf(stderr, fmt, args); + fprintf(stderr, "\n"); + va_end(args); +} + +__attribute__((weak)) void unified_log_debug(const char* func, const char* fmt, ...) { + va_list args; + va_start(args, fmt); + fprintf(stderr, "[DEBUG] [%s] ", func); + vfprintf(stderr, fmt, args); + fprintf(stderr, "\n"); + va_end(args); +} + +__attribute__((weak)) void unified_log_always(const char* func, const char* fmt, ...) { + va_list args; + va_start(args, fmt); + fprintf(stderr, "[ALWAYS] [%s] ", func); + vfprintf(stderr, fmt, args); + fprintf(stderr, "\n"); + va_end(args); +} + +} // extern "C" + +// Forward declarations for simpler's KernelArgs and DeviceArgs structures. +// These MUST match the layouts defined in platform-specific kernel_args.h: +// src/a2a3/platform/include/common/kernel_args.h +// src/a5/platform/include/common/kernel_args.h +// +// Both platforms share the same layout for fields accessed here (device_args, +// runtime_args). a2a3 has an additional ffts_base_addr field at the end which +// this code does not access. 
The static_assert below ensures this struct is
+// at least as large as the minimum platform layout.
+//
+// NOTE(review): the static_asserts further down only check these local mirror
+// structs against hard-coded sizes/offsets. They cannot detect drift in the
+// real platform headers, so the two definitions must be kept in sync by hand.
+struct KernelArgs {
+    uint64_t unused[5] = {0};
+    void* device_args{nullptr};  // Pointer to DeviceArgs in device memory
+    void* runtime_args{nullptr};
+    uint64_t regs{0};
+};
+
+// DeviceArgs structure as passed from DeviceRunner.
+// Must match the layout in platform-specific DeviceArgs (host/device_runner.h).
+// Only the two trailing fields (inner SO address and length) are read here.
+struct DeviceArgs {
+    uint64_t unused[12] = {0};
+    uint64_t aicpu_so_bin{0};
+    uint64_t aicpu_so_len{0};
+};
+
+static_assert(sizeof(KernelArgs) >= 64, "KernelArgs layout mismatch with platform kernel_args.h");
+static_assert(sizeof(DeviceArgs) >= 112, "DeviceArgs layout mismatch with platform DeviceArgs");
+static_assert(offsetof(KernelArgs, device_args) == 40, "KernelArgs::device_args offset mismatch");
+static_assert(offsetof(KernelArgs, runtime_args) == 48, "KernelArgs::runtime_args offset mismatch");
+static_assert(offsetof(DeviceArgs, aicpu_so_bin) == 96, "DeviceArgs::aicpu_so_bin offset mismatch");
+static_assert(offsetof(DeviceArgs, aicpu_so_len) == 104, "DeviceArgs::aicpu_so_len offset mismatch");
+
+namespace aicpu_dispatcher {
+
+// Closes the inner SO handle, if one is still open.
+BackendServerHandleManager::~BackendServerHandleManager()
+{
+    if (soHandle_ != nullptr) {
+        LOG_INFO("Closing inner SO handle: %s", innerSoName_.c_str());
+        dlclose(soHandle_);
+        soHandle_ = nullptr;
+    }
+}
+
+// Writes the inner SO image to
+// /tmp/aicpu_kernels/libaicpu_dispatcher_runtime_<deviceId>.so and records that
+// path in innerSoName_.
+//
+// The image is first written to "<path>.tmp" and then rename()d over the final
+// name, so a copy that is already dlopen()ed (and therefore mapped) by this
+// process is never truncated or rewritten in place.
+//
+// @param data      Start of the SO image; must be non-null when len > 0.
+// @param len       Image size in bytes; 0 is treated as "nothing to save".
+// @param deviceId  Used only to build the file name.
+// @return true on success or when len == 0, false on any failure.
+//
+// NOTE(review): /tmp/aicpu_kernels is a fixed, world-accessible location and the
+// saved file is later dlopen()ed. Confirm that no untrusted local user can
+// pre-create or replace files there on the target system.
+bool BackendServerHandleManager::SaveSoFile(char* data, const uint64_t& len, uint8_t deviceId)
+{
+    std::lock_guard<std::mutex> lock(funcLock_);
+
+    if (len < 1) {
+        LOG_WARN("AICPU SO len is %lu, skipping save", len);
+        return true;  // Don't fail for empty SO
+    }
+    if (data == nullptr) {
+        LOG_ERROR("AICPU SO data is null but len is %lu", len);
+        return false;
+    }
+
+    // Generate inner SO file path based on device ID
+    // Use /tmp/aicpu_kernels/ for better portability (no root requirement)
+    const std::string dir_path = "/tmp/aicpu_kernels";
+    innerSoName_ = dir_path + "/libaicpu_dispatcher_runtime_" + std::to_string(deviceId) + ".so";
+
+    // Create the directory unconditionally and tolerate EEXIST. This replaces the
+    // previous stat()+mkdir() pair, which failed when another process created the
+    // directory between the two calls.
+    if (mkdir(dir_path.c_str(), 0755) == 0) {
+        LOG_INFO("Created directory: %s", dir_path.c_str());
+    } else if (errno != EEXIST) {
+        LOG_ERROR("Failed to create directory %s: %s", dir_path.c_str(), strerror(errno));
+        return false;
+    }
+
+    LOG_INFO("Saving inner AICPU SO to device %u: %s (size=%lu bytes)", deviceId, innerSoName_.c_str(), len);
+
+    const std::string tmp_path = innerSoName_ + ".tmp";
+    std::ofstream file(tmp_path, std::ios::out | std::ios::binary | std::ios::trunc);
+    if (!file.is_open()) {
+        LOG_ERROR("Failed to create inner SO file: %s", tmp_path.c_str());
+        return false;
+    }
+
+    // Write binary to file; close() flushes, so check the stream state afterwards.
+    file.write(data, static_cast<std::streamsize>(len));
+    file.close();
+    if (!file.good()) {
+        LOG_ERROR("Failed to write inner SO file: %s", tmp_path.c_str());
+        (void)remove(tmp_path.c_str());
+        return false;
+    }
+
+    // Atomically publish the new image under its final name.
+    if (rename(tmp_path.c_str(), innerSoName_.c_str()) != 0) {
+        LOG_ERROR("Failed to move %s to %s: %s", tmp_path.c_str(), innerSoName_.c_str(), strerror(errno));
+        (void)remove(tmp_path.c_str());
+        return false;
+    }
+
+    LOG_INFO("Successfully saved inner AICPU SO for device %u: %s", deviceId, innerSoName_.c_str());
+    return true;
+}
+
+// Resolves the init and run entry points of the inner SO (once).
+//
+// firstLoadSo_ is latched only when BOTH symbols were resolved, so a failed
+// attempt can be retried by a later load request instead of being silently
+// treated as "already loaded".
+//
+// NOTE(review): once latched, a later SaveSoFile() with a different image does
+// not cause a reload; the previously loaded functions keep being used.
+void BackendServerHandleManager::SetTileFwkKernelMap()
+{
+    std::lock_guard<std::mutex> lock(funcLock_);
+
+    if (firstLoadSo_) {
+        return;  // Already loaded
+    }
+
+    // Load init function from inner SO
+    (void)LoadTileFwkKernelFunc(DY_TILE_FWK_BACKEND_KERNEL_SERVER_INIT);
+    // Load run function from inner SO
+    (void)LoadTileFwkKernelFunc(DY_TILE_FWK_BACKEND_KERNEL_SERVER);
+
+    firstLoadSo_ = kernelKey2FuncHandle_.count(dyInitFuncKey) != 0 &&
+                   kernelKey2FuncHandle_.count(dyExecFuncKey) != 0;
+}
+
+// Looks up the function registered under funcKey and calls it with args.
+//
+// @return the callee's return value, or -1 when no function is registered.
+//
+// NOTE(review): the lookup is not performed under funcLock_; this presumably
+// relies on the load phase having completed before init/run are launched.
+int BackendServerHandleManager::ExecuteFunc(void* args, const uint64_t funcKey)
+{
+    auto func = GetTileFwkKernelFunc(funcKey);
+    if (func == nullptr) {
+        LOG_ERROR("Function key %lu not found in inner SO %s", funcKey, innerSoName_.c_str());
+        return -1;
+    }
+
+    return func(args);
+}
+
+// dlopen()s the inner SO on first use and registers the symbol kernelName under
+// its function key. Does nothing if that key is already registered.
+//
+// On dlsym() failure the SO handle is closed. Every function pointer obtained
+// from that handle is dropped first, so no dangling pointer into the unloaded
+// library can be returned by GetTileFwkKernelFunc().
+//
+// Callers must hold funcLock_ (SetTileFwkKernelMap does).
+void BackendServerHandleManager::LoadTileFwkKernelFunc(const std::string& kernelName)
+{
+    if (soHandle_ == nullptr) {
+        soHandle_ = dlopen(innerSoName_.c_str(), RTLD_LAZY | RTLD_DEEPBIND);
+        if (soHandle_ == nullptr) {
+            const char* error = dlerror();
+            LOG_ERROR("Failed to dlopen inner SO %s: %s", innerSoName_.c_str(), error ? error : "unknown error");
+            return;
+        }
+        LOG_INFO("Successfully dlopened inner SO: %s", innerSoName_.c_str());
+    }
+
+    // Map kernel name to function key
+    uint64_t funcKey = 0;
+    if (kernelName == DY_TILE_FWK_BACKEND_KERNEL_SERVER_INIT) {
+        funcKey = dyInitFuncKey;
+    } else if (kernelName == DY_TILE_FWK_BACKEND_KERNEL_SERVER) {
+        funcKey = dyExecFuncKey;
+    } else {
+        LOG_ERROR("Unknown kernel name: %s", kernelName.c_str());
+        return;
+    }
+
+    LOG_DEBUG("Loading function: name=%s, funcKey=%lu", kernelName.c_str(), funcKey);
+
+    // Skip if function is already loaded
+    if (kernelKey2FuncHandle_.find(funcKey) != kernelKey2FuncHandle_.end()) {
+        LOG_DEBUG("Function already loaded: %s (funcKey=%lu)", kernelName.c_str(), funcKey);
+        return;
+    }
+
+    // Clear any stale error state so the dlerror() below refers to this dlsym().
+    (void)dlerror();
+    AicpuKernelFunc funcEntry = reinterpret_cast<AicpuKernelFunc>(dlsym(soHandle_, kernelName.c_str()));
+    if (funcEntry == nullptr) {
+        const char* error = dlerror();
+        LOG_ERROR("Failed to dlsym %s from %s: %s",
+                  kernelName.c_str(), innerSoName_.c_str(), error ? error : "unknown error");
+        // Entries registered earlier point into the library we are about to unload.
+        kernelKey2FuncHandle_.clear();
+        (void)dlclose(soHandle_);
+        soHandle_ = nullptr;
+        return;
+    }
+    LOG_INFO("Successfully loaded function: %s from %s", kernelName.c_str(), innerSoName_.c_str());
+    kernelKey2FuncHandle_[funcKey] = funcEntry;
+}
+
+// Returns the function registered under funcKey, or nullptr (after logging an
+// error) when there is none.
+AicpuKernelFunc BackendServerHandleManager::GetTileFwkKernelFunc(const uint64_t funcKey)
+{
+    auto iter = kernelKey2FuncHandle_.find(funcKey);
+    if (iter != kernelKey2FuncHandle_.end()) {
+        return iter->second;
+    }
+    LOG_ERROR("Function key %lu not found", funcKey);
+    return nullptr;
+}
+
+}  // namespace aicpu_dispatcher
+
+namespace {
+
+// Global instance of the handle manager
+aicpu_dispatcher::BackendServerHandleManager g_handleManager;
+
+}  // namespace
+
+// C-style exported functions (AICPU entry points)
+extern "C" {
+
+// Load phase: reads the inner SO address/length from KernelArgs->device_args,
+// saves the image to disk and resolves its init/run entry points.
+// Returns 0 on success (including an empty image), 1 on failure.
+//
+// NOTE(review): SaveSoFile is called without a device id, so its default (0) is
+// always used and every device shares one file name.
+__attribute__((visibility("default"))) uint32_t DynTileFwkKernelServerNull(void* args)
+{
+    if (args == nullptr) {
+        LOG_ERROR("Dispatcher Load: args is null");
+        return 1;
+    }
+
+    auto* kargs = reinterpret_cast<KernelArgs*>(args);
+    auto* devArgs = reinterpret_cast<DeviceArgs*>(kargs->device_args);
+    if (devArgs == nullptr) {
+        LOG_ERROR("Dispatcher Load: DeviceArgs is null");
+        return 1;
+    }
+
+    auto* data = reinterpret_cast<char*>(devArgs->aicpu_so_bin);
+    if (devArgs->aicpu_so_len == 0) {
+        LOG_WARN("Dispatcher Load: inner SO binary is empty, skipping load");
+        return 0;
+    }
+
+    if (!g_handleManager.SaveSoFile(data, devArgs->aicpu_so_len)) {
+        LOG_ERROR("Dispatcher Load: failed to save inner SO");
+        return 1;
+    }
+    g_handleManager.SetTileFwkKernelMap();
+    return 0;
+}
+
+// Init phase: forwards args to the inner SO's init function.
+// Returns 0 on success, 1 if the function is missing or returned non-zero.
+__attribute__((visibility("default"))) uint32_t DynTileFwkKernelServerInit(void* args)
+{
+    auto ret = g_handleManager.ExecuteFunc(args, aicpu_dispatcher::dyInitFuncKey);
+    if (ret != 0) {
+        LOG_ERROR("Dispatcher Init: inner SO init failed with code %d", ret);
+        return 1;
+    }
+    return 0;
+}
+
+// Run phase: forwards args to the inner SO's run function.
+// Returns 0 on success, 1 if the function is missing or returned non-zero.
+__attribute__((visibility("default"))) uint32_t DynTileFwkKernelServer(void* args)
+{
+    auto ret = g_handleManager.ExecuteFunc(args, aicpu_dispatcher::dyExecFuncKey);
+    if (ret
!= 0) { + LOG_ERROR("Dispatcher Run: inner SO run failed with code %d", ret); + return 1; + } + return 0; +} + +} // extern "C" diff --git a/src/common/aicpu_dispatcher/aicpu_dispatcher.h b/src/common/aicpu_dispatcher/aicpu_dispatcher.h new file mode 100644 index 000000000..f0d5dacb7 --- /dev/null +++ b/src/common/aicpu_dispatcher/aicpu_dispatcher.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * AICPU Dispatcher SO - Two-layer architecture for runtime-specific AICPU kernels + * + * This dispatcher SO provides a two-layer architecture where: + * - Outer layer (this SO) is fixed and handles dynamic SO loading + * - Inner layer (runtime-specific SO) can be different for each runtime + * + * Architecture: + * 1. DynTileFwkKernelServerNull - Load phase: receives inner SO binary, saves to AICPU filesystem + * 2. DynTileFwkKernelServerInit - Init phase: delegates to inner SO's initialization + * 3. DynTileFwkKernelServer - Run phase: delegates to inner SO's execution + * + * This allows different runtimes (tensormap, ringbuffer, etc.) to load their own + * AICPU kernel implementations at runtime without recompiling the dispatcher. 
+ */ + +#ifndef COMMON_AICPU_DISPATCHER_AICPU_DISPATCHER_H_ +#define COMMON_AICPU_DISPATCHER_AICPU_DISPATCHER_H_ + +#include +#include +#include +#include +#include +#include + +#include "common/unified_log.h" + +// Function pointer type for AICPU kernel functions +using AicpuKernelFunc = int (*)(void*); + +namespace aicpu_dispatcher { + +// Function key constants for inner SO function lookup +constexpr uint64_t dyInitFuncKey = 2; +constexpr uint64_t dyExecFuncKey = 3; + +// Kernel name constants (actual symbol names in inner SO) +constexpr char const* DY_TILE_FWK_BACKEND_KERNEL_SERVER_INIT = "DynTileFwkBackendKernelServerInit"; +constexpr char const* DY_TILE_FWK_BACKEND_KERNEL_SERVER = "DynTileFwkBackendKernelServer"; + +/** + * @brief Backend server handle manager for two-layer SO architecture + * + * Manages the lifecycle of the inner SO: + * - Saves inner SO binary to /tmp/aicpu_kernels/ + * - Loads functions from inner SO using dlopen/dlsym + * - Executes inner SO functions with provided arguments + * + * Data flow: + * - Host passes inner SO binary via DeviceArgs (aicpu_so_bin, aicpu_so_len) + * - Dispatcher's Null function receives KernelArgs->device_args pointer + * - Binary is saved to filesystem and inner SO is loaded via dlopen + */ +class BackendServerHandleManager { +public: + BackendServerHandleManager() = default; + ~BackendServerHandleManager(); + + /** + * @brief Save inner SO binary to AICPU filesystem + * + * @param data Pointer to inner SO binary data + * @param len Length of the binary data + * @param deviceId Device ID for SO naming + * @return true on success, false on failure + */ + bool SaveSoFile(char* data, const uint64_t& len, uint8_t deviceId = 0); + + /** + * @brief Load function symbols from inner SO + * + * Loads the init and run functions from the saved inner SO using dlopen/dlsym. 
+ */ + void SetTileFwkKernelMap(); + + /** + * @brief Execute a function from the inner SO + * + * @param args Arguments to pass to the function + * @param funcKey Function key (2=init, 3=run) + * @return Return value from the function, or error code + */ + int ExecuteFunc(void* args, const uint64_t funcKey); + +private: + /** + * @brief Load a specific function from the inner SO + * + * @param kernelName Name of the function to load (symbol name in inner SO) + */ + void LoadTileFwkKernelFunc(const std::string& kernelName); + + /** + * @brief Get a loaded function by its key + * + * @param funcKey Function key (2=init, 3=run) + * @return Function pointer, or nullptr if not found + */ + AicpuKernelFunc GetTileFwkKernelFunc(const uint64_t funcKey); + + std::unordered_map kernelKey2FuncHandle_; + std::mutex funcLock_; + void* soHandle_ = nullptr; + bool firstLoadSo_ = false; + std::string innerSoName_; +}; + +} // namespace aicpu_dispatcher + +// C-style exported functions (AICPU entry points) +extern "C" { + __attribute__((visibility("default"))) uint32_t DynTileFwkKernelServerNull(void* args); + __attribute__((visibility("default"))) uint32_t DynTileFwkKernelServerInit(void* args); + __attribute__((visibility("default"))) uint32_t DynTileFwkKernelServer(void* args); +} + +#endif // COMMON_AICPU_DISPATCHER_AICPU_DISPATCHER_H_ diff --git a/src/common/host/CMakeLists.txt b/src/common/host/CMakeLists.txt new file mode 100644 index 000000000..577d062b1 --- /dev/null +++ b/src/common/host/CMakeLists.txt @@ -0,0 +1,20 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +# Build host-side AICPU operation loader +cmake_minimum_required(VERSION 3.16.3) + +project(host_common LANGUAGES C CXX) + +# Host common sources +set(HOST_COMMON_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/load_aicpu_op.cpp" +) + +# This library is included directly in host_runtime, not built separately +# Sources are added to HOST_RUNTIME_SOURCES in platform CMakeLists.txt diff --git a/src/common/host/load_aicpu_op.cpp b/src/common/host/load_aicpu_op.cpp new file mode 100644 index 000000000..661ae3507 --- /dev/null +++ b/src/common/host/load_aicpu_op.cpp @@ -0,0 +1,203 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * AICPU Operation Loader Implementation
+ */
+
+#include "load_aicpu_op.h"
+
+#include <cstdio>
+#include <fstream>
+#include <memory>
+#include <vector>
+
+#include "common/unified_log.h"
+
+#ifdef BUILD_WITH_NEW_CANN
+
+namespace host {
+
+namespace {
+
+// Deletes the generated JSON descriptor, if any, and forgets its path.
+void RemoveJsonFile(std::string& json_path)
+{
+    if (json_path.empty()) {
+        return;
+    }
+    if (std::remove(json_path.c_str()) == 0) {
+        LOG_INFO("LoadAicpuOp: Deleted temporary JSON file: %s", json_path.c_str());
+    } else {
+        LOG_WARN("LoadAicpuOp: Failed to delete temporary JSON file: %s", json_path.c_str());
+    }
+    json_path.clear();
+}
+
+// Unloads a binary returned by rtsBinaryLoadFromFile and nulls the handle.
+void UnloadBinary(void*& binary_handle)
+{
+    if (binary_handle == nullptr) {
+        return;
+    }
+    const rtError_t rc = rtsBinaryUnload(binary_handle);
+    if (rc != RT_ERROR_NONE) {
+        LOG_WARN("rtsBinaryUnload failed: %d", rc);
+    }
+    binary_handle = nullptr;
+}
+
+}  // namespace
+
+// Unloads the dispatcher binary, drops the resolved function handles and
+// removes the generated JSON descriptor.
+// (This whole file is already compiled only under BUILD_WITH_NEW_CANN, so no
+// inner #ifdef is needed here.)
+LoadAicpuOp::~LoadAicpuOp()
+{
+    UnloadBinary(binary_handle_);
+    func_handles_.clear();
+    RemoveJsonFile(json_file_path_);
+}
+
+// Writes the op-info JSON describing the three dispatcher kernels
+// (PyptoInit / PyptoRun / PyptoNull) to json_path.
+//
+// @param json_path  File to create (overwritten if it exists).
+// @param kernel_so  Value written to every entry's "kernelSo" field.
+// @return true if the file was opened and fully written, false otherwise.
+bool LoadAicpuOp::GenerateAicpuOpJson(const std::string& json_path, const std::string& kernel_so)
+{
+    std::ofstream json_file(json_path);
+    if (!json_file.is_open()) {
+        LOG_ERROR("Failed to open JSON file for writing: %s", json_path.c_str());
+        return false;
+    }
+
+    // All three entries differ only in op type, symbol name and kernel library.
+    const auto make_config = [&kernel_so](const char* op_type, const char* function_name, const char* kernel_lib) {
+        AicpuOpConfig config;
+        config.opType = op_type;
+        config.functionName = function_name;
+        config.kernelSo = kernel_so;
+        config.opKernelLib = kernel_lib;
+        return config;
+    };
+
+    const std::vector<AicpuOpConfig> op_configs = {
+        make_config(KernelNames::InitName, "DynTileFwkKernelServerInit", "KFCKernel"),
+        make_config(KernelNames::RunName, "DynTileFwkKernelServer", "KFCKernel"),
+        make_config(KernelNames::NullName, "DynTileFwkKernelServerNull", "AICPUKernel"),
+    };
+
+    json_file << "{\n";
+    for (size_t i = 0; i < op_configs.size(); ++i) {
+        const auto& config = op_configs[i];
+        json_file << "  \"" << config.opType << "\": {\n";
+        json_file << "    \"opInfo\": {\n";
+        json_file << "      \"functionName\": \"" << config.functionName << "\",\n";
+        json_file << "      \"kernelSo\": \"" << config.kernelSo << "\",\n";
+        json_file << "      \"opKernelLib\": \"" << config.opKernelLib << "\",\n";
+        json_file << "      \"computeCost\": \"" << config.computeCost << "\",\n";
+        json_file << "      \"engine\": \"" << config.engine << "\",\n";
+        json_file << "      \"flagAsync\": \"" << config.flagAsync << "\",\n";
+        json_file << "      \"flagPartial\": \"" << config.flagPartial << "\",\n";
+        json_file << "      \"userDefined\": \"" << config.userDefined << "\"\n";
+        json_file << "    }\n";
+        json_file << "  }" << (i + 1 < op_configs.size() ? "," : "") << "\n";
+    }
+    json_file << "}\n";
+    json_file.close();
+
+    // close() flushes; a short write (e.g. disk full) must not be reported as success.
+    if (!json_file) {
+        LOG_ERROR("Failed to write JSON file: %s", json_path.c_str());
+        return false;
+    }
+
+    LOG_INFO("Generated AICPU op info JSON: %s", json_path.c_str());
+    return true;
+}
+
+// Generates the JSON descriptor next to the dispatcher SO, loads it through
+// rtsBinaryLoadFromFile and resolves the three kernel function handles.
+//
+// @param dispatcher_so_path  Path to the dispatcher SO; its file name must
+//                            contain ".so".
+// @return 0 on success, -1 on a path/JSON error, otherwise the failing RTS
+//         error code. On failure the object is left uninitialized (no binary
+//         loaded, no handles, no JSON file).
+int LoadAicpuOp::Init(const std::string& dispatcher_so_path)
+{
+    // Release anything left over from a previous Init() so a repeated call
+    // does not leak the earlier binary handle or JSON file.
+    UnloadBinary(binary_handle_);
+    func_handles_.clear();
+    RemoveJsonFile(json_file_path_);
+
+    // Step 1: Derive the JSON path.
+    // The JSON lives in the same directory as the SO, with the same basename
+    // e.g. /path/libaicpu_dispatcher.so -> /path/libaicpu_dispatcher.json
+    // cpuKernelMode=1 derives the SO path by replacing .json with .so
+    const size_t last_slash = dispatcher_so_path.rfind('/');
+    const bool has_dir = (last_slash != std::string::npos);
+    const std::string so_dir = has_dir ? dispatcher_so_path.substr(0, last_slash + 1) : std::string();
+    const std::string so_basename = has_dir ? dispatcher_so_path.substr(last_slash + 1) : dispatcher_so_path;
+
+    const size_t so_ext = so_basename.rfind(".so");
+    if (so_ext == std::string::npos) {
+        // Without a ".so" to replace, the JSON path would equal the SO path and
+        // writing the descriptor would overwrite the SO itself.
+        LOG_ERROR("LoadAicpuOp: dispatcher SO name has no .so extension: %s", dispatcher_so_path.c_str());
+        return -1;
+    }
+    json_file_path_ = so_dir + so_basename.substr(0, so_ext) + ".json";
+
+    // Step 2: Generate the JSON descriptor.
+    // kernelSo uses relative filename (scheduler resolves via ASCEND_AICPU_PATH)
+    if (!GenerateAicpuOpJson(json_file_path_, so_basename)) {
+        json_file_path_.clear();
+        return -1;
+    }
+
+    // Step 3: Load via rtsBinaryLoadFromFile with cpuKernelMode=1
+    rtLoadBinaryOption_t option = {};
+    option.optionId = RT_LOAD_BINARY_OPT_CPU_KERNEL_MODE;
+    option.value.cpuKernelMode = 1;
+
+    rtLoadBinaryConfig_t load_config = {};
+    load_config.options = &option;
+    load_config.numOpt = 1;
+
+    LOG_INFO("LoadAicpuOp: JSON path: %s", json_file_path_.c_str());
+    LOG_INFO("LoadAicpuOp: SO path: %s", dispatcher_so_path.c_str());
+
+    rtError_t rc = rtsBinaryLoadFromFile(json_file_path_.c_str(), &load_config, &binary_handle_);
+    if (rc != RT_ERROR_NONE) {
+        LOG_ERROR("rtsBinaryLoadFromFile failed for %s: %d", json_file_path_.c_str(), rc);
+        binary_handle_ = nullptr;
+        RemoveJsonFile(json_file_path_);
+        return rc;
+    }
+    LOG_INFO("LoadAicpuOp: Loaded dispatcher SO, handle=%p", binary_handle_);
+
+    // Step 4: Resolve function handles for all three kernels
+    const char* kernel_names[] = {KernelNames::NullName, KernelNames::InitName, KernelNames::RunName};
+    for (const char* name : kernel_names) {
+        rtFuncHandle func_handle = nullptr;
+        rc = rtsFuncGetByName(binary_handle_, name, &func_handle);
+        if (rc != RT_ERROR_NONE) {
+            LOG_ERROR("rtsFuncGetByName failed for %s: %d", name, rc);
+            // Roll back so a partially initialized loader is never left behind.
+            func_handles_.clear();
+            UnloadBinary(binary_handle_);
+            RemoveJsonFile(json_file_path_);
+            return rc;
+        }
+        func_handles_[name] = func_handle;
+        LOG_INFO("LoadAicpuOp: Resolved function handle for %s: %p", name, func_handle);
+    }
+
+    return 0;
+}
+
+// Launches one resolved dispatcher kernel on `stream` via rtsLaunchCpuKernel,
+// passing *k_args (sizeof(KernelArgs) bytes) as the kernel arguments and
+// aicpu_num as the block dimension.
+//
+// @param kernel_name  Currently unused.
+// @return 0 on success, otherwise the RTS error code.
+int LoadAicpuOp::AicpuKernelLaunch(
+    rtFuncHandle func_handle, rtStream_t stream, KernelArgs* k_args, int aicpu_num, const std::string& kernel_name
+) {
+    (void)kernel_name;
+
+    rtCpuKernelArgs_t cpu_args = {};
+    cpu_args.baseArgs.args = k_args;
+    cpu_args.baseArgs.argsSize = sizeof(KernelArgs);
+
+    // Launch config with a zero attribute count; attrs still points at valid storage.
+    rtKernelLaunchCfg_t kernelLaunchCfg = {nullptr, 0U};
+    auto launchKernelAttr = std::make_unique<rtLaunchKernelAttr_t>();
+    kernelLaunchCfg.attrs = launchKernelAttr.get();
+
+    rtError_t rc = rtsLaunchCpuKernel(func_handle, static_cast<uint32_t>(aicpu_num), stream, &kernelLaunchCfg, &cpu_args);
+    if (rc != RT_ERROR_NONE) {
+        LOG_ERROR("rtsLaunchCpuKernel failed: %d", rc);
+        return rc;
+    }
+
+    return 0;
+}
+
+// Looks up the handle resolved for func_name during Init() and launches it.
+// Returns -1 when func_name was never resolved.
+int LoadAicpuOp::LaunchBuiltInOp(
+    rtStream_t stream, KernelArgs* k_args, int aicpu_num, const std::string& func_name,
const std::string& kernel_name +) { + auto it = func_handles_.find(func_name); + if (it == func_handles_.end()) { + LOG_ERROR("Function not found: %s", func_name.c_str()); + return -1; + } + + rtFuncHandle func_handle = it->second; + return AicpuKernelLaunch(func_handle, stream, k_args, aicpu_num, kernel_name); +} + +} // namespace host + +#endif // BUILD_WITH_NEW_CANN diff --git a/src/common/host/load_aicpu_op.h b/src/common/host/load_aicpu_op.h new file mode 100644 index 000000000..9c088ea65 --- /dev/null +++ b/src/common/host/load_aicpu_op.h @@ -0,0 +1,156 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * @file load_aicpu_op.h + * @brief Host-side AICPU operation loader using new CANN 7.0+ rtsLaunchCpuKernel interface + * + * This class provides the host-side wrapper for loading and launching AICPU kernels + * through the two-layer dispatcher architecture. It generates JSON descriptors, + * loads the dispatcher SO via rtsBinaryLoadFromFile, and launches kernels via + * rtsLaunchCpuKernel. + * + * Architecture: + * - Dispatcher SO (libaicpu_dispatcher.so) - Fixed outer layer + * - Runtime SO (replaceable) - Different for each runtime (tensormap, ringbuffer, etc.) + * + * Three-phase launch pattern: + * 1. 
Load phase (DynTileFwkKernelServerNull) - Pass inner SO binary to dispatcher + * 2. Init phase (DynTileFwkKernelServerInit) - Initialize inner SO + * 3. Run phase (DynTileFwkKernelServer) - Execute actual kernel + */ + +#ifndef COMMON_HOST_LOAD_AICPU_OP_H_ +#define COMMON_HOST_LOAD_AICPU_OP_H_ + +#include +#include +#include +#include + +#include "common/kernel_args.h" + +#ifdef BUILD_WITH_NEW_CANN +#include "runtime/runtime/rts/rts_kernel.h" +#endif + +namespace host { + +/** + * @brief AICPU operation configuration for JSON descriptor generation + */ +struct AicpuOpConfig { + std::string functionName; // Actual symbol name in SO (e.g., DynTileFwkBackendKernelServerInit) + std::string kernelSo; // SO filename (e.g., libaicpu_dispatcher.so) + std::string opKernelLib; // Kernel library type (KFCKernel or AICPUKernel) + std::string computeCost = "100"; + std::string engine = "DNN_VM_AICPU"; + std::string flagAsync = "False"; + std::string flagPartial = "False"; + std::string userDefined = "False"; + std::string opType; // External kernel name for rtsFuncGetByName lookup +}; + +/** + * @brief Host-side AICPU operation loader + * + * Manages the lifecycle of loading and launching AICPU kernels through the + * two-layer dispatcher architecture using CANN 7.0+ rtsLaunchCpuKernel interface. + * + * Reference: /data/fangjingzhi/pypto/framework/src/machine/runtime/load_aicpu_op.{h,cpp} + */ +class LoadAicpuOp { +public: + LoadAicpuOp() = default; + ~LoadAicpuOp(); + + // Delete copy and move to ensure singleton behavior + LoadAicpuOp(const LoadAicpuOp&) = delete; + LoadAicpuOp& operator=(const LoadAicpuOp&) = delete; + LoadAicpuOp(LoadAicpuOp&&) = delete; + LoadAicpuOp& operator=(LoadAicpuOp&&) = delete; + + /** + * @brief Initialize the loader by loading dispatcher SO + * + * Passes the dispatcher SO path directly to rtsBinaryLoadFromFile + * and resolves function handles via rtsFuncGetByName. 
+     *
+     * Implementation note: a JSON descriptor named after the SO
+     * (libaicpu_dispatcher.so -> libaicpu_dispatcher.json) is written into the
+     * SO's directory, and it is that JSON file which is handed to
+     * rtsBinaryLoadFromFile (cpuKernelMode=1). The directory must therefore be
+     * writable.
+     *
+     * @param dispatcher_so_path Absolute path to libaicpu_dispatcher.so
+     * @return 0 on success, error code on failure
+     */
+    int Init(const std::string& dispatcher_so_path);
+
+    /**
+     * @brief Launch a built-in dispatcher kernel
+     *
+     * Launches one of the three dispatcher kernels (Null/Init/Run) via
+     * rtsLaunchCpuKernel, using the function handle resolved during Init().
+     *
+     * @param stream RTS stream for kernel launch
+     * @param k_args Kernel arguments to pass to the AICPU kernel
+     * @param aicpu_num Number of AICPU cores to use
+     * @param func_name Name the handle was resolved under in Init()
+     *                  (KernelNames::NullName / InitName / RunName, i.e.
+     *                  PyptoNull / PyptoInit / PyptoRun)
+     * @param kernel_name Actual symbol name in the SO (DynTileFwkKernelServerNull/Init/Server);
+     *                    forwarded to AicpuKernelLaunch, which currently ignores it
+     * @return 0 on success, -1 if func_name was not resolved, otherwise the RTS error code
+     */
+    int LaunchBuiltInOp(
+        rtStream_t stream, KernelArgs* k_args, int aicpu_num, const std::string& func_name, const std::string& kernel_name
+    );
+
+private:
+#ifdef BUILD_WITH_NEW_CANN
+    void* binary_handle_ = nullptr;  // Handle from rtsBinaryLoadFromFile
+    std::unordered_map<std::string, rtFuncHandle> func_handles_;  // Function handles from rtsFuncGetByName, keyed by kernel name
+    std::string json_file_path_;  // Path to generated JSON file (same dir/basename as SO)
+
+    /**
+     * @brief Generate JSON descriptor for dispatcher SO
+     *
+     * @param json_path Path where JSON file will be created
+     * @param kernel_so Value written to the kernelSo JSON field of every entry.
+     *                  Init() passes the SO's file name without its directory.
+     * @return true on success, false on failure
+     */
+    bool GenerateAicpuOpJson(const std::string& json_path, const std::string& kernel_so);
+
+    /**
+     * @brief Launch AICPU kernel using rtsLaunchCpuKernel
+     *
+     * @param func_handle Function handle from rtsFuncGetByName
+     * @param stream RTS stream
+     * @param k_args Kernel arguments
+     * @param aicpu_num Number of AICPU cores
+     * @param kernel_name Kernel name; accepted for interface symmetry but not used
+     *                    by the current implementation
+     * @return 0 on success, error code on failure
+     */
+    int AicpuKernelLaunch(
+        rtFuncHandle func_handle, rtStream_t stream, KernelArgs* k_args, int aicpu_num, const std::string& kernel_name
+    );
+#else
+    // Dummy members for legacy build
+    // NOTE(review): the public LaunchBuiltInOp declaration above uses rtStream_t,
+    // while the RTS header is only included under BUILD_WITH_NEW_CANN. Confirm
+    // that common/kernel_args.h makes rtStream_t visible in legacy builds.
+    void* binary_handle_ = nullptr;
+#endif
+};
+
+// Kernel name constants: the op names written to the JSON descriptor and used
+// as keys for rtsFuncGetByName.
+namespace KernelNames {
+    constexpr const char* NullName = "PyptoNull";  // Load phase
+    constexpr const char* InitName = "PyptoInit";  // Init phase
+    constexpr const char* RunName = "PyptoRun";    // Run phase
+}
+
+// Dispatcher SO name
+namespace SoNames {
+    constexpr const char* DispatcherSo = "libaicpu_dispatcher.so";
+}
+
+}  // namespace host
+
+#endif  // COMMON_HOST_LOAD_AICPU_OP_H_