diff --git a/.gitignore b/.gitignore index 37d5e142b..e3c04a989 100644 --- a/.gitignore +++ b/.gitignore @@ -41,3 +41,7 @@ compile_commands.json # Built nanobind extensions python/_task_interface*.so python/_task_interface*.dylib + +# Log files +*.log +profiling_logs_*/ diff --git a/python/simpler/runtime_compiler.py b/python/simpler/runtime_compiler.py index ce159d422..57cba0db3 100644 --- a/python/simpler/runtime_compiler.py +++ b/python/simpler/runtime_compiler.py @@ -233,6 +233,12 @@ def _build(actual_build_dir: str) -> Union[bytes, Path]: od.mkdir(parents=True, exist_ok=True) dest = od / binary_name shutil.copy2(binary_path, dest) + dispatcher_so = Path(actual_build_dir) / "libaicpu_dispatcher.so" + if dispatcher_so.is_file(): + dest_dispatcher = od / "libaicpu_dispatcher.so" + shutil.copy2(dispatcher_so, dest_dispatcher) + # Strip debug info to match CANN built-in SO format + subprocess.run(["strip", "-s", str(dest_dispatcher)], check=True) return dest else: with open(binary_path, "rb") as f: diff --git a/src/a2a3/platform/onboard/aicpu/CMakeLists.txt b/src/a2a3/platform/onboard/aicpu/CMakeLists.txt index e1fb32d2c..00e041094 100644 --- a/src/a2a3/platform/onboard/aicpu/CMakeLists.txt +++ b/src/a2a3/platform/onboard/aicpu/CMakeLists.txt @@ -10,6 +10,7 @@ project(aicpu_kernel LANGUAGES C CXX) set(CMAKE_CUSTOM_INCLUDE_DIRS "") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../include") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/task_interface") +list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher") if(DEFINED CUSTOM_INCLUDE_DIRS) foreach(INC_DIR ${CUSTOM_INCLUDE_DIRS}) list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${INC_DIR}") @@ -75,3 +76,38 @@ target_link_directories(aicpu_kernel # Output name set_target_properties(aicpu_kernel PROPERTIES OUTPUT_NAME aicpu_kernel) +# Build dispatcher SO (two-layer architecture) +set(AICPU_DISPATCHER_SOURCES + 
"${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher/aicpu_dispatcher.cpp" +) +add_library(aicpu_dispatcher SHARED ${AICPU_DISPATCHER_SOURCES}) + +# Compiler options for dispatcher (same as AICPU kernel) +target_compile_options(aicpu_dispatcher + PRIVATE + -Wall + -Wextra + -rdynamic + -O3 + -fPIC + -g + $<$:-std=gnu++17> +) + +# Include directories for dispatcher +target_include_directories(aicpu_dispatcher + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CUSTOM_INCLUDE_DIRS} + ${ASCEND_HOME_PATH}/include +) + +# Link against dl for dlopen/dlsym +target_link_libraries(aicpu_dispatcher PRIVATE dl) + +# Match CANN built-in SO properties: SYMBOLIC flag, build-id, stripped +set_target_properties(aicpu_dispatcher PROPERTIES + LINK_FLAGS "-Wl,-Bsymbolic,--build-id" + OUTPUT_NAME "aicpu_dispatcher" +) + diff --git a/src/a2a3/platform/onboard/host/CMakeLists.txt b/src/a2a3/platform/onboard/host/CMakeLists.txt index 12c86f4fd..b7dbf4d2f 100644 --- a/src/a2a3/platform/onboard/host/CMakeLists.txt +++ b/src/a2a3/platform/onboard/host/CMakeLists.txt @@ -19,6 +19,8 @@ set(CMAKE_CUSTOM_INCLUDE_DIRS "") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../include") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/task_interface") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/worker") +list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host") +list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher") if(DEFINED CUSTOM_INCLUDE_DIRS) foreach(INC_DIR ${CUSTOM_INCLUDE_DIRS}) list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${INC_DIR}") @@ -35,10 +37,15 @@ list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/pto_runtime_c_api.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/platform_compile_info.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/host_regs.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/aicpu_loader.cpp" 
"${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/host_log.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/unified_log_host.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/performance_collector.cpp" ) +# Add common/host sources (LoadAicpuOp) +list(APPEND HOST_RUNTIME_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host/load_aicpu_op.cpp" +) if(DEFINED CUSTOM_SOURCE_DIRS) foreach(SRC_DIR ${CUSTOM_SOURCE_DIRS}) file(GLOB DIR_SOURCES "${SRC_DIR}/*.cpp" "${SRC_DIR}/*.c") @@ -84,11 +91,16 @@ target_include_directories(host_runtime ${ASCEND_HOME_PATH}/${CMAKE_SYSTEM_PROCESSOR}-linux/include/driver ) -target_link_directories(host_runtime - PRIVATE - ${ASCEND_HOME_PATH}/lib64 - ${ASCEND_HOME_PATH}/runtime/lib64 -) +# Conditional compilation for new CANN interface +option(BUILD_WITH_NEW_CANN "Use new rtsLaunchCpuKernel interface (CANN 7.0+)" ON) +if(BUILD_WITH_NEW_CANN) + target_compile_definitions(host_runtime PRIVATE BUILD_WITH_NEW_CANN) + # Add additional include path for new RTS headers (CANN 7.0+) + target_include_directories(host_runtime PRIVATE + ${ASCEND_HOME_PATH}/pkg_inc/runtime/runtime + ) + message(STATUS "Building with new CANN rtsLaunchCpuKernel interface") +endif() # Link against CANN runtime libraries # ascend_hal is dynamically loaded at runtime via dlopen in device_runner @@ -100,4 +112,10 @@ target_link_libraries(host_runtime dl ) +target_link_directories(host_runtime + PRIVATE + ${ASCEND_HOME_PATH}/lib64 + ${ASCEND_HOME_PATH}/runtime/lib64 +) + set_target_properties(host_runtime PROPERTIES OUTPUT_NAME "host_runtime") diff --git a/src/a2a3/platform/onboard/host/aicpu_loader.cpp b/src/a2a3/platform/onboard/host/aicpu_loader.cpp new file mode 100644 index 000000000..f99450cf9 --- /dev/null +++ b/src/a2a3/platform/onboard/host/aicpu_loader.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) PyPTO Contributors. 
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * AICPU Loader Implementation (Legacy Interface) + * + * Provides AICPU kernel launching via the legacy rtAicpuKernelLaunchExWithArgs API. + * Used when BUILD_WITH_NEW_CANN is OFF. When BUILD_WITH_NEW_CANN is ON, + * device_runner uses LoadAicpuOp (src/common/host/load_aicpu_op.h) instead. 
+ */ + +#include "aicpu_loader.h" + +#include + +#include "common/unified_log.h" +#include "common/kernel_args.h" + +int AicpuLoader::init_with_binary( + const std::vector &aicpu_binary, const std::vector &kernel_names +) { + // Legacy interface: No pre-loading needed + (void)aicpu_binary; + (void)kernel_names; + LOG_INFO("AicpuLoader: Using legacy rtAicpuKernelLaunchExWithArgs interface"); + return 0; +} + +int AicpuLoader::init(const std::string &so_path, const std::vector &kernel_names) { + // Legacy interface: No pre-loading needed + (void)so_path; + (void)kernel_names; + LOG_INFO("AicpuLoader: Using legacy rtAicpuKernelLaunchExWithArgs interface"); + return 0; +} + +int AicpuLoader::launch(rtStream_t stream, KernelArgs *k_args, const char *kernel_name, int aicpu_num) { + // Legacy interface: rtAicpuKernelLaunchExWithArgs + struct Args { + KernelArgs k_args; + char kernel_name[32]; + const char so_name[32] = {"libaicpu_extend_kernels.so"}; + const char op_name[32] = {""}; + } args; + + args.k_args = *k_args; + std::strncpy(args.kernel_name, kernel_name, sizeof(args.kernel_name) - 1); + args.kernel_name[sizeof(args.kernel_name) - 1] = '\0'; + + rtAicpuArgsEx_t rt_args; + std::memset(&rt_args, 0, sizeof(rt_args)); + rt_args.args = &args; + rt_args.argsSize = sizeof(args); + rt_args.kernelNameAddrOffset = offsetof(struct Args, kernel_name); + rt_args.soNameAddrOffset = offsetof(struct Args, so_name); + + return rtAicpuKernelLaunchExWithArgs( + rtKernelType_t::KERNEL_TYPE_AICPU_KFC, "AST_DYN_AICPU", aicpu_num, &rt_args, nullptr, stream, 0 + ); +} + +void AicpuLoader::finalize() { + // Legacy interface: No-op +} diff --git a/src/a2a3/platform/onboard/host/aicpu_loader.h b/src/a2a3/platform/onboard/host/aicpu_loader.h new file mode 100644 index 000000000..e4d72ad4f --- /dev/null +++ b/src/a2a3/platform/onboard/host/aicpu_loader.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) PyPTO Contributors. 
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * AICPU Loader Abstraction (Legacy Interface) + * + * Provides AICPU kernel launching via the legacy rtAicpuKernelLaunchExWithArgs API. + * Used when BUILD_WITH_NEW_CANN is OFF. When BUILD_WITH_NEW_CANN is ON, + * device_runner uses LoadAicpuOp (src/common/host/load_aicpu_op.h) instead. + */ + +#ifndef A2A3_PLATFORM_ONBOARD_HOST_AICPU_LOADER_H_ +#define A2A3_PLATFORM_ONBOARD_HOST_AICPU_LOADER_H_ + +#include +#include +#include + +#include + +// Forward declarations +struct KernelArgs; + +/** + * @brief AICPU kernel loader (legacy interface) + * + * Launches AICPU kernels via the legacy rtAicpuKernelLaunchExWithArgs API. + * Used as the fallback when BUILD_WITH_NEW_CANN is OFF. 
+ */ +class AicpuLoader { +public: + AicpuLoader() = default; + ~AicpuLoader() = default; + + /** + * @brief Initialize the AICPU loader with binary data (no-op for legacy interface) + */ + int init_with_binary(const std::vector &aicpu_binary, const std::vector &kernel_names); + + /** + * @brief Initialize the AICPU loader (no-op for legacy interface) + */ + int init(const std::string &so_path, const std::vector &kernel_names); + + /** + * @brief Launch an AICPU kernel via legacy rtAicpuKernelLaunchExWithArgs + */ + int launch(rtStream_t stream, KernelArgs *k_args, const char *kernel_name, int aicpu_num); + + /** + * @brief Cleanup resources (no-op for legacy interface) + */ + void finalize(); + + // Disable copy and move + AicpuLoader(const AicpuLoader &) = delete; + AicpuLoader &operator=(const AicpuLoader &) = delete; + AicpuLoader(AicpuLoader &&) = delete; + AicpuLoader &operator=(AicpuLoader &&) = delete; +}; + +#endif // A2A3_PLATFORM_ONBOARD_HOST_AICPU_LOADER_H_ diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index 41d2235c8..4d88ec01a 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -17,6 +17,10 @@ #include "device_runner.h" +#ifdef BUILD_WITH_NEW_CANN +#include "load_aicpu_op.h" +#endif + #include #include @@ -24,6 +28,24 @@ #include #include +#ifdef BUILD_WITH_NEW_CANN + +static std::string resolve_dispatcher_so_path() { + Dl_info info; + if (dladdr(reinterpret_cast(resolve_dispatcher_so_path), &info) == 0 || info.dli_fname == nullptr) { + return ""; + } + // info.dli_fname is the path to host_runtime.so + std::string so_dir = info.dli_fname; + size_t pos = so_dir.rfind('/'); + if (pos == std::string::npos) { + return "libaicpu_dispatcher.so"; + } + so_dir = so_dir.substr(0, pos + 1); + return so_dir + "libaicpu_dispatcher.so"; +} +#endif + // Include HAL constants from CANN (header only, library loaded dynamically) 
#include "ascend_hal.h" #include "callable.h" @@ -313,8 +335,21 @@ int DeviceRunner::ensure_binaries_loaded( aicore_kernel_binary_ = aicore_kernel_binary; +#ifdef BUILD_WITH_NEW_CANN + // New interface: Initialize LoadAicpuOp (loads dispatcher SO) + std::string dispatcher_so_path = resolve_dispatcher_so_path(); + int rc = load_aicpu_op_.Init(dispatcher_so_path); + if (rc != 0) { + LOG_ERROR("LoadAicpuOp::Init failed: %d", rc); + return rc; + } + LOG_INFO("DeviceRunner: LoadAicpuOp initialized"); +#else + int rc = 0; +#endif + // Load AICPU SO - int rc = so_info_.init(aicpu_so_binary, mem_alloc_); + rc = so_info_.init(aicpu_so_binary, mem_alloc_); if (rc != 0) { LOG_ERROR("AicpuSoInfo::init failed: %d", rc); return rc; @@ -501,6 +536,28 @@ int DeviceRunner::run( return rc; } +#ifdef BUILD_WITH_NEW_CANN + // Three-phase launch pattern with dispatcher: + // 1. Load (Null) - Pass inner SO binary to dispatcher + // 2. Init - Initialize inner SO + // 3. Run - Execute actual kernel + std::cout << "\n=== launch_aicpu_kernel DynTileFwkKernelServerNull (Load) ===" << '\n'; + rc = launch_aicpu_kernel(stream_aicpu_, &kernel_args_.args, "DynTileFwkKernelServerNull", 1); + if (rc != 0) { + LOG_ERROR("launch_aicpu_kernel (load/null) failed: %d", rc); + return rc; + } + + std::cout << "\n=== launch_aicpu_kernel DynTileFwkKernelServerInit (Init) ===" << '\n'; + // Launch AICPU init kernel + rc = launch_aicpu_kernel(stream_aicpu_, &kernel_args_.args, "DynTileFwkKernelServerInit", 1); + if (rc != 0) { + LOG_ERROR("launch_aicpu_kernel (init) failed: %d", rc); + return rc; + } + + std::cout << "\n=== launch_aicpu_kernel DynTileFwkKernelServer (Run) ===" << '\n'; +#else std::cout << "\n=== launch_aicpu_kernel DynTileFwkKernelServerInit===" << '\n'; // Launch AICPU init kernel rc = launch_aicpu_kernel(stream_aicpu_, &kernel_args_.args, "DynTileFwkKernelServerInit", 1); @@ -510,6 +567,7 @@ int DeviceRunner::run( } std::cout << "\n=== launch_aicpu_kernel DynTileFwkKernelServer===" << '\n'; 
+#endif // Launch AICPU main kernel (over-launch for affinity gate) rc = launch_aicpu_kernel( stream_aicpu_, &kernel_args_.args, "DynTileFwkKernelServer", PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH @@ -607,6 +665,13 @@ int DeviceRunner::finalize() { // Cleanup AICPU SO so_info_.finalize(); +#ifdef BUILD_WITH_NEW_CANN + // LoadAicpuOp cleanup happens automatically in destructor +#else + // Cleanup AICPU loader + aicpu_loader_.finalize(); +#endif + // Kernel binaries should have been removed by validate_runtime_impl() if (!func_id_to_addr_.empty()) { LOG_ERROR("finalize() called with %zu kernel binaries still cached (memory leak)", func_id_to_addr_.size()); @@ -659,27 +724,23 @@ int DeviceRunner::finalize() { } int DeviceRunner::launch_aicpu_kernel(rtStream_t stream, KernelArgs *k_args, const char *kernel_name, int aicpu_num) { - struct Args { - KernelArgs k_args; - char kernel_name[32]; - const char so_name[32] = {"libaicpu_extend_kernels.so"}; - const char op_name[32] = {""}; - } args; - - args.k_args = *k_args; - std::strncpy(args.kernel_name, kernel_name, sizeof(args.kernel_name) - 1); - args.kernel_name[sizeof(args.kernel_name) - 1] = '\0'; - - rtAicpuArgsEx_t rt_args; - std::memset(&rt_args, 0, sizeof(rt_args)); - rt_args.args = &args; - rt_args.argsSize = sizeof(args); - rt_args.kernelNameAddrOffset = offsetof(struct Args, kernel_name); - rt_args.soNameAddrOffset = offsetof(struct Args, so_name); - - return rtAicpuKernelLaunchExWithArgs( - rtKernelType_t::KERNEL_TYPE_AICPU_KFC, "AST_DYN_AICPU", aicpu_num, &rt_args, nullptr, stream, 0 - ); +#ifdef BUILD_WITH_NEW_CANN + // Map kernel name to LoadAicpuOp function name + std::string func_name; + if (std::strcmp(kernel_name, "DynTileFwkKernelServerInit") == 0) { + func_name = host::KernelNames::InitName; + } else if (std::strcmp(kernel_name, "DynTileFwkKernelServer") == 0) { + func_name = host::KernelNames::RunName; + } else if (std::strcmp(kernel_name, "DynTileFwkKernelServerNull") == 0) { + func_name = 
host::KernelNames::NullName; + } else { + LOG_ERROR("Unknown kernel name: %s", kernel_name); + return -1; + } + return load_aicpu_op_.LaunchBuiltInOp(stream, k_args, aicpu_num, func_name, kernel_name); +#else + return aicpu_loader_.launch(stream, k_args, kernel_name, aicpu_num); +#endif } int DeviceRunner::launch_aicore_kernel(rtStream_t stream, KernelArgs *k_args) { diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index 0c7598363..19aab8a51 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -43,6 +43,11 @@ #include "host/function_cache.h" #include "host/memory_allocator.h" #include "host/performance_collector.h" +#ifdef BUILD_WITH_NEW_CANN +#include "load_aicpu_op.h" +#else +#include "aicpu_loader.h" +#endif #include "runtime.h" /** @@ -380,6 +385,13 @@ class DeviceRunner { int worker_count_{0}; // Stored for print_handshake_results in destructor std::vector aicore_kernel_binary_; + // AICPU loader abstraction (supports both legacy and new CANN interfaces) +#ifdef BUILD_WITH_NEW_CANN + host::LoadAicpuOp load_aicpu_op_; +#else + AicpuLoader aicpu_loader_; +#endif + // Memory management MemoryAllocator mem_alloc_; diff --git a/src/a5/platform/onboard/aicpu/CMakeLists.txt b/src/a5/platform/onboard/aicpu/CMakeLists.txt index e1fb32d2c..00e041094 100644 --- a/src/a5/platform/onboard/aicpu/CMakeLists.txt +++ b/src/a5/platform/onboard/aicpu/CMakeLists.txt @@ -10,6 +10,7 @@ project(aicpu_kernel LANGUAGES C CXX) set(CMAKE_CUSTOM_INCLUDE_DIRS "") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../include") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/task_interface") +list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher") if(DEFINED CUSTOM_INCLUDE_DIRS) foreach(INC_DIR ${CUSTOM_INCLUDE_DIRS}) list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS 
"${INC_DIR}") @@ -75,3 +76,38 @@ target_link_directories(aicpu_kernel # Output name set_target_properties(aicpu_kernel PROPERTIES OUTPUT_NAME aicpu_kernel) +# Build dispatcher SO (two-layer architecture) +set(AICPU_DISPATCHER_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher/aicpu_dispatcher.cpp" +) +add_library(aicpu_dispatcher SHARED ${AICPU_DISPATCHER_SOURCES}) + +# Compiler options for dispatcher (same as AICPU kernel) +target_compile_options(aicpu_dispatcher + PRIVATE + -Wall + -Wextra + -rdynamic + -O3 + -fPIC + -g + $<$:-std=gnu++17> +) + +# Include directories for dispatcher +target_include_directories(aicpu_dispatcher + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CUSTOM_INCLUDE_DIRS} + ${ASCEND_HOME_PATH}/include +) + +# Link against dl for dlopen/dlsym +target_link_libraries(aicpu_dispatcher PRIVATE dl) + +# Match CANN built-in SO properties: SYMBOLIC flag, build-id, stripped +set_target_properties(aicpu_dispatcher PROPERTIES + LINK_FLAGS "-Wl,-Bsymbolic,--build-id" + OUTPUT_NAME "aicpu_dispatcher" +) + diff --git a/src/a5/platform/onboard/host/CMakeLists.txt b/src/a5/platform/onboard/host/CMakeLists.txt index 12c86f4fd..b7dbf4d2f 100644 --- a/src/a5/platform/onboard/host/CMakeLists.txt +++ b/src/a5/platform/onboard/host/CMakeLists.txt @@ -19,6 +19,8 @@ set(CMAKE_CUSTOM_INCLUDE_DIRS "") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../include") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/task_interface") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/worker") +list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host") +list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher") if(DEFINED CUSTOM_INCLUDE_DIRS) foreach(INC_DIR ${CUSTOM_INCLUDE_DIRS}) list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${INC_DIR}") @@ -35,10 +37,15 @@ list(APPEND 
HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/pto_runtime_c_api.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/platform_compile_info.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/host_regs.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/aicpu_loader.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/host_log.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/unified_log_host.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/performance_collector.cpp" ) +# Add common/host sources (LoadAicpuOp) +list(APPEND HOST_RUNTIME_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host/load_aicpu_op.cpp" +) if(DEFINED CUSTOM_SOURCE_DIRS) foreach(SRC_DIR ${CUSTOM_SOURCE_DIRS}) file(GLOB DIR_SOURCES "${SRC_DIR}/*.cpp" "${SRC_DIR}/*.c") @@ -84,11 +91,16 @@ target_include_directories(host_runtime ${ASCEND_HOME_PATH}/${CMAKE_SYSTEM_PROCESSOR}-linux/include/driver ) -target_link_directories(host_runtime - PRIVATE - ${ASCEND_HOME_PATH}/lib64 - ${ASCEND_HOME_PATH}/runtime/lib64 -) +# Conditional compilation for new CANN interface +option(BUILD_WITH_NEW_CANN "Use new rtsLaunchCpuKernel interface (CANN 7.0+)" ON) +if(BUILD_WITH_NEW_CANN) + target_compile_definitions(host_runtime PRIVATE BUILD_WITH_NEW_CANN) + # Add additional include path for new RTS headers (CANN 7.0+) + target_include_directories(host_runtime PRIVATE + ${ASCEND_HOME_PATH}/pkg_inc/runtime/runtime + ) + message(STATUS "Building with new CANN rtsLaunchCpuKernel interface") +endif() # Link against CANN runtime libraries # ascend_hal is dynamically loaded at runtime via dlopen in device_runner @@ -100,4 +112,10 @@ target_link_libraries(host_runtime dl ) +target_link_directories(host_runtime + PRIVATE + ${ASCEND_HOME_PATH}/lib64 + ${ASCEND_HOME_PATH}/runtime/lib64 +) + set_target_properties(host_runtime PROPERTIES OUTPUT_NAME "host_runtime") diff --git a/src/a5/platform/onboard/host/aicpu_loader.cpp b/src/a5/platform/onboard/host/aicpu_loader.cpp new file mode 100644 index 000000000..f99450cf9 --- /dev/null +++ 
b/src/a5/platform/onboard/host/aicpu_loader.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * AICPU Loader Implementation (Legacy Interface) + * + * Provides AICPU kernel launching via the legacy rtAicpuKernelLaunchExWithArgs API. + * Used when BUILD_WITH_NEW_CANN is OFF. When BUILD_WITH_NEW_CANN is ON, + * device_runner uses LoadAicpuOp (src/common/host/load_aicpu_op.h) instead. 
+ */ + +#include "aicpu_loader.h" + +#include + +#include "common/unified_log.h" +#include "common/kernel_args.h" + +int AicpuLoader::init_with_binary( + const std::vector &aicpu_binary, const std::vector &kernel_names +) { + // Legacy interface: No pre-loading needed + (void)aicpu_binary; + (void)kernel_names; + LOG_INFO("AicpuLoader: Using legacy rtAicpuKernelLaunchExWithArgs interface"); + return 0; +} + +int AicpuLoader::init(const std::string &so_path, const std::vector &kernel_names) { + // Legacy interface: No pre-loading needed + (void)so_path; + (void)kernel_names; + LOG_INFO("AicpuLoader: Using legacy rtAicpuKernelLaunchExWithArgs interface"); + return 0; +} + +int AicpuLoader::launch(rtStream_t stream, KernelArgs *k_args, const char *kernel_name, int aicpu_num) { + // Legacy interface: rtAicpuKernelLaunchExWithArgs + struct Args { + KernelArgs k_args; + char kernel_name[32]; + const char so_name[32] = {"libaicpu_extend_kernels.so"}; + const char op_name[32] = {""}; + } args; + + args.k_args = *k_args; + std::strncpy(args.kernel_name, kernel_name, sizeof(args.kernel_name) - 1); + args.kernel_name[sizeof(args.kernel_name) - 1] = '\0'; + + rtAicpuArgsEx_t rt_args; + std::memset(&rt_args, 0, sizeof(rt_args)); + rt_args.args = &args; + rt_args.argsSize = sizeof(args); + rt_args.kernelNameAddrOffset = offsetof(struct Args, kernel_name); + rt_args.soNameAddrOffset = offsetof(struct Args, so_name); + + return rtAicpuKernelLaunchExWithArgs( + rtKernelType_t::KERNEL_TYPE_AICPU_KFC, "AST_DYN_AICPU", aicpu_num, &rt_args, nullptr, stream, 0 + ); +} + +void AicpuLoader::finalize() { + // Legacy interface: No-op +} diff --git a/src/a5/platform/onboard/host/aicpu_loader.h b/src/a5/platform/onboard/host/aicpu_loader.h new file mode 100644 index 000000000..3dd1390af --- /dev/null +++ b/src/a5/platform/onboard/host/aicpu_loader.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) PyPTO Contributors. 
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * AICPU Loader Abstraction (Legacy Interface) + * + * Provides AICPU kernel launching via the legacy rtAicpuKernelLaunchExWithArgs API. + * Used when BUILD_WITH_NEW_CANN is OFF. When BUILD_WITH_NEW_CANN is ON, + * device_runner uses LoadAicpuOp (src/common/host/load_aicpu_op.h) instead. + */ + +#ifndef A5_PLATFORM_ONBOARD_HOST_AICPU_LOADER_H_ +#define A5_PLATFORM_ONBOARD_HOST_AICPU_LOADER_H_ + +#include +#include +#include + +#include + +// Forward declarations +struct KernelArgs; + +/** + * @brief AICPU kernel loader (legacy interface) + * + * Launches AICPU kernels via the legacy rtAicpuKernelLaunchExWithArgs API. + * Used as the fallback when BUILD_WITH_NEW_CANN is OFF. 
+ */ +class AicpuLoader { +public: + AicpuLoader() = default; + ~AicpuLoader() = default; + + /** + * @brief Initialize the AICPU loader with binary data (no-op for legacy interface) + */ + int init_with_binary(const std::vector &aicpu_binary, const std::vector &kernel_names); + + /** + * @brief Initialize the AICPU loader (no-op for legacy interface) + */ + int init(const std::string &so_path, const std::vector &kernel_names); + + /** + * @brief Launch an AICPU kernel via legacy rtAicpuKernelLaunchExWithArgs + */ + int launch(rtStream_t stream, KernelArgs *k_args, const char *kernel_name, int aicpu_num); + + /** + * @brief Cleanup resources (no-op for legacy interface) + */ + void finalize(); + + // Disable copy and move + AicpuLoader(const AicpuLoader &) = delete; + AicpuLoader &operator=(const AicpuLoader &) = delete; + AicpuLoader(AicpuLoader &&) = delete; + AicpuLoader &operator=(AicpuLoader &&) = delete; +}; + +#endif // A5_PLATFORM_ONBOARD_HOST_AICPU_LOADER_H_ diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp index e451a5efa..c0cc23d80 100644 --- a/src/a5/platform/onboard/host/device_runner.cpp +++ b/src/a5/platform/onboard/host/device_runner.cpp @@ -17,12 +17,34 @@ #include "device_runner.h" +#ifdef BUILD_WITH_NEW_CANN +#include "load_aicpu_op.h" +#endif + #include #include +#include #include #include #include +#ifdef BUILD_WITH_NEW_CANN + +static std::string resolve_dispatcher_so_path() { + Dl_info info; + if (dladdr(reinterpret_cast(resolve_dispatcher_so_path), &info) == 0 || info.dli_fname == nullptr) { + return ""; + } + std::string so_dir = info.dli_fname; + size_t pos = so_dir.rfind('/'); + if (pos == std::string::npos) { + return "libaicpu_dispatcher.so"; + } + so_dir = so_dir.substr(0, pos + 1); + return so_dir + "libaicpu_dispatcher.so"; +} +#endif + #include "callable.h" #include "host/host_regs.h" // Register address retrieval #include "host/raii_scope_guard.h" @@ -231,8 +253,21 @@ int 
DeviceRunner::ensure_binaries_loaded( aicore_kernel_binary_ = aicore_kernel_binary; +#ifdef BUILD_WITH_NEW_CANN + // New interface: Initialize LoadAicpuOp (loads dispatcher SO) + std::string dispatcher_so_path = resolve_dispatcher_so_path(); + int rc = load_aicpu_op_.Init(dispatcher_so_path); + if (rc != 0) { + LOG_ERROR("LoadAicpuOp::Init failed: %d", rc); + return rc; + } + LOG_INFO("DeviceRunner: LoadAicpuOp initialized"); +#else + int rc = 0; +#endif + // Load AICPU SO - int rc = so_info_.init(aicpu_so_binary, mem_alloc_); + rc = so_info_.init(aicpu_so_binary, mem_alloc_); if (rc != 0) { LOG_ERROR("AicpuSoInfo::init failed: %d", rc); return rc; @@ -394,6 +429,28 @@ int DeviceRunner::run( return rc; } +#ifdef BUILD_WITH_NEW_CANN + // Three-phase launch pattern with dispatcher: + // 1. Load (Null) - Pass inner SO binary to dispatcher + // 2. Init - Initialize inner SO + // 3. Run - Execute actual kernel + std::cout << "\n=== launch_aicpu_kernel DynTileFwkKernelServerNull (Load) ===" << '\n'; + rc = launch_aicpu_kernel(stream_aicpu_, &kernel_args_.args, "DynTileFwkKernelServerNull", 1); + if (rc != 0) { + LOG_ERROR("launch_aicpu_kernel (load/null) failed: %d", rc); + return rc; + } + + std::cout << "\n=== launch_aicpu_kernel DynTileFwkKernelServerInit (Init) ===" << '\n'; + // Launch AICPU init kernel + rc = launch_aicpu_kernel(stream_aicpu_, &kernel_args_.args, "DynTileFwkKernelServerInit", 1); + if (rc != 0) { + LOG_ERROR("launch_aicpu_kernel (init) failed: %d", rc); + return rc; + } + + std::cout << "\n=== launch_aicpu_kernel DynTileFwkKernelServer (Run) ===" << '\n'; +#else std::cout << "\n=== launch_aicpu_kernel DynTileFwkKernelServerInit===" << '\n'; // Launch AICPU init kernel rc = launch_aicpu_kernel(stream_aicpu_, &kernel_args_.args, "DynTileFwkKernelServerInit", 1); @@ -403,6 +460,7 @@ int DeviceRunner::run( } std::cout << "\n=== launch_aicpu_kernel DynTileFwkKernelServer===" << '\n'; +#endif // Launch AICPU main kernel (over-launch for affinity gate) rc 
= launch_aicpu_kernel( stream_aicpu_, &kernel_args_.args, "DynTileFwkKernelServer", PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH @@ -480,6 +538,13 @@ int DeviceRunner::finalize() { // Cleanup AICPU SO so_info_.finalize(); +#ifdef BUILD_WITH_NEW_CANN + // LoadAicpuOp cleanup happens automatically in destructor +#else + // Cleanup AICPU loader + aicpu_loader_.finalize(); +#endif + // Kernel binaries should have been removed by validate_runtime_impl() if (!func_id_to_addr_.empty()) { LOG_ERROR("finalize() called with %zu kernel binaries still cached (memory leak)", func_id_to_addr_.size()); @@ -520,27 +585,23 @@ int DeviceRunner::finalize() { } int DeviceRunner::launch_aicpu_kernel(rtStream_t stream, KernelArgs *k_args, const char *kernel_name, int aicpu_num) { - struct Args { - KernelArgs k_args; - char kernel_name[32]; - const char so_name[32] = {"libaicpu_extend_kernels.so"}; - const char op_name[32] = {""}; - } args; - - args.k_args = *k_args; - std::strncpy(args.kernel_name, kernel_name, sizeof(args.kernel_name) - 1); - args.kernel_name[sizeof(args.kernel_name) - 1] = '\0'; - - rtAicpuArgsEx_t rt_args; - std::memset(&rt_args, 0, sizeof(rt_args)); - rt_args.args = &args; - rt_args.argsSize = sizeof(args); - rt_args.kernelNameAddrOffset = offsetof(struct Args, kernel_name); - rt_args.soNameAddrOffset = offsetof(struct Args, so_name); - - return rtAicpuKernelLaunchExWithArgs( - rtKernelType_t::KERNEL_TYPE_AICPU_KFC, "AST_DYN_AICPU", aicpu_num, &rt_args, nullptr, stream, 0 - ); +#ifdef BUILD_WITH_NEW_CANN + // Map kernel name to LoadAicpuOp function name + std::string func_name; + if (std::strcmp(kernel_name, "DynTileFwkKernelServerInit") == 0) { + func_name = host::KernelNames::InitName; + } else if (std::strcmp(kernel_name, "DynTileFwkKernelServer") == 0) { + func_name = host::KernelNames::RunName; + } else if (std::strcmp(kernel_name, "DynTileFwkKernelServerNull") == 0) { + func_name = host::KernelNames::NullName; + } else { + LOG_ERROR("Unknown kernel name: %s", 
kernel_name); + return -1; + } + return load_aicpu_op_.LaunchBuiltInOp(stream, k_args, aicpu_num, func_name, kernel_name); +#else + return aicpu_loader_.launch(stream, k_args, kernel_name, aicpu_num); +#endif } int DeviceRunner::launch_aicore_kernel(rtStream_t stream, Runtime *runtime) { diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h index 6658f7221..e9347b5fb 100644 --- a/src/a5/platform/onboard/host/device_runner.h +++ b/src/a5/platform/onboard/host/device_runner.h @@ -43,6 +43,11 @@ #include "host/function_cache.h" #include "host/memory_allocator.h" #include "host/performance_collector.h" +#ifdef BUILD_WITH_NEW_CANN +#include "load_aicpu_op.h" +#else +#include "aicpu_loader.h" +#endif #include "runtime.h" /** @@ -342,6 +347,13 @@ class DeviceRunner { int worker_count_{0}; // Stored for print_handshake_results in destructor std::vector aicore_kernel_binary_; + // AICPU loader abstraction (supports both legacy and new CANN interfaces) +#ifdef BUILD_WITH_NEW_CANN + host::LoadAicpuOp load_aicpu_op_; +#else + AicpuLoader aicpu_loader_; +#endif + // Memory management MemoryAllocator mem_alloc_; diff --git a/src/common/aicpu_dispatcher/CMakeLists.txt b/src/common/aicpu_dispatcher/CMakeLists.txt new file mode 100644 index 000000000..833459afb --- /dev/null +++ b/src/common/aicpu_dispatcher/CMakeLists.txt @@ -0,0 +1,52 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +# Build AICPU Dispatcher SO - Two-layer architecture for runtime-specific AICPU kernels +cmake_minimum_required(VERSION 3.16.3) + +project(aicpu_dispatcher LANGUAGES C CXX) + +# Dispatcher SO sources +set(AICPU_DISPATCHER_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/aicpu_dispatcher.cpp" +) + +# Create shared library +add_library(aicpu_dispatcher SHARED ${AICPU_DISPATCHER_SOURCES}) + +# C++ standard +set_target_properties(aicpu_dispatcher PROPERTIES + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON +) + +# Compile options (matching AICPU pattern) +target_compile_options(aicpu_dispatcher + PRIVATE + -Wall + -Wextra + -fPIC + -O3 + -g +) + +# Include directories +target_include_directories(aicpu_dispatcher + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + "${CMAKE_CURRENT_SOURCE_DIR}/../../../.." # For common/unified_log.h +) + +# Link against dl for dlopen/dlsym +target_link_libraries(aicpu_dispatcher + PRIVATE + dl +) + +# Set output name +set_target_properties(aicpu_dispatcher PROPERTIES OUTPUT_NAME "aicpu_dispatcher") diff --git a/src/common/aicpu_dispatcher/README.md b/src/common/aicpu_dispatcher/README.md new file mode 100644 index 000000000..2f462139e --- /dev/null +++ b/src/common/aicpu_dispatcher/README.md @@ -0,0 +1,38 @@ +# AICPU Dispatcher SO + +Two-layer architecture for runtime-specific AICPU kernels. + +## Architecture + +The dispatcher SO provides a two-layer architecture where: + +- **Outer layer (this SO)** is fixed and handles dynamic SO loading +- **Inner layer (runtime-specific SO)** can be different for each runtime + +This allows different runtimes (tensormap, ringbuffer, etc.) to load their own AICPU kernel implementations at runtime without recompiling the dispatcher. + +## Exported Functions + +Three C-style exported functions (AICPU entry points): + +1. 
`DynTileFwkKernelServerNull` - Load phase: receives inner SO binary, saves to filesystem +2. `DynTileFwkKernelServerInit` - Init phase: delegates to inner SO's initialization +3. `DynTileFwkKernelServer` - Run phase: delegates to inner SO's execution + +## BackendServerHandleManager + +Internal class that manages the lifecycle of the inner SO: + +- `SaveSoFile()` - Saves inner SO binary to `/tmp/aicpu_kernels/` +- `SetTileFwkKernelMap()` - Loads init and run functions from inner SO using dlopen/dlsym +- `ExecuteFunc()` - Executes inner SO functions with provided arguments + +## Function Key Mapping + +- `dyInitFuncKey = 2` - Initialization function +- `dyExecFuncKey = 3` - Execution function + +## Reference + +Based on pypto's implementation: +`/data/fangjingzhi/pypto/framework/src/machine/device/machine_interface/pypto_aicpu_interface.{h,cpp}` diff --git a/src/common/aicpu_dispatcher/aicpu_dispatcher.cpp b/src/common/aicpu_dispatcher/aicpu_dispatcher.cpp new file mode 100644 index 000000000..1825f042f --- /dev/null +++ b/src/common/aicpu_dispatcher/aicpu_dispatcher.cpp @@ -0,0 +1,313 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +/** + * AICPU Dispatcher SO Implementation + */ + +#include "aicpu_dispatcher.h" + +#include +#include +#include +#include +#include +#include +#include + +// Weak symbol fallback implementations for unified_log_* functions. +// When dispatcher SO is loaded independently by the AICPU scheduler daemon +// (via dlopen), these weak symbols provide a minimal stderr-based logger. +// When linked into host_runtime.so, the strong symbols from unified_log_host.cpp +// take precedence. +extern "C" { + +__attribute__((weak)) void unified_log_error(const char* func, const char* fmt, ...) { + va_list args; + va_start(args, fmt); + fprintf(stderr, "[ERROR] [%s] ", func); + vfprintf(stderr, fmt, args); + fprintf(stderr, "\n"); + va_end(args); +} + +__attribute__((weak)) void unified_log_warn(const char* func, const char* fmt, ...) { + va_list args; + va_start(args, fmt); + fprintf(stderr, "[WARN] [%s] ", func); + vfprintf(stderr, fmt, args); + fprintf(stderr, "\n"); + va_end(args); +} + +__attribute__((weak)) void unified_log_info(const char* func, const char* fmt, ...) { + va_list args; + va_start(args, fmt); + fprintf(stderr, "[INFO] [%s] ", func); + vfprintf(stderr, fmt, args); + fprintf(stderr, "\n"); + va_end(args); +} + +__attribute__((weak)) void unified_log_debug(const char* func, const char* fmt, ...) { + va_list args; + va_start(args, fmt); + fprintf(stderr, "[DEBUG] [%s] ", func); + vfprintf(stderr, fmt, args); + fprintf(stderr, "\n"); + va_end(args); +} + +__attribute__((weak)) void unified_log_always(const char* func, const char* fmt, ...) { + va_list args; + va_start(args, fmt); + fprintf(stderr, "[ALWAYS] [%s] ", func); + vfprintf(stderr, fmt, args); + fprintf(stderr, "\n"); + va_end(args); +} + +} // extern "C" + +// Forward declarations for simpler's KernelArgs and DeviceArgs structures. 
+// These MUST match the layouts defined in platform-specific kernel_args.h: +// src/a2a3/platform/include/common/kernel_args.h +// src/a5/platform/include/common/kernel_args.h +// +// Both platforms share the same layout for fields accessed here (device_args, +// runtime_args). a2a3 has an additional ffts_base_addr field at the end which +// this code does not access. The static_assert below ensures this struct is +// at least as large as the minimum platform layout. +struct KernelArgs { + uint64_t unused[5] = {0}; + void* device_args{nullptr}; // Pointer to DeviceArgs in device memory + void* runtime_args{nullptr}; + uint64_t regs{0}; +}; + +// DeviceArgs structure as passed from DeviceRunner. +// Must match the layout in platform-specific DeviceArgs (host/device_runner.h). +struct DeviceArgs { + uint64_t unused[12] = {0}; + uint64_t aicpu_so_bin{0}; + uint64_t aicpu_so_len{0}; +}; + +static_assert(sizeof(KernelArgs) >= 64, "KernelArgs layout mismatch with platform kernel_args.h"); +static_assert(sizeof(DeviceArgs) >= 112, "DeviceArgs layout mismatch with platform DeviceArgs"); +static_assert(offsetof(KernelArgs, device_args) == 40, "KernelArgs::device_args offset mismatch"); +static_assert(offsetof(KernelArgs, runtime_args) == 48, "KernelArgs::runtime_args offset mismatch"); +static_assert(offsetof(DeviceArgs, aicpu_so_bin) == 96, "DeviceArgs::aicpu_so_bin offset mismatch"); +static_assert(offsetof(DeviceArgs, aicpu_so_len) == 104, "DeviceArgs::aicpu_so_len offset mismatch"); + +namespace aicpu_dispatcher { + +BackendServerHandleManager::~BackendServerHandleManager() +{ + if (soHandle_ != nullptr) { + LOG_INFO("Closing inner SO handle: %s", innerSoName_.c_str()); + dlclose(soHandle_); + soHandle_ = nullptr; + } +} + +bool BackendServerHandleManager::SaveSoFile(char* data, const uint64_t& len, uint8_t deviceId) +{ + std::lock_guard lock(funcLock_); + + if (len < 1) { + LOG_WARN("AICPU SO len is %lu, skipping save", len); + return true; // Don't fail for empty SO 
+ } + + // Generate inner SO file path based on device ID + // Use /tmp/aicpu_kernels/ for better portability (no root requirement) + const std::string dir_path = "/tmp/aicpu_kernels"; + innerSoName_ = dir_path + "/libaicpu_dispatcher_runtime_" + std::to_string(deviceId) + ".so"; + + // Create directory if it doesn't exist + struct stat st; + if (stat(dir_path.c_str(), &st) != 0) { + // Directory doesn't exist, create it + if (mkdir(dir_path.c_str(), 0755) != 0) { + LOG_ERROR("Failed to create directory %s: %s", dir_path.c_str(), strerror(errno)); + return false; + } + LOG_INFO("Created directory: %s", dir_path.c_str()); + } + + LOG_INFO("Saving inner AICPU SO to device %u: %s (size=%lu bytes)", deviceId, innerSoName_.c_str(), len); + + std::ofstream file(innerSoName_, std::ios::out | std::ios::binary); + if (!file.is_open()) { + LOG_ERROR("Failed to create inner SO file: %s", innerSoName_.c_str()); + return false; + } + + // Write binary to file + file.write(data, len); + + if (!file.good()) { + LOG_ERROR("Failed to write inner SO file: %s", innerSoName_.c_str()); + file.close(); + return false; + } + file.close(); + + LOG_INFO("Successfully saved inner AICPU SO for device %u: %s", deviceId, innerSoName_.c_str()); + return true; +} + +void BackendServerHandleManager::SetTileFwkKernelMap() +{ + std::lock_guard lock(funcLock_); + + if (firstLoadSo_) { + return; // Already loaded + } + + // Load init function from inner SO + (void)LoadTileFwkKernelFunc(DY_TILE_FWK_BACKEND_KERNEL_SERVER_INIT); + // Load run function from inner SO + (void)LoadTileFwkKernelFunc(DY_TILE_FWK_BACKEND_KERNEL_SERVER); + + firstLoadSo_ = true; +} + +int BackendServerHandleManager::ExecuteFunc(void* args, const uint64_t funcKey) +{ + auto func = GetTileFwkKernelFunc(funcKey); + if (func == nullptr) { + LOG_ERROR("Function key %lu not found in inner SO %s", funcKey, innerSoName_.c_str()); + return -1; + } + + return func(args); +} + +void BackendServerHandleManager::LoadTileFwkKernelFunc(const 
std::string& kernelName) +{ + if (soHandle_ == nullptr) { + soHandle_ = dlopen(innerSoName_.c_str(), RTLD_LAZY | RTLD_DEEPBIND); + if (soHandle_ == nullptr) { + char* error = dlerror(); + LOG_ERROR("Failed to dlopen inner SO %s: %s", innerSoName_.c_str(), error ? error : "unknown error"); + return; + } + LOG_INFO("Successfully dlopened inner SO: %s", innerSoName_.c_str()); + } + + // Map kernel name to function key + uint64_t funcKey = 0; + if (kernelName == DY_TILE_FWK_BACKEND_KERNEL_SERVER_INIT) { + funcKey = dyInitFuncKey; + } else if (kernelName == DY_TILE_FWK_BACKEND_KERNEL_SERVER) { + funcKey = dyExecFuncKey; + } else { + LOG_ERROR("Unknown kernel name: %s", kernelName.c_str()); + return; + } + + LOG_DEBUG("Loading function: name=%s, funcKey=%lu", kernelName.c_str(), funcKey); + + // Skip if function is already loaded + auto iter = kernelKey2FuncHandle_.find(funcKey); + if (iter != kernelKey2FuncHandle_.end()) { + LOG_DEBUG("Function already loaded: %s (funcKey=%lu)", kernelName.c_str(), funcKey); + return; + } + + // Load the function + AicpuKernelFunc funcEntry = reinterpret_cast( + dlsym(soHandle_, kernelName.c_str()) + ); + if (funcEntry == nullptr) { + char* error = dlerror(); + LOG_ERROR("Failed to dlsym %s from %s: %s", + kernelName.c_str(), innerSoName_.c_str(), error ? 
error : "unknown error"); + (void)dlclose(soHandle_); + soHandle_ = nullptr; + return; + } + LOG_INFO("Successfully loaded function: %s from %s", kernelName.c_str(), innerSoName_.c_str()); + kernelKey2FuncHandle_[funcKey] = funcEntry; +} + +AicpuKernelFunc BackendServerHandleManager::GetTileFwkKernelFunc(const uint64_t funcKey) +{ + auto iter = kernelKey2FuncHandle_.find(funcKey); + if (iter != kernelKey2FuncHandle_.end()) { + return iter->second; + } + LOG_ERROR("Function key %lu not found", funcKey); + return nullptr; +} + +} // namespace aicpu_dispatcher + +namespace { + +// Global instance of the handle manager +aicpu_dispatcher::BackendServerHandleManager g_handleManager; + +} // namespace + +// C-style exported functions (AICPU entry points) +extern "C" { + +__attribute__((visibility("default"))) uint32_t DynTileFwkKernelServerNull(void* args) +{ + if (args == nullptr) { + LOG_ERROR("Dispatcher Load: args is null"); + return 1; + } + + auto* kargs = reinterpret_cast(args); + auto* devArgs = reinterpret_cast(kargs->device_args); + if (devArgs == nullptr) { + LOG_ERROR("Dispatcher Load: DeviceArgs is null"); + return 1; + } + + auto* data = reinterpret_cast(devArgs->aicpu_so_bin); + if (devArgs->aicpu_so_len == 0) { + LOG_WARN("Dispatcher Load: inner SO binary is empty, skipping load"); + return 0; + } + + if (!g_handleManager.SaveSoFile(data, devArgs->aicpu_so_len)) { + LOG_ERROR("Dispatcher Load: failed to save inner SO"); + return 1; + } + g_handleManager.SetTileFwkKernelMap(); + return 0; +} + +__attribute__((visibility("default"))) uint32_t DynTileFwkKernelServerInit(void* args) +{ + auto ret = g_handleManager.ExecuteFunc(args, aicpu_dispatcher::dyInitFuncKey); + if (ret != 0) { + LOG_ERROR("Dispatcher Init: inner SO init failed with code %d", ret); + return 1; + } + return 0; +} + +__attribute__((visibility("default"))) uint32_t DynTileFwkKernelServer(void* args) +{ + auto ret = g_handleManager.ExecuteFunc(args, aicpu_dispatcher::dyExecFuncKey); + if (ret 
!= 0) { + LOG_ERROR("Dispatcher Run: inner SO run failed with code %d", ret); + return 1; + } + return 0; +} + +} // extern "C" diff --git a/src/common/aicpu_dispatcher/aicpu_dispatcher.h b/src/common/aicpu_dispatcher/aicpu_dispatcher.h new file mode 100644 index 000000000..f0d5dacb7 --- /dev/null +++ b/src/common/aicpu_dispatcher/aicpu_dispatcher.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * AICPU Dispatcher SO - Two-layer architecture for runtime-specific AICPU kernels + * + * This dispatcher SO provides a two-layer architecture where: + * - Outer layer (this SO) is fixed and handles dynamic SO loading + * - Inner layer (runtime-specific SO) can be different for each runtime + * + * Architecture: + * 1. DynTileFwkKernelServerNull - Load phase: receives inner SO binary, saves to AICPU filesystem + * 2. DynTileFwkKernelServerInit - Init phase: delegates to inner SO's initialization + * 3. DynTileFwkKernelServer - Run phase: delegates to inner SO's execution + * + * This allows different runtimes (tensormap, ringbuffer, etc.) to load their own + * AICPU kernel implementations at runtime without recompiling the dispatcher. 
+ */ + +#ifndef COMMON_AICPU_DISPATCHER_AICPU_DISPATCHER_H_ +#define COMMON_AICPU_DISPATCHER_AICPU_DISPATCHER_H_ + +#include +#include +#include +#include +#include +#include + +#include "common/unified_log.h" + +// Function pointer type for AICPU kernel functions +using AicpuKernelFunc = int (*)(void*); + +namespace aicpu_dispatcher { + +// Function key constants for inner SO function lookup +constexpr uint64_t dyInitFuncKey = 2; +constexpr uint64_t dyExecFuncKey = 3; + +// Kernel name constants (actual symbol names in inner SO) +constexpr char const* DY_TILE_FWK_BACKEND_KERNEL_SERVER_INIT = "DynTileFwkBackendKernelServerInit"; +constexpr char const* DY_TILE_FWK_BACKEND_KERNEL_SERVER = "DynTileFwkBackendKernelServer"; + +/** + * @brief Backend server handle manager for two-layer SO architecture + * + * Manages the lifecycle of the inner SO: + * - Saves inner SO binary to /tmp/aicpu_kernels/ + * - Loads functions from inner SO using dlopen/dlsym + * - Executes inner SO functions with provided arguments + * + * Data flow: + * - Host passes inner SO binary via DeviceArgs (aicpu_so_bin, aicpu_so_len) + * - Dispatcher's Null function receives KernelArgs->device_args pointer + * - Binary is saved to filesystem and inner SO is loaded via dlopen + */ +class BackendServerHandleManager { +public: + BackendServerHandleManager() = default; + ~BackendServerHandleManager(); + + /** + * @brief Save inner SO binary to AICPU filesystem + * + * @param data Pointer to inner SO binary data + * @param len Length of the binary data + * @param deviceId Device ID for SO naming + * @return true on success, false on failure + */ + bool SaveSoFile(char* data, const uint64_t& len, uint8_t deviceId = 0); + + /** + * @brief Load function symbols from inner SO + * + * Loads the init and run functions from the saved inner SO using dlopen/dlsym. 
+ */ + void SetTileFwkKernelMap(); + + /** + * @brief Execute a function from the inner SO + * + * @param args Arguments to pass to the function + * @param funcKey Function key (2=init, 3=run) + * @return Return value from the function, or error code + */ + int ExecuteFunc(void* args, const uint64_t funcKey); + +private: + /** + * @brief Load a specific function from the inner SO + * + * @param kernelName Name of the function to load (symbol name in inner SO) + */ + void LoadTileFwkKernelFunc(const std::string& kernelName); + + /** + * @brief Get a loaded function by its key + * + * @param funcKey Function key (2=init, 3=run) + * @return Function pointer, or nullptr if not found + */ + AicpuKernelFunc GetTileFwkKernelFunc(const uint64_t funcKey); + + std::unordered_map kernelKey2FuncHandle_; + std::mutex funcLock_; + void* soHandle_ = nullptr; + bool firstLoadSo_ = false; + std::string innerSoName_; +}; + +} // namespace aicpu_dispatcher + +// C-style exported functions (AICPU entry points) +extern "C" { + __attribute__((visibility("default"))) uint32_t DynTileFwkKernelServerNull(void* args); + __attribute__((visibility("default"))) uint32_t DynTileFwkKernelServerInit(void* args); + __attribute__((visibility("default"))) uint32_t DynTileFwkKernelServer(void* args); +} + +#endif // COMMON_AICPU_DISPATCHER_AICPU_DISPATCHER_H_ diff --git a/src/common/host/CMakeLists.txt b/src/common/host/CMakeLists.txt new file mode 100644 index 000000000..577d062b1 --- /dev/null +++ b/src/common/host/CMakeLists.txt @@ -0,0 +1,20 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +# Build host-side AICPU operation loader +cmake_minimum_required(VERSION 3.16.3) + +project(host_common LANGUAGES C CXX) + +# Host common sources +set(HOST_COMMON_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/load_aicpu_op.cpp" +) + +# This library is included directly in host_runtime, not built separately +# Sources are added to HOST_RUNTIME_SOURCES in platform CMakeLists.txt diff --git a/src/common/host/load_aicpu_op.cpp b/src/common/host/load_aicpu_op.cpp new file mode 100644 index 000000000..661ae3507 --- /dev/null +++ b/src/common/host/load_aicpu_op.cpp @@ -0,0 +1,203 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +/** + * AICPU Operation Loader Implementation + */ + +#include "load_aicpu_op.h" + +#include +#include +#include +#include + +#include "common/unified_log.h" + +#ifdef BUILD_WITH_NEW_CANN + +namespace host { + +LoadAicpuOp::~LoadAicpuOp() +{ +#ifdef BUILD_WITH_NEW_CANN + if (binary_handle_ != nullptr) { + rtError_t rc = rtsBinaryUnload(binary_handle_); + if (rc != RT_ERROR_NONE) { + LOG_WARN("rtsBinaryUnload failed: %d", rc); + } + binary_handle_ = nullptr; + } + func_handles_.clear(); + + if (!json_file_path_.empty()) { + std::remove(json_file_path_.c_str()); + LOG_INFO("LoadAicpuOp: Deleted temporary JSON file: %s", json_file_path_.c_str()); + json_file_path_.clear(); + } +#endif +} + +bool LoadAicpuOp::GenerateAicpuOpJson(const std::string& json_path, const std::string& kernel_so) +{ + std::ofstream json_file(json_path); + if (!json_file.is_open()) { + LOG_ERROR("Failed to open JSON file for writing: %s", json_path.c_str()); + return false; + } + + AicpuOpConfig init_config; + init_config.opType = KernelNames::InitName; + init_config.functionName = "DynTileFwkKernelServerInit"; + init_config.kernelSo = kernel_so; + init_config.opKernelLib = "KFCKernel"; + + AicpuOpConfig run_config; + run_config.opType = KernelNames::RunName; + run_config.functionName = "DynTileFwkKernelServer"; + run_config.kernelSo = kernel_so; + run_config.opKernelLib = "KFCKernel"; + + AicpuOpConfig null_config; + null_config.opType = KernelNames::NullName; + null_config.functionName = "DynTileFwkKernelServerNull"; + null_config.kernelSo = kernel_so; + null_config.opKernelLib = "AICPUKernel"; + + std::vector op_configs = {init_config, run_config, null_config}; + + json_file << "{\n"; + for (size_t i = 0; i < op_configs.size(); ++i) { + const auto& config = op_configs[i]; + json_file << " \"" << config.opType << "\": {\n"; + json_file << " \"opInfo\": {\n"; + json_file << " 
\"functionName\": \"" << config.functionName << "\",\n"; + json_file << " \"kernelSo\": \"" << config.kernelSo << "\",\n"; + json_file << " \"opKernelLib\": \"" << config.opKernelLib << "\",\n"; + json_file << " \"computeCost\": \"" << config.computeCost << "\",\n"; + json_file << " \"engine\": \"" << config.engine << "\",\n"; + json_file << " \"flagAsync\": \"" << config.flagAsync << "\",\n"; + json_file << " \"flagPartial\": \"" << config.flagPartial << "\",\n"; + json_file << " \"userDefined\": \"" << config.userDefined << "\"\n"; + json_file << " }\n"; + json_file << " }" << (i < op_configs.size() - 1 ? "," : "") << "\n"; + } + json_file << "}\n"; + json_file.close(); + + LOG_INFO("Generated AICPU op info JSON: %s", json_path.c_str()); + return true; +} + +int LoadAicpuOp::Init(const std::string& dispatcher_so_path) +{ + // Generate the JSON descriptor in the same directory as the SO, with the same basename, + // e.g. /path/libaicpu_dispatcher.so -> /path/libaicpu_dispatcher.json + // cpuKernelMode=1 derives the SO path by replacing .json with .so + std::string so_dir; // directory prefix including the trailing '/', left empty when the path contains no '/' + size_t last_slash = dispatcher_so_path.rfind('/'); + if (last_slash != std::string::npos) { + so_dir = dispatcher_so_path.substr(0, last_slash + 1); + } + + std::string so_basename = dispatcher_so_path; // whole input when there is no '/', otherwise the file name only + if (last_slash != std::string::npos) { + so_basename = dispatcher_so_path.substr(last_slash + 1); + } + // Replace the last ".so" in the name with ".json" (rfind matches anywhere in the name, not only a true suffix). NOTE(review): when the name contains no ".so", json_name stays equal to so_basename, so json_file_path_ is the dispatcher SO path itself - confirm callers always pass a *.so path + std::string json_name = so_basename; + size_t so_ext = json_name.rfind(".so"); + if (so_ext != std::string::npos) { + json_name = json_name.substr(0, so_ext) + ".json"; + } + + json_file_path_ = so_dir + json_name; + + // kernelSo uses relative filename (scheduler resolves via ASCEND_AICPU_PATH) + if (!GenerateAicpuOpJson(json_file_path_, so_basename)) { + json_file_path_.clear(); + return -1; // descriptor could not be generated + } + + // Load the JSON descriptor via rtsBinaryLoadFromFile with cpuKernelMode=1 + rtLoadBinaryOption_t option = {}; + option.optionId = RT_LOAD_BINARY_OPT_CPU_KERNEL_MODE; + 
option.value.cpuKernelMode = 1; + + rtLoadBinaryConfig_t load_config = {}; + load_config.options = &option; + load_config.numOpt = 1; + + LOG_INFO("LoadAicpuOp: JSON path: %s", json_file_path_.c_str()); + LOG_INFO("LoadAicpuOp: SO path: %s", dispatcher_so_path.c_str()); + + rtError_t rc = rtsBinaryLoadFromFile(json_file_path_.c_str(), &load_config, &binary_handle_); + if (rc != RT_ERROR_NONE) { + LOG_ERROR("rtsBinaryLoadFromFile failed for %s: %d", json_file_path_.c_str(), rc); + std::remove(json_file_path_.c_str()); // delete the descriptor that failed to load + json_file_path_.clear(); + return rc; + } + LOG_INFO("LoadAicpuOp: Loaded dispatcher SO, handle=%p", binary_handle_); + + // Resolve function handles for all three kernels (Null/Init/Run). NOTE(review): a failure here returns without removing the JSON or clearing json_file_path_, unlike the load-failure path above - confirm the destructor covers this case + const char* kernel_names[] = {KernelNames::NullName, KernelNames::InitName, KernelNames::RunName}; + for (const char* name : kernel_names) { + rtFuncHandle func_handle = nullptr; + rc = rtsFuncGetByName(binary_handle_, name, &func_handle); + if (rc != RT_ERROR_NONE) { + LOG_ERROR("rtsFuncGetByName failed for %s: %d", name, rc); + return rc; + } + func_handles_[name] = func_handle; // keyed by kernel name; LaunchBuiltInOp looks handles up by this key + LOG_INFO("LoadAicpuOp: Resolved function handle for %s: %p", name, func_handle); + } + + return 0; +} + +int LoadAicpuOp::AicpuKernelLaunch( + rtFuncHandle func_handle, rtStream_t stream, KernelArgs* k_args, int aicpu_num, const std::string& kernel_name +) { + (void)kernel_name; // accepted for interface compatibility but not used by this launch path + + rtCpuKernelArgs_t cpu_args = {}; + cpu_args.baseArgs.args = k_args; + cpu_args.baseArgs.argsSize = sizeof(KernelArgs); + + rtKernelLaunchCfg_t kernelLaunchCfg = {nullptr, 0U}; + auto launchKernelAttr = std::make_unique(); // NOTE(review): the template argument is missing in this copy of the patch (same for static_cast below); restore before building + kernelLaunchCfg.attrs = launchKernelAttr.get(); // attribute storage is released when this function returns + + rtError_t rc = rtsLaunchCpuKernel(func_handle, static_cast(aicpu_num), stream, &kernelLaunchCfg, &cpu_args); + if (rc != RT_ERROR_NONE) { + LOG_ERROR("rtsLaunchCpuKernel failed: %d", rc); + return rc; + } + + return 0; +} + +int LoadAicpuOp::LaunchBuiltInOp( + rtStream_t stream, KernelArgs* k_args, int aicpu_num, const std::string& func_name, 
const std::string& kernel_name +) { + auto it = func_handles_.find(func_name); // only names resolved by Init are present + if (it == func_handles_.end()) { + LOG_ERROR("Function not found: %s", func_name.c_str()); + return -1; + } + + rtFuncHandle func_handle = it->second; + return AicpuKernelLaunch(func_handle, stream, k_args, aicpu_num, kernel_name); +} + +} // namespace host + +#endif // BUILD_WITH_NEW_CANN diff --git a/src/common/host/load_aicpu_op.h b/src/common/host/load_aicpu_op.h new file mode 100644 index 000000000..9c088ea65 --- /dev/null +++ b/src/common/host/load_aicpu_op.h @@ -0,0 +1,156 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * @file load_aicpu_op.h + * @brief Host-side AICPU operation loader using new CANN 7.0+ rtsLaunchCpuKernel interface + * + * This class provides the host-side wrapper for loading and launching AICPU kernels + * through the two-layer dispatcher architecture. It generates a JSON descriptor, + * loads the dispatcher SO by passing that JSON to rtsBinaryLoadFromFile, and launches kernels via + * rtsLaunchCpuKernel. + * + * Architecture: + * - Dispatcher SO (libaicpu_dispatcher.so) - Fixed outer layer + * - Runtime SO (replaceable) - Different for each runtime (tensormap, ringbuffer, etc.) + * + * Three-phase launch pattern: + * 1. 
Load phase (DynTileFwkKernelServerNull) - Pass inner SO binary to dispatcher + * 2. Init phase (DynTileFwkKernelServerInit) - Initialize inner SO + * 3. Run phase (DynTileFwkKernelServer) - Execute actual kernel + */ + +#ifndef COMMON_HOST_LOAD_AICPU_OP_H_ +#define COMMON_HOST_LOAD_AICPU_OP_H_ + +#include +#include +#include +#include + +#include "common/kernel_args.h" + +#ifdef BUILD_WITH_NEW_CANN +#include "runtime/runtime/rts/rts_kernel.h" +#endif + +namespace host { + +/** + * @brief AICPU operation configuration for JSON descriptor generation; one instance describes one kernel entry + */ +struct AicpuOpConfig { + std::string functionName; // Actual symbol name in SO (e.g., DynTileFwkBackendKernelServerInit) + std::string kernelSo; // SO filename (e.g., libaicpu_dispatcher.so) + std::string opKernelLib; // Kernel library type (KFCKernel or AICPUKernel) + std::string computeCost = "100"; // This and the defaulted fields below are written as quoted JSON string values when the descriptor is generated + std::string engine = "DNN_VM_AICPU"; + std::string flagAsync = "False"; + std::string flagPartial = "False"; + std::string userDefined = "False"; + std::string opType; // External kernel name for rtsFuncGetByName lookup +}; + +/** + * @brief Host-side AICPU operation loader + * + * Manages the lifecycle of loading and launching AICPU kernels through the + * two-layer dispatcher architecture using CANN 7.0+ rtsLaunchCpuKernel interface. + * + * Reference: pypto framework, src/machine/runtime/load_aicpu_op.{h,cpp} (NOTE(review): previously a developer-local absolute path, /data/fangjingzhi/pypto/framework/...; replace with a repository link) + */ +class LoadAicpuOp { +public: + LoadAicpuOp() = default; + ~LoadAicpuOp(); + + // Non-copyable and non-movable: the instance holds the loaded binary handle and the resolved function handles. Note that this alone does not make the class a singleton. + LoadAicpuOp(const LoadAicpuOp&) = delete; + LoadAicpuOp& operator=(const LoadAicpuOp&) = delete; + LoadAicpuOp(LoadAicpuOp&&) = delete; + LoadAicpuOp& operator=(LoadAicpuOp&&) = delete; + + /** + * @brief Initialize the loader by loading dispatcher SO + * + * Writes a JSON descriptor next to the dispatcher SO, passes that JSON path to rtsBinaryLoadFromFile (cpuKernelMode=1) + * and resolves the PyptoNull/PyptoInit/PyptoRun function handles via rtsFuncGetByName. 
+ * + * @param dispatcher_so_path Absolute path to libaicpu_dispatcher.so; its directory is also where the JSON descriptor is written + * @return 0 on success, -1 if the JSON descriptor cannot be generated, otherwise the error code of the failing runtime call + */ + int Init(const std::string& dispatcher_so_path); + + /** + * @brief Launch a built-in dispatcher kernel + * + * Launches one of the three dispatcher kernels (Null/Init/Run) via + * rtsLaunchCpuKernel. + * + * @param stream RTS stream for kernel launch + * @param k_args Kernel arguments to pass to the AICPU kernel + * @param aicpu_num Number of AICPU cores to use + * @param func_name Name of a handle resolved by Init via rtsFuncGetByName (PyptoNull/PyptoInit/PyptoRun) + * @param kernel_name Actual symbol name in the SO (DynTileFwkKernelServerNull/Init/Server); forwarded to AicpuKernelLaunch, which currently does not use it + * @return 0 on success, -1 if func_name has no resolved handle, otherwise the rtsLaunchCpuKernel error code + */ + int LaunchBuiltInOp( + rtStream_t stream, KernelArgs* k_args, int aicpu_num, const std::string& func_name, const std::string& kernel_name + ); + +private: +#ifdef BUILD_WITH_NEW_CANN + void* binary_handle_ = nullptr; // Handle from rtsBinaryLoadFromFile + std::unordered_map func_handles_; // Function handles from rtsFuncGetByName, keyed by kernel name + std::string json_file_path_; // Path to generated JSON file (same dir/basename as SO) + + /** + * @brief Generate JSON descriptor for dispatcher SO + * + * @param json_path Path where JSON file will be created + * @param kernel_so Dispatcher SO name placed in the kernelSo JSON field; Init passes the file name without its directory + * @return true on success, false on failure + */ + bool GenerateAicpuOpJson(const std::string& json_path, const std::string& kernel_so); + + /** + * @brief Launch AICPU kernel using rtsLaunchCpuKernel + * + * @param func_handle Function handle from rtsFuncGetByName + * @param stream RTS stream + * @param k_args Kernel arguments; sizeof(KernelArgs) is passed as the argument size + * @param aicpu_num Number of AICPU cores + * @param kernel_name Currently unused (cast to void in the implementation) + * @return 0 on success, otherwise the rtsLaunchCpuKernel error code + */ + int AicpuKernelLaunch( + rtFuncHandle func_handle, rtStream_t stream, KernelArgs* k_args, int aicpu_num, const std::string& kernel_name 
+ ); +#else + // Placeholder member for builds without BUILD_WITH_NEW_CANN + void* binary_handle_ = nullptr; +#endif +}; + +// Kernel name constants: the names Init resolves via rtsFuncGetByName and LaunchBuiltInOp accepts as func_name +namespace KernelNames { + constexpr const char* NullName = "PyptoNull"; // Load phase + constexpr const char* InitName = "PyptoInit"; // Init phase + constexpr const char* RunName = "PyptoRun"; // Run phase +} // namespace KernelNames + +// Dispatcher SO file name +namespace SoNames { + constexpr const char* DispatcherSo = "libaicpu_dispatcher.so"; +} // namespace SoNames + +} // namespace host + +#endif // COMMON_HOST_LOAD_AICPU_OP_H_