From f4a0dc59759518c711605819e73d72675eacb20d Mon Sep 17 00:00:00 2001 From: dengbo Date: Mon, 22 Jun 2026 09:17:49 +0800 Subject: [PATCH] server: add LFM2 SMT vision support --- tools/mtmd/smt-vision-wrapper.cpp | 406 ++++++++++++++++++----------- tools/server/server-smt-vision.cpp | 2 +- 2 files changed, 253 insertions(+), 155 deletions(-) diff --git a/tools/mtmd/smt-vision-wrapper.cpp b/tools/mtmd/smt-vision-wrapper.cpp index f4306607aeb7..f0f866e62491 100644 --- a/tools/mtmd/smt-vision-wrapper.cpp +++ b/tools/mtmd/smt-vision-wrapper.cpp @@ -3,16 +3,17 @@ #include "smt-vision-wrapper.h" #include "ggml-profile.h" -#include "spine_vision_engine.h" +#include "onnxruntime_cxx_api.h" #include #include #include +#include #include #include #include #include -#include +#include #if defined(_WIN32) # include @@ -28,58 +29,252 @@ const OrtApi * g_ort = NULL; namespace { -template using void_t = void; +static int get_ep_thread_num(const std::unordered_map & ep_config, + const std::string & key, + int default_value) { + auto it = ep_config.find(key); + if (it == ep_config.end() || it->second.empty()) { + return default_value; + } + return std::stoi(it->second); +} -template struct has_ep_config_vision_ctor : std::false_type {}; +static bool has_spacemit_ep_affinity(const std::unordered_map & ep_config) { + auto it = ep_config.find("SPACEMIT_EP_INTRA_THREAD_AFFINITY"); + return it != ep_config.end() && !it->second.empty(); +} -template -struct has_ep_config_vision_ctor< - T, - void_t(), - std::declval(), - std::declval &>()))>> : std::true_type {}; +static std::unordered_map make_provider_options( + const std::unordered_map & ep_config) { + std::unordered_map provider_options = ep_config; + if (provider_options.find("SPACEMIT_EP_INTRA_THREAD_NUM") == provider_options.end()) { + provider_options["SPACEMIT_EP_INTRA_THREAD_NUM"] = "1"; + } + if (provider_options.find("SPACEMIT_EP_INTER_THREAD_NUM") == provider_options.end()) { + provider_options["SPACEMIT_EP_INTER_THREAD_NUM"] = "1"; + } + return provider_options; +} -template struct has_legacy_affinity_vision_ctor : std::false_type {}; +static bool init_spacemit_execution_provider(Ort::SessionOptions & options, + const std::unordered_map & provider_options, + std::string & error_message) { + std::vector keys; + std::vector values; + keys.reserve(provider_options.size()); + values.reserve(provider_options.size()); + for (const auto & entry : provider_options) { + keys.push_back(entry.first.c_str()); + values.push_back(entry.second.c_str()); + } + + void * handle = dlopen("libspacemit_ep.so", RTLD_NOW); + if (!handle) { + error_message = std::string("failed to load libspacemit_ep.so: ") + dlerror(); + return false; + } -template -struct has_legacy_affinity_vision_ctor(), - std::declval(), - std::declval(), - std::declval()))>> : std::true_type {}; + auto * ep_init = + reinterpret_cast( + dlsym(handle, "OrtSessionOptionsSpaceMITEnvInit")); + if (!ep_init) { + error_message = std::string("failed to find OrtSessionOptionsSpaceMITEnvInit: ") + dlerror(); + return false; + } -static std::unique_ptr create_spine_vision_model_engine( - std::string & model_path, - const std::string & architecture, - const std::unordered_map & ep_config) { - int intra_thread_num = 4; - int inter_thread_num = 1; - std::string intra_thread_affinity; + if (OrtStatus * status = ep_init(options, keys.data(), values.data(), keys.size())) { + error_message = Ort::GetApi().GetErrorMessage(status); + Ort::GetApi().ReleaseStatus(status); + return false; + } + + return true; +} - if (ep_config.count("SPACEMIT_EP_INTRA_THREAD_NUM")) { - intra_thread_num = std::stoi(ep_config.at("SPACEMIT_EP_INTRA_THREAD_NUM")); +static std::vector make_name_ptrs(const std::vector & names) { + std::vector ptrs; + ptrs.reserve(names.size()); + for (const auto & name : names) { + ptrs.push_back(name.c_str()); } - if (ep_config.count("SPACEMIT_EP_INTER_THREAD_NUM")) { - inter_thread_num = std::stoi(ep_config.at("SPACEMIT_EP_INTER_THREAD_NUM")); + return ptrs; +} + +static std::vector get_io_names(Ort::Session & session, bool inputs) { + std::vector names; + Ort::AllocatorWithDefaultOptions allocator; + const size_t count = inputs ? session.GetInputCount() : session.GetOutputCount(); + names.reserve(count); + for (size_t i = 0; i < count; ++i) { + auto allocated = + inputs ? session.GetInputNameAllocated(i, allocator) : session.GetOutputNameAllocated(i, allocator); + names.emplace_back(allocated.get()); + } + return names; +} + +static Ort::Value make_tensor_f32(const std::vector & shape, std::vector & data) { + Ort::MemoryInfo memory_info = + Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault); + return Ort::Value::CreateTensor(memory_info, data.data(), data.size(), shape.data(), shape.size()); +} + +class smt_ort_vision_engine { +public: + smt_ort_vision_engine(std::string model_path, std::unordered_map ep_config) : + model_path_(std::move(model_path)), + ep_config_(std::move(ep_config)), + env_(ORT_LOGGING_LEVEL_WARNING, "smt-vision") {} + + Ort::Session & create_session() { + session_options_.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL); + + const int intra_thread_num = get_ep_thread_num(ep_config_, "SPACEMIT_EP_INTRA_THREAD_NUM", 1); + const int inter_thread_num = get_ep_thread_num(ep_config_, "SPACEMIT_EP_INTER_THREAD_NUM", 1); + if (!has_spacemit_ep_affinity(ep_config_)) { + session_options_.SetIntraOpNumThreads(intra_thread_num); + session_options_.SetInterOpNumThreads(inter_thread_num); + } else { + std::cerr << "[SMT][vision] detected SPACEMIT_EP_INTRA_THREAD_AFFINITY, skip ORT session thread pinning" + << " to avoid conflicting with EP-managed affinity\n"; + } + + provider_options_ = make_provider_options(ep_config_); + std::string error_message; + if (!init_spacemit_execution_provider(session_options_, provider_options_, error_message)) { + throw std::runtime_error("[SMT][vision] failed to initialize Spacemit EP: " + error_message); + } + + std::cerr << "[SMT][vision] Spacemit EP enabled ("; + for (const auto & pair : provider_options_) { + std::cerr << ", " << pair.first << "=" << pair.second; + } + std::cerr << ")\n"; + + session_ = Ort::Session(env_, model_path_.c_str(), session_options_); + input_names_ = get_io_names(session_, true); + output_names_ = get_io_names(session_, false); + input_names_raw_ = make_name_ptrs(input_names_); + output_names_raw_ = make_name_ptrs(output_names_); + + if (input_names_raw_.size() != 1 || output_names_raw_.size() != 1) { + throw std::runtime_error("Unexpected SMT vision ONNX IO signature"); + } + + return session_; } - if (ep_config.count("SPACEMIT_EP_INTRA_THREAD_AFFINITY")) { - intra_thread_affinity = ep_config.at("SPACEMIT_EP_INTRA_THREAD_AFFINITY"); + + Ort::Value & set_input_tensor(const std::string & input_binary_path) { + auto type_info = session_.GetInputTypeInfo(0); + auto tensor_info = type_info.GetTensorTypeAndShapeInfo(); + const std::vector input_shape = tensor_info.GetShape(); + + if (tensor_info.GetElementType() != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) { + throw std::runtime_error("SMT vision expects float32 input tensor"); + } + + size_t input_size = 1; + for (const int64_t dim : input_shape) { + if (dim <= 0) { + throw std::runtime_error("SMT vision input tensor must have static positive shape"); + } + if (input_size > std::numeric_limits::max() / static_cast(dim)) { + throw std::runtime_error("SMT vision input tensor is too large"); + } + input_size *= static_cast(dim); + } + + input_data_.resize(input_size); + std::ifstream file(input_binary_path, std::ios::binary | std::ios::ate); + if (!file.is_open()) { + throw std::runtime_error("failed to open SMT vision input binary: " + input_binary_path); + } + + const std::streamoff actual_bytes = file.tellg(); + const size_t expected_bytes = input_data_.size() * sizeof(float); + if (actual_bytes < 0 || static_cast(actual_bytes) != expected_bytes) { + throw std::runtime_error("SMT vision input binary size mismatch: expected " + + std::to_string(expected_bytes) + ", actual " + + std::to_string(actual_bytes < 0 ? 0 : static_cast(actual_bytes))); + } + + file.seekg(0, std::ios::beg); + file.read(reinterpret_cast(input_data_.data()), static_cast(expected_bytes)); + if (!file) { + throw std::runtime_error("failed to read SMT vision input binary: " + input_binary_path); + } + + input_tensor_ = make_tensor_f32(input_shape, input_data_); + return input_tensor_; } - if constexpr (has_ep_config_vision_ctor::value) { - return std::make_unique(model_path, architecture, ep_config); - } else if constexpr (has_legacy_affinity_vision_ctor::value) { - return std::make_unique(model_path, intra_thread_num, - inter_thread_num, intra_thread_affinity); - } else { - if (!intra_thread_affinity.empty()) { - std::cerr << "[SMT][vision] warning: SPACEMIT_EP_INTRA_THREAD_AFFINITY is ignored by this " - "SpineVisionModelEngine version\n"; + Ort::Value make_zero_input_tensor() { + auto type_info = session_.GetInputTypeInfo(0); + auto tensor_info = type_info.GetTensorTypeAndShapeInfo(); + const std::vector input_shape = tensor_info.GetShape(); + + if (tensor_info.GetElementType() != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) { + throw std::runtime_error("SMT vision warmup expects float32 input tensor"); } - return std::make_unique(model_path, intra_thread_num, - inter_thread_num); + + size_t input_size = 1; + for (const int64_t dim : input_shape) { + if (dim <= 0) { + throw std::runtime_error("SMT vision warmup requires a static positive input shape"); + } + if (input_size > std::numeric_limits::max() / static_cast(dim)) { + throw std::runtime_error("SMT vision warmup input tensor is too large"); + } + input_size *= static_cast(dim); + } + + warmup_data_.assign(input_size, 0.0f); + return make_tensor_f32(input_shape, warmup_data_); } -} + + std::vector run_session(Ort::Value & input_tensor) { + std::vector output_tensors = + session_.Run(Ort::RunOptions{ nullptr }, input_names_raw_.data(), &input_tensor, input_names_raw_.size(), + output_names_raw_.data(), output_names_raw_.size()); + + if (output_tensors.empty()) { + throw std::runtime_error("SMT vision ONNX returned no outputs"); + } + + Ort::Value & output = output_tensors[0]; + auto tensor_info = output.GetTensorTypeAndShapeInfo(); + auto shape = tensor_info.GetShape(); + if (tensor_info.GetElementType() != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) { + throw std::runtime_error("Expected float32 output from SMT vision model"); + } + + if (shape.size() == 3 && shape[0] == 1) { + shape = { shape[1], shape[2] }; + } + if (shape.size() != 2) { + throw std::runtime_error("Unexpected output shape from SMT vision encoder"); + } + + const size_t total_elements = static_cast(shape[0]) * static_cast(shape[1]); + const float * data = output.GetTensorData(); + return std::vector(data, data + total_elements); + } + +private: + std::string model_path_; + std::unordered_map ep_config_; + std::unordered_map provider_options_; + Ort::Env env_; + Ort::SessionOptions session_options_; + Ort::Session session_{ nullptr }; + std::vector input_names_; + std::vector output_names_; + std::vector input_names_raw_; + std::vector output_names_raw_; + std::vector input_data_; + std::vector warmup_data_; + Ort::Value input_tensor_{ nullptr }; +}; struct smt_vision_config { std::vector architectures; @@ -567,111 +762,22 @@ static bool load_smt_vision_config(const std::string & config_dir, smt_vision_co } // namespace struct smt_vision_context::impl { - smt_vision_config config; - std::unique_ptr vision_engine; - std::string arch_name; + smt_vision_config config; + std::unique_ptr vision_engine; + std::string arch_name; }; namespace { -static size_t get_static_input_tensor_elements(Ort::Session & session) { - auto type_info = session.GetInputTypeInfo(0); - auto tensor_info = type_info.GetTensorTypeAndShapeInfo(); - - if (tensor_info.GetElementType() != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) { - throw std::runtime_error("SMT vision warmup expects float32 input tensor"); - } - - const std::vector input_shape = tensor_info.GetShape(); - size_t input_size = 1; - for (const int64_t dim : input_shape) { - if (dim <= 0) { - throw std::runtime_error("SMT vision warmup requires a static positive input shape"); - } - if (input_size > std::numeric_limits::max() / static_cast(dim)) { - throw std::runtime_error("SMT vision warmup input tensor is too large"); - } - input_size *= static_cast(dim); - } - - return input_size; -} - -static std::string write_zero_tensor_file(size_t n_floats) { - const std::vector zeros(n_floats, 0.0f); - -#if defined(_WIN32) - char temp_path[MAX_PATH] = { 0 }; - char temp_file[MAX_PATH] = { 0 }; - - if (GetTempPathA(MAX_PATH, temp_path) == 0) { - throw std::runtime_error("failed to get temp path for SMT vision warmup"); - } - if (GetTempFileNameA(temp_path, "lsw", 0, temp_file) == 0) { - throw std::runtime_error("failed to create temp file for SMT vision warmup"); - } - - std::ofstream file(temp_file, std::ios::binary); - if (!file.is_open()) { - std::remove(temp_file); - throw std::runtime_error("failed to open temp file for SMT vision warmup"); - } - file.write(reinterpret_cast(zeros.data()), - static_cast(zeros.size() * sizeof(float))); - if (!file) { - file.close(); - std::remove(temp_file); - throw std::runtime_error("failed to write temp file for SMT vision warmup"); - } - file.close(); - return std::string(temp_file); -#else - char tmpl[] = "/tmp/llama-smt-vision-warmup-XXXXXX"; - const int fd = mkstemp(tmpl); - if (fd < 0) { - throw std::runtime_error("failed to create temp file for SMT vision warmup"); - } - - const char * ptr = reinterpret_cast(zeros.data()); - size_t bytes_left = zeros.size() * sizeof(float); - while (bytes_left > 0) { - const ssize_t written = write(fd, ptr, bytes_left); - if (written <= 0) { - close(fd); - std::remove(tmpl); - throw std::runtime_error("failed to write temp file for SMT vision warmup"); - } - ptr += written; - bytes_left -= static_cast(written); - } - - close(fd); - return std::string(tmpl); -#endif -} - -static void warmup_vision_engine(onnxruntime::spacemit::SpineVisionModelEngine & vision_engine, - Ort::Session & session, - const std::string & arch_name) { - const size_t input_elements = get_static_input_tensor_elements(session); - const std::string temp_path = write_zero_tensor_file(input_elements); - - try { - std::cerr << "[SMT][vision] warmup ONNX session"; - if (!arch_name.empty()) { - std::cerr << " for " << arch_name; - } - std::cerr << "\n"; - - std::string path_copy = temp_path; - Ort::Value & input_tensor = vision_engine.SetInputTensor(path_copy); - (void) vision_engine.RunSession(input_tensor); - } catch (...) { - std::remove(temp_path.c_str()); - throw; +static void warmup_vision_engine(smt_ort_vision_engine & vision_engine, const std::string & arch_name) { + std::cerr << "[SMT][vision] warmup ONNX session"; + if (!arch_name.empty()) { + std::cerr << " for " << arch_name; } + std::cerr << "\n"; - std::remove(temp_path.c_str()); + Ort::Value input_tensor = vision_engine.make_zero_input_tensor(); + (void) vision_engine.run_session(input_tensor); } } // namespace @@ -696,18 +802,12 @@ std::unique_ptr smt_vision_context::create(const std::string onnxruntime::g_ort = OrtGetApiBase()->GetApi(ORT_API_VERSION); // 3. Create vision engine and session - d.vision_engine = create_spine_vision_model_engine(d.config.vision_model_path, d.arch_name, d.config.ep_config); - Ort::Session & vision_session = d.vision_engine->CreateVisionModelSession(); + d.vision_engine = std::make_unique(d.config.vision_model_path, d.config.ep_config); + (void) d.vision_engine->create_session(); if (warmup) { - warmup_vision_engine(*d.vision_engine, vision_session, d.arch_name); + warmup_vision_engine(*d.vision_engine, d.arch_name); } - std::cerr << "[SMT][vision] Spacemit EP enabled ("; - for (const auto & pair : d.config.ep_config) { - std::cerr << ", " << pair.first << "=" << pair.second; - } - std::cerr << ")\n"; - return ctx; } @@ -716,14 +816,12 @@ std::vector smt_vision_context::encode_image(const std::string & binary_p ggml_trace_log_begin("encode_image", "Vision", NULL); - std::string path_copy = binary_path; - ggml_trace_log_begin("set_input_tensor", "Vision", NULL); - Ort::Value & input_tensor = d.vision_engine->SetInputTensor(path_copy); + Ort::Value & input_tensor = d.vision_engine->set_input_tensor(binary_path); ggml_trace_log_end("set_input_tensor", "Vision", NULL); ggml_trace_log_begin("vision_session_run", "Vision", NULL); - std::vector result = d.vision_engine->RunSession(input_tensor); + std::vector result = d.vision_engine->run_session(input_tensor); ggml_trace_log_end("vision_session_run", "Vision", NULL); ggml_trace_log_end("encode_image", "Vision", NULL); diff --git a/tools/server/server-smt-vision.cpp b/tools/server/server-smt-vision.cpp index 38bbc40d0362..09bc7b9f048d 100644 --- a/tools/server/server-smt-vision.cpp +++ b/tools/server/server-smt-vision.cpp @@ -650,7 +650,7 @@ static std::pair, std::vector> detect_imag contains_icase(arch_name, "qwen3vl") || contains_icase(arch_name, "youtuvl")) { return { tokenize_exact_special(lctx, "<|vision_start|>"), tokenize_exact_special(lctx, "<|vision_end|>") }; } - if (contains_icase(arch_name, "llama4")) { + if (contains_icase(arch_name, "llama4") || contains_icase(arch_name, "lfm2")) { return { tokenize_exact_special(lctx, "<|image_start|>"), tokenize_exact_special(lctx, "<|image_end|>") }; } if (contains_icase(arch_name, "gemma3")) {