// Provenance (patch header this section was taken from):
//   commit 743973d0539dc2bea49a0b37456266f765b9e03e
//   From: liangjunzhao, Date: Mon, 15 Jun 2026 11:19:34 +0800
//   Subject: feat(mtmd): add FunASR fbank and LFR audio encoding path
//   Adds Kaldi-compatible fbank extraction and LFR frame stacking helpers for
//   FunASR/SenseVoice-style models (target file: tools/mtmd/mtmd-audio.cpp).

//
// FunASR Kaldi-compatible fbank + LFR
//

// Single-precision pi. Used instead of M_PI, which is not part of standard C++.
static constexpr float funasr_pi = 3.14159265358979323846f;

// In-place iterative radix-2 FFT (bit-reversal permutation followed by butterflies).
//
// data : n complex values stored interleaved as (re, im), i.e. 2 * n floats
// n    : transform size; must be a power of two
//
// Sizes that are not a positive power of two are rejected (the buffer is left
// untouched) because the butterfly indexing below would otherwise run past the
// end of the buffer.
static void funasr_fft_inplace(float * data, int n) {
    if (data == nullptr || n <= 0 || (n & (n - 1)) != 0) {
        return;
    }

    // bit-reversal permutation
    for (int i = 1, j = 0; i < n; i++) {
        int bit = n >> 1;
        for (; j & bit; bit >>= 1) {
            j ^= bit;
        }
        j ^= bit;
        if (i < j) {
            std::swap(data[2 * i], data[2 * j]);
            std::swap(data[2 * i + 1], data[2 * j + 1]);
        }
    }

    // butterflies, doubling the sub-transform length each pass
    for (int len = 2; len <= n; len <<= 1) {
        const int   half  = len / 2;
        const float angle = -2.0f * funasr_pi / len;
        const float wre   = cosf(angle);
        const float wim   = sinf(angle);
        for (int i = 0; i < n; i += len) {
            // twiddle factor, advanced by one step of (wre, wim) per butterfly
            float ure = 1.0f;
            float uim = 0.0f;
            for (int j = 0; j < half; j++) {
                const int   a   = i + j;
                const int   b   = a + half;
                const float tre = data[2 * b] * ure - data[2 * b + 1] * uim;
                const float tim = data[2 * b] * uim + data[2 * b + 1] * ure;
                data[2 * b]     = data[2 * a] - tre;
                data[2 * b + 1] = data[2 * a + 1] - tim;
                data[2 * a] += tre;
                data[2 * a + 1] += tim;
                const float new_ure = ure * wre - uim * wim;
                uim                 = ure * wim + uim * wre;
                ure                 = new_ure;
            }
        }
    }
}

// Builds a dense triangular mel filterbank using the HTK mel formula
// mel = 1127 * ln(1 + f / 700), spanning 0 Hz .. sample_rate / 2.
//
// Returns n_mel rows of (n_fft / 2 + 1) weights in row-major order, or an empty
// vector when any argument is non-positive.
//
// NOTE(review): triangle weights are interpolated linearly in Hz between the
// band edges; confirm this matches the reference FunASR/Kaldi frontend.
static std::vector<float> funasr_build_mel_filterbank_htk(int n_mel, int n_fft, int sample_rate) {
    if (n_mel <= 0 || n_fft <= 0 || sample_rate <= 0) {
        return {};
    }

    const int   n_fft_bins  = n_fft / 2 + 1;
    const float fmax        = (float) sample_rate / 2.0f;
    const float bin_hz_step = (float) sample_rate / (float) n_fft;

    auto hz_to_mel = [](float f) -> float { return 1127.0f * logf(1.0f + f / 700.0f); };
    auto mel_to_hz = [](float m) -> float { return 700.0f * (expf(m / 1127.0f) - 1.0f); };

    const float mel_lo = hz_to_mel(0.0f);
    const float mel_hi = hz_to_mel(fmax);

    // n_mel + 2 band edges, equally spaced on the mel axis, converted back to Hz
    std::vector<float> hz_pts((size_t) n_mel + 2);
    for (int i = 0; i < n_mel + 2; i++) {
        const float mel = mel_lo + (mel_hi - mel_lo) * (float) i / (float) (n_mel + 1);
        hz_pts[i]       = mel_to_hz(mel);
    }

    std::vector<float> filters((size_t) n_mel * n_fft_bins, 0.0f);
    for (int m = 0; m < n_mel; m++) {
        const float f_left   = hz_pts[m];
        const float f_center = hz_pts[m + 1];
        const float f_right  = hz_pts[m + 2];
        float *     row      = filters.data() + (size_t) m * n_fft_bins;
        for (int k = 0; k < n_fft_bins; k++) {
            const float f = k * bin_hz_step;
            if (f >= f_left && f <= f_center && f_center > f_left) {
                row[k] = (f - f_left) / (f_center - f_left);  // rising edge
            } else if (f > f_center && f <= f_right && f_right > f_center) {
                row[k] = (f_right - f) / (f_right - f_center);  // falling edge
            }
        }
    }
    return filters;
}

// Computes log mel filterbank (fbank) features.
//
// Pipeline: pre-emphasis over the whole signal (the first sample is passed through
// unchanged), framing without padding, Hamming window, zero-padded power-of-two FFT,
// power spectrum scaled by 1 / n_fft, mel filterbank, natural log floored at 1e-10.
//
// samples       : mono PCM samples
// n_samples     : number of samples
// sample_rate   : sampling rate in Hz
// n_mel         : number of mel bins
// frame_len     : frame length in samples
// frame_shift   : hop between frames in samples
// preemph_coeff : pre-emphasis coefficient
// features      : output, [n_frames, n_mel] row-major; only written on success
// n_frames_out  : output, number of frames; only written on success
//
// Returns false when an argument is invalid or the input is shorter than one frame.
//
// NOTE(review): the window is computed with period frame_len (cos(2*pi*i/frame_len));
// Kaldi's Hamming window divides by frame_len - 1 - confirm which form the reference
// frontend expects.
bool mtmd_audio_compute_kaldi_fbank(const float *        samples,
                                    size_t               n_samples,
                                    int                  sample_rate,
                                    int                  n_mel,
                                    int                  frame_len,
                                    int                  frame_shift,
                                    float                preemph_coeff,
                                    std::vector<float> & features,
                                    int &                n_frames_out) {
    if (samples == nullptr || n_samples == 0 || sample_rate <= 0 || n_mel <= 0) {
        return false;
    }
    // frame_shift == 0 would divide by zero; the upper bound keeps the n_fft search below from overflowing
    if (frame_len <= 0 || frame_len > (1 << 30) || frame_shift <= 0) {
        return false;
    }

    // The frame count is computed in size_t after an explicit length check. With signed
    // arithmetic, (n_samples - frame_len) / frame_shift truncates toward zero, so inputs
    // slightly shorter than one frame used to yield one frame and read past the buffer.
    const size_t frame_len_sz = (size_t) frame_len;
    if (n_samples < frame_len_sz) {
        return false;
    }
    const size_t n_frames_sz = (n_samples - frame_len_sz) / (size_t) frame_shift + 1;
    const int    n_frames    = (int) n_frames_sz;
    if (n_frames <= 0 || (size_t) n_frames != n_frames_sz) {
        return false;  // frame count does not fit in int
    }

    std::vector<float> emphasized(n_samples);
    emphasized[0] = samples[0];
    for (size_t i = 1; i < n_samples; i++) {
        emphasized[i] = samples[i] - preemph_coeff * samples[i - 1];
    }

    // smallest power of two >= frame_len
    int n_fft = 1;
    while (n_fft < frame_len) {
        n_fft <<= 1;
    }
    const int n_fft_bins = n_fft / 2 + 1;

    std::vector<float> window(frame_len_sz);
    for (int i = 0; i < frame_len; i++) {
        window[i] = 0.54f - 0.46f * cosf(2.0f * funasr_pi * i / frame_len);
    }

    const std::vector<float> mel_filters = funasr_build_mel_filterbank_htk(n_mel, n_fft, sample_rate);

    features.resize((size_t) n_frames * n_mel);
    std::vector<float> fft_buf((size_t) n_fft * 2, 0.0f);
    std::vector<float> power((size_t) n_fft_bins);

    const float inv_n_fft = 1.0f / (float) n_fft;

    for (int frame = 0; frame < n_frames; frame++) {
        const size_t offset = (size_t) frame * (size_t) frame_shift;

        // windowed frame goes into the real parts; imaginary parts and padding stay zero
        std::fill(fft_buf.begin(), fft_buf.end(), 0.0f);
        for (int j = 0; j < frame_len; j++) {
            fft_buf[2 * (size_t) j] = emphasized[offset + j] * window[j];
        }

        funasr_fft_inplace(fft_buf.data(), n_fft);

        // the power spectrum is the same for every mel bin: compute it once per frame
        for (int k = 0; k < n_fft_bins; k++) {
            const float re = fft_buf[2 * (size_t) k];
            const float im = fft_buf[2 * (size_t) k + 1];
            power[k]       = (re * re + im * im) * inv_n_fft;
        }

        float * out = features.data() + (size_t) frame * n_mel;
        for (int m = 0; m < n_mel; m++) {
            const float * weights = mel_filters.data() + (size_t) m * n_fft_bins;
            float         sum     = 0.0f;
            for (int k = 0; k < n_fft_bins; k++) {
                sum += power[k] * weights[k];
            }
            out[m] = logf(std::max(sum, 1e-10f));
        }
    }

    n_frames_out = n_frames;
    return true;
}

// Low Frame Rate (LFR) stacking: output frame i is the concatenation of lfr_m input
// frames starting at i * lfr_n - lfr_m / 2; indices outside [0, n_frames) are clamped
// to the first / last frame.
//
// features         : input, [n_frames, n_mel] row-major
// n_frames         : number of input frames
// n_mel            : features per input frame
// lfr_m            : number of frames stacked per output frame
// lfr_n            : stride, in input frames, between output frames
// lfr_features     : output, [n_lfr_frames, n_mel * lfr_m]; only written on success
// n_lfr_frames_out : output, ceil(n_frames / lfr_n); only written on success
//
// Returns false when any size argument is non-positive or `features` holds fewer than
// n_frames * n_mel values.
bool mtmd_audio_compute_lfr(const std::vector<float> & features,
                            int                        n_frames,
                            int                        n_mel,
                            int                        lfr_m,
                            int                        lfr_n,
                            std::vector<float> &       lfr_features,
                            int &                      n_lfr_frames_out) {
    if (n_frames <= 0 || n_mel <= 0 || lfr_m <= 0 || lfr_n <= 0) {
        return false;
    }
    // every source frame copied below must lie inside the input buffer
    if (features.size() < (size_t) n_frames * (size_t) n_mel) {
        return false;
    }

    const int    n_lfr    = (n_frames - 1) / lfr_n + 1;  // ceil(n_frames / lfr_n) without overflow
    const size_t mel_sz   = (size_t) n_mel;
    const size_t feat_dim = mel_sz * (size_t) lfr_m;
    const int    half_m   = lfr_m / 2;

    // built in a scratch vector so the call also works when lfr_features aliases features
    std::vector<float> stacked((size_t) n_lfr * feat_dim);
    for (int i = 0; i < n_lfr; i++) {
        // 64-bit index math: i * lfr_n may exceed the int range for large strides
        const long long start = (long long) i * lfr_n - half_m;
        float *         dst   = stacked.data() + (size_t) i * feat_dim;
        for (int j = 0; j < lfr_m; j++) {
            const long long src = std::min<long long>(std::max<long long>(start + j, 0), n_frames - 1);
            std::memcpy(dst + (size_t) j * mel_sz, features.data() + (size_t) src * mel_sz, mel_sz * sizeof(float));
        }
    }

    lfr_features.swap(stacked);
    n_lfr_frames_out = n_lfr;
    return true;
}
a/tools/mtmd/mtmd-audio.h b/tools/mtmd/mtmd-audio.h index 1f8b0c3f3793..e116e10c1211 100644 --- a/tools/mtmd/mtmd-audio.h +++ b/tools/mtmd/mtmd-audio.h @@ -120,6 +120,29 @@ struct mtmd_audio_preprocessor_qwen3a : mtmd_audio_preprocessor { mtmd_audio_cache cache; }; +// Kaldi-compatible fbank features (used by FunASR/SenseVoice models). +// Output layout: [n_frames, n_mel] flattened in row-major order per time frame. +bool mtmd_audio_compute_kaldi_fbank(const float * samples, + size_t n_samples, + int sample_rate, + int n_mel, + int frame_len, + int frame_shift, + float preemph_coeff, + std::vector & features, + int & n_frames_out); + +// Low Frame Rate (LFR) frame stacking for FunASR models. +// Stacks lfr_m consecutive frames with stride lfr_n, using centered window +// and boundary clamping. Output layout: [n_lfr_frames, n_mel * lfr_m]. +bool mtmd_audio_compute_lfr(const std::vector & features, + int n_frames, + int n_mel, + int lfr_m, + int lfr_n, + std::vector & lfr_features, + int & n_lfr_frames_out); + // // streaming ISTFT - converts spectrogram frames back to audio one frame at a time // diff --git a/tools/mtmd/smt-audio-wrapper.cpp b/tools/mtmd/smt-audio-wrapper.cpp index 4a6e83320755..e9ea0e96dc44 100644 --- a/tools/mtmd/smt-audio-wrapper.cpp +++ b/tools/mtmd/smt-audio-wrapper.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -50,6 +51,8 @@ struct smt_audio_config { int32_t hop_len = 160; int32_t intra_thread_num = 4; int32_t inter_thread_num = 1; + int32_t lfr_m = 0; + int32_t lfr_n = 0; }; static std::string read_file_to_string(const std::string & path) { @@ -343,6 +346,8 @@ static bool parse_audio_config_block(const std::string & config_dir, config.n_fft = (int32_t) extract_int64_value(audio_block, "n_fft", config.n_fft); config.window_len = (int32_t) extract_int64_value(audio_block, "window_len", config.window_len); config.hop_len = (int32_t) extract_int64_value(audio_block, "hop_len", config.hop_len); + 
config.lfr_m = (int32_t) extract_int64_value(audio_block, "lfr_m", config.lfr_m); + config.lfr_n = (int32_t) extract_int64_value(audio_block, "lfr_n", config.lfr_n); config.ep_config = extract_string_map(audio_block, "ep_config"); apply_legacy_spacemit_ep_config(audio_block, config); config.architectures = extract_string_array(content, "architectures"); @@ -654,7 +659,8 @@ std::unique_ptr smt_audio_context::create(const std::string & d.backend_output_names_raw = make_name_ptrs(d.backend_output_names); if (d.frontend_input_names_raw.size() != 1 || d.frontend_output_names_raw.size() != 1 || - d.backend_input_names_raw.size() != 2 || d.backend_output_names_raw.size() != 1) { + (d.backend_input_names_raw.size() != 1 && d.backend_input_names_raw.size() != 2) || + d.backend_output_names_raw.size() != 1) { throw std::runtime_error("Unexpected SMT audio ONNX IO signature"); } @@ -665,45 +671,77 @@ std::unique_ptr smt_audio_context::create(const std::string & } std::cerr << "\n"; - const int chunk_frames = 100; - const int chunk_tokens = 13; - const int t_out = chunk_tokens; + int warmup_t_out; + std::vector warmup_hidden; - std::vector frontend_input_data((size_t) d.config.num_mel_bins * chunk_frames, 0.0f); - const std::vector frontend_input_shape = { 1, d.config.num_mel_bins, chunk_frames }; - auto frontend_input = make_tensor_f32(frontend_input_shape, frontend_input_data); + if (d.config.lfr_m > 0) { + const int warmup_frames = 10; + const int feat_dim = d.config.num_mel_bins * d.config.lfr_m; - std::cerr << "[SMT][audio] warmup frontend ONNX session: " << d.config.frontend_model_path << "\n"; - auto frontend_outputs = d.frontend_session.Run(Ort::RunOptions{ nullptr }, d.frontend_input_names_raw.data(), - &frontend_input, 1, d.frontend_output_names_raw.data(), 1); + std::vector frontend_input_data((size_t) warmup_frames * feat_dim, 0.0f); + const std::vector frontend_input_shape = { 1, warmup_frames, (int64_t) feat_dim }; + auto frontend_input = 
make_tensor_f32(frontend_input_shape, frontend_input_data); - std::vector hidden_states((size_t) t_out * (size_t) d.config.d_model, 0.0f); - if (frontend_outputs.empty()) { - throw std::runtime_error("SMT audio warmup frontend returned no outputs"); - } - const auto frontend_output_info = frontend_outputs[0].GetTensorTypeAndShapeInfo(); - if (frontend_output_info.GetElementType() != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) { - throw std::runtime_error("SMT audio warmup frontend output must be float32"); - } - const int64_t frontend_output_elems = frontend_output_info.GetElementCount(); - if (frontend_output_elems < 0 || (size_t) frontend_output_elems < hidden_states.size()) { - throw std::runtime_error("SMT audio warmup frontend output is smaller than expected"); + std::cerr << "[SMT][audio] warmup frontend ONNX session (FunASR): " << d.config.frontend_model_path << "\n"; + auto frontend_outputs = d.frontend_session.Run(Ort::RunOptions{ nullptr }, d.frontend_input_names_raw.data(), + &frontend_input, 1, d.frontend_output_names_raw.data(), 1); + if (frontend_outputs.empty()) { + throw std::runtime_error("SMT audio warmup frontend returned no outputs"); + } + const auto frontend_output_info = frontend_outputs[0].GetTensorTypeAndShapeInfo(); + if (frontend_output_info.GetElementType() != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) { + throw std::runtime_error("SMT audio warmup frontend output must be float32"); + } + auto shape = frontend_output_info.GetShape(); + warmup_t_out = (int) shape[1]; + warmup_hidden.resize((size_t) warmup_t_out * (size_t) d.config.d_model, 0.0f); + const float * frontend_output = frontend_outputs[0].GetTensorData(); + std::memcpy(warmup_hidden.data(), frontend_output, warmup_hidden.size() * sizeof(float)); + } else { + const int chunk_frames = 100; + const int chunk_tokens = 13; + warmup_t_out = chunk_tokens; + + std::vector frontend_input_data((size_t) d.config.num_mel_bins * chunk_frames, 0.0f); + const std::vector frontend_input_shape = { 1, 
d.config.num_mel_bins, chunk_frames }; + auto frontend_input = make_tensor_f32(frontend_input_shape, frontend_input_data); + + std::cerr << "[SMT][audio] warmup frontend ONNX session: " << d.config.frontend_model_path << "\n"; + auto frontend_outputs = d.frontend_session.Run(Ort::RunOptions{ nullptr }, d.frontend_input_names_raw.data(), + &frontend_input, 1, d.frontend_output_names_raw.data(), 1); + + warmup_hidden.resize((size_t) warmup_t_out * (size_t) d.config.d_model, 0.0f); + if (frontend_outputs.empty()) { + throw std::runtime_error("SMT audio warmup frontend returned no outputs"); + } + const auto frontend_output_info = frontend_outputs[0].GetTensorTypeAndShapeInfo(); + if (frontend_output_info.GetElementType() != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) { + throw std::runtime_error("SMT audio warmup frontend output must be float32"); + } + const int64_t frontend_output_elems = frontend_output_info.GetElementCount(); + if (frontend_output_elems < 0 || (size_t) frontend_output_elems < warmup_hidden.size()) { + throw std::runtime_error("SMT audio warmup frontend output is smaller than expected"); + } + const float * frontend_output = frontend_outputs[0].GetTensorData(); + std::memcpy(warmup_hidden.data(), frontend_output, warmup_hidden.size() * sizeof(float)); } - const float * frontend_output = frontend_outputs[0].GetTensorData(); - std::memcpy(hidden_states.data(), frontend_output, hidden_states.size() * sizeof(float)); - - std::vector attention_mask((size_t) t_out * (size_t) t_out, 0.0f); - const std::vector backend_hidden_shape = { 1, t_out, d.config.d_model }; - const std::vector backend_mask_shape = { 1, 1, t_out, t_out }; - auto hidden_tensor = make_tensor_f32(backend_hidden_shape, hidden_states); - auto mask_tensor = make_tensor_f32(backend_mask_shape, attention_mask); - std::array backend_inputs = { std::move(hidden_tensor), std::move(mask_tensor) }; + const std::vector backend_hidden_shape = { 1, warmup_t_out, d.config.d_model }; + auto hidden_tensor = 
make_tensor_f32(backend_hidden_shape, warmup_hidden); std::cerr << "[SMT][audio] warmup backend ONNX session: " << d.config.backend_model_path << "\n"; - (void) d.backend_session.Run(Ort::RunOptions{ nullptr }, d.backend_input_names_raw.data(), - backend_inputs.data(), backend_inputs.size(), d.backend_output_names_raw.data(), - 1); + if (d.backend_input_names_raw.size() == 2) { + std::vector attention_mask((size_t) warmup_t_out * (size_t) warmup_t_out, 0.0f); + const std::vector backend_mask_shape = { 1, 1, warmup_t_out, warmup_t_out }; + auto mask_tensor = make_tensor_f32(backend_mask_shape, attention_mask); + std::array backend_inputs = { std::move(hidden_tensor), std::move(mask_tensor) }; + (void) d.backend_session.Run(Ort::RunOptions{ nullptr }, d.backend_input_names_raw.data(), + backend_inputs.data(), backend_inputs.size(), + d.backend_output_names_raw.data(), 1); + } else { + (void) d.backend_session.Run(Ort::RunOptions{ nullptr }, d.backend_input_names_raw.data(), + &hidden_tensor, 1, d.backend_output_names_raw.data(), 1); + } } return ctx; @@ -724,77 +762,150 @@ std::vector smt_audio_context::encode_audio(const std::string & audio_pat } ggml_trace_log_end("decode_audio_file", "Audio", NULL); - mtmd_audio_mel mel; - ggml_trace_log_begin("compute_log_mel_spectrogram", "Audio", NULL); - if (!mtmd_audio_compute_log_mel_spectrogram(samples.data(), samples.size(), 4, d.config.num_mel_bins, - d.config.n_fft, d.config.window_len, d.config.hop_len, - d.config.sample_rate, true, 0.0f, false, false, mel)) { + int t_out; + std::vector hidden_states; + + if (d.config.lfr_m > 0) { + // FunASR path: kaldi fbank -> LFR -> frontend -> backend + std::vector fbank_features; + int n_fbank_frames = 0; + ggml_trace_log_begin("compute_kaldi_fbank", "Audio", NULL); + if (!mtmd_audio_compute_kaldi_fbank(samples.data(), samples.size(), d.config.sample_rate, d.config.num_mel_bins, + d.config.window_len, d.config.hop_len, 0.97f, fbank_features, n_fbank_frames)) { + 
ggml_trace_log_end("compute_kaldi_fbank", "Audio", NULL); + ggml_trace_log_end("encode_audio", "Audio", NULL); + ggml_profile_flush_tls(); + throw std::runtime_error("failed to compute kaldi fbank features"); + } + ggml_trace_log_end("compute_kaldi_fbank", "Audio", NULL); + + std::vector lfr_features; + int n_lfr_frames = 0; + if (!mtmd_audio_compute_lfr(fbank_features, n_fbank_frames, d.config.num_mel_bins, d.config.lfr_m, d.config.lfr_n, + lfr_features, n_lfr_frames)) { + ggml_trace_log_end("encode_audio", "Audio", NULL); + ggml_profile_flush_tls(); + throw std::runtime_error("failed to compute LFR features"); + } + + // Per-frame mean subtraction for ONNX numerical stability. + // LayerNorm is shift-invariant: LN(x + c) = LN(x), so subtracting the + // per-frame mean does not change the model output, but prevents catastrophic + // cancellation in the ONNX decomposed variance computation (E[x²] - E[x]²) + // when input values have large magnitude but small variance. + const int feat_dim = d.config.num_mel_bins * d.config.lfr_m; + for (int i = 0; i < n_lfr_frames; i++) { + float * frame = lfr_features.data() + (size_t) i * feat_dim; + float sum = 0.0f; + for (int j = 0; j < feat_dim; j++) { + sum += frame[j]; + } + float mean = sum / (float) feat_dim; + for (int j = 0; j < feat_dim; j++) { + frame[j] -= mean; + } + } + const std::vector frontend_input_shape = { 1, (int64_t) n_lfr_frames, (int64_t) feat_dim }; + auto frontend_input = make_tensor_f32(frontend_input_shape, lfr_features); + + ggml_trace_log_begin("frontend_session_run", "Audio", NULL); + auto frontend_outputs = d.frontend_session.Run(Ort::RunOptions{ nullptr }, d.frontend_input_names_raw.data(), + &frontend_input, 1, d.frontend_output_names_raw.data(), 1); + ggml_trace_log_end("frontend_session_run", "Audio", NULL); + + if (frontend_outputs.empty()) { + ggml_trace_log_end("encode_audio", "Audio", NULL); + ggml_profile_flush_tls(); + throw std::runtime_error("FunASR frontend returned no outputs"); + } 
+ + const auto frontend_shape = frontend_outputs[0].GetTensorTypeAndShapeInfo().GetShape(); + t_out = (int) frontend_shape[1]; + hidden_states.resize((size_t) t_out * (size_t) d.config.d_model); + std::memcpy(hidden_states.data(), frontend_outputs[0].GetTensorData(), + hidden_states.size() * sizeof(float)); + + } else { + // Qwen3ASR path: mel spectrogram -> chunk -> frontend -> backend + mtmd_audio_mel mel; + ggml_trace_log_begin("compute_log_mel_spectrogram", "Audio", NULL); + if (!mtmd_audio_compute_log_mel_spectrogram(samples.data(), samples.size(), 4, d.config.num_mel_bins, + d.config.n_fft, d.config.window_len, d.config.hop_len, + d.config.sample_rate, true, 0.0f, false, false, mel)) { + ggml_trace_log_end("compute_log_mel_spectrogram", "Audio", NULL); + ggml_trace_log_end("encode_audio", "Audio", NULL); + ggml_profile_flush_tls(); + throw std::runtime_error("failed to compute Qwen3-ASR mel spectrogram"); + } ggml_trace_log_end("compute_log_mel_spectrogram", "Audio", NULL); - ggml_trace_log_end("encode_audio", "Audio", NULL); - ggml_profile_flush_tls(); - throw std::runtime_error("failed to compute Qwen3-ASR mel spectrogram"); - } - ggml_trace_log_end("compute_log_mel_spectrogram", "Audio", NULL); - if (mel.n_len <= 0 || mel.n_mel != d.config.num_mel_bins) { - ggml_trace_log_end("encode_audio", "Audio", NULL); - ggml_profile_flush_tls(); - throw std::runtime_error("invalid mel spectrogram shape"); - } - - const int frames = mel.n_len; - const int chunk_frames = 100; - const int chunk_tokens = 13; - const int padded_frames = ((frames + chunk_frames - 1) / chunk_frames) * chunk_frames; - const int n_chunks = padded_frames / chunk_frames; - - std::vector hidden_states((size_t) n_chunks * chunk_tokens * (size_t) d.config.d_model); - std::vector chunk_input((size_t) d.config.num_mel_bins * chunk_frames, 0.0f); - const std::vector frontend_input_shape = { 1, d.config.num_mel_bins, chunk_frames }; - - ggml_trace_log_begin("frontend_session_run", "Audio", NULL); - 
for (int chunk_idx = 0; chunk_idx < n_chunks; ++chunk_idx) { - std::fill(chunk_input.begin(), chunk_input.end(), 0.0f); - const int frame_offset = chunk_idx * chunk_frames; - const int copy_frames = std::min(chunk_frames, frames - frame_offset); - if (copy_frames > 0) { - for (int mel_idx = 0; mel_idx < mel.n_mel; ++mel_idx) { - const float * src = mel.data.data() + (size_t) mel_idx * mel.n_len + frame_offset; - float * dst = chunk_input.data() + (size_t) mel_idx * chunk_frames; - std::memcpy(dst, src, (size_t) copy_frames * sizeof(float)); + if (mel.n_len <= 0 || mel.n_mel != d.config.num_mel_bins) { + ggml_trace_log_end("encode_audio", "Audio", NULL); + ggml_profile_flush_tls(); + throw std::runtime_error("invalid mel spectrogram shape"); + } + + const int frames = mel.n_len; + const int chunk_frames = 100; + const int chunk_tokens = 13; + const int padded_frames = ((frames + chunk_frames - 1) / chunk_frames) * chunk_frames; + const int n_chunks = padded_frames / chunk_frames; + + hidden_states.resize((size_t) n_chunks * chunk_tokens * (size_t) d.config.d_model); + std::vector chunk_input((size_t) d.config.num_mel_bins * chunk_frames, 0.0f); + const std::vector frontend_input_shape = { 1, d.config.num_mel_bins, chunk_frames }; + + ggml_trace_log_begin("frontend_session_run", "Audio", NULL); + for (int chunk_idx = 0; chunk_idx < n_chunks; ++chunk_idx) { + std::fill(chunk_input.begin(), chunk_input.end(), 0.0f); + const int frame_offset = chunk_idx * chunk_frames; + const int copy_frames = std::min(chunk_frames, frames - frame_offset); + if (copy_frames > 0) { + for (int mel_idx = 0; mel_idx < mel.n_mel; ++mel_idx) { + const float * src = mel.data.data() + (size_t) mel_idx * mel.n_len + frame_offset; + float * dst = chunk_input.data() + (size_t) mel_idx * chunk_frames; + std::memcpy(dst, src, (size_t) copy_frames * sizeof(float)); + } } + + auto frontend_input = make_tensor_f32(frontend_input_shape, chunk_input); + auto frontend_outputs = 
d.frontend_session.Run(Ort::RunOptions{ nullptr }, d.frontend_input_names_raw.data(), + &frontend_input, 1, d.frontend_output_names_raw.data(), 1); + float * chunk_out = frontend_outputs[0].GetTensorMutableData(); + std::memcpy(hidden_states.data() + (size_t) chunk_idx * chunk_tokens * (size_t) d.config.d_model, chunk_out, + (size_t) chunk_tokens * (size_t) d.config.d_model * sizeof(float)); } + ggml_trace_log_end("frontend_session_run", "Audio", NULL); - auto frontend_input = make_tensor_f32(frontend_input_shape, chunk_input); - auto frontend_outputs = d.frontend_session.Run(Ort::RunOptions{ nullptr }, d.frontend_input_names_raw.data(), - &frontend_input, 1, d.frontend_output_names_raw.data(), 1); - float * chunk_out = frontend_outputs[0].GetTensorMutableData(); - std::memcpy(hidden_states.data() + (size_t) chunk_idx * chunk_tokens * (size_t) d.config.d_model, chunk_out, - (size_t) chunk_tokens * (size_t) d.config.d_model * sizeof(float)); - } - ggml_trace_log_end("frontend_session_run", "Audio", NULL); + t_out = get_feat_extract_output_lengths(frames); + if (t_out <= 0 || t_out > n_chunks * chunk_tokens) { + ggml_trace_log_end("encode_audio", "Audio", NULL); + ggml_profile_flush_tls(); + throw std::runtime_error("invalid split-encoder output length"); + } - const int t_out = get_feat_extract_output_lengths(frames); - if (t_out <= 0 || t_out > n_chunks * chunk_tokens) { - ggml_trace_log_end("encode_audio", "Audio", NULL); - ggml_profile_flush_tls(); - throw std::runtime_error("invalid split-encoder output length"); + hidden_states.resize((size_t) t_out * (size_t) d.config.d_model); } - hidden_states.resize((size_t) t_out * (size_t) d.config.d_model); - std::vector attention_mask((size_t) t_out * (size_t) t_out, 0.0f); - + // Common backend path const std::vector backend_hidden_shape = { 1, t_out, d.config.d_model }; - const std::vector backend_mask_shape = { 1, 1, t_out, t_out }; - - auto hidden_tensor = make_tensor_f32(backend_hidden_shape, hidden_states); - auto 
mask_tensor = make_tensor_f32(backend_mask_shape, attention_mask); - std::array backend_inputs = { std::move(hidden_tensor), std::move(mask_tensor) }; + auto hidden_tensor = make_tensor_f32(backend_hidden_shape, hidden_states); ggml_trace_log_begin("backend_session_run", "Audio", NULL); - auto backend_outputs = - d.backend_session.Run(Ort::RunOptions{ nullptr }, d.backend_input_names_raw.data(), backend_inputs.data(), - backend_inputs.size(), d.backend_output_names_raw.data(), 1); + std::vector backend_outputs; + if (d.backend_input_names_raw.size() == 2) { + // Backend expects hidden_states + attention_mask + std::vector attention_mask((size_t) t_out * (size_t) t_out, 0.0f); + const std::vector backend_mask_shape = { 1, 1, t_out, t_out }; + auto mask_tensor = make_tensor_f32(backend_mask_shape, attention_mask); + std::array inputs = { std::move(hidden_tensor), std::move(mask_tensor) }; + backend_outputs = d.backend_session.Run(Ort::RunOptions{ nullptr }, d.backend_input_names_raw.data(), + inputs.data(), inputs.size(), d.backend_output_names_raw.data(), 1); + } else { + // Backend only expects hidden_states (attention_mask pruned by ONNX exporter) + backend_outputs = d.backend_session.Run(Ort::RunOptions{ nullptr }, d.backend_input_names_raw.data(), + &hidden_tensor, 1, d.backend_output_names_raw.data(), 1); + } ggml_trace_log_end("backend_session_run", "Audio", NULL); float * output = backend_outputs[0].GetTensorMutableData(); diff --git a/tools/server/server-smt-vision.cpp b/tools/server/server-smt-vision.cpp index 38bbc40d0362..6dc9f290b880 100644 --- a/tools/server/server-smt-vision.cpp +++ b/tools/server/server-smt-vision.cpp @@ -614,6 +614,10 @@ static bool arch_is_qwen3asr(const std::string & arch_name) { return contains_icase(arch_name, "qwen3asr"); } +static bool arch_is_funasr(const std::string & arch_name) { + return contains_icase(arch_name, "funasr"); +} + static std::pair infer_image_grid_xy(int32_t n_tokens) { if (n_tokens <= 0) { return { 0, 0 };