From 743973d0539dc2bea49a0b37456266f765b9e03e Mon Sep 17 00:00:00 2001
From: liangjunzhao <junzhao.liang@spacemit.com>
Date: Mon, 15 Jun 2026 11:19:34 +0800
Subject: [PATCH] feat(mtmd): add FunASR fbank and LFR audio encoding path

Add Kaldi-compatible fbank extraction and LFR frame stacking helpers for
FunASR/SenseVoice-style models. Parse lfr_m/lfr_n from audio config and route
SMT audio encoding through the FunASR frontend shape when enabled.

Also support SMT audio backends with either hidden_states-only or
hidden_states-plus-attention_mask inputs, and add FunASR architecture detection
in the SMT vision server.
---
 tools/mtmd/mtmd-audio.cpp          | 175 +++++++++++++++++
 tools/mtmd/mtmd-audio.h            |  23 +++
 tools/mtmd/smt-audio-wrapper.cpp   | 297 ++++++++++++++++++++---------
 tools/server/server-smt-vision.cpp |   4 +
 4 files changed, 406 insertions(+), 93 deletions(-)
diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp
index 1446fbefcae1..22b6d187e1a5 100644
--- a/tools/mtmd/mtmd-audio.cpp
+++ b/tools/mtmd/mtmd-audio.cpp
@@ -1088,3 +1088,178 @@ std::vector<float> mtmd_audio_streaming_istft::flush() {
 
     return output;
 }
+
+//
+// FunASR Kaldi-compatible fbank + LFR
+//
+
+// In-place radix-2 FFT over n interleaved (re, im) float pairs; the butterflies assume n is a power of two.
+static void funasr_fft_inplace(float * data, int n) {
+    for (int idx = 1, rev = 0; idx < n; ++idx) {  // pass 1: bit-reversal permutation
+        int bit = n >> 1;
+        for (; rev & bit; bit >>= 1) {
+            rev ^= bit;
+        }
+        rev ^= bit;
+        if (idx < rev) {
+            std::swap(data[2 * idx], data[2 * rev]);
+            std::swap(data[2 * idx + 1], data[2 * rev + 1]);
+        }
+    }
+    for (int span = 2; span <= n; span <<= 1) {  // pass 2: butterflies, doubling the span each stage
+        const float theta = -2.0f * (float) M_PI / span;
+        const float wre = cosf(theta), wim = sinf(theta);
+        for (int base = 0; base < n; base += span) {
+            float ure = 1.0f, uim = 0.0f;  // running twiddle factor, advanced by (wre, wim)
+            for (int k = 0; k < span / 2; ++k) {
+                float * lo = data + 2 * (base + k);
+                float * hi = lo + span;  // partner element, span / 2 complex values further on
+                const float tre = hi[0] * ure - hi[1] * uim;
+                const float tim = hi[0] * uim + hi[1] * ure;
+                hi[0] = lo[0] - tre;
+                hi[1] = lo[1] - tim;
+                lo[0] += tre;
+                lo[1] += tim;
+                const float next_ure = ure * wre - uim * wim;
+                uim = ure * wim + uim * wre;
+                ure = next_ure;
+            }
+        }
+    }
+}
+
+// Builds n_mel triangular filters over the n_fft / 2 + 1 FFT bins, returned row-major as [n_mel][bins].
+// Band edges are equally spaced on the HTK mel scale between 0 Hz and sample_rate / 2.
+static std::vector<float> funasr_build_mel_filterbank_htk(int n_mel, int n_fft, int sample_rate) {
+    const int   n_fft_bins  = n_fft / 2 + 1;
+    const float fmax        = (float) sample_rate / 2.0f;
+    const float bin_hz_step = (float) sample_rate / (float) n_fft;
+
+    const auto to_mel   = [](float hz) -> float { return 1127.0f * logf(1.0f + hz / 700.0f); };
+    const auto from_mel = [](float mel) -> float { return 700.0f * (expf(mel / 1127.0f) - 1.0f); };
+
+    // n_mel + 2 band edges in Hz: each filter uses three consecutive edges (left, peak, right).
+    const float        mel_lo = to_mel(0.0f);
+    const float        mel_hi = to_mel(fmax);
+    std::vector<float> edges_hz(n_mel + 2);
+    for (int i = 0; i < n_mel + 2; i++) {
+        const float mel = mel_lo + (mel_hi - mel_lo) * (float) i / (float) (n_mel + 1);
+        edges_hz[i]     = from_mel(mel);
+    }
+
+    std::vector<float> filters((size_t) n_mel * n_fft_bins, 0.0f);
+    for (int m = 0; m < n_mel; m++) {
+        const float lo   = edges_hz[m];
+        const float peak = edges_hz[m + 1];
+        const float hi   = edges_hz[m + 2];
+        float *     row  = filters.data() + (size_t) m * n_fft_bins;
+        for (int k = 0; k < n_fft_bins; k++) {
+            const float hz = k * bin_hz_step;
+            if (peak > lo && hz >= lo && hz <= peak) {
+                row[k] = (hz - lo) / (peak - lo);  // rising edge
+            } else if (hi > peak && hz > peak && hz <= hi) {
+                row[k] = (hi - hz) / (hi - peak);  // falling edge
+            }
+        }
+    }
+    return filters;
+}
+
+// Computes log mel filterbank energies for FunASR/SenseVoice-style frontends.
+//   samples, n_samples    : input signal; sample_rate is in Hz; n_mel is the number of mel bins
+//   frame_len, frame_shift: window length and hop in samples; no padding, so a trailing partial frame is dropped
+//   preemph_coeff         : pre-emphasis y[i] = x[i] - coeff * x[i - 1], applied once to the whole signal
+//   features              : resized to n_frames * n_mel, row-major [frame][mel]; n_frames_out receives n_frames
+// Returns false and leaves both outputs untouched when the argument check below rejects the input.
+bool mtmd_audio_compute_kaldi_fbank(const float * samples,
+                                    size_t        n_samples,
+                                    int           sample_rate,
+                                    int           n_mel,
+                                    int           frame_len,
+                                    int           frame_shift,
+                                    float         preemph_coeff,
+                                    std::vector<float> & features,
+                                    int &         n_frames_out) {
+    // Reject null/empty input and sizes that would divide by zero. A signal shorter than one window is
+    // rejected too: the old signed formula truncated toward zero, reported one frame and read past the end.
+    if (samples == nullptr || n_samples == 0 || sample_rate <= 0 || n_mel <= 0 || frame_len <= 0 ||
+        frame_shift <= 0 || n_samples < (size_t) frame_len) {
+        return false;
+    }
+    const int n_frames = (int) ((n_samples - (size_t) frame_len) / (size_t) frame_shift + 1);
+
+    std::vector<float> emphasized(n_samples);
+    emphasized[0] = samples[0];  // the first sample has no predecessor and is kept as-is
+    for (size_t i = 1; i < n_samples; i++) {
+        emphasized[i] = samples[i] - preemph_coeff * samples[i - 1];
+    }
+    int n_fft = 1;  // smallest power of two >= frame_len; each frame is zero-padded up to it
+    while (n_fft < frame_len) {
+        n_fft <<= 1;
+    }
+    const int n_fft_bins = n_fft / 2 + 1;
+    std::vector<float> window(frame_len);  // Hamming; NOTE(review): Kaldi's window divides by (frame_len - 1) — confirm
+    for (int i = 0; i < frame_len; i++) {
+        window[i] = 0.54f - 0.46f * cosf(2.0f * (float) M_PI * i / frame_len);
+    }
+    const auto mel_filters = funasr_build_mel_filterbank_htk(n_mel, n_fft, sample_rate);
+    features.resize((size_t) n_frames * n_mel);
+    std::vector<float> fft_buf((size_t) n_fft * 2, 0.0f);  // interleaved (re, im)
+    std::vector<float> power(n_fft_bins);
+    const float        inv_n_fft = 1.0f / (float) n_fft;
+
+    for (int frame = 0; frame < n_frames; frame++) {
+        const size_t offset = (size_t) frame * frame_shift;
+        std::fill(fft_buf.begin(), fft_buf.end(), 0.0f);
+        for (int j = 0; j < frame_len; j++) {
+            fft_buf[2 * j] = emphasized[offset + j] * window[j];
+        }
+        funasr_fft_inplace(fft_buf.data(), n_fft);
+        // The power spectrum does not depend on the mel bin, so compute it once per frame.
+        for (int k = 0; k < n_fft_bins; k++) {
+            const float re = fft_buf[2 * k], im = fft_buf[2 * k + 1];
+            power[k]       = (re * re + im * im) * inv_n_fft;
+        }
+        for (int m = 0; m < n_mel; m++) {
+            float sum = 0.0f;
+            for (int k = 0; k < n_fft_bins; k++) {
+                sum += power[k] * mel_filters[(size_t) m * n_fft_bins + k];
+            }
+            features[(size_t) frame * n_mel + m] = logf(std::max(sum, 1e-10f));  // floor avoids log(0)
+        }
+    }
+    n_frames_out = n_frames;
+    return true;
+}
+
+// Low-frame-rate stacking: output frame i concatenates lfr_m input frames starting at
+// i * lfr_n - (lfr_m - 1) / 2, with out-of-range indices clamped to the first/last frame.
+// lfr_features is resized to n_lfr * (n_mel * lfr_m), where n_lfr = ceil(n_frames / lfr_n).
+// Returns false (outputs untouched) for non-positive sizes or when `features` holds fewer than
+// n_frames * n_mel values; those cases used to reach an invalid resize or an out-of-bounds memcpy.
+bool mtmd_audio_compute_lfr(const std::vector<float> & features,
+                            int n_frames,
+                            int n_mel,
+                            int lfr_m,
+                            int lfr_n,
+                            std::vector<float> & lfr_features,
+                            int & n_lfr_frames_out) {
+    if (n_frames <= 0 || n_mel <= 0 || lfr_m <= 0 || lfr_n <= 0 ||
+        features.size() < (size_t) n_frames * (size_t) n_mel) {
+        return false;
+    }
+    const int    n_lfr    = (n_frames + lfr_n - 1) / lfr_n;
+    const size_t feat_dim = (size_t) n_mel * (size_t) lfr_m;
+    const int    left_ctx = (lfr_m - 1) / 2;  // FunASR apply_lfr left padding; equals lfr_m / 2 for odd lfr_m
+    lfr_features.resize((size_t) n_lfr * feat_dim);
+    for (int i = 0; i < n_lfr; i++) {
+        for (int j = 0; j < lfr_m; j++) {
+            const int pos = i * lfr_n - left_ctx + j;
+            const int src = pos < 0 ? 0 : (pos >= n_frames ? n_frames - 1 : pos);  // clamp to a valid frame
+            std::memcpy(lfr_features.data() + (size_t) i * feat_dim + (size_t) j * n_mel,
+                        features.data() + (size_t) src * n_mel, (size_t) n_mel * sizeof(float));
+        }
+    }
+    n_lfr_frames_out = n_lfr;
+    return true;
+}
diff --git a/tools/mtmd/mtmd-audio.h b/tools/mtmd/mtmd-audio.h
index 1f8b0c3f3793..e116e10c1211 100644
--- a/tools/mtmd/mtmd-audio.h
+++ b/tools/mtmd/mtmd-audio.h
@@ -120,6 +120,29 @@ struct mtmd_audio_preprocessor_qwen3a : mtmd_audio_preprocessor {
     mtmd_audio_cache cache;
 };
 
+// Kaldi-compatible log-mel fbank features (used by FunASR/SenseVoice models); returns false on failure.
+// Output layout: [n_frames, n_mel] flattened in row-major order per time frame.
+bool mtmd_audio_compute_kaldi_fbank(const float * samples,        // input samples
+                                    size_t        n_samples,      // number of entries in `samples`; 0 is rejected
+                                    int           sample_rate,    // Hz; the mel filterbank spans 0 .. sample_rate / 2
+                                    int           n_mel,          // number of mel bins; must be > 0
+                                    int           frame_len,      // window length in samples; FFT size is the next power of two
+                                    int           frame_shift,    // hop between consecutive frames, in samples
+                                    float         preemph_coeff,  // pre-emphasis: x[i] - preemph_coeff * x[i - 1]
+                                    std::vector<float> & features,  // out: resized to n_frames * n_mel log energies
+                                    int &         n_frames_out);    // out: number of frames written
+
+// Low Frame Rate (LFR) frame stacking for FunASR models; returns false when n_frames <= 0 or lfr_n <= 0.
+// Stacks lfr_m consecutive frames with stride lfr_n, using centered window
+// and boundary clamping.  Output layout: [n_lfr_frames, n_mel * lfr_m].
+bool mtmd_audio_compute_lfr(const std::vector<float> & features,  // in: [n_frames, n_mel], row-major
+                            int n_frames,                          // number of input frames
+                            int n_mel,                             // values per input frame
+                            int lfr_m,                             // input frames stacked into each output frame
+                            int lfr_n,                             // input frames advanced per output frame
+                            std::vector<float> & lfr_features,     // out: resized to n_lfr_frames * n_mel * lfr_m
+                            int & n_lfr_frames_out);               // out: ceil(n_frames / lfr_n)
+
 //
 // streaming ISTFT - converts spectrogram frames back to audio one frame at a time
 //
diff --git a/tools/mtmd/smt-audio-wrapper.cpp b/tools/mtmd/smt-audio-wrapper.cpp
index 4a6e83320755..e9ea0e96dc44 100644
--- a/tools/mtmd/smt-audio-wrapper.cpp
+++ b/tools/mtmd/smt-audio-wrapper.cpp
@@ -13,6 +13,7 @@
 #include <algorithm>
 #include <array>
 #include <cctype>
+#include <cmath>
 #include <cstdio>
 #include <cstring>
 #include <fstream>
@@ -50,6 +51,8 @@ struct smt_audio_config {
     int32_t                                      hop_len          = 160;
     int32_t                                      intra_thread_num = 4;
     int32_t                                      inter_thread_num = 1;
+    int32_t                                      lfr_m            = 0;
+    int32_t                                      lfr_n            = 0;
 };
 
 static std::string read_file_to_string(const std::string & path) {
@@ -343,6 +346,8 @@ static bool parse_audio_config_block(const std::string & config_dir,
     config.n_fft        = (int32_t) extract_int64_value(audio_block, "n_fft", config.n_fft);
     config.window_len   = (int32_t) extract_int64_value(audio_block, "window_len", config.window_len);
     config.hop_len      = (int32_t) extract_int64_value(audio_block, "hop_len", config.hop_len);
+    config.lfr_m        = (int32_t) extract_int64_value(audio_block, "lfr_m", config.lfr_m);
+    config.lfr_n        = (int32_t) extract_int64_value(audio_block, "lfr_n", config.lfr_n);
     config.ep_config    = extract_string_map(audio_block, "ep_config");
     apply_legacy_spacemit_ep_config(audio_block, config);
     config.architectures = extract_string_array(content, "architectures");
@@ -654,7 +659,8 @@ std::unique_ptr<smt_audio_context> smt_audio_context::create(const std::string &
     d.backend_output_names_raw = make_name_ptrs(d.backend_output_names);
 
     if (d.frontend_input_names_raw.size() != 1 || d.frontend_output_names_raw.size() != 1 ||
-        d.backend_input_names_raw.size() != 2 || d.backend_output_names_raw.size() != 1) {
+        (d.backend_input_names_raw.size() != 1 && d.backend_input_names_raw.size() != 2) ||
+        d.backend_output_names_raw.size() != 1) {
         throw std::runtime_error("Unexpected SMT audio ONNX IO signature");
     }
 
@@ -665,45 +671,77 @@ std::unique_ptr<smt_audio_context> smt_audio_context::create(const std::string &
         }
         std::cerr << "\n";
 
-        const int chunk_frames = 100;
-        const int chunk_tokens = 13;
-        const int t_out        = chunk_tokens;
+        int                warmup_t_out;
+        std::vector<float> warmup_hidden;
 
-        std::vector<float>         frontend_input_data((size_t) d.config.num_mel_bins * chunk_frames, 0.0f);
-        const std::vector<int64_t> frontend_input_shape = { 1, d.config.num_mel_bins, chunk_frames };
-        auto                       frontend_input       = make_tensor_f32(frontend_input_shape, frontend_input_data);
+        if (d.config.lfr_m > 0) {
+            const int warmup_frames = 10;
+            const int feat_dim      = d.config.num_mel_bins * d.config.lfr_m;
 
-        std::cerr << "[SMT][audio] warmup frontend ONNX session: " << d.config.frontend_model_path << "\n";
-        auto frontend_outputs = d.frontend_session.Run(Ort::RunOptions{ nullptr }, d.frontend_input_names_raw.data(),
-                                                       &frontend_input, 1, d.frontend_output_names_raw.data(), 1);
+            std::vector<float>         frontend_input_data((size_t) warmup_frames * feat_dim, 0.0f);
+            const std::vector<int64_t> frontend_input_shape = { 1, warmup_frames, (int64_t) feat_dim };
+            auto                       frontend_input       = make_tensor_f32(frontend_input_shape, frontend_input_data);
 
-        std::vector<float> hidden_states((size_t) t_out * (size_t) d.config.d_model, 0.0f);
-        if (frontend_outputs.empty()) {
-            throw std::runtime_error("SMT audio warmup frontend returned no outputs");
-        }
-        const auto frontend_output_info = frontend_outputs[0].GetTensorTypeAndShapeInfo();
-        if (frontend_output_info.GetElementType() != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) {
-            throw std::runtime_error("SMT audio warmup frontend output must be float32");
-        }
-        const int64_t frontend_output_elems = frontend_output_info.GetElementCount();
-        if (frontend_output_elems < 0 || (size_t) frontend_output_elems < hidden_states.size()) {
-            throw std::runtime_error("SMT audio warmup frontend output is smaller than expected");
+            std::cerr << "[SMT][audio] warmup frontend ONNX session (FunASR): " << d.config.frontend_model_path << "\n";
+            auto frontend_outputs = d.frontend_session.Run(Ort::RunOptions{ nullptr }, d.frontend_input_names_raw.data(),
+                                                           &frontend_input, 1, d.frontend_output_names_raw.data(), 1);
+            if (frontend_outputs.empty()) {
+                throw std::runtime_error("SMT audio warmup frontend returned no outputs");
+            }
+            const auto frontend_output_info = frontend_outputs[0].GetTensorTypeAndShapeInfo();
+            if (frontend_output_info.GetElementType() != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) {
+                throw std::runtime_error("SMT audio warmup frontend output must be float32");
+            }
+            auto shape    = frontend_output_info.GetShape();
+            warmup_t_out  = (int) shape[1];
+            warmup_hidden.resize((size_t) warmup_t_out * (size_t) d.config.d_model, 0.0f);
+            const float * frontend_output = frontend_outputs[0].GetTensorData<float>();
+            std::memcpy(warmup_hidden.data(), frontend_output, warmup_hidden.size() * sizeof(float));
+        } else {
+            const int chunk_frames = 100;
+            const int chunk_tokens = 13;
+            warmup_t_out           = chunk_tokens;
+
+            std::vector<float>         frontend_input_data((size_t) d.config.num_mel_bins * chunk_frames, 0.0f);
+            const std::vector<int64_t> frontend_input_shape = { 1, d.config.num_mel_bins, chunk_frames };
+            auto                       frontend_input       = make_tensor_f32(frontend_input_shape, frontend_input_data);
+
+            std::cerr << "[SMT][audio] warmup frontend ONNX session: " << d.config.frontend_model_path << "\n";
+            auto frontend_outputs = d.frontend_session.Run(Ort::RunOptions{ nullptr }, d.frontend_input_names_raw.data(),
+                                                           &frontend_input, 1, d.frontend_output_names_raw.data(), 1);
+
+            warmup_hidden.resize((size_t) warmup_t_out * (size_t) d.config.d_model, 0.0f);
+            if (frontend_outputs.empty()) {
+                throw std::runtime_error("SMT audio warmup frontend returned no outputs");
+            }
+            const auto frontend_output_info = frontend_outputs[0].GetTensorTypeAndShapeInfo();
+            if (frontend_output_info.GetElementType() != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) {
+                throw std::runtime_error("SMT audio warmup frontend output must be float32");
+            }
+            const int64_t frontend_output_elems = frontend_output_info.GetElementCount();
+            if (frontend_output_elems < 0 || (size_t) frontend_output_elems < warmup_hidden.size()) {
+                throw std::runtime_error("SMT audio warmup frontend output is smaller than expected");
+            }
+            const float * frontend_output = frontend_outputs[0].GetTensorData<float>();
+            std::memcpy(warmup_hidden.data(), frontend_output, warmup_hidden.size() * sizeof(float));
         }
-        const float * frontend_output = frontend_outputs[0].GetTensorData<float>();
-        std::memcpy(hidden_states.data(), frontend_output, hidden_states.size() * sizeof(float));
-
-        std::vector<float>         attention_mask((size_t) t_out * (size_t) t_out, 0.0f);
-        const std::vector<int64_t> backend_hidden_shape = { 1, t_out, d.config.d_model };
-        const std::vector<int64_t> backend_mask_shape   = { 1, 1, t_out, t_out };
 
-        auto                      hidden_tensor  = make_tensor_f32(backend_hidden_shape, hidden_states);
-        auto                      mask_tensor    = make_tensor_f32(backend_mask_shape, attention_mask);
-        std::array<Ort::Value, 2> backend_inputs = { std::move(hidden_tensor), std::move(mask_tensor) };
+        const std::vector<int64_t> backend_hidden_shape = { 1, warmup_t_out, d.config.d_model };
+        auto                      hidden_tensor        = make_tensor_f32(backend_hidden_shape, warmup_hidden);
 
         std::cerr << "[SMT][audio] warmup backend ONNX session: " << d.config.backend_model_path << "\n";
-        (void) d.backend_session.Run(Ort::RunOptions{ nullptr }, d.backend_input_names_raw.data(),
-                                     backend_inputs.data(), backend_inputs.size(), d.backend_output_names_raw.data(),
-                                     1);
+        if (d.backend_input_names_raw.size() == 2) {
+            std::vector<float>         attention_mask((size_t) warmup_t_out * (size_t) warmup_t_out, 0.0f);
+            const std::vector<int64_t> backend_mask_shape = { 1, 1, warmup_t_out, warmup_t_out };
+            auto                       mask_tensor        = make_tensor_f32(backend_mask_shape, attention_mask);
+            std::array<Ort::Value, 2>  backend_inputs     = { std::move(hidden_tensor), std::move(mask_tensor) };
+            (void) d.backend_session.Run(Ort::RunOptions{ nullptr }, d.backend_input_names_raw.data(),
+                                         backend_inputs.data(), backend_inputs.size(),
+                                         d.backend_output_names_raw.data(), 1);
+        } else {
+            (void) d.backend_session.Run(Ort::RunOptions{ nullptr }, d.backend_input_names_raw.data(),
+                                         &hidden_tensor, 1, d.backend_output_names_raw.data(), 1);
+        }
     }
 
     return ctx;
@@ -724,77 +762,150 @@ std::vector<float> smt_audio_context::encode_audio(const std::string & audio_pat
     }
     ggml_trace_log_end("decode_audio_file", "Audio", NULL);
 
-    mtmd_audio_mel mel;
-    ggml_trace_log_begin("compute_log_mel_spectrogram", "Audio", NULL);
-    if (!mtmd_audio_compute_log_mel_spectrogram(samples.data(), samples.size(), 4, d.config.num_mel_bins,
-                                                d.config.n_fft, d.config.window_len, d.config.hop_len,
-                                                d.config.sample_rate, true, 0.0f, false, false, mel)) {
+    int                t_out;
+    std::vector<float> hidden_states;
+
+    if (d.config.lfr_m > 0) {
+        // FunASR path: kaldi fbank -> LFR -> frontend -> backend
+        std::vector<float> fbank_features;
+        int                n_fbank_frames = 0;
+        ggml_trace_log_begin("compute_kaldi_fbank", "Audio", NULL);
+        if (!mtmd_audio_compute_kaldi_fbank(samples.data(), samples.size(), d.config.sample_rate, d.config.num_mel_bins,
+                                 d.config.window_len, d.config.hop_len, 0.97f, fbank_features, n_fbank_frames)) {
+            ggml_trace_log_end("compute_kaldi_fbank", "Audio", NULL);
+            ggml_trace_log_end("encode_audio", "Audio", NULL);
+            ggml_profile_flush_tls();
+            throw std::runtime_error("failed to compute kaldi fbank features");
+        }
+        ggml_trace_log_end("compute_kaldi_fbank", "Audio", NULL);
+
+        std::vector<float> lfr_features;
+        int                n_lfr_frames = 0;
+        if (!mtmd_audio_compute_lfr(fbank_features, n_fbank_frames, d.config.num_mel_bins, d.config.lfr_m, d.config.lfr_n,
+                         lfr_features, n_lfr_frames)) {
+            ggml_trace_log_end("encode_audio", "Audio", NULL);
+            ggml_profile_flush_tls();
+            throw std::runtime_error("failed to compute LFR features");
+        }
+
+        // Per-frame mean subtraction for ONNX numerical stability.
+        // LayerNorm is shift-invariant: LN(x + c) = LN(x), so subtracting the
+        // per-frame mean does not change the model output, but prevents catastrophic
+        // cancellation in the ONNX decomposed variance computation (E[x²] - E[x]²)
+        // when input values have large magnitude but small variance.
+        const int feat_dim = d.config.num_mel_bins * d.config.lfr_m;
+        for (int i = 0; i < n_lfr_frames; i++) {
+            float * frame = lfr_features.data() + (size_t) i * feat_dim;
+            float   sum   = 0.0f;
+            for (int j = 0; j < feat_dim; j++) {
+                sum += frame[j];
+            }
+            float mean = sum / (float) feat_dim;
+            for (int j = 0; j < feat_dim; j++) {
+                frame[j] -= mean;
+            }
+        }
+        const std::vector<int64_t> frontend_input_shape = { 1, (int64_t) n_lfr_frames, (int64_t) feat_dim };
+        auto                       frontend_input       = make_tensor_f32(frontend_input_shape, lfr_features);
+
+        ggml_trace_log_begin("frontend_session_run", "Audio", NULL);
+        auto frontend_outputs = d.frontend_session.Run(Ort::RunOptions{ nullptr }, d.frontend_input_names_raw.data(),
+                                                       &frontend_input, 1, d.frontend_output_names_raw.data(), 1);
+        ggml_trace_log_end("frontend_session_run", "Audio", NULL);
+
+        if (frontend_outputs.empty()) {
+            ggml_trace_log_end("encode_audio", "Audio", NULL);
+            ggml_profile_flush_tls();
+            throw std::runtime_error("FunASR frontend returned no outputs");
+        }
+
+        const auto frontend_shape = frontend_outputs[0].GetTensorTypeAndShapeInfo().GetShape();
+        t_out                     = (int) frontend_shape[1];
+        hidden_states.resize((size_t) t_out * (size_t) d.config.d_model);
+        std::memcpy(hidden_states.data(), frontend_outputs[0].GetTensorData<float>(),
+                    hidden_states.size() * sizeof(float));
+
+    } else {
+        // Qwen3ASR path: mel spectrogram -> chunk -> frontend -> backend
+        mtmd_audio_mel mel;
+        ggml_trace_log_begin("compute_log_mel_spectrogram", "Audio", NULL);
+        if (!mtmd_audio_compute_log_mel_spectrogram(samples.data(), samples.size(), 4, d.config.num_mel_bins,
+                                                    d.config.n_fft, d.config.window_len, d.config.hop_len,
+                                                    d.config.sample_rate, true, 0.0f, false, false, mel)) {
+            ggml_trace_log_end("compute_log_mel_spectrogram", "Audio", NULL);
+            ggml_trace_log_end("encode_audio", "Audio", NULL);
+            ggml_profile_flush_tls();
+            throw std::runtime_error("failed to compute Qwen3-ASR mel spectrogram");
+        }
         ggml_trace_log_end("compute_log_mel_spectrogram", "Audio", NULL);
-        ggml_trace_log_end("encode_audio", "Audio", NULL);
-        ggml_profile_flush_tls();
-        throw std::runtime_error("failed to compute Qwen3-ASR mel spectrogram");
-    }
-    ggml_trace_log_end("compute_log_mel_spectrogram", "Audio", NULL);
 
-    if (mel.n_len <= 0 || mel.n_mel != d.config.num_mel_bins) {
-        ggml_trace_log_end("encode_audio", "Audio", NULL);
-        ggml_profile_flush_tls();
-        throw std::runtime_error("invalid mel spectrogram shape");
-    }
-
-    const int frames        = mel.n_len;
-    const int chunk_frames  = 100;
-    const int chunk_tokens  = 13;
-    const int padded_frames = ((frames + chunk_frames - 1) / chunk_frames) * chunk_frames;
-    const int n_chunks      = padded_frames / chunk_frames;
-
-    std::vector<float>         hidden_states((size_t) n_chunks * chunk_tokens * (size_t) d.config.d_model);
-    std::vector<float>         chunk_input((size_t) d.config.num_mel_bins * chunk_frames, 0.0f);
-    const std::vector<int64_t> frontend_input_shape = { 1, d.config.num_mel_bins, chunk_frames };
-
-    ggml_trace_log_begin("frontend_session_run", "Audio", NULL);
-    for (int chunk_idx = 0; chunk_idx < n_chunks; ++chunk_idx) {
-        std::fill(chunk_input.begin(), chunk_input.end(), 0.0f);
-        const int frame_offset = chunk_idx * chunk_frames;
-        const int copy_frames  = std::min(chunk_frames, frames - frame_offset);
-        if (copy_frames > 0) {
-            for (int mel_idx = 0; mel_idx < mel.n_mel; ++mel_idx) {
-                const float * src = mel.data.data() + (size_t) mel_idx * mel.n_len + frame_offset;
-                float *       dst = chunk_input.data() + (size_t) mel_idx * chunk_frames;
-                std::memcpy(dst, src, (size_t) copy_frames * sizeof(float));
+        if (mel.n_len <= 0 || mel.n_mel != d.config.num_mel_bins) {
+            ggml_trace_log_end("encode_audio", "Audio", NULL);
+            ggml_profile_flush_tls();
+            throw std::runtime_error("invalid mel spectrogram shape");
+        }
+
+        const int frames        = mel.n_len;
+        const int chunk_frames  = 100;
+        const int chunk_tokens  = 13;
+        const int padded_frames = ((frames + chunk_frames - 1) / chunk_frames) * chunk_frames;
+        const int n_chunks      = padded_frames / chunk_frames;
+
+        hidden_states.resize((size_t) n_chunks * chunk_tokens * (size_t) d.config.d_model);
+        std::vector<float>         chunk_input((size_t) d.config.num_mel_bins * chunk_frames, 0.0f);
+        const std::vector<int64_t> frontend_input_shape = { 1, d.config.num_mel_bins, chunk_frames };
+
+        ggml_trace_log_begin("frontend_session_run", "Audio", NULL);
+        for (int chunk_idx = 0; chunk_idx < n_chunks; ++chunk_idx) {
+            std::fill(chunk_input.begin(), chunk_input.end(), 0.0f);
+            const int frame_offset = chunk_idx * chunk_frames;
+            const int copy_frames  = std::min(chunk_frames, frames - frame_offset);
+            if (copy_frames > 0) {
+                for (int mel_idx = 0; mel_idx < mel.n_mel; ++mel_idx) {
+                    const float * src = mel.data.data() + (size_t) mel_idx * mel.n_len + frame_offset;
+                    float *       dst = chunk_input.data() + (size_t) mel_idx * chunk_frames;
+                    std::memcpy(dst, src, (size_t) copy_frames * sizeof(float));
+                }
             }
+
+            auto    frontend_input   = make_tensor_f32(frontend_input_shape, chunk_input);
+            auto    frontend_outputs = d.frontend_session.Run(Ort::RunOptions{ nullptr }, d.frontend_input_names_raw.data(),
+                                                              &frontend_input, 1, d.frontend_output_names_raw.data(), 1);
+            float * chunk_out        = frontend_outputs[0].GetTensorMutableData<float>();
+            std::memcpy(hidden_states.data() + (size_t) chunk_idx * chunk_tokens * (size_t) d.config.d_model, chunk_out,
+                        (size_t) chunk_tokens * (size_t) d.config.d_model * sizeof(float));
         }
+        ggml_trace_log_end("frontend_session_run", "Audio", NULL);
 
-        auto    frontend_input   = make_tensor_f32(frontend_input_shape, chunk_input);
-        auto    frontend_outputs = d.frontend_session.Run(Ort::RunOptions{ nullptr }, d.frontend_input_names_raw.data(),
-                                                          &frontend_input, 1, d.frontend_output_names_raw.data(), 1);
-        float * chunk_out        = frontend_outputs[0].GetTensorMutableData<float>();
-        std::memcpy(hidden_states.data() + (size_t) chunk_idx * chunk_tokens * (size_t) d.config.d_model, chunk_out,
-                    (size_t) chunk_tokens * (size_t) d.config.d_model * sizeof(float));
-    }
-    ggml_trace_log_end("frontend_session_run", "Audio", NULL);
+        t_out = get_feat_extract_output_lengths(frames);
+        if (t_out <= 0 || t_out > n_chunks * chunk_tokens) {
+            ggml_trace_log_end("encode_audio", "Audio", NULL);
+            ggml_profile_flush_tls();
+            throw std::runtime_error("invalid split-encoder output length");
+        }
 
-    const int t_out = get_feat_extract_output_lengths(frames);
-    if (t_out <= 0 || t_out > n_chunks * chunk_tokens) {
-        ggml_trace_log_end("encode_audio", "Audio", NULL);
-        ggml_profile_flush_tls();
-        throw std::runtime_error("invalid split-encoder output length");
+        hidden_states.resize((size_t) t_out * (size_t) d.config.d_model);
     }
 
-    hidden_states.resize((size_t) t_out * (size_t) d.config.d_model);
-    std::vector<float> attention_mask((size_t) t_out * (size_t) t_out, 0.0f);
-
+    // Common backend path
     const std::vector<int64_t> backend_hidden_shape = { 1, t_out, d.config.d_model };
-    const std::vector<int64_t> backend_mask_shape   = { 1, 1, t_out, t_out };
-
-    auto                      hidden_tensor  = make_tensor_f32(backend_hidden_shape, hidden_states);
-    auto                      mask_tensor    = make_tensor_f32(backend_mask_shape, attention_mask);
-    std::array<Ort::Value, 2> backend_inputs = { std::move(hidden_tensor), std::move(mask_tensor) };
+    auto                       hidden_tensor        = make_tensor_f32(backend_hidden_shape, hidden_states);
 
     ggml_trace_log_begin("backend_session_run", "Audio", NULL);
-    auto backend_outputs =
-        d.backend_session.Run(Ort::RunOptions{ nullptr }, d.backend_input_names_raw.data(), backend_inputs.data(),
-                              backend_inputs.size(), d.backend_output_names_raw.data(), 1);
+    std::vector<Ort::Value> backend_outputs;
+    if (d.backend_input_names_raw.size() == 2) {
+        // Backend expects hidden_states + attention_mask
+        std::vector<float>         attention_mask((size_t) t_out * (size_t) t_out, 0.0f);
+        const std::vector<int64_t> backend_mask_shape = { 1, 1, t_out, t_out };
+        auto                       mask_tensor        = make_tensor_f32(backend_mask_shape, attention_mask);
+        std::array<Ort::Value, 2>  inputs             = { std::move(hidden_tensor), std::move(mask_tensor) };
+        backend_outputs = d.backend_session.Run(Ort::RunOptions{ nullptr }, d.backend_input_names_raw.data(),
+                                                inputs.data(), inputs.size(), d.backend_output_names_raw.data(), 1);
+    } else {
+        // Backend only expects hidden_states (attention_mask pruned by ONNX exporter)
+        backend_outputs = d.backend_session.Run(Ort::RunOptions{ nullptr }, d.backend_input_names_raw.data(),
+                                                &hidden_tensor, 1, d.backend_output_names_raw.data(), 1);
+    }
     ggml_trace_log_end("backend_session_run", "Audio", NULL);
 
     float *            output = backend_outputs[0].GetTensorMutableData<float>();
diff --git a/tools/server/server-smt-vision.cpp b/tools/server/server-smt-vision.cpp
index 38bbc40d0362..6dc9f290b880 100644
--- a/tools/server/server-smt-vision.cpp
+++ b/tools/server/server-smt-vision.cpp
@@ -614,6 +614,10 @@ static bool arch_is_qwen3asr(const std::string & arch_name) {
     return contains_icase(arch_name, "qwen3asr");
 }
 
+static bool arch_is_funasr(const std::string & arch_name) {  // FunASR counterpart of arch_is_qwen3asr above
+    return contains_icase(arch_name, "funasr");  // presumably a case-insensitive substring test — confirm against contains_icase
+}  // NOTE(review): this patch adds no caller; confirm one exists or expect an unused-function warning
+
 static std::pair<int32_t, int32_t> infer_image_grid_xy(int32_t n_tokens) {
     if (n_tokens <= 0) {
         return { 0, 0 };