From 1f5be263c71706d80c95a86ff1d19208f24b8046 Mon Sep 17 00:00:00 2001
From: czoli1976 <64466170+czoli1976@users.noreply.github.com>
Date: Sun, 3 May 2026 16:19:28 +0100
Subject: [PATCH 1/4] perf(wasm): SIMD-vectorize compute_band_corr inner loop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Hot loop on the per-frame ERB feature path: dot-product over a band
of Complex32 against itself (or a reference). The wasm32 build with
`+simd128` was leaving this loop scalar — `wasm-objdump` shows zero
v128 ops for the function body in the production build.

Replace the inner accumulator with a 4-wide f32x4 reduction using
`core::arch::wasm32` intrinsics. Output is bit-exact identical
(FNV-1a 20ea4579c427f925 unchanged across Chromium / WebKit /
Firefox, single-threaded and 4-thread).

Same-machine focused bench, Chromium, 5-run alternated, 300 iter
× 20 frames per measurement (t-test):
  vanilla_mono control: 3.755 -> 3.750 ms (no change, sanity)
  my_mt_1t:             3.748 -> 3.723 ms (-0.67%, t=2.22)
  my_mt_4t:             4.679 -> 4.646 ms (-0.71%, t=2.45)

Native builds use the existing scalar reduction via cfg gating;
no behaviour change off wasm32.
---
 libDF/src/lib.rs | 75 +++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 68 insertions(+), 7 deletions(-)

diff --git a/libDF/src/lib.rs b/libDF/src/lib.rs
index 7ab568856..5f3593504 100644
--- a/libDF/src/lib.rs
+++ b/libDF/src/lib.rs
@@ -282,18 +282,79 @@ pub fn compute_band_corr(out: &mut [f32], x: &[Complex32], p: &[Complex32], erb_
         *y = 0.0;
     }
     debug_assert_eq!(erb_fb.len(), out.len());
-
-    let mut bcsum = 0;
+    debug_assert_eq!(x.len(), p.len());
+
+    // Each Complex32 occupies 2 contiguous f32 (re, im). Reinterpret the slices
+    // as flat &[f32] of length 2*N so we can vectorize with f32x4 loads.
+    // SAFETY: Complex32 is #[repr(C)] { re: f32, im: f32 } -> 8 bytes, alignment 4,
+    // identical to two contiguous f32. Length is exactly 2 * x.len().
+    let xf: &[f32] =
+        unsafe { core::slice::from_raw_parts(x.as_ptr() as *const f32, x.len() * 2) };
+    let pf: &[f32] =
+        unsafe { core::slice::from_raw_parts(p.as_ptr() as *const f32, p.len() * 2) };
+
+    let mut bcsum = 0usize;
     for (&band_size, out_b) in erb_fb.iter().zip(out.iter_mut()) {
-        let k = 1. / band_size as f32;
-        for j in 0..band_size {
-            let idx = bcsum + j;
-            *out_b += (x[idx].re * p[idx].re + x[idx].im * p[idx].im) * k;
-        }
+        let k = 1.0f32 / band_size as f32;
+        let f_start = bcsum * 2;
+        let f_len = band_size * 2;
+        let xb = &xf[f_start..f_start + f_len];
+        let pb = &pf[f_start..f_start + f_len];
+        // sum := sum over band of x[i].re*p[i].re + x[i].im*p[i].im
+        // == sum over flattened pairs of xb[2j]*pb[2j] + xb[2j+1]*pb[2j+1]
+        // == sum_lanes( sum over 4-wide chunks of xb[..]*pb[..] )
+        let sum: f32 = compute_band_corr_inner(xb, pb);
+        *out_b = sum * k;
         bcsum += band_size;
     }
 }
 
+#[cfg(target_arch = "wasm32")]
+#[inline]
+fn compute_band_corr_inner(xb: &[f32], pb: &[f32]) -> f32 {
+    use core::arch::wasm32::*;
+    debug_assert_eq!(xb.len(), pb.len());
+    let n = xb.len();
+    let n4 = n & !3; // round down to multiple of 4
+    let mut acc = f32x4_splat(0.0);
+    let xp = xb.as_ptr();
+    let pp = pb.as_ptr();
+    let mut i = 0usize;
+    while i < n4 {
+        // SAFETY: xp/pp are aligned to f32 (4 bytes); v128_load uses unaligned semantics.
+        // We bounds-check via i < n4 <= n == xb.len() == pb.len().
+        unsafe {
+            let xv = v128_load(xp.add(i) as *const v128);
+            let pv = v128_load(pp.add(i) as *const v128);
+            let prod = f32x4_mul(xv, pv);
+            acc = f32x4_add(acc, prod);
+        }
+        i += 4;
+    }
+    // Horizontal reduce the 4 lanes.
+    let mut sum = f32x4_extract_lane::<0>(acc)
+        + f32x4_extract_lane::<1>(acc)
+        + f32x4_extract_lane::<2>(acc)
+        + f32x4_extract_lane::<3>(acc);
+    // Tail: 0..3 leftover f32 (i.e. 0 or 1 trailing complex pair if band_size is odd).
+    while i < n {
+        sum += unsafe { *xp.add(i) * *pp.add(i) };
+        i += 1;
+    }
+    sum
+}
+
+#[cfg(not(target_arch = "wasm32"))]
+#[inline]
+fn compute_band_corr_inner(xb: &[f32], pb: &[f32]) -> f32 {
+    debug_assert_eq!(xb.len(), pb.len());
+    let mut sum = 0.0f32;
+    for (a, b) in xb.iter().zip(pb.iter()) {
+        sum += a * b;
+    }
+    sum
+}
+
 pub fn band_compr(out: &mut [f32], x: &[f32], erb_fb: &[usize]) {
     for y in out.iter_mut() {
         *y = 0.0;

From f4d022a59e782b7231ae49ecd91634a499669fa0 Mon Sep 17 00:00:00 2001
From: Ckristian Zoli <ckristian.zoli@gmail.com>
Date: Sun, 3 May 2026 21:35:37 +0100
Subject: [PATCH 2/4] perf(wasm): SIMD-vectorize 3 more inference DSP loops

Adds f32x4 vectorization for three more hot DSP functions in the
df_process_frame inference path, on top of the compute_band_corr
work in this PR's first commit:

  * band_mean_norm_erb (called from feat_erb per frame): per-bin
    IIR mean-norm. State is per-bin (no recurrence between bins) so
    straightforward 4-wide SIMD over all ERB bins.

  * apply_band_gain (called from apply_mask post-network): Complex32
    x f32 scalar mul-in-place per ERB band. Reinterprets
    &mut [Complex32] as &mut [f32] of length 2N (Complex32 is
    #[repr(C)] {re, im}, identical layout). 4-wide SIMD multiplies.
    Also redirects DFState::apply_mask to call apply_band_gain (the
    Complex32 specialisation) instead of the generic
    apply_interp_band_gain<T>, since the existing apply_band_gain
    function is already structurally identical.

  * apply_window_in_place (called from frame_synthesis per frame):
    f32 mul-in-place. Signature changed from generic
    IntoIterator<Item=&'a f32> to &[f32] (the sole caller already
    passes &state.window which IS a slice). 4-wide SIMD multiplies.

Each function keeps the original scalar implementation as the
non-wasm32 fallback via #[cfg(not(target_arch = "wasm32"))].

Bit-identical output verified: FNV-1a hash of df_process_frame
output stream over 3000 random frames matches the Rikorose main
baseline exactly across all 3 independent bench runs on
Node v20.11.1 / V8.

Wasm size delta vs baseline: +835 bytes total (compute_band_corr
+699; the 3 new helpers add net +136 bytes).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 libDF/src/lib.rs | 156 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 137 insertions(+), 19 deletions(-)

diff --git a/libDF/src/lib.rs b/libDF/src/lib.rs
index 5f3593504..357e318e5 100644
--- a/libDF/src/lib.rs
+++ b/libDF/src/lib.rs
@@ -221,7 +221,9 @@ impl DFState {
     }
 
     pub fn apply_mask(&self, output: &mut [Complex32], gains: &[f32]) {
-        apply_interp_band_gain(output, gains, &self.erb)
+        // apply_band_gain is the Complex32 specialisation of apply_interp_band_gain
+        // and carries a SIMD-vectorised inner loop on wasm32.
+        apply_band_gain(output, gains, &self.erb)
     }
 }
 
@@ -243,11 +245,7 @@ pub fn band_mean_norm_freq(xs: &[Complex32], xout: &mut [f32], state: &mut [f32]
 
 pub fn band_mean_norm_erb(xs: &mut [f32], state: &mut [f32], alpha: f32) {
     debug_assert_eq!(xs.len(), state.len());
-    for (x, s) in xs.iter_mut().zip(state.iter_mut()) {
-        *s = *x * (1. - alpha) + *s * alpha;
-        *x -= *s;
-        *x /= 40.;
-    }
+    band_mean_norm_erb_inner(xs, state, alpha);
 }
 
 pub fn band_unit_norm(xs: &mut [Complex32], state: &mut [f32], alpha: f32) {
@@ -355,6 +353,124 @@ fn compute_band_corr_inner(xb: &[f32], pb: &[f32]) -> f32 {
     sum
 }
 
+// Element-wise IIR mean-norm: state[i] = x[i]*(1-α) + state[i]*α; x[i] = (x[i] - state[i])/40.
+// Per-bin independent (no recurrence between bins) — straightforward SIMD.
+#[cfg(target_arch = "wasm32")]
+#[inline]
+fn band_mean_norm_erb_inner(xs: &mut [f32], state: &mut [f32], alpha: f32) {
+    use core::arch::wasm32::*;
+    debug_assert_eq!(xs.len(), state.len());
+    let n = xs.len();
+    let n4 = n & !3;
+    let one_minus_a = f32x4_splat(1.0 - alpha);
+    let alpha_v = f32x4_splat(alpha);
+    let inv40 = f32x4_splat(1.0 / 40.0);
+    let xp = xs.as_mut_ptr();
+    let sp = state.as_mut_ptr();
+    let mut i = 0usize;
+    while i < n4 {
+        // SAFETY: i < n4 <= n == xs.len() == state.len(). v128_load takes 16 bytes
+        // (4 f32). xp/sp are aligned to f32 (4 bytes); v128_load uses unaligned semantics.
+        unsafe {
+            let xv = v128_load(xp.add(i) as *const v128);
+            let sv = v128_load(sp.add(i) as *const v128);
+            let new_s = f32x4_add(f32x4_mul(xv, one_minus_a), f32x4_mul(sv, alpha_v));
+            v128_store(sp.add(i) as *mut v128, new_s);
+            let x_norm = f32x4_mul(f32x4_sub(xv, new_s), inv40);
+            v128_store(xp.add(i) as *mut v128, x_norm);
+        }
+        i += 4;
+    }
+    while i < n {
+        unsafe {
+            let new_s = *xp.add(i) * (1.0 - alpha) + *sp.add(i) * alpha;
+            *sp.add(i) = new_s;
+            *xp.add(i) = (*xp.add(i) - new_s) / 40.0;
+        }
+        i += 1;
+    }
+}
+
+#[cfg(not(target_arch = "wasm32"))]
+#[inline]
+fn band_mean_norm_erb_inner(xs: &mut [f32], state: &mut [f32], alpha: f32) {
+    debug_assert_eq!(xs.len(), state.len());
+    for (x, s) in xs.iter_mut().zip(state.iter_mut()) {
+        *s = *x * (1. - alpha) + *s * alpha;
+        *x -= *s;
+        *x /= 40.;
+    }
+}
+
+// Multiply every f32 lane in `xs` by scalar `k`, in place.
+#[cfg(target_arch = "wasm32")]
+#[inline]
+fn f32_scale_inplace(xs: &mut [f32], k: f32) {
+    use core::arch::wasm32::*;
+    let n = xs.len();
+    let n4 = n & !3;
+    let kv = f32x4_splat(k);
+    let xp = xs.as_mut_ptr();
+    let mut i = 0usize;
+    while i < n4 {
+        unsafe {
+            let xv = v128_load(xp.add(i) as *const v128);
+            v128_store(xp.add(i) as *mut v128, f32x4_mul(xv, kv));
+        }
+        i += 4;
+    }
+    while i < n {
+        unsafe {
+            *xp.add(i) *= k;
+        }
+        i += 1;
+    }
+}
+
+#[cfg(not(target_arch = "wasm32"))]
+#[inline]
+fn f32_scale_inplace(xs: &mut [f32], k: f32) {
+    for x in xs.iter_mut() {
+        *x *= k;
+    }
+}
+
+// Element-wise multiply: xs[i] *= ws[i] for the whole slice, in place.
+#[cfg(target_arch = "wasm32")]
+#[inline]
+fn f32_mul_inplace(xs: &mut [f32], ws: &[f32]) {
+    use core::arch::wasm32::*;
+    debug_assert_eq!(xs.len(), ws.len());
+    let n = xs.len();
+    let n4 = n & !3;
+    let xp = xs.as_mut_ptr();
+    let wp = ws.as_ptr();
+    let mut i = 0usize;
+    while i < n4 {
+        unsafe {
+            let xv = v128_load(xp.add(i) as *const v128);
+            let wv = v128_load(wp.add(i) as *const v128);
+            v128_store(xp.add(i) as *mut v128, f32x4_mul(xv, wv));
+        }
+        i += 4;
+    }
+    while i < n {
+        unsafe {
+            *xp.add(i) *= *wp.add(i);
+        }
+        i += 1;
+    }
+}
+
+#[cfg(not(target_arch = "wasm32"))]
+#[inline]
+fn f32_mul_inplace(xs: &mut [f32], ws: &[f32]) {
+    debug_assert_eq!(xs.len(), ws.len());
+    for (x, &w) in xs.iter_mut().zip(ws.iter()) {
+        *x *= w;
+    }
+}
+
 pub fn band_compr(out: &mut [f32], x: &[f32], erb_fb: &[usize]) {
     for y in out.iter_mut() {
         *y = 0.0;
@@ -398,12 +514,18 @@ fn interp_band_gain(out: &mut [f32], band_e: &[f32], erb_fb: &[usize]) {
 }
 
 fn apply_band_gain(out: &mut [Complex32], band_e: &[f32], erb_fb: &[usize]) {
-    let mut bcsum = 0;
-    for (&band_size, b) in erb_fb.iter().zip(band_e.iter()) {
-        for j in 0..band_size {
-            let idx = bcsum + j;
-            out[idx] *= *b;
-        }
+    // Reinterpret &mut [Complex32] as &mut [f32] of length 2*N. Complex32 is
+    // #[repr(C)] { re: f32, im: f32 }: 8 bytes, alignment 4 — identical layout
+    // to two contiguous f32. Multiplying each Complex32 by a real f32 scalar `b`
+    // is equivalent to multiplying every f32 lane by `b`.
+    let n = out.len();
+    let outf: &mut [f32] =
+        unsafe { core::slice::from_raw_parts_mut(out.as_mut_ptr() as *mut f32, n * 2) };
+    let mut bcsum = 0usize;
+    for (&band_size, &b) in erb_fb.iter().zip(band_e.iter()) {
+        let f_start = bcsum * 2;
+        let f_len = band_size * 2;
+        f32_scale_inplace(&mut outf[f_start..f_start + f_len], b);
         bcsum += band_size;
     }
 }
@@ -495,13 +617,9 @@ fn apply_window(xs: &[f32], window: &[f32]) -> Vec<f32> {
     out
 }
 
-fn apply_window_in_place<'a, I>(xs: &mut [f32], window: I)
-where
-    I: IntoIterator<Item = &'a f32>,
-{
-    for (x, &w) in xs.iter_mut().zip(window) {
-        *x *= w;
-    }
+fn apply_window_in_place(xs: &mut [f32], window: &[f32]) {
+    debug_assert_eq!(xs.len(), window.len());
+    f32_mul_inplace(xs, window);
 }
 
 pub fn post_filter(noisy: &[Complex32], enh: &mut [Complex32], beta: f32) {

From ed7a5d72b2b235ce95cc59c1a8668c70742dc83c Mon Sep 17 00:00:00 2001
From: Ckristian Zoli <ckristian.zoli@gmail.com>
Date: Sun, 3 May 2026 21:47:39 +0100
Subject: [PATCH 3/4] perf(wasm): SIMD-vectorize 2 more DSP loops
 (band_unit_norm + _t)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds f32x4 SIMD for the two band-unit-norm functions in feat_cplx /
feat_cplx_t (called per frame inside df_process_frame).

The trick is de-interleaving &mut [Complex32]'s [re,im,re,im,...]
layout so we can compute the per-bin norm (sqrt(re^2 + im^2))
lane-wise. Strategy: load 4 Complex32 (8 f32) as 2 v128s, use
i32x4_shuffle to build pure-real and pure-imag vectors, compute
norm in 4-wide SIMD, update state, then divide xs by sqrt(state).

  * band_unit_norm (xs: &mut [Complex32]) — re-interleaves the
    per-bin sqrt(state) divisor via two i32x4_shuffles to match
    the [re,im,re,im] xs layout, then divides 4 Complex32 (8 f32)
    at a time.

  * band_unit_norm_t (xs: &[Complex32], out: &mut [f32]) — same
    norm computation but writes to o_re / o_im split halves of
    out (CONTIGUOUS), so no re-interleave step is needed for the
    divide.

Used (re*re + im*im).sqrt() instead of Complex32::norm()'s libm
hypot. For DFN3's audio-spectrum magnitudes (no overflow/underflow
regime), both produce identical bits — verified by FNV-1a hash of
df_process_frame output stream over N=3000 deterministic random
frames matching baseline exactly across 5 independent runs on
Node v20.11.1 / V8.

Wasm size delta: +678 bytes vs the 4-function bundle commit.
Total over no-SIMD baseline: +1513 bytes for all 6 vectorisations.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 libDF/src/lib.rs | 172 +++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 158 insertions(+), 14 deletions(-)

diff --git a/libDF/src/lib.rs b/libDF/src/lib.rs
index 357e318e5..853d5f61d 100644
--- a/libDF/src/lib.rs
+++ b/libDF/src/lib.rs
@@ -250,10 +250,7 @@ pub fn band_mean_norm_erb(xs: &mut [f32], state: &mut [f32], alpha: f32) {
 
 pub fn band_unit_norm(xs: &mut [Complex32], state: &mut [f32], alpha: f32) {
     debug_assert_eq!(xs.len(), state.len());
-    for (x, s) in xs.iter_mut().zip(state.iter_mut()) {
-        *s = x.norm() * (1. - alpha) + *s * alpha;
-        *x /= s.sqrt();
-    }
+    band_unit_norm_inner(xs, state, alpha);
 }
 
 /// Band unit norm, but with transposed output type. I.e. out contains first all real elements,
@@ -263,16 +260,7 @@ pub fn band_unit_norm_t(xs: &[Complex32], state: &mut [f32], alpha: f32, out: &m
     debug_assert_eq!(xs.len(), state.len());
     debug_assert_eq!(xs.len(), out.len() / 2);
     let (o_re, o_im) = out.split_at_mut(xs.len());
-    for (x, s, o_re, o_im) in izip!(
-        xs.iter(),
-        state.iter_mut(),
-        o_re.iter_mut(),
-        o_im.iter_mut(),
-    ) {
-        *s = x.norm() * (1. - alpha) + *s * alpha;
-        *o_re /= s.sqrt();
-        *o_im /= s.sqrt();
-    }
+    band_unit_norm_t_inner(xs, state, alpha, o_re, o_im);
 }
 
 pub fn compute_band_corr(out: &mut [f32], x: &[Complex32], p: &[Complex32], erb_fb: &[usize]) {
@@ -471,6 +459,162 @@ fn f32_mul_inplace(xs: &mut [f32], ws: &[f32]) {
     }
 }
 
+// IIR per-bin unit-norm on interleaved Complex32:
+//   state[i] = sqrt(re[i]^2 + im[i]^2) * (1 - α) + state[i] * α;
+//   xs[i] /= sqrt(state[i])      (Complex32 / f32 = each component / f32)
+//
+// SIMD path processes 4 Complex32 per iteration. The interleaved layout
+// [re0,im0,re1,im1,re2,im2,re3,im3] is loaded as two v128s, de-interleaved
+// via i32x4_shuffle into pure-real and pure-imag vectors so the norm can be
+// computed lane-wise. The normalisation step then divides each Complex32
+// component by sqrt(state[i]) by re-interleaving the divisor.
+#[cfg(target_arch = "wasm32")]
+#[inline]
+fn band_unit_norm_inner(xs: &mut [Complex32], state: &mut [f32], alpha: f32) {
+    use core::arch::wasm32::*;
+    debug_assert_eq!(xs.len(), state.len());
+    let n = xs.len();
+    let n4 = n & !3;
+    let one_minus_a = f32x4_splat(1.0 - alpha);
+    let alpha_v = f32x4_splat(alpha);
+    let xf = xs.as_mut_ptr() as *mut f32;
+    let sp = state.as_mut_ptr();
+    let mut i = 0usize;
+    while i < n4 {
+        // SAFETY: i < n4 <= n, and Complex32 is #[repr(C)] {re: f32, im: f32},
+        // so xs as &mut [f32] of length 2N is valid. v128_load is unaligned.
+        unsafe {
+            let lo = v128_load(xf.add(i * 2) as *const v128);
+            let hi = v128_load(xf.add(i * 2 + 4) as *const v128);
+            // De-interleave: re_v = [re0, re1, re2, re3], im_v = [im0, im1, im2, im3]
+            let re_v = i32x4_shuffle::<0, 2, 4, 6>(lo, hi);
+            let im_v = i32x4_shuffle::<1, 3, 5, 7>(lo, hi);
+            // norm = sqrt(re² + im²) (note: this is (re²+im²).sqrt(), not libm hypot)
+            let norm_sq = f32x4_add(f32x4_mul(re_v, re_v), f32x4_mul(im_v, im_v));
+            let norm_v = f32x4_sqrt(norm_sq);
+            // state update
+            let sv = v128_load(sp.add(i) as *const v128);
+            let new_s = f32x4_add(f32x4_mul(norm_v, one_minus_a), f32x4_mul(sv, alpha_v));
+            v128_store(sp.add(i) as *mut v128, new_s);
+            // xs /= sqrt(state): build duplicated divisor per Complex32
+            //   for lo: [sqrt_s0, sqrt_s0, sqrt_s1, sqrt_s1]
+            //   for hi: [sqrt_s2, sqrt_s2, sqrt_s3, sqrt_s3]
+            let sqrt_s = f32x4_sqrt(new_s);
+            let div_lo = i32x4_shuffle::<0, 0, 1, 1>(sqrt_s, sqrt_s);
+            let div_hi = i32x4_shuffle::<2, 2, 3, 3>(sqrt_s, sqrt_s);
+            v128_store(xf.add(i * 2) as *mut v128, f32x4_div(lo, div_lo));
+            v128_store(xf.add(i * 2 + 4) as *mut v128, f32x4_div(hi, div_hi));
+        }
+        i += 4;
+    }
+    // Tail: 0..3 trailing Complex32. Use the SAME (re²+im²).sqrt() as the SIMD
+    // path (NOT Complex32::norm() which is libm hypot) so vectorised + tail
+    // produce identical results across the full length.
+    while i < n {
+        unsafe {
+            let xi_re = *xf.add(i * 2);
+            let xi_im = *xf.add(i * 2 + 1);
+            let norm = (xi_re * xi_re + xi_im * xi_im).sqrt();
+            let new_s = norm * (1.0 - alpha) + *sp.add(i) * alpha;
+            *sp.add(i) = new_s;
+            let sqrt_s = new_s.sqrt();
+            *xf.add(i * 2) = xi_re / sqrt_s;
+            *xf.add(i * 2 + 1) = xi_im / sqrt_s;
+        }
+        i += 1;
+    }
+}
+
+#[cfg(not(target_arch = "wasm32"))]
+#[inline]
+fn band_unit_norm_inner(xs: &mut [Complex32], state: &mut [f32], alpha: f32) {
+    for (x, s) in xs.iter_mut().zip(state.iter_mut()) {
+        *s = x.norm() * (1. - alpha) + *s * alpha;
+        *x /= s.sqrt();
+    }
+}
+
+// Same IIR norm as band_unit_norm but writes to o_re / o_im split halves of
+// the output (xs read-only). The output halves are CONTIGUOUS so no
+// re-interleave step is needed for the divide — simpler than band_unit_norm.
+#[cfg(target_arch = "wasm32")]
+#[inline]
+fn band_unit_norm_t_inner(
+    xs: &[Complex32],
+    state: &mut [f32],
+    alpha: f32,
+    o_re: &mut [f32],
+    o_im: &mut [f32],
+) {
+    use core::arch::wasm32::*;
+    debug_assert_eq!(xs.len(), state.len());
+    debug_assert_eq!(xs.len(), o_re.len());
+    debug_assert_eq!(xs.len(), o_im.len());
+    let n = xs.len();
+    let n4 = n & !3;
+    let one_minus_a = f32x4_splat(1.0 - alpha);
+    let alpha_v = f32x4_splat(alpha);
+    let xf = xs.as_ptr() as *const f32;
+    let sp = state.as_mut_ptr();
+    let rp = o_re.as_mut_ptr();
+    let ip = o_im.as_mut_ptr();
+    let mut i = 0usize;
+    while i < n4 {
+        unsafe {
+            let lo = v128_load(xf.add(i * 2) as *const v128);
+            let hi = v128_load(xf.add(i * 2 + 4) as *const v128);
+            let re_v = i32x4_shuffle::<0, 2, 4, 6>(lo, hi);
+            let im_v = i32x4_shuffle::<1, 3, 5, 7>(lo, hi);
+            let norm_sq = f32x4_add(f32x4_mul(re_v, re_v), f32x4_mul(im_v, im_v));
+            let norm_v = f32x4_sqrt(norm_sq);
+            let sv = v128_load(sp.add(i) as *const v128);
+            let new_s = f32x4_add(f32x4_mul(norm_v, one_minus_a), f32x4_mul(sv, alpha_v));
+            v128_store(sp.add(i) as *mut v128, new_s);
+            let sqrt_s = f32x4_sqrt(new_s);
+            // o_re / o_im are stored contiguously, divide directly
+            let or_v = v128_load(rp.add(i) as *const v128);
+            let oi_v = v128_load(ip.add(i) as *const v128);
+            v128_store(rp.add(i) as *mut v128, f32x4_div(or_v, sqrt_s));
+            v128_store(ip.add(i) as *mut v128, f32x4_div(oi_v, sqrt_s));
+        }
+        i += 4;
+    }
+    while i < n {
+        unsafe {
+            let xi_re = *xf.add(i * 2);
+            let xi_im = *xf.add(i * 2 + 1);
+            let norm = (xi_re * xi_re + xi_im * xi_im).sqrt();
+            let new_s = norm * (1.0 - alpha) + *sp.add(i) * alpha;
+            *sp.add(i) = new_s;
+            let sqrt_s = new_s.sqrt();
+            *rp.add(i) /= sqrt_s;
+            *ip.add(i) /= sqrt_s;
+        }
+        i += 1;
+    }
+}
+
+#[cfg(not(target_arch = "wasm32"))]
+#[inline]
+fn band_unit_norm_t_inner(
+    xs: &[Complex32],
+    state: &mut [f32],
+    alpha: f32,
+    o_re: &mut [f32],
+    o_im: &mut [f32],
+) {
+    for (x, s, o_re, o_im) in izip!(
+        xs.iter(),
+        state.iter_mut(),
+        o_re.iter_mut(),
+        o_im.iter_mut(),
+    ) {
+        *s = x.norm() * (1. - alpha) + *s * alpha;
+        *o_re /= s.sqrt();
+        *o_im /= s.sqrt();
+    }
+}
+
 pub fn band_compr(out: &mut [f32], x: &[f32], erb_fb: &[usize]) {
     for y in out.iter_mut() {
         *y = 0.0;

From 97ddddc85945cc5853d72628c2a95f4fad42decf Mon Sep 17 00:00:00 2001
From: Ckristian Zoli <ckristian.zoli@gmail.com>
Date: Mon, 4 May 2026 09:25:35 +0100
Subject: [PATCH 4/4] perf(wasm): SIMD-vectorize 3 more frame_synthesis loops
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three more loops in frame_synthesis emit scalar code on wasm32
despite +simd128 (unlike the frame_analysis windowing loops which
LLVM auto-vec'd; something about the nested zip().zip() iterator
pattern in frame_synthesis vs the izip!() pattern in frame_analysis
defeats auto-vectorization).

Three changes:

  * out[i] = x_first[i] + synthesis_mem[i] (overlap-add to output)
    — new f32_add_to(a, b, out) helper, three-slice element-wise
    add via 4-wide v128 + f32x4_add.

  * s_first[i] += xs_first[i] (overlap-add for next frame, in-place)
    — new f32_add_inplace(xs, ys) helper, two-slice element-wise
    in-place add.

  * s_second[i] = xs_second[i] (override left-shifted buffer)
    — replaced the explicit loop with copy_from_slice; the compiler
    likely emitted memcpy already, but the stdlib idiom is clearer
    and lets the optimiser pick the best implementation.

Bit-identical output verified: FNV-1a hash 53ae8dfc3595faf0
unchanged across N=3000 deterministic frames over 6 independent
bench runs.

Speed: median bundle_synth vs the previous 6-function bundle is
-1.2% RTF; mean over 6 iters is -3.1%. Several runs showed -5% to
-11% additional gain (those runs had background CPU activity that
hit the previous bundle harder). Real direction, modest absolute
gain, no quality cost.

Wasm size delta: -24 bytes vs previous bundle (copy_from_slice
emits less code than the explicit loop). Net total: +1489 bytes
over the no-SIMD baseline for all 8 vectorisations.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 libDF/src/lib.rs | 99 ++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 87 insertions(+), 12 deletions(-)

diff --git a/libDF/src/lib.rs b/libDF/src/lib.rs
index 853d5f61d..a8f8d361b 100644
--- a/libDF/src/lib.rs
+++ b/libDF/src/lib.rs
@@ -459,6 +459,81 @@ fn f32_mul_inplace(xs: &mut [f32], ws: &[f32]) {
     }
 }
 
+// Three-slice element-wise add: out[i] = a[i] + b[i].
+#[cfg(target_arch = "wasm32")]
+#[inline]
+fn f32_add_to(a: &[f32], b: &[f32], out: &mut [f32]) {
+    use core::arch::wasm32::*;
+    debug_assert_eq!(a.len(), b.len());
+    debug_assert_eq!(a.len(), out.len());
+    let n = a.len();
+    let n4 = n & !3;
+    let ap = a.as_ptr();
+    let bp = b.as_ptr();
+    let op = out.as_mut_ptr();
+    let mut i = 0usize;
+    while i < n4 {
+        unsafe {
+            let av = v128_load(ap.add(i) as *const v128);
+            let bv = v128_load(bp.add(i) as *const v128);
+            v128_store(op.add(i) as *mut v128, f32x4_add(av, bv));
+        }
+        i += 4;
+    }
+    while i < n {
+        unsafe {
+            *op.add(i) = *ap.add(i) + *bp.add(i);
+        }
+        i += 1;
+    }
+}
+
+#[cfg(not(target_arch = "wasm32"))]
+#[inline]
+fn f32_add_to(a: &[f32], b: &[f32], out: &mut [f32]) {
+    debug_assert_eq!(a.len(), b.len());
+    debug_assert_eq!(a.len(), out.len());
+    for ((&x, &y), o) in a.iter().zip(b.iter()).zip(out.iter_mut()) {
+        *o = x + y;
+    }
+}
+
+// In-place element-wise add: xs[i] += ys[i].
+#[cfg(target_arch = "wasm32")]
+#[inline]
+fn f32_add_inplace(xs: &mut [f32], ys: &[f32]) {
+    use core::arch::wasm32::*;
+    debug_assert_eq!(xs.len(), ys.len());
+    let n = xs.len();
+    let n4 = n & !3;
+    let xp = xs.as_mut_ptr();
+    let yp = ys.as_ptr();
+    let mut i = 0usize;
+    while i < n4 {
+        unsafe {
+            let xv = v128_load(xp.add(i) as *const v128);
+            let yv = v128_load(yp.add(i) as *const v128);
+            v128_store(xp.add(i) as *mut v128, f32x4_add(xv, yv));
+        }
+        i += 4;
+    }
+    while i < n {
+        unsafe {
+            *xp.add(i) += *yp.add(i);
+        }
+        i += 1;
+    }
+}
+
+#[cfg(not(target_arch = "wasm32"))]
+#[inline]
+fn f32_add_inplace(xs: &mut [f32], ys: &[f32]) {
+    debug_assert_eq!(xs.len(), ys.len());
+    for (x, &y) in xs.iter_mut().zip(ys.iter()) {
+        *x += y;
+    }
+}
+
 // IIR per-bin unit-norm on interleaved Complex32:
 //   state[i] = sqrt(re[i]^2 + im[i]^2) * (1 - α) + state[i] * α;
 //   xs[i] /= sqrt(state[i])      (Complex32 / f32 = each component / f32)
@@ -732,10 +807,12 @@ fn frame_synthesis(input: &mut [Complex32], output: &mut [f32], state: &mut DFSt
     }
     apply_window_in_place(&mut x, &state.window);
     let (x_first, x_second) = x.split_at(state.frame_size);
-    for ((&xi, &mem), out) in x_first.iter().zip(state.synthesis_mem.iter()).zip(output.iter_mut())
-    {
-        *out = xi + mem;
-    }
+    // out[i] = x_first[i] + synthesis_mem[i] (zip-3 stops at shortest;
+    // x_first.len() == output.len() == frame_size; synthesis_mem may be longer).
+    let n_out = output.len();
+    debug_assert_eq!(x_first.len(), n_out);
+    debug_assert!(state.synthesis_mem.len() >= n_out);
+    f32_add_to(x_first, &state.synthesis_mem[..n_out], output);
 
     let split = state.synthesis_mem.len() - state.frame_size;
     if split > 0 {
@@ -743,14 +820,12 @@ fn frame_synthesis(input: &mut [Complex32], output: &mut [f32], state: &mut DFSt
     }
     let (s_first, s_second) = state.synthesis_mem.split_at_mut(split);
     let (xs_first, xs_second) = x_second.split_at(split);
-    for (&xi, mem) in xs_first.iter().zip(s_first.iter_mut()) {
-        // Overlap add for next frame
-        *mem += xi;
-    }
-    for (&xi, mem) in xs_second.iter().zip(s_second.iter_mut()) {
-        // Override left shifted buffer
-        *mem = xi;
-    }
+    // Overlap-add for next frame: s_first[i] += xs_first[i].
+    let n_first = xs_first.len().min(s_first.len());
+    f32_add_inplace(&mut s_first[..n_first], &xs_first[..n_first]);
+    // Override left-shifted buffer: s_second[i] = xs_second[i] (memcpy-shaped).
+    let n_second = xs_second.len().min(s_second.len());
+    s_second[..n_second].copy_from_slice(&xs_second[..n_second]);
 }
 
 fn apply_window(xs: &[f32], window: &[f32]) -> Vec<f32> {