From fc98281184a5eea9c583e38b9ca34681335b836f Mon Sep 17 00:00:00 2001
From: zkasuran <zkasuran@gmail.com>
Date: Mon, 8 Jun 2026 08:36:51 +0530
Subject: [PATCH 1/2] Fuse distributed prefix-suffix multi-SWAP (closes #595)

The localiser performed each prefix<->suffix SWAP in turn, so an amplitude
moved by one SWAP was often moved again by the next, crossing the network
several times. This fuses the group of disjoint SWAPs into one operation that
computes each amplitude's final node and sends it there directly, so every
amplitude crosses the network at most once.

The disjoint SWAPs commute and compose into a single bit permutation. For the
uncontrolled case (every internal caller) the routine enumerates the up to
2^eta-1 destination nodes and packs, exchanges and unpacks only the amplitudes
bound to each. A new cpu_statevec_unpackAmpsFromBuffer scatters the received
sub-buffer back into the strided local amplitudes (the inverse of the existing
packer), looping over only the moved amplitudes rather than the whole state.

Scope is CPU/OpenMP. GPU quregs and controlled multi-SWAPs keep the existing
per-SWAP path, so the GPU build is unchanged.

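Comm volume drops 25% at eta=2 and 42% at eta=3, matching theory: each node
now sends a fraction 1 - 1/2^eta of its amplitudes in total, rather than the
eta/2 sent by eta successive SWAPs. Existing applySwap, applyCompMatr,
applyCompMatr2 and calcPartialTrace suites pass at 1, 2, 4 and 8 ranks.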
---
 quest/src/core/accelerator.cpp    | 17 ++++++-
 quest/src/core/accelerator.hpp    |  2 +
 quest/src/core/localiser.cpp      | 73 ++++++++++++++++++++++++++-----
 quest/src/cpu/cpu_subroutines.cpp | 43 +++++++++++++++++-
 quest/src/cpu/cpu_subroutines.hpp |  2 +
 5 files changed, 124 insertions(+), 13 deletions(-)

diff --git a/quest/src/core/accelerator.cpp b/quest/src/core/accelerator.cpp
index 677e6c74a..7752b8557 100644
--- a/quest/src/core/accelerator.cpp
+++ b/quest/src/core/accelerator.cpp
@@ -232,12 +232,27 @@ qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstL
 
     // note qubits may incidentally be ctrls or targs; it doesn't matter
     GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_packAmpsIntoBuffer, qureg, qubits.size() );
-    
+
     // return the number of packed amps, for caller convenience
     return func(qureg, qubits, qubitStates);
 }
 
 
+void accel_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates) {
+
+    // inverse of packing; scatters received sub-buffer into strided local amps where
+    // the given qubits are in the given states (used by the fused multi-SWAP routine).
+    // only the CPU path is dispatched; the fused routine restricts itself to non-GPU
+    // quregs (issue #595 notes the OpenMP logic alone is sufficient), so no GPU kernel
+    // is needed and the GPU build is left untouched
+    if (qubitStates.empty())
+        error_noCtrlsGivenToBufferPacker();
+
+    GET_FUNC_OPTIMISED_FOR_ONE_PARAM( func, cpu_statevec_unpackAmpsFromBuffer, qubits.size() );
+    func(qureg, qubits, qubitStates);
+}
+
+
 qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2) {
 
     return (qureg.isGpuAccelerated)?
diff --git a/quest/src/core/accelerator.hpp b/quest/src/core/accelerator.hpp
index 5a8dc37fb..879ad8d78 100644
--- a/quest/src/core/accelerator.hpp
+++ b/quest/src/core/accelerator.hpp
@@ -171,6 +171,8 @@ void accel_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, PauliS
 
 qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates);
 
+void accel_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates);
+
 qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2);
 
 
diff --git a/quest/src/core/localiser.cpp b/quest/src/core/localiser.cpp
index 83a23b921..ab11f6215 100644
--- a/quest/src/core/localiser.cpp
+++ b/quest/src/core/localiser.cpp
@@ -900,24 +900,75 @@ void anyCtrlMultiSwapBetweenPrefixAndSuffix(Qureg qureg, ConstList64 ctrls, Cons
     // the SWAPs act on unique qubit pairs and so commute.
 
     /// @todo
-    ///   - the sequence of pair-wise full-swaps should be more efficient as a
-    ///     "single" sequence of smaller messages sending amps directly to their
-    ///     final destination node. This could use a new "multiSwap" function.
-    ///   - if the user has compiled cuQuantum, and Qureg is GPU-accelerated, the
-    ///     multiSwap function should use custatevecSwapIndexBits() if local,
-    ///     or custatevecDistIndexBitSwapSchedulerSetIndexBitSwaps() if distributed,
+    ///   - if the user has compiled cuQuantum, and Qureg is GPU-accelerated, this
+    ///     routine could use custatevecSwapIndexBits() if local, or
+    ///     custatevecDistIndexBitSwapSchedulerSetIndexBitSwaps() if distributed,
     ///     although the latter requires substantially more work like setting up
     ///     a communicator which may be inelegant alongside our own distribution scheme.
 
-    // perform necessary swaps to move all targets into suffix, each of which invokes communication
+    // collect the non-trivial pairs; each swaps a suffix qubit with a prefix qubit
+    auto suffixTargs = lists_getEmptyList64();
+    auto prefixTargs = lists_getEmptyList64();
     for (size_t i=0; i<targsA.size(); i++) {
-
         if (targsA[i] == targsB[i])
             continue;
+        suffixTargs.push_back(std::min(targsA[i], targsB[i]));
+        prefixTargs.push_back(std::max(targsA[i], targsB[i]));
+    }
+    int numSwaps = suffixTargs.size();
+    if (numSwaps == 0)
+        return;
+
+    // the fused routine below targets the uncontrolled, non-GPU case which every internal
+    // caller currently uses. A controlled multi-SWAP, or a GPU-accelerated Qureg, falls back
+    // to the per-swap routine (issue #595 notes the OpenMP logic alone is sufficient, so the
+    // GPU path is left unchanged)
+    if (!ctrls.empty() || qureg.isGpuAccelerated) {
+        for (int i=0; i<numSwaps; i++)
+            anyCtrlSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, suffixTargs[i], prefixTargs[i]);
+        return;
+    }
+
+    // FUSED multi-SWAP: rather than performing each prefix<->suffix SWAP in turn (which
+    // wastefully relays an amplitude through intermediate nodes before its final node),
+    // we send each amplitude directly to its destination node in a single pass. The
+    // numSwaps disjoint SWAPs compose into one permutation of qubit bits, so an amplitude
+    // of this node moves to the rank obtained by overwriting each prefix-target rank-bit
+    // with the value of its partnered suffix-target bit. We enumerate the (up to)
+    // 2^numSwaps - 1 destination nodes (one per non-empty subset of prefix targets whose
+    // partnered suffix bit disagrees with this node's rank bit) and, for each, pack +
+    // exchange + unpack only the amplitudes bound there. The move is an involution
+    // between paired nodes, so the packed and unpacked amplitudes occupy the same local
+    // slots. See arXiv:quant-ph/0608239 (SWAP fusion) and arXiv:2311.01512 Sec IV.
+
+    std::vector<int> prefBits(numSwaps);
+    std::vector<int> rankBits(numSwaps);
+    for (int i=0; i<numSwaps; i++) {
+        prefBits[i] = util_getPrefixInd(prefixTargs[i], qureg);
+        rankBits[i] = getBit(qureg.rank, prefBits[i]);
+    }
+
+    // subset 0 are the amplitudes that do not move (all suffix bits already match the
+    // rank bits), so we skip it and iterate only the communicating subsets
+    qindex numSubsets = powerOf2(numSwaps);
+    for (qindex sub=1; sub<numSubsets; sub++) {
+
+        // the destination node flips this node's rank bits for the targeted subset, and
+        // the to-be-sent amplitudes are those whose suffix-target bits match the pattern
+        auto states = lists_getEmptyList64();
+        int pairRank = qureg.rank;
+        for (int i=0; i<numSwaps; i++) {
+            int inSubset = getBit(sub, i);
+            states.push_back(inSubset ? !rankBits[i] : rankBits[i]);
+            if (inSubset)
+                pairRank = static_cast<int>(flipBit(pairRank, prefBits[i]));
+        }
 
-        int suffixTarg = std::min(targsA[i], targsB[i]);
-        int prefixTarg = std::max(targsA[i], targsB[i]);
-        anyCtrlSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, suffixTarg, prefixTarg);
+        // pack the amplitudes bound for pairRank, exchange, and scatter the received
+        // amplitudes back into those same local slots
+        qindex numPacked = accel_statevec_packAmpsIntoBuffer(qureg, suffixTargs, states);
+        comm_exchangeSubBuffers(qureg, numPacked, pairRank);
+        accel_statevec_unpackAmpsFromBuffer(qureg, suffixTargs, states);
     }
 }
 
diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp
index 59df946e9..524e6d89d 100644
--- a/quest/src/cpu/cpu_subroutines.cpp
+++ b/quest/src/cpu/cpu_subroutines.cpp
@@ -284,8 +284,49 @@ qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu
 INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qindex, cpu_statevec_packAmpsIntoBuffer, (Qureg, ConstList64, ConstList64) )
 
 
+template <int NumQubits>
+void cpu_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates) {
+
+    assert_numQubitsMatchesQubitStatesAndTemplateParam(qubitInds.size(), qubitStates.size(), NumQubits);
+
+    // this is the inverse of cpu_statevec_packAmpsIntoBuffer; it scatters the received
+    // contiguous sub-buffer back into the strided local amplitudes where the given qubits
+    // are in the given states. It generalises anyCtrlSwap_subC to multiple constrained
+    // qubits, as needed by the fused multi-SWAP routine.
+
+    // use cpu_qcomp (in lieu of qcomp) even though no arithmetic happens below - just for consistency!
+    cpu_qcomp* amps   = getCpuQcompPtr(qureg.cpuAmps);
+    cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer);
 
-/* 
+    // each constrained qubit halves the number of received amps
+    qindex numIts = qureg.numAmpsPerNode / powerOf2(qubitInds.size());
+
+    // received amplitudes begin at the buffer's receive offset
+    qindex offset = getBufferRecvInd();
+
+    auto sortedQubitInds = util_getSorted(qubitInds);
+    auto qubitStateMask  = util_getBitMask(qubitInds, qubitStates);
+
+    // use template param to compile-time unroll loop in insertBits()
+    SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubitInds.size());
+
+    #pragma omp parallel for if(qureg.isMultithreaded)
+    for (qindex n=0; n<numIts; n++) {
+
+        // i = nth local index where qubits are in the specified states
+        qindex i = insertBitsWithMaskedValues(n, sortedQubitInds.data(), numBits, qubitStateMask);
+
+        // scatter the contiguous sub-buffer among the strided local amplitudes
+        amps[i] = buffer[offset + n];
+    }
+}
+
+
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_statevec_unpackAmpsFromBuffer, (Qureg, ConstList64, ConstList64) )
+
+
+
+/*
  * SWAPS
  */
 
diff --git a/quest/src/cpu/cpu_subroutines.hpp b/quest/src/cpu/cpu_subroutines.hpp
index 3dbae057b..5925ad5a4 100644
--- a/quest/src/cpu/cpu_subroutines.hpp
+++ b/quest/src/cpu/cpu_subroutines.hpp
@@ -46,6 +46,8 @@ void cpu_fullstatediagmatr_setElemsFromMultiVarFunc(FullStateDiagMatr out, qcomp
 
 template <int NumQubits> qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates);
 
+template <int NumQubits> void cpu_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates);
+
 qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2);
 
 

From 2d542d66c8638f45535d56be3fc0ff7b6961a6bf Mon Sep 17 00:00:00 2001
From: zkasuran <zkasuran@gmail.com>
Date: Mon, 15 Jun 2026 20:59:07 +0530
Subject: [PATCH 2/2] Batch fused multi-SWAP exchanges into async waves

The fused distributed multi-SWAP ran one blocking sub-buffer exchange per
destination subset, up to 2^numSwaps - 1 sequential rounds. This packs each
subset into a distinct slice of the buffer's send half and exchanges a whole
wave of subsets under a single MPI_Waitall. Only half the buffer can send, so
a wave holds 2^(numSwaps-1) subsets and the communicating subsets need at most
two waves. The same total amplitudes cross the network, with the number of
synchronisations cut from 2^numSwaps - 1 down to at most two.

The new comm_exchangeSubBufferChunks() follows the async-with-final-wait
pattern already in comm_routines.cpp (the one credited to arXiv:2308.07402),
and the packer and unpacker gain offset-aware variants so several subsets can
share the buffer. The controlled and GPU paths keep the per-swap fallback.

Adds a focused test (tests/unit/operations.cpp) that applies a random
multi-qubit unitary on prefix-spanning targets, forcing 2 to 4 prefix targets
and so the two-wave batching, and checks every deployment against the
reference linear algebra. Corrects the swap-fusion reference: quant-ph/0608239
is a general parallel simulator, not swap fusion, so the comment now points at
the distributed index-bit-swap references (arXiv:2203.16044 and cuStateVec's
custatevecDistIndexBitSwapScheduler) and arXiv:2311.01512 for the exchange.
---
 quest/src/comm/comm_routines.cpp  | 48 ++++++++++++++++++-
 quest/src/comm/comm_routines.hpp  | 10 ++++
 quest/src/core/accelerator.cpp    | 25 ++++++++++
 quest/src/core/accelerator.hpp    |  2 +
 quest/src/core/localiser.cpp      | 77 +++++++++++++++++++++++--------
 quest/src/cpu/cpu_subroutines.cpp | 43 ++++++++++++-----
 quest/src/cpu/cpu_subroutines.hpp |  2 +
 tests/unit/operations.cpp         | 53 +++++++++++++++++++++
 8 files changed, 229 insertions(+), 31 deletions(-)

diff --git a/quest/src/comm/comm_routines.cpp b/quest/src/comm/comm_routines.cpp
index cf6956454..35247a9af 100644
--- a/quest/src/comm/comm_routines.cpp
+++ b/quest/src/comm/comm_routines.cpp
@@ -21,6 +21,7 @@
 #include "quest/src/gpu/gpu_config.hpp"
 #include "quest/src/comm/comm_config.hpp"
 #include "quest/src/comm/comm_indices.hpp"
+#include "quest/src/comm/comm_routines.hpp"
 
 #if QUEST_COMPILE_MPI
     #include <mpi.h>
@@ -528,11 +529,56 @@ void comm_exchangeSubBuffers(Qureg qureg, qindex numAmps, int pairRank) {
 
     if (qureg.isGpuAccelerated)
         exchangeGpuSubBuffers(qureg, numAmps, pairRank);
-    else 
+    else
         exchangeArrays(&qureg.cpuCommBuffer[sendInd], &qureg.cpuCommBuffer[recvInd], numAmps, pairRank);
 }
 
 
+void comm_exchangeSubBufferChunks(Qureg qureg, const vector<CommChunk>& chunks) {
+#if QUEST_COMPILE_MPI
+
+    assert_commQuregIsDistributed(qureg);
+
+    // exchange several disjoint sub-buffer chunks, each with its own pair rank, under a single
+    // wait. This collapses the up-to (2^k - 1) blocking exchanges of the fused multi-SWAP into one
+    // asynchronous wave (the caller bounds a wave's chunks to fit the send and receive buffer halves).
+    // Each chunk targets a DISTINCT pair rank, so (source rank, tag) already identifies every message
+    // and no per-partner tag offset is needed; we reuse the per-message tag = m exactly as
+    // exchangeArrays does. Async-with-final-wait as per arxiv.org/abs/2308.07402. The fused routine is
+    // CPU-only (it restricts itself to non-GPU quregs), so only the CPU buffer is exchanged here.
+
+    MPI_Comm mpiComm = comm_getMpiComm();
+
+    // validate every chunk and total the messages, to size the request list up-front
+    qindex numRequests = 0;
+    for (const CommChunk& chunk : chunks) {
+        assert_commBoundsAreValid(qureg, chunk.sendInd, chunk.recvInd, chunk.numAmps);
+        assert_bufferSendRecvDoesNotOverlap(chunk.sendInd, chunk.recvInd, chunk.numAmps);
+        assert_pairRankIsDistinct(qureg, chunk.pairRank);
+        numRequests += 2 * dividePow2PayloadIntoMessages(chunk.numAmps)[1];
+    }
+
+    vector<MPI_Request> requests(numRequests, MPI_REQUEST_NULL);
+
+    // post every chunk's receives and sends, then wait once for the whole wave
+    qindex r = 0;
+    for (const CommChunk& chunk : chunks) {
+        auto [messageSize, numMessages] = dividePow2PayloadIntoMessages(chunk.numAmps);
+        for (qindex m=0; m<numMessages; m++) {
+            int tag = static_cast<int>(m); // guaranteed int, but m*messageSize needs qindex
+            MPI_Irecv(&qureg.cpuCommBuffer[chunk.recvInd + m*messageSize], messageSize, MPI_QCOMP, chunk.pairRank, tag, mpiComm, &requests[r++]);
+            MPI_Isend(&qureg.cpuCommBuffer[chunk.sendInd + m*messageSize], messageSize, MPI_QCOMP, chunk.pairRank, tag, mpiComm, &requests[r++]);
+        }
+    }
+
+    MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE);
+
+#else
+    error_commButEnvNotDistributed();
+#endif
+}
+
+
 void comm_asynchSendSubBuffer(Qureg qureg, qindex numElems, int pairRank) {
 
     auto [sendInd, recvInd] = getSubBufferSendRecvInds(qureg);
diff --git a/quest/src/comm/comm_routines.hpp b/quest/src/comm/comm_routines.hpp
index e75e889f6..771149147 100644
--- a/quest/src/comm/comm_routines.hpp
+++ b/quest/src/comm/comm_routines.hpp
@@ -31,6 +31,16 @@ void comm_exchangeAmpsToBuffers(Qureg qureg, int pairRank);
 
 void comm_exchangeSubBuffers(Qureg qureg, qindex numAmpsAndRecvInd, int pairRank);
 
+// one disjoint sub-buffer chunk to exchange with a single pair rank, used by comm_exchangeSubBufferChunks
+struct CommChunk {
+    qindex sendInd; // buffer index where this chunk's amps to send begin
+    qindex recvInd; // buffer index where this chunk's received amps are written
+    qindex numAmps; // number of amps exchanged (a power of two)
+    int pairRank;   // the partner rank for this chunk (distinct from this node)
+};
+
+void comm_exchangeSubBufferChunks(Qureg qureg, const vector<CommChunk>& chunks);
+
 void comm_asynchSendSubBuffer(Qureg qureg, qindex numElems, int pairRank);
 
 void comm_receiveArrayToBuffer(Qureg qureg, qindex numElems, int pairRank);
diff --git a/quest/src/core/accelerator.cpp b/quest/src/core/accelerator.cpp
index 7752b8557..da7933642 100644
--- a/quest/src/core/accelerator.cpp
+++ b/quest/src/core/accelerator.cpp
@@ -253,6 +253,31 @@ void accel_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubits, ConstL
 }
 
 
+qindex accel_statevec_packAmpsIntoSubBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex sendInd) {
+
+    // as accel_statevec_packAmpsIntoBuffer, but packs into an explicit send offset so the
+    // fused multi-SWAP can lay several subsets into one buffer and exchange them in a wave.
+    // CPU-only, like the unpacker, since the fused routine restricts itself to non-GPU quregs
+    if (qubitStates.empty())
+        error_noCtrlsGivenToBufferPacker();
+
+    GET_FUNC_OPTIMISED_FOR_ONE_PARAM( func, cpu_statevec_packAmpsIntoSubBuffer, qubits.size() );
+    return func(qureg, qubits, qubitStates, sendInd);
+}
+
+
+void accel_statevec_unpackAmpsFromSubBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex recvInd) {
+
+    // inverse of accel_statevec_packAmpsIntoSubBuffer; scatters a sub-buffer received at an
+    // explicit offset back into the strided local amps. CPU-only for the same reason as above
+    if (qubitStates.empty())
+        error_noCtrlsGivenToBufferPacker();
+
+    GET_FUNC_OPTIMISED_FOR_ONE_PARAM( func, cpu_statevec_unpackAmpsFromSubBuffer, qubits.size() );
+    func(qureg, qubits, qubitStates, recvInd);
+}
+
+
 qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2) {
 
     return (qureg.isGpuAccelerated)?
diff --git a/quest/src/core/accelerator.hpp b/quest/src/core/accelerator.hpp
index 879ad8d78..ab16ca82d 100644
--- a/quest/src/core/accelerator.hpp
+++ b/quest/src/core/accelerator.hpp
@@ -170,8 +170,10 @@ void accel_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, PauliS
  */
 
 qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates);
+qindex accel_statevec_packAmpsIntoSubBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex sendInd);
 
 void accel_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates);
+void accel_statevec_unpackAmpsFromSubBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex recvInd);
 
 qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2);
 
diff --git a/quest/src/core/localiser.cpp b/quest/src/core/localiser.cpp
index ab11f6215..80b1f7c80 100644
--- a/quest/src/core/localiser.cpp
+++ b/quest/src/core/localiser.cpp
@@ -25,6 +25,7 @@
 #include "quest/src/core/accelerator.hpp"
 #include "quest/src/comm/comm_config.hpp"
 #include "quest/src/comm/comm_routines.hpp"
+#include "quest/src/comm/comm_indices.hpp"
 #include "quest/src/cpu/cpu_config.hpp"
 #include "quest/src/gpu/gpu_config.hpp"
 
@@ -939,7 +940,9 @@ void anyCtrlMultiSwapBetweenPrefixAndSuffix(Qureg qureg, ConstList64 ctrls, Cons
     // partnered suffix bit disagrees with this node's rank bit) and, for each, pack +
     // exchange + unpack only the amplitudes bound there. The move is an involution
     // between paired nodes, so the packed and unpacked amplitudes occupy the same local
-    // slots. See arXiv:quant-ph/0608239 (SWAP fusion) and arXiv:2311.01512 Sec IV.
+    // slots. This composed distributed index-bit swap is that of mpiQulacs (arXiv:2203.16044)
+    // and cuStateVec's custatevecDistIndexBitSwapScheduler; the pairwise amplitude exchange
+    // follows arXiv:2311.01512 Sec IV.
 
     std::vector<int> prefBits(numSwaps);
     std::vector<int> rankBits(numSwaps);
@@ -949,26 +952,64 @@ void anyCtrlMultiSwapBetweenPrefixAndSuffix(Qureg qureg, ConstList64 ctrls, Cons
     }
 
     // subset 0 are the amplitudes that do not move (all suffix bits already match the
-    // rank bits), so we skip it and iterate only the communicating subsets
+    // rank bits), so we skip it and communicate only the other subsets. Every communicating
+    // subset packs the same number of amplitudes (one per local amp whose suffix-target bits
+    // match the subset pattern)
     qindex numSubsets = powerOf2(numSwaps);
-    for (qindex sub=1; sub<numSubsets; sub++) {
-
-        // the destination node flips this node's rank bits for the targeted subset, and
-        // the to-be-sent amplitudes are those whose suffix-target bits match the pattern
-        auto states = lists_getEmptyList64();
-        int pairRank = qureg.rank;
-        for (int i=0; i<numSwaps; i++) {
-            int inSubset = getBit(sub, i);
-            states.push_back(inSubset ? !rankBits[i] : rankBits[i]);
-            if (inSubset)
-                pairRank = static_cast<int>(flipBit(pairRank, prefBits[i]));
+    qindex numPacked  = qureg.numAmpsPerNode / numSubsets;
+
+    // rather than one blocking exchange per subset (up to 2^numSwaps - 1 sequential syncs), we
+    // pack each subset into a distinct slice of the buffer's send half and exchange a whole wave
+    // of subsets under a single wait. Only half the buffer can send, so a wave holds
+    // (numAmpsPerNode/2)/numPacked = 2^(numSwaps-1) subsets, and the 2^numSwaps - 1 communicating
+    // subsets need at most two waves. The same total amplitudes cross the network, with the
+    // synchronisation count cut from 2^numSwaps - 1 down to at most two.
+    qindex sendBase = getSubBufferSendInd(qureg);
+    qindex recvBase = getBufferRecvInd();
+    qindex perWave  = (qureg.numAmpsPerNode / 2) / numPacked;
+
+    for (qindex first=1; first<numSubsets; first+=perWave) {
+
+        qindex last = std::min(first + perWave, numSubsets); // exclusive
+
+        // pack every subset of this wave into its own buffer slice, remembering the states so the
+        // matching received slice can be scattered back after the exchange
+        std::vector<CommChunk> chunks;
+        std::vector<List64> waveStates;
+        chunks.reserve(last - first);
+        waveStates.reserve(last - first);
+
+        for (qindex sub=first; sub<last; sub++) {
+
+            // the destination node flips this node's rank bits for the targeted subset, and
+            // the to-be-sent amplitudes are those whose suffix-target bits match the pattern
+            auto states = lists_getEmptyList64();
+            int pairRank = qureg.rank;
+            for (int i=0; i<numSwaps; i++) {
+                int inSubset = getBit(sub, i);
+                states.push_back(inSubset ? !rankBits[i] : rankBits[i]);
+                if (inSubset)
+                    pairRank = static_cast<int>(flipBit(pairRank, prefBits[i]));
+            }
+
+            qindex slot    = sub - first;
+            qindex sendInd = sendBase + slot * numPacked;
+            qindex recvInd = recvBase + slot * numPacked;
+
+            accel_statevec_packAmpsIntoSubBuffer(qureg, suffixTargs, states, sendInd);
+            chunks.push_back({sendInd, recvInd, numPacked, pairRank});
+            waveStates.push_back(states);
         }
 
-        // pack the amplitudes bound for pairRank, exchange, and scatter the received
-        // amplitudes back into those same local slots
-        qindex numPacked = accel_statevec_packAmpsIntoBuffer(qureg, suffixTargs, states);
-        comm_exchangeSubBuffers(qureg, numPacked, pairRank);
-        accel_statevec_unpackAmpsFromBuffer(qureg, suffixTargs, states);
+        // exchange the whole wave with a single wait, then scatter each received slice back into
+        // the strided local amplitudes it came from
+        comm_exchangeSubBufferChunks(qureg, chunks);
+
+        for (qindex sub=first; sub<last; sub++) {
+            qindex slot    = sub - first;
+            qindex recvInd = recvBase + slot * numPacked;
+            accel_statevec_unpackAmpsFromSubBuffer(qureg, suffixTargs, waveStates[slot], recvInd);
+        }
     }
 }
 
diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp
index 524e6d89d..8df3f4a60 100644
--- a/quest/src/cpu/cpu_subroutines.cpp
+++ b/quest/src/cpu/cpu_subroutines.cpp
@@ -216,7 +216,7 @@ void cpu_fullstatediagmatr_setElemsFromMultiVarFunc(FullStateDiagMatr out, qcomp
 
 
 template <int NumQubits>
-qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates) {
+qindex cpu_statevec_packAmpsIntoSubBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates, qindex sendInd) {
 
     assert_numQubitsMatchesQubitStatesAndTemplateParam(qubitInds.size(), qubitStates.size(), NumQubits);
 
@@ -224,15 +224,16 @@ qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, Const
     cpu_qcomp* amps   = getCpuQcompPtr(qureg.cpuAmps);
     cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer);
 
-    // each control qubit halves the needed iterations
+    // each constrained qubit halves the needed iterations
     qindex numIts = qureg.numAmpsPerNode / powerOf2(qubitInds.size());
 
-    // amplitudes are packed at an offset into the buffer
-    qindex offset = getSubBufferSendInd(qureg);
+    // amplitudes are packed contiguously from the caller's send offset, so that several
+    // disjoint subsets can occupy distinct slices of the buffer and be exchanged in one wave
+    qindex offset = sendInd;
 
     auto sortedQubitInds = util_getSorted(qubitInds);
     auto qubitStateMask  = util_getBitMask(qubitInds, qubitStates);
-    
+
     // use template param to compile-time unroll loop in insertBits()
     SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubitInds.size());
 
@@ -251,6 +252,14 @@ qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, Const
 }
 
 
+template <int NumQubits>
+qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates) {
+
+    // pack into the buffer's single default send region (which begins at half its capacity)
+    return cpu_statevec_packAmpsIntoSubBuffer<NumQubits>(qureg, qubitInds, qubitStates, getSubBufferSendInd(qureg));
+}
+
+
 qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2) {
     
     assert_bufferPackerGivenIncreasingQubits(qubit1, qubit2, qubit3);
@@ -282,17 +291,18 @@ qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu
 
 
 INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qindex, cpu_statevec_packAmpsIntoBuffer, (Qureg, ConstList64, ConstList64) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qindex, cpu_statevec_packAmpsIntoSubBuffer, (Qureg, ConstList64, ConstList64, qindex) )
 
 
 template <int NumQubits>
-void cpu_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates) {
+void cpu_statevec_unpackAmpsFromSubBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates, qindex recvInd) {
 
     assert_numQubitsMatchesQubitStatesAndTemplateParam(qubitInds.size(), qubitStates.size(), NumQubits);
 
-    // this is the inverse of cpu_statevec_packAmpsIntoBuffer; it scatters the received
-    // contiguous sub-buffer back into the strided local amplitudes where the given qubits
-    // are in the given states. It generalises anyCtrlSwap_subC to multiple constrained
-    // qubits, as needed by the fused multi-SWAP routine.
+    // this is the inverse of cpu_statevec_packAmpsIntoSubBuffer; it scatters a received
+    // contiguous sub-buffer (beginning at the caller's receive offset) back into the strided
+    // local amplitudes where the given qubits are in the given states. It generalises
+    // anyCtrlSwap_subC to multiple constrained qubits, as needed by the fused multi-SWAP routine.
 
     // use cpu_qcomp (in lieu of qcomp) even though no arithmetic happens below - just for consistency!
     cpu_qcomp* amps   = getCpuQcompPtr(qureg.cpuAmps);
@@ -301,8 +311,8 @@ void cpu_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubitInds, Const
     // each constrained qubit halves the number of received amps
     qindex numIts = qureg.numAmpsPerNode / powerOf2(qubitInds.size());
 
-    // received amplitudes begin at the buffer's receive offset
-    qindex offset = getBufferRecvInd();
+    // received amplitudes begin at the caller's receive offset
+    qindex offset = recvInd;
 
     auto sortedQubitInds = util_getSorted(qubitInds);
     auto qubitStateMask  = util_getBitMask(qubitInds, qubitStates);
@@ -322,7 +332,16 @@ void cpu_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubitInds, Const
 }
 
 
+template <int NumQubits>
+void cpu_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates) {
+
+    // unpack from the buffer's single default receive region (which begins at index zero)
+    cpu_statevec_unpackAmpsFromSubBuffer<NumQubits>(qureg, qubitInds, qubitStates, getBufferRecvInd());
+}
+
+
 INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_statevec_unpackAmpsFromBuffer, (Qureg, ConstList64, ConstList64) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_statevec_unpackAmpsFromSubBuffer, (Qureg, ConstList64, ConstList64, qindex) )
 
 
 
diff --git a/quest/src/cpu/cpu_subroutines.hpp b/quest/src/cpu/cpu_subroutines.hpp
index 5925ad5a4..0745d0a8c 100644
--- a/quest/src/cpu/cpu_subroutines.hpp
+++ b/quest/src/cpu/cpu_subroutines.hpp
@@ -45,8 +45,10 @@ void cpu_fullstatediagmatr_setElemsFromMultiVarFunc(FullStateDiagMatr out, qcomp
  */
 
 template <int NumQubits> qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates);
+template <int NumQubits> qindex cpu_statevec_packAmpsIntoSubBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates, qindex sendInd);
 
 template <int NumQubits> void cpu_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates);
+template <int NumQubits> void cpu_statevec_unpackAmpsFromSubBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates, qindex recvInd);
 
 qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2);
 
diff --git a/tests/unit/operations.cpp b/tests/unit/operations.cpp
index 80b75b9c2..22ae987bf 100644
--- a/tests/unit/operations.cpp
+++ b/tests/unit/operations.cpp
@@ -3020,4 +3020,57 @@ TEST_CASE( "rightapplyPauliStrSum", TEST_CATEGORY_MULT LABEL_MIXED_DEPLOY_TAG )
 }
 
 
+/*
+ * FUSED DISTRIBUTED MULTI-SWAP
+ *
+ * A focused check of the fused prefix-suffix multi-SWAP (localiser.cpp), which moves several
+ * prefix-qubit targets into the suffix in one batched exchange. Applying a random multi-qubit
+ * unitary on a target set whose upper qubits land in the prefix substate routes through that
+ * routine, exercising 2, 3 or 4 prefix targets at np = 4, 8, 16 (and the two-wave async batching
+ * those k force). Every cached deployment is compared against the same reference linear algebra:
+ * the distributed quregs run the fused exchange while the serial and multithreaded quregs run it
+ * communication-free, so agreement pins the fused, batched exchange to a swap-free reference. The
+ * generic dense-matrix tests already touch this path incidentally; this names and isolates it.
+ */
+
+TEST_CASE( "fused distributed multiSwap", TEST_CATEGORY_OPS ) {
+
+    int numQubits = getNumCachedQubits();
+    auto quregs = getCachedStatevecs();
+
+    for (int numTargs=2; numTargs<=5; numTargs++) {
+
+        if (numTargs > numQubits)
+            continue;
+
+        // the top numTargs qubits are prefix when the qureg spans at least 2^numTargs nodes; at
+        // fewer nodes only some are prefix and the routine simply fuses fewer swaps, still correct
+        vector<int> targs(numTargs);
+        for (int i=0; i<numTargs; i++)
+            targs[i] = numQubits - numTargs + i;
+
+        for (auto& [label, qureg]: quregs) {
+            if (getPow2(numTargs) > qureg.numAmpsPerNode) continue; // all targets must fit in the suffix to be localised
+            DYNAMIC_SECTION( "numTargs=" + std::to_string(numTargs) + LABEL_DELIMITER + label ) {
+
+                qmatrix refMatr = getRandomUnitary(numTargs);
+                CompMatr apiMatr = createCompMatr(numTargs);
+                setCompMatr(apiMatr, refMatr);
+
+                qvector ref = getZeroVector(getPow2(numQubits));
+                initDebugState(qureg);
+                setToDebugState(ref);
+
+                applyCompMatr(qureg, targs, apiMatr);
+                applyReferenceOperator(ref, {}, targs, refMatr);
+
+                REQUIRE_AGREE( qureg, ref );
+
+                destroyCompMatr(apiMatr);
+            }
+        }
+    }
+}
+
+
 /** @} (end defgroup) */