QuEST-Kit · zkasuran · Jun 8, 2026 · Jun 15, 2026
diff --git a/quest/src/comm/comm_routines.cpp b/quest/src/comm/comm_routines.cpp
@@ -21,6 +21,7 @@
 #include "quest/src/gpu/gpu_config.hpp"
 #include "quest/src/comm/comm_config.hpp"
 #include "quest/src/comm/comm_indices.hpp"
+#include "quest/src/comm/comm_routines.hpp"
 
 #if QUEST_COMPILE_MPI
     #include <mpi.h>
@@ -528,11 +529,56 @@ void comm_exchangeSubBuffers(Qureg qureg, qindex numAmps, int pairRank) {
 
     if (qureg.isGpuAccelerated)
         exchangeGpuSubBuffers(qureg, numAmps, pairRank);
-    else 
+    else
         exchangeArrays(&qureg.cpuCommBuffer[sendInd], &qureg.cpuCommBuffer[recvInd], numAmps, pairRank);
 }
 
 
+void comm_exchangeSubBufferChunks(Qureg qureg, const vector<CommChunk>& chunks) {
+#if QUEST_COMPILE_MPI
+
+    assert_commQuregIsDistributed(qureg);
+
+    // exchange several disjoint sub-buffer chunks, each with its own pair rank, under a single
+    // wait. This collapses the up-to (2^k - 1) blocking exchanges of the fused multi-SWAP into one
+    // asynchronous wave (the caller bounds a wave's chunks to fit the send and receive buffer halves).
+    // Each chunk targets a DISTINCT pair rank, so (source rank, tag) already identifies every message
+    // and no per-partner tag offset is needed; we reuse the per-message tag = m exactly as
+    // exchangeArrays does. Async-with-final-wait as per arxiv.org/abs/2308.07402. The fused routine is
+    // CPU-only (it restricts itself to non-GPU quregs), so only the CPU buffer is exchanged here.
+
+    MPI_Comm mpiComm = comm_getMpiComm();
+
+    // validate every chunk and total the messages, to size the request list up-front
+    qindex numRequests = 0;
+    for (const CommChunk& chunk : chunks) {
+        assert_commBoundsAreValid(qureg, chunk.sendInd, chunk.recvInd, chunk.numAmps);
+        assert_bufferSendRecvDoesNotOverlap(chunk.sendInd, chunk.recvInd, chunk.numAmps);
+        assert_pairRankIsDistinct(qureg, chunk.pairRank);
+        numRequests += 2 * dividePow2PayloadIntoMessages(chunk.numAmps)[1];
+    }
+
+    vector<MPI_Request> requests(numRequests, MPI_REQUEST_NULL);
+
+    // post every chunk's receives and sends, then wait once for the whole wave
+    qindex r = 0;
+    for (const CommChunk& chunk : chunks) {
+        auto [messageSize, numMessages] = dividePow2PayloadIntoMessages(chunk.numAmps);
+        for (qindex m=0; m<numMessages; m++) {
+            int tag = static_cast<int>(m); // gauranteed int, but m*messageSize needs qindex
+            MPI_Irecv(&qureg.cpuCommBuffer[chunk.recvInd + m*messageSize], messageSize, MPI_QCOMP, chunk.pairRank, tag, mpiComm, &requests[r++]);
+            MPI_Isend(&qureg.cpuCommBuffer[chunk.sendInd + m*messageSize], messageSize, MPI_QCOMP, chunk.pairRank, tag, mpiComm, &requests[r++]);
+        }
+    }
+
+    MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE);
+
+#else
+    error_commButEnvNotDistributed();
+#endif
+}
+
+
 void comm_asynchSendSubBuffer(Qureg qureg, qindex numElems, int pairRank) {
 
     auto [sendInd, recvInd] = getSubBufferSendRecvInds(qureg);

diff --git a/quest/src/comm/comm_routines.hpp b/quest/src/comm/comm_routines.hpp
@@ -31,6 +31,16 @@ void comm_exchangeAmpsToBuffers(Qureg qureg, int pairRank);
 
 void comm_exchangeSubBuffers(Qureg qureg, qindex numAmpsAndRecvInd, int pairRank);
 
+// one disjoint sub-buffer chunk to exchange with a single pair rank, used by comm_exchangeSubBufferChunks
+struct CommChunk {
+    qindex sendInd; // buffer index where this chunk's amps to send begin
+    qindex recvInd; // buffer index where this chunk's received amps are written
+    qindex numAmps; // number of amps exchanged (a power of two)
+    int pairRank;   // the partner rank for this chunk (distinct from this node)
+};
+
+void comm_exchangeSubBufferChunks(Qureg qureg, const vector<CommChunk>& chunks);
+
 void comm_asynchSendSubBuffer(Qureg qureg, qindex numElems, int pairRank);
 
 void comm_receiveArrayToBuffer(Qureg qureg, qindex numElems, int pairRank);

diff --git a/quest/src/core/accelerator.cpp b/quest/src/core/accelerator.cpp
@@ -232,12 +232,52 @@ qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstL
 
     // note qubits may incidentally be ctrls or targs; it doesn't matter
     GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_packAmpsIntoBuffer, qureg, qubits.size() );
-    
+
     // return the number of packed amps, for caller convenience
     return func(qureg, qubits, qubitStates);
 }
 
 
+void accel_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates) {
+
+    // inverse of packing; scatters received sub-buffer into strided local amps where
+    // the given qubits are in the given states (used by the fused multi-SWAP routine).
+    // only the CPU path is dispatched; the fused routine restricts itself to non-GPU
+    // quregs (issue #595 notes the OpenMP logic alone is sufficient), so no GPU kernel
+    // is needed and the GPU build is left untouched
+    if (qubitStates.empty())
+        error_noCtrlsGivenToBufferPacker();
+
+    GET_FUNC_OPTIMISED_FOR_ONE_PARAM( func, cpu_statevec_unpackAmpsFromBuffer, qubits.size() );
+    func(qureg, qubits, qubitStates);
+}
+
+
+qindex accel_statevec_packAmpsIntoSubBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex sendInd) {
+
+    // as accel_statevec_packAmpsIntoBuffer, but packs into an explicit send offset so the
+    // fused multi-SWAP can lay several subsets into one buffer and exchange them in a wave.
+    // CPU-only, like the unpacker, since the fused routine restricts itself to non-GPU quregs
+    if (qubitStates.empty())
+        error_noCtrlsGivenToBufferPacker();
+
+    GET_FUNC_OPTIMISED_FOR_ONE_PARAM( func, cpu_statevec_packAmpsIntoSubBuffer, qubits.size() );
+    return func(qureg, qubits, qubitStates, sendInd);
+}
+
+
+void accel_statevec_unpackAmpsFromSubBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex recvInd) {
+
+    // inverse of accel_statevec_packAmpsIntoSubBuffer; scatters a sub-buffer received at an
+    // explicit offset back into the strided local amps. CPU-only for the same reason as above
+    if (qubitStates.empty())
+        error_noCtrlsGivenToBufferPacker();
+
+    GET_FUNC_OPTIMISED_FOR_ONE_PARAM( func, cpu_statevec_unpackAmpsFromSubBuffer, qubits.size() );
+    func(qureg, qubits, qubitStates, recvInd);
+}
+
+
 qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2) {
 
     return (qureg.isGpuAccelerated)?

diff --git a/quest/src/core/accelerator.hpp b/quest/src/core/accelerator.hpp
@@ -170,6 +170,10 @@ void accel_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, PauliS
  */
 
 qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates);
+qindex accel_statevec_packAmpsIntoSubBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex sendInd);
+
+void accel_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates);
+void accel_statevec_unpackAmpsFromSubBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex recvInd);
 
 qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2);
 

diff --git a/quest/src/core/localiser.cpp b/quest/src/core/localiser.cpp
@@ -25,6 +25,7 @@
 #include "quest/src/core/accelerator.hpp"
 #include "quest/src/comm/comm_config.hpp"
 #include "quest/src/comm/comm_routines.hpp"
+#include "quest/src/comm/comm_indices.hpp"
 #include "quest/src/cpu/cpu_config.hpp"
 #include "quest/src/gpu/gpu_config.hpp"
 
@@ -900,24 +901,115 @@ void anyCtrlMultiSwapBetweenPrefixAndSuffix(Qureg qureg, ConstList64 ctrls, Cons
     // the SWAPs act on unique qubit pairs and so commute.
 
     /// @todo
-    ///   - the sequence of pair-wise full-swaps should be more efficient as a
-    ///     "single" sequence of smaller messages sending amps directly to their
-    ///     final destination node. This could use a new "multiSwap" function.
-    ///   - if the user has compiled cuQuantum, and Qureg is GPU-accelerated, the
-    ///     multiSwap function should use custatevecSwapIndexBits() if local,
-    ///     or custatevecDistIndexBitSwapSchedulerSetIndexBitSwaps() if distributed,
+    ///   - if the user has compiled cuQuantum, and Qureg is GPU-accelerated, this
+    ///     routine could use custatevecSwapIndexBits() if local, or
+    ///     custatevecDistIndexBitSwapSchedulerSetIndexBitSwaps() if distributed,
     ///     although the latter requires substantially more work like setting up
     ///     a communicator which may be inelegant alongside our own distribution scheme.
 
-    // perform necessary swaps to move all targets into suffix, each of which invokes communication
+    // collect the non-trivial pairs; each swaps a suffix qubit with a prefix qubit
+    auto suffixTargs = lists_getEmptyList64();
+    auto prefixTargs = lists_getEmptyList64();
     for (size_t i=0; i<targsA.size(); i++) {
-
         if (targsA[i] == targsB[i])
             continue;
+        suffixTargs.push_back(std::min(targsA[i], targsB[i]));
+        prefixTargs.push_back(std::max(targsA[i], targsB[i]));
+    }
+    int numSwaps = suffixTargs.size();
+    if (numSwaps == 0)
+        return;
+
+    // the fused routine below targets the uncontrolled, non-GPU case which every internal
+    // caller currently uses. A controlled multi-SWAP, or a GPU-accelerated Qureg, falls back
+    // to the per-swap routine (issue #595 notes the OpenMP logic alone is sufficient, so the
+    // GPU path is left unchanged)
+    if (!ctrls.empty() || qureg.isGpuAccelerated) {
+        for (int i=0; i<numSwaps; i++)
+            anyCtrlSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, suffixTargs[i], prefixTargs[i]);
+        return;
+    }
+
+    // FUSED multi-SWAP: rather than performing each prefix<->suffix SWAP in turn (which
+    // wastefully relays an amplitude through intermediate nodes before its final node),
+    // we send each amplitude directly to its destination node in a single pass. The
+    // numSwaps disjoint SWAPs compose into one permutation of qubit bits, so an amplitude
+    // of this node moves to the rank obtained by overwriting each prefix-target rank-bit
+    // with the value of its partnered suffix-target bit. We enumerate the (up to)
+    // 2^numSwaps - 1 destination nodes (one per non-empty subset of prefix targets whose
+    // partnered suffix bit disagrees with this node's rank bit) and, for each, pack +
+    // exchange + unpack only the amplitudes bound there. The move is an involution
+    // between paired nodes, so the packed and unpacked amplitudes occupy the same local
+    // slots. This composed distributed index-bit swap is that of mpiQulacs (arXiv:2203.16044)
+    // and cuStateVec's custatevecDistIndexBitSwapScheduler; the pairwise amplitude exchange
+    // follows arXiv:2311.01512 Sec IV.
+
+    std::vector<int> prefBits(numSwaps);
+    std::vector<int> rankBits(numSwaps);
+    for (int i=0; i<numSwaps; i++) {
+        prefBits[i] = util_getPrefixInd(prefixTargs[i], qureg);
+        rankBits[i] = getBit(qureg.rank, prefBits[i]);
+    }
 
-        int suffixTarg = std::min(targsA[i], targsB[i]);
-        int prefixTarg = std::max(targsA[i], targsB[i]);
-        anyCtrlSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, suffixTarg, prefixTarg);
+    // subset 0 are the amplitudes that do not move (all suffix bits already match the
+    // rank bits), so we skip it and communicate only the other subsets. Every communicating
+    // subset packs the same number of amplitudes (one per local amp whose suffix-target bits
+    // match the subset pattern)
+    qindex numSubsets = powerOf2(numSwaps);
+    qindex numPacked  = qureg.numAmpsPerNode / numSubsets;
+
+    // rather than one blocking exchange per subset (up to 2^numSwaps - 1 sequential syncs), we
+    // pack each subset into a distinct slice of the buffer's send half and exchange a whole wave
+    // of subsets under a single wait. Only half the buffer can send, so a wave holds
+    // (numAmpsPerNode/2)/numPacked = 2^(numSwaps-1) subsets, and the 2^numSwaps - 1 communicating
+    // subsets need at most two waves. The same total amplitudes cross the network, with the
+    // synchronisation count cut from 2^numSwaps - 1 down to at most two.
+    qindex sendBase = getSubBufferSendInd(qureg);
+    qindex recvBase = getBufferRecvInd();
+    qindex perWave  = (qureg.numAmpsPerNode / 2) / numPacked;
+
+    for (qindex first=1; first<numSubsets; first+=perWave) {
+
+        qindex last = std::min(first + perWave, numSubsets); // exclusive
+
+        // pack every subset of this wave into its own buffer slice, remembering the states so the
+        // matching received slice can be scattered back after the exchange
+        std::vector<CommChunk> chunks;
+        std::vector<List64> waveStates;
+        chunks.reserve(last - first);
+        waveStates.reserve(last - first);
+
+        for (qindex sub=first; sub<last; sub++) {
+
+            // the destination node flips this node's rank bits for the targeted subset, and
+            // the to-be-sent amplitudes are those whose suffix-target bits match the pattern
+            auto states = lists_getEmptyList64();
+            int pairRank = qureg.rank;
+            for (int i=0; i<numSwaps; i++) {
+                int inSubset = getBit(sub, i);
+                states.push_back(inSubset ? !rankBits[i] : rankBits[i]);
+                if (inSubset)
+                    pairRank = static_cast<int>(flipBit(pairRank, prefBits[i]));
+            }
+
+            qindex slot    = sub - first;
+            qindex sendInd = sendBase + slot * numPacked;
+            qindex recvInd = recvBase + slot * numPacked;
+
+            accel_statevec_packAmpsIntoSubBuffer(qureg, suffixTargs, states, sendInd);
+            chunks.push_back({sendInd, recvInd, numPacked, pairRank});
+            waveStates.push_back(states);
+        }
+
+        // exchange the whole wave with a single wait, then scatter each received slice back into
+        // the strided local amplitudes it came from
+        comm_exchangeSubBufferChunks(qureg, chunks);
+
+        for (qindex sub=first; sub<last; sub++) {
+            qindex slot    = sub - first;
+            qindex recvInd = recvBase + slot * numPacked;
+            accel_statevec_unpackAmpsFromSubBuffer(qureg, suffixTargs, waveStates[slot], recvInd);
+        }
     }
 }
 

diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp
@@ -216,23 +216,24 @@ void cpu_fullstatediagmatr_setElemsFromMultiVarFunc(FullStateDiagMatr out, qcomp
 
 
 template <int NumQubits>
-qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates) {
+qindex cpu_statevec_packAmpsIntoSubBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates, qindex sendInd) {
 
     assert_numQubitsMatchesQubitStatesAndTemplateParam(qubitInds.size(), qubitStates.size(), NumQubits);
 
     // use cpu_qcomp (in lieu of qcomp) even though no arithmetic happens below - just for consistency!
     cpu_qcomp* amps   = getCpuQcompPtr(qureg.cpuAmps);
     cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer);
 
-    // each control qubit halves the needed iterations
+    // each constrained qubit halves the needed iterations
     qindex numIts = qureg.numAmpsPerNode / powerOf2(qubitInds.size());
 
-    // amplitudes are packed at an offset into the buffer
-    qindex offset = getSubBufferSendInd(qureg);
+    // amplitudes are packed contiguously from the caller's send offset, so that several
+    // disjoint subsets can occupy distinct slices of the buffer and be exchanged in one wave
+    qindex offset = sendInd;
 
     auto sortedQubitInds = util_getSorted(qubitInds);
     auto qubitStateMask  = util_getBitMask(qubitInds, qubitStates);
-    
+
     // use template param to compile-time unroll loop in insertBits()
     SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubitInds.size());
 
@@ -251,6 +252,14 @@ qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, Const
 }
 
 
+template <int NumQubits>
+qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates) {
+
+    // pack into the buffer's single default send region (which begins at half its capacity)
+    return cpu_statevec_packAmpsIntoSubBuffer<NumQubits>(qureg, qubitInds, qubitStates, getSubBufferSendInd(qureg));
+}
+
+
 qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2) {
 
     assert_bufferPackerGivenIncreasingQubits(qubit1, qubit2, qubit3);
@@ -282,10 +291,61 @@ qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu
 
 
 INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qindex, cpu_statevec_packAmpsIntoBuffer, (Qureg, ConstList64, ConstList64) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qindex, cpu_statevec_packAmpsIntoSubBuffer, (Qureg, ConstList64, ConstList64, qindex) )
 
 
+template <int NumQubits>
+void cpu_statevec_unpackAmpsFromSubBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates, qindex recvInd) {
+
+    assert_numQubitsMatchesQubitStatesAndTemplateParam(qubitInds.size(), qubitStates.size(), NumQubits);
+
+    // this is the inverse of cpu_statevec_packAmpsIntoSubBuffer; it scatters a received
+    // contiguous sub-buffer (beginning at the caller's receive offset) back into the strided
+    // local amplitudes where the given qubits are in the given states. It generalises
+    // anyCtrlSwap_subC to multiple constrained qubits, as needed by the fused multi-SWAP routine.
+
+    // use cpu_qcomp (in lieu of qcomp) even though no arithmetic happens below - just for consistency!
+    cpu_qcomp* amps   = getCpuQcompPtr(qureg.cpuAmps);
+    cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer);
+
+    // each constrained qubit halves the number of received amps
+    qindex numIts = qureg.numAmpsPerNode / powerOf2(qubitInds.size());
+
+    // received amplitudes begin at the caller's receive offset
+    qindex offset = recvInd;
 
-/* 
+    auto sortedQubitInds = util_getSorted(qubitInds);
+    auto qubitStateMask  = util_getBitMask(qubitInds, qubitStates);
+
+    // use template param to compile-time unroll loop in insertBits()
+    SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubitInds.size());
+
+    #pragma omp parallel for if(qureg.isMultithreaded)
+    for (qindex n=0; n<numIts; n++) {
+
+        // i = nth local index where qubits are in the specified states
+        qindex i = insertBitsWithMaskedValues(n, sortedQubitInds.data(), numBits, qubitStateMask);
+
+        // scatter the contiguous sub-buffer among the strided local amplitudes
+        amps[i] = buffer[offset + n];
+    }
+}
+
+
+template <int NumQubits>
+void cpu_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates) {
+
+    // unpack from the buffer's single default receive region (which begins at index zero)
+    cpu_statevec_unpackAmpsFromSubBuffer<NumQubits>(qureg, qubitInds, qubitStates, getBufferRecvInd());
+}
+
+
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_statevec_unpackAmpsFromBuffer, (Qureg, ConstList64, ConstList64) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_statevec_unpackAmpsFromSubBuffer, (Qureg, ConstList64, ConstList64, qindex) )
+
+
+
+/*
  * SWAPS
  */