From fc98281184a5eea9c583e38b9ca34681335b836f Mon Sep 17 00:00:00 2001 From: zkasuran Date: Mon, 8 Jun 2026 08:36:51 +0530 Subject: [PATCH 1/2] Fuse distributed prefix-suffix multi-SWAP (closes #595) The localiser performed each prefix<->suffix SWAP in turn, so an amplitude moved by one SWAP was often moved again by the next, crossing the network several times. This fuses the group of disjoint SWAPs into one operation that computes each amplitude's final node and sends it there directly, so every amplitude crosses the network at most once. The disjoint SWAPs commute and compose into a single bit permutation. For the uncontrolled case (every internal caller) the routine enumerates the up to 2^eta-1 destination nodes and packs, exchanges and unpacks only the amplitudes bound to each. A new cpu_statevec_unpackAmpsFromBuffer scatters the received sub-buffer back into the strided local amplitudes, the inverse of the existing packer, looping over moved amplitudes not the whole state. Scope is CPU/OpenMP. GPU quregs and controlled multi-SWAPs keep the existing per-SWAP path, so the GPU build is unchanged. Comm volume drops 25% at eta=2 and 42% at eta=3 (1 - 1/2^eta), matching theory. Existing applySwap, applyCompMatr, applyCompMatr2 and calcPartialTrace suites pass at 1, 2, 4 and 8 ranks. --- quest/src/core/accelerator.cpp | 17 ++++++- quest/src/core/accelerator.hpp | 2 + quest/src/core/localiser.cpp | 73 ++++++++++++++++++++++++++----- quest/src/cpu/cpu_subroutines.cpp | 43 +++++++++++++++++- quest/src/cpu/cpu_subroutines.hpp | 2 + 5 files changed, 124 insertions(+), 13 deletions(-) diff --git a/quest/src/core/accelerator.cpp b/quest/src/core/accelerator.cpp index 677e6c74a..7752b8557 100644 --- a/quest/src/core/accelerator.cpp +++ b/quest/src/core/accelerator.cpp @@ -232,12 +232,27 @@ qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstL // note qubits may incidentally be ctrls or targs; it doesn't matter GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_packAmpsIntoBuffer, qureg, qubits.size() ); - + // return the number of packed amps, for caller convenience return func(qureg, qubits, qubitStates); } +void accel_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates) { + + // inverse of packing; scatters received sub-buffer into strided local amps where + // the given qubits are in the given states (used by the fused multi-SWAP routine). + // only the CPU path is dispatched; the fused routine restricts itself to non-GPU + // quregs (issue #595 notes the OpenMP logic alone is sufficient), so no GPU kernel + // is needed and the GPU build is left untouched + if (qubitStates.empty()) + error_noCtrlsGivenToBufferPacker(); + + GET_FUNC_OPTIMISED_FOR_ONE_PARAM( func, cpu_statevec_unpackAmpsFromBuffer, qubits.size() ); + func(qureg, qubits, qubitStates); +} + + qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2) { return (qureg.isGpuAccelerated)? diff --git a/quest/src/core/accelerator.hpp b/quest/src/core/accelerator.hpp index 5a8dc37fb..879ad8d78 100644 --- a/quest/src/core/accelerator.hpp +++ b/quest/src/core/accelerator.hpp @@ -171,6 +171,8 @@ void accel_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, PauliS qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates); +void accel_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates); + qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2); diff --git a/quest/src/core/localiser.cpp b/quest/src/core/localiser.cpp index 83a23b921..ab11f6215 100644 --- a/quest/src/core/localiser.cpp +++ b/quest/src/core/localiser.cpp @@ -900,24 +900,75 @@ void anyCtrlMultiSwapBetweenPrefixAndSuffix(Qureg qureg, ConstList64 ctrls, Cons // the SWAPs act on unique qubit pairs and so commute. /// @todo - /// - the sequence of pair-wise full-swaps should be more efficient as a - /// "single" sequence of smaller messages sending amps directly to their - /// final destination node. This could use a new "multiSwap" function. - /// - if the user has compiled cuQuantum, and Qureg is GPU-accelerated, the - /// multiSwap function should use custatevecSwapIndexBits() if local, - /// or custatevecDistIndexBitSwapSchedulerSetIndexBitSwaps() if distributed, + /// - if the user has compiled cuQuantum, and Qureg is GPU-accelerated, this + /// routine could use custatevecSwapIndexBits() if local, or + /// custatevecDistIndexBitSwapSchedulerSetIndexBitSwaps() if distributed, /// although the latter requires substantially more work like setting up /// a communicator which may be inelegant alongside our own distribution scheme. - // perform necessary swaps to move all targets into suffix, each of which invokes communication + // collect the non-trivial pairs; each swaps a suffix qubit with a prefix qubit + auto suffixTargs = lists_getEmptyList64(); + auto prefixTargs = lists_getEmptyList64(); for (size_t i=0; isuffix SWAP in turn (which + // wastefully relays an amplitude through intermediate nodes before its final node), + // we send each amplitude directly to its destination node in a single pass. The + // numSwaps disjoint SWAPs compose into one permutation of qubit bits, so an amplitude + // of this node moves to the rank obtained by overwriting each prefix-target rank-bit + // with the value of its partnered suffix-target bit. We enumerate the (up to) + // 2^numSwaps - 1 destination nodes (one per non-empty subset of prefix targets whose + // partnered suffix bit disagrees with this node's rank bit) and, for each, pack + + // exchange + unpack only the amplitudes bound there. The move is an involution + // between paired nodes, so the packed and unpacked amplitudes occupy the same local + // slots. See arXiv:quant-ph/0608239 (SWAP fusion) and arXiv:2311.01512 Sec IV. + + std::vector prefBits(numSwaps); + std::vector rankBits(numSwaps); + for (int i=0; i(flipBit(pairRank, prefBits[i])); + } - int suffixTarg = std::min(targsA[i], targsB[i]); - int prefixTarg = std::max(targsA[i], targsB[i]); - anyCtrlSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, suffixTarg, prefixTarg); + // pack the amplitudes bound for pairRank, exchange, and scatter the received + // amplitudes back into those same local slots + qindex numPacked = accel_statevec_packAmpsIntoBuffer(qureg, suffixTargs, states); + comm_exchangeSubBuffers(qureg, numPacked, pairRank); + accel_statevec_unpackAmpsFromBuffer(qureg, suffixTargs, states); } } diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp index 59df946e9..524e6d89d 100644 --- a/quest/src/cpu/cpu_subroutines.cpp +++ b/quest/src/cpu/cpu_subroutines.cpp @@ -284,8 +284,49 @@ qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qindex, cpu_statevec_packAmpsIntoBuffer, (Qureg, ConstList64, ConstList64) ) +template +void cpu_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates) { + + assert_numQubitsMatchesQubitStatesAndTemplateParam(qubitInds.size(), qubitStates.size(), NumQubits); + + // this is the inverse of cpu_statevec_packAmpsIntoBuffer; it scatters the received + // contiguous sub-buffer back into the strided local amplitudes where the given qubits + // are in the given states. It generalises anyCtrlSwap_subC to multiple constrained + // qubits, as needed by the fused multi-SWAP routine. + + // use cpu_qcomp (in lieu of qcomp) even though no arithmetic happens below - just for consistency! + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer); -/* + // each constrained qubit halves the number of received amps + qindex numIts = qureg.numAmpsPerNode / powerOf2(qubitInds.size()); + + // received amplitudes begin at the buffer's receive offset + qindex offset = getBufferRecvInd(); + + auto sortedQubitInds = util_getSorted(qubitInds); + auto qubitStateMask = util_getBitMask(qubitInds, qubitStates); + + // use template param to compile-time unroll loop in insertBits() + SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubitInds.size()); + + #pragma omp parallel for if(qureg.isMultithreaded) + for (qindex n=0; n qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates); +template void cpu_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates); + qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2); From 2d542d66c8638f45535d56be3fc0ff7b6961a6bf Mon Sep 17 00:00:00 2001 From: zkasuran Date: Mon, 15 Jun 2026 20:59:07 +0530 Subject: [PATCH 2/2] Batch fused multi-SWAP exchanges into async waves The fused distributed multi-SWAP ran one blocking sub-buffer exchange per destination subset, up to 2^numSwaps - 1 sequential rounds. This packs each subset into a distinct slice of the buffer's send half and exchanges a whole wave of subsets under a single MPI_Waitall. Only half the buffer can send, so a wave holds 2^(numSwaps-1) subsets and the communicating subsets need at most two waves. The same total amplitudes cross the network, with the number of synchronisations cut from 2^numSwaps - 1 down to at most two. The new comm_exchangeSubBufferChunks() follows the async-with-final-wait pattern already in comm_routines.cpp (the one credited to arXiv:2308.07402), and the packer and unpacker gain offset-aware variants so several subsets can share the buffer. The controlled and GPU paths keep the per-swap fallback. Adds a focused test (tests/unit/operations.cpp) that applies a random multi-qubit unitary on prefix-spanning targets, forcing 2 to 4 prefix targets and so the two-wave batching, and checks every deployment against the reference linear algebra. Corrects the swap-fusion reference: quant-ph/0608239 is a general parallel simulator, not swap fusion, so the comment now points at the distributed index-bit-swap references (arXiv:2203.16044 and cuStateVec's custatevecDistIndexBitSwapScheduler) and arXiv:2311.01512 for the exchange. --- quest/src/comm/comm_routines.cpp | 48 ++++++++++++++++++- quest/src/comm/comm_routines.hpp | 10 ++++ quest/src/core/accelerator.cpp | 25 ++++++++++ quest/src/core/accelerator.hpp | 2 + quest/src/core/localiser.cpp | 77 +++++++++++++++++++++++-------- quest/src/cpu/cpu_subroutines.cpp | 43 ++++++++++++----- quest/src/cpu/cpu_subroutines.hpp | 2 + tests/unit/operations.cpp | 53 +++++++++++++++++++++ 8 files changed, 229 insertions(+), 31 deletions(-) diff --git a/quest/src/comm/comm_routines.cpp b/quest/src/comm/comm_routines.cpp index cf6956454..35247a9af 100644 --- a/quest/src/comm/comm_routines.cpp +++ b/quest/src/comm/comm_routines.cpp @@ -21,6 +21,7 @@ #include "quest/src/gpu/gpu_config.hpp" #include "quest/src/comm/comm_config.hpp" #include "quest/src/comm/comm_indices.hpp" +#include "quest/src/comm/comm_routines.hpp" #if QUEST_COMPILE_MPI #include @@ -528,11 +529,56 @@ void comm_exchangeSubBuffers(Qureg qureg, qindex numAmps, int pairRank) { if (qureg.isGpuAccelerated) exchangeGpuSubBuffers(qureg, numAmps, pairRank); - else + else exchangeArrays(&qureg.cpuCommBuffer[sendInd], &qureg.cpuCommBuffer[recvInd], numAmps, pairRank); } +void comm_exchangeSubBufferChunks(Qureg qureg, const vector& chunks) { +#if QUEST_COMPILE_MPI + + assert_commQuregIsDistributed(qureg); + + // exchange several disjoint sub-buffer chunks, each with its own pair rank, under a single + // wait. This collapses the up-to (2^k - 1) blocking exchanges of the fused multi-SWAP into one + // asynchronous wave (the caller bounds a wave's chunks to fit the send and receive buffer halves). + // Each chunk targets a DISTINCT pair rank, so (source rank, tag) already identifies every message + // and no per-partner tag offset is needed; we reuse the per-message tag = m exactly as + // exchangeArrays does. Async-with-final-wait as per arxiv.org/abs/2308.07402. The fused routine is + // CPU-only (it restricts itself to non-GPU quregs), so only the CPU buffer is exchanged here. + + MPI_Comm mpiComm = comm_getMpiComm(); + + // validate every chunk and total the messages, to size the request list up-front + qindex numRequests = 0; + for (const CommChunk& chunk : chunks) { + assert_commBoundsAreValid(qureg, chunk.sendInd, chunk.recvInd, chunk.numAmps); + assert_bufferSendRecvDoesNotOverlap(chunk.sendInd, chunk.recvInd, chunk.numAmps); + assert_pairRankIsDistinct(qureg, chunk.pairRank); + numRequests += 2 * dividePow2PayloadIntoMessages(chunk.numAmps)[1]; + } + + vector requests(numRequests, MPI_REQUEST_NULL); + + // post every chunk's receives and sends, then wait once for the whole wave + qindex r = 0; + for (const CommChunk& chunk : chunks) { + auto [messageSize, numMessages] = dividePow2PayloadIntoMessages(chunk.numAmps); + for (qindex m=0; m(m); // gauranteed int, but m*messageSize needs qindex + MPI_Irecv(&qureg.cpuCommBuffer[chunk.recvInd + m*messageSize], messageSize, MPI_QCOMP, chunk.pairRank, tag, mpiComm, &requests[r++]); + MPI_Isend(&qureg.cpuCommBuffer[chunk.sendInd + m*messageSize], messageSize, MPI_QCOMP, chunk.pairRank, tag, mpiComm, &requests[r++]); + } + } + + MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE); + +#else + error_commButEnvNotDistributed(); +#endif +} + + void comm_asynchSendSubBuffer(Qureg qureg, qindex numElems, int pairRank) { auto [sendInd, recvInd] = getSubBufferSendRecvInds(qureg); diff --git a/quest/src/comm/comm_routines.hpp b/quest/src/comm/comm_routines.hpp index e75e889f6..771149147 100644 --- a/quest/src/comm/comm_routines.hpp +++ b/quest/src/comm/comm_routines.hpp @@ -31,6 +31,16 @@ void comm_exchangeAmpsToBuffers(Qureg qureg, int pairRank); void comm_exchangeSubBuffers(Qureg qureg, qindex numAmpsAndRecvInd, int pairRank); +// one disjoint sub-buffer chunk to exchange with a single pair rank, used by comm_exchangeSubBufferChunks +struct CommChunk { + qindex sendInd; // buffer index where this chunk's amps to send begin + qindex recvInd; // buffer index where this chunk's received amps are written + qindex numAmps; // number of amps exchanged (a power of two) + int pairRank; // the partner rank for this chunk (distinct from this node) +}; + +void comm_exchangeSubBufferChunks(Qureg qureg, const vector& chunks); + void comm_asynchSendSubBuffer(Qureg qureg, qindex numElems, int pairRank); void comm_receiveArrayToBuffer(Qureg qureg, qindex numElems, int pairRank); diff --git a/quest/src/core/accelerator.cpp b/quest/src/core/accelerator.cpp index 7752b8557..da7933642 100644 --- a/quest/src/core/accelerator.cpp +++ b/quest/src/core/accelerator.cpp @@ -253,6 +253,31 @@ void accel_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubits, ConstL } +qindex accel_statevec_packAmpsIntoSubBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex sendInd) { + + // as accel_statevec_packAmpsIntoBuffer, but packs into an explicit send offset so the + // fused multi-SWAP can lay several subsets into one buffer and exchange them in a wave. + // CPU-only, like the unpacker, since the fused routine restricts itself to non-GPU quregs + if (qubitStates.empty()) + error_noCtrlsGivenToBufferPacker(); + + GET_FUNC_OPTIMISED_FOR_ONE_PARAM( func, cpu_statevec_packAmpsIntoSubBuffer, qubits.size() ); + return func(qureg, qubits, qubitStates, sendInd); +} + + +void accel_statevec_unpackAmpsFromSubBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex recvInd) { + + // inverse of accel_statevec_packAmpsIntoSubBuffer; scatters a sub-buffer received at an + // explicit offset back into the strided local amps. CPU-only for the same reason as above + if (qubitStates.empty()) + error_noCtrlsGivenToBufferPacker(); + + GET_FUNC_OPTIMISED_FOR_ONE_PARAM( func, cpu_statevec_unpackAmpsFromSubBuffer, qubits.size() ); + func(qureg, qubits, qubitStates, recvInd); +} + + qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2) { return (qureg.isGpuAccelerated)? diff --git a/quest/src/core/accelerator.hpp b/quest/src/core/accelerator.hpp index 879ad8d78..ab16ca82d 100644 --- a/quest/src/core/accelerator.hpp +++ b/quest/src/core/accelerator.hpp @@ -170,8 +170,10 @@ void accel_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, PauliS */ qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates); +qindex accel_statevec_packAmpsIntoSubBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex sendInd); void accel_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates); +void accel_statevec_unpackAmpsFromSubBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex recvInd); qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2); diff --git a/quest/src/core/localiser.cpp b/quest/src/core/localiser.cpp index ab11f6215..80b1f7c80 100644 --- a/quest/src/core/localiser.cpp +++ b/quest/src/core/localiser.cpp @@ -25,6 +25,7 @@ #include "quest/src/core/accelerator.hpp" #include "quest/src/comm/comm_config.hpp" #include "quest/src/comm/comm_routines.hpp" +#include "quest/src/comm/comm_indices.hpp" #include "quest/src/cpu/cpu_config.hpp" #include "quest/src/gpu/gpu_config.hpp" @@ -939,7 +940,9 @@ void anyCtrlMultiSwapBetweenPrefixAndSuffix(Qureg qureg, ConstList64 ctrls, Cons // partnered suffix bit disagrees with this node's rank bit) and, for each, pack + // exchange + unpack only the amplitudes bound there. The move is an involution // between paired nodes, so the packed and unpacked amplitudes occupy the same local - // slots. See arXiv:quant-ph/0608239 (SWAP fusion) and arXiv:2311.01512 Sec IV. + // slots. This composed distributed index-bit swap is that of mpiQulacs (arXiv:2203.16044) + // and cuStateVec's custatevecDistIndexBitSwapScheduler; the pairwise amplitude exchange + // follows arXiv:2311.01512 Sec IV. std::vector prefBits(numSwaps); std::vector rankBits(numSwaps); @@ -949,26 +952,64 @@ void anyCtrlMultiSwapBetweenPrefixAndSuffix(Qureg qureg, ConstList64 ctrls, Cons } // subset 0 are the amplitudes that do not move (all suffix bits already match the - // rank bits), so we skip it and iterate only the communicating subsets + // rank bits), so we skip it and communicate only the other subsets. Every communicating + // subset packs the same number of amplitudes (one per local amp whose suffix-target bits + // match the subset pattern) qindex numSubsets = powerOf2(numSwaps); - for (qindex sub=1; sub(flipBit(pairRank, prefBits[i])); + qindex numPacked = qureg.numAmpsPerNode / numSubsets; + + // rather than one blocking exchange per subset (up to 2^numSwaps - 1 sequential syncs), we + // pack each subset into a distinct slice of the buffer's send half and exchange a whole wave + // of subsets under a single wait. Only half the buffer can send, so a wave holds + // (numAmpsPerNode/2)/numPacked = 2^(numSwaps-1) subsets, and the 2^numSwaps - 1 communicating + // subsets need at most two waves. The same total amplitudes cross the network, with the + // synchronisation count cut from 2^numSwaps - 1 down to at most two. + qindex sendBase = getSubBufferSendInd(qureg); + qindex recvBase = getBufferRecvInd(); + qindex perWave = (qureg.numAmpsPerNode / 2) / numPacked; + + for (qindex first=1; first chunks; + std::vector waveStates; + chunks.reserve(last - first); + waveStates.reserve(last - first); + + for (qindex sub=first; sub(flipBit(pairRank, prefBits[i])); + } + + qindex slot = sub - first; + qindex sendInd = sendBase + slot * numPacked; + qindex recvInd = recvBase + slot * numPacked; + + accel_statevec_packAmpsIntoSubBuffer(qureg, suffixTargs, states, sendInd); + chunks.push_back({sendInd, recvInd, numPacked, pairRank}); + waveStates.push_back(states); } - // pack the amplitudes bound for pairRank, exchange, and scatter the received - // amplitudes back into those same local slots - qindex numPacked = accel_statevec_packAmpsIntoBuffer(qureg, suffixTargs, states); - comm_exchangeSubBuffers(qureg, numPacked, pairRank); - accel_statevec_unpackAmpsFromBuffer(qureg, suffixTargs, states); + // exchange the whole wave with a single wait, then scatter each received slice back into + // the strided local amplitudes it came from + comm_exchangeSubBufferChunks(qureg, chunks); + + for (qindex sub=first; sub -qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates) { +qindex cpu_statevec_packAmpsIntoSubBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates, qindex sendInd) { assert_numQubitsMatchesQubitStatesAndTemplateParam(qubitInds.size(), qubitStates.size(), NumQubits); @@ -224,15 +224,16 @@ qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, Const cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer); - // each control qubit halves the needed iterations + // each constrained qubit halves the needed iterations qindex numIts = qureg.numAmpsPerNode / powerOf2(qubitInds.size()); - // amplitudes are packed at an offset into the buffer - qindex offset = getSubBufferSendInd(qureg); + // amplitudes are packed contiguously from the caller's send offset, so that several + // disjoint subsets can occupy distinct slices of the buffer and be exchanged in one wave + qindex offset = sendInd; auto sortedQubitInds = util_getSorted(qubitInds); auto qubitStateMask = util_getBitMask(qubitInds, qubitStates); - + // use template param to compile-time unroll loop in insertBits() SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubitInds.size()); @@ -251,6 +252,14 @@ qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, Const } +template +qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates) { + + // pack into the buffer's single default send region (which begins at half its capacity) + return cpu_statevec_packAmpsIntoSubBuffer(qureg, qubitInds, qubitStates, getSubBufferSendInd(qureg)); +} + + qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2) { assert_bufferPackerGivenIncreasingQubits(qubit1, qubit2, qubit3); @@ -282,17 +291,18 @@ qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qindex, cpu_statevec_packAmpsIntoBuffer, (Qureg, ConstList64, ConstList64) ) +INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qindex, cpu_statevec_packAmpsIntoSubBuffer, (Qureg, ConstList64, ConstList64, qindex) ) template -void cpu_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates) { +void cpu_statevec_unpackAmpsFromSubBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates, qindex recvInd) { assert_numQubitsMatchesQubitStatesAndTemplateParam(qubitInds.size(), qubitStates.size(), NumQubits); - // this is the inverse of cpu_statevec_packAmpsIntoBuffer; it scatters the received - // contiguous sub-buffer back into the strided local amplitudes where the given qubits - // are in the given states. It generalises anyCtrlSwap_subC to multiple constrained - // qubits, as needed by the fused multi-SWAP routine. + // this is the inverse of cpu_statevec_packAmpsIntoSubBuffer; it scatters a received + // contiguous sub-buffer (beginning at the caller's receive offset) back into the strided + // local amplitudes where the given qubits are in the given states. It generalises + // anyCtrlSwap_subC to multiple constrained qubits, as needed by the fused multi-SWAP routine. // use cpu_qcomp (in lieu of qcomp) even though no arithmetic happens below - just for consistency! cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); @@ -301,8 +311,8 @@ void cpu_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubitInds, Const // each constrained qubit halves the number of received amps qindex numIts = qureg.numAmpsPerNode / powerOf2(qubitInds.size()); - // received amplitudes begin at the buffer's receive offset - qindex offset = getBufferRecvInd(); + // received amplitudes begin at the caller's receive offset + qindex offset = recvInd; auto sortedQubitInds = util_getSorted(qubitInds); auto qubitStateMask = util_getBitMask(qubitInds, qubitStates); @@ -322,7 +332,16 @@ void cpu_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubitInds, Const } +template +void cpu_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates) { + + // unpack from the buffer's single default receive region (which begins at index zero) + cpu_statevec_unpackAmpsFromSubBuffer(qureg, qubitInds, qubitStates, getBufferRecvInd()); +} + + INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_statevec_unpackAmpsFromBuffer, (Qureg, ConstList64, ConstList64) ) +INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_statevec_unpackAmpsFromSubBuffer, (Qureg, ConstList64, ConstList64, qindex) ) diff --git a/quest/src/cpu/cpu_subroutines.hpp b/quest/src/cpu/cpu_subroutines.hpp index 5925ad5a4..0745d0a8c 100644 --- a/quest/src/cpu/cpu_subroutines.hpp +++ b/quest/src/cpu/cpu_subroutines.hpp @@ -45,8 +45,10 @@ void cpu_fullstatediagmatr_setElemsFromMultiVarFunc(FullStateDiagMatr out, qcomp */ template qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates); +template qindex cpu_statevec_packAmpsIntoSubBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates, qindex sendInd); template void cpu_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates); +template void cpu_statevec_unpackAmpsFromSubBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates, qindex recvInd); qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2); diff --git a/tests/unit/operations.cpp b/tests/unit/operations.cpp index 80b75b9c2..22ae987bf 100644 --- a/tests/unit/operations.cpp +++ b/tests/unit/operations.cpp @@ -3020,4 +3020,57 @@ TEST_CASE( "rightapplyPauliStrSum", TEST_CATEGORY_MULT LABEL_MIXED_DEPLOY_TAG ) } +/* + * FUSED DISTRIBUTED MULTI-SWAP + * + * A focused check of the fused prefix-suffix multi-SWAP (localiser.cpp), which moves several + * prefix-qubit targets into the suffix in one batched exchange. Applying a random multi-qubit + * unitary on a target set whose upper qubits land in the prefix substate routes through that + * routine, exercising 2, 3 or 4 prefix targets at np = 4, 8, 16 (and the two-wave async batching + * those k force). Every cached deployment is compared against the same reference linear algebra: + * the distributed quregs run the fused exchange while the serial and multithreaded quregs run it + * communication-free, so agreement pins the fused, batched exchange to a swap-free reference. The + * generic dense-matrix tests already touch this path incidentally; this names and isolates it. + */ + +TEST_CASE( "fused distributed multiSwap", TEST_CATEGORY_OPS ) { + + int numQubits = getNumCachedQubits(); + auto quregs = getCachedStatevecs(); + + for (int numTargs=2; numTargs<=5; numTargs++) { + + if (numTargs > numQubits) + continue; + + // the top numTargs qubits are prefix when the qureg spans at least 2^numTargs nodes; at + // fewer nodes only some are prefix and the routine simply fuses fewer swaps, still correct + vector targs(numTargs); + for (int i=0; i