diff --git a/quest/src/comm/comm_routines.cpp b/quest/src/comm/comm_routines.cpp index cf6956454..35247a9af 100644 --- a/quest/src/comm/comm_routines.cpp +++ b/quest/src/comm/comm_routines.cpp @@ -21,6 +21,7 @@ #include "quest/src/gpu/gpu_config.hpp" #include "quest/src/comm/comm_config.hpp" #include "quest/src/comm/comm_indices.hpp" +#include "quest/src/comm/comm_routines.hpp" #if QUEST_COMPILE_MPI #include @@ -528,11 +529,56 @@ void comm_exchangeSubBuffers(Qureg qureg, qindex numAmps, int pairRank) { if (qureg.isGpuAccelerated) exchangeGpuSubBuffers(qureg, numAmps, pairRank); - else + else exchangeArrays(&qureg.cpuCommBuffer[sendInd], &qureg.cpuCommBuffer[recvInd], numAmps, pairRank); } +void comm_exchangeSubBufferChunks(Qureg qureg, const vector& chunks) { +#if QUEST_COMPILE_MPI + + assert_commQuregIsDistributed(qureg); + + // exchange several disjoint sub-buffer chunks, each with its own pair rank, under a single + // wait. This collapses the up-to (2^k - 1) blocking exchanges of the fused multi-SWAP into one + // asynchronous wave (the caller bounds a wave's chunks to fit the send and receive buffer halves). + // Each chunk targets a DISTINCT pair rank, so (source rank, tag) already identifies every message + // and no per-partner tag offset is needed; we reuse the per-message tag = m exactly as + // exchangeArrays does. Async-with-final-wait as per arxiv.org/abs/2308.07402. The fused routine is + // CPU-only (it restricts itself to non-GPU quregs), so only the CPU buffer is exchanged here. + + MPI_Comm mpiComm = comm_getMpiComm(); + + // validate every chunk and total the messages, to size the request list up-front + qindex numRequests = 0; + for (const CommChunk& chunk : chunks) { + assert_commBoundsAreValid(qureg, chunk.sendInd, chunk.recvInd, chunk.numAmps); + assert_bufferSendRecvDoesNotOverlap(chunk.sendInd, chunk.recvInd, chunk.numAmps); + assert_pairRankIsDistinct(qureg, chunk.pairRank); + numRequests += 2 * dividePow2PayloadIntoMessages(chunk.numAmps)[1]; + } + + vector requests(numRequests, MPI_REQUEST_NULL); + + // post every chunk's receives and sends, then wait once for the whole wave + qindex r = 0; + for (const CommChunk& chunk : chunks) { + auto [messageSize, numMessages] = dividePow2PayloadIntoMessages(chunk.numAmps); + for (qindex m=0; m(m); // gauranteed int, but m*messageSize needs qindex + MPI_Irecv(&qureg.cpuCommBuffer[chunk.recvInd + m*messageSize], messageSize, MPI_QCOMP, chunk.pairRank, tag, mpiComm, &requests[r++]); + MPI_Isend(&qureg.cpuCommBuffer[chunk.sendInd + m*messageSize], messageSize, MPI_QCOMP, chunk.pairRank, tag, mpiComm, &requests[r++]); + } + } + + MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE); + +#else + error_commButEnvNotDistributed(); +#endif +} + + void comm_asynchSendSubBuffer(Qureg qureg, qindex numElems, int pairRank) { auto [sendInd, recvInd] = getSubBufferSendRecvInds(qureg); diff --git a/quest/src/comm/comm_routines.hpp b/quest/src/comm/comm_routines.hpp index e75e889f6..771149147 100644 --- a/quest/src/comm/comm_routines.hpp +++ b/quest/src/comm/comm_routines.hpp @@ -31,6 +31,16 @@ void comm_exchangeAmpsToBuffers(Qureg qureg, int pairRank); void comm_exchangeSubBuffers(Qureg qureg, qindex numAmpsAndRecvInd, int pairRank); +// one disjoint sub-buffer chunk to exchange with a single pair rank, used by comm_exchangeSubBufferChunks +struct CommChunk { + qindex sendInd; // buffer index where this chunk's amps to send begin + qindex recvInd; // buffer index where this chunk's received amps are written + qindex numAmps; // number of amps exchanged (a power of two) + int pairRank; // the partner rank for this chunk (distinct from this node) +}; + +void comm_exchangeSubBufferChunks(Qureg qureg, const vector& chunks); + void comm_asynchSendSubBuffer(Qureg qureg, qindex numElems, int pairRank); void comm_receiveArrayToBuffer(Qureg qureg, qindex numElems, int pairRank); diff --git a/quest/src/core/accelerator.cpp b/quest/src/core/accelerator.cpp index 677e6c74a..da7933642 100644 --- a/quest/src/core/accelerator.cpp +++ b/quest/src/core/accelerator.cpp @@ -232,12 +232,52 @@ qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstL // note qubits may incidentally be ctrls or targs; it doesn't matter GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_packAmpsIntoBuffer, qureg, qubits.size() ); - + // return the number of packed amps, for caller convenience return func(qureg, qubits, qubitStates); } +void accel_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates) { + + // inverse of packing; scatters received sub-buffer into strided local amps where + // the given qubits are in the given states (used by the fused multi-SWAP routine). + // only the CPU path is dispatched; the fused routine restricts itself to non-GPU + // quregs (issue #595 notes the OpenMP logic alone is sufficient), so no GPU kernel + // is needed and the GPU build is left untouched + if (qubitStates.empty()) + error_noCtrlsGivenToBufferPacker(); + + GET_FUNC_OPTIMISED_FOR_ONE_PARAM( func, cpu_statevec_unpackAmpsFromBuffer, qubits.size() ); + func(qureg, qubits, qubitStates); +} + + +qindex accel_statevec_packAmpsIntoSubBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex sendInd) { + + // as accel_statevec_packAmpsIntoBuffer, but packs into an explicit send offset so the + // fused multi-SWAP can lay several subsets into one buffer and exchange them in a wave. + // CPU-only, like the unpacker, since the fused routine restricts itself to non-GPU quregs + if (qubitStates.empty()) + error_noCtrlsGivenToBufferPacker(); + + GET_FUNC_OPTIMISED_FOR_ONE_PARAM( func, cpu_statevec_packAmpsIntoSubBuffer, qubits.size() ); + return func(qureg, qubits, qubitStates, sendInd); +} + + +void accel_statevec_unpackAmpsFromSubBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex recvInd) { + + // inverse of accel_statevec_packAmpsIntoSubBuffer; scatters a sub-buffer received at an + // explicit offset back into the strided local amps. CPU-only for the same reason as above + if (qubitStates.empty()) + error_noCtrlsGivenToBufferPacker(); + + GET_FUNC_OPTIMISED_FOR_ONE_PARAM( func, cpu_statevec_unpackAmpsFromSubBuffer, qubits.size() ); + func(qureg, qubits, qubitStates, recvInd); +} + + qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2) { return (qureg.isGpuAccelerated)? diff --git a/quest/src/core/accelerator.hpp b/quest/src/core/accelerator.hpp index 5a8dc37fb..ab16ca82d 100644 --- a/quest/src/core/accelerator.hpp +++ b/quest/src/core/accelerator.hpp @@ -170,6 +170,10 @@ void accel_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, PauliS */ qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates); +qindex accel_statevec_packAmpsIntoSubBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex sendInd); + +void accel_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates); +void accel_statevec_unpackAmpsFromSubBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex recvInd); qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2); diff --git a/quest/src/core/localiser.cpp b/quest/src/core/localiser.cpp index 83a23b921..80b1f7c80 100644 --- a/quest/src/core/localiser.cpp +++ b/quest/src/core/localiser.cpp @@ -25,6 +25,7 @@ #include "quest/src/core/accelerator.hpp" #include "quest/src/comm/comm_config.hpp" #include "quest/src/comm/comm_routines.hpp" +#include "quest/src/comm/comm_indices.hpp" #include "quest/src/cpu/cpu_config.hpp" #include "quest/src/gpu/gpu_config.hpp" @@ -900,24 +901,115 @@ void anyCtrlMultiSwapBetweenPrefixAndSuffix(Qureg qureg, ConstList64 ctrls, Cons // the SWAPs act on unique qubit pairs and so commute. /// @todo - /// - the sequence of pair-wise full-swaps should be more efficient as a - /// "single" sequence of smaller messages sending amps directly to their - /// final destination node. This could use a new "multiSwap" function. - /// - if the user has compiled cuQuantum, and Qureg is GPU-accelerated, the - /// multiSwap function should use custatevecSwapIndexBits() if local, - /// or custatevecDistIndexBitSwapSchedulerSetIndexBitSwaps() if distributed, + /// - if the user has compiled cuQuantum, and Qureg is GPU-accelerated, this + /// routine could use custatevecSwapIndexBits() if local, or + /// custatevecDistIndexBitSwapSchedulerSetIndexBitSwaps() if distributed, /// although the latter requires substantially more work like setting up /// a communicator which may be inelegant alongside our own distribution scheme. - // perform necessary swaps to move all targets into suffix, each of which invokes communication + // collect the non-trivial pairs; each swaps a suffix qubit with a prefix qubit + auto suffixTargs = lists_getEmptyList64(); + auto prefixTargs = lists_getEmptyList64(); for (size_t i=0; isuffix SWAP in turn (which + // wastefully relays an amplitude through intermediate nodes before its final node), + // we send each amplitude directly to its destination node in a single pass. The + // numSwaps disjoint SWAPs compose into one permutation of qubit bits, so an amplitude + // of this node moves to the rank obtained by overwriting each prefix-target rank-bit + // with the value of its partnered suffix-target bit. We enumerate the (up to) + // 2^numSwaps - 1 destination nodes (one per non-empty subset of prefix targets whose + // partnered suffix bit disagrees with this node's rank bit) and, for each, pack + + // exchange + unpack only the amplitudes bound there. The move is an involution + // between paired nodes, so the packed and unpacked amplitudes occupy the same local + // slots. This composed distributed index-bit swap is that of mpiQulacs (arXiv:2203.16044) + // and cuStateVec's custatevecDistIndexBitSwapScheduler; the pairwise amplitude exchange + // follows arXiv:2311.01512 Sec IV. + + std::vector prefBits(numSwaps); + std::vector rankBits(numSwaps); + for (int i=0; i chunks; + std::vector waveStates; + chunks.reserve(last - first); + waveStates.reserve(last - first); + + for (qindex sub=first; sub(flipBit(pairRank, prefBits[i])); + } + + qindex slot = sub - first; + qindex sendInd = sendBase + slot * numPacked; + qindex recvInd = recvBase + slot * numPacked; + + accel_statevec_packAmpsIntoSubBuffer(qureg, suffixTargs, states, sendInd); + chunks.push_back({sendInd, recvInd, numPacked, pairRank}); + waveStates.push_back(states); + } + + // exchange the whole wave with a single wait, then scatter each received slice back into + // the strided local amplitudes it came from + comm_exchangeSubBufferChunks(qureg, chunks); + + for (qindex sub=first; sub -qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates) { +qindex cpu_statevec_packAmpsIntoSubBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates, qindex sendInd) { assert_numQubitsMatchesQubitStatesAndTemplateParam(qubitInds.size(), qubitStates.size(), NumQubits); @@ -224,15 +224,16 @@ qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, Const cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer); - // each control qubit halves the needed iterations + // each constrained qubit halves the needed iterations qindex numIts = qureg.numAmpsPerNode / powerOf2(qubitInds.size()); - // amplitudes are packed at an offset into the buffer - qindex offset = getSubBufferSendInd(qureg); + // amplitudes are packed contiguously from the caller's send offset, so that several + // disjoint subsets can occupy distinct slices of the buffer and be exchanged in one wave + qindex offset = sendInd; auto sortedQubitInds = util_getSorted(qubitInds); auto qubitStateMask = util_getBitMask(qubitInds, qubitStates); - + // use template param to compile-time unroll loop in insertBits() SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubitInds.size()); @@ -251,6 +252,14 @@ qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, Const } +template +qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates) { + + // pack into the buffer's single default send region (which begins at half its capacity) + return cpu_statevec_packAmpsIntoSubBuffer(qureg, qubitInds, qubitStates, getSubBufferSendInd(qureg)); +} + + qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2) { assert_bufferPackerGivenIncreasingQubits(qubit1, qubit2, qubit3); @@ -282,10 +291,61 @@ qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qindex, cpu_statevec_packAmpsIntoBuffer, (Qureg, ConstList64, ConstList64) ) +INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qindex, cpu_statevec_packAmpsIntoSubBuffer, (Qureg, ConstList64, ConstList64, qindex) ) +template +void cpu_statevec_unpackAmpsFromSubBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates, qindex recvInd) { + + assert_numQubitsMatchesQubitStatesAndTemplateParam(qubitInds.size(), qubitStates.size(), NumQubits); + + // this is the inverse of cpu_statevec_packAmpsIntoSubBuffer; it scatters a received + // contiguous sub-buffer (beginning at the caller's receive offset) back into the strided + // local amplitudes where the given qubits are in the given states. It generalises + // anyCtrlSwap_subC to multiple constrained qubits, as needed by the fused multi-SWAP routine. + + // use cpu_qcomp (in lieu of qcomp) even though no arithmetic happens below - just for consistency! + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer); + + // each constrained qubit halves the number of received amps + qindex numIts = qureg.numAmpsPerNode / powerOf2(qubitInds.size()); + + // received amplitudes begin at the caller's receive offset + qindex offset = recvInd; -/* + auto sortedQubitInds = util_getSorted(qubitInds); + auto qubitStateMask = util_getBitMask(qubitInds, qubitStates); + + // use template param to compile-time unroll loop in insertBits() + SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubitInds.size()); + + #pragma omp parallel for if(qureg.isMultithreaded) + for (qindex n=0; n +void cpu_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates) { + + // unpack from the buffer's single default receive region (which begins at index zero) + cpu_statevec_unpackAmpsFromSubBuffer(qureg, qubitInds, qubitStates, getBufferRecvInd()); +} + + +INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_statevec_unpackAmpsFromBuffer, (Qureg, ConstList64, ConstList64) ) +INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_statevec_unpackAmpsFromSubBuffer, (Qureg, ConstList64, ConstList64, qindex) ) + + + +/* * SWAPS */ diff --git a/quest/src/cpu/cpu_subroutines.hpp b/quest/src/cpu/cpu_subroutines.hpp index 3dbae057b..0745d0a8c 100644 --- a/quest/src/cpu/cpu_subroutines.hpp +++ b/quest/src/cpu/cpu_subroutines.hpp @@ -45,6 +45,10 @@ void cpu_fullstatediagmatr_setElemsFromMultiVarFunc(FullStateDiagMatr out, qcomp */ template qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates); +template qindex cpu_statevec_packAmpsIntoSubBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates, qindex sendInd); + +template void cpu_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates); +template void cpu_statevec_unpackAmpsFromSubBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates, qindex recvInd); qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2); diff --git a/tests/unit/operations.cpp b/tests/unit/operations.cpp index 80b75b9c2..22ae987bf 100644 --- a/tests/unit/operations.cpp +++ b/tests/unit/operations.cpp @@ -3020,4 +3020,57 @@ TEST_CASE( "rightapplyPauliStrSum", TEST_CATEGORY_MULT LABEL_MIXED_DEPLOY_TAG ) } +/* + * FUSED DISTRIBUTED MULTI-SWAP + * + * A focused check of the fused prefix-suffix multi-SWAP (localiser.cpp), which moves several + * prefix-qubit targets into the suffix in one batched exchange. Applying a random multi-qubit + * unitary on a target set whose upper qubits land in the prefix substate routes through that + * routine, exercising 2, 3 or 4 prefix targets at np = 4, 8, 16 (and the two-wave async batching + * those k force). Every cached deployment is compared against the same reference linear algebra: + * the distributed quregs run the fused exchange while the serial and multithreaded quregs run it + * communication-free, so agreement pins the fused, batched exchange to a swap-free reference. The + * generic dense-matrix tests already touch this path incidentally; this names and isolates it. + */ + +TEST_CASE( "fused distributed multiSwap", TEST_CATEGORY_OPS ) { + + int numQubits = getNumCachedQubits(); + auto quregs = getCachedStatevecs(); + + for (int numTargs=2; numTargs<=5; numTargs++) { + + if (numTargs > numQubits) + continue; + + // the top numTargs qubits are prefix when the qureg spans at least 2^numTargs nodes; at + // fewer nodes only some are prefix and the routine simply fuses fewer swaps, still correct + vector targs(numTargs); + for (int i=0; i