Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 47 additions & 1 deletion quest/src/comm/comm_routines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "quest/src/gpu/gpu_config.hpp"
#include "quest/src/comm/comm_config.hpp"
#include "quest/src/comm/comm_indices.hpp"
#include "quest/src/comm/comm_routines.hpp"

#if QUEST_COMPILE_MPI
#include <mpi.h>
Expand Down Expand Up @@ -528,11 +529,56 @@ void comm_exchangeSubBuffers(Qureg qureg, qindex numAmps, int pairRank) {

if (qureg.isGpuAccelerated)
exchangeGpuSubBuffers(qureg, numAmps, pairRank);
else
else
exchangeArrays(&qureg.cpuCommBuffer[sendInd], &qureg.cpuCommBuffer[recvInd], numAmps, pairRank);
}


void comm_exchangeSubBufferChunks(Qureg qureg, const vector<CommChunk>& chunks) {
#if QUEST_COMPILE_MPI

assert_commQuregIsDistributed(qureg);

// exchange several disjoint sub-buffer chunks, each with its own pair rank, under a single
// wait. This collapses the up-to (2^k - 1) blocking exchanges of the fused multi-SWAP into one
// asynchronous wave (the caller bounds a wave's chunks to fit the send and receive buffer halves).
// Each chunk targets a DISTINCT pair rank, so (source rank, tag) already identifies every message
// and no per-partner tag offset is needed; we reuse the per-message tag = m exactly as
// exchangeArrays does. Async-with-final-wait as per arxiv.org/abs/2308.07402. The fused routine is
// CPU-only (it restricts itself to non-GPU quregs), so only the CPU buffer is exchanged here.

MPI_Comm mpiComm = comm_getMpiComm();

// validate every chunk and total the messages, to size the request list up-front
qindex numRequests = 0;
for (const CommChunk& chunk : chunks) {
assert_commBoundsAreValid(qureg, chunk.sendInd, chunk.recvInd, chunk.numAmps);
assert_bufferSendRecvDoesNotOverlap(chunk.sendInd, chunk.recvInd, chunk.numAmps);
assert_pairRankIsDistinct(qureg, chunk.pairRank);
numRequests += 2 * dividePow2PayloadIntoMessages(chunk.numAmps)[1];
}

vector<MPI_Request> requests(numRequests, MPI_REQUEST_NULL);

// post every chunk's receives and sends, then wait once for the whole wave
qindex r = 0;
for (const CommChunk& chunk : chunks) {
auto [messageSize, numMessages] = dividePow2PayloadIntoMessages(chunk.numAmps);
for (qindex m=0; m<numMessages; m++) {
int tag = static_cast<int>(m); // gauranteed int, but m*messageSize needs qindex
MPI_Irecv(&qureg.cpuCommBuffer[chunk.recvInd + m*messageSize], messageSize, MPI_QCOMP, chunk.pairRank, tag, mpiComm, &requests[r++]);
MPI_Isend(&qureg.cpuCommBuffer[chunk.sendInd + m*messageSize], messageSize, MPI_QCOMP, chunk.pairRank, tag, mpiComm, &requests[r++]);
}
}

MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE);

#else
error_commButEnvNotDistributed();
#endif
}


void comm_asynchSendSubBuffer(Qureg qureg, qindex numElems, int pairRank) {

auto [sendInd, recvInd] = getSubBufferSendRecvInds(qureg);
Expand Down
10 changes: 10 additions & 0 deletions quest/src/comm/comm_routines.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,16 @@ void comm_exchangeAmpsToBuffers(Qureg qureg, int pairRank);

void comm_exchangeSubBuffers(Qureg qureg, qindex numAmpsAndRecvInd, int pairRank);

// one disjoint sub-buffer chunk to exchange with a single pair rank, used by comm_exchangeSubBufferChunks
struct CommChunk {
qindex sendInd; // buffer index where this chunk's amps to send begin
qindex recvInd; // buffer index where this chunk's received amps are written
qindex numAmps; // number of amps exchanged (a power of two)
int pairRank; // the partner rank for this chunk (distinct from this node)
};

void comm_exchangeSubBufferChunks(Qureg qureg, const vector<CommChunk>& chunks);

void comm_asynchSendSubBuffer(Qureg qureg, qindex numElems, int pairRank);

void comm_receiveArrayToBuffer(Qureg qureg, qindex numElems, int pairRank);
Expand Down
42 changes: 41 additions & 1 deletion quest/src/core/accelerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -232,12 +232,52 @@ qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstL

// note qubits may incidentally be ctrls or targs; it doesn't matter
GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_packAmpsIntoBuffer, qureg, qubits.size() );

// return the number of packed amps, for caller convenience
return func(qureg, qubits, qubitStates);
}


void accel_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates) {

// inverse of packing; scatters received sub-buffer into strided local amps where
// the given qubits are in the given states (used by the fused multi-SWAP routine).
// only the CPU path is dispatched; the fused routine restricts itself to non-GPU
// quregs (issue #595 notes the OpenMP logic alone is sufficient), so no GPU kernel
// is needed and the GPU build is left untouched
if (qubitStates.empty())
error_noCtrlsGivenToBufferPacker();

GET_FUNC_OPTIMISED_FOR_ONE_PARAM( func, cpu_statevec_unpackAmpsFromBuffer, qubits.size() );
func(qureg, qubits, qubitStates);
}


qindex accel_statevec_packAmpsIntoSubBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex sendInd) {

// as accel_statevec_packAmpsIntoBuffer, but packs into an explicit send offset so the
// fused multi-SWAP can lay several subsets into one buffer and exchange them in a wave.
// CPU-only, like the unpacker, since the fused routine restricts itself to non-GPU quregs
if (qubitStates.empty())
error_noCtrlsGivenToBufferPacker();

GET_FUNC_OPTIMISED_FOR_ONE_PARAM( func, cpu_statevec_packAmpsIntoSubBuffer, qubits.size() );
return func(qureg, qubits, qubitStates, sendInd);
}


void accel_statevec_unpackAmpsFromSubBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex recvInd) {

// inverse of accel_statevec_packAmpsIntoSubBuffer; scatters a sub-buffer received at an
// explicit offset back into the strided local amps. CPU-only for the same reason as above
if (qubitStates.empty())
error_noCtrlsGivenToBufferPacker();

GET_FUNC_OPTIMISED_FOR_ONE_PARAM( func, cpu_statevec_unpackAmpsFromSubBuffer, qubits.size() );
func(qureg, qubits, qubitStates, recvInd);
}


qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2) {

return (qureg.isGpuAccelerated)?
Expand Down
4 changes: 4 additions & 0 deletions quest/src/core/accelerator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,10 @@ void accel_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, PauliS
*/

qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates);
qindex accel_statevec_packAmpsIntoSubBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex sendInd);

void accel_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates);
void accel_statevec_unpackAmpsFromSubBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex recvInd);

qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2);

Expand Down
114 changes: 103 additions & 11 deletions quest/src/core/localiser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "quest/src/core/accelerator.hpp"
#include "quest/src/comm/comm_config.hpp"
#include "quest/src/comm/comm_routines.hpp"
#include "quest/src/comm/comm_indices.hpp"
#include "quest/src/cpu/cpu_config.hpp"
#include "quest/src/gpu/gpu_config.hpp"

Expand Down Expand Up @@ -900,24 +901,115 @@ void anyCtrlMultiSwapBetweenPrefixAndSuffix(Qureg qureg, ConstList64 ctrls, Cons
// the SWAPs act on unique qubit pairs and so commute.

/// @todo
/// - the sequence of pair-wise full-swaps should be more efficient as a
/// "single" sequence of smaller messages sending amps directly to their
/// final destination node. This could use a new "multiSwap" function.
/// - if the user has compiled cuQuantum, and Qureg is GPU-accelerated, the
/// multiSwap function should use custatevecSwapIndexBits() if local,
/// or custatevecDistIndexBitSwapSchedulerSetIndexBitSwaps() if distributed,
/// - if the user has compiled cuQuantum, and Qureg is GPU-accelerated, this
/// routine could use custatevecSwapIndexBits() if local, or
/// custatevecDistIndexBitSwapSchedulerSetIndexBitSwaps() if distributed,
/// although the latter requires substantially more work like setting up
/// a communicator which may be inelegant alongside our own distribution scheme.

// perform necessary swaps to move all targets into suffix, each of which invokes communication
// collect the non-trivial pairs; each swaps a suffix qubit with a prefix qubit
auto suffixTargs = lists_getEmptyList64();
auto prefixTargs = lists_getEmptyList64();
for (size_t i=0; i<targsA.size(); i++) {

if (targsA[i] == targsB[i])
continue;
suffixTargs.push_back(std::min(targsA[i], targsB[i]));
prefixTargs.push_back(std::max(targsA[i], targsB[i]));
}
int numSwaps = suffixTargs.size();
if (numSwaps == 0)
return;

// the fused routine below targets the uncontrolled, non-GPU case which every internal
// caller currently uses. A controlled multi-SWAP, or a GPU-accelerated Qureg, falls back
// to the per-swap routine (issue #595 notes the OpenMP logic alone is sufficient, so the
// GPU path is left unchanged)
if (!ctrls.empty() || qureg.isGpuAccelerated) {
for (int i=0; i<numSwaps; i++)
anyCtrlSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, suffixTargs[i], prefixTargs[i]);
return;
}

// FUSED multi-SWAP: rather than performing each prefix<->suffix SWAP in turn (which
// wastefully relays an amplitude through intermediate nodes before its final node),
// we send each amplitude directly to its destination node in a single pass. The
// numSwaps disjoint SWAPs compose into one permutation of qubit bits, so an amplitude
// of this node moves to the rank obtained by overwriting each prefix-target rank-bit
// with the value of its partnered suffix-target bit. We enumerate the (up to)
// 2^numSwaps - 1 destination nodes (one per non-empty subset of prefix targets whose
// partnered suffix bit disagrees with this node's rank bit) and, for each, pack +
// exchange + unpack only the amplitudes bound there. The move is an involution
// between paired nodes, so the packed and unpacked amplitudes occupy the same local
// slots. This composed distributed index-bit swap is that of mpiQulacs (arXiv:2203.16044)
// and cuStateVec's custatevecDistIndexBitSwapScheduler; the pairwise amplitude exchange
// follows arXiv:2311.01512 Sec IV.

std::vector<int> prefBits(numSwaps);
std::vector<int> rankBits(numSwaps);
for (int i=0; i<numSwaps; i++) {
prefBits[i] = util_getPrefixInd(prefixTargs[i], qureg);
rankBits[i] = getBit(qureg.rank, prefBits[i]);
}

int suffixTarg = std::min(targsA[i], targsB[i]);
int prefixTarg = std::max(targsA[i], targsB[i]);
anyCtrlSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, suffixTarg, prefixTarg);
// subset 0 are the amplitudes that do not move (all suffix bits already match the
// rank bits), so we skip it and communicate only the other subsets. Every communicating
// subset packs the same number of amplitudes (one per local amp whose suffix-target bits
// match the subset pattern)
qindex numSubsets = powerOf2(numSwaps);
qindex numPacked = qureg.numAmpsPerNode / numSubsets;

// rather than one blocking exchange per subset (up to 2^numSwaps - 1 sequential syncs), we
// pack each subset into a distinct slice of the buffer's send half and exchange a whole wave
// of subsets under a single wait. Only half the buffer can send, so a wave holds
// (numAmpsPerNode/2)/numPacked = 2^(numSwaps-1) subsets, and the 2^numSwaps - 1 communicating
// subsets need at most two waves. The same total amplitudes cross the network, with the
// synchronisation count cut from 2^numSwaps - 1 down to at most two.
qindex sendBase = getSubBufferSendInd(qureg);
qindex recvBase = getBufferRecvInd();
qindex perWave = (qureg.numAmpsPerNode / 2) / numPacked;

for (qindex first=1; first<numSubsets; first+=perWave) {

qindex last = std::min(first + perWave, numSubsets); // exclusive

// pack every subset of this wave into its own buffer slice, remembering the states so the
// matching received slice can be scattered back after the exchange
std::vector<CommChunk> chunks;
std::vector<List64> waveStates;
chunks.reserve(last - first);
waveStates.reserve(last - first);

for (qindex sub=first; sub<last; sub++) {

// the destination node flips this node's rank bits for the targeted subset, and
// the to-be-sent amplitudes are those whose suffix-target bits match the pattern
auto states = lists_getEmptyList64();
int pairRank = qureg.rank;
for (int i=0; i<numSwaps; i++) {
int inSubset = getBit(sub, i);
states.push_back(inSubset ? !rankBits[i] : rankBits[i]);
if (inSubset)
pairRank = static_cast<int>(flipBit(pairRank, prefBits[i]));
}

qindex slot = sub - first;
qindex sendInd = sendBase + slot * numPacked;
qindex recvInd = recvBase + slot * numPacked;

accel_statevec_packAmpsIntoSubBuffer(qureg, suffixTargs, states, sendInd);
chunks.push_back({sendInd, recvInd, numPacked, pairRank});
waveStates.push_back(states);
}

// exchange the whole wave with a single wait, then scatter each received slice back into
// the strided local amplitudes it came from
comm_exchangeSubBufferChunks(qureg, chunks);

for (qindex sub=first; sub<last; sub++) {
qindex slot = sub - first;
qindex recvInd = recvBase + slot * numPacked;
accel_statevec_unpackAmpsFromSubBuffer(qureg, suffixTargs, waveStates[slot], recvInd);
}
}
}

Expand Down
72 changes: 66 additions & 6 deletions quest/src/cpu/cpu_subroutines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -216,23 +216,24 @@ void cpu_fullstatediagmatr_setElemsFromMultiVarFunc(FullStateDiagMatr out, qcomp


template <int NumQubits>
qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates) {
qindex cpu_statevec_packAmpsIntoSubBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates, qindex sendInd) {

assert_numQubitsMatchesQubitStatesAndTemplateParam(qubitInds.size(), qubitStates.size(), NumQubits);

// use cpu_qcomp (in lieu of qcomp) even though no arithmetic happens below - just for consistency!
cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps);
cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer);

// each control qubit halves the needed iterations
// each constrained qubit halves the needed iterations
qindex numIts = qureg.numAmpsPerNode / powerOf2(qubitInds.size());

// amplitudes are packed at an offset into the buffer
qindex offset = getSubBufferSendInd(qureg);
// amplitudes are packed contiguously from the caller's send offset, so that several
// disjoint subsets can occupy distinct slices of the buffer and be exchanged in one wave
qindex offset = sendInd;

auto sortedQubitInds = util_getSorted(qubitInds);
auto qubitStateMask = util_getBitMask(qubitInds, qubitStates);

// use template param to compile-time unroll loop in insertBits()
SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubitInds.size());

Expand All @@ -251,6 +252,14 @@ qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, Const
}


template <int NumQubits>
qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates) {

// pack into the buffer's single default send region (which begins at half its capacity)
return cpu_statevec_packAmpsIntoSubBuffer<NumQubits>(qureg, qubitInds, qubitStates, getSubBufferSendInd(qureg));
}


qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2) {

assert_bufferPackerGivenIncreasingQubits(qubit1, qubit2, qubit3);
Expand Down Expand Up @@ -282,10 +291,61 @@ qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu


INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qindex, cpu_statevec_packAmpsIntoBuffer, (Qureg, ConstList64, ConstList64) )
INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qindex, cpu_statevec_packAmpsIntoSubBuffer, (Qureg, ConstList64, ConstList64, qindex) )


template <int NumQubits>
void cpu_statevec_unpackAmpsFromSubBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates, qindex recvInd) {

assert_numQubitsMatchesQubitStatesAndTemplateParam(qubitInds.size(), qubitStates.size(), NumQubits);

// this is the inverse of cpu_statevec_packAmpsIntoSubBuffer; it scatters a received
// contiguous sub-buffer (beginning at the caller's receive offset) back into the strided
// local amplitudes where the given qubits are in the given states. It generalises
// anyCtrlSwap_subC to multiple constrained qubits, as needed by the fused multi-SWAP routine.

// use cpu_qcomp (in lieu of qcomp) even though no arithmetic happens below - just for consistency!
cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps);
cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer);

// each constrained qubit halves the number of received amps
qindex numIts = qureg.numAmpsPerNode / powerOf2(qubitInds.size());

// received amplitudes begin at the caller's receive offset
qindex offset = recvInd;

/*
auto sortedQubitInds = util_getSorted(qubitInds);
auto qubitStateMask = util_getBitMask(qubitInds, qubitStates);

// use template param to compile-time unroll loop in insertBits()
SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubitInds.size());

#pragma omp parallel for if(qureg.isMultithreaded)
for (qindex n=0; n<numIts; n++) {

// i = nth local index where qubits are in the specified states
qindex i = insertBitsWithMaskedValues(n, sortedQubitInds.data(), numBits, qubitStateMask);

// scatter the contiguous sub-buffer among the strided local amplitudes
amps[i] = buffer[offset + n];
}
}


template <int NumQubits>
void cpu_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates) {

// unpack from the buffer's single default receive region (which begins at index zero)
cpu_statevec_unpackAmpsFromSubBuffer<NumQubits>(qureg, qubitInds, qubitStates, getBufferRecvInd());
}


INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_statevec_unpackAmpsFromBuffer, (Qureg, ConstList64, ConstList64) )
INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_statevec_unpackAmpsFromSubBuffer, (Qureg, ConstList64, ConstList64, qindex) )



/*
* SWAPS
*/

Expand Down
Loading
Loading