Skip to content

Commit a7f23b9

Browse files
authored
Merge pull request #394 from JohnnyLawDGB/fix/dandelion-abba-deadlock
fix: resolve Dandelion++ ABBA deadlock in CheckDandelionEmbargoes (Bug #29)
2 parents 8c92bfd + 216fb8c commit a7f23b9

1 file changed

Lines changed: 117 additions & 57 deletions

File tree

src/net_processing.cpp

Lines changed: 117 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -1627,70 +1627,130 @@ void PeerManagerImpl::RelayDandelionTransaction(const CTransaction& tx, CNode* p
16271627

16281628
void PeerManagerImpl::CheckDandelionEmbargoes()
16291629
{
1630-
LOCK(m_connman.m_dandelion_embargo_mutex);
1631-
auto current_time = GetTime<std::chrono::milliseconds>();
1632-
1633-
// Log embargo checks (debug level only — this fires every second)
1634-
if (!m_connman.mDandelionEmbargo.empty()) {
1635-
LogPrint(BCLog::DANDELION, "CheckDandelionEmbargoes: Checking %d embargoed transactions (stempool=%d, mempool=%d)\n",
1636-
m_connman.mDandelionEmbargo.size(), m_stempool.size(), m_mempool.size());
1637-
}
1638-
1639-
// Check if we now have Dandelion destinations available for stuck transactions
1630+
// =========================================================================
1631+
// Bug #29 fix: ABBA deadlock between m_nodes_mutex and m_dandelion_embargo_mutex
1632+
//
1633+
// The established lock ordering throughout the Dandelion++ code is:
1634+
// m_nodes_mutex FIRST, then m_dandelion_embargo_mutex SECOND
1635+
//
1636+
// This ordering is used by:
1637+
// - DandelionShuffle() (dandelion.cpp:344 → 357)
1638+
// - CloseDandelionConnections() (dandelion.cpp:211 → 292)
1639+
// - DisconnectNodes() (net.cpp:1952 → CloseDandelionConnections)
1640+
//
1641+
// Before this fix, CheckDandelionEmbargoes() violated that ordering:
1642+
// it held m_dandelion_embargo_mutex, then called usingDandelion() and
1643+
// localDandelionDestinationPushInventory(), both of which acquire
1644+
// m_nodes_mutex internally. This created a classic ABBA deadlock:
1645+
//
1646+
// Thread A (shuffle timer): LOCK(m_nodes_mutex) → LOCK(m_dandelion_embargo_mutex)
1647+
// Thread B (embargo timer): LOCK(m_dandelion_embargo_mutex) → LOCK(m_nodes_mutex)
1648+
//
1649+
// When both fire concurrently, each thread holds the lock the other needs.
1650+
// Result: sendtoaddress hangs forever at "Processing Dandelion relay",
1651+
// shutdown hangs on threadDandelionShuffle.join(), RPC times out.
1652+
// (Reported by DanGB on Windows 11, RC26, reproducible after ~1 week uptime.)
1653+
//
1654+
// Fix: restructure this function into two phases:
1655+
// Phase 1 — under m_dandelion_embargo_mutex: scan the embargo map, handle
1656+
// expired/mempool entries, collect txids that need stem routing.
1657+
// Phase 2 — after releasing m_dandelion_embargo_mutex: perform the stem
1658+
// routing (which needs m_nodes_mutex), then briefly re-acquire
1659+
// m_dandelion_embargo_mutex to mark them as routed.
1660+
//
1661+
// usingDandelion() is also moved before the embargo lock for the same reason.
1662+
// The bool may be momentarily stale, but that only means we skip one routing
1663+
// cycle (~1 second) — no correctness impact.
1664+
// =========================================================================
1665+
1666+
// Phase 0: query Dandelion destination availability WITHOUT holding the
1667+
// embargo lock. usingDandelion() acquires m_nodes_mutex internally.
16401668
bool hasDandelionDestinations = m_connman.usingDandelion();
1641-
1642-
for (auto iter = m_connman.mDandelionEmbargo.begin(); iter != m_connman.mDandelionEmbargo.end();) {
1643-
if (m_mempool.exists(iter->first)) {
1644-
LogPrint(BCLog::DANDELION, "Embargoed dandeliontx %s found in mempool; removing from embargo map\n", iter->first.ToString());
1645-
m_connman.m_dandelion_stem_routed.erase(iter->first);
1646-
iter = m_connman.mDandelionEmbargo.erase(iter);
1647-
} else if (iter->second < current_time) {
1648-
LogPrintf("CheckDandelionEmbargoes: dandeliontx %s embargo expired\n", iter->first.ToString());
1649-
CTransactionRef ptx = m_stempool.get(iter->first);
1650-
if (ptx) {
1651-
LogPrintf("CheckDandelionEmbargoes: Moving transaction %s from stempool to mempool for broadcast\n", iter->first.ToString());
1652-
{
1653-
LOCK(cs_main);
1654-
const MempoolAcceptResult result = AcceptToMemoryPool(m_chainman.ActiveChainstate(), m_mempool, ptx, false);
1655-
if (result.m_result_type == MempoolAcceptResult::ResultType::VALID) {
1656-
LogPrintf("CheckDandelionEmbargoes: Successfully moved tx %s to mempool\n", iter->first.ToString());
1657-
LogPrint(BCLog::MEMPOOL, "AcceptToMemoryPool: accepted %s (poolsz %u txn, %u kB)\n",
1658-
iter->first.ToString(), m_mempool.size(), m_mempool.DynamicMemoryUsage() / 1000);
1659-
RelayTransaction(ptx->GetHash(), ptx->GetWitnessHash());
1660-
} else {
1661-
LogPrintf("CheckDandelionEmbargoes: Failed to move tx %s to mempool: %s\n",
1662-
iter->first.ToString(), result.m_state.ToString());
1669+
1670+
// Transactions collected during Phase 1 that need stem routing in Phase 2.
1671+
std::vector<uint256> txidsNeedingStemRoute;
1672+
1673+
// Phase 1: scan embargo map under m_dandelion_embargo_mutex.
1674+
// Everything in this block touches only embargo-protected state (plus
1675+
// cs_main for AcceptToMemoryPool, which has no ordering conflict here).
1676+
{
1677+
LOCK(m_connman.m_dandelion_embargo_mutex);
1678+
auto current_time = GetTime<std::chrono::milliseconds>();
1679+
1680+
// Log embargo checks (debug level only — this fires every second)
1681+
if (!m_connman.mDandelionEmbargo.empty()) {
1682+
LogPrint(BCLog::DANDELION, "CheckDandelionEmbargoes: Checking %d embargoed transactions (stempool=%d, mempool=%d)\n",
1683+
m_connman.mDandelionEmbargo.size(), m_stempool.size(), m_mempool.size());
1684+
}
1685+
1686+
for (auto iter = m_connman.mDandelionEmbargo.begin(); iter != m_connman.mDandelionEmbargo.end();) {
1687+
if (m_mempool.exists(iter->first)) {
1688+
LogPrint(BCLog::DANDELION, "Embargoed dandeliontx %s found in mempool; removing from embargo map\n", iter->first.ToString());
1689+
m_connman.m_dandelion_stem_routed.erase(iter->first);
1690+
iter = m_connman.mDandelionEmbargo.erase(iter);
1691+
} else if (iter->second < current_time) {
1692+
LogPrintf("CheckDandelionEmbargoes: dandeliontx %s embargo expired\n", iter->first.ToString());
1693+
CTransactionRef ptx = m_stempool.get(iter->first);
1694+
if (ptx) {
1695+
LogPrintf("CheckDandelionEmbargoes: Moving transaction %s from stempool to mempool for broadcast\n", iter->first.ToString());
1696+
{
1697+
LOCK(cs_main);
1698+
const MempoolAcceptResult result = AcceptToMemoryPool(m_chainman.ActiveChainstate(), m_mempool, ptx, false);
1699+
if (result.m_result_type == MempoolAcceptResult::ResultType::VALID) {
1700+
LogPrintf("CheckDandelionEmbargoes: Successfully moved tx %s to mempool\n", iter->first.ToString());
1701+
LogPrint(BCLog::MEMPOOL, "AcceptToMemoryPool: accepted %s (poolsz %u txn, %u kB)\n",
1702+
iter->first.ToString(), m_mempool.size(), m_mempool.DynamicMemoryUsage() / 1000);
1703+
RelayTransaction(ptx->GetHash(), ptx->GetWitnessHash());
1704+
} else {
1705+
LogPrintf("CheckDandelionEmbargoes: Failed to move tx %s to mempool: %s\n",
1706+
iter->first.ToString(), result.m_state.ToString());
1707+
}
16631708
}
1709+
} else {
1710+
LogPrintf("CheckDandelionEmbargoes: Transaction %s not found in stempool!\n", iter->first.ToString());
16641711
}
1712+
m_connman.m_dandelion_stem_routed.erase(iter->first);
1713+
iter = m_connman.mDandelionEmbargo.erase(iter);
16651714
} else {
1666-
LogPrintf("CheckDandelionEmbargoes: Transaction %s not found in stempool!\n", iter->first.ToString());
1667-
}
1668-
m_connman.m_dandelion_stem_routed.erase(iter->first);
1669-
iter = m_connman.mDandelionEmbargo.erase(iter);
1670-
} else {
1671-
// Embargo not yet expired — attempt Dandelion stem routing only if:
1672-
// 1. We have Dandelion destinations available
1673-
// 2. This TX has NOT already been successfully routed
1674-
// (prevents the spam bug where we re-send every second)
1675-
// 3. OR the previous Dandelion destination disconnected (destination changed)
1676-
if (hasDandelionDestinations && m_connman.m_dandelion_stem_routed.count(iter->first) == 0) {
1677-
CTransactionRef ptx = m_stempool.get(iter->first);
1678-
if (ptx) {
1679-
CInv inv(MSG_DANDELION_TX, iter->first);
1680-
bool pushed = m_connman.localDandelionDestinationPushInventory(inv);
1681-
if (pushed) {
1682-
LogPrint(BCLog::DANDELION, "CheckDandelionEmbargoes: Routed Dandelion transaction %s via stem\n", iter->first.ToString());
1683-
PushDandelionTransaction(iter->first);
1684-
// Mark as routed so we don't re-send every second
1685-
m_connman.m_dandelion_stem_routed.insert(iter->first);
1715+
// Embargo not yet expired — check if this TX needs stem routing:
1716+
// 1. We have Dandelion destinations available
1717+
// 2. This TX has NOT already been successfully routed
1718+
// (prevents the spam bug where we re-send every second)
1719+
// 3. OR the previous Dandelion destination disconnected (destination changed)
1720+
//
1721+
// We only COLLECT the txid here. The actual push happens in Phase 2,
1722+
// after we release m_dandelion_embargo_mutex, because
1723+
// localDandelionDestinationPushInventory() acquires m_nodes_mutex.
1724+
if (hasDandelionDestinations && m_connman.m_dandelion_stem_routed.count(iter->first) == 0) {
1725+
if (m_stempool.exists(iter->first)) {
1726+
txidsNeedingStemRoute.push_back(iter->first);
16861727
}
16871728
}
1729+
1730+
// Log remaining time
1731+
auto remaining = std::chrono::duration_cast<std::chrono::seconds>(iter->second - current_time).count();
1732+
LogPrint(BCLog::DANDELION, "Transaction %s embargo expires in %d seconds\n", iter->first.ToString(), remaining);
1733+
iter++;
16881734
}
1689-
1690-
// Log remaining time
1691-
auto remaining = std::chrono::duration_cast<std::chrono::seconds>(iter->second - current_time).count();
1692-
LogPrint(BCLog::DANDELION, "Transaction %s embargo expires in %d seconds\n", iter->first.ToString(), remaining);
1693-
iter++;
1735+
}
1736+
} // m_dandelion_embargo_mutex released here — safe to touch m_nodes_mutex now.
1737+
1738+
// Phase 2: perform stem routing WITHOUT holding m_dandelion_embargo_mutex.
1739+
// localDandelionDestinationPushInventory() acquires m_nodes_mutex, which is
1740+
// now safe because we no longer hold the embargo lock.
1741+
for (const auto& txid : txidsNeedingStemRoute) {
1742+
CTransactionRef ptx = m_stempool.get(txid);
1743+
if (!ptx) continue; // raced with expiry/mempool promotion — harmless
1744+
1745+
CInv inv(MSG_DANDELION_TX, txid);
1746+
bool pushed = m_connman.localDandelionDestinationPushInventory(inv);
1747+
if (pushed) {
1748+
LogPrint(BCLog::DANDELION, "CheckDandelionEmbargoes: Routed Dandelion transaction %s via stem\n", txid.ToString());
1749+
PushDandelionTransaction(txid);
1750+
// Re-acquire embargo lock briefly to mark this TX as routed, so we
1751+
// don't re-send it every cycle.
1752+
LOCK(m_connman.m_dandelion_embargo_mutex);
1753+
m_connman.m_dandelion_stem_routed.insert(txid);
16941754
}
16951755
}
16961756
}

0 commit comments

Comments
 (0)