@@ -1627,70 +1627,130 @@ void PeerManagerImpl::RelayDandelionTransaction(const CTransaction& tx, CNode* p
16271627
16281628void PeerManagerImpl::CheckDandelionEmbargoes ()
16291629{
1630- LOCK (m_connman.m_dandelion_embargo_mutex );
1631- auto current_time = GetTime<std::chrono::milliseconds>();
1632-
1633- // Log embargo checks (debug level only — this fires every second)
1634- if (!m_connman.mDandelionEmbargo .empty ()) {
1635- LogPrint (BCLog::DANDELION, " CheckDandelionEmbargoes: Checking %d embargoed transactions (stempool=%d, mempool=%d)\n " ,
1636- m_connman.mDandelionEmbargo .size (), m_stempool.size (), m_mempool.size ());
1637- }
1638-
1639- // Check if we now have Dandelion destinations available for stuck transactions
1630+ // =========================================================================
1631+ // Bug #29 fix: ABBA deadlock between m_nodes_mutex and m_dandelion_embargo_mutex
1632+ //
1633+ // The established lock ordering throughout the Dandelion++ code is:
1634+ // m_nodes_mutex FIRST, then m_dandelion_embargo_mutex SECOND
1635+ //
1636+ // This ordering is used by:
1637+ // - DandelionShuffle() (dandelion.cpp:344 → 357)
1638+ // - CloseDandelionConnections() (dandelion.cpp:211 → 292)
1639+ // - DisconnectNodes() (net.cpp:1952 → CloseDandelionConnections)
1640+ //
1641+ // Before this fix, CheckDandelionEmbargoes() violated that ordering:
1642+ // it held m_dandelion_embargo_mutex, then called usingDandelion() and
1643+ // localDandelionDestinationPushInventory(), both of which acquire
1644+ // m_nodes_mutex internally. This created a classic ABBA deadlock:
1645+ //
1646+ // Thread A (shuffle timer): LOCK(m_nodes_mutex) → LOCK(m_dandelion_embargo_mutex)
1647+ // Thread B (embargo timer): LOCK(m_dandelion_embargo_mutex) → LOCK(m_nodes_mutex)
1648+ //
1649+ // When both fire concurrently, each thread holds the lock the other needs.
1650+ // Result: sendtoaddress hangs forever at "Processing Dandelion relay",
1651+ // shutdown hangs on threadDandelionShuffle.join(), RPC times out.
1652+ // (Reported by DanGB on Windows 11, RC26, reproducible after ~1 week uptime.)
1653+ //
1654+ // Fix: restructure this function into two phases:
1655+ // Phase 1 — under m_dandelion_embargo_mutex: scan the embargo map, handle
1656+ // expired/mempool entries, collect txids that need stem routing.
1657+ // Phase 2 — after releasing m_dandelion_embargo_mutex: perform the stem
1658+ // routing (which needs m_nodes_mutex), then briefly re-acquire
1659+ // m_dandelion_embargo_mutex to mark them as routed.
1660+ //
1661+ // usingDandelion() is also moved before the embargo lock for the same reason.
1662+ // The bool may be momentarily stale, but that only means we skip one routing
1663+ // cycle (~1 second) — no correctness impact.
1664+ // =========================================================================
1665+
1666+ // Phase 0: query Dandelion destination availability WITHOUT holding the
1667+ // embargo lock. usingDandelion() acquires m_nodes_mutex internally.
16401668 bool hasDandelionDestinations = m_connman.usingDandelion ();
1641-
1642- for (auto iter = m_connman.mDandelionEmbargo .begin (); iter != m_connman.mDandelionEmbargo .end ();) {
1643- if (m_mempool.exists (iter->first )) {
1644- LogPrint (BCLog::DANDELION, " Embargoed dandeliontx %s found in mempool; removing from embargo map\n " , iter->first .ToString ());
1645- m_connman.m_dandelion_stem_routed .erase (iter->first );
1646- iter = m_connman.mDandelionEmbargo .erase (iter);
1647- } else if (iter->second < current_time) {
1648- LogPrintf (" CheckDandelionEmbargoes: dandeliontx %s embargo expired\n " , iter->first .ToString ());
1649- CTransactionRef ptx = m_stempool.get (iter->first );
1650- if (ptx) {
1651- LogPrintf (" CheckDandelionEmbargoes: Moving transaction %s from stempool to mempool for broadcast\n " , iter->first .ToString ());
1652- {
1653- LOCK (cs_main);
1654- const MempoolAcceptResult result = AcceptToMemoryPool (m_chainman.ActiveChainstate (), m_mempool, ptx, false );
1655- if (result.m_result_type == MempoolAcceptResult::ResultType::VALID) {
1656- LogPrintf (" CheckDandelionEmbargoes: Successfully moved tx %s to mempool\n " , iter->first .ToString ());
1657- LogPrint (BCLog::MEMPOOL, " AcceptToMemoryPool: accepted %s (poolsz %u txn, %u kB)\n " ,
1658- iter->first .ToString (), m_mempool.size (), m_mempool.DynamicMemoryUsage () / 1000 );
1659- RelayTransaction (ptx->GetHash (), ptx->GetWitnessHash ());
1660- } else {
1661- LogPrintf (" CheckDandelionEmbargoes: Failed to move tx %s to mempool: %s\n " ,
1662- iter->first .ToString (), result.m_state .ToString ());
1669+
1670+ // Transactions collected during Phase 1 that need stem routing in Phase 2.
1671+ std::vector<uint256> txidsNeedingStemRoute;
1672+
1673+ // Phase 1: scan embargo map under m_dandelion_embargo_mutex.
1674+ // Everything in this block touches only embargo-protected state (plus
1675+ // cs_main for AcceptToMemoryPool, which has no ordering conflict here).
1676+ {
1677+ LOCK (m_connman.m_dandelion_embargo_mutex );
1678+ auto current_time = GetTime<std::chrono::milliseconds>();
1679+
1680+ // Log embargo checks (debug level only — this fires every second)
1681+ if (!m_connman.mDandelionEmbargo .empty ()) {
1682+ LogPrint (BCLog::DANDELION, " CheckDandelionEmbargoes: Checking %d embargoed transactions (stempool=%d, mempool=%d)\n " ,
1683+ m_connman.mDandelionEmbargo .size (), m_stempool.size (), m_mempool.size ());
1684+ }
1685+
1686+ for (auto iter = m_connman.mDandelionEmbargo .begin (); iter != m_connman.mDandelionEmbargo .end ();) {
1687+ if (m_mempool.exists (iter->first )) {
1688+ LogPrint (BCLog::DANDELION, " Embargoed dandeliontx %s found in mempool; removing from embargo map\n " , iter->first .ToString ());
1689+ m_connman.m_dandelion_stem_routed .erase (iter->first );
1690+ iter = m_connman.mDandelionEmbargo .erase (iter);
1691+ } else if (iter->second < current_time) {
1692+ LogPrintf (" CheckDandelionEmbargoes: dandeliontx %s embargo expired\n " , iter->first .ToString ());
1693+ CTransactionRef ptx = m_stempool.get (iter->first );
1694+ if (ptx) {
1695+ LogPrintf (" CheckDandelionEmbargoes: Moving transaction %s from stempool to mempool for broadcast\n " , iter->first .ToString ());
1696+ {
1697+ LOCK (cs_main);
1698+ const MempoolAcceptResult result = AcceptToMemoryPool (m_chainman.ActiveChainstate (), m_mempool, ptx, false );
1699+ if (result.m_result_type == MempoolAcceptResult::ResultType::VALID) {
1700+ LogPrintf (" CheckDandelionEmbargoes: Successfully moved tx %s to mempool\n " , iter->first .ToString ());
1701+ LogPrint (BCLog::MEMPOOL, " AcceptToMemoryPool: accepted %s (poolsz %u txn, %u kB)\n " ,
1702+ iter->first .ToString (), m_mempool.size (), m_mempool.DynamicMemoryUsage () / 1000 );
1703+ RelayTransaction (ptx->GetHash (), ptx->GetWitnessHash ());
1704+ } else {
1705+ LogPrintf (" CheckDandelionEmbargoes: Failed to move tx %s to mempool: %s\n " ,
1706+ iter->first .ToString (), result.m_state .ToString ());
1707+ }
16631708 }
1709+ } else {
1710+ LogPrintf (" CheckDandelionEmbargoes: Transaction %s not found in stempool!\n " , iter->first .ToString ());
16641711 }
1712+ m_connman.m_dandelion_stem_routed .erase (iter->first );
1713+ iter = m_connman.mDandelionEmbargo .erase (iter);
16651714 } else {
1666- LogPrintf (" CheckDandelionEmbargoes: Transaction %s not found in stempool!\n " , iter->first .ToString ());
1667- }
1668- m_connman.m_dandelion_stem_routed .erase (iter->first );
1669- iter = m_connman.mDandelionEmbargo .erase (iter);
1670- } else {
1671- // Embargo not yet expired — attempt Dandelion stem routing only if:
1672- // 1. We have Dandelion destinations available
1673- // 2. This TX has NOT already been successfully routed
1674- // (prevents the spam bug where we re-send every second)
1675- // 3. OR the previous Dandelion destination disconnected (destination changed)
1676- if (hasDandelionDestinations && m_connman.m_dandelion_stem_routed .count (iter->first ) == 0 ) {
1677- CTransactionRef ptx = m_stempool.get (iter->first );
1678- if (ptx) {
1679- CInv inv (MSG_DANDELION_TX, iter->first );
1680- bool pushed = m_connman.localDandelionDestinationPushInventory (inv);
1681- if (pushed) {
1682- LogPrint (BCLog::DANDELION, " CheckDandelionEmbargoes: Routed Dandelion transaction %s via stem\n " , iter->first .ToString ());
1683- PushDandelionTransaction (iter->first );
1684- // Mark as routed so we don't re-send every second
1685- m_connman.m_dandelion_stem_routed .insert (iter->first );
1715+ // Embargo not yet expired — check if this TX needs stem routing:
1716+ // 1. We have Dandelion destinations available
1717+ // 2. This TX has NOT already been successfully routed
1718+ // (prevents the spam bug where we re-send every second)
1719+ // 3. OR the previous Dandelion destination disconnected (destination changed)
1720+ //
1721+ // We only COLLECT the txid here. The actual push happens in Phase 2,
1722+ // after we release m_dandelion_embargo_mutex, because
1723+ // localDandelionDestinationPushInventory() acquires m_nodes_mutex.
1724+ if (hasDandelionDestinations && m_connman.m_dandelion_stem_routed .count (iter->first ) == 0 ) {
1725+ if (m_stempool.exists (iter->first )) {
1726+ txidsNeedingStemRoute.push_back (iter->first );
16861727 }
16871728 }
1729+
1730+ // Log remaining time
1731+ auto remaining = std::chrono::duration_cast<std::chrono::seconds>(iter->second - current_time).count ();
1732+ LogPrint (BCLog::DANDELION, " Transaction %s embargo expires in %d seconds\n " , iter->first .ToString (), remaining);
1733+ iter++;
16881734 }
1689-
1690- // Log remaining time
1691- auto remaining = std::chrono::duration_cast<std::chrono::seconds>(iter->second - current_time).count ();
1692- LogPrint (BCLog::DANDELION, " Transaction %s embargo expires in %d seconds\n " , iter->first .ToString (), remaining);
1693- iter++;
1735+ }
1736+ } // m_dandelion_embargo_mutex released here — safe to touch m_nodes_mutex now.
1737+
1738+ // Phase 2: perform stem routing WITHOUT holding m_dandelion_embargo_mutex.
1739+ // localDandelionDestinationPushInventory() acquires m_nodes_mutex, which is
1740+ // now safe because we no longer hold the embargo lock.
1741+ for (const auto & txid : txidsNeedingStemRoute) {
1742+ CTransactionRef ptx = m_stempool.get (txid);
1743+ if (!ptx) continue ; // raced with expiry/mempool promotion — harmless
1744+
1745+ CInv inv (MSG_DANDELION_TX, txid);
1746+ bool pushed = m_connman.localDandelionDestinationPushInventory (inv);
1747+ if (pushed) {
1748+ LogPrint (BCLog::DANDELION, " CheckDandelionEmbargoes: Routed Dandelion transaction %s via stem\n " , txid.ToString ());
1749+ PushDandelionTransaction (txid);
1750+ // Re-acquire embargo lock briefly to mark this TX as routed, so we
1751+ // don't re-send it every cycle.
1752+ LOCK (m_connman.m_dandelion_embargo_mutex );
1753+ m_connman.m_dandelion_stem_routed .insert (txid);
16941754 }
16951755 }
16961756}
0 commit comments