Skip to content

Commit 81eabd1

Browse files
authored
Merge pull request #134 from synonymdev/feat/channel-monitor-desync-repro
feat: add channel monitor desync repro tooling
2 parents 0ca711f + b116c10 commit 81eabd1

8 files changed

Lines changed: 665 additions & 6 deletions

File tree

docker/bitcoin-cli

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ Commands:
2828
getInvoice <amount> Get a new BIP21 URI with a bech32 address
2929
LND:
3030
getinfo Show LND node info (for connectivity debugging)
31+
openchannel <node_id> [amount] Open channel from LND to node (default: 500000 sats)
32+
payinvoice <invoice> [amount] Pay a Lightning invoice via LND
3133
holdinvoice [amount] [-m memo] Create a hold invoice
3234
settleinvoice <preimage> Reveal a preimage and use it to settle the corresponding invoice
3335
cancelinvoice <payment_hash> Cancels a currently open invoice
@@ -196,6 +198,127 @@ if [[ "$command" = "getinfo" ]]; then
196198
exit
197199
fi
198200

201+
# Open channel from LND to a node
202+
if [[ "$command" = "openchannel" ]]; then
203+
shift
204+
205+
node_id="${1:-}"
206+
amount="${2:-500000}"
207+
208+
if [ -z "$node_id" ]; then
209+
echo "Usage: $CLI_NAME openchannel <node_id> [amount_sats]"
210+
echo ""
211+
echo " node_id: app's Lightning node ID (Settings > Advanced > Lightning Node Info)"
212+
echo " amount: channel size in sats (default: 500000)"
213+
exit 1
214+
fi
215+
216+
# Check peer connection
217+
echo "→ Checking peer connection..."
218+
peer_count=$("${LNCLI_CMD[@]}" listpeers 2>/dev/null | jq "[.peers[] | select(.pub_key==\"$node_id\")] | length")
219+
220+
if [ "$peer_count" = "0" ]; then
221+
lnd_pubkey=$("${LNCLI_CMD[@]}" getinfo 2>/dev/null | jq -r '.identity_pubkey')
222+
echo "✗ Node is not connected as a peer."
223+
echo ""
224+
echo " Paste this in the app (Settings > Advanced > Channels > Add Connection):"
225+
echo " ${lnd_pubkey}@0.0.0.0:9735"
226+
echo ""
227+
echo " Then re-run this command."
228+
exit 1
229+
fi
230+
231+
echo "✓ Peer connected"
232+
233+
# Fund LND if needed
234+
balance=$("${LNCLI_CMD[@]}" walletbalance 2>/dev/null | jq -r '.confirmed_balance')
235+
echo "→ LND on-chain balance: $balance sats"
236+
237+
if [ "$balance" -lt "$amount" ]; then
238+
echo "→ Funding LND..."
239+
lnd_addr=$("${LNCLI_CMD[@]}" newaddress p2wkh 2>/dev/null | jq -r '.address')
240+
"${BASE_COMMAND[@]}" -named sendtoaddress address="$lnd_addr" amount=1 fee_rate=25 > /dev/null
241+
"${BASE_COMMAND[@]}" -generate 6 > /dev/null
242+
echo "✓ Funded LND with 1 BTC"
243+
sleep 2
244+
fi
245+
246+
# Open channel
247+
echo "→ Opening ${amount} sat channel to ${node_id:0:20}..."
248+
result=$("${LNCLI_CMD[@]}" openchannel --node_key "$node_id" --local_amt "$amount" --private 2>&1) || {
249+
echo "✗ Failed: $result"
250+
exit 1
251+
}
252+
253+
txid=$(echo "$result" | jq -r '.funding_txid // empty' 2>/dev/null)
254+
if [ -z "$txid" ]; then
255+
echo "✗ Failed: $result"
256+
exit 1
257+
fi
258+
259+
echo "✓ Channel opened, funding txid: $txid"
260+
261+
# Mine and wait
262+
echo "→ Mining 6 blocks..."
263+
"${BASE_COMMAND[@]}" -generate 6 > /dev/null
264+
echo "✓ Mined 6 blocks"
265+
266+
echo "→ Waiting for channel to become active..."
267+
for i in $(seq 1 30); do
268+
sleep 2
269+
active=$("${LNCLI_CMD[@]}" listchannels --peer "$node_id" --active_only 2>/dev/null | jq '.channels | length')
270+
if [ "$active" != "0" ]; then
271+
echo "✓ Channel is active!"
272+
break
273+
fi
274+
if [ $((i % 5)) -eq 0 ]; then echo " still waiting... ($i)"; fi
275+
done
276+
277+
if [ "$active" = "0" ]; then
278+
echo "⚠ Channel not active yet. May need more time or app needs to sync."
279+
fi
280+
281+
# Summary
282+
echo ""
283+
echo "══════════════════════════════════"
284+
"${LNCLI_CMD[@]}" channelbalance 2>/dev/null | jq -r '" LND outbound: \(.local_balance.sat) sats (can pay app)\n LND inbound: \(.remote_balance.sat) sats (can receive from app)"'
285+
echo "══════════════════════════════════"
286+
exit
287+
fi
288+
289+
# Pay a Lightning invoice via LND
290+
if [[ "$command" = "payinvoice" ]]; then
291+
shift
292+
293+
invoice="${1:-}"
294+
amount="${2:-}"
295+
296+
if [ -z "$invoice" ]; then
297+
echo "Usage: $CLI_NAME payinvoice <invoice> [amount_sats]"
298+
exit 1
299+
fi
300+
301+
if [ -n "$amount" ]; then
302+
echo "→ Paying invoice via LND (${amount} sats)..."
303+
result=$("${LNCLI_CMD[@]}" payinvoice --force --amt "$amount" "$invoice" 2>&1)
304+
else
305+
echo "→ Paying invoice via LND..."
306+
result=$("${LNCLI_CMD[@]}" payinvoice --force "$invoice" 2>&1)
307+
fi
308+
309+
status=$(echo "$result" | grep -i "status" | head -1)
310+
if echo "$result" | grep -qi "SUCCEEDED"; then
311+
echo "✓ Payment succeeded"
312+
echo "$result" | grep -i "payment_hash\|payment_preimage" | head -2
313+
else
314+
echo "✗ Payment failed"
315+
echo "$result"
316+
exit 1
317+
fi
318+
319+
exit
320+
fi
321+
199322
# Create a hold invoice (LND)
200323
if [[ "$command" = "holdinvoice" ]]; then
201324
shift

docs/lightning-primer-for-qa.md

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# Lightning primer for QA
2+
3+
Background for [channel monitor desync repro](./repro-channel-monitor-desync.md) and any work that touches Lightning storage, migration, or startup.
4+
5+
## What a Lightning channel is (operationally)
6+
7+
Two parties lock funds in a 2-of-2 on-chain output. **Off-chain**, they exchange **commitment transactions** that encode “who gets what if we publish now.” Each new off-chain state is a **commitment update**. LDK tracks progress with an internal **`update_id`** (a monotonic counter per channel).
8+
9+
- **ChannelManager** — current view of all channels, balances, and pending HTLCs.
10+
- **ChannelMonitor** — per-channel state used to watch the chain, enforce penalties, and react to force-closes. It must stay **consistent** with what the ChannelManager believes.
11+
12+
The **chain::Watch** contract (simplified): durable storage must reflect **latest** ChannelMonitor data **before** the app continues as if that state is live. If an old monitor is paired with an advanced manager, LDK reports **`DangerousValue`** and refuses to start — that protects funds.
13+
14+
## HTLCs
15+
16+
**HTLC** means **Hash Time-Locked Contract**. It is a conditional payment: pay the peer if they reveal a preimage by a deadline; otherwise revert. HTLCs live **inside** commitment updates. Each hop of a multi-hop payment adds HTLCs; resolving them advances commitment state again.
17+
18+
Testing “payments” matters because each payment usually causes **multiple** commitment updates, not a 1:1 mapping to “one payment = one update_id step.”
19+
20+
## “Gap” in the test matrix (e.g. 21 / 30 payments)
21+
22+
The doc’s payment counts are a **proxy for many `update_id` advances**, not a magic number from BOLT math.
23+
24+
- **Small** mismatch between an old backup and the live node may be **healed** via peer reconnection and commitment replay.
25+
- **Large** mismatch, or injecting a **stale monitor** on top of an **advanced** manager, triggers **stale ChannelMonitor** errors and a refused start until recovery.
26+
27+
## What went wrong in the ChannelMonitor desync bug
28+
29+
1. **ChannelManager** on device was **ahead** (normal usage after RN migration).
30+
2. **Old ChannelMonitor** data (e.g. from RN remote backup) was applied without matching the current manager.
31+
3. On load: monitor `update_id` ≪ manager → **stale monitor****`DangerousValue`** → node will not run.
32+
33+
The **fix path** uses **`accept_stale_channel_monitors`** so ldk-node can align state and **self-heal** (commitment round-trips, chain sync). That is why recovery logs show retries, healing, and sometimes **over a minute** before balances and payments look normal — especially with **many blocks** to sync (e.g. T5) or **local LND** setups vs Blocktank-only flows.
34+
35+
## What to test when Lightning / LDK storage changes
36+
37+
| Area | Why |
38+
|------|-----|
39+
| **Cold start** | Any path that reads/writes ChannelManager, monitors, or VSS must not pair **new** manager with **old** monitor. |
40+
| **Backup / restore** | Restoring must be **consistent snapshots**; partial or older monitor alone is high risk. |
41+
| **Migration** | RN → native or schema changes: avoid overwriting live data with **stale** remote copies. |
42+
| **Recovery** | After `DangerousValue` / `accept_stale`: peers reconnect, chain sync completes, **inbound and outbound** payments work, **second launch** does not repeat recovery forever. |
43+
| **Infra noise** | On regtest, **stale RGS** / gossip can cause transient **“route not found”** — distinguish from persistence bugs (see logs for `DangerousValue` vs routing errors). |
44+
45+
## Risks of incorrect “fixes”
46+
47+
- Skipping or weakening persistence checks can lead to **wrong** enforcement keys or **missed** on-chain reactions.
48+
- Blindly merging backups can recreate the **stale monitor** class of bug.
49+
- Recovery paths should always be validated with **real sends/receives** and **restart**, not only “app opens.”
50+
51+
## Glossary
52+
53+
| Term | Meaning |
54+
|------|--------|
55+
| **Commitment update** | New off-chain state (balances + HTLC set). |
56+
| **`update_id`** | LDK’s persisted notion of how far the ChannelMonitor has advanced vs the ChannelManager for that channel. |
57+
| **HTLC** | **Hash Time-Locked Contract** — conditional payment inside a commitment (hash lock + time lock). |
58+
| **ChannelMonitor** | Per-channel persisted state for chain watching and dispute handling. |
59+
| **DangerousValue** | LDK/ldk-node refusing to load because continuing would violate safety assumptions (e.g. stale monitor). |
60+
| **accept_stale_channel_monitors** | Explicit recovery mode to load despite mismatch, then heal via protocol + sync (use only in controlled recovery). |
61+
62+
## See also
63+
64+
- [repro-channel-monitor-desync.md](./repro-channel-monitor-desync.md) — repro steps, matrix, recovery timing notes

0 commit comments

Comments
 (0)