
Commit 2aa66b8

Re-claim forwarded HTLCs on startup
Because `ChannelMonitorUpdate`s can complete asynchronously and out-of-order now, a `commitment_signed` `ChannelMonitorUpdate` from a downstream channel could complete prior to the preimage `ChannelMonitorUpdate` on the upstream channel. In that case, we may not get an `update_fulfill_htlc` replay on startup. Thus, we have to ensure any payment preimages contained in that downstream update are re-claimed on startup. Here we do this during the existing walk of the `ChannelMonitor` preimages for closed channels.
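As a rough illustration of the walk described above, here is a minimal, self-contained Rust sketch. `Monitor`, `Htlc`, and `Source` are hypothetical stand-ins, not LDK's real `ChannelMonitor`, HTLC, or `HTLCSource` types: for each monitor we collect the preimages of forwarded HTLCs (those with a previous hop), whether or not the downstream channel is closed, so the claims can be replayed against the upstream channels once the `ChannelManager` is fully constructed.

```rust
// Hypothetical stand-ins for `ChannelMonitor` data; not LDK's actual types.
#[derive(Clone)]
enum Source {
    // An HTLC we forwarded: its preimage must also be claimed on the upstream channel.
    PreviousHop { upstream_channel_id: u32 },
    // An HTLC for a payment we sent ourselves; handled by the outbound-payments logic.
    OutboundRoute,
}

struct Htlc {
    source: Source,
    // Present once the downstream peer has revealed the preimage.
    preimage: Option<[u8; 32]>,
    amount_msat: u64,
}

struct Monitor {
    channel_id: u32,
    outbound_htlcs: Vec<Htlc>,
}

/// The idea of the startup walk: collect (source, preimage, value, downstream id)
/// for every *forwarded* HTLC whose preimage is already in a monitor, so the claim
/// can be re-applied on the upstream edge even if the upstream monitor update was
/// lost to an out-of-order shutdown.
fn pending_claims_to_replay(monitors: &[Monitor]) -> Vec<(Source, [u8; 32], u64, u32)> {
    let mut claims = Vec::new();
    for monitor in monitors {
        for htlc in &monitor.outbound_htlcs {
            if let (Source::PreviousHop { .. }, Some(preimage)) = (&htlc.source, htlc.preimage) {
                claims.push((htlc.source.clone(), preimage, htlc.amount_msat, monitor.channel_id));
            }
        }
    }
    claims
}

fn main() {
    let monitors = vec![Monitor {
        channel_id: 42,
        outbound_htlcs: vec![Htlc {
            source: Source::PreviousHop { upstream_channel_id: 7 },
            preimage: Some([0x11; 32]),
            amount_msat: 1_000,
        }],
    }];
    // The forwarded HTLC's preimage is picked up for replay on the upstream channel.
    assert_eq!(pending_claims_to_replay(&monitors).len(), 1);
}
```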
1 parent 63217bc commit 2aa66b8

1 file changed: +91 -25 lines changed
lightning/src/ln/channelmanager.rs

Lines changed: 91 additions & 25 deletions
@@ -4654,7 +4654,7 @@ where
             for htlc in sources.drain(..) {
                 if let Err((pk, err)) = self.claim_funds_from_hop(
                     htlc.prev_hop, payment_preimage,
-                    |_| Some(MonitorUpdateCompletionAction::PaymentClaimed { payment_hash }))
+                    |_| Some(MonitorUpdateCompletionAction::PaymentClaimed { payment_hash }), false)
                 {
                     if let msgs::ErrorAction::IgnoreError = err.err.action {
                         // We got a temporary failure updating monitor, but will claim the
@@ -4684,7 +4684,7 @@ where
    }

    fn claim_funds_from_hop<ComplFunc: FnOnce(Option<u64>) -> Option<MonitorUpdateCompletionAction>>(&self,
-        prev_hop: HTLCPreviousHopData, payment_preimage: PaymentPreimage, completion_action: ComplFunc)
+        prev_hop: HTLCPreviousHopData, payment_preimage: PaymentPreimage, completion_action: ComplFunc, during_init: bool)
    -> Result<(), (PublicKey, MsgHandleErrInternal)> {
        //TODO: Delay the claimed_funds relaying just like we do outbound relay!

@@ -4714,14 +4714,26 @@ where
                        log_bytes!(chan_id), action);
                    peer_state.monitor_update_blocked_actions.entry(chan_id).or_insert(Vec::new()).push(action);
                }
-                let res = handle_new_monitor_update!(self, prev_hop.outpoint, monitor_update, peer_state_lock,
-                    peer_state, per_peer_state, chan);
-                if let Err(e) = res {
-                    // TODO: This is a *critical* error - we probably updated the outbound edge
-                    // of the HTLC's monitor with a preimage. We should retry this monitor
-                    // update over and over again until morale improves.
-                    log_error!(self.logger, "Failed to update channel monitor with preimage {:?}", payment_preimage);
-                    return Err((counterparty_node_id, e));
+                if !during_init {
+                    let res = handle_new_monitor_update!(self, prev_hop.outpoint, monitor_update, peer_state_lock,
+                        peer_state, per_peer_state, chan);
+                    if let Err(e) = res {
+                        // TODO: This is a *critical* error - we probably updated the outbound edge
+                        // of the HTLC's monitor with a preimage. We should retry this monitor
+                        // update over and over again until morale improves.
+                        log_error!(self.logger, "Failed to update channel monitor with preimage {:?}", payment_preimage);
+                        return Err((counterparty_node_id, e));
+                    }
+                } else {
+                    // If we're running during init we cannot update a monitor directly -
+                    // they probably haven't actually been loaded yet. Instead, push the
+                    // monitor update as a background event.
+                    self.pending_background_events.lock().unwrap().push(
+                        BackgroundEvent::MonitorUpdateRegeneratedOnStartup {
+                            counterparty_node_id,
+                            funding_txo: prev_hop.outpoint,
+                            update: monitor_update.clone(),
+                        });
                }
            }
            return Ok(());
@@ -4734,16 +4746,34 @@ where
                payment_preimage,
            }],
        };
-        // We update the ChannelMonitor on the backward link, after
-        // receiving an `update_fulfill_htlc` from the forward link.
-        let update_res = self.chain_monitor.update_channel(prev_hop.outpoint, &preimage_update);
-        if update_res != ChannelMonitorUpdateStatus::Completed {
-            // TODO: This needs to be handled somehow - if we receive a monitor update
-            // with a preimage we *must* somehow manage to propagate it to the upstream
-            // channel, or we must have an ability to receive the same event and try
-            // again on restart.
-            log_error!(self.logger, "Critical error: failed to update channel monitor with preimage {:?}: {:?}",
-                payment_preimage, update_res);
+
+        if !during_init {
+            // We update the ChannelMonitor on the backward link, after
+            // receiving an `update_fulfill_htlc` from the forward link.
+            let update_res = self.chain_monitor.update_channel(prev_hop.outpoint, &preimage_update);
+            if update_res != ChannelMonitorUpdateStatus::Completed {
+                // TODO: This needs to be handled somehow - if we receive a monitor update
+                // with a preimage we *must* somehow manage to propagate it to the upstream
+                // channel, or we must have an ability to receive the same event and try
+                // again on restart.
+                log_error!(self.logger, "Critical error: failed to update channel monitor with preimage {:?}: {:?}",
+                    payment_preimage, update_res);
+            }
+        } else {
+            // If we're running during init we cannot update a monitor directly - they probably
+            // haven't actually been loaded yet. Instead, push the monitor update as a background
+            // event.
+            // Note that while it's safe to use `ClosingMonitorUpdateRegeneratedOnStartup` here (the
+            // channel is already closed) we need to ultimately handle the monitor update
+            // completion action only after we've completed the monitor update. This is the only
+            // way to guarantee this update *will* be regenerated on startup (otherwise if this was
+            // from a forwarded HTLC the downstream preimage may be deleted before we claim
+            // upstream). Thus, we need to transition to some new `BackgroundEvent` type which will
+            // complete the monitor update completion action from `completion_action`.
+            self.pending_background_events.lock().unwrap().push(
+                BackgroundEvent::ClosingMonitorUpdateRegeneratedOnStartup((
+                    prev_hop.outpoint, preimage_update,
+                )));
        }
        // Note that we do process the completion action here. This totally could be a
        // duplicate claim, but we have no way of knowing without interrogating the
@@ -4758,9 +4788,10 @@ where
        self.pending_outbound_payments.finalize_claims(sources, &self.pending_events);
    }

-    fn claim_funds_internal(&self, source: HTLCSource, payment_preimage: PaymentPreimage, forwarded_htlc_value_msat: Option<u64>, from_onchain: bool, next_channel_id: [u8; 32]) {
+    fn claim_funds_internal(&self, source: HTLCSource, payment_preimage: PaymentPreimage, forwarded_htlc_value_msat: Option<u64>, from_onchain: bool, next_channel_id: [u8; 32], during_init: bool) {
        match source {
            HTLCSource::OutboundRoute { session_priv, payment_id, path, .. } => {
+                debug_assert!(!during_init);
                self.pending_outbound_payments.claim_htlc(payment_id, payment_preimage, session_priv, path, from_onchain, &self.pending_events, &self.logger);
            },
            HTLCSource::PreviousHopData(hop_data) => {
@@ -4783,7 +4814,7 @@ where
                        downstream_counterparty_and_funding_outpoint: None,
                    })
                } else { None }
-            });
+            }, during_init);
            if let Err((pk, err)) = res {
                let result: Result<(), _> = Err(err);
                let _ = handle_error!(self, result, pk);
@@ -5531,7 +5562,7 @@ where
                hash_map::Entry::Vacant(_) => return Err(MsgHandleErrInternal::send_err_msg_no_close(format!("Got a message for a channel from the wrong node! No such channel for the passed counterparty_node_id {}", counterparty_node_id), msg.channel_id))
            }
        };
-        self.claim_funds_internal(htlc_source, msg.payment_preimage.clone(), Some(forwarded_htlc_value), false, msg.channel_id);
+        self.claim_funds_internal(htlc_source, msg.payment_preimage.clone(), Some(forwarded_htlc_value), false, msg.channel_id, false);
        Ok(())
    }

@@ -5901,7 +5932,7 @@ where
                MonitorEvent::HTLCEvent(htlc_update) => {
                    if let Some(preimage) = htlc_update.payment_preimage {
                        log_trace!(self.logger, "Claiming HTLC with preimage {} from our monitor", log_bytes!(preimage.0));
-                        self.claim_funds_internal(htlc_update.source, preimage, htlc_update.htlc_value_satoshis.map(|v| v * 1000), true, funding_outpoint.to_channel_id());
+                        self.claim_funds_internal(htlc_update.source, preimage, htlc_update.htlc_value_satoshis.map(|v| v * 1000), true, funding_outpoint.to_channel_id(), false);
                    } else {
                        log_trace!(self.logger, "Failing HTLC with hash {} from our monitor", log_bytes!(htlc_update.payment_hash.0));
                        let receiver = HTLCDestination::NextHopChannel { node_id: counterparty_node_id, channel_id: funding_outpoint.to_channel_id() };
@@ -8485,6 +8516,11 @@ where
        // Note that we have to do the above replays before we push new monitor updates.
        pending_background_events.append(&mut close_background_events);

+        // If there are any preimages for forwarded HTLCs hanging around in ChannelMonitors we
+        // should ensure we try them again on the inbound edge. We put them here and do so after we
+        // have a fully-constructed `ChannelManager` at the end.
+        let mut pending_claims_to_replay = Vec::new();
+
        {
            // If we're tracking pending payments, ensure we haven't lost any by looking at the
            // ChannelMonitor data for any channels for which we do not have authoritative state
@@ -8495,7 +8531,8 @@ where
            // We only rebuild the pending payments map if we were most recently serialized by
            // 0.0.102+
            for (_, monitor) in args.channel_monitors.iter() {
-                if id_to_peer.get(&monitor.get_funding_txo().0.to_channel_id()).is_none() {
+                let counterparty_opt = id_to_peer.get(&monitor.get_funding_txo().0.to_channel_id());
+                if counterparty_opt.is_none() {
                    for (htlc_source, (htlc, _)) in monitor.get_pending_or_resolved_outbound_htlcs() {
                        if let HTLCSource::OutboundRoute { payment_id, session_priv, path, .. } = htlc_source {
                            if path.hops.is_empty() {
@@ -8589,6 +8626,30 @@ where
                        }
                    }
                }
+
+                // Whether the downstream channel was closed or not, try to re-apply any payment
+                // preimages from it which may be needed in upstream channels for forwarded
+                // payments.
+                let outbound_claimed_htlcs_iter = monitor.get_all_current_outbound_htlcs()
+                    .into_iter()
+                    .filter_map(|(htlc_source, (htlc, preimage_opt))| {
+                        if let HTLCSource::PreviousHopData(_) = htlc_source {
+                            if let Some(payment_preimage) = preimage_opt {
+                                Some((htlc_source, payment_preimage, htlc.amount_msat,
+                                    counterparty_opt.is_none(), // i.e. the downstream chan is closed
+                                    monitor.get_funding_txo().0.to_channel_id()))
+                            } else { None }
+                        } else {
+                            // If it was an outbound payment, we've handled it above - if a preimage
+                            // came in and we persisted the `ChannelManager` we either handled it and
+                            // are good to go or the channel force-closed - we don't have to handle the
+                            // channel still live case here.
+                            None
+                        }
+                    });
+                for tuple in outbound_claimed_htlcs_iter {
+                    pending_claims_to_replay.push(tuple);
+                }
            }
        }

@@ -8840,6 +8901,11 @@ where
        for (source, payment_hash, counterparty_node_id, channel_id) in failed_htlcs.drain(..) {
            channel_manager.fail_htlc_backwards_internal(&source, &payment_hash, &reason, receiver);
        }

+        for (source, preimage, downstream_value, downstream_closed, downstream_chan_id) in pending_claims_to_replay {
+            channel_manager.claim_funds_internal(source, preimage, Some(downstream_value),
+                downstream_closed, downstream_chan_id, true);
+        }
+
        //TODO: Broadcast channel update for closed channels, but only after we've made a
        //connection or two.
88458911
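For readers following the diff above: the heart of the change is the new `during_init` flag threaded through `claim_funds_from_hop` and `claim_funds_internal`. Below is a minimal sketch of that split, with hypothetical simplified types rather than LDK's real `ChainMonitor`, `ChannelMonitorUpdate`, or `BackgroundEvent` definitions: outside of init the preimage update is applied to the monitor immediately; during init it is queued as a background event to be applied after startup.

```rust
use std::sync::Mutex;

// Hypothetical simplified types; LDK's real update and event types carry much more.
struct MonitorUpdate { preimage: [u8; 32] }

enum BackgroundEvent {
    MonitorUpdateRegeneratedOnStartup { funding_txo: u64, update: MonitorUpdate },
}

struct Manager {
    pending_background_events: Mutex<Vec<BackgroundEvent>>,
}

impl Manager {
    fn claim_funds_from_hop(&self, funding_txo: u64, update: MonitorUpdate, during_init: bool) {
        if !during_init {
            // Normal operation: apply the preimage update to the monitor right away.
            apply_monitor_update(funding_txo, &update);
        } else {
            // During `ChannelManager` deserialization the monitors may not be
            // loaded yet, so queue the update as a background event. Because the
            // event is regenerated on every startup until it completes, a crash
            // in between cannot lose the upstream claim.
            self.pending_background_events.lock().unwrap().push(
                BackgroundEvent::MonitorUpdateRegeneratedOnStartup { funding_txo, update });
        }
    }
}

fn apply_monitor_update(_funding_txo: u64, _update: &MonitorUpdate) { /* elided */ }

fn main() {
    let mgr = Manager { pending_background_events: Mutex::new(Vec::new()) };
    mgr.claim_funds_from_hop(1, MonitorUpdate { preimage: [0x11; 32] }, true);
    assert_eq!(mgr.pending_background_events.lock().unwrap().len(), 1);
}
```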

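A second subtlety, called out in the `ClosingMonitorUpdateRegeneratedOnStartup` comment in the diff: the monitor-update completion action (which can allow the downstream preimage to be forgotten) must only run after the upstream preimage update has completed, otherwise a crash in between could delete the downstream preimage before the upstream claim exists. A tiny sketch of that ordering requirement, with hypothetical helper names:

```rust
// Hypothetical helper illustrating the ordering requirement: persist the
// upstream preimage update first, and only then run the completion action
// that may allow the downstream preimage to be deleted. If we crash before
// the action runs, the background event is simply regenerated next startup.
fn replay_background_update(
    apply_upstream_update: impl FnOnce() -> Result<(), ()>,
    completion_action: impl FnOnce(),
) {
    if apply_upstream_update().is_ok() {
        completion_action();
    }
    // On Err we deliberately skip the action; the update (and thus the action)
    // will be retried on the next startup.
}

fn main() {
    replay_background_update(
        || Ok(()),
        || println!("upstream claim durable; downstream preimage may now be released"),
    );
}
```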