From df9c15de75aca8dd6368e4ef8f475237f83ea632 Mon Sep 17 00:00:00 2001 From: Matt Corallo Date: Sun, 10 Dec 2023 03:28:37 +0000 Subject: [PATCH 1/6] Cache whether a node is a first-hop target in the per-node state When processing the main loop during routefinding, for each node, we check whether it happens to be our peer in one of our channels. This ensures we never fail to find a route that takes a hop through a private channel of ours, to a private node, then through invoice-provided route hints to reach the ultimate payee. Because this is incredibly hot code, doing a full `HashMap` lookup to check if each node is a first-hop target ends up eating a good chunk of time during routing. Luckily, we can trivially avoid this cost. Because we're already looking up the per-node state in the `dist` map, we can store a bool in each first-hop target's state, avoiding the lookup unless we know its going to succeed. This requires storing a dummy entry in `dist`, which feels somewhat strange, but is ultimately fine as we should never be looking at per-node state unless we've already found a path to that node, updating the fields in doign so. --- lightning/src/routing/router.rs | 63 +++++++++++++++++++++++++++------ 1 file changed, 52 insertions(+), 11 deletions(-) diff --git a/lightning/src/routing/router.rs b/lightning/src/routing/router.rs index fd1344ff031..66b2d87e535 100644 --- a/lightning/src/routing/router.rs +++ b/lightning/src/routing/router.rs @@ -1772,6 +1772,14 @@ struct PathBuildingHop<'a> { /// decrease as well. Thus, we have to explicitly track which nodes have been processed and /// avoid processing them again. was_processed: bool, + /// When processing a node as the next best-score candidate, we want to quickly check if it is + /// a direct counterparty of ours, using our local channel information immediately if we can. + /// + /// In order to do so efficiently, we cache whether a node is a direct counterparty here at the + /// start of a route-finding pass. Unlike all other fields in this struct, this field is never + /// updated after being initialized - it is set at the start of a route-finding pass and only + /// read thereafter. + is_first_hop_target: bool, /// Used to compare channels when choosing the for routing. /// Includes paying for the use of a hop and the following hops, as well as /// an estimated cost of reaching this hop. @@ -1810,6 +1818,7 @@ impl<'a> core::fmt::Debug for PathBuildingHop<'a> { .field("source_node_id", &self.candidate.source()) .field("target_node_id", &self.candidate.target()) .field("short_channel_id", &self.candidate.short_channel_id()) + .field("is_first_hop_target", &self.is_first_hop_target) .field("total_fee_msat", &self.total_fee_msat) .field("next_hops_fee_msat", &self.next_hops_fee_msat) .field("hop_use_fee_msat", &self.hop_use_fee_msat) @@ -2516,6 +2525,7 @@ where L::Target: Logger { path_htlc_minimum_msat, path_penalty_msat: u64::max_value(), was_processed: false, + is_first_hop_target: false, #[cfg(all(not(ldk_bench), any(test, fuzzing)))] value_contribution_msat, }); @@ -2679,12 +2689,14 @@ where L::Target: Logger { let fee_to_target_msat; let next_hops_path_htlc_minimum_msat; let next_hops_path_penalty_msat; + let is_first_hop_target; let skip_node = if let Some(elem) = &mut dist[$node.node_counter as usize] { let was_processed = elem.was_processed; elem.was_processed = true; fee_to_target_msat = elem.total_fee_msat; next_hops_path_htlc_minimum_msat = elem.path_htlc_minimum_msat; next_hops_path_penalty_msat = elem.path_penalty_msat; + is_first_hop_target = elem.is_first_hop_target; was_processed } else { // Entries are added to dist in add_entry!() when there is a channel from a node. @@ -2695,21 +2707,24 @@ where L::Target: Logger { fee_to_target_msat = 0; next_hops_path_htlc_minimum_msat = 0; next_hops_path_penalty_msat = 0; + is_first_hop_target = false; false }; if !skip_node { - if let Some((first_channels, peer_node_counter)) = first_hop_targets.get(&$node_id) { - for details in first_channels { - debug_assert_eq!(*peer_node_counter, $node.node_counter); - let candidate = CandidateRouteHop::FirstHop(FirstHopCandidate { - details, payer_node_id: &our_node_id, payer_node_counter, - target_node_counter: $node.node_counter, - }); - add_entry!(&candidate, fee_to_target_msat, - $next_hops_value_contribution, - next_hops_path_htlc_minimum_msat, next_hops_path_penalty_msat, - $next_hops_cltv_delta, $next_hops_path_length); + if is_first_hop_target { + if let Some((first_channels, peer_node_counter)) = first_hop_targets.get(&$node_id) { + for details in first_channels { + debug_assert_eq!(*peer_node_counter, $node.node_counter); + let candidate = CandidateRouteHop::FirstHop(FirstHopCandidate { + details, payer_node_id: &our_node_id, payer_node_counter, + target_node_counter: $node.node_counter, + }); + add_entry!(&candidate, fee_to_target_msat, + $next_hops_value_contribution, + next_hops_path_htlc_minimum_msat, next_hops_path_penalty_msat, + $next_hops_cltv_delta, $next_hops_path_length); + } } } @@ -2756,6 +2771,32 @@ where L::Target: Logger { for e in dist.iter_mut() { *e = None; } + for (_, (chans, peer_node_counter)) in first_hop_targets.iter() { + // In order to avoid looking up whether each node is a first-hop target, we store a + // dummy entry in dist for each first-hop target, allowing us to do this lookup for + // free since we're already looking at the `was_processed` flag. + // + // Note that all the fields (except `is_first_hop_target`) will be overwritten whenever + // we find a path to the target, so are left as dummies here. + dist[*peer_node_counter as usize] = Some(PathBuildingHop { + candidate: CandidateRouteHop::FirstHop(FirstHopCandidate { + details: &chans[0], + payer_node_id: &our_node_id, + target_node_counter: u32::max_value(), + payer_node_counter: u32::max_value(), + }), + fee_msat: 0, + next_hops_fee_msat: u64::max_value(), + hop_use_fee_msat: u64::max_value(), + total_fee_msat: u64::max_value(), + path_htlc_minimum_msat: u64::max_value(), + path_penalty_msat: u64::max_value(), + was_processed: false, + is_first_hop_target: true, + #[cfg(all(not(ldk_bench), any(test, fuzzing)))] + value_contribution_msat: 0, + }); + } hit_minimum_limit = false; // If first hop is a private channel and the only way to reach the payee, this is the only From 3e902401e0f4e56d30c2b537ca53cf9392cea50c Mon Sep 17 00:00:00 2001 From: Matt Corallo Date: Fri, 8 Dec 2023 01:49:08 +0000 Subject: [PATCH 2/6] DRY redundant calls to `$candidate.htlc_minimum_msat()` in routing While LLVM should inline and elide the redundant calls, because the router is rather large LLVM can decide against inlining in some cases where it would be an nice win. Thus, its worth DRY'ing the redundant calls explicitly. --- lightning/src/routing/router.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lightning/src/routing/router.rs b/lightning/src/routing/router.rs index 66b2d87e535..bbf73e53106 100644 --- a/lightning/src/routing/router.rs +++ b/lightning/src/routing/router.rs @@ -2425,14 +2425,15 @@ where L::Target: Logger { // Can't overflow due to how the values were computed right above. None => unreachable!(), }; + let htlc_minimum_msat = $candidate.htlc_minimum_msat(); #[allow(unused_comparisons)] // $next_hops_path_htlc_minimum_msat is 0 in some calls so rustc complains - let over_path_minimum_msat = amount_to_transfer_over_msat >= $candidate.htlc_minimum_msat() && + let over_path_minimum_msat = amount_to_transfer_over_msat >= htlc_minimum_msat && amount_to_transfer_over_msat >= $next_hops_path_htlc_minimum_msat; #[allow(unused_comparisons)] // $next_hops_path_htlc_minimum_msat is 0 in some calls so rustc complains let may_overpay_to_meet_path_minimum_msat = - ((amount_to_transfer_over_msat < $candidate.htlc_minimum_msat() && - recommended_value_msat >= $candidate.htlc_minimum_msat()) || + ((amount_to_transfer_over_msat < htlc_minimum_msat && + recommended_value_msat >= htlc_minimum_msat) || (amount_to_transfer_over_msat < $next_hops_path_htlc_minimum_msat && recommended_value_msat >= $next_hops_path_htlc_minimum_msat)); From 98f9e8bbf80c5f7d3bb37da92aef0f12d57c075f Mon Sep 17 00:00:00 2001 From: Matt Corallo Date: Fri, 8 Dec 2023 01:49:20 +0000 Subject: [PATCH 3/6] Store source/target `node_counter`s in `DirectionalChannelInfo` Because we now have some slack space in `PathBuildingHop`, we can use it to cache some additional hot values. Here we use it to cache the source and target `node_counter`s for public channels, effectively prefetching the values from the channel state. --- lightning/src/routing/gossip.rs | 17 ++++++++++++----- lightning/src/routing/router.rs | 2 +- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/lightning/src/routing/gossip.rs b/lightning/src/routing/gossip.rs index 04aa174f362..b0117cd68d3 100644 --- a/lightning/src/routing/gossip.rs +++ b/lightning/src/routing/gossip.rs @@ -1053,6 +1053,8 @@ impl Readable for ChannelInfo { pub struct DirectedChannelInfo<'a> { channel: &'a ChannelInfo, direction: &'a ChannelUpdateInfo, + source_counter: u32, + target_counter: u32, /// The direction this channel is in - if set, it indicates that we're traversing the channel /// from [`ChannelInfo::node_one`] to [`ChannelInfo::node_two`]. from_node_one: bool, @@ -1061,7 +1063,12 @@ pub struct DirectedChannelInfo<'a> { impl<'a> DirectedChannelInfo<'a> { #[inline] fn new(channel: &'a ChannelInfo, direction: &'a ChannelUpdateInfo, from_node_one: bool) -> Self { - Self { channel, direction, from_node_one } + let (source_counter, target_counter) = if from_node_one { + (channel.node_one_counter, channel.node_two_counter) + } else { + (channel.node_two_counter, channel.node_one_counter) + }; + Self { channel, direction, from_node_one, source_counter, target_counter } } /// Returns information for the channel. @@ -1104,12 +1111,12 @@ impl<'a> DirectedChannelInfo<'a> { pub fn target(&self) -> &'a NodeId { if self.from_node_one { &self.channel.node_two } else { &self.channel.node_one } } /// Returns the source node's counter - #[inline] - pub(super) fn source_counter(&self) -> u32 { if self.from_node_one { self.channel.node_one_counter } else { self.channel.node_two_counter } } + #[inline(always)] + pub(super) fn source_counter(&self) -> u32 { self.source_counter } /// Returns the target node's counter - #[inline] - pub(super) fn target_counter(&self) -> u32 { if self.from_node_one { self.channel.node_two_counter } else { self.channel.node_one_counter } } + #[inline(always)] + pub(super) fn target_counter(&self) -> u32 { self.target_counter } } impl<'a> fmt::Debug for DirectedChannelInfo<'a> { diff --git a/lightning/src/routing/router.rs b/lightning/src/routing/router.rs index bbf73e53106..58486c3da0f 100644 --- a/lightning/src/routing/router.rs +++ b/lightning/src/routing/router.rs @@ -1437,7 +1437,7 @@ impl<'a> CandidateRouteHop<'a> { } } - #[inline] + #[inline(always)] fn src_node_counter(&self) -> u32 { match self { CandidateRouteHop::FirstHop(hop) => hop.payer_node_counter, From 9566c272c8f2109aeee6628d74dc5b3eecb427e1 Mon Sep 17 00:00:00 2001 From: Matt Corallo Date: Fri, 8 Dec 2023 05:44:32 +0000 Subject: [PATCH 4/6] Somewhat optimize the generic `Features::requires_unknown_bits` It turns out we spend several percent of our routefinding time just checking if nodes and channels require unknown features byte-by-byte. While the cost is almost certainly dominated by the memory read latency, avoiding doing the checks byte-by-byte should reduce the branch count slightly, which may reduce the overhead. --- lightning/src/ln/features.rs | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/lightning/src/ln/features.rs b/lightning/src/ln/features.rs index 51c608c1a6b..79bf871de72 100644 --- a/lightning/src/ln/features.rs +++ b/lightning/src/ln/features.rs @@ -801,15 +801,23 @@ impl Features { pub fn requires_unknown_bits(&self) -> bool { // Bitwise AND-ing with all even bits set except for known features will select required // unknown features. - let byte_count = T::KNOWN_FEATURE_MASK.len(); - self.flags.iter().enumerate().any(|(i, &byte)| { - let unknown_features = if i < byte_count { - !T::KNOWN_FEATURE_MASK[i] - } else { - 0b11_11_11_11 - }; - (byte & (ANY_REQUIRED_FEATURES_MASK & unknown_features)) != 0 - }) + let mut known_chunks = T::KNOWN_FEATURE_MASK.chunks(8); + for chunk in self.flags.chunks(8) { + let mut flag_bytes = [0; 8]; + flag_bytes[..chunk.len()].copy_from_slice(&chunk); + let flag_int = u64::from_le_bytes(flag_bytes); + + let known_chunk = known_chunks.next().unwrap_or(&[0; 0]); + let mut known_bytes = [0; 8]; + known_bytes[..known_chunk.len()].copy_from_slice(&known_chunk); + let known_int = u64::from_le_bytes(known_bytes); + + const REQ_MASK: u64 = u64::from_le_bytes([ANY_REQUIRED_FEATURES_MASK; 8]); + if flag_int & (REQ_MASK & !known_int) != 0 { + return true; + } + } + false } pub(crate) fn supports_unknown_bits(&self) -> bool { From bed1fb073be98b8a0f68bfb12701506d3b4973ae Mon Sep 17 00:00:00 2001 From: Matt Corallo Date: Fri, 8 Dec 2023 19:43:17 +0000 Subject: [PATCH 5/6] Consolidate `candidate` access in `add_entry` during routing Because fetching fields from the `$candidate` often implies an indirect read, grouping them together may result in one or two fewer memory loads, so we do so here. --- lightning/src/routing/router.rs | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/lightning/src/routing/router.rs b/lightning/src/routing/router.rs index 58486c3da0f..28a991a3290 100644 --- a/lightning/src/routing/router.rs +++ b/lightning/src/routing/router.rs @@ -2392,6 +2392,8 @@ where L::Target: Logger { // if the amount being transferred over this path is lower. // We do this for now, but this is a subject for removal. if let Some(mut available_value_contribution_msat) = htlc_maximum_msat.checked_sub($next_hops_fee_msat) { + let cltv_expiry_delta = $candidate.cltv_expiry_delta(); + let htlc_minimum_msat = $candidate.htlc_minimum_msat(); let used_liquidity_msat = used_liquidities .get(&$candidate.id()) .map_or(0, |used_liquidity_msat| { @@ -2415,7 +2417,7 @@ where L::Target: Logger { .checked_sub(2*MEDIAN_HOP_CLTV_EXPIRY_DELTA) .unwrap_or(payment_params.max_total_cltv_expiry_delta - final_cltv_expiry_delta); let hop_total_cltv_delta = ($next_hops_cltv_delta as u32) - .saturating_add($candidate.cltv_expiry_delta()); + .saturating_add(cltv_expiry_delta); let exceeds_cltv_delta_limit = hop_total_cltv_delta > max_total_cltv_expiry_delta; let value_contribution_msat = cmp::min(available_value_contribution_msat, $next_hops_value_contribution); @@ -2425,7 +2427,6 @@ where L::Target: Logger { // Can't overflow due to how the values were computed right above. None => unreachable!(), }; - let htlc_minimum_msat = $candidate.htlc_minimum_msat(); #[allow(unused_comparisons)] // $next_hops_path_htlc_minimum_msat is 0 in some calls so rustc complains let over_path_minimum_msat = amount_to_transfer_over_msat >= htlc_minimum_msat && amount_to_transfer_over_msat >= $next_hops_path_htlc_minimum_msat; @@ -2503,12 +2504,14 @@ where L::Target: Logger { // payment path (upstream to the payee). To avoid that, we recompute // path fees knowing the final path contribution after constructing it. let curr_min = cmp::max( - $next_hops_path_htlc_minimum_msat, $candidate.htlc_minimum_msat() + $next_hops_path_htlc_minimum_msat, htlc_minimum_msat ); - let path_htlc_minimum_msat = compute_fees_saturating(curr_min, $candidate.fees()) + let candidate_fees = $candidate.fees(); + let src_node_counter = $candidate.src_node_counter(); + let path_htlc_minimum_msat = compute_fees_saturating(curr_min, candidate_fees) .saturating_add(curr_min); - let dist_entry = &mut dist[$candidate.src_node_counter() as usize]; + let dist_entry = &mut dist[src_node_counter as usize]; let old_entry = if let Some(hop) = dist_entry { hop } else { @@ -2551,7 +2554,7 @@ where L::Target: Logger { if src_node_id != our_node_id { // Note that `u64::max_value` means we'll always fail the // `old_entry.total_fee_msat > total_fee_msat` check below - hop_use_fee_msat = compute_fees_saturating(amount_to_transfer_over_msat, $candidate.fees()); + hop_use_fee_msat = compute_fees_saturating(amount_to_transfer_over_msat, candidate_fees); total_fee_msat = total_fee_msat.saturating_add(hop_use_fee_msat); } From f689e01b358dfa111596721a9a84f808133cd4f9 Mon Sep 17 00:00:00 2001 From: Matt Corallo Date: Sat, 9 Dec 2023 21:22:52 +0000 Subject: [PATCH 6/6] Layout channel info to ensure routing uses cache lines well Because we scan per-channel information in the hot inner loop of our routefinding immediately after looking a channel up in a `HashMap`, we end up spending a nontrivial portion of our routefinding time waiting on memory to be read in. While there is only so much we can do about that, ensuring the channel information that we care about is sitting on one or adjacent cache lines avoids paying that penalty twice. Thus, here we manually lay out `ChannelInfo` and `ChannelUpdateInfo` and set them to 128b and 32b alignment, respectively. This wastes some space in memory in our network graph, but improves routing performance in return. --- lightning/src/routing/gossip.rs | 61 ++++++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 16 deletions(-) diff --git a/lightning/src/routing/gossip.rs b/lightning/src/routing/gossip.rs index b0117cd68d3..358bf7095b2 100644 --- a/lightning/src/routing/gossip.rs +++ b/lightning/src/routing/gossip.rs @@ -795,22 +795,32 @@ where } } +// Fetching values from this struct is very performance sensitive during routefinding. Thus, we +// want to ensure that all of the fields we care about (all of them except `last_update_message`) +// sit on the same cache line. +// +// We do this by using `repr(C)`, which forces the struct to be laid out in memory the way we write +// it (ensuring `last_update_message` hangs off the end and no fields are reordered after it), and +// `align(32)`, ensuring the struct starts either at the start, or in the middle, of a 64b x86-64 +// cache line. This ensures the beginning fields (which are 31 bytes) all sit in the same cache +// line. +#[repr(C, align(32))] #[derive(Clone, Debug, PartialEq, Eq)] /// Details about one direction of a channel as received within a [`ChannelUpdate`]. pub struct ChannelUpdateInfo { - /// When the last update to the channel direction was issued. - /// Value is opaque, as set in the announcement. - pub last_update: u32, - /// Whether the channel can be currently used for payments (in this one direction). - pub enabled: bool, - /// The difference in CLTV values that you must have when routing through this channel. - pub cltv_expiry_delta: u16, /// The minimum value, which must be relayed to the next hop via the channel pub htlc_minimum_msat: u64, /// The maximum value which may be relayed to the next hop via the channel. pub htlc_maximum_msat: u64, /// Fees charged when the channel is used for routing pub fees: RoutingFees, + /// When the last update to the channel direction was issued. + /// Value is opaque, as set in the announcement. + pub last_update: u32, + /// The difference in CLTV values that you must have when routing through this channel. + pub cltv_expiry_delta: u16, + /// Whether the channel can be currently used for payments (in this one direction). + pub enabled: bool, /// Most recent update for the channel received from the network /// Mostly redundant with the data we store in fields explicitly. /// Everything else is useful only for sending out for initial routing sync. @@ -878,22 +888,46 @@ impl Readable for ChannelUpdateInfo { } } +// Fetching values from this struct is very performance sensitive during routefinding. Thus, we +// want to ensure that all of the fields we care about (all of them except `last_update_message` +// and `announcement_received_time`) sit on the same cache line. +// +// Sadly, this is not possible, however we can still do okay - all of the fields before +// `one_to_two` and `two_to_one` are just under 128 bytes long, so we can ensure they sit on +// adjacent cache lines (which are generally fetched together in x86_64 processors). +// +// This leaves only the two directional channel info structs on separate cache lines. +// +// We accomplish this using `repr(C)`, which forces the struct to be laid out in memory the way we +// write it (ensuring the fields we care about are at the start of the struct) and `align(128)`, +// ensuring the struct starts at the beginning of two adjacent 64b x86-64 cache lines. +#[repr(align(128), C)] #[derive(Clone, Debug, Eq)] /// Details about a channel (both directions). /// Received within a channel announcement. pub struct ChannelInfo { /// Protocol features of a channel communicated during its announcement pub features: ChannelFeatures, + /// Source node of the first direction of a channel pub node_one: NodeId, - /// Details about the first direction of a channel - pub one_to_two: Option, + /// Source node of the second direction of a channel pub node_two: NodeId, - /// Details about the second direction of a channel - pub two_to_one: Option, + + /// The [`NodeInfo::node_counter`] of the node pointed to by [`Self::node_one`]. + pub(crate) node_one_counter: u32, + /// The [`NodeInfo::node_counter`] of the node pointed to by [`Self::node_two`]. + pub(crate) node_two_counter: u32, + /// The channel capacity as seen on-chain, if chain lookup is available. pub capacity_sats: Option, + + /// Details about the first direction of a channel + pub one_to_two: Option, + /// Details about the second direction of a channel + pub two_to_one: Option, + /// An initial announcement of the channel /// Mostly redundant with the data we store in fields explicitly. /// Everything else is useful only for sending out for initial routing sync. @@ -903,11 +937,6 @@ pub struct ChannelInfo { /// (which we can probably assume we are - no-std environments probably won't have a full /// network graph in memory!). announcement_received_time: u64, - - /// The [`NodeInfo::node_counter`] of the node pointed to by [`Self::node_one`]. - pub(crate) node_one_counter: u32, - /// The [`NodeInfo::node_counter`] of the node pointed to by [`Self::node_two`]. - pub(crate) node_two_counter: u32, } impl PartialEq for ChannelInfo {