From 5ac32ee2bdd9df47e4578c51810db4d2139757eb Mon Sep 17 00:00:00 2001 From: Alexandru Vasile <60601340+lexnv@users.noreply.github.com> Date: Wed, 27 Mar 2024 11:49:10 +0200 Subject: [PATCH] authority-discovery: Set intervals to start when authority keys changes (#3764) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The authority-discovery mechanism has implemented a few exponential timers for: - publishing the authority records - goes from 2 seconds (when freshly booted) to 1 hour if the node is long-running - set to 1 hour after successfully publishing the authority record - discovering other authority records - goes from 2 seconds (when freshly booted) to 10 minutes if the node is long-running This PR resets the exponential publishing and discovery interval to defaults ensuring that long-running nodes: - will retry publishing the authority records as aggressively as freshly booted nodes - Currently, if a long-running node fails to publish the DHT record when the keys change (ie DhtEvent::ValuePutFailed), it will only retry after 1 hour - will rediscover other authorities faster (since there is a chance that other authority keys changed) The subp2p-explorer has difficulties discovering the authorities when the authority set changes in the first few hours. This might be entirely due to the recursive nature of the DHT and the needed time to propagate the records. However, there is a small chance that the authority publishing failed and is only retried in 1h. Let me know if this makes sense 🙠cc @paritytech/networking --------- Signed-off-by: Alexandru Vasile <alexandru.vasile@parity.io> Co-authored-by: Dmitry Markin <dmitry@markin.tech> --- .../authority-discovery/src/interval.rs | 20 +++++++++++-- .../client/authority-discovery/src/worker.rs | 28 +++++++++++++++++-- 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/substrate/client/authority-discovery/src/interval.rs b/substrate/client/authority-discovery/src/interval.rs index 23c7ce266e3..0eee0d159cd 100644 --- a/substrate/client/authority-discovery/src/interval.rs +++ b/substrate/client/authority-discovery/src/interval.rs @@ -28,6 +28,7 @@ use std::{ /// /// Doubles interval duration on each tick until the configured maximum is reached. pub struct ExpIncInterval { + start: Duration, max: Duration, next: Duration, delay: Delay, @@ -37,14 +38,29 @@ impl ExpIncInterval { /// Create a new [`ExpIncInterval`]. pub fn new(start: Duration, max: Duration) -> Self { let delay = Delay::new(start); - Self { max, next: start * 2, delay } + Self { start, max, next: start * 2, delay } } - /// Fast forward the exponentially increasing interval to the configured maximum. + /// Fast forward the exponentially increasing interval to the configured maximum, if not already + /// set. pub fn set_to_max(&mut self) { + if self.next == self.max { + return; + } + self.next = self.max; self.delay = Delay::new(self.next); } + + /// Rewind the exponentially increasing interval to the configured start, if not already set. + pub fn set_to_start(&mut self) { + if self.next == self.start * 2 { + return; + } + + self.next = self.start * 2; + self.delay = Delay::new(self.start); + } } impl Stream for ExpIncInterval { diff --git a/substrate/client/authority-discovery/src/worker.rs b/substrate/client/authority-discovery/src/worker.rs index b77f0241ec2..4ad7db5f7da 100644 --- a/substrate/client/authority-discovery/src/worker.rs +++ b/substrate/client/authority-discovery/src/worker.rs @@ -129,6 +129,9 @@ pub struct Worker<Client, Network, Block, DhtEventStream> { /// List of keys onto which addresses have been published at the latest publication. /// Used to check whether they have changed. latest_published_keys: HashSet<AuthorityId>, + /// List of the kademlia keys that have been published at the latest publication. + /// Used to associate DHT events with our published records. + latest_published_kad_keys: HashSet<KademliaKey>, /// Same value as in the configuration. publish_non_global_ips: bool, @@ -265,6 +268,7 @@ where publish_interval, publish_if_changed_interval, latest_published_keys: HashSet::new(), + latest_published_kad_keys: HashSet::new(), publish_non_global_ips: config.publish_non_global_ips, public_addresses, strict_record_validation: config.strict_record_validation, @@ -397,8 +401,17 @@ where self.client.as_ref(), ).await?.into_iter().collect::<HashSet<_>>(); - if only_if_changed && keys == self.latest_published_keys { - return Ok(()) + if only_if_changed { + // If the authority keys did not change and the `publish_if_changed_interval` was + // triggered then do nothing. + if keys == self.latest_published_keys { + return Ok(()) + } + + // We have detected a change in the authority keys, reset the timers to + // publish and gather data faster. + self.publish_interval.set_to_start(); + self.query_interval.set_to_start(); } let addresses = serialize_addresses(self.addresses_to_publish()); @@ -422,6 +435,8 @@ where keys_vec, )?; + self.latest_published_kad_keys = kv_pairs.iter().map(|(k, _)| k.clone()).collect(); + for (key, value) in kv_pairs.into_iter() { self.network.put_value(key, value); } @@ -523,6 +538,10 @@ where } }, DhtEvent::ValuePut(hash) => { + if !self.latest_published_kad_keys.contains(&hash) { + return; + } + // Fast forward the exponentially increasing interval to the configured maximum. In // case this was the first successful address publishing there is no need for a // timely retry. @@ -535,6 +554,11 @@ where debug!(target: LOG_TARGET, "Successfully put hash '{:?}' on Dht.", hash) }, DhtEvent::ValuePutFailed(hash) => { + if !self.latest_published_kad_keys.contains(&hash) { + // Not a value we have published or received multiple times. + return; + } + if let Some(metrics) = &self.metrics { metrics.dht_event_received.with_label_values(&["value_put_failed"]).inc(); } -- GitLab