From 5ac32ee2bdd9df47e4578c51810db4d2139757eb Mon Sep 17 00:00:00 2001
From: Alexandru Vasile <60601340+lexnv@users.noreply.github.com>
Date: Wed, 27 Mar 2024 11:49:10 +0200
Subject: [PATCH] authority-discovery: Set intervals to start when authority
 keys changes (#3764)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The authority-discovery mechanism has implemented a few exponential
timers for:
- publishing the authority records
- goes from 2 seconds (when freshly booted) to 1 hour if the node is
long-running
  - set to 1 hour after successfully publishing the authority record
- discovering other authority records
- goes from 2 seconds (when freshly booted) to 10 minutes if the node is
long-running

This PR resets the exponential publishing and discovery interval to
defaults ensuring that long-running nodes:
- will retry publishing the authority records as aggressively as freshly
booted nodes
- Currently, if a long-running node fails to publish the DHT record when
the keys change (ie DhtEvent::ValuePutFailed), it will only retry after
1 hour
- will rediscover other authorities faster (since there is a chance that
other authority keys changed)

The subp2p-explorer has difficulties discovering the authorities when
the authority set changes in the first few hours. This might be entirely
due to the recursive nature of the DHT and the needed time to propagate
the records. However, there is a small chance that the authority
publishing failed and is only retried in 1h.

Let me know if this makes sense 🙏

cc @paritytech/networking

---------

Signed-off-by: Alexandru Vasile <alexandru.vasile@parity.io>
Co-authored-by: Dmitry Markin <dmitry@markin.tech>
---
 .../authority-discovery/src/interval.rs       | 20 +++++++++++--
 .../client/authority-discovery/src/worker.rs  | 28 +++++++++++++++++--
 2 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/substrate/client/authority-discovery/src/interval.rs b/substrate/client/authority-discovery/src/interval.rs
index 23c7ce266e3..0eee0d159cd 100644
--- a/substrate/client/authority-discovery/src/interval.rs
+++ b/substrate/client/authority-discovery/src/interval.rs
@@ -28,6 +28,7 @@ use std::{
 ///
 /// Doubles interval duration on each tick until the configured maximum is reached.
 pub struct ExpIncInterval {
+	start: Duration,
 	max: Duration,
 	next: Duration,
 	delay: Delay,
@@ -37,14 +38,29 @@ impl ExpIncInterval {
 	/// Create a new [`ExpIncInterval`].
 	pub fn new(start: Duration, max: Duration) -> Self {
 		let delay = Delay::new(start);
-		Self { max, next: start * 2, delay }
+		Self { start, max, next: start * 2, delay }
 	}
 
-	/// Fast forward the exponentially increasing interval to the configured maximum.
+	/// Fast forward the exponentially increasing interval to the configured maximum, if not already
+	/// set.
 	pub fn set_to_max(&mut self) {
+		if self.next == self.max {
+			return;
+		}
+
 		self.next = self.max;
 		self.delay = Delay::new(self.next);
 	}
+
+	/// Rewind the exponentially increasing interval to the configured start, if not already set.
+	pub fn set_to_start(&mut self) {
+		if self.next == self.start * 2 {
+			return;
+		}
+
+		self.next = self.start * 2;
+		self.delay = Delay::new(self.start);
+	}
 }
 
 impl Stream for ExpIncInterval {
diff --git a/substrate/client/authority-discovery/src/worker.rs b/substrate/client/authority-discovery/src/worker.rs
index b77f0241ec2..4ad7db5f7da 100644
--- a/substrate/client/authority-discovery/src/worker.rs
+++ b/substrate/client/authority-discovery/src/worker.rs
@@ -129,6 +129,9 @@ pub struct Worker<Client, Network, Block, DhtEventStream> {
 	/// List of keys onto which addresses have been published at the latest publication.
 	/// Used to check whether they have changed.
 	latest_published_keys: HashSet<AuthorityId>,
+	/// List of the kademlia keys that have been published at the latest publication.
+	/// Used to associate DHT events with our published records.
+	latest_published_kad_keys: HashSet<KademliaKey>,
 
 	/// Same value as in the configuration.
 	publish_non_global_ips: bool,
@@ -265,6 +268,7 @@ where
 			publish_interval,
 			publish_if_changed_interval,
 			latest_published_keys: HashSet::new(),
+			latest_published_kad_keys: HashSet::new(),
 			publish_non_global_ips: config.publish_non_global_ips,
 			public_addresses,
 			strict_record_validation: config.strict_record_validation,
@@ -397,8 +401,17 @@ where
 			self.client.as_ref(),
 		).await?.into_iter().collect::<HashSet<_>>();
 
-		if only_if_changed && keys == self.latest_published_keys {
-			return Ok(())
+		if only_if_changed {
+			// If the authority keys did not change and the `publish_if_changed_interval` was
+			// triggered then do nothing.
+			if keys == self.latest_published_keys {
+				return Ok(())
+			}
+
+			// We have detected a change in the authority keys, reset the timers to
+			// publish and gather data faster.
+			self.publish_interval.set_to_start();
+			self.query_interval.set_to_start();
 		}
 
 		let addresses = serialize_addresses(self.addresses_to_publish());
@@ -422,6 +435,8 @@ where
 			keys_vec,
 		)?;
 
+		self.latest_published_kad_keys = kv_pairs.iter().map(|(k, _)| k.clone()).collect();
+
 		for (key, value) in kv_pairs.into_iter() {
 			self.network.put_value(key, value);
 		}
@@ -523,6 +538,10 @@ where
 				}
 			},
 			DhtEvent::ValuePut(hash) => {
+				if !self.latest_published_kad_keys.contains(&hash) {
+					return;
+				}
+
 				// Fast forward the exponentially increasing interval to the configured maximum. In
 				// case this was the first successful address publishing there is no need for a
 				// timely retry.
@@ -535,6 +554,11 @@ where
 				debug!(target: LOG_TARGET, "Successfully put hash '{:?}' on Dht.", hash)
 			},
 			DhtEvent::ValuePutFailed(hash) => {
+				if !self.latest_published_kad_keys.contains(&hash) {
+					// Not a value we have published or received multiple times.
+					return;
+				}
+
 				if let Some(metrics) = &self.metrics {
 					metrics.dht_event_received.with_label_values(&["value_put_failed"]).inc();
 				}
-- 
GitLab