From 6f3caac0517278f0fdd614545c9ef620200bc02f Mon Sep 17 00:00:00 2001
From: Alexandru Gheorghe <49718502+alexggh@users.noreply.github.com>
Date: Fri, 8 Mar 2024 14:10:47 +0200
Subject: [PATCH] collator-protocol: Always stay connected to validators in
 backing group (#3544)

Looking at rococo-asset-hub
https://github.com/paritytech/polkadot-sdk/issues/3519 there seems to be
a lot of instances where collator did not advertise their collations,
while there are multiple problems there, one of it is that we are
connecting and disconnecting to our assigned validators every block,
because on reconnect_timeout every 4s we call connect_to_validators and
that will produce 0 validators when all went well, so set_reseverd_peers
called from validator discovery will disconnect all our peers.
More details here:
https://github.com/paritytech/polkadot-sdk/issues/3519#issuecomment-1972667343

Now, this shouldn't be a problem, but it stacks with an existing bug in
our network stack where if disconnect from a peer the peer might not
notice it, so it won't detect the reconnect either and it won't send us
the necessary view updates, so we won't advertise the collation to it
more details here:

https://github.com/paritytech/polkadot-sdk/issues/3519#issuecomment-1972958276

To avoid hitting this condition that often, let's keep the peers in the
reserved set for the entire duration we are allocated to a backing
group. Backing group sizes(1 rococo, 3 kusama, 5 polkadot) are really
small, so this shouldn't lead to that many connections. Additionally,
the validators would disconnect us any way if we don't advertise
anything for 4 blocks.

## TODO
- [x] More testing.
- [x] Confirm on rococo that this is improving the situation. (It
doesn't but just because other things are going wrong there).

---------

Signed-off-by: Alexandru Gheorghe <alexandru.gheorghe@parity.io>
---
 .../src/collator_side/validators_buffer.rs    | 38 +++++++++++++++----
 .../src/validator_side/mod.rs                 |  2 +-
 2 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/polkadot/node/network/collator-protocol/src/collator_side/validators_buffer.rs b/polkadot/node/network/collator-protocol/src/collator_side/validators_buffer.rs
index 5b88efc99d8..1533f2eda5a 100644
--- a/polkadot/node/network/collator-protocol/src/collator_side/validators_buffer.rs
+++ b/polkadot/node/network/collator-protocol/src/collator_side/validators_buffer.rs
@@ -90,8 +90,7 @@ impl ValidatorGroupsBuffer {
 		}
 	}
 
-	/// Returns discovery ids of validators we have at least one advertised-but-not-fetched
-	/// collation for.
+	/// Returns discovery ids of validators we are assigned to in this backing group window.
 	pub fn validators_to_connect(&self) -> Vec<AuthorityDiscoveryId> {
 		let validators_num = self.validators.len();
 		let bits = self
@@ -99,11 +98,22 @@ impl ValidatorGroupsBuffer {
 			.values()
 			.fold(bitvec![0; validators_num], |acc, next| acc | next);
 
-		self.validators
+		let mut should_be_connected: Vec<AuthorityDiscoveryId> = self
+			.validators
 			.iter()
 			.enumerate()
 			.filter_map(|(idx, authority_id)| bits[idx].then_some(authority_id.clone()))
-			.collect()
+			.collect();
+
+		if let Some(last_group) = self.group_infos.iter().last() {
+			for validator in self.validators.iter().rev().take(last_group.len) {
+				if !should_be_connected.contains(validator) {
+					should_be_connected.push(validator.clone());
+				}
+			}
+		}
+
+		should_be_connected
 	}
 
 	/// Note a new advertisement, marking that we want to be connected to validators
@@ -279,7 +289,7 @@ mod tests {
 		assert_eq!(buf.validators_to_connect(), validators[..2].to_vec());
 
 		buf.reset_validator_interest(hash_a, &validators[1]);
-		assert_eq!(buf.validators_to_connect(), vec![validators[0].clone()]);
+		assert_eq!(buf.validators_to_connect(), validators[0..2].to_vec());
 
 		buf.note_collation_advertised(hash_b, 0, GroupIndex(1), &validators[2..]);
 		assert_eq!(buf.validators_to_connect(), validators[2..].to_vec());
@@ -287,7 +297,11 @@ mod tests {
 		for validator in &validators[2..] {
 			buf.reset_validator_interest(hash_b, validator);
 		}
-		assert!(buf.validators_to_connect().is_empty());
+		let mut expected = validators[2..].to_vec();
+		expected.sort();
+		let mut result = buf.validators_to_connect();
+		result.sort();
+		assert_eq!(result, expected);
 	}
 
 	#[test]
@@ -320,10 +334,18 @@ mod tests {
 		}
 
 		buf.reset_validator_interest(hashes[1], &validators[0]);
-		assert_eq!(buf.validators_to_connect(), validators[..2].to_vec());
+		let mut expected: Vec<_> = validators[..4].iter().cloned().collect();
+		let mut result = buf.validators_to_connect();
+		expected.sort();
+		result.sort();
+		assert_eq!(result, expected);
 
 		buf.reset_validator_interest(hashes[0], &validators[0]);
-		assert_eq!(buf.validators_to_connect(), vec![validators[1].clone()]);
+		let mut expected: Vec<_> = validators[1..4].iter().cloned().collect();
+		expected.sort();
+		let mut result = buf.validators_to_connect();
+		result.sort();
+		assert_eq!(result, expected);
 
 		buf.note_collation_advertised(hashes[3], 0, GroupIndex(1), &validators[2..4]);
 		buf.note_collation_advertised(
diff --git a/polkadot/node/network/collator-protocol/src/validator_side/mod.rs b/polkadot/node/network/collator-protocol/src/validator_side/mod.rs
index 48ad3c711a6..a1b93fff348 100644
--- a/polkadot/node/network/collator-protocol/src/validator_side/mod.rs
+++ b/polkadot/node/network/collator-protocol/src/validator_side/mod.rs
@@ -1883,7 +1883,7 @@ async fn disconnect_inactive_peers(
 ) {
 	for (peer, peer_data) in peers {
 		if peer_data.is_inactive(&eviction_policy) {
-			gum::trace!(target: LOG_TARGET, "Disconnecting inactive peer");
+			gum::trace!(target: LOG_TARGET, ?peer, "Disconnecting inactive peer");
 			disconnect_peer(sender, *peer).await;
 		}
 	}
-- 
GitLab