From da953454aa4b381c5b44ee6a32ff1c43e744390c Mon Sep 17 00:00:00 2001
From: Alexandru Gheorghe <49718502+alexggh@users.noreply.github.com>
Date: Mon, 9 Dec 2024 14:00:19 +0100
Subject: [PATCH] Fix `Possible bug: Vote import failed` after aggression is
 enabled (#6690)

After finality started lagging on Kusama around 2024-11-25 15:55:40,
validators occasionally started seeing this log when importing votes
covering more than one assignment.
```
Possible bug: Vote import failed
```

That happens because the assumption that assignments from the same
validator would have the same required routing doesn't hold once
aggression is enabled: you might receive the first assignment, modify
its routing in `enable_aggression`, and then receive the second
assignment and the vote covering both assignments. The routing for the
first and second assignment then wouldn't match, and we would fail to
import the vote.

From the logs I've seen, I don't think this is the reason the network
didn't fully recover until the failsafe kicked in, because the votes had
already been imported in approval-voting before this error.

---------

Signed-off-by: Alexandru Gheorghe <alexandru.gheorghe@parity.io>
---
 .../network/approval-distribution/src/lib.rs  | 19 ++----
 .../network/protocol/src/grid_topology.rs     | 60 +++++++++++++++++++
 prdoc/pr_6690.prdoc                           | 17 ++++++
 3 files changed, 83 insertions(+), 13 deletions(-)
 create mode 100644 prdoc/pr_6690.prdoc

diff --git a/polkadot/node/network/approval-distribution/src/lib.rs b/polkadot/node/network/approval-distribution/src/lib.rs
index 876cc59b9c2..d6bbb0dca83 100644
--- a/polkadot/node/network/approval-distribution/src/lib.rs
+++ b/polkadot/node/network/approval-distribution/src/lib.rs
@@ -163,8 +163,6 @@ enum ApprovalEntryError {
 	InvalidCandidateIndex,
 	DuplicateApproval,
 	UnknownAssignment,
-	#[allow(dead_code)]
-	AssignmentsFollowedDifferentPaths(RequiredRouting, RequiredRouting),
 }
 
 impl ApprovalEntry {
@@ -568,7 +566,7 @@ impl BlockEntry {
 		&mut self,
 		approval: IndirectSignedApprovalVoteV2,
 	) -> Result<(RequiredRouting, HashSet<PeerId>), ApprovalEntryError> {
-		let mut required_routing = None;
+		let mut required_routing: Option<RequiredRouting> = None;
 		let mut peers_randomly_routed_to = HashSet::new();
 
 		if self.candidates.len() < approval.candidate_indices.len() as usize {
@@ -595,16 +593,11 @@ impl BlockEntry {
 				peers_randomly_routed_to
 					.extend(approval_entry.routing_info().peers_randomly_routed.iter());
 
-				if let Some(required_routing) = required_routing {
-					if required_routing != approval_entry.routing_info().required_routing {
-						// This shouldn't happen since the required routing is computed based on the
-						// validator_index, so two assignments from the same validators will have
-						// the same required routing.
-						return Err(ApprovalEntryError::AssignmentsFollowedDifferentPaths(
-							required_routing,
-							approval_entry.routing_info().required_routing,
-						))
-					}
+				if let Some(current_required_routing) = required_routing {
+					required_routing = Some(
+						current_required_routing
+							.combine(approval_entry.routing_info().required_routing),
+					);
 				} else {
 					required_routing = Some(approval_entry.routing_info().required_routing)
 				}
diff --git a/polkadot/node/network/protocol/src/grid_topology.rs b/polkadot/node/network/protocol/src/grid_topology.rs
index 4dd7d29fc25..f4c1a07ba3c 100644
--- a/polkadot/node/network/protocol/src/grid_topology.rs
+++ b/polkadot/node/network/protocol/src/grid_topology.rs
@@ -575,6 +575,22 @@ impl RequiredRouting {
 			_ => false,
 		}
 	}
+
+	/// Combine two required routing sets into one that would cover both routing modes.
+	pub fn combine(self, other: Self) -> Self {
+		match (self, other) {
+			(RequiredRouting::All, _) | (_, RequiredRouting::All) => RequiredRouting::All,
+			(RequiredRouting::GridXY, _) | (_, RequiredRouting::GridXY) => RequiredRouting::GridXY,
+			(RequiredRouting::GridX, RequiredRouting::GridY) |
+			(RequiredRouting::GridY, RequiredRouting::GridX) => RequiredRouting::GridXY,
+			(RequiredRouting::GridX, RequiredRouting::GridX) => RequiredRouting::GridX,
+			(RequiredRouting::GridY, RequiredRouting::GridY) => RequiredRouting::GridY,
+			(RequiredRouting::None, RequiredRouting::PendingTopology) |
+			(RequiredRouting::PendingTopology, RequiredRouting::None) => RequiredRouting::PendingTopology,
+			(RequiredRouting::None, _) | (RequiredRouting::PendingTopology, _) => other,
+			(_, RequiredRouting::None) | (_, RequiredRouting::PendingTopology) => self,
+		}
+	}
 }
 
 #[cfg(test)]
@@ -587,6 +603,50 @@ mod tests {
 		rand_chacha::ChaCha12Rng::seed_from_u64(12345)
 	}
 
+	#[test]
+	fn test_required_routing_combine() {
+		assert_eq!(RequiredRouting::All.combine(RequiredRouting::None), RequiredRouting::All);
+		assert_eq!(RequiredRouting::All.combine(RequiredRouting::GridXY), RequiredRouting::All);
+		assert_eq!(RequiredRouting::GridXY.combine(RequiredRouting::All), RequiredRouting::All);
+		assert_eq!(RequiredRouting::None.combine(RequiredRouting::All), RequiredRouting::All);
+		assert_eq!(RequiredRouting::None.combine(RequiredRouting::None), RequiredRouting::None);
+		assert_eq!(
+			RequiredRouting::PendingTopology.combine(RequiredRouting::GridX),
+			RequiredRouting::GridX
+		);
+
+		assert_eq!(
+			RequiredRouting::GridX.combine(RequiredRouting::PendingTopology),
+			RequiredRouting::GridX
+		);
+		assert_eq!(RequiredRouting::GridX.combine(RequiredRouting::GridY), RequiredRouting::GridXY);
+		assert_eq!(RequiredRouting::GridY.combine(RequiredRouting::GridX), RequiredRouting::GridXY);
+		assert_eq!(
+			RequiredRouting::GridXY.combine(RequiredRouting::GridXY),
+			RequiredRouting::GridXY
+		);
+		assert_eq!(RequiredRouting::GridX.combine(RequiredRouting::GridX), RequiredRouting::GridX);
+		assert_eq!(RequiredRouting::GridY.combine(RequiredRouting::GridY), RequiredRouting::GridY);
+
+		assert_eq!(RequiredRouting::None.combine(RequiredRouting::GridY), RequiredRouting::GridY);
+		assert_eq!(RequiredRouting::None.combine(RequiredRouting::GridX), RequiredRouting::GridX);
+		assert_eq!(RequiredRouting::None.combine(RequiredRouting::GridXY), RequiredRouting::GridXY);
+
+		assert_eq!(RequiredRouting::GridY.combine(RequiredRouting::None), RequiredRouting::GridY);
+		assert_eq!(RequiredRouting::GridX.combine(RequiredRouting::None), RequiredRouting::GridX);
+		assert_eq!(RequiredRouting::GridXY.combine(RequiredRouting::None), RequiredRouting::GridXY);
+
+		assert_eq!(
+			RequiredRouting::PendingTopology.combine(RequiredRouting::None),
+			RequiredRouting::PendingTopology
+		);
+
+		assert_eq!(
+			RequiredRouting::None.combine(RequiredRouting::PendingTopology),
+			RequiredRouting::PendingTopology
+		);
+	}
+
 	#[test]
 	fn test_random_routing_sample() {
 		// This test is fragile as it relies on a specific ChaCha12Rng
diff --git a/prdoc/pr_6690.prdoc b/prdoc/pr_6690.prdoc
new file mode 100644
index 00000000000..0e4a2437ef9
--- /dev/null
+++ b/prdoc/pr_6690.prdoc
@@ -0,0 +1,17 @@
+# Schema: Polkadot SDK PRDoc Schema (prdoc) v1.0.0
+# See doc at https://raw.githubusercontent.com/paritytech/polkadot-sdk/master/prdoc/schema_user.json
+
+title: Fix Possible bug, Vote import failed after aggression is enabled
+
+doc:
+  - audience: Node Dev
+    description: |
+      Fix the appearance of `Possible bug: Vote import failed` after aggression is enabled. The log itself
+      is harmless because the approval gets imported anyway and aggression is able to distribute it;
+      nevertheless, it is something that can easily be fixed by picking the highest required routing possible.
+
+crates:
+  - name: polkadot-node-network-protocol
+    bump: minor
+  - name: polkadot-approval-distribution
+    bump: minor
-- 
GitLab