From f469fbfb0a44c4e223488b07ec641ca02b2fb8f1 Mon Sep 17 00:00:00 2001
From: Andrei Sandu <54316454+sandreim@users.noreply.github.com>
Date: Fri, 24 May 2024 17:14:44 +0300
Subject: [PATCH] availability-recovery: bump chunk fetch threshold to 1MB for Polkadot and 4MB for Kusama + testnets (#4399)

This change ensures that we minimize the CPU time spent in reed-solomon by
only doing the re-encoding into chunks if the PoV size is less than 4MB
(which currently means all PoVs).

Based on subsystem benchmark results we concluded that it is safe to bump
this number higher. In the worst-case scenario the network pressure on a
backing group of 5 is around 25% of the network bandwidth in the hardware
specs. Assuming 6s block times (max_candidate_depth 3) and needed_approvals
30, the bandwidth used by a backing group would hover above
`30 * 4 * 3 = 360MB` per relay chain block. With a backing group of 5 that
gives 72MB per block per validator -> 12 MB/s.

<details>
<summary>Reality check on Kusama PoV sizes (click for chart)</summary>
<br>
<img width="697" alt="Screenshot 2024-05-07 at 14 30 38" src="https://github.com/paritytech/polkadot-sdk/assets/54316454/bfed32d4-8623-48b0-9ec0-8b95dd2a9d8c">
</details>

---------

Signed-off-by: Andrei Sandu <andrei-mihail@parity.io>
---
 .../network/availability-recovery/src/lib.rs | 22 +++++++++++++------
 .../availability-recovery/src/tests.rs       |  6 +++--
 polkadot/node/service/src/lib.rs             |  7 ++++++
 polkadot/node/service/src/overseer.rs        |  7 ++++++
 4 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/polkadot/node/network/availability-recovery/src/lib.rs b/polkadot/node/network/availability-recovery/src/lib.rs
index 94b9d9546cd..b836870cd8a 100644
--- a/polkadot/node/network/availability-recovery/src/lib.rs
+++ b/polkadot/node/network/availability-recovery/src/lib.rs
@@ -77,8 +77,10 @@ const LRU_SIZE: u32 = 16;
 
 const COST_INVALID_REQUEST: Rep = Rep::CostMajor("Peer sent unparsable request");
 
-/// PoV size limit in bytes for which prefer fetching from backers.
-const SMALL_POV_LIMIT: usize = 128 * 1024;
+/// PoV size limit in bytes below which we prefer fetching from backers. (conservative, Polkadot for now)
+pub(crate) const CONSERVATIVE_FETCH_CHUNKS_THRESHOLD: usize = 1 * 1024 * 1024;
+/// PoV size limit in bytes below which we prefer fetching from backers. (Kusama and all testnets)
+pub const FETCH_CHUNKS_THRESHOLD: usize = 4 * 1024 * 1024;
 
 #[derive(Clone, PartialEq)]
 /// The strategy we use to recover the PoV.
@@ -448,7 +450,7 @@ async fn handle_recover<Context>(
 	if let Some(backing_validators) = session_info.validator_groups.get(backing_group) {
 		let mut small_pov_size = true;
 
-		if let RecoveryStrategyKind::BackersFirstIfSizeLower(small_pov_limit) =
+		if let RecoveryStrategyKind::BackersFirstIfSizeLower(fetch_chunks_threshold) =
 			recovery_strategy_kind
 		{
 			// Get our own chunk size to get an estimate of the PoV size.
@@ -457,13 +459,13 @@ async fn handle_recover<Context>(
 			if let Ok(Some(chunk_size)) = chunk_size {
 				let pov_size_estimate =
 					chunk_size.saturating_mul(session_info.validators.len()) / 3;
-				small_pov_size = pov_size_estimate < small_pov_limit;
+				small_pov_size = pov_size_estimate < fetch_chunks_threshold;
 
 				gum::trace!(
 					target: LOG_TARGET,
 					?candidate_hash,
 					pov_size_estimate,
-					small_pov_limit,
+					fetch_chunks_threshold,
 					enabled = small_pov_size,
 					"Prefer fetch from backing group",
 				);
@@ -547,11 +549,14 @@ impl AvailabilityRecoverySubsystem {
 	/// which never requests the `AvailabilityStoreSubsystem` subsystem and only checks the POV hash
 	/// instead of reencoding the available data.
 	pub fn for_collator(
+		fetch_chunks_threshold: Option<usize>,
 		req_receiver: IncomingRequestReceiver<request_v1::AvailableDataFetchingRequest>,
 		metrics: Metrics,
 	) -> Self {
 		Self {
-			recovery_strategy_kind: RecoveryStrategyKind::BackersFirstIfSizeLower(SMALL_POV_LIMIT),
+			recovery_strategy_kind: RecoveryStrategyKind::BackersFirstIfSizeLower(
+				fetch_chunks_threshold.unwrap_or(CONSERVATIVE_FETCH_CHUNKS_THRESHOLD),
+			),
 			bypass_availability_store: true,
 			post_recovery_check: PostRecoveryCheck::PovHash,
 			req_receiver,
@@ -591,11 +596,14 @@ impl AvailabilityRecoverySubsystem {
 	/// Create a new instance of `AvailabilityRecoverySubsystem` which requests chunks if PoV is
 	/// above a threshold.
 	pub fn with_chunks_if_pov_large(
+		fetch_chunks_threshold: Option<usize>,
 		req_receiver: IncomingRequestReceiver<request_v1::AvailableDataFetchingRequest>,
 		metrics: Metrics,
 	) -> Self {
 		Self {
-			recovery_strategy_kind: RecoveryStrategyKind::BackersFirstIfSizeLower(SMALL_POV_LIMIT),
+			recovery_strategy_kind: RecoveryStrategyKind::BackersFirstIfSizeLower(
+				fetch_chunks_threshold.unwrap_or(CONSERVATIVE_FETCH_CHUNKS_THRESHOLD),
+			),
 			bypass_availability_store: false,
 			post_recovery_check: PostRecoveryCheck::Reencode,
 			req_receiver,
diff --git a/polkadot/node/network/availability-recovery/src/tests.rs b/polkadot/node/network/availability-recovery/src/tests.rs
index 909f6a25f46..6049a5a5c3a 100644
--- a/polkadot/node/network/availability-recovery/src/tests.rs
+++ b/polkadot/node/network/availability-recovery/src/tests.rs
@@ -906,6 +906,7 @@ fn recovers_from_only_chunks_if_pov_large() {
 	let test_state = TestState::default();
 	let req_protocol_names = ReqProtocolNames::new(&GENESIS_HASH, None);
 	let subsystem = AvailabilityRecoverySubsystem::with_chunks_if_pov_large(
+		Some(FETCH_CHUNKS_THRESHOLD),
 		request_receiver(&req_protocol_names),
 		Metrics::new_dummy(),
 	);
@@ -942,7 +943,7 @@ fn recovers_from_only_chunks_if_pov_large() {
 				AllMessages::AvailabilityStore(
 					AvailabilityStoreMessage::QueryChunkSize(_, tx)
 				) => {
-					let _ = tx.send(Some(1000000));
+					let _ = tx.send(Some(crate::FETCH_CHUNKS_THRESHOLD + 1));
 				}
 			);
 
@@ -987,7 +988,7 @@ fn recovers_from_only_chunks_if_pov_large() {
 				AllMessages::AvailabilityStore(
 					AvailabilityStoreMessage::QueryChunkSize(_, tx)
 				) => {
-					let _ = tx.send(Some(1000000));
+					let _ = tx.send(Some(crate::FETCH_CHUNKS_THRESHOLD + 1));
 				}
 			);
 
@@ -1015,6 +1016,7 @@ fn fast_path_backing_group_recovers_if_pov_small() {
 	let test_state = TestState::default();
 	let req_protocol_names = ReqProtocolNames::new(&GENESIS_HASH, None);
 	let subsystem = AvailabilityRecoverySubsystem::with_chunks_if_pov_large(
+		Some(FETCH_CHUNKS_THRESHOLD),
 		request_receiver(&req_protocol_names),
 		Metrics::new_dummy(),
 	);
diff --git a/polkadot/node/service/src/lib.rs b/polkadot/node/service/src/lib.rs
index 665533e9bc7..6d365b93ac7 100644
--- a/polkadot/node/service/src/lib.rs
+++ b/polkadot/node/service/src/lib.rs
@@ -750,6 +750,7 @@ pub fn new_full<
 		prepare_workers_hard_max_num,
 	}: NewFullParams<OverseerGenerator>,
 ) -> Result<NewFull, Error> {
+	use polkadot_availability_recovery::FETCH_CHUNKS_THRESHOLD;
 	use polkadot_node_network_protocol::request_response::IncomingRequest;
 	use sc_network_sync::WarpSyncParams;
 
@@ -988,6 +989,11 @@ pub fn new_full<
 			stagnant_check_interval: Default::default(),
 			stagnant_check_mode: chain_selection_subsystem::StagnantCheckMode::PruneOnly,
 		};
+
+		// Kusama + testnets get a higher threshold; we are conservative on Polkadot for now.
+		let fetch_chunks_threshold =
+			if config.chain_spec.is_polkadot() { None } else { Some(FETCH_CHUNKS_THRESHOLD) };
+
 		Some(ExtendedOverseerGenArgs {
 			keystore,
 			parachains_db,
@@ -1001,6 +1007,7 @@ pub fn new_full<
 			dispute_req_receiver,
 			dispute_coordinator_config,
 			chain_selection_config,
+			fetch_chunks_threshold,
 		})
 	};
 
diff --git a/polkadot/node/service/src/overseer.rs b/polkadot/node/service/src/overseer.rs
index 4b7777a0967..175a77e1c5f 100644
--- a/polkadot/node/service/src/overseer.rs
+++ b/polkadot/node/service/src/overseer.rs
@@ -133,6 +133,10 @@ pub struct ExtendedOverseerGenArgs {
 	pub dispute_coordinator_config: DisputeCoordinatorConfig,
 	/// Configuration for the chain selection subsystem.
 	pub chain_selection_config: ChainSelectionConfig,
+	/// Optional availability recovery fetch chunks threshold. If the PoV size is lower
+	/// than this value, we always try to recover the available data from backers first.
+	/// This parameter is present here so that each chain can use a different value.
+	pub fetch_chunks_threshold: Option<usize>,
 }
 
 /// Obtain a prepared validator `Overseer`, that is initialized with all default values.
@@ -166,6 +170,7 @@ pub fn validator_overseer_builder<Spawner, RuntimeClient>(
 		dispute_req_receiver,
 		dispute_coordinator_config,
 		chain_selection_config,
+		fetch_chunks_threshold,
 	}: ExtendedOverseerGenArgs,
 ) -> Result<
 	InitializedOverseerBuilder<
@@ -240,6 +245,7 @@ where
 			Metrics::register(registry)?,
 		))
 		.availability_recovery(AvailabilityRecoverySubsystem::with_chunks_if_pov_large(
+			fetch_chunks_threshold,
 			available_data_req_receiver,
 			Metrics::register(registry)?,
 		))
@@ -421,6 +427,7 @@ where
 		))
 		.availability_distribution(DummySubsystem)
 		.availability_recovery(AvailabilityRecoverySubsystem::for_collator(
+			None,
 			available_data_req_receiver,
 			Metrics::register(registry)?,
 		))
-- 
GitLab
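For readers who want to sanity-check the decision logic this patch wires together across the subsystem and service crates, here is a minimal, self-contained Rust sketch. The two constants and the `chunk_size * validators_len / 3` estimate are taken from the diff above; `fetch_chunks_threshold_for_chain`, `prefer_backers`, `is_polkadot`, `chunk_size`, and `validators_len` are hypothetical stand-ins for the subsystem's real inputs (the session validator count and the locally stored chunk size), not identifiers from the codebase.

```rust
/// Conservative threshold (Polkadot for now): 1MB, resolved via `unwrap_or` below.
const CONSERVATIVE_FETCH_CHUNKS_THRESHOLD: usize = 1024 * 1024;
/// Higher threshold (Kusama and all testnets): 4MB.
const FETCH_CHUNKS_THRESHOLD: usize = 4 * 1024 * 1024;

/// Mirrors the per-chain selection in `new_full`: `None` means
/// "fall back to the conservative default".
fn fetch_chunks_threshold_for_chain(is_polkadot: bool) -> Option<usize> {
    if is_polkadot {
        None
    } else {
        Some(FETCH_CHUNKS_THRESHOLD)
    }
}

/// Mirrors the estimate in `handle_recover`: reed-solomon needs roughly a
/// third of the `validators_len` chunks to reconstruct the data, so
/// `chunk_size * validators_len / 3` approximates the full PoV size.
fn prefer_backers(chunk_size: usize, validators_len: usize, threshold: Option<usize>) -> bool {
    let pov_size_estimate = chunk_size.saturating_mul(validators_len) / 3;
    pov_size_estimate < threshold.unwrap_or(CONSERVATIVE_FETCH_CHUNKS_THRESHOLD)
}

fn main() {
    // 300 validators with 20KB chunks -> ~2MB PoV estimate: below the 4MB
    // Kusama threshold (fetch the whole PoV from backers), but above the
    // 1MB conservative Polkadot threshold (recover from chunks instead).
    let (chunk_size, validators_len) = (20 * 1024, 300);
    assert!(prefer_backers(chunk_size, validators_len, fetch_chunks_threshold_for_chain(false)));
    assert!(!prefer_backers(chunk_size, validators_len, fetch_chunks_threshold_for_chain(true)));
}
```

Note the design choice this sketch highlights: `None` deliberately means "use the conservative default". The collator path passes `None` unconditionally and the constructors resolve it with `unwrap_or`, so only chains that explicitly opt in (Kusama and the testnets) get the 4MB threshold.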