From fc906d5d0fb7796ef54ba670101cf37b0aad6794 Mon Sep 17 00:00:00 2001
From: Alin Dima <alin@parity.io>
Date: Mon, 12 Aug 2024 16:56:00 +0300
Subject: [PATCH] fix av-distribution Jaeger spans mem leak (#5321)

Fixes https://github.com/paritytech/polkadot-sdk/issues/5258
---
 .../network/availability-distribution/src/lib.rs     | 12 ++++++------
 .../availability-distribution/src/requester/mod.rs   |  8 +++++---
 .../availability-distribution/src/requester/tests.rs |  4 ++--
 prdoc/pr_5321.prdoc                                  | 11 +++++++++++
 4 files changed, 24 insertions(+), 11 deletions(-)
 create mode 100644 prdoc/pr_5321.prdoc

diff --git a/polkadot/node/network/availability-distribution/src/lib.rs b/polkadot/node/network/availability-distribution/src/lib.rs
index ec2c01f99b0..d3185e0af80 100644
--- a/polkadot/node/network/availability-distribution/src/lib.rs
+++ b/polkadot/node/network/availability-distribution/src/lib.rs
@@ -25,7 +25,7 @@ use polkadot_node_subsystem::{
 	jaeger, messages::AvailabilityDistributionMessage, overseer, FromOrchestra, OverseerSignal,
 	SpawnedSubsystem, SubsystemError,
 };
-use polkadot_primitives::Hash;
+use polkadot_primitives::{BlockNumber, Hash};
 use std::collections::HashMap;
 
 /// Error and [`Result`] type for this subsystem.
@@ -104,7 +104,7 @@ impl AvailabilityDistributionSubsystem {
 	/// Start processing work as passed on from the Overseer.
 	async fn run<Context>(self, mut ctx: Context) -> std::result::Result<(), FatalError> {
 		let Self { mut runtime, recvs, metrics, req_protocol_names } = self;
-		let mut spans: HashMap<Hash, jaeger::PerLeafSpan> = HashMap::new();
+		let mut spans: HashMap<Hash, (BlockNumber, jaeger::PerLeafSpan)> = HashMap::new();
 
 		let IncomingRequestReceivers {
 			pov_req_receiver,
@@ -162,7 +162,7 @@ impl AvailabilityDistributionSubsystem {
 					};
 					let span =
 						jaeger::PerLeafSpan::new(cloned_leaf.span, "availability-distribution");
-					spans.insert(cloned_leaf.hash, span);
+					spans.insert(cloned_leaf.hash, (cloned_leaf.number, span));
 					log_error(
 						requester
 							.get_mut()
@@ -172,8 +172,8 @@ impl AvailabilityDistributionSubsystem {
 						&mut warn_freq,
 					)?;
 				},
-				FromOrchestra::Signal(OverseerSignal::BlockFinalized(hash, _)) => {
-					spans.remove(&hash);
+				FromOrchestra::Signal(OverseerSignal::BlockFinalized(_hash, finalized_number)) => {
+					spans.retain(|_hash, (block_number, _span)| *block_number > finalized_number);
 				},
 				FromOrchestra::Signal(OverseerSignal::Conclude) => return Ok(()),
 				FromOrchestra::Communication {
@@ -189,7 +189,7 @@ impl AvailabilityDistributionSubsystem {
 				} => {
 					let span = spans
 						.get(&relay_parent)
-						.map(|span| span.child("fetch-pov"))
+						.map(|(_, span)| span.child("fetch-pov"))
 						.unwrap_or_else(|| jaeger::Span::new(&relay_parent, "fetch-pov"))
 						.with_trace_id(candidate_hash)
 						.with_candidate(candidate_hash)
diff --git a/polkadot/node/network/availability-distribution/src/requester/mod.rs b/polkadot/node/network/availability-distribution/src/requester/mod.rs
index efbdceb43bd..0175161af70 100644
--- a/polkadot/node/network/availability-distribution/src/requester/mod.rs
+++ b/polkadot/node/network/availability-distribution/src/requester/mod.rs
@@ -39,7 +39,9 @@ use polkadot_node_subsystem_util::{
 	availability_chunks::availability_chunk_index,
 	runtime::{get_occupied_cores, RuntimeInfo},
 };
-use polkadot_primitives::{CandidateHash, CoreIndex, Hash, OccupiedCore, SessionIndex};
+use polkadot_primitives::{
+	BlockNumber, CandidateHash, CoreIndex, Hash, OccupiedCore, SessionIndex,
+};
 
 use super::{FatalError, Metrics, Result, LOG_TARGET};
 
@@ -112,14 +114,14 @@ impl Requester {
 		ctx: &mut Context,
 		runtime: &mut RuntimeInfo,
 		update: ActiveLeavesUpdate,
-		spans: &HashMap<Hash, jaeger::PerLeafSpan>,
+		spans: &HashMap<Hash, (BlockNumber, jaeger::PerLeafSpan)>,
 	) -> Result<()> {
 		gum::trace!(target: LOG_TARGET, ?update, "Update fetching heads");
 		let ActiveLeavesUpdate { activated, deactivated } = update;
 		if let Some(leaf) = activated {
 			let span = spans
 				.get(&leaf.hash)
-				.map(|span| span.child("update-fetching-heads"))
+				.map(|(_, span)| span.child("update-fetching-heads"))
 				.unwrap_or_else(|| jaeger::Span::new(&leaf.hash, "update-fetching-heads"))
 				.with_string_tag("leaf", format!("{:?}", leaf.hash))
 				.with_stage(jaeger::Stage::AvailabilityDistribution);
diff --git a/polkadot/node/network/availability-distribution/src/requester/tests.rs b/polkadot/node/network/availability-distribution/src/requester/tests.rs
index 09567a8f87d..decb3156004 100644
--- a/polkadot/node/network/availability-distribution/src/requester/tests.rs
+++ b/polkadot/node/network/availability-distribution/src/requester/tests.rs
@@ -208,7 +208,7 @@ fn check_ancestry_lookup_in_same_session() {
 
 	test_harness(test_state.clone(), |mut ctx| async move {
 		let chain = &test_state.relay_chain;
-		let spans: HashMap<Hash, jaeger::PerLeafSpan> = HashMap::new();
+		let spans: HashMap<Hash, (u32, jaeger::PerLeafSpan)> = HashMap::new();
 		let block_number = 1;
 		let update = ActiveLeavesUpdate {
 			activated: Some(new_leaf(chain[block_number], block_number as u32)),
@@ -281,7 +281,7 @@ fn check_ancestry_lookup_in_different_sessions() {
 
 	test_harness(test_state.clone(), |mut ctx| async move {
 		let chain = &test_state.relay_chain;
-		let spans: HashMap<Hash, jaeger::PerLeafSpan> = HashMap::new();
+		let spans: HashMap<Hash, (u32, jaeger::PerLeafSpan)> = HashMap::new();
 		let block_number = 3;
 		let update = ActiveLeavesUpdate {
 			activated: Some(new_leaf(chain[block_number], block_number as u32)),
diff --git a/prdoc/pr_5321.prdoc b/prdoc/pr_5321.prdoc
new file mode 100644
index 00000000000..97f75d28dd5
--- /dev/null
+++ b/prdoc/pr_5321.prdoc
@@ -0,0 +1,11 @@
+title: fix availability-distribution Jaeger spans memory leak
+
+doc:
+  - audience: Node Dev
+    description: |
+      Fixes a memory leak which caused the Jaeger span storage in availability-distribution to never be pruned and therefore increasing indefinitely.
+      This was caused by improper handling of finalized heads. More info in https://github.com/paritytech/polkadot-sdk/issues/5258
+
+crates:
+  - name: polkadot-availability-distribution
+    bump: patch
-- 
GitLab