From c0347f026a7acd66a09e3670f53d24dd32d33e70 Mon Sep 17 00:00:00 2001
From: Robert Klotzner <eskimor@users.noreply.github.com>
Date: Thu, 4 Mar 2021 18:03:24 +0100
Subject: [PATCH] Jaeger spans for availability distribution (#2559)

* Logging functionality for spans.

* Jaeger spans for availability distribution.

* Fix instrumentation to use log target properly.

* Add some tracing instrumentation macros.

* Use int_tags instead of logs.

* Add span per iteration.

* Remove span::log functionality.

* Fix instrumentation log target for real.

* Add jaeger span to responding side as well.

* Revert "Fix instrumentation log target for real."

This reverts commit e1c2a2e6ff6f257e702f07d8a77c2668af92b0ef.

* Revert "Fix instrumentation to use log target properly."

This reverts commit 7caa0bd1acc6fe9727bb3a91851560d756c40ab8.

* target -> subsystem in instrumentatio macro

target is not correct either, and the correct way of using a top level
target = LOG_TARGET does not work, as the macro expects a string literal
and gets confused by the constant `LOG_TARGET`.

* Use kebab-case for spa names.

Co-authored-by: Andronik Ordian <write@reusable.software>

Co-authored-by: Andronik Ordian <write@reusable.software>
---
 polkadot/node/jaeger/src/lib.rs                |  8 ++++++++
 .../src/requester/fetch_task/mod.rs            | 18 +++++++++++++++++-
 .../src/requester/fetch_task/tests.rs          |  1 +
 .../src/requester/mod.rs                       |  2 ++
 .../availability-distribution/src/responder.rs |  8 +++++++-
 .../src/session_cache.rs                       |  1 +
 6 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/polkadot/node/jaeger/src/lib.rs b/polkadot/node/jaeger/src/lib.rs
index 133fa7ce7d5..179734e84fe 100644
--- a/polkadot/node/jaeger/src/lib.rs
+++ b/polkadot/node/jaeger/src/lib.rs
@@ -313,6 +313,14 @@ impl Span {
 		}
 	}
 
+	/// Add an additional int tag to the span.
+	pub fn add_int_tag(&mut self, tag: &str, value: i64) {
+		match self {
+			Self::Enabled(ref mut inner) => inner.add_int_tag(tag, value),
+			Self::Disabled => {},
+		}
+	}
+
 	/// Adds the `FollowsFrom` relationship to this span with respect to the given one.
 	pub fn add_follows_from(&mut self, other: &Self) {
 		match (self, other) {
diff --git a/polkadot/node/network/availability-distribution/src/requester/fetch_task/mod.rs b/polkadot/node/network/availability-distribution/src/requester/fetch_task/mod.rs
index 3e187f9502e..b2a282f2609 100644
--- a/polkadot/node/network/availability-distribution/src/requester/fetch_task/mod.rs
+++ b/polkadot/node/network/availability-distribution/src/requester/fetch_task/mod.rs
@@ -33,7 +33,7 @@ use polkadot_primitives::v1::{
 use polkadot_subsystem::messages::{
 	AllMessages, AvailabilityStoreMessage, NetworkBridgeMessage,
 };
-use polkadot_subsystem::SubsystemContext;
+use polkadot_subsystem::{SubsystemContext, jaeger};
 
 use crate::{
 	error::{Error, Result},
@@ -119,6 +119,9 @@ struct RunningTask {
 	
 	/// Prometheues metrics for reporting results.
 	metrics: Metrics,
+
+	/// Span tracking the fetching of this chunk.
+	span: jaeger::Span,
 }
 
 impl FetchTaskConfig {
@@ -142,6 +145,9 @@ impl FetchTaskConfig {
 			};
 		}
 
+		let mut span = jaeger::candidate_hash_span(&core.candidate_hash, "availability-distribution");
+		span.add_stage(jaeger::Stage::AvailabilityDistribution);
+
 		let prepared_running = RunningTask {
 			session_index: session_info.session_index,
 			group_index: core.group_responsible,
@@ -156,6 +162,7 @@ impl FetchTaskConfig {
 			relay_parent: core.candidate_descriptor.relay_parent,
 			metrics,
 			sender,
+			span,
 		};
 		FetchTaskConfig {
 			live_in,
@@ -168,6 +175,7 @@ impl FetchTask {
 	/// Start fetching a chunk.
 	///
 	/// A task handling the fetching of the configured chunk will be spawned.
+	#[tracing::instrument(level = "trace", skip(config, ctx), fields(subsystem = LOG_TARGET))]
 	pub async fn start<Context>(config: FetchTaskConfig, ctx: &mut Context) -> Result<Self>
 	where
 		Context: SubsystemContext,
@@ -240,6 +248,7 @@ enum TaskError {
 }
 
 impl RunningTask {
+	#[tracing::instrument(level = "trace", skip(self, kill), fields(subsystem = LOG_TARGET))]
 	async fn run(self, kill: oneshot::Receiver<()>) {
 		// Wait for completion/or cancel.
 		let run_it = self.run_inner();
@@ -254,8 +263,13 @@ impl RunningTask {
 		let mut bad_validators = Vec::new();
 		let mut label = FAILED;
 		let mut count: u32 = 0;
+		let mut _span = self.span.child_builder("fetch-task")
+			.with_chunk_index(self.request.index.0)
+			.with_relay_parent(&self.relay_parent)
+			.build();
 		// Try validators in reverse order:
 		while let Some(validator) = self.group.pop() {
+			let _try_span = _span.child("try");
 			// Report retries:
 			if count > 0 {
 				self.metrics.on_retry();
@@ -302,8 +316,10 @@ impl RunningTask {
 			// Ok, let's store it and be happy:
 			self.store_chunk(chunk).await;
 			label = SUCCEEDED;
+			_span.add_string_tag("success", "true");
 			break;
 		}
+		_span.add_int_tag("tries", count as _);
 		self.metrics.on_fetch(label);
 		self.conclude(bad_validators).await;
 	}
diff --git a/polkadot/node/network/availability-distribution/src/requester/fetch_task/tests.rs b/polkadot/node/network/availability-distribution/src/requester/fetch_task/tests.rs
index 1f12000621c..b8c70a324d3 100644
--- a/polkadot/node/network/availability-distribution/src/requester/fetch_task/tests.rs
+++ b/polkadot/node/network/availability-distribution/src/requester/fetch_task/tests.rs
@@ -291,6 +291,7 @@ fn get_test_running_task() -> (RunningTask, mpsc::Receiver<FromFetchTask>) {
 			relay_parent: Hash::repeat_byte(71),
 			sender: tx,
 			metrics: Metrics::new_dummy(),
+			span: jaeger::Span::Disabled,
 		},
 		rx
 	)
diff --git a/polkadot/node/network/availability-distribution/src/requester/mod.rs b/polkadot/node/network/availability-distribution/src/requester/mod.rs
index 914a86ef7de..f613632ccbd 100644
--- a/polkadot/node/network/availability-distribution/src/requester/mod.rs
+++ b/polkadot/node/network/availability-distribution/src/requester/mod.rs
@@ -74,6 +74,7 @@ impl Requester {
 	///
 	/// You must feed it with `ActiveLeavesUpdate` via `update_fetching_heads` and make it progress
 	/// by advancing the stream.
+	#[tracing::instrument(level = "trace", skip(keystore, metrics), fields(subsystem = LOG_TARGET))]
 	pub fn new(keystore: SyncCryptoStorePtr, metrics: Metrics) -> Self {
 		// All we do is forwarding messages, no need to make this big.
 		// Each sender will get one slot, see
@@ -90,6 +91,7 @@ impl Requester {
 	/// Update heads that need availability distribution.
 	///
 	/// For all active heads we will be fetching our chunks for availabilty distribution.
+	#[tracing::instrument(level = "trace", skip(self, ctx, update), fields(subsystem = LOG_TARGET))]
 	pub async fn update_fetching_heads<Context>(
 		&mut self,
 		ctx: &mut Context,
diff --git a/polkadot/node/network/availability-distribution/src/responder.rs b/polkadot/node/network/availability-distribution/src/responder.rs
index c094b17fd66..6704de72a8b 100644
--- a/polkadot/node/network/availability-distribution/src/responder.rs
+++ b/polkadot/node/network/availability-distribution/src/responder.rs
@@ -22,7 +22,7 @@ use polkadot_node_network_protocol::request_response::{request::IncomingRequest,
 use polkadot_primitives::v1::{CandidateHash, ErasureChunk, ValidatorIndex};
 use polkadot_subsystem::{
 	messages::{AllMessages, AvailabilityStoreMessage},
-	SubsystemContext,
+	SubsystemContext, jaeger,
 };
 
 use crate::error::{Error, Result};
@@ -64,6 +64,12 @@ pub async fn answer_request<Context>(
 where
 	Context: SubsystemContext,
 {
+	let mut span = jaeger::candidate_hash_span(&req.payload.candidate_hash, "answer-request");
+	span.add_stage(jaeger::Stage::AvailabilityDistribution);
+	let _child_span = span.child_builder("answer-chunk-request")
+		.with_chunk_index(req.payload.index.0)
+		.build();
+
 	let chunk = query_chunk(ctx, req.payload.candidate_hash, req.payload.index).await?;
 
 	let result = chunk.is_some();
diff --git a/polkadot/node/network/availability-distribution/src/session_cache.rs b/polkadot/node/network/availability-distribution/src/session_cache.rs
index 15f5cad3c75..0212717767b 100644
--- a/polkadot/node/network/availability-distribution/src/session_cache.rs
+++ b/polkadot/node/network/availability-distribution/src/session_cache.rs
@@ -116,6 +116,7 @@ impl SessionCache {
 	///
 	/// Use this function over any `fetch_session_info` if all you need is a reference to
 	/// `SessionInfo`, as it avoids an expensive clone.
+	#[tracing::instrument(level = "trace", skip(self, ctx, with_info), fields(subsystem = LOG_TARGET))]
 	pub async fn with_session_info<Context, F, R>(
 		&mut self,
 		ctx: &mut Context,
-- 
GitLab