From c0347f026a7acd66a09e3670f53d24dd32d33e70 Mon Sep 17 00:00:00 2001 From: Robert Klotzner <eskimor@users.noreply.github.com> Date: Thu, 4 Mar 2021 18:03:24 +0100 Subject: [PATCH] Jaeger spans for availability distribution (#2559) * Logging functionality for spans. * Jaeger spans for availability distribution. * Fix instrumentation to use log target properly. * Add some tracing instrumentation macros. * Use int_tags instead of logs. * Add span per iteration. * Remove span::log functionality. * Fix instrumentation log target for real. * Add jaeger span to responding side as well. * Revert "Fix instrumentation log target for real." This reverts commit e1c2a2e6ff6f257e702f07d8a77c2668af92b0ef. * Revert "Fix instrumentation to use log target properly." This reverts commit 7caa0bd1acc6fe9727bb3a91851560d756c40ab8. * target -> subsystem in instrumentatio macro target is not correct either, and the correct way of using a top level target = LOG_TARGET does not work, as the macro expects a string literal and gets confused by the constant `LOG_TARGET`. * Use kebab-case for spa names. Co-authored-by: Andronik Ordian <write@reusable.software> Co-authored-by: Andronik Ordian <write@reusable.software> --- polkadot/node/jaeger/src/lib.rs | 8 ++++++++ .../src/requester/fetch_task/mod.rs | 18 +++++++++++++++++- .../src/requester/fetch_task/tests.rs | 1 + .../src/requester/mod.rs | 2 ++ .../availability-distribution/src/responder.rs | 8 +++++++- .../src/session_cache.rs | 1 + 6 files changed, 36 insertions(+), 2 deletions(-) diff --git a/polkadot/node/jaeger/src/lib.rs b/polkadot/node/jaeger/src/lib.rs index 133fa7ce7d5..179734e84fe 100644 --- a/polkadot/node/jaeger/src/lib.rs +++ b/polkadot/node/jaeger/src/lib.rs @@ -313,6 +313,14 @@ impl Span { } } + /// Add an additional int tag to the span. + pub fn add_int_tag(&mut self, tag: &str, value: i64) { + match self { + Self::Enabled(ref mut inner) => inner.add_int_tag(tag, value), + Self::Disabled => {}, + } + } + /// Adds the `FollowsFrom` relationship to this span with respect to the given one. pub fn add_follows_from(&mut self, other: &Self) { match (self, other) { diff --git a/polkadot/node/network/availability-distribution/src/requester/fetch_task/mod.rs b/polkadot/node/network/availability-distribution/src/requester/fetch_task/mod.rs index 3e187f9502e..b2a282f2609 100644 --- a/polkadot/node/network/availability-distribution/src/requester/fetch_task/mod.rs +++ b/polkadot/node/network/availability-distribution/src/requester/fetch_task/mod.rs @@ -33,7 +33,7 @@ use polkadot_primitives::v1::{ use polkadot_subsystem::messages::{ AllMessages, AvailabilityStoreMessage, NetworkBridgeMessage, }; -use polkadot_subsystem::SubsystemContext; +use polkadot_subsystem::{SubsystemContext, jaeger}; use crate::{ error::{Error, Result}, @@ -119,6 +119,9 @@ struct RunningTask { /// Prometheues metrics for reporting results. metrics: Metrics, + + /// Span tracking the fetching of this chunk. + span: jaeger::Span, } impl FetchTaskConfig { @@ -142,6 +145,9 @@ impl FetchTaskConfig { }; } + let mut span = jaeger::candidate_hash_span(&core.candidate_hash, "availability-distribution"); + span.add_stage(jaeger::Stage::AvailabilityDistribution); + let prepared_running = RunningTask { session_index: session_info.session_index, group_index: core.group_responsible, @@ -156,6 +162,7 @@ impl FetchTaskConfig { relay_parent: core.candidate_descriptor.relay_parent, metrics, sender, + span, }; FetchTaskConfig { live_in, @@ -168,6 +175,7 @@ impl FetchTask { /// Start fetching a chunk. /// /// A task handling the fetching of the configured chunk will be spawned. + #[tracing::instrument(level = "trace", skip(config, ctx), fields(subsystem = LOG_TARGET))] pub async fn start<Context>(config: FetchTaskConfig, ctx: &mut Context) -> Result<Self> where Context: SubsystemContext, @@ -240,6 +248,7 @@ enum TaskError { } impl RunningTask { + #[tracing::instrument(level = "trace", skip(self, kill), fields(subsystem = LOG_TARGET))] async fn run(self, kill: oneshot::Receiver<()>) { // Wait for completion/or cancel. let run_it = self.run_inner(); @@ -254,8 +263,13 @@ impl RunningTask { let mut bad_validators = Vec::new(); let mut label = FAILED; let mut count: u32 = 0; + let mut _span = self.span.child_builder("fetch-task") + .with_chunk_index(self.request.index.0) + .with_relay_parent(&self.relay_parent) + .build(); // Try validators in reverse order: while let Some(validator) = self.group.pop() { + let _try_span = _span.child("try"); // Report retries: if count > 0 { self.metrics.on_retry(); @@ -302,8 +316,10 @@ impl RunningTask { // Ok, let's store it and be happy: self.store_chunk(chunk).await; label = SUCCEEDED; + _span.add_string_tag("success", "true"); break; } + _span.add_int_tag("tries", count as _); self.metrics.on_fetch(label); self.conclude(bad_validators).await; } diff --git a/polkadot/node/network/availability-distribution/src/requester/fetch_task/tests.rs b/polkadot/node/network/availability-distribution/src/requester/fetch_task/tests.rs index 1f12000621c..b8c70a324d3 100644 --- a/polkadot/node/network/availability-distribution/src/requester/fetch_task/tests.rs +++ b/polkadot/node/network/availability-distribution/src/requester/fetch_task/tests.rs @@ -291,6 +291,7 @@ fn get_test_running_task() -> (RunningTask, mpsc::Receiver<FromFetchTask>) { relay_parent: Hash::repeat_byte(71), sender: tx, metrics: Metrics::new_dummy(), + span: jaeger::Span::Disabled, }, rx ) diff --git a/polkadot/node/network/availability-distribution/src/requester/mod.rs b/polkadot/node/network/availability-distribution/src/requester/mod.rs index 914a86ef7de..f613632ccbd 100644 --- a/polkadot/node/network/availability-distribution/src/requester/mod.rs +++ b/polkadot/node/network/availability-distribution/src/requester/mod.rs @@ -74,6 +74,7 @@ impl Requester { /// /// You must feed it with `ActiveLeavesUpdate` via `update_fetching_heads` and make it progress /// by advancing the stream. + #[tracing::instrument(level = "trace", skip(keystore, metrics), fields(subsystem = LOG_TARGET))] pub fn new(keystore: SyncCryptoStorePtr, metrics: Metrics) -> Self { // All we do is forwarding messages, no need to make this big. // Each sender will get one slot, see @@ -90,6 +91,7 @@ impl Requester { /// Update heads that need availability distribution. /// /// For all active heads we will be fetching our chunks for availabilty distribution. + #[tracing::instrument(level = "trace", skip(self, ctx, update), fields(subsystem = LOG_TARGET))] pub async fn update_fetching_heads<Context>( &mut self, ctx: &mut Context, diff --git a/polkadot/node/network/availability-distribution/src/responder.rs b/polkadot/node/network/availability-distribution/src/responder.rs index c094b17fd66..6704de72a8b 100644 --- a/polkadot/node/network/availability-distribution/src/responder.rs +++ b/polkadot/node/network/availability-distribution/src/responder.rs @@ -22,7 +22,7 @@ use polkadot_node_network_protocol::request_response::{request::IncomingRequest, use polkadot_primitives::v1::{CandidateHash, ErasureChunk, ValidatorIndex}; use polkadot_subsystem::{ messages::{AllMessages, AvailabilityStoreMessage}, - SubsystemContext, + SubsystemContext, jaeger, }; use crate::error::{Error, Result}; @@ -64,6 +64,12 @@ pub async fn answer_request<Context>( where Context: SubsystemContext, { + let mut span = jaeger::candidate_hash_span(&req.payload.candidate_hash, "answer-request"); + span.add_stage(jaeger::Stage::AvailabilityDistribution); + let _child_span = span.child_builder("answer-chunk-request") + .with_chunk_index(req.payload.index.0) + .build(); + let chunk = query_chunk(ctx, req.payload.candidate_hash, req.payload.index).await?; let result = chunk.is_some(); diff --git a/polkadot/node/network/availability-distribution/src/session_cache.rs b/polkadot/node/network/availability-distribution/src/session_cache.rs index 15f5cad3c75..0212717767b 100644 --- a/polkadot/node/network/availability-distribution/src/session_cache.rs +++ b/polkadot/node/network/availability-distribution/src/session_cache.rs @@ -116,6 +116,7 @@ impl SessionCache { /// /// Use this function over any `fetch_session_info` if all you need is a reference to /// `SessionInfo`, as it avoids an expensive clone. + #[tracing::instrument(level = "trace", skip(self, ctx, with_info), fields(subsystem = LOG_TARGET))] pub async fn with_session_info<Context, F, R>( &mut self, ctx: &mut Context, -- GitLab