Skip to content
Snippets Groups Projects
Commit c0347f02 authored by Robert Klotzner's avatar Robert Klotzner Committed by GitHub
Browse files

Jaeger spans for availability distribution (#2559)


* Logging functionality for spans.

* Jaeger spans for availability distribution.

* Fix instrumentation to use log target properly.

* Add some tracing instrumentation macros.

* Use int_tags instead of logs.

* Add span per iteration.

* Remove span::log functionality.

* Fix instrumentation log target for real.

* Add jaeger span to responding side as well.

* Revert "Fix instrumentation log target for real."

This reverts commit e1c2a2e6ff6f257e702f07d8a77c2668af92b0ef.

* Revert "Fix instrumentation to use log target properly."

This reverts commit 7caa0bd1acc6fe9727bb3a91851560d756c40ab8.

* target -> subsystem in instrumentatio macro

target is not correct either, and the correct way of using a top level
target = LOG_TARGET does not work, as the macro expects a string literal
and gets confused by the constant `LOG_TARGET`.

* Use kebab-case for spa names.

Co-authored-by: default avatarAndronik Ordian <write@reusable.software>

Co-authored-by: default avatarAndronik Ordian <write@reusable.software>
parent 95f1b09b
No related merge requests found
...@@ -313,6 +313,14 @@ impl Span { ...@@ -313,6 +313,14 @@ impl Span {
} }
} }
/// Add an additional int tag to the span.
pub fn add_int_tag(&mut self, tag: &str, value: i64) {
match self {
Self::Enabled(ref mut inner) => inner.add_int_tag(tag, value),
Self::Disabled => {},
}
}
/// Adds the `FollowsFrom` relationship to this span with respect to the given one. /// Adds the `FollowsFrom` relationship to this span with respect to the given one.
pub fn add_follows_from(&mut self, other: &Self) { pub fn add_follows_from(&mut self, other: &Self) {
match (self, other) { match (self, other) {
......
...@@ -33,7 +33,7 @@ use polkadot_primitives::v1::{ ...@@ -33,7 +33,7 @@ use polkadot_primitives::v1::{
use polkadot_subsystem::messages::{ use polkadot_subsystem::messages::{
AllMessages, AvailabilityStoreMessage, NetworkBridgeMessage, AllMessages, AvailabilityStoreMessage, NetworkBridgeMessage,
}; };
use polkadot_subsystem::SubsystemContext; use polkadot_subsystem::{SubsystemContext, jaeger};
use crate::{ use crate::{
error::{Error, Result}, error::{Error, Result},
...@@ -119,6 +119,9 @@ struct RunningTask { ...@@ -119,6 +119,9 @@ struct RunningTask {
/// Prometheues metrics for reporting results. /// Prometheues metrics for reporting results.
metrics: Metrics, metrics: Metrics,
/// Span tracking the fetching of this chunk.
span: jaeger::Span,
} }
impl FetchTaskConfig { impl FetchTaskConfig {
...@@ -142,6 +145,9 @@ impl FetchTaskConfig { ...@@ -142,6 +145,9 @@ impl FetchTaskConfig {
}; };
} }
let mut span = jaeger::candidate_hash_span(&core.candidate_hash, "availability-distribution");
span.add_stage(jaeger::Stage::AvailabilityDistribution);
let prepared_running = RunningTask { let prepared_running = RunningTask {
session_index: session_info.session_index, session_index: session_info.session_index,
group_index: core.group_responsible, group_index: core.group_responsible,
...@@ -156,6 +162,7 @@ impl FetchTaskConfig { ...@@ -156,6 +162,7 @@ impl FetchTaskConfig {
relay_parent: core.candidate_descriptor.relay_parent, relay_parent: core.candidate_descriptor.relay_parent,
metrics, metrics,
sender, sender,
span,
}; };
FetchTaskConfig { FetchTaskConfig {
live_in, live_in,
...@@ -168,6 +175,7 @@ impl FetchTask { ...@@ -168,6 +175,7 @@ impl FetchTask {
/// Start fetching a chunk. /// Start fetching a chunk.
/// ///
/// A task handling the fetching of the configured chunk will be spawned. /// A task handling the fetching of the configured chunk will be spawned.
#[tracing::instrument(level = "trace", skip(config, ctx), fields(subsystem = LOG_TARGET))]
pub async fn start<Context>(config: FetchTaskConfig, ctx: &mut Context) -> Result<Self> pub async fn start<Context>(config: FetchTaskConfig, ctx: &mut Context) -> Result<Self>
where where
Context: SubsystemContext, Context: SubsystemContext,
...@@ -240,6 +248,7 @@ enum TaskError { ...@@ -240,6 +248,7 @@ enum TaskError {
} }
impl RunningTask { impl RunningTask {
#[tracing::instrument(level = "trace", skip(self, kill), fields(subsystem = LOG_TARGET))]
async fn run(self, kill: oneshot::Receiver<()>) { async fn run(self, kill: oneshot::Receiver<()>) {
// Wait for completion/or cancel. // Wait for completion/or cancel.
let run_it = self.run_inner(); let run_it = self.run_inner();
...@@ -254,8 +263,13 @@ impl RunningTask { ...@@ -254,8 +263,13 @@ impl RunningTask {
let mut bad_validators = Vec::new(); let mut bad_validators = Vec::new();
let mut label = FAILED; let mut label = FAILED;
let mut count: u32 = 0; let mut count: u32 = 0;
let mut _span = self.span.child_builder("fetch-task")
.with_chunk_index(self.request.index.0)
.with_relay_parent(&self.relay_parent)
.build();
// Try validators in reverse order: // Try validators in reverse order:
while let Some(validator) = self.group.pop() { while let Some(validator) = self.group.pop() {
let _try_span = _span.child("try");
// Report retries: // Report retries:
if count > 0 { if count > 0 {
self.metrics.on_retry(); self.metrics.on_retry();
...@@ -302,8 +316,10 @@ impl RunningTask { ...@@ -302,8 +316,10 @@ impl RunningTask {
// Ok, let's store it and be happy: // Ok, let's store it and be happy:
self.store_chunk(chunk).await; self.store_chunk(chunk).await;
label = SUCCEEDED; label = SUCCEEDED;
_span.add_string_tag("success", "true");
break; break;
} }
_span.add_int_tag("tries", count as _);
self.metrics.on_fetch(label); self.metrics.on_fetch(label);
self.conclude(bad_validators).await; self.conclude(bad_validators).await;
} }
......
...@@ -291,6 +291,7 @@ fn get_test_running_task() -> (RunningTask, mpsc::Receiver<FromFetchTask>) { ...@@ -291,6 +291,7 @@ fn get_test_running_task() -> (RunningTask, mpsc::Receiver<FromFetchTask>) {
relay_parent: Hash::repeat_byte(71), relay_parent: Hash::repeat_byte(71),
sender: tx, sender: tx,
metrics: Metrics::new_dummy(), metrics: Metrics::new_dummy(),
span: jaeger::Span::Disabled,
}, },
rx rx
) )
......
...@@ -74,6 +74,7 @@ impl Requester { ...@@ -74,6 +74,7 @@ impl Requester {
/// ///
/// You must feed it with `ActiveLeavesUpdate` via `update_fetching_heads` and make it progress /// You must feed it with `ActiveLeavesUpdate` via `update_fetching_heads` and make it progress
/// by advancing the stream. /// by advancing the stream.
#[tracing::instrument(level = "trace", skip(keystore, metrics), fields(subsystem = LOG_TARGET))]
pub fn new(keystore: SyncCryptoStorePtr, metrics: Metrics) -> Self { pub fn new(keystore: SyncCryptoStorePtr, metrics: Metrics) -> Self {
// All we do is forwarding messages, no need to make this big. // All we do is forwarding messages, no need to make this big.
// Each sender will get one slot, see // Each sender will get one slot, see
...@@ -90,6 +91,7 @@ impl Requester { ...@@ -90,6 +91,7 @@ impl Requester {
/// Update heads that need availability distribution. /// Update heads that need availability distribution.
/// ///
/// For all active heads we will be fetching our chunks for availabilty distribution. /// For all active heads we will be fetching our chunks for availabilty distribution.
#[tracing::instrument(level = "trace", skip(self, ctx, update), fields(subsystem = LOG_TARGET))]
pub async fn update_fetching_heads<Context>( pub async fn update_fetching_heads<Context>(
&mut self, &mut self,
ctx: &mut Context, ctx: &mut Context,
......
...@@ -22,7 +22,7 @@ use polkadot_node_network_protocol::request_response::{request::IncomingRequest, ...@@ -22,7 +22,7 @@ use polkadot_node_network_protocol::request_response::{request::IncomingRequest,
use polkadot_primitives::v1::{CandidateHash, ErasureChunk, ValidatorIndex}; use polkadot_primitives::v1::{CandidateHash, ErasureChunk, ValidatorIndex};
use polkadot_subsystem::{ use polkadot_subsystem::{
messages::{AllMessages, AvailabilityStoreMessage}, messages::{AllMessages, AvailabilityStoreMessage},
SubsystemContext, SubsystemContext, jaeger,
}; };
use crate::error::{Error, Result}; use crate::error::{Error, Result};
...@@ -64,6 +64,12 @@ pub async fn answer_request<Context>( ...@@ -64,6 +64,12 @@ pub async fn answer_request<Context>(
where where
Context: SubsystemContext, Context: SubsystemContext,
{ {
let mut span = jaeger::candidate_hash_span(&req.payload.candidate_hash, "answer-request");
span.add_stage(jaeger::Stage::AvailabilityDistribution);
let _child_span = span.child_builder("answer-chunk-request")
.with_chunk_index(req.payload.index.0)
.build();
let chunk = query_chunk(ctx, req.payload.candidate_hash, req.payload.index).await?; let chunk = query_chunk(ctx, req.payload.candidate_hash, req.payload.index).await?;
let result = chunk.is_some(); let result = chunk.is_some();
......
...@@ -116,6 +116,7 @@ impl SessionCache { ...@@ -116,6 +116,7 @@ impl SessionCache {
/// ///
/// Use this function over any `fetch_session_info` if all you need is a reference to /// Use this function over any `fetch_session_info` if all you need is a reference to
/// `SessionInfo`, as it avoids an expensive clone. /// `SessionInfo`, as it avoids an expensive clone.
#[tracing::instrument(level = "trace", skip(self, ctx, with_info), fields(subsystem = LOG_TARGET))]
pub async fn with_session_info<Context, F, R>( pub async fn with_session_info<Context, F, R>(
&mut self, &mut self,
ctx: &mut Context, ctx: &mut Context,
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment