Unverified Commit 58ff7f59 authored by Robert Klotzner's avatar Robert Klotzner Committed by GitHub
Browse files

Jaeger spans for availability distribution (#2559)

* Logging functionality for spans.

* Jaeger spans for availability distribution.

* Fix instrumentation to use log target properly.

* Add some tracing instrumentation macros.

* Use int_tags instead of logs.

* Add span per iteration.

* Remove span::log functionality.

* Fix instrumentation log target for real.

* Add jaeger span to responding side as well.

* Revert "Fix instrumentation log target for real."

This reverts commit e1c2a2e6.

* Revert "Fix instrumentation to use log target properly."

This reverts commit 7caa0bd1

.

* target -> subsystem in instrumentatio macro

target is not correct either, and the correct way of using a top level
target = LOG_TARGET does not work, as the macro expects a string literal
and gets confused by the constant `LOG_TARGET`.

* Use kebab-case for spa names.

Co-authored-by: Andronik Ordian's avatarAndronik Ordian <write@reusable.software>

Co-authored-by: Andronik Ordian's avatarAndronik Ordian <write@reusable.software>
parent 387040e5
Pipeline #126917 passed with stages
in 33 minutes and 50 seconds
......@@ -313,6 +313,14 @@ impl Span {
}
}
/// Add an additional int tag to the span.
pub fn add_int_tag(&mut self, tag: &str, value: i64) {
match self {
Self::Enabled(ref mut inner) => inner.add_int_tag(tag, value),
Self::Disabled => {},
}
}
/// Adds the `FollowsFrom` relationship to this span with respect to the given one.
pub fn add_follows_from(&mut self, other: &Self) {
match (self, other) {
......
......@@ -33,7 +33,7 @@ use polkadot_primitives::v1::{
use polkadot_subsystem::messages::{
AllMessages, AvailabilityStoreMessage, NetworkBridgeMessage,
};
use polkadot_subsystem::SubsystemContext;
use polkadot_subsystem::{SubsystemContext, jaeger};
use crate::{
error::{Error, Result},
......@@ -119,6 +119,9 @@ struct RunningTask {
/// Prometheues metrics for reporting results.
metrics: Metrics,
/// Span tracking the fetching of this chunk.
span: jaeger::Span,
}
impl FetchTaskConfig {
......@@ -142,6 +145,9 @@ impl FetchTaskConfig {
};
}
let mut span = jaeger::candidate_hash_span(&core.candidate_hash, "availability-distribution");
span.add_stage(jaeger::Stage::AvailabilityDistribution);
let prepared_running = RunningTask {
session_index: session_info.session_index,
group_index: core.group_responsible,
......@@ -156,6 +162,7 @@ impl FetchTaskConfig {
relay_parent: core.candidate_descriptor.relay_parent,
metrics,
sender,
span,
};
FetchTaskConfig {
live_in,
......@@ -168,6 +175,7 @@ impl FetchTask {
/// Start fetching a chunk.
///
/// A task handling the fetching of the configured chunk will be spawned.
#[tracing::instrument(level = "trace", skip(config, ctx), fields(subsystem = LOG_TARGET))]
pub async fn start<Context>(config: FetchTaskConfig, ctx: &mut Context) -> Result<Self>
where
Context: SubsystemContext,
......@@ -240,6 +248,7 @@ enum TaskError {
}
impl RunningTask {
#[tracing::instrument(level = "trace", skip(self, kill), fields(subsystem = LOG_TARGET))]
async fn run(self, kill: oneshot::Receiver<()>) {
// Wait for completion/or cancel.
let run_it = self.run_inner();
......@@ -254,8 +263,13 @@ impl RunningTask {
let mut bad_validators = Vec::new();
let mut label = FAILED;
let mut count: u32 = 0;
let mut _span = self.span.child_builder("fetch-task")
.with_chunk_index(self.request.index.0)
.with_relay_parent(&self.relay_parent)
.build();
// Try validators in reverse order:
while let Some(validator) = self.group.pop() {
let _try_span = _span.child("try");
// Report retries:
if count > 0 {
self.metrics.on_retry();
......@@ -302,8 +316,10 @@ impl RunningTask {
// Ok, let's store it and be happy:
self.store_chunk(chunk).await;
label = SUCCEEDED;
_span.add_string_tag("success", "true");
break;
}
_span.add_int_tag("tries", count as _);
self.metrics.on_fetch(label);
self.conclude(bad_validators).await;
}
......
......@@ -291,6 +291,7 @@ fn get_test_running_task() -> (RunningTask, mpsc::Receiver<FromFetchTask>) {
relay_parent: Hash::repeat_byte(71),
sender: tx,
metrics: Metrics::new_dummy(),
span: jaeger::Span::Disabled,
},
rx
)
......
......@@ -74,6 +74,7 @@ impl Requester {
///
/// You must feed it with `ActiveLeavesUpdate` via `update_fetching_heads` and make it progress
/// by advancing the stream.
#[tracing::instrument(level = "trace", skip(keystore, metrics), fields(subsystem = LOG_TARGET))]
pub fn new(keystore: SyncCryptoStorePtr, metrics: Metrics) -> Self {
// All we do is forwarding messages, no need to make this big.
// Each sender will get one slot, see
......@@ -90,6 +91,7 @@ impl Requester {
/// Update heads that need availability distribution.
///
/// For all active heads we will be fetching our chunks for availabilty distribution.
#[tracing::instrument(level = "trace", skip(self, ctx, update), fields(subsystem = LOG_TARGET))]
pub async fn update_fetching_heads<Context>(
&mut self,
ctx: &mut Context,
......
......@@ -22,7 +22,7 @@ use polkadot_node_network_protocol::request_response::{request::IncomingRequest,
use polkadot_primitives::v1::{CandidateHash, ErasureChunk, ValidatorIndex};
use polkadot_subsystem::{
messages::{AllMessages, AvailabilityStoreMessage},
SubsystemContext,
SubsystemContext, jaeger,
};
use crate::error::{Error, Result};
......@@ -64,6 +64,12 @@ pub async fn answer_request<Context>(
where
Context: SubsystemContext,
{
let mut span = jaeger::candidate_hash_span(&req.payload.candidate_hash, "answer-request");
span.add_stage(jaeger::Stage::AvailabilityDistribution);
let _child_span = span.child_builder("answer-chunk-request")
.with_chunk_index(req.payload.index.0)
.build();
let chunk = query_chunk(ctx, req.payload.candidate_hash, req.payload.index).await?;
let result = chunk.is_some();
......
......@@ -116,6 +116,7 @@ impl SessionCache {
///
/// Use this function over any `fetch_session_info` if all you need is a reference to
/// `SessionInfo`, as it avoids an expensive clone.
#[tracing::instrument(level = "trace", skip(self, ctx, with_info), fields(subsystem = LOG_TARGET))]
pub async fn with_session_info<Context, F, R>(
&mut self,
ctx: &mut Context,
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment