Unverified Commit 9758a180 authored by Andronik Ordian, committed by GitHub

more resilient subsystems (#1908)

* backing: extract log target

* bitfield-signing: extract log target

* utils: fix a typo

* provisioner: extract log target

* candidate selection: remove unused error variant

* bitfield-distribution: change the return type of run

* pov-distribution: extract log target

* collator-protocol: simplify runtime request

* collation-generation: do not exit early on error

* collation-generation: do not exit on double init

* collator-protocol: do not exit on errors and rename LOG_TARGET

* collator-protocol: a workaround for unused imports warning

* Update node/network/bitfield-distribution/src/lib.rs

* collation-generation: elevate warn! to error!

* collator-protocol: fix imports

* post merge fix

* fix compilation
parent e488c6cd
Pipeline #113283 passed with stages in 23 minutes and 27 seconds
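Taken together, the hunks below apply one pattern across the subsystems: instead of propagating an error out of the event loop (which shuts the subsystem down), the error is logged under a per-subsystem LOG_TARGET and the loop moves on to the next message, returning only on an explicit Conclude signal. A minimal, self-contained sketch of that pattern follows — it uses hypothetical Message/WorkError types and eprintln! as a stand-in for log::warn!, not the actual subsystem API:

// Hypothetical message and error types, for illustration only.
enum Message {
    DoWork(u32),
    Conclude,
}

#[derive(Debug)]
struct WorkError(String);

fn handle(n: u32) -> Result<(), WorkError> {
    if n % 2 == 0 {
        Ok(())
    } else {
        Err(WorkError(format!("cannot handle {}", n)))
    }
}

fn run(messages: Vec<Message>) {
    for msg in messages {
        match msg {
            // Before: `handle(n)?` would have terminated `run` on the first
            // failure. After: log the error and keep processing messages.
            Message::DoWork(n) => {
                if let Err(e) = handle(n) {
                    eprintln!("failed to handle message: {:?}", e); // stand-in for log::warn!
                }
            }
            // Only an explicit Conclude signal stops the loop.
            Message::Conclude => return,
        }
    }
}

fn main() {
    run(vec![Message::DoWork(1), Message::DoWork(2), Message::Conclude]);
}

The trade-off, visible in the collation-generation and bitfield-distribution hunks below, is that `?` at call sites becomes `if let Err(e) = ... { warn!(...) }`, and run no longer needs to return a Result at all.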
......@@ -129,8 +129,7 @@ impl CollationGenerationSubsystem {
if let Err(err) =
handle_new_activations(config.clone(), &activated, ctx, metrics, sender).await
{
log::warn!(target: LOG_TARGET, "failed to handle new activations: {:?}", err);
return true;
log::warn!(target: LOG_TARGET, "failed to handle new activations: {}", err);
};
}
false
......@@ -140,12 +139,11 @@ impl CollationGenerationSubsystem {
msg: CollationGenerationMessage::Initialize(config),
}) => {
if self.config.is_some() {
log::warn!(target: LOG_TARGET, "double initialization");
true
log::error!(target: LOG_TARGET, "double initialization");
} else {
self.config = Some(Arc::new(config));
false
}
false
}
Ok(Signal(BlockFinalized(_))) => false,
Err(err) => {
......
......@@ -68,6 +68,8 @@ use statement_table::{
};
use thiserror::Error;
const LOG_TARGET: &str = "candidate_backing";
#[derive(Debug, Error)]
enum Error {
#[error("Candidate is not found")]
......@@ -763,7 +765,7 @@ impl util::JobTrait for CandidateBackingJob {
Ok(x) => x,
Err(e) => {
log::warn!(
target: "candidate_backing",
target: LOG_TARGET,
"Failed to fetch runtime API data for job: {:?}",
e,
);
......@@ -803,7 +805,7 @@ impl util::JobTrait for CandidateBackingJob {
Err(util::Error::NotAValidator) => { return Ok(()) },
Err(e) => {
log::warn!(
target: "candidate_backing",
target: LOG_TARGET,
"Cannot participate in candidate backing: {:?}",
e
);
......
......@@ -40,6 +40,7 @@ use thiserror::Error;
/// Delay between starting a bitfield signing job and its attempting to create a bitfield.
const JOB_DELAY: Duration = Duration::from_millis(1500);
const LOG_TARGET: &str = "bitfield_signing";
/// Each `BitfieldSigningJob` prepares a signed bitfield for a single relay parent.
pub struct BitfieldSigningJob;
......@@ -163,7 +164,7 @@ async fn get_core_availability(
Ok(None) => return Ok(false),
Err(e) => {
// Don't take down the node on runtime API errors.
log::warn!(target: "bitfield_signing", "Encountered a runtime API error: {:?}", e);
log::warn!(target: LOG_TARGET, "Encountered a runtime API error: {:?}", e);
return Ok(false);
}
};
......@@ -288,7 +289,7 @@ impl JobTrait for BitfieldSigningJob {
{
Err(Error::Runtime(runtime_err)) => {
// Don't take down the node on runtime API errors.
log::warn!(target: "bitfield_signing", "Encountered a runtime API error: {:?}", runtime_err);
log::warn!(target: LOG_TARGET, "Encountered a runtime API error: {:?}", runtime_err);
return Ok(());
}
Err(err) => return Err(err),
......
......@@ -24,8 +24,10 @@ use futures::{
prelude::*,
};
use polkadot_node_subsystem::{
errors::{ChainApiError, RuntimeApiError},
messages::{AllMessages, CandidateBackingMessage, CandidateSelectionMessage, CollatorProtocolMessage},
errors::ChainApiError,
messages::{
AllMessages, CandidateBackingMessage, CandidateSelectionMessage, CollatorProtocolMessage,
},
};
use polkadot_node_subsystem_util::{
self as util, delegated_subsystem, JobTrait, ToJobTrait,
......@@ -118,8 +120,6 @@ enum Error {
OneshotRecv(#[from] oneshot::Canceled),
#[error(transparent)]
ChainApi(#[from] ChainApiError),
#[error(transparent)]
Runtime(#[from] RuntimeApiError),
}
impl JobTrait for CandidateSelectionJob {
......
......@@ -45,6 +45,8 @@ use std::{convert::TryFrom, pin::Pin};
use std::collections::BTreeMap;
use thiserror::Error;
const LOG_TARGET: &str = "provisioner";
struct ProvisioningJob {
relay_parent: Hash,
sender: mpsc::Sender<FromJob>,
......@@ -203,7 +205,7 @@ impl ProvisioningJob {
)
.await
{
log::warn!(target: "provisioner", "failed to assemble or send inherent data: {:?}", err);
log::warn!(target: LOG_TARGET, "failed to assemble or send inherent data: {:?}", err);
self.metrics.on_inherent_data_request(Err(()));
} else {
self.metrics.on_inherent_data_request(Ok(()));
......@@ -459,7 +461,8 @@ fn bitfields_indicate_availability(
// however, in practice, that would just push off an error-handling routine which would look a whole lot like this one.
// simpler to just handle the error internally here.
log::warn!(
target: "provisioner", "attempted to set a transverse bit at idx {} which is greater than bitfield size {}",
target: LOG_TARGET,
"attempted to set a transverse bit at idx {} which is greater than bitfield size {}",
validator_idx,
availability_len,
);
......
......@@ -23,9 +23,9 @@
#![deny(unused_crate_dependencies)]
use codec::{Decode, Encode};
use futures::{channel::oneshot, FutureExt, TryFutureExt};
use futures::{channel::oneshot, FutureExt};
use log::{trace, warn};
use log::{debug, trace, warn};
use polkadot_subsystem::messages::*;
use polkadot_subsystem::{
ActiveLeavesUpdate, FromOverseer, OverseerSignal, SpawnedSubsystem, Subsystem, SubsystemContext, SubsystemResult,
......@@ -33,7 +33,6 @@ use polkadot_subsystem::{
use polkadot_node_subsystem_util::metrics::{self, prometheus};
use polkadot_primitives::v1::{Hash, SignedAvailabilityBitfield, SigningContext, ValidatorId};
use polkadot_node_network_protocol::{v1 as protocol_v1, PeerId, NetworkBridgeEvent, View, ReputationChange};
use polkadot_subsystem::SubsystemError;
use std::collections::{HashMap, HashSet};
const COST_SIGNATURE_INVALID: ReputationChange =
......@@ -131,7 +130,7 @@ impl PerRelayParentData {
}
}
const TARGET: &'static str = "bitd";
const LOG_TARGET: &str = "bitfield_distribution";
/// The bitfield distribution subsystem.
pub struct BitfieldDistribution {
......@@ -145,65 +144,82 @@ impl BitfieldDistribution {
}
/// Start processing work as passed on from the Overseer.
async fn run<Context>(self, mut ctx: Context) -> SubsystemResult<()>
async fn run<Context>(self, mut ctx: Context)
where
Context: SubsystemContext<Message = BitfieldDistributionMessage>,
{
// work: process incoming messages from the overseer and process accordingly.
let mut state = ProtocolState::default();
loop {
let message = ctx.recv().await?;
let message = match ctx.recv().await {
Ok(message) => message,
Err(e) => {
debug!(target: LOG_TARGET, "Failed to receive a message from Overseer: {}, exiting", e);
return;
},
};
match message {
FromOverseer::Communication {
msg: BitfieldDistributionMessage::DistributeBitfield(hash, signed_availability),
} => {
trace!(target: TARGET, "Processing DistributeBitfield");
handle_bitfield_distribution(&mut ctx, &mut state, &self.metrics, hash, signed_availability)
.await?;
trace!(target: LOG_TARGET, "Processing DistributeBitfield");
if let Err(err) = handle_bitfield_distribution(
&mut ctx,
&mut state,
&self.metrics,
hash,
signed_availability,
).await {
warn!(target: LOG_TARGET, "Failed to reply to `DistributeBitfield` message: {}", err);
}
}
FromOverseer::Communication {
msg: BitfieldDistributionMessage::NetworkBridgeUpdateV1(event),
} => {
trace!(target: TARGET, "Processing NetworkMessage");
trace!(target: LOG_TARGET, "Processing NetworkMessage");
// a network message was received
if let Err(e) = handle_network_msg(&mut ctx, &mut state, &self.metrics, event).await {
warn!(target: TARGET, "Failed to handle incoming network messages: {:?}", e);
warn!(target: LOG_TARGET, "Failed to handle incoming network messages: {:?}", e);
}
}
FromOverseer::Signal(OverseerSignal::ActiveLeaves(ActiveLeavesUpdate { activated, deactivated })) => {
for relay_parent in activated {
trace!(target: TARGET, "Start {:?}", relay_parent);
trace!(target: LOG_TARGET, "Start {:?}", relay_parent);
// query basic system parameters once
if let Some((validator_set, signing_context)) =
query_basics(&mut ctx, relay_parent).await?
{
// If our runtime API fails, we don't take down the node,
// but we might alter peers' reputations erroneously as a result
// of not having the correct bookkeeping. If we have lost a race
// with state pruning, it is unlikely that peers will be sending
// us anything to do with this relay-parent anyway.
let _ = state.per_relay_parent.insert(
relay_parent,
PerRelayParentData {
signing_context,
validator_set,
..Default::default()
},
);
match query_basics(&mut ctx, relay_parent).await {
Ok(Some((validator_set, signing_context))) => {
// If our runtime API fails, we don't take down the node,
// but we might alter peers' reputations erroneously as a result
// of not having the correct bookkeeping. If we have lost a race
// with state pruning, it is unlikely that peers will be sending
// us anything to do with this relay-parent anyway.
let _ = state.per_relay_parent.insert(
relay_parent,
PerRelayParentData {
signing_context,
validator_set,
..Default::default()
},
);
}
Err(e) => {
warn!(target: LOG_TARGET, "query_basics has failed: {}", e);
}
_ => {},
}
}
for relay_parent in deactivated {
trace!(target: TARGET, "Stop {:?}", relay_parent);
trace!(target: LOG_TARGET, "Stop {:?}", relay_parent);
// defer the cleanup to the view change
}
}
FromOverseer::Signal(OverseerSignal::BlockFinalized(hash)) => {
trace!(target: TARGET, "Block finalized {:?}", hash);
trace!(target: LOG_TARGET, "Block finalized {:?}", hash);
}
FromOverseer::Signal(OverseerSignal::Conclude) => {
trace!(target: TARGET, "Conclude");
return Ok(());
trace!(target: LOG_TARGET, "Conclude");
return;
}
}
}
......@@ -219,7 +235,7 @@ async fn modify_reputation<Context>(
where
Context: SubsystemContext<Message = BitfieldDistributionMessage>,
{
trace!(target: TARGET, "Reputation change of {:?} for peer {:?}", rep, peer);
trace!(target: LOG_TARGET, "Reputation change of {:?} for peer {:?}", rep, peer);
ctx.send_message(AllMessages::NetworkBridge(
NetworkBridgeMessage::ReportPeer(peer, rep),
))
......@@ -245,7 +261,7 @@ where
job_data
} else {
trace!(
target: TARGET,
target: LOG_TARGET,
"Not supposed to work on relay parent {} related data",
relay_parent
);
......@@ -254,7 +270,7 @@ where
};
let validator_set = &job_data.validator_set;
if validator_set.is_empty() {
trace!(target: TARGET, "Validator set for {:?} is empty", relay_parent);
trace!(target: LOG_TARGET, "Validator set for {:?} is empty", relay_parent);
return Ok(());
}
......@@ -262,7 +278,7 @@ where
let validator = if let Some(validator) = validator_set.get(validator_index) {
validator.clone()
} else {
trace!(target: TARGET, "Could not find a validator for index {}", validator_index);
trace!(target: LOG_TARGET, "Could not find a validator for index {}", validator_index);
return Ok(());
};
......@@ -327,7 +343,7 @@ where
if interested_peers.is_empty() {
trace!(
target: TARGET,
target: LOG_TARGET,
"No peers are interested in gossip for relay parent {:?}",
message.relay_parent
);
......@@ -370,7 +386,7 @@ where
let validator_set = &job_data.validator_set;
if validator_set.is_empty() {
trace!(
target: TARGET,
target: LOG_TARGET,
"Validator set for relay parent {:?} is empty",
&message.relay_parent
);
......@@ -412,7 +428,7 @@ where
// only relay_message a message of a validator once
if one_per_validator.get(&validator).is_some() {
trace!(
target: TARGET,
target: LOG_TARGET,
"Already received a message for validator at index {}",
validator_index
);
......@@ -458,7 +474,7 @@ where
NetworkBridgeEvent::PeerMessage(remote, message) => {
match message {
protocol_v1::BitfieldDistributionMessage::Bitfield(relay_parent, bitfield) => {
trace!(target: TARGET, "Received bitfield gossip from peer {:?}", &remote);
trace!(target: LOG_TARGET, "Received bitfield gossip from peer {:?}", &remote);
let gossiped_bitfield = BitfieldGossipMessage {
relay_parent,
signed_availability: bitfield,
......@@ -478,7 +494,7 @@ fn handle_our_view_change(state: &mut ProtocolState, view: View) -> SubsystemRes
for added in state.view.difference(&old_view) {
if !state.per_relay_parent.contains_key(&added) {
warn!(
target: TARGET,
target: LOG_TARGET,
"Our view contains {} but the overseer never told use we should work on this",
&added
);
......@@ -583,9 +599,7 @@ where
{
fn start(self, ctx: C) -> SpawnedSubsystem {
let future = self.run(ctx)
.map_err(|e| {
SubsystemError::with_origin("bitfield-distribution", e)
})
.map(|_| Ok(()))
.boxed();
SpawnedSubsystem {
......@@ -625,7 +639,7 @@ where
SigningContext { parent_hash: relay_parent, session_index: s },
))),
(Err(e), _) | (_, Err(e)) => {
warn!(target: TARGET, "Failed to fetch basics from runtime API: {:?}", e);
warn!(target: LOG_TARGET, "Failed to fetch basics from runtime API: {:?}", e);
Ok(None)
}
}
......
......@@ -16,9 +16,9 @@
use std::collections::HashMap;
use super::{TARGET, Result};
use super::{LOG_TARGET, Result};
use futures::{StreamExt, channel::oneshot, task::Poll};
use futures::{StreamExt, task::Poll};
use log::warn;
use polkadot_primitives::v1::{
......@@ -28,7 +28,7 @@ use polkadot_primitives::v1::{
use polkadot_subsystem::{
FromOverseer, OverseerSignal, SubsystemContext,
messages::{
AllMessages, CollatorProtocolMessage, RuntimeApiMessage, RuntimeApiRequest,
AllMessages, CollatorProtocolMessage,
NetworkBridgeMessage,
},
};
......@@ -39,6 +39,7 @@ use polkadot_node_subsystem_util::{
validator_discovery,
request_validators_ctx,
request_validator_groups_ctx,
request_availability_cores_ctx,
metrics::{self, prometheus},
};
......@@ -152,7 +153,7 @@ where
// This collation is not in the active-leaves set.
if !state.view.contains(&relay_parent) {
warn!(
target: TARGET,
target: LOG_TARGET,
"Distribute collation message parent {:?} is outside of our view",
relay_parent,
);
......@@ -171,7 +172,7 @@ where
Some(core) => core,
None => {
warn!(
target: TARGET,
target: LOG_TARGET,
"Looks like no core is assigned to {:?} at {:?}", id, relay_parent,
);
return Ok(());
......@@ -183,7 +184,7 @@ where
Some(validators) => validators,
None => {
warn!(
target: TARGET,
target: LOG_TARGET,
"There are no validators assigned to {:?} core", our_core,
);
......@@ -224,16 +225,7 @@ async fn determine_core<Context>(
where
Context: SubsystemContext<Message = CollatorProtocolMessage>
{
let (tx, rx) = oneshot::channel();
ctx.send_message(AllMessages::RuntimeApi(
RuntimeApiMessage::Request(
relay_parent,
RuntimeApiRequest::AvailabilityCores(tx),
)
)).await?;
let cores = rx.await??;
let cores = request_availability_cores_ctx(relay_parent, ctx).await?.await??;
for (idx, core) in cores.iter().enumerate() {
if let CoreState::Scheduled(occupied) = core {
......@@ -386,7 +378,7 @@ where
// If the ParaId of a collation requested to be distributed does not match
// the one we expect, we ignore the message.
warn!(
target: TARGET,
target: LOG_TARGET,
"DistributeCollation message for para {:?} while collating on {:?}",
receipt.descriptor.para_id,
id,
......@@ -397,7 +389,7 @@ where
}
None => {
warn!(
target: TARGET,
target: LOG_TARGET,
"DistributeCollation message for para {:?} while not collating on any",
receipt.descriptor.para_id,
);
......@@ -406,19 +398,19 @@ where
}
FetchCollation(_, _, _, _) => {
warn!(
target: TARGET,
target: LOG_TARGET,
"FetchCollation message is not expected on the collator side of the protocol",
);
}
ReportCollator(_) => {
warn!(
target: TARGET,
target: LOG_TARGET,
"ReportCollator message is not expected on the collator side of the protocol",
);
}
NoteGoodCollation(_) => {
warn!(
target: TARGET,
target: LOG_TARGET,
"NoteGoodCollation message is not expected on the collator side of the protocol",
);
}
......@@ -429,7 +421,7 @@ where
event,
).await {
warn!(
target: TARGET,
target: LOG_TARGET,
"Failed to handle incoming network message: {:?}", e,
);
}
......@@ -484,13 +476,13 @@ where
match msg {
Declare(_) => {
warn!(
target: TARGET,
target: LOG_TARGET,
"Declare message is not expected on the collator side of the protocol",
);
}
AdvertiseCollation(_, _) => {
warn!(
target: TARGET,
target: LOG_TARGET,
"AdvertiseCollation message is not expected on the collator side of the protocol",
);
}
......@@ -503,7 +495,7 @@ where
}
} else {
warn!(
target: TARGET,
target: LOG_TARGET,
"Received a RequestCollation for {:?} while collating on {:?}",
para_id, our_para_id,
);
......@@ -511,7 +503,7 @@ where
}
None => {
warn!(
target: TARGET,
target: LOG_TARGET,
"Received a RequestCollation for {:?} while not collating on any para",
para_id,
);
......@@ -520,7 +512,7 @@ where
}
Collation(_, _, _) => {
warn!(
target: TARGET,
target: LOG_TARGET,
"Collation message is not expected on the collator side of the protocol",
);
}
......@@ -655,7 +647,7 @@ where
while let Poll::Ready(Some((validator_id, peer_id))) = futures::poll!(request.next()) {
if let Err(err) = handle_validator_connected(&mut ctx, &mut state, peer_id, validator_id).await {
warn!(
target: TARGET,
target: LOG_TARGET,
"Failed to declare our collator id: {:?}",
err,
);
......@@ -667,7 +659,11 @@ where
while let Poll::Ready(msg) = futures::poll!(ctx.recv()) {
match msg? {
Communication { msg } => process_msg(&mut ctx, &mut state, msg).await?,
Communication { msg } => {
if let Err(e) = process_msg(&mut ctx, &mut state, msg).await {
warn!(target: LOG_TARGET, "Failed to process message: {}", e);
}
},
Signal(ActiveLeaves(_update)) => {}
Signal(BlockFinalized(_)) => {}
Signal(Conclude) => return Ok(()),
......@@ -696,7 +692,7 @@ mod tests {
BlockData, CandidateDescriptor, CollatorPair, ScheduledCore,
ValidatorIndex, GroupRotationInfo, AuthorityDiscoveryId,
};
use polkadot_subsystem::ActiveLeavesUpdate;
use polkadot_subsystem::{ActiveLeavesUpdate, messages::{RuntimeApiMessage, RuntimeApiRequest}};
use polkadot_node_subsystem_util::TimeoutExt;
use polkadot_subsystem_testhelpers as test_helpers;
use polkadot_node_network_protocol::ObservedRole;
......@@ -818,7 +814,7 @@ mod tests {
log::LevelFilter::Trace,
)
.filter(
Some(TARGET),
Some(LOG_TARGET),
log::LevelFilter::Trace,
)
.try_init();
......
......@@ -43,7 +43,7 @@ use polkadot_node_subsystem_util::{
mod collator_side;
mod validator_side;
const TARGET: &'static str = "colp";
const LOG_TARGET: &'static str = "collator_protocol";
const REQUEST_TIMEOUT: Duration = Duration::from_secs(1);
#[derive(Debug, Error)]
......@@ -140,7 +140,7 @@ where
Context: SubsystemContext<Message = CollatorProtocolMessage>,
{
trace!(
target: TARGET,
target: LOG_TARGET,
"Reputation change of {:?} for peer {:?}", rep, peer,
);
......
......@@ -45,7 +45,7 @@ use polkadot_node_subsystem_util::{
metrics::{self, prometheus},
};
use super::{modify_reputation, TARGET, Result};
use super::{modify_reputation, LOG_TARGET, Result};
const COST_UNEXPECTED_MESSAGE: Rep = Rep::new(-10, "An unexpected message");
const COST_REQUEST_TIMED_OUT: Rep = Rep::new(-20, "A collation request has timed out");
......@@ -207,7 +207,7 @@ where
// We do not want this to be fatal because the receiving subsystem
// may have closed the results channel for some reason.
trace!(
target: TARGET,
target: LOG_TARGET,
"Failed to send collation: {:?}", e,
);
}
......@@ -381,7 +381,7 @@ where
{
if !state.view.contains(&relay_parent) {
trace!(
target: TARGET,
target: LOG_TARGET,
"Collation by {} on {} on relay parent {} is no longer in view",
peer_id, para_id, relay_parent,
);
......@@ -390,7 +390,7 @@ where
if state.requested_collations.contains_key(&(relay_parent, para_id.clone(), peer_id.clone())) {
trace!(
target: TARGET,