Unverified Commit d8d8e926 authored by asynchronous rob's avatar asynchronous rob Committed by GitHub
Browse files

Reversion Safety tools for overseer and subsystems (#3104)

* guide: reversion safety

* guide: manage reversion safety in subsystems

* add leaf status to ActivatedLeaf

* add an LRU-cache to overseer for staleness detection

* update ActivatedLeaf usages in tests to contain status field

* add variant where missed accidentally

* add some helpers to LeafStatus

* address grumbles
parent b2bbffb8
Pipeline #140094 passed with stages
in 32 minutes and 44 seconds
......@@ -6260,6 +6260,7 @@ dependencies = [
"futures 0.3.14",
"futures-timer 3.0.2",
"kv-log-macro",
"lru",
"polkadot-node-network-protocol",
"polkadot-node-primitives",
"polkadot-node-subsystem",
......
......@@ -33,6 +33,7 @@ use polkadot_node_primitives::{AvailableData, BlockData, PoV};
use polkadot_node_subsystem_util::TimeoutExt;
use polkadot_subsystem::{
ActiveLeavesUpdate, errors::RuntimeApiError, jaeger, messages::AllMessages, ActivatedLeaf,
LeafStatus,
};
use polkadot_node_subsystem_test_helpers as test_helpers;
use sp_keyring::Sr25519Keyring;
......@@ -258,6 +259,7 @@ fn runtime_api_error_does_not_stop_the_subsystem() {
activated: vec![ActivatedLeaf {
hash: new_leaf,
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
}].into(),
deactivated: vec![].into(),
......@@ -994,6 +996,7 @@ async fn import_leaf(
activated: vec![ActivatedLeaf {
hash: new_leaf,
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
}].into(),
deactivated: vec![].into(),
......
......@@ -1333,7 +1333,7 @@ mod tests {
use polkadot_primitives::v1::{GroupRotationInfo, HeadData, PersistedValidationData, ScheduledCore};
use polkadot_subsystem::{
messages::{RuntimeApiRequest, RuntimeApiMessage},
ActiveLeavesUpdate, FromOverseer, OverseerSignal, ActivatedLeaf,
ActiveLeavesUpdate, FromOverseer, OverseerSignal, ActivatedLeaf, LeafStatus,
};
use polkadot_node_primitives::{InvalidCandidate, BlockData};
use polkadot_node_subsystem_test_helpers as test_helpers;
......@@ -1526,6 +1526,7 @@ mod tests {
OverseerSignal::ActiveLeaves(ActiveLeavesUpdate::start_work(ActivatedLeaf {
hash: test_state.relay_parent,
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
})))
).await;
......
......@@ -29,7 +29,8 @@ use sc_network as network;
use sc_network::IfDisconnected;
use sc_network::config as netconfig;
use polkadot_subsystem::{ActiveLeavesUpdate, FromOverseer, OverseerSignal, ActivatedLeaf,
use polkadot_subsystem::{
ActiveLeavesUpdate, FromOverseer, OverseerSignal, ActivatedLeaf, LeafStatus,
messages::{
AllMessages, AvailabilityDistributionMessage, AvailabilityStoreMessage, NetworkBridgeMessage,
RuntimeApiMessage, RuntimeApiRequest,
......@@ -173,6 +174,7 @@ impl TestState {
activated: smallvec![ActivatedLeaf {
hash: new.clone(),
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
}],
deactivated: smallvec![old.clone()],
......
......@@ -33,7 +33,9 @@ use polkadot_node_primitives::{PoV, BlockData};
use polkadot_erasure_coding::{branches, obtain_chunks_v1 as obtain_chunks};
use polkadot_node_subsystem_util::TimeoutExt;
use polkadot_subsystem_testhelpers as test_helpers;
use polkadot_subsystem::{messages::{RuntimeApiMessage, RuntimeApiRequest}, jaeger, ActivatedLeaf};
use polkadot_subsystem::{
messages::{RuntimeApiMessage, RuntimeApiRequest}, jaeger, ActivatedLeaf, LeafStatus,
};
type VirtualOverseer = test_helpers::TestSubsystemContextHandle<AvailabilityRecoveryMessage>;
......@@ -448,6 +450,7 @@ fn availability_is_recovered_from_chunks_if_no_group_provided() {
activated: smallvec![ActivatedLeaf {
hash: test_state.current.clone(),
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
}],
deactivated: smallvec![],
......@@ -529,6 +532,7 @@ fn availability_is_recovered_from_chunks_even_if_backing_group_supplied_if_chunk
activated: smallvec![ActivatedLeaf {
hash: test_state.current.clone(),
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
}],
deactivated: smallvec![],
......@@ -610,6 +614,7 @@ fn bad_merkle_path_leads_to_recovery_error() {
activated: smallvec![ActivatedLeaf {
hash: test_state.current.clone(),
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
}],
deactivated: smallvec![],
......@@ -666,6 +671,7 @@ fn wrong_chunk_index_leads_to_recovery_error() {
activated: smallvec![ActivatedLeaf {
hash: test_state.current.clone(),
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
}],
deactivated: smallvec![],
......@@ -739,6 +745,7 @@ fn invalid_erasure_coding_leads_to_invalid_error() {
activated: smallvec![ActivatedLeaf {
hash: test_state.current.clone(),
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
}],
deactivated: smallvec![],
......@@ -786,6 +793,7 @@ fn fast_path_backing_group_recovers() {
activated: smallvec![ActivatedLeaf {
hash: test_state.current.clone(),
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
}],
deactivated: smallvec![],
......@@ -838,6 +846,7 @@ fn no_answers_in_fast_path_causes_chunk_requests() {
activated: smallvec![ActivatedLeaf {
hash: test_state.current.clone(),
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
}],
deactivated: smallvec![],
......@@ -900,6 +909,7 @@ fn task_canceled_when_receivers_dropped() {
activated: smallvec![ActivatedLeaf {
hash: test_state.current.clone(),
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
}],
deactivated: smallvec![],
......@@ -942,6 +952,7 @@ fn chunks_retry_until_all_nodes_respond() {
activated: smallvec![ActivatedLeaf {
hash: test_state.current.clone(),
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
}],
deactivated: smallvec![],
......@@ -1000,6 +1011,7 @@ fn returns_early_if_we_have_the_data() {
activated: smallvec![ActivatedLeaf {
hash: test_state.current.clone(),
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
}],
deactivated: smallvec![],
......@@ -1037,6 +1049,7 @@ fn does_not_query_local_validator() {
activated: smallvec![ActivatedLeaf {
hash: test_state.current.clone(),
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
}],
deactivated: smallvec![],
......
......@@ -1157,7 +1157,7 @@ mod tests {
use sc_network::{Event as NetworkEvent, IfDisconnected};
use polkadot_subsystem::{jaeger, ActiveLeavesUpdate, FromOverseer, OverseerSignal};
use polkadot_subsystem::{jaeger, ActiveLeavesUpdate, FromOverseer, OverseerSignal, LeafStatus};
use polkadot_subsystem::messages::{
ApprovalDistributionMessage,
BitfieldDistributionMessage,
......@@ -1471,6 +1471,7 @@ mod tests {
ActiveLeavesUpdate::start_work(ActivatedLeaf {
hash: head,
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
})
))
......@@ -1568,6 +1569,7 @@ mod tests {
ActiveLeavesUpdate::start_work(ActivatedLeaf {
hash: hash_a,
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
})
))
......@@ -1652,6 +1654,7 @@ mod tests {
ActiveLeavesUpdate::start_work(ActivatedLeaf {
hash: hash_a,
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
})
))
......@@ -1667,6 +1670,7 @@ mod tests {
ActiveLeavesUpdate::start_work(ActivatedLeaf {
hash: hash_b,
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
})
))
......@@ -1739,6 +1743,7 @@ mod tests {
ActiveLeavesUpdate::start_work(ActivatedLeaf {
hash: hash_a,
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
})
))
......@@ -1956,6 +1961,7 @@ mod tests {
ActiveLeavesUpdate::start_work(ActivatedLeaf {
hash: hash_a,
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
})
))
......@@ -2171,6 +2177,7 @@ mod tests {
ActiveLeavesUpdate::start_work(ActivatedLeaf {
hash: hash_b,
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
})
))
......@@ -2400,6 +2407,7 @@ mod tests {
activated: hashes.enumerate().map(|(i, h)| ActivatedLeaf {
hash: h,
number: i as _,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
}).rev().collect(),
deactivated: Default::default(),
......
......@@ -921,7 +921,7 @@ mod tests {
use polkadot_subsystem::{
jaeger,
messages::{RuntimeApiMessage, RuntimeApiRequest},
ActiveLeavesUpdate, ActivatedLeaf,
ActiveLeavesUpdate, ActivatedLeaf, LeafStatus,
};
use polkadot_subsystem_testhelpers as test_helpers;
......@@ -1159,6 +1159,7 @@ mod tests {
activated: vec![ActivatedLeaf {
hash: test_state.relay_parent,
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
}].into(),
deactivated: [][..].into(),
......
......@@ -18,7 +18,7 @@
use super::*;
use polkadot_node_subsystem::{
jaeger, ActivatedLeaf,
jaeger, ActivatedLeaf, LeafStatus,
messages::{RuntimeApiMessage, RuntimeApiRequest},
};
use polkadot_node_subsystem_test_helpers as test_helpers;
......@@ -73,6 +73,7 @@ async fn overseer_signal_active_leaves(
let leaf = ActivatedLeaf {
hash: leaf,
number: 0xdeadcafe,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
};
overseer
......
......@@ -903,7 +903,7 @@ async fn circulate_statement_and_dependents(
fn statement_message(relay_parent: Hash, statement: SignedFullStatement)
-> protocol_v1::ValidationProtocol
{
{
let msg = if is_statement_large(&statement) {
protocol_v1::StatementDistributionMessage::LargeStatement(
StatementMetadata {
......@@ -1198,7 +1198,7 @@ async fn retrieve_statement_from_message<'a>(
).await {
vacant.insert(new_status);
}
}
}
protocol_v1::StatementDistributionMessage::Statement(_, s) => {
// No fetch in progress, safe to return any statement immediately (we don't bother
// about normal network jitter which might cause `Valid` statements to arrive early
......@@ -1594,7 +1594,7 @@ impl StatementDistribution {
match result {
Ok(true) => break,
Ok(false) => {}
Err(Error(Fault::Fatal(f))) => return Err(f),
Err(Error(Fault::Fatal(f))) => return Err(f),
Err(Error(Fault::Err(error))) =>
tracing::debug!(target: LOG_TARGET, ?error)
}
......@@ -2072,7 +2072,9 @@ mod tests {
use sp_keystore::{CryptoStore, SyncCryptoStorePtr, SyncCryptoStore};
use sc_keystore::LocalKeystore;
use polkadot_node_network_protocol::{view, ObservedRole, request_response::Recipient};
use polkadot_subsystem::{jaeger, ActivatedLeaf, messages::{RuntimeApiMessage, RuntimeApiRequest}};
use polkadot_subsystem::{
jaeger, ActivatedLeaf, messages::{RuntimeApiMessage, RuntimeApiRequest}, LeafStatus,
};
use polkadot_node_network_protocol::request_response::{
Requests,
v1::{
......@@ -2690,6 +2692,7 @@ mod tests {
activated: vec![ActivatedLeaf {
hash: hash_a,
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
}].into(),
deactivated: vec![].into(),
......@@ -2865,6 +2868,7 @@ mod tests {
activated: vec![ActivatedLeaf {
hash: hash_a,
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
}].into(),
deactivated: vec![].into(),
......@@ -3336,6 +3340,7 @@ mod tests {
activated: vec![ActivatedLeaf {
hash: hash_a,
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
}].into(),
deactivated: vec![].into(),
......@@ -3591,6 +3596,7 @@ mod tests {
activated: vec![ActivatedLeaf {
hash: hash_a,
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
}].into(),
deactivated: vec![].into(),
......@@ -3634,7 +3640,7 @@ mod tests {
NetworkBridgeEvent::PeerViewChange(peer_a.clone(), view![hash_a])
)
}).await;
// receive a seconded statement from peer A.
let statement = {
let signing_context = SigningContext {
......
......@@ -16,6 +16,7 @@ polkadot-procmacro-overseer-subsystems-gen = { path = "./subsystems-gen" }
polkadot-primitives = { path = "../../primitives" }
polkadot-subsystem = { package = "polkadot-node-subsystem", path = "../subsystem" }
tracing = "0.1.26"
lru = "0.6"
[dev-dependencies]
sp-core = { git = "https://github.com/paritytech/substrate", branch = "master" }
......
......@@ -74,6 +74,7 @@ use futures::{
Future, FutureExt, StreamExt,
};
use futures_timer::Delay;
use lru::LruCache;
use polkadot_primitives::v1::{Block, BlockId,BlockNumber, Hash, ParachainHost};
use client::{BlockImportNotification, BlockchainEvents, FinalityNotification};
......@@ -91,6 +92,7 @@ use polkadot_subsystem::messages::{
pub use polkadot_subsystem::{
Subsystem, SubsystemContext, SubsystemSender, OverseerSignal, FromOverseer, SubsystemError,
SubsystemResult, SpawnedSubsystem, ActiveLeavesUpdate, ActivatedLeaf, DummySubsystem, jaeger,
LeafStatus,
};
use polkadot_node_subsystem_util::{TimeoutExt, metrics::{self, prometheus}, metered, Metronome};
use polkadot_node_primitives::SpawnNamed;
......@@ -1045,6 +1047,11 @@ struct SubsystemMeterReadouts {
signals: metered::Readout,
}
/// Store 2 days worth of blocks, not accounting for forks,
/// in the LRU cache. Assumes a 6-second block time.
const KNOWN_LEAVES_CACHE_SIZE: usize = 2 * 24 * 3600 / 6;
/// The `Overseer` itself.
pub struct Overseer<S, SupportsParachains> {
/// Handles to all subsystems.
......@@ -1098,6 +1105,9 @@ pub struct Overseer<S, SupportsParachains> {
/// An implementation for checking whether a header supports parachain consensus.
supports_parachains: SupportsParachains,
/// An LRU cache for keeping track of relay-chain heads that have already been seen.
known_leaves: LruCache<Hash, ()>,
/// Various Prometheus metrics.
metrics: Metrics,
}
......@@ -1832,6 +1842,7 @@ where
active_leaves,
metrics,
span_per_active_leaf: Default::default(),
known_leaves: LruCache::new(KNOWN_LEAVES_CACHE_SIZE),
supports_parachains,
};
......@@ -1881,10 +1892,11 @@ where
for (hash, number) in std::mem::take(&mut self.leaves) {
let _ = self.active_leaves.insert(hash, number);
if let Some(span) = self.on_head_activated(&hash, None) {
if let Some((span, status)) = self.on_head_activated(&hash, None) {
update.activated.push(ActivatedLeaf {
hash,
number,
status,
span,
});
}
......@@ -1967,9 +1979,10 @@ where
};
let mut update = match self.on_head_activated(&block.hash, Some(block.parent_hash)) {
Some(span) => ActiveLeavesUpdate::start_work(ActivatedLeaf {
Some((span, status)) => ActiveLeavesUpdate::start_work(ActivatedLeaf {
hash: block.hash,
number: block.number,
status,
span
}),
None => ActiveLeavesUpdate::default(),
......@@ -2110,7 +2123,7 @@ where
/// this returns `None`.
#[tracing::instrument(level = "trace", skip(self), fields(subsystem = LOG_TARGET))]
fn on_head_activated(&mut self, hash: &Hash, parent_hash: Option<Hash>)
-> Option<Arc<jaeger::Span>>
-> Option<(Arc<jaeger::Span>, LeafStatus)>
{
if !self.supports_parachains.head_supports_parachains(hash) {
return None;
......@@ -2132,7 +2145,14 @@ where
let span = Arc::new(span);
self.span_per_active_leaf.insert(*hash, span.clone());
Some(span)
let status = if let Some(_) = self.known_leaves.put(*hash, ()) {
LeafStatus::Stale
} else {
LeafStatus::Fresh
};
Some((span, status))
}
#[tracing::instrument(level = "trace", skip(self), fields(subsystem = LOG_TARGET))]
......@@ -2620,12 +2640,14 @@ mod tests {
OverseerSignal::ActiveLeaves(ActiveLeavesUpdate::start_work(ActivatedLeaf {
hash: first_block_hash,
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
})),
OverseerSignal::ActiveLeaves(ActiveLeavesUpdate {
activated: [ActivatedLeaf {
hash: second_block_hash,
number: 2,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
}].as_ref().into(),
deactivated: [first_block_hash].as_ref().into(),
......@@ -2634,6 +2656,7 @@ mod tests {
activated: [ActivatedLeaf {
hash: third_block_hash,
number: 3,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
}].as_ref().into(),
deactivated: [second_block_hash].as_ref().into(),
......@@ -2728,11 +2751,13 @@ mod tests {
ActivatedLeaf {
hash: first_block_hash,
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
},
ActivatedLeaf {
hash: second_block_hash,
number: 2,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
},
].as_ref().into(),
......@@ -2825,6 +2850,7 @@ mod tests {
ActivatedLeaf {
hash: imported_block.hash,
number: imported_block.number,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled)
}
].as_ref().into(),
......@@ -2859,6 +2885,146 @@ mod tests {
});
}
// Tests that duplicate leaves have an attached 'Stale' status.
#[test]
fn overseer_stale_detection() {
let spawner = sp_core::testing::TaskExecutor::new();
executor::block_on(async move {
let a1_hash = [1; 32].into();
let b1_hash = [2; 32].into();
let a2_hash = [3; 32].into();
let b2_hash = [4; 32].into();
let first_block = BlockInfo {
hash: a1_hash,
parent_hash: [0; 32].into(),
number: 1,
};
let second_block = BlockInfo {
hash: b1_hash,
parent_hash: [0; 32].into(),
number: 1,
};
let third_block = BlockInfo {
hash: a2_hash,
parent_hash: a1_hash,
number: 2,
};
let fourth_block = BlockInfo {
hash: b2_hash,
parent_hash: b1_hash,
number: 2,
};
let (tx_5, mut rx_5) = metered::channel(64);
let (tx_6, mut rx_6) = metered::channel(64);
let all_subsystems = AllSubsystems::<()>::dummy()
.replace_candidate_validation(TestSubsystem5(tx_5))
.replace_candidate_backing(TestSubsystem6(tx_6));
let (overseer, mut handler) = Overseer::new(
vec![first_block.clone()],
all_subsystems,
None,
MockSupportsParachains,
spawner,
).unwrap();
let overseer_fut = overseer.run().fuse();
pin_mut!(overseer_fut);
let mut ss5_results = Vec::new();
let mut ss6_results = Vec::new();
handler.block_imported(second_block.clone()).await;
// import the second block of each chain to deactivate the heads.
handler.block_imported(third_block).await;
handler.block_imported(fourth_block).await;
// import the first blocks again (emulating a revert)
handler.block_imported(first_block).await;
handler.block_imported(second_block).await;
let expected_heartbeats = vec![
OverseerSignal::ActiveLeaves(ActiveLeavesUpdate::start_work(ActivatedLeaf {
hash: a1_hash,
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
})),
OverseerSignal::ActiveLeaves(ActiveLeavesUpdate::start_work(ActivatedLeaf {
hash: b1_hash,
number: 1,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
})),
OverseerSignal::ActiveLeaves(ActiveLeavesUpdate {
activated: [ActivatedLeaf {
hash: a2_hash,
number: 2,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
}].as_ref().into(),
deactivated: [a1_hash].as_ref().into(),
}),
OverseerSignal::ActiveLeaves(ActiveLeavesUpdate {
activated: [ActivatedLeaf {
hash: b2_hash,
number: 2,
status: LeafStatus::Fresh,
span: Arc::new(jaeger::Span::Disabled),
}].as_ref().into(),
deactivated: [b1_hash].as_ref().into(),
}),
OverseerSignal::ActiveLeaves(ActiveLeavesUpdate::start_work(ActivatedLeaf {
hash: a1_hash,
number: 1,
status: LeafStatus::Stale,
span: Arc::new(jaeger::Span::Disabled),
})),
OverseerSignal::ActiveLeaves(ActiveLeavesUpdate::start_work(ActivatedLeaf {
hash: b1_hash,
number: 1,
status: LeafStatus::Stale,
span: Arc::new(jaeger::Span::Disabled),
})),
];
loop {
select! {
res = overseer_fut => {
assert!(res.is_ok());
break;
},
res = rx_5.next() => {
if let Some(res) = res {
ss5_results.push(res);