Unverified commit 0363cc63, authored by Andronik Ordian, committed by GitHub

more resilient subsystems: av-store (#1888)

* utils: remove unused error

* av-store: do not exit early on errors

* av-store: revert logging change on master

* av-store: add a test
Parent: 1364ee69
Pipeline #112981 passed with stages in 21 minutes and 39 seconds
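In short: the subsystem's main loop no longer propagates every error out of `run` (which would tear the whole subsystem down). One iteration is factored into a fallible `run_iteration`, and the outer loop logs each failure at a per-error severity and keeps going. A minimal self-contained sketch of that pattern, with simplified stand-in types rather than the actual av-store ones:

use log::Level;

#[derive(Debug)]
enum Error {
	RuntimeApi(String), // spurious: expected to happen in normal operation
	Other(String),      // anything unexpected
}

impl Error {
	fn severity(&self) -> Level {
		match self {
			Error::RuntimeApi(_) => Level::Debug, // don't spam the log
			Error::Other(_) => Level::Warn,       // worth reporting
		}
	}
}

// One iteration of the event loop; `Ok(true)` means "conclude".
fn run_iteration() -> Result<bool, Error> {
	// ...receive one message and process it, possibly failing...
	Ok(false)
}

fn run() {
	loop {
		match run_iteration() {
			// An error is logged at its own severity, not propagated:
			// the loop (and the subsystem) stays alive.
			Err(e) => log::log!(target: "av-store", e.severity(), "{:?}", e),
			Ok(true) => break,     // graceful shutdown was requested
			Ok(false) => continue, // keep serving requests
		}
	}
}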
@@ -27,7 +27,7 @@
 use std::sync::Arc;
 use std::time::{Duration, SystemTime, SystemTimeError, UNIX_EPOCH};
 use codec::{Encode, Decode};
-use futures::{select, channel::oneshot, future::{self, Either}, Future, FutureExt, TryFutureExt};
+use futures::{select, channel::oneshot, future::{self, Either}, Future, FutureExt};
 use futures_timer::Delay;
 use kvdb_rocksdb::{Database, DatabaseConfig};
 use kvdb::{KeyValueDB, DBTransaction};
@@ -57,34 +57,31 @@ mod columns {
 #[derive(Debug, Error)]
 enum Error {
 	#[error(transparent)]
-	ChainAPI(#[from] ChainApiError),
+	RuntimeApi(#[from] RuntimeApiError),
+	#[error(transparent)]
+	ChainApi(#[from] ChainApiError),
 	#[error(transparent)]
 	Erasure(#[from] erasure::Error),
 	#[error(transparent)]
 	Io(#[from] io::Error),
 	#[error(transparent)]
-	ChainApiChannelIsClosed(#[from] oneshot::Canceled),
+	Oneshot(#[from] oneshot::Canceled),
 	#[error(transparent)]
 	Subsystem(#[from] SubsystemError),
 	#[error(transparent)]
 	Time(#[from] SystemTimeError),
 }
 
-/// Class of errors which we should handle more gracefully.
-/// An occurrence of this error should not bring down the subsystem.
-#[derive(Debug, Error)]
-enum NonFatalError {
-	/// A Runtime API error occurred.
-	#[error(transparent)]
-	RuntimeApi(#[from] RuntimeApiError),
-
-	/// The receiver's end of the channel is closed.
-	#[error(transparent)]
-	Oneshot(#[from] oneshot::Canceled),
-
-	/// Overseer channel's buffer is full.
-	#[error(transparent)]
-	OverseerOutOfCapacity(#[from] SubsystemError),
-}
+impl Error {
+	fn severity(&self) -> log::Level {
+		match self {
+			// don't spam the log with spurious errors
+			Self::RuntimeApi(_) |
+			Self::Oneshot(_) => log::Level::Debug,
+			// it's worth reporting otherwise
+			_ => log::Level::Warn,
+		}
+	}
+}
 
 /// A wrapper type for delays.
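Worth noting for the `severity()` helper above: unlike `log::debug!` or `log::warn!`, the `log::log!` macro (used in the new `run` loop further down) takes the level as an ordinary expression, which is what lets the code choose Debug vs. Warn at runtime. A tiny standalone check of that call shape; `env_logger` is only an assumed example backend for the `log` facade:

fn main() {
	// Any `log` backend works here; env_logger is just one choice.
	env_logger::init();

	let level = log::Level::Debug; // e.g. the result of Error::severity()
	log::log!(target: "av-store", level, "oneshot canceled");
}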
@@ -108,48 +105,48 @@ impl PruningDelay {
 	fn as_duration(&self) -> Option<Duration> {
 		match self {
 			PruningDelay::In(d) => Some(*d),
 			PruningDelay::Indefinite => None,
 		}
 	}
 }
 
 impl From<Duration> for PruningDelay {
 	fn from(d: Duration) -> Self {
 		Self::In(d)
 	}
 }
 
 impl PartialEq for PruningDelay {
 	fn eq(&self, other: &Self) -> bool {
 		match (self, other) {
 			(PruningDelay::In(this), PruningDelay::In(that)) => {this == that},
 			(PruningDelay::Indefinite, PruningDelay::Indefinite) => true,
 			_ => false,
 		}
 	}
 }
 
 impl PartialOrd for PruningDelay {
 	fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
 		match (self, other) {
 			(PruningDelay::In(this), PruningDelay::In(that)) => this.partial_cmp(that),
 			(PruningDelay::In(_), PruningDelay::Indefinite) => Some(Ordering::Less),
 			(PruningDelay::Indefinite, PruningDelay::In(_)) => Some(Ordering::Greater),
 			(PruningDelay::Indefinite, PruningDelay::Indefinite) => Some(Ordering::Equal),
 		}
 	}
 }
 
 impl Ord for PruningDelay {
 	fn cmp(&self, other: &Self) -> Ordering {
 		match (self, other) {
 			(PruningDelay::In(this), PruningDelay::In(that)) => this.cmp(that),
 			(PruningDelay::In(_), PruningDelay::Indefinite) => Ordering::Less,
 			(PruningDelay::Indefinite, PruningDelay::In(_)) => Ordering::Greater,
 			(PruningDelay::Indefinite, PruningDelay::Indefinite) => Ordering::Equal,
 		}
 	}
 }
 
 /// A key for chunk pruning records.
@@ -227,13 +224,13 @@ struct PruningConfig {
 }
 
 impl Default for PruningConfig {
 	fn default() -> Self {
 		Self {
 			keep_stored_block_for: KEEP_STORED_BLOCK_FOR,
 			keep_finalized_block_for: KEEP_FINALIZED_BLOCK_FOR,
 			keep_finalized_chunk_for: KEEP_FINALIZED_CHUNK_FOR,
 		}
 	}
 }
 
 #[derive(Debug, Decode, Encode, Eq, PartialEq)]
@@ -268,9 +265,9 @@ impl Ord for PoVPruningRecord {
 }
 
 impl PartialOrd for PoVPruningRecord {
 	fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
 		Some(self.cmp(other))
 	}
 }
 
 #[derive(Debug, Decode, Encode, Eq)]
@@ -477,56 +474,69 @@ fn get_next_chunk_pruning_time(db: &Arc<dyn KeyValueDB>) -> Option<NextChunkPrun
 }
 
 async fn run<Context>(mut subsystem: AvailabilityStoreSubsystem, mut ctx: Context)
-	-> Result<(), Error>
 where
 	Context: SubsystemContext<Message=AvailabilityStoreMessage>,
 {
 	loop {
-		// Every time the following two methods are called a read from DB is performed.
-		// But given that these are very small values which are essentially a newtype
-		// wrappers around `Duration` (`NextChunkPruning` and `NextPoVPruning`) and also the
-		// fact of the frequent reads itself we assume these to end up cached in the memory
-		// anyway and thus these db reads to be reasonably fast.
-		let pov_pruning_time = subsystem.maybe_prune_povs()?;
-		let chunk_pruning_time = subsystem.maybe_prune_chunks()?;
-
-		let mut pov_pruning_time = pov_pruning_time.fuse();
-		let mut chunk_pruning_time = chunk_pruning_time.fuse();
-
-		select! {
-			incoming = ctx.recv().fuse() => {
-				match incoming {
-					Ok(FromOverseer::Signal(OverseerSignal::Conclude)) => break,
-					Ok(FromOverseer::Signal(OverseerSignal::ActiveLeaves(
-						ActiveLeavesUpdate { activated, .. })
-					)) => {
-						for activated in activated.into_iter() {
-							process_block_activated(&mut ctx, &subsystem.inner, activated).await?;
-						}
-					}
-					Ok(FromOverseer::Signal(OverseerSignal::BlockFinalized(hash))) => {
-						process_block_finalized(&subsystem, &mut ctx, &subsystem.inner, hash).await?;
-					}
-					Ok(FromOverseer::Communication { msg }) => {
-						process_message(&mut subsystem, &mut ctx, msg).await?;
-					}
-					Err(e) => {
-						log::error!("AvailabilityStoreSubsystem err: {:#?}", e);
-						break
-					},
-				}
-			}
-			pov_pruning_time = pov_pruning_time => {
-				subsystem.prune_povs()?;
-			}
-			chunk_pruning_time = chunk_pruning_time => {
-				subsystem.prune_chunks()?;
-			}
-			complete => break,
-		}
-	}
-
-	Ok(())
-}
+		let res = run_iteration(&mut subsystem, &mut ctx).await;
+		match res {
+			Err(e) => {
+				log::log!(target: LOG_TARGET, e.severity(), "{}", e);
+			}
+			Ok(true) => {
+				log::info!(target: LOG_TARGET, "received `Conclude` signal, exiting");
+				break;
+			},
+			Ok(false) => continue,
+		}
+	}
+}
+
+async fn run_iteration<Context>(subsystem: &mut AvailabilityStoreSubsystem, ctx: &mut Context)
+	-> Result<bool, Error>
+where
+	Context: SubsystemContext<Message=AvailabilityStoreMessage>,
+{
+	// Every time the following two methods are called a read from DB is performed.
+	// But given that these are very small values which are essentially a newtype
+	// wrappers around `Duration` (`NextChunkPruning` and `NextPoVPruning`) and also the
+	// fact of the frequent reads itself we assume these to end up cached in the memory
+	// anyway and thus these db reads to be reasonably fast.
+	let pov_pruning_time = subsystem.maybe_prune_povs()?;
+	let chunk_pruning_time = subsystem.maybe_prune_chunks()?;
+
+	let mut pov_pruning_time = pov_pruning_time.fuse();
+	let mut chunk_pruning_time = chunk_pruning_time.fuse();
+
+	select! {
+		incoming = ctx.recv().fuse() => {
+			match incoming? {
+				FromOverseer::Signal(OverseerSignal::Conclude) => return Ok(true),
+				FromOverseer::Signal(OverseerSignal::ActiveLeaves(
+					ActiveLeavesUpdate { activated, .. })
+				) => {
+					for activated in activated.into_iter() {
+						process_block_activated(ctx, &subsystem.inner, activated).await?;
+					}
+				}
+				FromOverseer::Signal(OverseerSignal::BlockFinalized(hash)) => {
+					process_block_finalized(subsystem, ctx, &subsystem.inner, hash).await?;
+				}
+				FromOverseer::Communication { msg } => {
+					process_message(subsystem, ctx, msg).await?;
+				}
+			}
+		}
+		pov_pruning_time = pov_pruning_time => {
+			subsystem.prune_povs()?;
+		}
+		chunk_pruning_time = chunk_pruning_time => {
+			subsystem.prune_chunks()?;
+		}
+		complete => return Ok(true),
+	}
+
+	Ok(false)
+}
 
 /// As soon as certain block is finalized its pruning records and records of all
@@ -647,7 +657,7 @@ where
 async fn request_candidate_events<Context>(
 	ctx: &mut Context,
 	hash: Hash,
-) -> Result<Vec<CandidateEvent>, NonFatalError>
+) -> Result<Vec<CandidateEvent>, Error>
 where
 	Context: SubsystemContext<Message=AvailabilityStoreMessage>
 {
@@ -673,48 +683,44 @@ where
 {
 	use AvailabilityStoreMessage::*;
 
-	fn log_send_error(request: &'static str) {
-		log::debug!(target: LOG_TARGET, "error sending a response to {}", request);
-	}
-
 	match msg {
 		QueryAvailableData(hash, tx) => {
 			tx.send(available_data(&subsystem.inner, &hash).map(|d| d.data))
-				.unwrap_or_else(|_| log_send_error("QueryAvailableData"));
+				.map_err(|_| oneshot::Canceled)?;
 		}
 		QueryDataAvailability(hash, tx) => {
 			tx.send(available_data(&subsystem.inner, &hash).is_some())
-				.unwrap_or_else(|_| log_send_error("QueryDataAvailability"));
+				.map_err(|_| oneshot::Canceled)?;
 		}
 		QueryChunk(hash, id, tx) => {
 			tx.send(get_chunk(subsystem, &hash, id)?)
-				.unwrap_or_else(|_| log_send_error("QueryChunk"));
+				.map_err(|_| oneshot::Canceled)?;
 		}
 		QueryChunkAvailability(hash, id, tx) => {
 			tx.send(get_chunk(subsystem, &hash, id)?.is_some())
-				.unwrap_or_else(|_| log_send_error("QueryChunkAvailability"));
+				.map_err(|_| oneshot::Canceled)?;
 		}
 		StoreChunk { candidate_hash, relay_parent, validator_index, chunk, tx } => {
 			// Current block number is relay_parent block number + 1.
 			let block_number = get_block_number(ctx, relay_parent).await? + 1;
 			match store_chunk(subsystem, &candidate_hash, validator_index, chunk, block_number) {
 				Err(e) => {
-					tx.send(Err(())).unwrap_or_else(|_| log_send_error("StoreChunk (Err)"));
+					tx.send(Err(())).map_err(|_| oneshot::Canceled)?;
 					return Err(e);
 				}
 				Ok(()) => {
-					tx.send(Ok(())).unwrap_or_else(|_| log_send_error("StoreChunk (Ok)"));
+					tx.send(Ok(())).map_err(|_| oneshot::Canceled)?;
 				}
 			}
 		}
 		StoreAvailableData(hash, id, n_validators, av_data, tx) => {
 			match store_available_data(subsystem, &hash, id, n_validators, av_data) {
 				Err(e) => {
-					tx.send(Err(())).unwrap_or_else(|_| log_send_error("StoreAvailableData (Err)"));
+					tx.send(Err(())).map_err(|_| oneshot::Canceled)?;
 					return Err(e);
 				}
 				Ok(()) => {
-					tx.send(Ok(())).unwrap_or_else(|_| log_send_error("StoreAvailableData (Ok)"));
+					tx.send(Ok(())).map_err(|_| oneshot::Canceled)?;
 				}
 			}
 		}
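With `log_send_error` gone, a dropped response channel now surfaces through `?` as `Error::Oneshot(oneshot::Canceled)`, which `severity()` maps to Debug, keeping it low-noise while still flowing through the ordinary error path. A minimal sketch of that `map_err` conversion on a oneshot send, using the `futures` crate's `channel::oneshot` API:

use futures::channel::oneshot;

fn respond(tx: oneshot::Sender<u32>) -> Result<(), oneshot::Canceled> {
	// `Sender::send` returns Err(value) when the receiver is gone;
	// discard the value and report a canceled channel instead.
	tx.send(42).map_err(|_| oneshot::Canceled)
}

fn main() {
	let (tx, rx) = oneshot::channel::<u32>();
	drop(rx); // simulate the requesting side going away
	assert!(respond(tx).is_err()); // the failed send becomes `Canceled`
}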
@@ -995,7 +1001,7 @@ where
 {
 	fn start(self, ctx: Context) -> SpawnedSubsystem {
 		let future = run(self, ctx)
-			.map_err(|e| SubsystemError::with_origin("availability-store", e))
+			.map(|_| Ok(()))
 			.boxed();
 
 		SpawnedSubsystem {
(The remaining hunks are from the av-store subsystem's tests.)
@@ -30,7 +30,9 @@ use polkadot_primitives::v1::{
 	PersistedValidationData, PoV, Id as ParaId,
 };
 use polkadot_node_subsystem_util::TimeoutExt;
-use polkadot_subsystem::ActiveLeavesUpdate;
+use polkadot_subsystem::{
+	ActiveLeavesUpdate, errors::RuntimeApiError,
+};
 use polkadot_node_subsystem_test_helpers as test_helpers;
 
 struct TestHarness {
@@ -167,6 +169,51 @@ async fn overseer_signal(
 		.expect(&format!("{:?} is more than enough for sending signals.", TIMEOUT));
 }
 
+#[test]
+fn runtime_api_error_does_not_stop_the_subsystem() {
+	let store = Arc::new(kvdb_memorydb::create(columns::NUM_COLUMNS));
+
+	test_harness(PruningConfig::default(), store, |test_harness| async move {
+		let TestHarness { mut virtual_overseer } = test_harness;
+		let new_leaf = Hash::repeat_byte(0x01);
+
+		overseer_signal(
+			&mut virtual_overseer,
+			OverseerSignal::ActiveLeaves(ActiveLeavesUpdate {
+				activated: smallvec![new_leaf.clone()],
+				deactivated: smallvec![],
+			}),
+		).await;
+
+		// runtime api call fails
+		assert_matches!(
+			overseer_recv(&mut virtual_overseer).await,
+			AllMessages::RuntimeApi(RuntimeApiMessage::Request(
+				relay_parent,
+				RuntimeApiRequest::CandidateEvents(tx),
+			)) => {
+				assert_eq!(relay_parent, new_leaf);
+
+				tx.send(Err(RuntimeApiError::from("oh no".to_string()))).unwrap();
+			}
+		);
+
+		// but that's fine, we're still alive
+		let (tx, rx) = oneshot::channel();
+		let candidate_hash = Hash::repeat_byte(33);
+		let validator_index = 5;
+
+		let query_chunk = AvailabilityStoreMessage::QueryChunk(
+			candidate_hash,
+			validator_index,
+			tx,
+		);
+
+		overseer_send(&mut virtual_overseer, query_chunk.into()).await;
+
+		assert!(rx.await.unwrap().is_none());
+	});
+}
+
 #[test]
 fn store_chunk_works() {
 	let store = Arc::new(kvdb_memorydb::create(columns::NUM_COLUMNS));