diff --git a/polkadot/node/core/candidate-validation/src/lib.rs b/polkadot/node/core/candidate-validation/src/lib.rs index 59c4aa562e74241ace7fc610450d291a848dc031..2a3bd3895afc893560e07a0d72ea3ea0f8c0b5d1 100644 --- a/polkadot/node/core/candidate-validation/src/lib.rs +++ b/polkadot/node/core/candidate-validation/src/lib.rs @@ -24,8 +24,8 @@ #![warn(missing_docs)] use polkadot_node_core_pvf::{ - InvalidCandidate as WasmInvalidCandidate, PrepareError, PrepareStats, PvfPrepData, - ValidationError, ValidationHost, + InternalValidationError, InvalidCandidate as WasmInvalidCandidate, PrepareError, PrepareStats, + PvfPrepData, ValidationError, ValidationHost, }; use polkadot_node_primitives::{ BlockData, InvalidCandidate, PoV, ValidationResult, POV_BOMB_LIMIT, VALIDATION_CODE_BOMB_LIMIT, @@ -51,7 +51,11 @@ use parity_scale_codec::Encode; use futures::{channel::oneshot, prelude::*}; -use std::{path::PathBuf, sync::Arc, time::Duration}; +use std::{ + path::PathBuf, + sync::Arc, + time::{Duration, Instant}, +}; use async_trait::async_trait; @@ -63,11 +67,19 @@ mod tests; const LOG_TARGET: &'static str = "parachain::candidate-validation"; -/// The amount of time to wait before retrying after an AmbiguousWorkerDeath validation error. +/// The amount of time to wait before retrying after a retry-able backing validation error. We use a lower value for the +/// backing case, to fit within the lower backing timeout. +#[cfg(not(test))] +const PVF_BACKING_EXECUTION_RETRY_DELAY: Duration = Duration::from_millis(500); +#[cfg(test)] +const PVF_BACKING_EXECUTION_RETRY_DELAY: Duration = Duration::from_millis(200); +/// The amount of time to wait before retrying after a retry-able approval validation error. We use a higher value for +/// the approval case since we have more time, and if we wait longer it is more likely that transient conditions will +/// resolve. #[cfg(not(test))] -const PVF_EXECUTION_RETRY_DELAY: Duration = Duration::from_secs(3); +const PVF_APPROVAL_EXECUTION_RETRY_DELAY: Duration = Duration::from_secs(3); #[cfg(test)] -const PVF_EXECUTION_RETRY_DELAY: Duration = Duration::from_millis(200); +const PVF_APPROVAL_EXECUTION_RETRY_DELAY: Duration = Duration::from_millis(200); // Default PVF timeouts. Must never be changed! Use executor environment parameters in // `session_info` pallet to adjust them. See also `PvfTimeoutKind` docs. @@ -617,6 +629,7 @@ where .validate_candidate_with_retry( raw_validation_code.to_vec(), pvf_exec_timeout(&executor_params, exec_timeout_kind), + exec_timeout_kind, params, executor_params, ) @@ -627,7 +640,15 @@ where } match result { - Err(ValidationError::InternalError(e)) => Err(ValidationFailed(e)), + Err(ValidationError::InternalError(e)) => { + gum::warn!( + target: LOG_TARGET, + ?para_id, + ?e, + "An internal error occurred during validation, will abstain from voting", + ); + Err(ValidationFailed(e.to_string())) + }, Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::HardTimeout)) => Ok(ValidationResult::Invalid(InvalidCandidate::Timeout)), Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::WorkerReportedError(e))) => @@ -636,6 +657,8 @@ where Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError( "ambiguous worker death".to_string(), ))), + Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::Panic(err))) => + Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(err))), Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::PrepareError(e))) => { // In principle if preparation of the `WASM` fails, the current candidate can not be the // reason for that. So we can't say whether it is invalid or not. In addition, with @@ -698,24 +721,44 @@ trait ValidationBackend { &mut self, raw_validation_code: Vec<u8>, exec_timeout: Duration, + exec_timeout_kind: PvfExecTimeoutKind, params: ValidationParams, executor_params: ExecutorParams, ) -> Result<WasmValidationResult, ValidationError> { let prep_timeout = pvf_prep_timeout(&executor_params, PvfPrepTimeoutKind::Lenient); // Construct the PVF a single time, since it is an expensive operation. Cloning it is cheap. let pvf = PvfPrepData::from_code(raw_validation_code, executor_params, prep_timeout); + // We keep track of the total time that has passed and stop retrying if we are taking too long. + let total_time_start = Instant::now(); let mut validation_result = self.validate_candidate(pvf.clone(), exec_timeout, params.encode()).await; + if validation_result.is_ok() { + return validation_result + } + + let retry_delay = match exec_timeout_kind { + PvfExecTimeoutKind::Backing => PVF_BACKING_EXECUTION_RETRY_DELAY, + PvfExecTimeoutKind::Approval => PVF_APPROVAL_EXECUTION_RETRY_DELAY, + }; // Allow limited retries for each kind of error. let mut num_internal_retries_left = 1; let mut num_awd_retries_left = 1; + let mut num_panic_retries_left = 1; loop { + // Stop retrying if we exceeded the timeout. + if total_time_start.elapsed() + retry_delay > exec_timeout { + break + } + match validation_result { Err(ValidationError::InvalidCandidate( WasmInvalidCandidate::AmbiguousWorkerDeath, )) if num_awd_retries_left > 0 => num_awd_retries_left -= 1, + Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::Panic(_))) + if num_panic_retries_left > 0 => + num_panic_retries_left -= 1, Err(ValidationError::InternalError(_)) if num_internal_retries_left > 0 => num_internal_retries_left -= 1, _ => break, @@ -725,11 +768,14 @@ trait ValidationBackend { // that the conditions that caused this error may have resolved on their own. { // Wait a brief delay before retrying. - futures_timer::Delay::new(PVF_EXECUTION_RETRY_DELAY).await; + futures_timer::Delay::new(retry_delay).await; + + let new_timeout = exec_timeout.saturating_sub(total_time_start.elapsed()); gum::warn!( target: LOG_TARGET, ?pvf, + ?new_timeout, "Re-trying failed candidate validation due to possible transient error: {:?}", validation_result ); @@ -737,7 +783,7 @@ trait ValidationBackend { // Encode the params again when re-trying. We expect the retry case to be relatively // rare, and we want to avoid unconditionally cloning data. validation_result = - self.validate_candidate(pvf.clone(), exec_timeout, params.encode()).await; + self.validate_candidate(pvf.clone(), new_timeout, params.encode()).await; } } @@ -760,14 +806,18 @@ impl ValidationBackend for ValidationHost { let (tx, rx) = oneshot::channel(); if let Err(err) = self.execute_pvf(pvf, exec_timeout, encoded_params, priority, tx).await { - return Err(ValidationError::InternalError(format!( - "cannot send pvf to the validation host: {:?}", + return Err(InternalValidationError::HostCommunication(format!( + "cannot send pvf to the validation host, it might have shut down: {:?}", err - ))) + )) + .into()) } - rx.await - .map_err(|_| ValidationError::InternalError("validation was cancelled".into()))? + rx.await.map_err(|_| { + ValidationError::from(InternalValidationError::HostCommunication( + "validation was cancelled".into(), + )) + })? } async fn precheck_pvf(&mut self, pvf: PvfPrepData) -> Result<PrepareStats, PrepareError> { diff --git a/polkadot/node/core/candidate-validation/src/tests.rs b/polkadot/node/core/candidate-validation/src/tests.rs index 5d1cc75b74375865014d998a665ff6749c4d4898..2dcead4db466510001f732e61ab542c78b8829e5 100644 --- a/polkadot/node/core/candidate-validation/src/tests.rs +++ b/polkadot/node/core/candidate-validation/src/tests.rs @@ -540,6 +540,7 @@ fn candidate_validation_bad_return_is_invalid() { assert_matches!(v, Ok(ValidationResult::Invalid(InvalidCandidate::Timeout))); } +// Test that we vote valid if we get `AmbiguousWorkerDeath`, retry, and then succeed. #[test] fn candidate_validation_one_ambiguous_error_is_valid() { let validation_data = PersistedValidationData { max_pov_size: 1024, ..Default::default() }; @@ -710,11 +711,11 @@ fn candidate_validation_retry_internal_errors() { validate_candidate_exhaustive( ctx.sender(), MockValidateCandidateBackend::with_hardcoded_result_list(vec![ - Err(ValidationError::InternalError("foo".into())), + Err(InternalValidationError::HostCommunication("foo".into()).into()), // Throw an AWD error, we should still retry again. Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)), // Throw another internal error. - Err(ValidationError::InternalError("bar".into())), + Err(InternalValidationError::HostCommunication("bar".into()).into()), ]), validation_data, validation_code, @@ -725,7 +726,63 @@ fn candidate_validation_retry_internal_errors() { ) }); - assert_matches!(v, Err(ValidationFailed(s)) if s == "bar".to_string()); + assert_matches!(v, Err(ValidationFailed(s)) if s.contains("bar")); +} + +// Test that we retry on panic errors. +#[test] +fn candidate_validation_retry_panic_errors() { + let validation_data = PersistedValidationData { max_pov_size: 1024, ..Default::default() }; + + let pov = PoV { block_data: BlockData(vec![1; 32]) }; + let validation_code = ValidationCode(vec![2; 16]); + + let descriptor = make_valid_candidate_descriptor( + ParaId::from(1_u32), + dummy_hash(), + validation_data.hash(), + pov.hash(), + validation_code.hash(), + dummy_hash(), + dummy_hash(), + Sr25519Keyring::Alice, + ); + + let check = perform_basic_checks( + &descriptor, + validation_data.max_pov_size, + &pov, + &validation_code.hash(), + ); + assert!(check.is_ok()); + + let candidate_receipt = CandidateReceipt { descriptor, commitments_hash: Hash::zero() }; + + let pool = TaskExecutor::new(); + let (mut ctx, ctx_handle) = + test_helpers::make_subsystem_context::<AllMessages, _>(pool.clone()); + let metrics = Metrics::default(); + + let v = test_with_executor_params(ctx_handle, || { + validate_candidate_exhaustive( + ctx.sender(), + MockValidateCandidateBackend::with_hardcoded_result_list(vec![ + Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::Panic("foo".into()))), + // Throw an AWD error, we should still retry again. + Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)), + // Throw another panic error. + Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::Panic("bar".into()))), + ]), + validation_data, + validation_code, + candidate_receipt, + Arc::new(pov), + PvfExecTimeoutKind::Backing, + &metrics, + ) + }); + + assert_matches!(v, Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(s))) if s == "bar".to_string()); } #[test] diff --git a/polkadot/node/core/pvf/src/error.rs b/polkadot/node/core/pvf/src/error.rs index 21f23d515fdd7d12ec4bd077e86a9afd4f0ffb3f..33f3f00810f20ab3c8e9984490b5c50d3f3405bf 100644 --- a/polkadot/node/core/pvf/src/error.rs +++ b/polkadot/node/core/pvf/src/error.rs @@ -78,9 +78,11 @@ impl fmt::Display for PrepareError { #[derive(Debug, Clone)] pub enum ValidationError { /// The error was raised because the candidate is invalid. + /// + /// Whenever we are unsure if the error was due to the candidate or not, we must vote invalid. InvalidCandidate(InvalidCandidate), - /// This error is raised due to inability to serve the request. - InternalError(String), + /// Some internal error occurred. + InternalError(InternalValidationError), } /// A description of an error raised during executing a PVF and can be attributed to the combination @@ -103,7 +105,7 @@ pub enum InvalidCandidate { /// an `rlimit` (if set) or, again, invited OOM killer. Another possibility is a bug in /// wasmtime allowed the PVF to gain control over the execution worker. /// - /// We attribute such an event to an invalid candidate in either case. + /// We attribute such an event to an *invalid candidate* in either case. /// /// The rationale for this is that a glitch may lead to unfair rejecting candidate by a single /// validator. If the glitch is somewhat more persistent the validator will reject all candidate @@ -113,6 +115,48 @@ pub enum InvalidCandidate { AmbiguousWorkerDeath, /// PVF execution (compilation is not included) took more time than was allotted. HardTimeout, + /// A panic occurred and we can't be sure whether the candidate is really invalid or some internal glitch occurred. + /// Whenever we are unsure, we can never treat an error as internal as we would abstain from voting. This is bad + /// because if the issue was due to the candidate, then all validators would abstain, stalling finality on the + /// chain. So we will first retry the candidate, and if the issue persists we are forced to vote invalid. + Panic(String), +} + +/// Some internal error occurred. +/// +/// Should only ever be used for validation errors independent of the candidate and PVF, or for errors we ruled out +/// during pre-checking (so preparation errors are fine). +#[derive(Debug, Clone, Encode, Decode)] +pub enum InternalValidationError { + /// Some communication error occurred with the host. + HostCommunication(String), + /// Could not find or open compiled artifact file. + CouldNotOpenFile(String), + /// An error occurred in the CPU time monitor thread. Should be totally unrelated to validation. + CpuTimeMonitorThread(String), + /// Some non-deterministic preparation error occurred. + NonDeterministicPrepareError(PrepareError), +} + +impl fmt::Display for InternalValidationError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use InternalValidationError::*; + match self { + HostCommunication(err) => + write!(f, "validation: some communication error occurred with the host: {}", err), + CouldNotOpenFile(err) => + write!(f, "validation: could not find or open compiled artifact file: {}", err), + CpuTimeMonitorThread(err) => + write!(f, "validation: an error occurred in the CPU time monitor thread: {}", err), + NonDeterministicPrepareError(err) => write!(f, "validation: prepare: {}", err), + } + } +} + +impl From<InternalValidationError> for ValidationError { + fn from(error: InternalValidationError) -> Self { + Self::InternalError(error) + } } impl From<PrepareError> for ValidationError { @@ -120,9 +164,9 @@ impl From<PrepareError> for ValidationError { // Here we need to classify the errors into two errors: deterministic and non-deterministic. // See [`PrepareError::is_deterministic`]. if error.is_deterministic() { - ValidationError::InvalidCandidate(InvalidCandidate::PrepareError(error.to_string())) + Self::InvalidCandidate(InvalidCandidate::PrepareError(error.to_string())) } else { - ValidationError::InternalError(error.to_string()) + Self::InternalError(InternalValidationError::NonDeterministicPrepareError(error)) } } } diff --git a/polkadot/node/core/pvf/src/execute/queue.rs b/polkadot/node/core/pvf/src/execute/queue.rs index 5b3e21cee079745a640c217024992ed4044dd669..61cebc5e2c4618af7ac33a33af32d39176caa591 100644 --- a/polkadot/node/core/pvf/src/execute/queue.rs +++ b/polkadot/node/core/pvf/src/execute/queue.rs @@ -334,15 +334,17 @@ fn handle_job_finish( Err(ValidationError::InvalidCandidate(InvalidCandidate::WorkerReportedError(err))), None, ), - Outcome::InternalError { err, idle_worker } => - (Some(idle_worker), Err(ValidationError::InternalError(err)), None), + Outcome::InternalError { err } => (None, Err(ValidationError::InternalError(err)), None), Outcome::HardTimeout => (None, Err(ValidationError::InvalidCandidate(InvalidCandidate::HardTimeout)), None), + // "Maybe invalid" errors (will retry). Outcome::IoErr => ( None, Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousWorkerDeath)), None, ), + Outcome::Panic { err } => + (None, Err(ValidationError::InvalidCandidate(InvalidCandidate::Panic(err))), None), }; queue.metrics.execute_finished(); @@ -356,7 +358,7 @@ fn handle_job_finish( err ); } else { - gum::debug!( + gum::trace!( target: LOG_TARGET, ?artifact_id, ?worker, diff --git a/polkadot/node/core/pvf/src/execute/worker_intf.rs b/polkadot/node/core/pvf/src/execute/worker_intf.rs index bc467cf90de6fcfe469a1fced9e8b9b22723fab9..4c26aeb0260a555b169365a05fec7127acdcfade 100644 --- a/polkadot/node/core/pvf/src/execute/worker_intf.rs +++ b/polkadot/node/core/pvf/src/execute/worker_intf.rs @@ -18,6 +18,7 @@ use crate::{ artifacts::ArtifactPathId, + error::InternalValidationError, worker_common::{ framed_recv, framed_send, path_to_bytes, spawn_with_program_path, IdleWorker, SpawnErr, WorkerHandle, JOB_TIMEOUT_WALL_CLOCK_FACTOR, @@ -64,6 +65,8 @@ pub async fn spawn( } /// Outcome of PVF execution. +/// +/// If the idle worker token is not returned, it means the worker must be terminated. pub enum Outcome { /// PVF execution completed successfully and the result is returned. The worker is ready for /// another job. @@ -73,18 +76,23 @@ pub enum Outcome { InvalidCandidate { err: String, idle_worker: IdleWorker }, /// An internal error happened during the validation. Such an error is most likely related to /// some transient glitch. - InternalError { err: String, idle_worker: IdleWorker }, + /// + /// Should only ever be used for errors independent of the candidate and PVF. Therefore it may + /// be a problem with the worker, so we terminate it. + InternalError { err: InternalValidationError }, /// The execution time exceeded the hard limit. The worker is terminated. HardTimeout, /// An I/O error happened during communication with the worker. This may mean that the worker /// process already died. The token is not returned in any case. IoErr, + /// An unexpected panic has occurred in the execution worker. + Panic { err: String }, } /// Given the idle token of a worker and parameters of work, communicates with the worker and /// returns the outcome. /// -/// NOTE: Returning the `HardTimeout` or `IoErr` errors will trigger the child process being killed. +/// NOTE: Not returning the idle worker token in `Outcome` will trigger the child process being killed. pub async fn start_work( worker: IdleWorker, artifact: ArtifactPathId, @@ -171,8 +179,8 @@ pub async fn start_work( Response::InvalidCandidate(err) => Outcome::InvalidCandidate { err, idle_worker: IdleWorker { stream, pid } }, Response::TimedOut => Outcome::HardTimeout, - Response::InternalError(err) => - Outcome::InternalError { err, idle_worker: IdleWorker { stream, pid } }, + Response::Panic(err) => Outcome::Panic { err }, + Response::InternalError(err) => Outcome::InternalError { err }, } } @@ -223,8 +231,10 @@ pub enum Response { InvalidCandidate(String), /// The job timed out. TimedOut, - /// Some internal error occurred. Should only be used for errors independent of the candidate. - InternalError(String), + /// An unexpected panic has occurred in the execution worker. + Panic(String), + /// Some internal error occurred. + InternalError(InternalValidationError), } impl Response { @@ -236,12 +246,4 @@ impl Response { Self::InvalidCandidate(format!("{}: {}", ctx, msg)) } } - /// Creates an internal response from a context `ctx` and a message `msg` (which can be empty). - pub fn format_internal(ctx: &'static str, msg: &str) -> Self { - if msg.is_empty() { - Self::InternalError(ctx.to_string()) - } else { - Self::InternalError(format!("{}: {}", ctx, msg)) - } - } } diff --git a/polkadot/node/core/pvf/src/lib.rs b/polkadot/node/core/pvf/src/lib.rs index cdaee334140216e578dbb796b70543049bf28aa8..9b302150fd361d93b1c9b892c9835482496b5a7a 100644 --- a/polkadot/node/core/pvf/src/lib.rs +++ b/polkadot/node/core/pvf/src/lib.rs @@ -99,7 +99,9 @@ mod pvf; mod worker_common; pub use artifacts::CompiledArtifact; -pub use error::{InvalidCandidate, PrepareError, PrepareResult, ValidationError}; +pub use error::{ + InternalValidationError, InvalidCandidate, PrepareError, PrepareResult, ValidationError, +}; pub use execute::{ExecuteHandshake, ExecuteResponse}; #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] pub use prepare::MemoryAllocationStats; diff --git a/polkadot/node/core/pvf/worker/src/execute.rs b/polkadot/node/core/pvf/worker/src/execute.rs index 78f1f700ad0311c8c8b563dc85c71afafcb36f1e..997613fe7bc9417b3669634a4a3e8974fcb0c645 100644 --- a/polkadot/node/core/pvf/worker/src/execute.rs +++ b/polkadot/node/core/pvf/worker/src/execute.rs @@ -27,6 +27,7 @@ use cpu_time::ProcessTime; use parity_scale_codec::{Decode, Encode}; use polkadot_node_core_pvf::{ framed_recv, framed_send, ExecuteHandshake as Handshake, ExecuteResponse as Response, + InternalValidationError, }; use polkadot_parachain::primitives::ValidationResult; use std::{ @@ -127,13 +128,9 @@ pub fn worker_entrypoint(socket_path: &str, node_version: Option<&str>) { let response = match outcome { WaitOutcome::Finished => { let _ = cpu_time_monitor_tx.send(()); - execute_thread.join().unwrap_or_else(|e| { - // TODO: Use `Panic` error once that is implemented. - Response::format_internal( - "execute thread error", - &stringify_panic_payload(e), - ) - }) + execute_thread + .join() + .unwrap_or_else(|e| Response::Panic(stringify_panic_payload(e))) }, // If the CPU thread is not selected, we signal it to end, the join handle is // dropped and the thread will finish in the background. @@ -150,16 +147,14 @@ pub fn worker_entrypoint(socket_path: &str, node_version: Option<&str>) { ); Response::TimedOut }, - Ok(None) => Response::format_internal( - "cpu time monitor thread error", - "error communicating over closed channel".into(), - ), - // We can use an internal error here because errors in this thread are - // independent of the candidate. - Err(e) => Response::format_internal( - "cpu time monitor thread error", - &stringify_panic_payload(e), - ), + Ok(None) => + Response::InternalError(InternalValidationError::CpuTimeMonitorThread( + "error communicating over finished channel".into(), + )), + Err(e) => + Response::InternalError(InternalValidationError::CpuTimeMonitorThread( + stringify_panic_payload(e), + )), } }, WaitOutcome::Pending => @@ -181,7 +176,7 @@ fn validate_using_artifact( // TODO: Re-evaluate after <https://github.com/paritytech/substrate/issues/13860>. let file_metadata = std::fs::metadata(artifact_path); if let Err(err) = file_metadata { - return Response::format_internal("execute: could not find or open file", &err.to_string()) + return Response::InternalError(InternalValidationError::CouldNotOpenFile(err.to_string())) } let descriptor_bytes = match unsafe { diff --git a/polkadot/node/core/pvf/worker/src/prepare.rs b/polkadot/node/core/pvf/worker/src/prepare.rs index 36b05318c889f0554b3fb3aa77482f5659e2db68..fe9c1a85545a78f046c1e0d1387aba9c8e80e7c1 100644 --- a/polkadot/node/core/pvf/worker/src/prepare.rs +++ b/polkadot/node/core/pvf/worker/src/prepare.rs @@ -205,7 +205,7 @@ pub fn worker_entrypoint(socket_path: &str, node_version: Option<&str>) { Ok(None) => Err(PrepareError::IoErr( "error communicating over closed channel".into(), )), - // Errors in this thread are independent of the candidate. + // Errors in this thread are independent of the PVF. Err(err) => Err(PrepareError::IoErr(stringify_panic_payload(err))), } }, diff --git a/polkadot/roadmap/implementers-guide/src/node/utility/candidate-validation.md b/polkadot/roadmap/implementers-guide/src/node/utility/candidate-validation.md index bc1d11d8f6644d62653eb4c3d3913be0876f4026..a238ff511bc554f44a1198cfdd116336fea5420a 100644 --- a/polkadot/roadmap/implementers-guide/src/node/utility/candidate-validation.md +++ b/polkadot/roadmap/implementers-guide/src/node/utility/candidate-validation.md @@ -72,7 +72,7 @@ hopefully resolve. We use a more brief delay here (1 second as opposed to 15 minutes for preparation (see above)), because a successful execution must happen in a short amount of time. -We currently know of at least two specific cases that will lead to a retried +We currently know of the following specific cases that will lead to a retried execution request: 1. **OOM:** The host might have been temporarily low on memory due to other @@ -80,7 +80,9 @@ execution request: voting against the candidate (and possibly a dispute) if the retry is still not successful. 2. **Artifact missing:** The prepared artifact might have been deleted due to - operator error or some bug in the system. We will re-create it on retry. + operator error or some bug in the system. +3. **Panic:** The worker thread panicked for some indeterminate reason, which + may or may not be independent of the candidate or PVF. #### Preparation timeouts @@ -103,4 +105,25 @@ of a process is less variable under different system conditions. When the overall system is under heavy load, the wall clock time of a job is affected more than the CPU time. +#### Internal errors + +In general, for errors not raising a dispute we have to be very careful. This is +only sound, if we either: + +1. Ruled out that error in pre-checking. If something is not checked in + pre-checking, even if independent of the candidate and PVF, we must raise a + dispute. +2. We are 100% confident that it is a hardware/local issue: Like corrupted file, + etc. + +Reasoning: Otherwise it would be possible to register a PVF where candidates can +not be checked, but we don't get a dispute - so nobody gets punished. Second, we +end up with a finality stall that is not going to resolve! + +There are some error conditions where we can't be sure whether the candidate is +really invalid or some internal glitch occurred, e.g. panics. Whenever we are +unsure, we can never treat an error as internal as we would abstain from voting. +So we will first retry the candidate, and if the issue persists we are forced to +vote invalid. + [CVM]: ../../types/overseer-protocol.md#validationrequesttype