diff --git a/polkadot/node/core/pvf/common/src/error.rs b/polkadot/node/core/pvf/common/src/error.rs index 7db7f9a5945179e16733c6e50b157d36369b61e1..f8faefc24e65aadb2c50d2375e30fc7d579d6d37 100644 --- a/polkadot/node/core/pvf/common/src/error.rs +++ b/polkadot/node/core/pvf/common/src/error.rs @@ -16,7 +16,6 @@ use crate::prepare::{PrepareSuccess, PrepareWorkerSuccess}; use parity_scale_codec::{Decode, Encode}; -use std::fmt; /// Result of PVF preparation from a worker, with checksum of the compiled PVF and stats of the /// preparation if successful. @@ -32,35 +31,43 @@ pub type PrecheckResult = Result<(), PrepareError>; /// An error that occurred during the prepare part of the PVF pipeline. // Codec indexes are intended to stabilize pre-encoded payloads (see `OOM_PAYLOAD`) -#[derive(Debug, Clone, Encode, Decode)] +#[derive(thiserror::Error, Debug, Clone, Encode, Decode)] pub enum PrepareError { /// During the prevalidation stage of preparation an issue was found with the PVF. #[codec(index = 0)] + #[error("prepare: prevalidation error: {0}")] Prevalidation(String), /// Compilation failed for the given PVF. #[codec(index = 1)] + #[error("prepare: preparation error: {0}")] Preparation(String), /// Instantiation of the WASM module instance failed. #[codec(index = 2)] + #[error("prepare: runtime construction: {0}")] RuntimeConstruction(String), /// An unexpected error has occurred in the preparation job. #[codec(index = 3)] + #[error("prepare: job error: {0}")] JobError(String), /// Failed to prepare the PVF due to the time limit. #[codec(index = 4)] + #[error("prepare: timeout")] TimedOut, /// An IO error occurred. This state is reported by either the validation host or by the /// worker. #[codec(index = 5)] + #[error("prepare: io error while receiving response: {0}")] IoErr(String), /// The temporary file for the artifact could not be created at the given cache path. This /// state is reported by the validation host (not by the worker). #[codec(index = 6)] + #[error("prepare: error creating tmp file: {0}")] CreateTmpFile(String), /// The response from the worker is received, but the file cannot be renamed (moved) to the /// final destination location. This state is reported by the validation host (not by the /// worker). #[codec(index = 7)] + #[error("prepare: error renaming tmp file ({src:?} -> {dest:?}): {err}")] RenameTmpFile { err: String, // Unfortunately `PathBuf` doesn't implement `Encode`/`Decode`, so we do a fallible @@ -70,17 +77,21 @@ pub enum PrepareError { }, /// Memory limit reached #[codec(index = 8)] + #[error("prepare: out of memory")] OutOfMemory, /// The response from the worker is received, but the worker cache could not be cleared. The /// worker has to be killed to avoid jobs having access to data from other jobs. This state is /// reported by the validation host (not by the worker). #[codec(index = 9)] + #[error("prepare: error clearing worker cache: {0}")] ClearWorkerDir(String), /// The preparation job process died, due to OOM, a seccomp violation, or some other factor. - JobDied { err: String, job_pid: i32 }, #[codec(index = 10)] + #[error("prepare: prepare job with pid {job_pid} died: {err}")] + JobDied { err: String, job_pid: i32 }, /// Some error occurred when interfacing with the kernel. #[codec(index = 11)] + #[error("prepare: error interfacing with the kernel: {0}")] Kernel(String), } @@ -109,41 +120,23 @@ impl PrepareError { } } -impl fmt::Display for PrepareError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - use PrepareError::*; - match self { - Prevalidation(err) => write!(f, "prevalidation: {}", err), - Preparation(err) => write!(f, "preparation: {}", err), - RuntimeConstruction(err) => write!(f, "runtime construction: {}", err), - JobError(err) => write!(f, "panic: {}", err), - TimedOut => write!(f, "prepare: timeout"), - IoErr(err) => write!(f, "prepare: io error while receiving response: {}", err), - JobDied { err, job_pid } => - write!(f, "prepare: prepare job with pid {job_pid} died: {err}"), - CreateTmpFile(err) => write!(f, "prepare: error creating tmp file: {}", err), - RenameTmpFile { err, src, dest } => - write!(f, "prepare: error renaming tmp file ({:?} -> {:?}): {}", src, dest, err), - OutOfMemory => write!(f, "prepare: out of memory"), - ClearWorkerDir(err) => write!(f, "prepare: error clearing worker cache: {}", err), - Kernel(err) => write!(f, "prepare: error interfacing with the kernel: {}", err), - } - } -} - /// Some internal error occurred. /// /// Should only ever be used for validation errors independent of the candidate and PVF, or for /// errors we ruled out during pre-checking (so preparation errors are fine). -#[derive(Debug, Clone, Encode, Decode)] +#[derive(thiserror::Error, Debug, Clone, Encode, Decode)] pub enum InternalValidationError { /// Some communication error occurred with the host. + #[error("validation: some communication error occurred with the host: {0}")] HostCommunication(String), /// Host could not create a hard link to the artifact path. + #[error("validation: host could not create a hard link to the artifact path: {0}")] CouldNotCreateLink(String), /// Could not find or open compiled artifact file. + #[error("validation: could not find or open compiled artifact file: {0}")] CouldNotOpenFile(String), /// Host could not clear the worker cache after a job. + #[error("validation: host could not clear the worker cache ({path:?}) after a job: {err}")] CouldNotClearWorkerDir { err: String, // Unfortunately `PathBuf` doesn't implement `Encode`/`Decode`, so we do a fallible @@ -151,32 +144,9 @@ pub enum InternalValidationError { path: Option<String>, }, /// Some error occurred when interfacing with the kernel. + #[error("validation: error interfacing with the kernel: {0}")] Kernel(String), - /// Some non-deterministic preparation error occurred. + #[error("validation: prepare: {0}")] NonDeterministicPrepareError(PrepareError), } - -impl fmt::Display for InternalValidationError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - use InternalValidationError::*; - match self { - HostCommunication(err) => - write!(f, "validation: some communication error occurred with the host: {}", err), - CouldNotCreateLink(err) => write!( - f, - "validation: host could not create a hard link to the artifact path: {}", - err - ), - CouldNotOpenFile(err) => - write!(f, "validation: could not find or open compiled artifact file: {}", err), - CouldNotClearWorkerDir { err, path } => write!( - f, - "validation: host could not clear the worker cache ({:?}) after a job: {}", - path, err - ), - Kernel(err) => write!(f, "validation: error interfacing with the kernel: {}", err), - NonDeterministicPrepareError(err) => write!(f, "validation: prepare: {}", err), - } - } -} diff --git a/polkadot/node/core/pvf/execute-worker/src/lib.rs b/polkadot/node/core/pvf/execute-worker/src/lib.rs index cff6e0ac13ab5e380bc62a287b6e59288ef6ae76..6980f913d0d74976abe33592b14b67fc518f70ce 100644 --- a/polkadot/node/core/pvf/execute-worker/src/lib.rs +++ b/polkadot/node/core/pvf/execute-worker/src/lib.rs @@ -485,7 +485,7 @@ fn recv_child_response(received_data: &mut io::BufReader<&[u8]>) -> io::Result<J JobResult::decode(&mut response_bytes.as_slice()).map_err(|e| { io::Error::new( io::ErrorKind::Other, - format!("execute pvf recv_child_response: decode error: {:?}", e), + format!("execute pvf recv_child_response: decode error: {}", e), ) }) } diff --git a/polkadot/node/core/pvf/src/error.rs b/polkadot/node/core/pvf/src/error.rs index 442443f326e9815fe3b9e56813356989748a8d79..80d41d5c64be1a5cec31aee084a0f536ad0380bf 100644 --- a/polkadot/node/core/pvf/src/error.rs +++ b/polkadot/node/core/pvf/src/error.rs @@ -17,7 +17,7 @@ use polkadot_node_core_pvf_common::error::{InternalValidationError, PrepareError}; /// A error raised during validation of the candidate. -#[derive(Debug, Clone)] +#[derive(thiserror::Error, Debug, Clone)] pub enum ValidationError { /// Deterministic preparation issue. In practice, most of the problems should be caught by /// prechecking, so this may be a sign of internal conditions. @@ -27,35 +27,42 @@ pub enum ValidationError { /// pre-checking enabled only valid runtimes should ever get enacted, so we can be /// reasonably sure that this is some local problem on the current node. However, as this /// particular error *seems* to indicate a deterministic error, we raise a warning. + #[error("candidate validation: {0}")] Preparation(PrepareError), /// The error was raised because the candidate is invalid. Should vote against. - Invalid(InvalidCandidate), + #[error("candidate validation: {0}")] + Invalid(#[from] InvalidCandidate), /// Possibly transient issue that may resolve after retries. Should vote against when retries /// fail. - PossiblyInvalid(PossiblyInvalidError), + #[error("candidate validation: {0}")] + PossiblyInvalid(#[from] PossiblyInvalidError), /// Preparation or execution issue caused by an internal condition. Should not vote against. - Internal(InternalValidationError), + #[error("candidate validation: internal: {0}")] + Internal(#[from] InternalValidationError), } /// A description of an error raised during executing a PVF and can be attributed to the combination /// of the candidate [`polkadot_parachain_primitives::primitives::ValidationParams`] and the PVF. -#[derive(Debug, Clone)] +#[derive(thiserror::Error, Debug, Clone)] pub enum InvalidCandidate { /// The candidate is reported to be invalid by the execution worker. The string contains the /// error message. + #[error("invalid: worker reported: {0}")] WorkerReportedInvalid(String), /// PVF execution (compilation is not included) took more time than was allotted. + #[error("invalid: hard timeout")] HardTimeout, } /// Possibly transient issue that may resolve after retries. -#[derive(Debug, Clone)] +#[derive(thiserror::Error, Debug, Clone)] pub enum PossiblyInvalidError { /// The worker process (not the job) has died during validation of a candidate. /// /// It's unlikely that this is caused by malicious code since workers spawn separate job /// processes, and those job processes are sandboxed. But, it is possible. We retry in this /// case, and if the error persists, we assume it's caused by the candidate and vote against. + #[error("possibly invalid: ambiguous worker death")] AmbiguousWorkerDeath, /// The job process (not the worker) has died for one of the following reasons: /// @@ -69,6 +76,7 @@ pub enum PossiblyInvalidError { /// (c) Some other reason, perhaps transient or perhaps caused by malicious code. /// /// We cannot treat this as an internal error because malicious code may have caused this. + #[error("possibly invalid: ambiguous job death: {0}")] AmbiguousJobDeath(String), /// An unexpected error occurred in the job process and we can't be sure whether the candidate /// is really invalid or some internal glitch occurred. Whenever we are unsure, we can never @@ -76,15 +84,10 @@ pub enum PossiblyInvalidError { /// issue was due to the candidate, then all validators would abstain, stalling finality on the /// chain. So we will first retry the candidate, and if the issue persists we are forced to /// vote invalid. + #[error("possibly invalid: job error: {0}")] JobError(String), } -impl From<InternalValidationError> for ValidationError { - fn from(error: InternalValidationError) -> Self { - Self::Internal(error) - } -} - impl From<PrepareError> for ValidationError { fn from(error: PrepareError) -> Self { // Here we need to classify the errors into two errors: deterministic and non-deterministic. diff --git a/polkadot/node/core/pvf/src/execute/queue.rs b/polkadot/node/core/pvf/src/execute/queue.rs index be607fe1c20b06659765776e90b29bf939f4aed7..aa91d11781fc625993484fa64ff8ebca4d65aeb2 100644 --- a/polkadot/node/core/pvf/src/execute/queue.rs +++ b/polkadot/node/core/pvf/src/execute/queue.rs @@ -372,7 +372,7 @@ fn handle_job_finish( ?artifact_id, ?worker, worker_rip = idle_worker.is_none(), - "execution worker concluded, error occurred: {:?}", + "execution worker concluded, error occurred: {}", err ); } else { diff --git a/polkadot/node/core/pvf/src/host.rs b/polkadot/node/core/pvf/src/host.rs index 21e13453edf3df34f35818498b7e2bdcff47759f..2668ce41583d780d08f2aa96e52a3f0a70bc9314 100644 --- a/polkadot/node/core/pvf/src/host.rs +++ b/polkadot/node/core/pvf/src/host.rs @@ -486,7 +486,8 @@ async fn handle_precheck_pvf( /// /// If the prepare job failed previously, we may retry it under certain conditions. /// -/// When preparing for execution, we use a more lenient timeout ([`LENIENT_PREPARATION_TIMEOUT`]) +/// When preparing for execution, we use a more lenient timeout +/// ([`DEFAULT_LENIENT_PREPARATION_TIMEOUT`](polkadot_primitives::executor_params::DEFAULT_LENIENT_PREPARATION_TIMEOUT)) /// than when prechecking. async fn handle_execute_pvf( artifacts: &mut Artifacts, diff --git a/polkadot/node/core/pvf/src/prepare/queue.rs b/polkadot/node/core/pvf/src/prepare/queue.rs index c140a6cafda08ed26aa9556935a591dfa0bd2f0e..c7bfa2f3b21ba94084edd3cb397abd5e67213d08 100644 --- a/polkadot/node/core/pvf/src/prepare/queue.rs +++ b/polkadot/node/core/pvf/src/prepare/queue.rs @@ -45,8 +45,8 @@ pub struct FromQueue { /// Identifier of an artifact. pub(crate) artifact_id: ArtifactId, /// Outcome of the PVF processing. [`Ok`] indicates that compiled artifact - /// is successfully stored on disk. Otherwise, an [error](crate::error::PrepareError) - /// is supplied. + /// is successfully stored on disk. Otherwise, an + /// [error](polkadot_node_core_pvf_common::error::PrepareError) is supplied. pub(crate) result: PrepareResult, } diff --git a/polkadot/node/core/pvf/src/prepare/worker_interface.rs b/polkadot/node/core/pvf/src/prepare/worker_interface.rs index 45e31a5f453ff9974e90d3326408bec41d688d13..d64ee1510cad79a4ed1df808668d203b5817121b 100644 --- a/polkadot/node/core/pvf/src/prepare/worker_interface.rs +++ b/polkadot/node/core/pvf/src/prepare/worker_interface.rs @@ -174,7 +174,7 @@ pub async fn start_work( gum::warn!( target: LOG_TARGET, worker_pid = %pid, - "failed to recv a prepare response: {:?}", + "failed to recv a prepare response: {}", err, ); Outcome::IoErr(err.to_string())