From 172f355ca48fac32e20601b1a43f852212c7c4d7 Mon Sep 17 00:00:00 2001
From: Sergei Shulepov <sergei@parity.io>
Date: Mon, 29 Nov 2021 15:42:43 +0100
Subject: [PATCH] Treat non-deterministic prep errors as internal errors
 (#4364)

Closes https://github.com/paritytech/polkadot/issues/4293

This PR changes how we treat a certain subset of PVF preparation
errors. Specifically, now only the deterministic errors are treated as
invalid candidates. That is, the errors that are easily
attributable to either the PVF contents or the wasmtime code, but
not e.g. I/O errors that could be triggered by the OS (insufficient
memory, disk failure, too much load, etc.). The latter are treated as
internal errors and thus do not trigger disputes.
---
 .../node/core/candidate-validation/src/lib.rs |  2 +
 polkadot/node/core/pvf/src/error.rs           | 42 ++++++++++++++-----
 polkadot/node/core/pvf/src/host.rs            |  2 +-
 3 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/polkadot/node/core/candidate-validation/src/lib.rs b/polkadot/node/core/candidate-validation/src/lib.rs
index 0b71c77df6b..406a37cb088 100644
--- a/polkadot/node/core/candidate-validation/src/lib.rs
+++ b/polkadot/node/core/candidate-validation/src/lib.rs
@@ -459,6 +459,8 @@ async fn validate_candidate_exhaustive(
 			Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(
 				"ambiguous worker death".to_string(),
 			))),
+		Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::PrepareError(e))) =>
+			Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(e))),
 
 		Ok(res) =>
 			if res.head_data.hash() != descriptor.para_head {
diff --git a/polkadot/node/core/pvf/src/error.rs b/polkadot/node/core/pvf/src/error.rs
index 977f24b1482..6b52a3f1369 100644
--- a/polkadot/node/core/pvf/src/error.rs
+++ b/polkadot/node/core/pvf/src/error.rs
@@ -48,9 +48,9 @@ pub enum ValidationError {
 /// of the candidate [`polkadot_parachain::primitives::ValidationParams`] and the PVF.
 #[derive(Debug, Clone)]
 pub enum InvalidCandidate {
-	/// The failure is reported by the worker. The string contains the error message.
-	///
-	/// This also includes the errors reported by the preparation pipeline.
+	/// PVF preparation ended up with a deterministic error.
+	PrepareError(String),
+	/// The failure is reported by the execution worker. The string contains the error message.
 	WorkerReportedError(String),
 	/// The worker has died during validation of a candidate. That may fall in one of the following
 	/// categories, which we cannot distinguish programmatically:
@@ -78,13 +78,33 @@ pub enum InvalidCandidate {
 
 impl From<PrepareError> for ValidationError {
 	fn from(error: PrepareError) -> Self {
-		let error_str = match error {
-			PrepareError::Prevalidation(err) => format!("prevalidation: {}", err),
-			PrepareError::Preparation(err) => format!("preparation: {}", err),
-			PrepareError::Panic(err) => format!("panic: {}", err),
-			PrepareError::TimedOut => "preparation timeout".to_owned(),
-			PrepareError::DidNotMakeIt => "communication error".to_owned(),
-		};
-		ValidationError::InvalidCandidate(InvalidCandidate::WorkerReportedError(error_str))
+		// Here we need to classify the errors into two kinds: deterministic and non-deterministic.
+		//
+		// Non-deterministic errors can happen spuriously. Typically, they occur due to resource
+		// starvation, e.g. under heavy load or memory pressure. Those errors are typically transient
+		// but may persist e.g. if the node is run by an overwhelmingly underpowered machine.
+		//
+		// Deterministic errors should trigger reliably. Those errors depend on the PVF itself and
+		// the sc-executor/wasmtime logic.
+		//
+		// For now, at least until the PVF pre-checking lands, the deterministic errors will be
+		// treated as `InvalidCandidate`. Should those occur they could potentially trigger disputes.
+		//
+		// All non-deterministic errors are qualified as `InternalError`s and will not trigger
+		// disputes.
+		match error {
+			PrepareError::Prevalidation(err) => ValidationError::InvalidCandidate(
+				InvalidCandidate::PrepareError(format!("prevalidation: {}", err)),
+			),
+			PrepareError::Preparation(err) => ValidationError::InvalidCandidate(
+				InvalidCandidate::PrepareError(format!("preparation: {}", err)),
+			),
+			PrepareError::Panic(err) => ValidationError::InvalidCandidate(
+				InvalidCandidate::PrepareError(format!("panic: {}", err)),
+			),
+			PrepareError::TimedOut => ValidationError::InternalError("prepare: timeout".to_owned()),
+			PrepareError::DidNotMakeIt =>
+				ValidationError::InternalError("prepare: did not make it".to_owned()),
+		}
 	}
 }
diff --git a/polkadot/node/core/pvf/src/host.rs b/polkadot/node/core/pvf/src/host.rs
index dccb52781a0..809d07164ba 100644
--- a/polkadot/node/core/pvf/src/host.rs
+++ b/polkadot/node/core/pvf/src/host.rs
@@ -1156,7 +1156,7 @@ mod tests {
 		assert_matches!(result_rx.now_or_never().unwrap().unwrap(), Err(PrepareError::TimedOut));
 		assert_matches!(
 			result_rx_execute.now_or_never().unwrap().unwrap(),
-			Err(ValidationError::InvalidCandidate(InvalidCandidate::WorkerReportedError(_)))
+			Err(ValidationError::InternalError(_))
 		);
 
 		// Reversed case: first send multiple precheck requests, then ask for an execution.
-- 
GitLab