From 408af9b32d95acbbac5e18bee66fd1b74230a699 Mon Sep 17 00:00:00 2001
From: Marcin S <marcin@realemail.net>
Date: Wed, 22 Nov 2023 15:45:52 +0100
Subject: [PATCH] PVF: Fix unshare "no such file or directory" error (#2426)

---
 .gitlab/pipeline/test.yml                     |   2 +-
 Cargo.lock                                    |   1 +
 polkadot/node/core/pvf/Cargo.toml             |   1 +
 polkadot/node/core/pvf/common/src/lib.rs      |   2 +-
 polkadot/node/core/pvf/src/artifacts.rs       |   5 +-
 .../node/core/pvf/src/execute/worker_intf.rs  |   4 +-
 polkadot/node/core/pvf/src/host.rs            |   9 +-
 .../node/core/pvf/src/prepare/worker_intf.rs  |   4 +-
 polkadot/node/core/pvf/src/security.rs        |  19 ++--
 polkadot/node/core/pvf/src/worker_intf.rs     | 105 ++++++++----------
 polkadot/node/core/pvf/tests/it/main.rs       |  32 ++++++
 .../functional/0001-parachains-pvf.zndsl      |   4 +-
 12 files changed, 105 insertions(+), 83 deletions(-)

diff --git a/.gitlab/pipeline/test.yml b/.gitlab/pipeline/test.yml
index 4ed3ec19c48..79ef9070dba 100644
--- a/.gitlab/pipeline/test.yml
+++ b/.gitlab/pipeline/test.yml
@@ -29,7 +29,7 @@ test-linux-stable:
         --locked \
         --release \
         --no-fail-fast \
-        --features try-runtime,experimental \
+        --features try-runtime,experimental,ci-only-tests \
         --partition count:${CI_NODE_INDEX}/${CI_NODE_TOTAL}
     # Upload tests results to Elasticsearch
     - echo "Upload test results to Elasticsearch"
diff --git a/Cargo.lock b/Cargo.lock
index 903e9463cf6..bfe24e528eb 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -12591,6 +12591,7 @@ dependencies = [
  "rand 0.8.5",
  "rococo-runtime",
  "rusty-fork",
+ "sc-sysinfo",
  "slotmap",
  "sp-core",
  "sp-maybe-compressed-blob",
diff --git a/polkadot/node/core/pvf/Cargo.toml b/polkadot/node/core/pvf/Cargo.toml
index 27da484fe4f..05509a5f853 100644
--- a/polkadot/node/core/pvf/Cargo.toml
+++ b/polkadot/node/core/pvf/Cargo.toml
@@ -54,6 +54,7 @@ halt = { package = "test-parachain-halt", path = "../../../parachain/test-parach
 [target.'cfg(target_os = "linux")'.dev-dependencies]
 procfs = "0.16.0"
 rusty-fork = "0.3.0"
+sc-sysinfo = { path = "../../../../substrate/client/sysinfo" }
 
 [[bench]]
 name = "host_prepare_rococo_runtime"
diff --git a/polkadot/node/core/pvf/common/src/lib.rs b/polkadot/node/core/pvf/common/src/lib.rs
index 278fa3fe821..c041c60aaf9 100644
--- a/polkadot/node/core/pvf/common/src/lib.rs
+++ b/polkadot/node/core/pvf/common/src/lib.rs
@@ -47,7 +47,7 @@ pub mod tests {
 }
 
 /// Status of security features on the current system.
-#[derive(Debug, Clone, Default)]
+#[derive(Debug, Clone, Default, PartialEq, Eq)]
 pub struct SecurityStatus {
 	/// Whether the landlock features we use are fully available on this system.
 	pub can_enable_landlock: bool,
diff --git a/polkadot/node/core/pvf/src/artifacts.rs b/polkadot/node/core/pvf/src/artifacts.rs
index 53085eade3c..79b53467b4e 100644
--- a/polkadot/node/core/pvf/src/artifacts.rs
+++ b/polkadot/node/core/pvf/src/artifacts.rs
@@ -499,8 +499,7 @@ mod tests {
 
 	#[tokio::test]
 	async fn remove_stale_cache_on_startup() {
-		let cache_dir = crate::worker_intf::tmppath("test-cache").await.unwrap();
-		fs::create_dir_all(&cache_dir).unwrap();
+		let cache_dir = tempfile::Builder::new().prefix("test-cache-").tempdir().unwrap();
 
 		// invalid prefix
 		create_rand_artifact(&cache_dir, "");
@@ -529,7 +528,7 @@ mod tests {
 
 		assert_eq!(fs::read_dir(&cache_dir).unwrap().count(), 7);
 
-		let artifacts = Artifacts::new_and_prune(&cache_dir).await;
+		let artifacts = Artifacts::new_and_prune(cache_dir.path()).await;
 
 		assert_eq!(fs::read_dir(&cache_dir).unwrap().count(), 1);
 		assert_eq!(artifacts.len(), 1);
diff --git a/polkadot/node/core/pvf/src/execute/worker_intf.rs b/polkadot/node/core/pvf/src/execute/worker_intf.rs
index 4faaf13e62f..fc6cd8fc725 100644
--- a/polkadot/node/core/pvf/src/execute/worker_intf.rs
+++ b/polkadot/node/core/pvf/src/execute/worker_intf.rs
@@ -278,7 +278,7 @@ where
 	// Cheaply create a hard link to the artifact. The artifact is always at a known location in the
 	// worker cache, and the child can't access any other artifacts or gain any information from the
 	// original filename.
-	let link_path = worker_dir::execute_artifact(&worker_dir.path);
+	let link_path = worker_dir::execute_artifact(worker_dir.path());
 	if let Err(err) = tokio::fs::hard_link(artifact_path, link_path).await {
 		gum::warn!(
 			target: LOG_TARGET,
@@ -292,7 +292,7 @@ where
 		}
 	}
 
-	let worker_dir_path = worker_dir.path.clone();
+	let worker_dir_path = worker_dir.path().to_owned();
 	let outcome = f(worker_dir).await;
 
 	// Try to clear the worker dir.
diff --git a/polkadot/node/core/pvf/src/host.rs b/polkadot/node/core/pvf/src/host.rs
index 6049f51834e..be8f7aee778 100644
--- a/polkadot/node/core/pvf/src/host.rs
+++ b/polkadot/node/core/pvf/src/host.rs
@@ -24,7 +24,7 @@ use crate::{
 	artifacts::{ArtifactId, ArtifactPathId, ArtifactState, Artifacts},
 	execute::{self, PendingExecutionRequest},
 	metrics::Metrics,
-	prepare, security, Priority, ValidationError, LOG_TARGET,
+	prepare, security, Priority, SecurityStatus, ValidationError, LOG_TARGET,
 };
 use always_assert::never;
 use futures::{
@@ -70,6 +70,8 @@ pub(crate) type PrecheckResultSender = oneshot::Sender<PrecheckResult>;
 #[derive(Clone)]
 pub struct ValidationHost {
 	to_host_tx: mpsc::Sender<ToHost>,
+	/// Available security features, detected by the host during startup.
+	pub security_status: SecurityStatus,
 }
 
 impl ValidationHost {
@@ -216,7 +218,7 @@ pub async fn start(
 
 	let (to_host_tx, to_host_rx) = mpsc::channel(10);
 
-	let validation_host = ValidationHost { to_host_tx };
+	let validation_host = ValidationHost { to_host_tx, security_status: security_status.clone() };
 
 	let (to_prepare_pool, from_prepare_pool, run_prepare_pool) = prepare::start_pool(
 		metrics.clone(),
@@ -978,7 +980,8 @@ pub(crate) mod tests {
 
 		fn host_handle(&mut self) -> ValidationHost {
 			let to_host_tx = self.to_host_tx.take().unwrap();
-			ValidationHost { to_host_tx }
+			let security_status = Default::default();
+			ValidationHost { to_host_tx, security_status }
 		}
 
 		async fn poll_and_recv_result<T>(&mut self, result_rx: oneshot::Receiver<T>) -> T
diff --git a/polkadot/node/core/pvf/src/prepare/worker_intf.rs b/polkadot/node/core/pvf/src/prepare/worker_intf.rs
index 318aea7295d..f978005068c 100644
--- a/polkadot/node/core/pvf/src/prepare/worker_intf.rs
+++ b/polkadot/node/core/pvf/src/prepare/worker_intf.rs
@@ -318,7 +318,7 @@ where
 {
 	// Create the tmp file here so that the child doesn't need any file creation rights. This will
 	// be cleared at the end of this function.
-	let tmp_file = worker_dir::prepare_tmp_artifact(&worker_dir.path);
+	let tmp_file = worker_dir::prepare_tmp_artifact(worker_dir.path());
 	if let Err(err) = tokio::fs::File::create(&tmp_file).await {
 		gum::warn!(
 			target: LOG_TARGET,
@@ -333,7 +333,7 @@ where
 		}
 	};
 
-	let worker_dir_path = worker_dir.path.clone();
+	let worker_dir_path = worker_dir.path().to_owned();
 	let outcome = f(tmp_file, stream, worker_dir).await;
 
 	// Try to clear the worker dir.
diff --git a/polkadot/node/core/pvf/src/security.rs b/polkadot/node/core/pvf/src/security.rs
index 8c06c68392f..ef8714f58c8 100644
--- a/polkadot/node/core/pvf/src/security.rs
+++ b/polkadot/node/core/pvf/src/security.rs
@@ -28,7 +28,7 @@ const SECURE_MODE_ANNOUNCEMENT: &'static str =
 
 /// Run checks for supported security features.
 ///
-/// # Return
+/// # Returns
 ///
 /// Returns the set of security features that we were able to enable. If an error occurs while
 /// enabling a security feature we set the corresponding status to `false`.
@@ -158,18 +158,15 @@ async fn check_can_unshare_user_namespace_and_change_root(
 ) -> SecureModeResult {
 	cfg_if::cfg_if! {
 		if #[cfg(target_os = "linux")] {
-			let cache_dir_tempdir =
-				crate::worker_intf::tmppath_in("check-can-unshare", cache_path)
-				.await
-				.map_err(
-					|err|
-					SecureModeError::CannotUnshareUserNamespaceAndChangeRoot(
-						format!("could not create a temporary directory in {:?}: {}", cache_path, err)
-					)
-				)?;
+			let cache_dir_tempdir = tempfile::Builder::new()
+				.prefix("check-can-unshare-")
+				.tempdir_in(cache_path)
+				.map_err(|err| SecureModeError::CannotUnshareUserNamespaceAndChangeRoot(
+					format!("could not create a temporary directory in {:?}: {}", cache_path, err)
+				))?;
 			match tokio::process::Command::new(prepare_worker_program_path)
 				.arg("--check-can-unshare-user-namespace-and-change-root")
-				.arg(cache_dir_tempdir)
+				.arg(cache_dir_tempdir.path())
 				.output()
 				.await
 			{
diff --git a/polkadot/node/core/pvf/src/worker_intf.rs b/polkadot/node/core/pvf/src/worker_intf.rs
index 5e589b9abce..9d6907c1092 100644
--- a/polkadot/node/core/pvf/src/worker_intf.rs
+++ b/polkadot/node/core/pvf/src/worker_intf.rs
@@ -71,6 +71,7 @@ pub async fn spawn_with_program_path(
 
 	with_transient_socket_path(debug_id, |socket_path| {
 		let socket_path = socket_path.to_owned();
+		let worker_dir_path = worker_dir.path().to_owned();
 
 		async move {
 			let listener = UnixListener::bind(&socket_path).map_err(|err| {
@@ -91,7 +92,7 @@ pub async fn spawn_with_program_path(
 				&program_path,
 				&extra_args,
 				&socket_path,
-				&worker_dir.path,
+				&worker_dir_path,
 				security_status,
 			)
 			.map_err(|err| {
@@ -100,7 +101,7 @@ pub async fn spawn_with_program_path(
 					%debug_id,
 					?program_path,
 					?extra_args,
-					?worker_dir.path,
+					?worker_dir_path,
 					?socket_path,
 					"cannot spawn a worker: {:?}",
 					err,
@@ -108,7 +109,6 @@ pub async fn spawn_with_program_path(
 				SpawnErr::ProcessSpawn
 			})?;
 
-			let worker_dir_path = worker_dir.path.clone();
 			futures::select! {
 				accept_result = listener.accept().fuse() => {
 					let (stream, _) = accept_result.map_err(|err| {
@@ -150,7 +150,42 @@ where
 	F: FnOnce(&Path) -> Fut,
 	Fut: futures::Future<Output = Result<T, SpawnErr>> + 'static,
 {
-	let socket_path = tmppath(&format!("pvf-host-{}", debug_id))
+	/// Returns a path under [`std::env::temp_dir`]. The path name will start with the given prefix.
+	///
+	/// There is only a certain number of retries. If exceeded this function will give up and return
+	/// an error.
+	pub async fn tmppath(prefix: &str) -> io::Result<PathBuf> {
+		fn make_tmppath(prefix: &str, dir: &Path) -> PathBuf {
+			use rand::distributions::Alphanumeric;
+
+			const DESCRIMINATOR_LEN: usize = 10;
+
+			let mut buf = Vec::with_capacity(prefix.len() + DESCRIMINATOR_LEN);
+			buf.extend(prefix.as_bytes());
+			buf.extend(rand::thread_rng().sample_iter(&Alphanumeric).take(DESCRIMINATOR_LEN));
+
+			let s = std::str::from_utf8(&buf)
+				.expect("the string is collected from a valid utf-8 sequence; qed");
+
+			let mut path = dir.to_owned();
+			path.push(s);
+			path
+		}
+
+		const NUM_RETRIES: usize = 50;
+
+		let dir = std::env::temp_dir();
+		for _ in 0..NUM_RETRIES {
+			let tmp_path = make_tmppath(prefix, &dir);
+			if !tmp_path.exists() {
+				return Ok(tmp_path)
+			}
+		}
+
+		Err(io::Error::new(io::ErrorKind::Other, "failed to create a temporary path"))
+	}
+
+	let socket_path = tmppath(&format!("pvf-host-{}-", debug_id))
 		.await
 		.map_err(|_| SpawnErr::TmpPath)?;
 	let result = f(&socket_path).await;
@@ -162,46 +197,6 @@ where
 	result
 }
 
-/// Returns a path under the given `dir`. The path name will start with the given prefix.
-///
-/// There is only a certain number of retries. If exceeded this function will give up and return an
-/// error.
-pub async fn tmppath_in(prefix: &str, dir: &Path) -> io::Result<PathBuf> {
-	fn make_tmppath(prefix: &str, dir: &Path) -> PathBuf {
-		use rand::distributions::Alphanumeric;
-
-		const DESCRIMINATOR_LEN: usize = 10;
-
-		let mut buf = Vec::with_capacity(prefix.len() + DESCRIMINATOR_LEN);
-		buf.extend(prefix.as_bytes());
-		buf.extend(rand::thread_rng().sample_iter(&Alphanumeric).take(DESCRIMINATOR_LEN));
-
-		let s = std::str::from_utf8(&buf)
-			.expect("the string is collected from a valid utf-8 sequence; qed");
-
-		let mut path = dir.to_owned();
-		path.push(s);
-		path
-	}
-
-	const NUM_RETRIES: usize = 50;
-
-	for _ in 0..NUM_RETRIES {
-		let tmp_path = make_tmppath(prefix, dir);
-		if !tmp_path.exists() {
-			return Ok(tmp_path)
-		}
-	}
-
-	Err(io::Error::new(io::ErrorKind::Other, "failed to create a temporary path"))
-}
-
-/// The same as [`tmppath_in`], but uses [`std::env::temp_dir`] as the directory.
-pub async fn tmppath(prefix: &str) -> io::Result<PathBuf> {
-	let temp_dir = std::env::temp_dir();
-	tmppath_in(prefix, &temp_dir).await
-}
-
 /// A struct that represents an idle worker.
 ///
 /// This struct is supposed to be used as a token that is passed by move into a subroutine that
@@ -224,8 +219,6 @@ pub struct IdleWorker {
 pub enum SpawnErr {
 	/// Cannot obtain a temporary path location.
 	TmpPath,
-	/// An FS error occurred.
-	Fs(String),
 	/// Cannot bind the socket to the given path.
 	Bind,
 	/// An error happened during accepting a connection to the socket.
@@ -419,26 +412,22 @@ pub async fn framed_recv(r: &mut (impl AsyncRead + Unpin)) -> io::Result<Vec<u8>
 /// ```
 #[derive(Debug)]
 pub struct WorkerDir {
-	pub path: PathBuf,
+	tempdir: tempfile::TempDir,
 }
 
 impl WorkerDir {
 	/// Creates a new, empty worker dir with a random name in the given cache dir.
 	pub async fn new(debug_id: &'static str, cache_dir: &Path) -> Result<Self, SpawnErr> {
 		let prefix = format!("worker-dir-{}-", debug_id);
-		let path = tmppath_in(&prefix, cache_dir).await.map_err(|_| SpawnErr::TmpPath)?;
-		tokio::fs::create_dir(&path)
-			.await
-			.map_err(|err| SpawnErr::Fs(err.to_string()))?;
-		Ok(Self { path })
+		let tempdir = tempfile::Builder::new()
+			.prefix(&prefix)
+			.tempdir_in(cache_dir)
+			.map_err(|_| SpawnErr::TmpPath)?;
+		Ok(Self { tempdir })
 	}
-}
 
-// Try to clean up the temporary worker dir at the end of the worker's lifetime. It should be wiped
-// on startup, but we make a best effort not to leave it around.
-impl Drop for WorkerDir {
-	fn drop(&mut self) {
-		let _ = std::fs::remove_dir_all(&self.path);
+	pub fn path(&self) -> &Path {
+		self.tempdir.path()
 	}
 }
 
diff --git a/polkadot/node/core/pvf/tests/it/main.rs b/polkadot/node/core/pvf/tests/it/main.rs
index 4c81ac502dd..b53d598cd0f 100644
--- a/polkadot/node/core/pvf/tests/it/main.rs
+++ b/polkadot/node/core/pvf/tests/it/main.rs
@@ -18,6 +18,8 @@
 
 use assert_matches::assert_matches;
 use parity_scale_codec::Encode as _;
+#[cfg(all(feature = "ci-only-tests", target_os = "linux"))]
+use polkadot_node_core_pvf::SecurityStatus;
 use polkadot_node_core_pvf::{
 	start, testing::build_workers_and_get_paths, Config, InvalidCandidate, Metrics, PrepareError,
 	PrepareJobKind, PvfPrepData, ValidationError, ValidationHost, JOB_TIMEOUT_WALL_CLOCK_FACTOR,
@@ -122,6 +124,11 @@ impl TestHost {
 			.unwrap();
 		result_rx.await.unwrap()
 	}
+
+	#[cfg(all(feature = "ci-only-tests", target_os = "linux"))]
+	async fn security_status(&self) -> SecurityStatus {
+		self.host.lock().await.security_status.clone()
+	}
 }
 
 #[tokio::test]
@@ -402,3 +409,28 @@ async fn prepare_can_run_serially() {
 	// Prepare a different wasm blob to prevent skipping work.
 	let _stats = host.precheck_pvf(halt::wasm_binary_unwrap(), Default::default()).await.unwrap();
 }
+
+// CI machines should be able to enable all the security features.
+#[cfg(all(feature = "ci-only-tests", target_os = "linux"))]
+#[tokio::test]
+async fn all_security_features_work() {
+	// Landlock is only available starting Linux 5.13, and we may be testing on an old kernel.
+	let sysinfo = sc_sysinfo::gather_sysinfo();
+	// The version will look something like "5.15.0-87-generic".
+	let version = sysinfo.linux_kernel.unwrap();
+	let version_split: Vec<&str> = version.split(".").collect();
+	let major: u32 = version_split[0].parse().unwrap();
+	let minor: u32 = version_split[1].parse().unwrap();
+	let can_enable_landlock = if major >= 6 { true } else { minor >= 13 };
+
+	let host = TestHost::new().await;
+
+	assert_eq!(
+		host.security_status().await,
+		SecurityStatus {
+			can_enable_landlock,
+			can_enable_seccomp: true,
+			can_unshare_user_namespace_and_change_root: true,
+		}
+	);
+}
diff --git a/polkadot/zombienet_tests/functional/0001-parachains-pvf.zndsl b/polkadot/zombienet_tests/functional/0001-parachains-pvf.zndsl
index 46bb8bcdf72..135999a092a 100644
--- a/polkadot/zombienet_tests/functional/0001-parachains-pvf.zndsl
+++ b/polkadot/zombienet_tests/functional/0001-parachains-pvf.zndsl
@@ -54,8 +54,8 @@ one: reports histogram polkadot_pvf_preparation_time has 0 samples in buckets ["
 two: reports histogram polkadot_pvf_preparation_time has 0 samples in buckets ["20", "30", "60", "120", "+Inf"] within 10 seconds
 
 # Check execution time.
-# There are two different timeout conditions: BACKING_EXECUTION_TIMEOUT(2s) and
-# APPROVAL_EXECUTION_TIMEOUT(6s). Currently these are not differentiated by metrics
+# There are two different timeout conditions: DEFAULT_BACKING_EXECUTION_TIMEOUT(2s) and
+# DEFAULT_APPROVAL_EXECUTION_TIMEOUT(12s). Currently these are not differentiated by metrics
 # because the metrics are defined in `polkadot-node-core-pvf` which is a level below
 # the relevant subsystems.
 # That being said, we will take the simplifying assumption of testing only the
-- 
GitLab