Unverified commit 0eb7905a, authored by Sergey Pepyakin, committed by GitHub
Browse files

New PVF validation host (#2710)



* Implement PVF validation host

* WIP: Diener

* Increase the allotted compilation time

* Add more comments

* Minor clean up

* Apply suggestions from code review
Co-authored-by: Bastian Köcher <bkchr@users.noreply.github.com>

* Fix pruning artifact removal

* Fix formatting and newlines

* Fix the thread pool

* Update node/core/pvf/src/executor_intf.rs
Co-authored-by: Bastian Köcher <bkchr@users.noreply.github.com>

* Remove redundant test declaration

* Don't convert the path into an intermediate string

* Try to workaround the test failure

* Use the puppet_worker trick again

* Fix a blip

* Move `ensure_wasmtime_version` under the tests mod

* Add a macro for puppet_workers

* fix build for not real-overseer

* Rename the puppet worker for adder collator

* play it safe with the name of adder puppet worker

* Typo: triggered

* Add more comments

* Do not kill exec worker on every error

* Plumb Duration for timeouts

* typo: critical

* Add proofs

* Clean unused imports

* Revert "WIP: Diener"

This reverts commit ff2d3ff2.

* Sync version of wasmtime

* Update cargo.lock

* Update Substrate

* Merge fixes still

* Update wasmtime version in test

* bastifmt
Co-authored-by: Bastian Köcher <bkchr@users.noreply.github.com>

* Squash spaces

* Trailing new line for testing.rs

* Remove controversial code

* comment about biasing

* Fix suggestion

* Add comments

* make it more clear why unwrap_err

* tmpfile retry

* proper proofs for claim_idle

* Remove mutex from ValidationHost

* Add some more logging

* Extract exec timeout into a constant

* Add some clarifying logging

* Use blake2_256

* Clean up the merge

Specifically the leftovers after removing real-overseer

* Update parachain/test-parachains/adder/collator/Cargo.toml
Co-authored-by: Andronik Ordian <write@reusable.software>
Co-authored-by: Bastian Köcher <bkchr@users.noreply.github.com>
Co-authored-by: Andronik Ordian <write@reusable.software>
parent e5bab572
Pipeline #133477 failed with stages
in 13 minutes and 43 seconds
......@@ -120,9 +120,9 @@ dependencies = [
[[package]]
name = "anyhow"
version = "1.0.34"
version = "1.0.39"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf8dcb5b4bbaa28653b647d8c77bd4ed40183b48882e130c1f1ffb73de069fd7"
checksum = "81cddc5f91628367664cc7c69714ff08deee8a3efc54623011c772544d7b2767"
[[package]]
name = "approx"
......@@ -204,6 +204,16 @@ version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b34d609dfbaf33d6889b2b7106d3ca345eacad44200913df5ba02bfd31d2ba9"
[[package]]
name = "async-attributes"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a3203e79f4dd9bdda415ed03cf14dae5a2bf775c683a00f94e9cd1faf0f596e5"
dependencies = [
"quote",
"syn",
]
[[package]]
name = "async-channel"
version = "1.5.1"
......@@ -293,6 +303,7 @@ version = "1.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f9f84f1280a2b436a2c77c2582602732b6c2f4321d5494d6e799e6c367859a8"
dependencies = [
"async-attributes",
"async-channel",
"async-global-executor",
"async-io",
......@@ -5579,7 +5590,7 @@ dependencies = [
"frame-benchmarking-cli",
"futures 0.3.13",
"log",
"polkadot-parachain",
"polkadot-node-core-pvf",
"polkadot-service",
"sc-cli",
"sc-service",
......@@ -5823,8 +5834,10 @@ name = "polkadot-node-core-candidate-validation"
version = "0.1.0"
dependencies = [
"assert_matches",
"async-trait",
"futures 0.3.13",
"parity-scale-codec",
"polkadot-node-core-pvf",
"polkadot-node-primitives",
"polkadot-node-subsystem",
"polkadot-node-subsystem-test-helpers",
......@@ -5893,6 +5906,38 @@ dependencies = [
"tracing",
]
[[package]]
name = "polkadot-node-core-pvf"
version = "0.1.0"
dependencies = [
"always-assert",
"assert_matches",
"async-process",
"async-std",
"futures 0.3.13",
"futures-timer 3.0.2",
"hex-literal",
"libc",
"parity-scale-codec",
"pin-project 1.0.4",
"polkadot-core-primitives",
"polkadot-parachain",
"rand 0.8.3",
"sc-executor",
"sc-executor-common",
"sc-executor-wasmtime",
"slotmap",
"sp-core",
"sp-externalities",
"sp-io",
"sp-wasm-interface",
"tempfile",
"test-parachain-adder",
"test-parachain-halt",
"tracing",
"wasmtime-jit",
]
[[package]]
name = "polkadot-node-core-runtime-api"
version = "0.1.0"
......@@ -6074,25 +6119,13 @@ name = "polkadot-parachain"
version = "0.8.30"
dependencies = [
"derive_more",
"futures 0.3.13",
"libc",
"log",
"parity-scale-codec",
"parity-util-mem",
"parking_lot 0.11.1",
"polkadot-core-primitives",
"raw_sync",
"sc-executor",
"serde",
"shared_memory",
"sp-core",
"sp-externalities",
"sp-io",
"sp-runtime",
"sp-std",
"sp-wasm-interface",
"static_assertions",
"thiserror",
]
[[package]]
......@@ -7031,19 +7064,6 @@ dependencies = [
"rand_core 0.5.1",
]
[[package]]
name = "raw_sync"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2a34bde3561f980a51c70495164200569a11662644fe5af017f0b5d7015688cc"
dependencies = [
"cfg-if 0.1.10",
"libc",
"nix",
"rand 0.8.3",
"winapi 0.3.9",
]
[[package]]
name = "rawpointer"
version = "0.2.1"
......@@ -8704,20 +8724,6 @@ dependencies = [
"loom",
]
[[package]]
name = "shared_memory"
version = "0.11.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b854a362375dfe8ab12ea8a98228040d37293c988f85fbac9fa0f83336387966"
dependencies = [
"cfg-if 0.1.10",
"libc",
"nix",
"quick-error 2.0.0",
"rand 0.8.3",
"winapi 0.3.9",
]
[[package]]
name = "shlex"
version = "0.1.1"
......@@ -8778,6 +8784,15 @@ dependencies = [
"sp-std",
]
[[package]]
name = "slotmap"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab3003725ae562cf995f3dc82bb99e70926e09000396816765bb6d7adbe740b1"
dependencies = [
"version_check",
]
[[package]]
name = "smallvec"
version = "0.6.13"
......@@ -9868,6 +9883,7 @@ dependencies = [
"log",
"parity-scale-codec",
"polkadot-cli",
"polkadot-node-core-pvf",
"polkadot-node-primitives",
"polkadot-node-subsystem",
"polkadot-parachain",
......
......@@ -55,6 +55,7 @@ members = [
"node/core/chain-api",
"node/core/proposer",
"node/core/provisioner",
"node/core/pvf",
"node/core/runtime-api",
"node/network/approval-distribution",
"node/network/bridge",
......
......@@ -22,7 +22,7 @@ wasm-bindgen-futures = { version = "0.4.19", optional = true }
futures = "0.3.12"
service = { package = "polkadot-service", path = "../node/service", default-features = false, optional = true }
polkadot-parachain = { path = "../parachain", optional = true }
polkadot-node-core-pvf = { path = "../node/core/pvf", optional = true }
sp-core = { git = "https://github.com/paritytech/substrate", branch = "master" }
frame-benchmarking-cli = { git = "https://github.com/paritytech/substrate", branch = "master", optional = true }
......@@ -39,8 +39,8 @@ sp-trie = { git = "https://github.com/paritytech/substrate", branch = "master",
substrate-build-script-utils = { git = "https://github.com/paritytech/substrate", branch = "master" }
[features]
default = [ "wasmtime", "db", "cli", "full-node", "trie-memory-tracker", "polkadot-parachain" ]
wasmtime = [ "sc-cli/wasmtime", "polkadot-parachain/wasmtime" ]
default = [ "wasmtime", "db", "cli", "full-node", "trie-memory-tracker" ]
wasmtime = [ "sc-cli/wasmtime" ]
db = [ "service/db" ]
cli = [
"structopt",
......@@ -48,6 +48,7 @@ cli = [
"sc-service",
"frame-benchmarking-cli",
"try-runtime-cli",
"polkadot-node-core-pvf",
]
browser = [
"wasm-bindgen",
......
......@@ -43,8 +43,12 @@ pub enum Subcommand {
Revert(sc_cli::RevertCmd),
#[allow(missing_docs)]
#[structopt(name = "validation-worker", setting = structopt::clap::AppSettings::Hidden)]
ValidationWorker(ValidationWorkerCommand),
#[structopt(name = "prepare-worker", setting = structopt::clap::AppSettings::Hidden)]
PvfPrepareWorker(ValidationWorkerCommand),
#[allow(missing_docs)]
#[structopt(name = "execute-worker", setting = structopt::clap::AppSettings::Hidden)]
PvfExecuteWorker(ValidationWorkerCommand),
/// The custom benchmark subcommand benchmarking runtime pallets.
#[structopt(
......@@ -64,11 +68,8 @@ pub enum Subcommand {
#[allow(missing_docs)]
#[derive(Debug, StructOpt)]
pub struct ValidationWorkerCommand {
/// The path that the executor can use for its caching purposes.
pub cache_base_path: std::path::PathBuf,
#[allow(missing_docs)]
pub mem_id: String,
/// The path to the validation host's socket.
pub socket_path: String,
}
#[allow(missing_docs)]
......
......@@ -256,19 +256,39 @@ pub fn run() -> Result<()> {
Ok((cmd.run(client, backend).map_err(Error::SubstrateCli), task_manager))
})?)
},
Some(Subcommand::ValidationWorker(cmd)) => {
Some(Subcommand::PvfPrepareWorker(cmd)) => {
let mut builder = sc_cli::LoggerBuilder::new("");
builder.with_colors(false);
let _ = builder.init();
if cfg!(feature = "browser") || cfg!(target_os = "android") {
Err(sc_cli::Error::Input("Cannot run validation worker in browser".into()).into())
} else {
#[cfg(not(any(target_os = "android", feature = "browser")))]
polkadot_parachain::wasm_executor::run_worker(
&cmd.mem_id,
Some(cmd.cache_base_path.clone()),
)?;
#[cfg(any(target_os = "android", feature = "browser"))]
{
return Err(
sc_cli::Error::Input("PVF preparation workers are not supported under this platform".into()).into()
);
}
#[cfg(not(any(target_os = "android", feature = "browser")))]
{
polkadot_node_core_pvf::prepare_worker_entrypoint(&cmd.socket_path);
Ok(())
}
},
Some(Subcommand::PvfExecuteWorker(cmd)) => {
let mut builder = sc_cli::LoggerBuilder::new("");
builder.with_colors(false);
let _ = builder.init();
#[cfg(any(target_os = "android", feature = "browser"))]
{
return Err(
sc_cli::Error::Input("PVF execution workers are not supported under this platform".into()).into()
);
}
#[cfg(not(any(target_os = "android", feature = "browser")))]
{
polkadot_node_core_pvf::execute_worker_entrypoint(&cmd.socket_path);
Ok(())
}
},
......
......@@ -5,10 +5,10 @@ authors = ["Parity Technologies <admin@parity.io>"]
edition = "2018"
[dependencies]
async-trait = "0.1.42"
futures = "0.3.12"
tracing = "0.1.25"
sp-core = { package = "sp-core", git = "https://github.com/paritytech/substrate", branch = "master" }
sp-maybe-compressed-blob = { package = "sp-maybe-compressed-blob", git = "https://github.com/paritytech/substrate", branch = "master" }
parity-scale-codec = { version = "2.0.0", default-features = false, features = ["bit-vec", "derive"] }
......@@ -18,8 +18,12 @@ polkadot-node-primitives = { path = "../../primitives" }
polkadot-subsystem = { package = "polkadot-node-subsystem", path = "../../subsystem" }
polkadot-node-subsystem-util = { path = "../../subsystem-util" }
[target.'cfg(not(any(target_os = "android", target_os = "unknown")))'.dependencies]
polkadot-node-core-pvf = { path = "../pvf" }
[dev-dependencies]
sp-keyring = { git = "https://github.com/paritytech/substrate", branch = "master" }
futures = { version = "0.3.12", features = ["thread-pool"] }
assert_matches = "1.4.0"
polkadot-node-subsystem-test-helpers = { path = "../../subsystem-test-helpers" }
sp-core = { git = "https://github.com/paritytech/substrate", branch = "master" }
This diff is collapsed.
[package]
name = "polkadot-node-core-pvf"
version = "0.1.0"
authors = ["Parity Technologies <admin@parity.io>"]
edition = "2018"
[[bin]]
name = "puppet_worker"
path = "bin/puppet_worker.rs"
[dependencies]
always-assert = "0.1"
async-std = { version = "1.8.0", features = ["attributes"] }
async-process = "1.0.1"
assert_matches = "1.4.0"
futures = "0.3.12"
futures-timer = "3.0.2"
libc = "0.2.81"
slotmap = "1.0"
tracing = "0.1.22"
pin-project = "1.0.4"
rand = "0.8.3"
parity-scale-codec = { version = "2.0.0", default-features = false, features = ["derive"] }
polkadot-parachain = { path = "../../../parachain" }
polkadot-core-primitives = { path = "../../../core-primitives" }
sc-executor = { git = "https://github.com/paritytech/substrate", branch = "master" }
sc-executor-wasmtime = { git = "https://github.com/paritytech/substrate", branch = "master" }
sc-executor-common = { git = "https://github.com/paritytech/substrate", branch = "master" }
sp-externalities = { git = "https://github.com/paritytech/substrate", branch = "master" }
sp-io = { git = "https://github.com/paritytech/substrate", branch = "master" }
sp-core = { git = "https://github.com/paritytech/substrate", branch = "master" }
sp-wasm-interface = { git = "https://github.com/paritytech/substrate", branch = "master" }
[dev-dependencies]
adder = { package = "test-parachain-adder", path = "../../../parachain/test-parachains/adder" }
halt = { package = "test-parachain-halt", path = "../../../parachain/test-parachains/halt" }
hex-literal = "0.3.1"
tempfile = "3.2.0"
# PVF execution leverages compiled artifacts provided by wasmtime. The contents of the artifacts
# depend on the version of wasmtime. In this crate we persist the artifacts on disk so we should
# be careful about the updates. In order to handle this, we depend on the wasmtime version here
# that we think is used by the sc-executor. If wasmtime is updated in Substrate and wasn't updated
# here then there will be linking errors like
#
# multiple definitions of `set_vmctx_memory`
#
# or similar, because wasmtime exports these symbols and does not support multiple versions compiled
# in at the same time.
#
# Another safeguard is a test `ensure_wasmtime_version` that will fail on each bump and prompt the
# developer to correspondingly act upon the change.
wasmtime-jit = "0.24"
// Copyright 2021 Parity Technologies (UK) Ltd.
// This file is part of Polkadot.
// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
polkadot_node_core_pvf::decl_puppet_worker_main!();
// Copyright 2021 Parity Technologies (UK) Ltd.
// This file is part of Polkadot.
// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
use crate::LOG_TARGET;
use always_assert::always;
use async_std::{
io,
path::{Path, PathBuf},
};
use polkadot_core_primitives::Hash;
use std::{
collections::HashMap,
time::{Duration, SystemTime},
};
use parity_scale_codec::{Encode, Decode};
use futures::StreamExt;
/// The final product of the preparation process. Contains either a ready-to-run compiled artifact
/// or a description of what went wrong.
#[derive(Encode, Decode)]
pub enum Artifact {
/// During the prevalidation stage of preparation an issue was found with the PVF.
///
/// The string carries the description of the issue.
PrevalidationErr(String),
/// Compilation failed for the given PVF.
///
/// The string carries the description of the failure.
PreparationErr(String),
/// This state indicates that the process assigned to prepare the artifact wasn't responsive
/// or was killed. This state is reported by the validation host (not by the worker).
DidntMakeIt,
/// The PVF passed all the checks and is ready for execution.
///
/// `compiled_artifact` holds the bytes of the compiled artifact.
Compiled { compiled_artifact: Vec<u8> },
}
impl Artifact {
/// Serializes this struct into a byte buffer.
pub fn serialize(&self) -> Vec<u8> {
self.encode()
}
/// Deserialize the given byte buffer to an artifact.
pub fn deserialize(mut bytes: &[u8]) -> Result<Self, String> {
Artifact::decode(&mut bytes).map_err(|e| format!("{:?}", e))
}
}
/// Identifier of an artifact. Right now it only encodes a code hash of the PVF. But if we get to
/// multiple engine implementations the artifact ID should include the engine type as well.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct ArtifactId {
/// The code hash of the PVF this artifact corresponds to.
code_hash: Hash,
}
impl ArtifactId {
    /// Prefix shared by the file names of all artifacts stored in the cache directory.
    const PREFIX: &'static str = "wasmtime_1_";

    /// Creates a new artifact ID with the given hash.
    pub fn new(code_hash: Hash) -> Self {
        Self { code_hash }
    }

    /// Tries to recover the artifact id from the given file name.
    ///
    /// Returns `None` if the name does not start with the artifact prefix or if the remainder
    /// cannot be parsed as a hash. This is the inverse of the file name produced by
    /// [`ArtifactId::path`].
    pub fn from_file_name(file_name: &str) -> Option<Self> {
        use std::str::FromStr as _;

        let hex = file_name.strip_prefix(Self::PREFIX)?;
        let code_hash = Hash::from_str(hex).ok()?;

        Some(Self { code_hash })
    }

    /// Returns the expected path to this artifact given the root of the cache.
    ///
    /// The file name consists of the artifact prefix followed by the full, lower-case hex
    /// encoding of the code hash (without a `0x` prefix).
    pub fn path(&self, cache_path: &Path) -> PathBuf {
        // NOTE(review): `Hash` is expected to be `H256`, whose `Display` implementation prints
        // an abbreviated form (`0x1234…cdef`). Using it for the file name would make different
        // PVFs share a path and would make the name impossible to parse back in
        // `from_file_name`. `LowerHex` prints every byte of the hash instead.
        let file_name = format!("{}{:x}", Self::PREFIX, self.code_hash);
        cache_path.join(file_name)
    }
}
/// The state of an artifact as tracked by the [`Artifacts`] table.
pub enum ArtifactState {
/// The artifact is ready to be used by the executor.
///
/// That means that the artifact should be accessible through the path obtained by the artifact
/// id (unless it was removed externally).
Prepared {
/// The last time the artifact was needed.
///
/// This is updated when we get the heads up for this artifact or when we just discover
/// this file.
last_time_needed: SystemTime,
},
/// A task to prepare this artifact is scheduled.
Preparing,
}
/// A container of all known artifact ids and their states.
pub struct Artifacts {
/// Maps every known artifact id to its current state.
artifacts: HashMap<ArtifactId, ArtifactState>,
}
impl Artifacts {
    /// Builds the table by scanning the given cache root for artifacts.
    ///
    /// The cache directory, including all its parents, is created first on a best-effort basis.
    /// If the scan fails, a warning is logged and the table starts out empty.
    pub async fn new(cache_path: &Path) -> Self {
        // Best effort: a failure to create the directory is deliberately ignored here.
        let _ = async_std::fs::create_dir_all(cache_path).await;

        let artifacts = scan_for_known_artifacts(cache_path)
            .await
            .unwrap_or_else(|err| {
                tracing::warn!(
                    target: LOG_TARGET,
                    "unable to seed the artifacts in memory cache: {:?}. Starting with a clean one",
                    err,
                );
                HashMap::new()
            });

        Self { artifacts }
    }

    /// Creates a table that knows no artifacts. Only available in tests.
    #[cfg(test)]
    pub(crate) fn empty() -> Self {
        let artifacts = HashMap::new();
        Self { artifacts }
    }

    /// Returns a mutable reference to the state of the artifact with the given ID, or `None`
    /// if the artifact is not known to the table.
    pub fn artifact_state_mut(&mut self, artifact_id: &ArtifactId) -> Option<&mut ArtifactState> {
        let table = &mut self.artifacts;
        table.get_mut(artifact_id)
    }

    /// Registers the artifact with the given ID in the "preparing" state.
    ///
    /// Precondition: the artifact must be brand new. This function must never be used for
    /// replacing an existing entry.
    pub fn insert_preparing(&mut self, artifact_id: ArtifactId) {
        // The precondition above guarantees that there is no previous entry to replace.
        always!(self.artifacts.insert(artifact_id, ArtifactState::Preparing).is_none());
    }

    /// Registers the artifact with the given ID in the "prepared" state.
    ///
    /// Precondition: the artifact must be brand new. This function must never be used for
    /// replacing an existing entry.
    #[cfg(test)]
    pub fn insert_prepared(&mut self, artifact_id: ArtifactId, last_time_needed: SystemTime) {
        // The precondition above guarantees that there is no previous entry to replace.
        always!(self.artifacts.insert(artifact_id, ArtifactState::Prepared { last_time_needed }).is_none());
    }

    /// Removes and returns the IDs of the prepared artifacts that were last needed longer ago
    /// than the supplied Time-To-Live.
    ///
    /// Artifacts that are still being prepared are never pruned.
    pub fn prune(&mut self, artifact_ttl: Duration) -> Vec<ArtifactId> {
        let now = SystemTime::now();

        let expired: Vec<ArtifactId> = self
            .artifacts
            .iter()
            .filter_map(|(id, state)| match state {
                ArtifactState::Prepared { last_time_needed } => {
                    // An artifact whose timestamp lies in the future is not considered stale.
                    let is_stale = matches!(
                        now.duration_since(*last_time_needed),
                        Ok(age) if age > artifact_ttl
                    );
                    if is_stale {
                        Some(id.clone())
                    } else {
                        None
                    }
                }
                ArtifactState::Preparing => None,
            })
            .collect();

        for id in &expired {
            self.artifacts.remove(id);
        }

        expired
    }
}
/// Goes over all files in the given directory, collecting all recognizable artifacts. All files
/// that do not look like artifacts are removed.
///
/// All recognized artifacts will be created with the current datetime.
async fn scan_for_known_artifacts(
cache_path: &Path,
) -> io::Result<HashMap<ArtifactId, ArtifactState>> {
let mut result = HashMap::new();
let now = SystemTime::now();
let mut dir = async_std::fs::read_dir(cache_path).await?;
while let Some(res) = dir.next().await {
let entry = res?;
if entry.file_type().await?.is_dir() {
tracing::debug!(
target: LOG_TARGET,
"{} is a dir, and dirs do not belong to us. Removing",
entry.path().display(),
);
let _ = async_std::fs::remove_dir_all(entry.path()).await;
}
let path = entry.path();
let file_name = match path.file_name() {
None => {
// A file without a file name? Weird, just skip it.
continue;
}
Some(file_name) => file_name,