Unverified Commit 69b1058d authored by Sergey Pepyakin's avatar Sergey Pepyakin Committed by GitHub
Browse files

Mitigation of SIGBUS (#2440)

* Update shared-memory to new version & refactor

These two are combined in a single commit because the new version of
shared-memory doesn't provide the used functionality anymore.

Therefore in order to update the version of this crate we implement the
functionality that we need by ourselves, providing a cleaner API along
the way.

* Significantly decrease the required memory for a workspace

For some reason it was allocating an entire GiB of memory. I suspect
this has something to do with the current memory size limit of a PVF
execution environment (the prior name suggests that). However, we don't
need anywhere near that much memory.

In fact, we could reduce the allocated size even more, but that may be
for next time.

* Unlink shmem just after opening

That will make sure that we don't leak the shmem accidentally.

* Do not compile workspace mod for android and wasm

* Address some review comments

* Fix the test runner

* Fix missed +1 for the attached flag

* Use .expect rather than .unwrap

* Add a rustdoc for the workspace module

* fixup! Use .expect rather than .unwrap

* Add some doc comments to pub members

* Warn on error removing shm_unlink

* Change the alignment implementation

* Fix the comment nit
parent 5898cafc
Pipeline #124246 passed with stages
in 33 minutes and 30 seconds
This diff is collapsed.
......@@ -445,9 +445,9 @@ fn validate_candidate_exhaustive<B: ValidationBackend, S: SpawnNamed + 'static>(
match B::validate(backend_arg, &validation_code, params, spawn) {
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::Timeout)) =>
Ok(ValidationResult::Invalid(InvalidCandidate::Timeout)),
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::ParamsTooLarge(l))) =>
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::ParamsTooLarge(l, _))) =>
Ok(ValidationResult::Invalid(InvalidCandidate::ParamsTooLarge(l as u64))),
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::CodeTooLarge(l))) =>
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::CodeTooLarge(l, _))) =>
Ok(ValidationResult::Invalid(InvalidCandidate::CodeTooLarge(l as u64))),
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::BadReturn)) =>
Ok(ValidationResult::Invalid(InvalidCandidate::BadReturn)),
......
......@@ -27,9 +27,12 @@ sp-io = { git = "https://github.com/paritytech/substrate", branch = "master", op
parking_lot = { version = "0.11.1", optional = true }
log = { version = "0.4.11", optional = true }
futures = { version = "0.3.8", optional = true }
static_assertions = { version = "1.1", optional = true }
libc = { version = "0.2.81", optional = true }
[target.'cfg(not(any(target_os = "android", target_os = "unknown")))'.dependencies]
shared_memory = { version = "0.10.0", optional = true }
shared_memory = { version = "0.11.0", optional = true }
raw_sync = { version = "0.1", optional = true }
[features]
default = ["std"]
......@@ -42,9 +45,12 @@ std = [
"sp-std/std",
"sp-runtime/std",
"shared_memory",
"raw_sync",
"sp-core/std",
"parking_lot",
"static_assertions",
"log",
"libc",
"parity-util-mem",
"sp-externalities",
"sc-executor",
......
......@@ -32,11 +32,6 @@ pub use validation_host::{run_worker, ValidationPool, EXECUTION_TIMEOUT_SEC, WOR
mod validation_host;
// maximum memory in bytes
const MAX_RUNTIME_MEM: usize = 1024 * 1024 * 1024; // 1 GiB
const MAX_CODE_MEM: usize = 16 * 1024 * 1024; // 16 MiB
const MAX_VALIDATION_RESULT_HEADER_MEM: usize = MAX_CODE_MEM + 1024; // 16.001 MiB
/// The strategy we employ for isolating execution of wasm parachain validation function (PVF).
///
/// For a typical validator an external process is the default way to run PVF. The rationale is based
......@@ -126,11 +121,11 @@ pub enum InvalidCandidate {
#[error("WASM executor error")]
WasmExecutor(#[from] sc_executor::error::Error),
/// Call data is too large.
#[error("Validation parameters are {0} bytes, max allowed is {}", MAX_RUNTIME_MEM)]
ParamsTooLarge(usize),
#[error("Validation parameters are {0} bytes, max allowed is {1}")]
ParamsTooLarge(usize, usize),
/// Code size is too large.
#[error("WASM code is {0} bytes, max allowed is {}", MAX_CODE_MEM)]
CodeTooLarge(usize),
#[error("WASM code is {0} bytes, max allowed is {1}")]
CodeTooLarge(usize, usize),
/// Error decoding returned data.
#[error("Validation function returned invalid data.")]
BadReturn,
......@@ -156,8 +151,20 @@ pub enum InternalError {
System(#[from] Box<dyn std::error::Error + Send + Sync + 'static>),
#[cfg(not(any(target_os = "android", target_os = "unknown")))]
#[error("Shared memory error: {0}")]
SharedMem(#[from] shared_memory::SharedMemError),
#[error("Failed to create shared memory: {0}")]
WorkerStartTimeout(String),
#[cfg(not(any(target_os = "android", target_os = "unknown")))]
#[error("Failed to create shared memory: {0}")]
FailedToCreateSharedMemory(String),
#[cfg(not(any(target_os = "android", target_os = "unknown")))]
#[error("Failed to send a singal to worker: {0}")]
FailedToSignal(String),
#[cfg(not(any(target_os = "android", target_os = "unknown")))]
#[error("Failed to send data to worker: {0}")]
FailedToWriteData(&'static str),
#[error("WASM worker error: {0}")]
WasmWorker(String),
......
......@@ -16,14 +16,9 @@
#![cfg(not(any(target_os = "android", target_os = "unknown")))]
use std::{process, env, sync::Arc, sync::atomic, path::PathBuf};
use parity_scale_codec::{Decode, Encode};
use std::{env, path::PathBuf, process, sync::Arc, sync::atomic};
use crate::primitives::{ValidationParams, ValidationResult};
use super::{
validate_candidate_internal, ValidationError, InvalidCandidate, InternalError,
MAX_CODE_MEM, MAX_RUNTIME_MEM, MAX_VALIDATION_RESULT_HEADER_MEM,
};
use shared_memory::{SharedMem, SharedMemConf, EventState, WriteLockable, EventWait, EventSet};
use super::{validate_candidate_internal, ValidationError, InvalidCandidate, InternalError};
use parking_lot::Mutex;
use log::{debug, trace};
use futures::executor::ThreadPool;
......@@ -35,6 +30,8 @@ pub const WORKER_ARGS: &[&'static str] = &[WORKER_ARG];
const LOG_TARGET: &'static str = "validation-worker";
mod workspace;
/// Execution timeout in seconds;
#[cfg(debug_assertions)]
pub const EXECUTION_TIMEOUT_SEC: u64 = 30;
......@@ -42,12 +39,6 @@ pub const EXECUTION_TIMEOUT_SEC: u64 = 30;
#[cfg(not(debug_assertions))]
pub const EXECUTION_TIMEOUT_SEC: u64 = 5;
enum Event {
CandidateReady = 0,
ResultReady = 1,
WorkerReady = 2,
}
#[derive(Clone)]
struct TaskExecutor(ThreadPool);
......@@ -99,16 +90,14 @@ impl ValidationPool {
let worker_cli_args = match cache_base_path {
Some(cache_base_path) => {
let worker_cli_args: Vec<&str> =
WORKER_ARGS.into_iter()
let worker_cli_args: Vec<&str> = WORKER_ARGS
.into_iter()
.cloned()
.chain(iter::once(cache_base_path))
.collect();
Cow::from(worker_cli_args)
}
None => {
Cow::from(WORKER_ARGS)
},
None => Cow::from(WORKER_ARGS),
};
self.validate_candidate_custom(
......@@ -133,24 +122,31 @@ impl ValidationPool {
) -> Result<ValidationResult, ValidationError> {
for host in self.hosts.iter() {
if let Some(mut host) = host.try_lock() {
return host.validate_candidate(validation_code, params, command, args)
return host.validate_candidate(validation_code, params, command, args);
}
}
// all workers are busy, just wait for the first one
self.hosts[0].lock().validate_candidate(validation_code, params, command, args)
self.hosts[0]
.lock()
.validate_candidate(validation_code, params, command, args)
}
}
/// Validation worker process entry point. Runs a loop waiting for candidates to validate
/// and sends back results via shared memory.
pub fn run_worker(mem_id: &str, cache_base_path: Option<PathBuf>) -> Result<(), String> {
let mut memory = match SharedMem::open(mem_id) {
Ok(memory) => memory,
let mut worker_handle = match workspace::open(mem_id) {
Err(e) => {
debug!(target: LOG_TARGET, "{} Error opening shared memory: {:?}", process::id(), e);
return Err(format!("Error opening shared memory: {:?}", e));
debug!(
target: LOG_TARGET,
"{} Error opening shared memory: {:?}",
process::id(),
e
);
return Err(e);
}
Ok(h) => h,
};
let exit = Arc::new(atomic::AtomicBool::new(false));
......@@ -162,12 +158,15 @@ pub fn run_worker(mem_id: &str, cache_base_path: Option<PathBuf>) -> Result<(),
let mut in_data = Vec::new();
// pipe terminates when parent process exits
std::io::stdin().read_to_end(&mut in_data).ok();
debug!(target: LOG_TARGET, "{} Parent process is dead. Exiting", process::id());
debug!(
target: LOG_TARGET,
"{} Parent process is dead. Exiting",
process::id()
);
exit.store(true, atomic::Ordering::Relaxed);
});
memory.set(Event::WorkerReady as usize, EventState::Signaled)
.map_err(|e| format!("{} Error setting shared event: {:?}", process::id(), e))?;
worker_handle.signal_ready()?;
let executor = super::ExecutorCache::new(cache_base_path);
......@@ -176,102 +175,64 @@ pub fn run_worker(mem_id: &str, cache_base_path: Option<PathBuf>) -> Result<(),
break;
}
debug!(target: LOG_TARGET, "{} Waiting for candidate", process::id());
match memory.wait(Event::CandidateReady as usize, shared_memory::Timeout::Sec(3)) {
Err(e) => {
// Timeout
trace!(target: LOG_TARGET, "{} Timeout waiting for candidate: {:?}", process::id(), e);
debug!(
target: LOG_TARGET,
"{} Waiting for candidate",
process::id()
);
let work_item = match worker_handle.wait_for_work(3) {
Err(workspace::WaitForWorkErr::Wait(e)) => {
trace!(
target: LOG_TARGET,
"{} Timeout waiting for candidate: {:?}",
process::id(),
e
);
continue;
}
Ok(()) => {}
Err(workspace::WaitForWorkErr::FailedToDecode(e)) => {
return Err(e);
}
Ok(work_item) => work_item,
};
{
debug!(target: LOG_TARGET, "{} Processing candidate", process::id());
// we have candidate data
let mut slice = memory.wlock_as_slice(0)
.map_err(|e| format!("Error locking shared memory: {:?}", e))?;
let result = {
let data: &mut[u8] = &mut **slice;
let (header_buf, rest) = data.split_at_mut(1024);
let mut header_buf: &[u8] = header_buf;
let header = ValidationHeader::decode(&mut header_buf)
.map_err(|_| format!("Error decoding validation request."))?;
debug!(target: LOG_TARGET, "{} Candidate header: {:?}", process::id(), header);
let (code, rest) = rest.split_at_mut(MAX_CODE_MEM);
let (code, _) = code.split_at_mut(header.code_size as usize);
let (call_data, _) = rest.split_at_mut(MAX_RUNTIME_MEM);
let (call_data, _) = call_data.split_at_mut(header.params_size as usize);
let result = validate_candidate_internal(&executor, code, call_data, task_executor.clone());
debug!(target: LOG_TARGET, "{} Candidate validated: {:?}", process::id(), result);
match result {
Ok(r) => ValidationResultHeader::Ok(r),
Err(ValidationError::Internal(e)) =>
ValidationResultHeader::Error(WorkerValidationError::InternalError(e.to_string())),
Err(ValidationError::InvalidCandidate(e)) =>
ValidationResultHeader::Error(WorkerValidationError::ValidationError(e.to_string())),
}
let result = validate_candidate_internal(
&executor,
work_item.code,
work_item.params,
task_executor.clone(),
);
debug!(
target: LOG_TARGET,
"{} Candidate validated: {:?}",
process::id(),
result
);
let result_header = match result {
Ok(r) => workspace::ValidationResultHeader::Ok(r),
Err(ValidationError::Internal(e)) => workspace::ValidationResultHeader::Error(
workspace::WorkerValidationError::InternalError(e.to_string()),
),
Err(ValidationError::InvalidCandidate(e)) => workspace::ValidationResultHeader::Error(
workspace::WorkerValidationError::ValidationError(e.to_string()),
),
};
let mut data: &mut[u8] = &mut **slice;
result.encode_to(&mut data);
}
debug!(target: LOG_TARGET, "{} Signaling result", process::id());
memory.set(Event::ResultReady as usize, EventState::Signaled)
.map_err(|e| format!("Error setting shared event: {:?}", e))?;
worker_handle
.report_result(result_header)
.map_err(|e| format!("error reporting result: {:?}", e))?;
}
Ok(())
}
/// Params header in shared memory. All offsets should be aligned to WASM page size.
#[derive(Encode, Decode, Debug)]
struct ValidationHeader {
code_size: u64,
params_size: u64,
}
#[derive(Encode, Decode, Debug)]
enum WorkerValidationError {
InternalError(String),
ValidationError(String),
}
#[derive(Encode, Decode, Debug)]
enum ValidationResultHeader {
Ok(ValidationResult),
Error(WorkerValidationError),
}
unsafe impl Send for ValidationHost {}
struct ValidationHostMemory(SharedMem);
impl std::fmt::Debug for ValidationHostMemory {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "ValidationHostMemory")
}
}
impl std::ops::Deref for ValidationHostMemory {
type Target = SharedMem;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl std::ops::DerefMut for ValidationHostMemory {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}
#[derive(Default, Debug)]
struct ValidationHost {
worker: Option<process::Child>,
memory: Option<ValidationHostMemory>,
host_handle: Option<workspace::HostHandle>,
id: u32,
}
......@@ -284,18 +245,6 @@ impl Drop for ValidationHost {
}
impl ValidationHost {
fn create_memory() -> Result<SharedMem, InternalError> {
let mem_size = MAX_RUNTIME_MEM + MAX_CODE_MEM + MAX_VALIDATION_RESULT_HEADER_MEM;
let mem_config = SharedMemConf::default()
.set_size(mem_size)
.add_lock(shared_memory::LockType::Mutex, 0, mem_size)?
.add_event(shared_memory::EventType::Auto)? // Event::CandidateReady
.add_event(shared_memory::EventType::Auto)? // Event::ResultReady
.add_event(shared_memory::EventType::Auto)?; // Event::WorkerReady
Ok(mem_config.create()?)
}
fn start_worker(&mut self, cmd: &PathBuf, args: &[&str]) -> Result<(), InternalError> {
if let Some(ref mut worker) = self.worker {
// Check if still alive
......@@ -305,28 +254,28 @@ impl ValidationHost {
}
}
let memory = Self::create_memory()?;
let host_handle =
workspace::create().map_err(|msg| InternalError::FailedToCreateSharedMemory(msg))?;
debug!(
target: LOG_TARGET,
"Starting worker at {:?} with arguments: {:?} and {:?}",
cmd,
args,
memory.get_os_path(),
host_handle.id(),
);
let worker = process::Command::new(cmd)
.args(args)
.arg(memory.get_os_path())
.arg(host_handle.id())
.stdin(process::Stdio::piped())
.spawn()?;
self.id = worker.id();
self.worker = Some(worker);
memory.wait(
Event::WorkerReady as usize,
shared_memory::Timeout::Sec(EXECUTION_TIMEOUT_SEC as usize),
)?;
self.memory = Some(ValidationHostMemory(memory));
host_handle
.wait_until_ready(EXECUTION_TIMEOUT_SEC)
.map_err(|e| InternalError::WorkerStartTimeout(format!("{:?}", e)))?;
self.host_handle = Some(host_handle);
Ok(())
}
......@@ -340,76 +289,68 @@ impl ValidationHost {
binary: &PathBuf,
args: &[&str],
) -> Result<ValidationResult, ValidationError> {
if validation_code.len() > MAX_CODE_MEM {
return Err(ValidationError::InvalidCandidate(InvalidCandidate::CodeTooLarge(validation_code.len())));
}
// First, check if need to spawn the child process
self.start_worker(binary, args)?;
let memory = self.memory.as_mut()
.expect("memory is always `Some` after `start_worker` completes successfully");
{
// Put data in shared mem
let data: &mut[u8] = &mut **memory.wlock_as_slice(0)
.map_err(|e|ValidationError::Internal(e.into()))?;
let (mut header_buf, rest) = data.split_at_mut(1024);
let (code, rest) = rest.split_at_mut(MAX_CODE_MEM);
let (code, _) = code.split_at_mut(validation_code.len());
let (call_data, _) = rest.split_at_mut(MAX_RUNTIME_MEM);
code[..validation_code.len()].copy_from_slice(validation_code);
let encoded_params = params.encode();
if encoded_params.len() >= MAX_RUNTIME_MEM {
return Err(ValidationError::InvalidCandidate(InvalidCandidate::ParamsTooLarge(MAX_RUNTIME_MEM)));
}
call_data[..encoded_params.len()].copy_from_slice(&encoded_params);
let header = ValidationHeader {
code_size: validation_code.len() as u64,
params_size: encoded_params.len() as u64,
};
header.encode_to(&mut header_buf);
}
let host_handle = self
.host_handle
.as_mut()
.expect("host_handle is always `Some` after `start_worker` completes successfully");
debug!(target: LOG_TARGET, "{} Signaling candidate", self.id);
memory.set(Event::CandidateReady as usize, EventState::Signaled)
.map_err(|e| ValidationError::Internal(e.into()))?;
match host_handle.request_validation(validation_code, params) {
Ok(()) => {}
Err(workspace::RequestValidationErr::CodeTooLarge { actual, max }) => {
return Err(ValidationError::InvalidCandidate(
InvalidCandidate::CodeTooLarge(actual, max),
));
}
Err(workspace::RequestValidationErr::ParamsTooLarge { actual, max }) => {
return Err(ValidationError::InvalidCandidate(
InvalidCandidate::ParamsTooLarge(actual, max),
));
}
Err(workspace::RequestValidationErr::Signal(msg)) => {
return Err(ValidationError::Internal(InternalError::FailedToSignal(msg)));
}
Err(workspace::RequestValidationErr::WriteData(msg)) => {
return Err(ValidationError::Internal(InternalError::FailedToWriteData(msg)));
}
}
debug!(target: LOG_TARGET, "{} Waiting for results", self.id);
match memory.wait(Event::ResultReady as usize, shared_memory::Timeout::Sec(EXECUTION_TIMEOUT_SEC as usize)) {
Err(e) => {
debug!(target: LOG_TARGET, "Worker timeout: {:?}", e);
let result_header = match host_handle.wait_for_result(EXECUTION_TIMEOUT_SEC) {
Ok(inner_result) => inner_result,
Err(assumed_timeout) => {
debug!(target: LOG_TARGET, "Worker timeout: {:?}", assumed_timeout);
if let Some(mut worker) = self.worker.take() {
worker.kill().ok();
}
return Err(ValidationError::InvalidCandidate(InvalidCandidate::Timeout));
}
Ok(()) => {}
}
};
{
debug!(target: LOG_TARGET, "{} Reading results", self.id);
let data: &[u8] = &**memory.wlock_as_slice(0)
.map_err(|e| ValidationError::Internal(e.into()))?;
let (header_buf, _) = data.split_at(MAX_VALIDATION_RESULT_HEADER_MEM);
let mut header_buf: &[u8] = header_buf;
let header = ValidationResultHeader::decode(&mut header_buf)
.map_err(|e|
InternalError::System(
Box::<dyn std::error::Error + Send + Sync>::from(
format!("Failed to decode `ValidationResultHeader`: {:?}", e)
) as Box<_>
)
)?;
match header {
ValidationResultHeader::Ok(result) => Ok(result),
ValidationResultHeader::Error(WorkerValidationError::InternalError(e)) => {
debug!(target: LOG_TARGET, "{} Internal validation error: {}", self.id, e);
match result_header {
workspace::ValidationResultHeader::Ok(result) => Ok(result),
workspace::ValidationResultHeader::Error(
workspace::WorkerValidationError::InternalError(e),
) => {
debug!(
target: LOG_TARGET,
"{} Internal validation error: {}", self.id, e
);
Err(ValidationError::Internal(InternalError::WasmWorker(e)))
},
ValidationResultHeader::Error(WorkerValidationError::ValidationError(e)) => {
debug!(target: LOG_TARGET, "{} External validation error: {}", self.id, e);
Err(ValidationError::InvalidCandidate(InvalidCandidate::ExternalWasmExecutor(e)))
}
workspace::ValidationResultHeader::Error(
workspace::WorkerValidationError::ValidationError(e),
) => {
debug!(
target: LOG_TARGET,
"{} External validation error: {}", self.id, e
);
Err(ValidationError::InvalidCandidate(
InvalidCandidate::ExternalWasmExecutor(e),
))
}
}
}
......
// Copyright 2021 Parity Technologies (UK) Ltd.
// This file is part of Polkadot.
// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
//! This module implements a "workspace" - basically a wrapper around shared memory that
//! is used as an IPC channel for communication between the validation host and its validation
//! worker.
use crate::primitives::{ValidationParams, ValidationResult};
use super::LOG_TARGET;
use parity_scale_codec::{Decode, Encode};
use raw_sync::{
events::{Event, EventImpl, EventInit, EventState},
Timeout,
};
use shared_memory::{Shmem, ShmemConf};
use std::{
error::Error,
fmt,
io::{Cursor, Write},
slice,
sync::atomic::AtomicBool,
time::Duration,
};
// Maximum memory in bytes.
//
// NOTE(review): presumably these bound, respectively, the encoded validation params, the
// validation code and the encoded validation result that pass through the workspace -
// confirm against the code that uses them.
const MAX_PARAMS_MEM: usize = 1024 * 1024; // 1 MiB
const MAX_CODE_MEM: usize = 16 * 1024 * 1024; // 16 MiB
const MAX_VALIDATION_RESULT_HEADER_MEM: usize = MAX_CODE_MEM + 1024; // 16 MiB + 1 KiB
/// Params header in shared memory. All offsets should be aligned to WASM page size.
///
/// The header is (de)serialized through the derived `Encode`/`Decode` implementations, so the
/// order of the fields below is part of the encoded layout and must not be changed casually.
#[derive(Encode, Decode, Debug)]
struct ValidationHeader {
/// Size of the validation code, in bytes.
code_size: u64,
/// Size of the validation params, in bytes.
// NOTE(review): presumably the length of the SCALE-encoded `ValidationParams` - confirm.
params_size: u64,
}
/// An error that could happen during validation of a candidate.
///
/// Both variants carry the original error rendered as a string, since that is what gets
/// encoded and handed back from the worker to the host. The order of the variants determines
/// their encoded index, so it must stay the same on both sides.
#[derive(Encode, Decode, Debug)]
pub enum WorkerValidationError {
/// The worker failed with an internal error (not caused by the candidate itself).
/// The host reports it as an internal WASM worker error.
InternalError(String),
/// The candidate was found to be invalid. The host reports it as an invalid candidate.
ValidationError(String),
}
/// An enum that is used to marshal a validation result in order to pass it through the shared memory.
#[derive(Encode, Decode, Debug)]
pub enum ValidationResultHeader {