Unverified Commit b651e4af authored by asynchronous rob's avatar asynchronous rob Committed by GitHub
Browse files

Implement PoV Distribution Subsystem (#1344)

* introduce candidatedescriptor type

* add PoVDistribution message type

* loosen bound on PoV Distribution to account for equivocations

* re-export some types from the messages module

* begin PoV Distribution subsystem

* remove redundant index from PoV distribution

* define state machine for pov distribution

* handle overseer signals

* set up control flow

* remove `ValidatorStatement` section

* implement PoV fetching

* implement distribution logic

* add missing `

* implement some network bridge event handlers

* stub for message processing, handle our view change

* control flow for handling messages

* handle `awaiting` message

* handle any incoming PoVs and redistribute

* actually provide a subsystem implementation

* remove set-builder notation

* begin testing PoV distribution

* test that we send awaiting messages only to peers with same view

* ensure we distribute awaited PoVs to peers on view changes

* test that peers can complete fetch and are rewarded

* test some reporting logic

* ensure peer is reported for flooding

* test punishing peers diverging from awaited protocol

* test that we eagerly complete peers' awaited PoVs based on what we receive

* test that we prune the awaited set after receiving

* expand pov-distribution in guide to match a change I made

* remove unneeded import
parent edd8f3b2
Pipeline #99861 passed with stages
in 24 minutes and 49 seconds
...@@ -4540,6 +4540,25 @@ dependencies = [ ...@@ -4540,6 +4540,25 @@ dependencies = [
"sp-wasm-interface", "sp-wasm-interface",
] ]
[[package]]
name = "polkadot-pov-distribution"
version = "0.1.0"
dependencies = [
"assert_matches",
"futures 0.3.5",
"futures-timer 3.0.2",
"log 0.4.8",
"parity-scale-codec",
"parking_lot 0.10.2",
"polkadot-node-primitives",
"polkadot-node-subsystem",
"polkadot-primitives",
"polkadot-subsystem-test-helpers",
"sc-network",
"sp-runtime",
"streamunordered",
]
[[package]] [[package]]
name = "polkadot-primitives" name = "polkadot-primitives"
version = "0.8.14" version = "0.8.14"
......
...@@ -45,6 +45,7 @@ members = [ ...@@ -45,6 +45,7 @@ members = [
"node/core/proposer", "node/core/proposer",
"node/network/bridge", "node/network/bridge",
"node/network/pov-distribution",
"node/network/statement-distribution", "node/network/statement-distribution",
"node/overseer", "node/overseer",
"node/primitives", "node/primitives",
......
Stub - This folder will hold networking subsystem implementations, each with their own crate. This folder holds all networking subsystem implementations, each with their own crate.
[package]
name = "polkadot-pov-distribution"
version = "0.1.0"
authors = ["Parity Technologies <admin@parity.io>"]
edition = "2018"
[dependencies]
futures = "0.3.5"
log = "0.4.8"
futures-timer = "3.0.2"
streamunordered = "0.5.1"
polkadot-primitives = { path = "../../../primitives" }
node-primitives = { package = "polkadot-node-primitives", path = "../../primitives" }
parity-scale-codec = "1.3.0"
sc-network = { git = "https://github.com/paritytech/substrate", branch = "master" }
sp-runtime = { git = "https://github.com/paritytech/substrate", branch = "master" }
polkadot-subsystem = { package = "polkadot-node-subsystem", path = "../../subsystem" }
[dev-dependencies]
parking_lot = "0.10.0"
subsystem-test = { package = "polkadot-subsystem-test-helpers", path = "../../test-helpers/subsystem" }
assert_matches = "1.3.0"
This diff is collapsed.
...@@ -28,11 +28,14 @@ use polkadot_primitives::{BlockNumber, Hash, Signature}; ...@@ -28,11 +28,14 @@ use polkadot_primitives::{BlockNumber, Hash, Signature};
use polkadot_primitives::parachain::{ use polkadot_primitives::parachain::{
AbridgedCandidateReceipt, PoVBlock, ErasureChunk, BackedCandidate, Id as ParaId, AbridgedCandidateReceipt, PoVBlock, ErasureChunk, BackedCandidate, Id as ParaId,
SignedAvailabilityBitfield, SigningContext, ValidatorId, ValidationCode, ValidatorIndex, SignedAvailabilityBitfield, SigningContext, ValidatorId, ValidationCode, ValidatorIndex,
CandidateDescriptor,
}; };
use polkadot_node_primitives::{ use polkadot_node_primitives::{
MisbehaviorReport, SignedFullStatement, View, ProtocolId, MisbehaviorReport, SignedFullStatement, View, ProtocolId,
}; };
use std::sync::Arc;
pub use sc_network::{ObservedRole, ReputationChange, PeerId}; pub use sc_network::{ObservedRole, ReputationChange, PeerId};
/// A notification of a new backed candidate. /// A notification of a new backed candidate.
...@@ -214,6 +217,21 @@ pub enum ProvisionerMessage { ...@@ -214,6 +217,21 @@ pub enum ProvisionerMessage {
ProvisionableData(ProvisionableData), ProvisionableData(ProvisionableData),
} }
/// Message to the PoV Distribution Subsystem.
///
/// Covers requests to fetch a PoV, requests to distribute a PoV, and network-bridge
/// updates forwarded to the subsystem.
#[derive(Debug)]
pub enum PoVDistributionMessage {
/// Fetch a PoV from the network.
///
/// This `CandidateDescriptor` should correspond to a candidate seconded under the provided
/// relay-parent hash.
///
/// Fields, in order: the relay-parent hash, the candidate descriptor, and the response
/// channel on which the (shared) PoV is sent.
FetchPoV(Hash, CandidateDescriptor, oneshot::Sender<Arc<PoVBlock>>),
/// Distribute a PoV for the given relay-parent and `CandidateDescriptor`.
///
/// The PoV should correctly hash to the PoV hash mentioned in the `CandidateDescriptor`.
///
/// Fields, in order: the relay-parent hash, the candidate descriptor, and the PoV itself.
DistributePoV(Hash, CandidateDescriptor, Arc<PoVBlock>),
/// An update from the network bridge.
NetworkBridgeUpdate(NetworkBridgeEvent),
}
/// A message type tying together all message types that are used across Subsystems. /// A message type tying together all message types that are used across Subsystems.
#[derive(Debug)] #[derive(Debug)]
pub enum AllMessages { pub enum AllMessages {
...@@ -231,6 +249,8 @@ pub enum AllMessages { ...@@ -231,6 +249,8 @@ pub enum AllMessages {
BitfieldDistribution(BitfieldDistributionMessage), BitfieldDistribution(BitfieldDistributionMessage),
/// Message for the Provisioner subsystem. /// Message for the Provisioner subsystem.
Provisioner(ProvisionerMessage), Provisioner(ProvisionerMessage),
/// Message for the PoV Distribution subsystem.
PoVDistribution(PoVDistributionMessage),
/// Message for the Runtime API subsystem. /// Message for the Runtime API subsystem.
RuntimeApi(RuntimeApiMessage), RuntimeApi(RuntimeApiMessage),
/// Message for the availability store subsystem. /// Message for the availability store subsystem.
......
...@@ -460,6 +460,17 @@ impl AbridgedCandidateReceipt { ...@@ -460,6 +460,17 @@ impl AbridgedCandidateReceipt {
pov_block_hash: *pov_block_hash, pov_block_hash: *pov_block_hash,
} }
} }
/// Build a lightweight `CandidateDescriptor` from this `AbridgedCandidateReceipt`.
///
/// The para ID, relay-parent and PoV-block hash are copied; the collator ID and the
/// collator signature are cloned. The receipt itself is left untouched.
pub fn to_descriptor(&self) -> CandidateDescriptor {
	CandidateDescriptor {
		para_id: self.parachain_index,
		relay_parent: self.relay_parent,
		collator: self.collator.clone(),
		signature: self.signature.clone(),
		// Same hash type as `relay_parent`, which is already copied out of `&self`
		// above, so an explicit `.clone()` is redundant here.
		pov_hash: self.pov_block_hash,
	}
}
} }
...@@ -478,6 +489,26 @@ impl Ord for AbridgedCandidateReceipt { ...@@ -478,6 +489,26 @@ impl Ord for AbridgedCandidateReceipt {
} }
} }
/// A unique descriptor of the candidate receipt, in a lightweight format.
///
/// The type parameter `H` is the hash type used for both the relay-parent and the
/// PoV-block hash; it defaults to `Hash`.
// `Debug` and `Default` are only derived when the `std` feature is enabled.
#[derive(PartialEq, Eq, Clone, Encode, Decode)]
#[cfg_attr(feature = "std", derive(Debug, Default))]
pub struct CandidateDescriptor<H = Hash> {
/// The ID of the para this is a candidate for.
pub para_id: Id,
/// The hash of the relay-chain block this should be executed in
/// the context of.
// NOTE: the fact that the hash includes this value means that code depends
// on this for deduplication. Removing this field is likely to break things.
pub relay_parent: H,
/// The collator's relay-chain account ID.
pub collator: CollatorId,
/// Signature on blake2-256 of components of this receipt:
/// The para ID, the relay parent, and the `pov_hash`.
pub signature: CollatorSignature,
/// The hash of the pov-block.
pub pov_hash: H,
}
/// A collation sent by a collator. /// A collation sent by a collator.
#[derive(PartialEq, Eq, Clone, Encode, Decode)] #[derive(PartialEq, Eq, Clone, Encode, Decode)]
#[cfg_attr(feature = "std", derive(Debug, Default))] #[cfg_attr(feature = "std", derive(Debug, Default))]
......
...@@ -26,11 +26,13 @@ This protocol is described in terms of "us" and our peers, with the understandin ...@@ -26,11 +26,13 @@ This protocol is described in terms of "us" and our peers, with the understandin
As we are gossiping, we need to track which PoVs our peers are waiting for to avoid sending them data that they are not expecting. It is not reasonable to expect our peers to buffer unexpected PoVs, just as we will not buffer unexpected PoVs. So notifying our peers about what is being awaited is key. However it is important that the notifications system is also bounded. As we are gossiping, we need to track which PoVs our peers are waiting for to avoid sending them data that they are not expecting. It is not reasonable to expect our peers to buffer unexpected PoVs, just as we will not buffer unexpected PoVs. So notifying our peers about what is being awaited is key. However it is important that the notifications system is also bounded.
For this, in order to avoid reaching into the internals of the [Statement Distribution](statement-distribution.md) Subsystem, we can rely on an expected property of candidate backing: that each validator can only second one candidate at each chain head. So we can set a cap on the number of PoVs each peer is allowed to notify us that they are waiting for at a given relay-parent. This cap will be the number of validators at that relay-parent. And the view update mechanism of the [Network Bridge](../utility/network-bridge.md) ensures that peers are only allowed to consider a certain set of relay-parents as live. So this bounding mechanism caps the amount of data we need to store per peer at any time at `sum({ n_validators_at_head(head) | head in view_heads })`. Additionally, peers should only be allowed to notify us of PoV hashes they are waiting for in the context of relay-parents in our own local view, which means that `n_validators_at_head` is implied to be `0` for relay-parents not in our own local view. For this, in order to avoid reaching into the internals of the [Statement Distribution](statement-distribution.md) Subsystem, we can rely on an expected property of candidate backing: that each validator can second up to 2 candidates per chain head. This will typically be only one, because they are only supposed to issue one, but they can equivocate if they are willing to be slashed. So we can set a cap on the number of PoVs each peer is allowed to notify us that they are waiting for at a given relay-parent. This cap will be twice the number of validators at that relay-parent. In practice, this is a very lax upper bound that can be reduced much further if desired.
The view update mechanism of the [Network Bridge](../utility/network-bridge.md) ensures that peers are only allowed to consider a certain set of relay-parents as live. So this bounding mechanism caps the amount of data we need to store per peer at any time at `sum({ 2 * n_validators_at_head(head) * sizeof(hash) for head in view_heads })`. Additionally, peers should only be allowed to notify us of PoV hashes they are waiting for in the context of relay-parents in our own local view, which means that `n_validators_at_head` is implied to be `0` for relay-parents not in our own local view.
View updates from peers and our own view updates are received from the network bridge. These will lag somewhat behind the `StartWork` and `StopWork` messages received from the overseer, which will influence the actual data we store. The `OurViewUpdate`s from the [`NetworkBridgeEvent`](../../types/overseer-protocol.md#network-bridge-update) must be considered canonical in terms of our peers' perception of us. View updates from peers and our own view updates are received from the network bridge. These will lag somewhat behind the `StartWork` and `StopWork` messages received from the overseer, which will influence the actual data we store. The `OurViewUpdate`s from the [`NetworkBridgeEvent`](../../types/overseer-protocol.md#network-bridge-update) must be considered canonical in terms of our peers' perception of us.
Lastly, the system needs to be bootstrapped with our own perception of which PoVs we are cognizant of but awaiting data for. This is done by receipt of the [`PoVDistributionMessage`](../../types/overseer-protocol.md#pov-distribution-message)::ValidatorStatement variant. We can ignore anything except for `Seconded` statements. Lastly, the system needs to be bootstrapped with our own perception of which PoVs we are cognizant of but awaiting data for. This is done by receipt of the [`PoVDistributionMessage`](../../types/overseer-protocol.md#pov-distribution-message)::FetchPoV variant. Proper operation of this subsystem depends on the descriptors passed faithfully representing candidates which have been seconded by other validators.
## Formal Description ## Formal Description
...@@ -45,7 +47,6 @@ struct State { ...@@ -45,7 +47,6 @@ struct State {
struct BlockBasedState { struct BlockBasedState {
known: Map<Hash, PoV>, // should be a shared PoV in practice. these things are heavy. known: Map<Hash, PoV>, // should be a shared PoV in practice. these things are heavy.
awaited: Set<Hash>, // awaited PoVs by blake2-256 hash.
fetching: Map<Hash, [ResponseChannel<PoV>]>, fetching: Map<Hash, [ResponseChannel<PoV>]>,
n_validators: usize, n_validators: usize,
} }
...@@ -79,14 +80,11 @@ Here is the logic of the state machine: ...@@ -79,14 +80,11 @@ Here is the logic of the state machine:
- On `Concluded`: conclude. - On `Concluded`: conclude.
*PoV Distribution Messages* *PoV Distribution Messages*
- On `ValidatorStatement(relay_parent, statement)`
- If this is not `Statement::Seconded`, ignore.
- If there is an entry under `relay_parent` in `relay_parent_state`, add the `pov_hash` of the seconded Candidate's [`CandidateDescriptor`](../../types/candidate.md#candidate-descriptor) to the `awaited` set of the entry.
- If the `pov_hash` was not previously awaited and there are `n_validators` or fewer entries in the `awaited` set, send `NetworkMessage::Awaiting(relay_parent, vec![pov_hash])` to all peers.
- On `FetchPoV(relay_parent, descriptor, response_channel)` - On `FetchPoV(relay_parent, descriptor, response_channel)`
- If there is no entry in `relay_parent_state` under `relay_parent`, ignore. - If there is no entry in `relay_parent_state` under `relay_parent`, ignore.
- If there is a PoV under `descriptor.pov_hash` in the `known` map, send that PoV on the channel and return. - If there is a PoV under `descriptor.pov_hash` in the `known` map, send that PoV on the channel and return.
- Otherwise, place the `response_channel` in the `fetching` map under `descriptor.pov_hash`. - Otherwise, place the `response_channel` in the `fetching` map under `descriptor.pov_hash`.
- If the `pov_hash` had no previous entry in `fetching` and there are `2 * n_validators` or fewer entries in the `fetching` map, send `NetworkMessage::Awaiting(relay_parent, vec![pov_hash])` to all peers.
- On `DistributePoV(relay_parent, descriptor, PoV)` - On `DistributePoV(relay_parent, descriptor, PoV)`
- If there is no entry in `relay_parent_state` under `relay_parent`, ignore. - If there is no entry in `relay_parent_state` under `relay_parent`, ignore.
- Complete and remove any channels under `descriptor.pov_hash` in the `fetching` map. - Complete and remove any channels under `descriptor.pov_hash` in the `fetching` map.
...@@ -96,26 +94,28 @@ Here is the logic of the state machine: ...@@ -96,26 +94,28 @@ Here is the logic of the state machine:
*Network Bridge Updates* *Network Bridge Updates*
- On `PeerConnected(peer_id, observed_role)` - On `PeerConnected(peer_id, observed_role)`
- Make a fresh entry in the `peer_state` map for the `peer_id`. - Make a fresh entry in the `peer_state` map for the `peer_id`.
- On `PeerDisconnected(peer_id) - On `PeerDisconnected(peer_id)`
- Remove the entry for `peer_id` from the `peer_state` map. - Remove the entry for `peer_id` from the `peer_state` map.
- On `PeerMessage(peer_id, bytes)` - On `PeerMessage(peer_id, bytes)`
- If the bytes do not decode to a `NetworkMessage` or the `peer_id` has no entry in the `peer_state` map, report and ignore. - If the bytes do not decode to a `NetworkMessage` or the `peer_id` has no entry in the `peer_state` map, report and ignore.
- If this is `NetworkMessage::Awaiting(relay_parent, pov_hashes)`: - If this is `NetworkMessage::Awaiting(relay_parent, pov_hashes)`:
- If there is no entry under `peer_state.awaited` for the `relay_parent`, report and ignore. - If there is no entry under `peer_state.awaited` for the `relay_parent`, report and ignore.
- If `relay_parent` is not contained within `our_view`, report and ignore. - If `relay_parent` is not contained within `our_view`, report and ignore.
- Otherwise, if the `awaited` map combined with the `pov_hashes` would have more than `relay_parent_state[relay_parent].n_validators` entries, report and ignore. Note that we are leaning on the property of the network bridge that it sets our view based on `StartWork` messages. - Otherwise, if the peer's `awaited` map combined with the `pov_hashes` would have more than `2 * relay_parent_state[relay_parent].n_validators` entries, report and ignore. Note that we are leaning on the property of the network bridge that it sets our view based on `StartWork` messages.
- For each new `pov_hash` in `pov_hashes`, if there is a `pov` under `pov_hash` in the `known` map, send the peer a `NetworkMessage::SendPoV(relay_parent, pov_hash, pov)`. - For each new `pov_hash` in `pov_hashes`, if there is a `pov` under `pov_hash` in the `known` map, send the peer a `NetworkMessage::SendPoV(relay_parent, pov_hash, pov)`.
- Otherwise, add the `pov_hash` to the `awaited` map - Otherwise, add the `pov_hash` to the `awaited` map
- If this is `NetworkMessage::SendPoV(relay_parent, pov_hash, pov)`: - If this is `NetworkMessage::SendPoV(relay_parent, pov_hash, pov)`:
- If there is no entry under `relay_parent` in `relay_parent_state` or no entry under `pov_hash` in our `awaited` map for that `relay_parent`, report and ignore. - If there is no entry under `relay_parent` in `relay_parent_state` or no entry under `pov_hash` in our `fetching` map for that `relay_parent`, report and ignore.
- If the blake2-256 hash of the pov doesn't equal `pov_hash`, report and ignore. - If the blake2-256 hash of the pov doesn't equal `pov_hash`, report and ignore.
- Complete and remove any listeners in the `fetching` map under `pov_hash`. - Complete and remove any listeners in the `fetching` map under `pov_hash`. However, leave an empty set of listeners in the `fetching` map to denote that this was something we once awaited. This will allow us to recognize peers who have sent us something we were expecting, but just a little late.
- Add to `known` map. - Add to `known` map.
- Remove the `pov_hash` from the `peer.awaited` map, if any.
- Send `NetworkMessage::SendPoV(relay_parent, descriptor.pov_hash, PoV)` to all peers who have the `descriptor.pov_hash` in the set under `relay_parent` in the `peer.awaited` map and remove the entry from `peer.awaited`. - Send `NetworkMessage::SendPoV(relay_parent, descriptor.pov_hash, PoV)` to all peers who have the `descriptor.pov_hash` in the set under `relay_parent` in the `peer.awaited` map and remove the entry from `peer.awaited`.
- On `PeerViewChange(peer_id, view)` - On `PeerViewChange(peer_id, view)`
- If Peer is unknown, ignore. - If Peer is unknown, ignore.
- Ensure there is an entry under `relay_parent` for each `relay_parent` in `view` within the `peer.awaited` map, creating blank `awaited` lists as necessary. - Ensure there is an entry under `relay_parent` for each `relay_parent` in `view` within the `peer.awaited` map, creating blank `awaited` lists as necessary.
- Remove all entries under `peer.awaited` that are not within `view`. - Remove all entries under `peer.awaited` that are not within `view`.
- For all hashes in `view` that were not within the old view, send the peer all the keys in our `fetching` map under the block-based state for that hash - i.e. notify the peer of everything we are awaiting at that hash.
- On `OurViewChange(view)` - On `OurViewChange(view)`
- Update `our_view` to `view` - Update `our_view` to `view`
...@@ -172,11 +172,10 @@ If this subsystem chooses to second a parachain block, it dispatches a `Candidat ...@@ -172,11 +172,10 @@ If this subsystem chooses to second a parachain block, it dispatches a `Candidat
```rust ```rust
enum PoVDistributionMessage { enum PoVDistributionMessage {
/// Note a statement by a validator on a relay-parent. `Seconded` statements must always
/// have been passed in before `Valid` or `Invalid` statements.
ValidatorStatement(Hash, SignedFullStatement),
/// Fetch a PoV from the network. /// Fetch a PoV from the network.
/// (relay_parent, PoV-hash, Response channel). ///
/// This `CandidateDescriptor` should correspond to a candidate seconded under the provided
/// relay-parent hash.
FetchPoV(Hash, CandidateDescriptor, ResponseChannel<PoV>), FetchPoV(Hash, CandidateDescriptor, ResponseChannel<PoV>),
/// Distribute a PoV for the given relay-parent and CandidateDescriptor. /// Distribute a PoV for the given relay-parent and CandidateDescriptor.
/// The PoV should correctly hash to the PoV hash mentioned in the CandidateDescriptor /// The PoV should correctly hash to the PoV hash mentioned in the CandidateDescriptor
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment