diff --git a/polkadot/node/network/collator-protocol/src/validator_side/mod.rs b/polkadot/node/network/collator-protocol/src/validator_side/mod.rs index c8841213d9a089c88e72bf73cd21c890a0d49bf7..866062b052cde1ec6d19109e79b3315829a81384 100644 --- a/polkadot/node/network/collator-protocol/src/validator_side/mod.rs +++ b/polkadot/node/network/collator-protocol/src/validator_side/mod.rs @@ -66,7 +66,6 @@ const COST_UNEXPECTED_MESSAGE: Rep = Rep::CostMinor("An unexpected message"); const COST_CORRUPTED_MESSAGE: Rep = Rep::CostMinor("Message was corrupt"); /// Network errors that originated at the remote host should have same cost as timeout. const COST_NETWORK_ERROR: Rep = Rep::CostMinor("Some network error"); -const COST_REQUEST_TIMED_OUT: Rep = Rep::CostMinor("A collation request has timed out"); const COST_INVALID_SIGNATURE: Rep = Rep::Malicious("Invalid network message signature"); const COST_REPORT_BAD: Rep = Rep::Malicious("A collator was reported by another subsystem"); const COST_WRONG_PARA: Rep = Rep::Malicious("A collator provided a collation for the wrong para"); @@ -1212,7 +1211,7 @@ async fn poll_requests( if !result.is_ready() { retained_requested.insert(pending_collation.clone()); } - if let CollationFetchResult::Error(rep) = result { + if let CollationFetchResult::Error(Some(rep)) = result { reputation_changes.push((pending_collation.peer_id.clone(), rep)); } } @@ -1333,8 +1332,8 @@ enum CollationFetchResult { /// The collation was fetched successfully. Success, /// An error occurred when fetching a collation or it was invalid. - /// A reputation change should be applied to the peer. - Error(Rep), + /// A given reputation change should be applied to the peer. + Error(Option<Rep>), } impl CollationFetchResult { @@ -1380,7 +1379,19 @@ async fn poll_collation_response( err = ?err, "Collator provided response that could not be decoded" ); - CollationFetchResult::Error(COST_CORRUPTED_MESSAGE) + CollationFetchResult::Error(Some(COST_CORRUPTED_MESSAGE)) + }, + Err(err) if err.is_timed_out() => { + tracing::debug!( + target: LOG_TARGET, + hash = ?pending_collation.relay_parent, + para_id = ?pending_collation.para_id, + peer_id = ?pending_collation.peer_id, + "Request timed out" + ); + // For now we don't want to change reputation on timeout, to mitigate issues like + // this: https://github.com/paritytech/polkadot/issues/4617 + CollationFetchResult::Error(None) }, Err(RequestError::NetworkError(err)) => { tracing::debug!( @@ -1392,24 +1403,21 @@ async fn poll_collation_response( "Fetching collation failed due to network error" ); // A minor decrease in reputation for any network failure seems - // sensible. In theory this could be exploited, by Dosing this node, + // sensible. In theory this could be exploited, by DoSing this node, // which would result in reduced reputation for proper nodes, but the // same can happen for penalties on timeouts, which we also have. - CollationFetchResult::Error(COST_NETWORK_ERROR) + CollationFetchResult::Error(Some(COST_NETWORK_ERROR)) }, - Err(RequestError::Canceled(_)) => { + Err(RequestError::Canceled(err)) => { tracing::debug!( target: LOG_TARGET, hash = ?pending_collation.relay_parent, para_id = ?pending_collation.para_id, peer_id = ?pending_collation.peer_id, - "Request timed out" + err = ?err, + "Canceled should be handled by `is_timed_out` above - this is a bug!" ); - // A minor decrease in reputation for any network failure seems - // sensible. In theory this could be exploited, by Dosing this node, - // which would result in reduced reputation for proper nodes, but the - // same can happen for penalties on timeouts, which we also have. - CollationFetchResult::Error(COST_REQUEST_TIMED_OUT) + CollationFetchResult::Error(None) }, Ok(CollationFetchingResponse::Collation(receipt, _)) if receipt.descriptor().para_id != pending_collation.para_id => @@ -1422,7 +1430,7 @@ async fn poll_collation_response( "Got wrong para ID for requested collation." ); - CollationFetchResult::Error(COST_WRONG_PARA) + CollationFetchResult::Error(Some(COST_WRONG_PARA)) }, Ok(CollationFetchingResponse::Collation(receipt, pov)) => { tracing::debug!( diff --git a/polkadot/node/network/collator-protocol/src/validator_side/tests.rs b/polkadot/node/network/collator-protocol/src/validator_side/tests.rs index 5cf2b35280d700a312ded1b543d91bb192e89de2..e15a7d2eeadd2fe7e46341b5f2f52d026adf81c9 100644 --- a/polkadot/node/network/collator-protocol/src/validator_side/tests.rs +++ b/polkadot/node/network/collator-protocol/src/validator_side/tests.rs @@ -624,17 +624,6 @@ fn fetch_collations_works() { assert_fetch_collation_request(&mut virtual_overseer, second, test_state.chain_ids[0]) .await; - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::NetworkBridge(NetworkBridgeMessage::ReportPeer( - peer, - rep, - )) => { - assert_eq!(peer, peer_b); - assert_eq!(rep, COST_REQUEST_TIMED_OUT); - } - ); - let response_channel_non_exclusive = assert_fetch_collation_request(&mut virtual_overseer, second, test_state.chain_ids[0]) .await; @@ -861,17 +850,6 @@ fn inactive_disconnected() { Delay::new(ACTIVITY_TIMEOUT * 3).await; - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::NetworkBridge(NetworkBridgeMessage::ReportPeer( - peer, - rep, - )) => { - assert_eq!(peer, peer_b); - assert_eq!(rep, COST_REQUEST_TIMED_OUT); - } - ); - assert_collator_disconnect(&mut virtual_overseer, peer_b.clone()).await; virtual_overseer }); @@ -924,17 +902,6 @@ fn activity_extends_life() { advertise_collation(&mut virtual_overseer, peer_b.clone(), hash_b).await; - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::NetworkBridge(NetworkBridgeMessage::ReportPeer( - peer, - rep, - )) => { - assert_eq!(peer, peer_b); - assert_eq!(rep, COST_REQUEST_TIMED_OUT); - } - ); - assert_fetch_collation_request(&mut virtual_overseer, hash_b, test_state.chain_ids[0]) .await; @@ -942,33 +909,11 @@ fn activity_extends_life() { advertise_collation(&mut virtual_overseer, peer_b.clone(), hash_c).await; - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::NetworkBridge(NetworkBridgeMessage::ReportPeer( - peer, - rep, - )) => { - assert_eq!(peer, peer_b); - assert_eq!(rep, COST_REQUEST_TIMED_OUT); - } - ); - assert_fetch_collation_request(&mut virtual_overseer, hash_c, test_state.chain_ids[0]) .await; Delay::new(ACTIVITY_TIMEOUT * 3 / 2).await; - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::NetworkBridge(NetworkBridgeMessage::ReportPeer( - peer, - rep, - )) => { - assert_eq!(peer, peer_b); - assert_eq!(rep, COST_REQUEST_TIMED_OUT); - } - ); - assert_collator_disconnect(&mut virtual_overseer, peer_b.clone()).await; virtual_overseer diff --git a/polkadot/node/network/protocol/src/request_response/outgoing.rs b/polkadot/node/network/protocol/src/request_response/outgoing.rs index e76b7b0eaac2b86844b34ce4603cc5a1870f864f..acb78d06d7b27096120b826de148645afd899c49 100644 --- a/polkadot/node/network/protocol/src/request_response/outgoing.rs +++ b/polkadot/node/network/protocol/src/request_response/outgoing.rs @@ -94,6 +94,20 @@ pub enum RequestError { Canceled(#[source] oneshot::Canceled), } +impl RequestError { + /// Whether the error represents some kind of timeout condition. + pub fn is_timed_out(&self) -> bool { + match self { + Self::Canceled(_) | + Self::NetworkError(network::RequestFailure::Obsolete) | + Self::NetworkError(network::RequestFailure::Network( + network::OutboundFailure::Timeout, + )) => true, + _ => false, + } + } +} + /// A request to be sent to the network bridge, including a sender for sending responses/failures. /// /// The network implementation will make use of that sender for informing the requesting subsystem