# Alerting rules for Polkadot nodes, written in the Prometheus rule-file
# layout: a `groups` list whose entries carry a `name` and a list of `rules`.
#
# NOTE(review): every `- alert:` entry in this file must sit beneath the
# `rules:` key of the group below; confirm the indentation of the entries
# that follow matches it.
groups:
- name: polkadot.rules
  rules:
##############################################################################
# Block production
##############################################################################
- alert: BlockProductionSlow
  # Warning tier: the "best" block height rose by less than 1 over the
  # trailing one-minute window, and that held for 3 minutes.
  expr: 'increase(polkadot_block_height{status="best"}[1m]) < 1'
  for: 3m
  labels:
    severity: warning
  annotations:
    message: >-
      Best block on instance {{ $labels.instance }} increases by
      less than 1 per minute for more than 3 minutes.
- alert: BlockProductionSlow
  # Critical tier: same expression as the warning-level rule of the same
  # name, but the condition has to hold for 10 minutes.
  expr: 'increase(polkadot_block_height{status="best"}[1m]) < 1'
  for: 10m
  labels:
    severity: critical
  annotations:
    message: >-
      Best block on instance {{ $labels.instance }} increases by
      less than 1 per minute for more than 10 minutes.
##############################################################################
# Block finalization
##############################################################################
- alert: BlockFinalizationSlow
  # Warning tier: the "finalized" block height rose by less than 1 over the
  # trailing one-minute window, and that held for 3 minutes.
  expr: 'increase(polkadot_block_height{status="finalized"}[1m]) < 1'
  for: 3m
  labels:
    severity: warning
  annotations:
    message: >-
      Finalized block on instance {{ $labels.instance }} increases by
      less than 1 per minute for more than 3 minutes.
- alert: BlockFinalizationSlow
  # Critical tier: same expression as the warning-level rule of the same
  # name, but the condition has to hold for 10 minutes.
  expr: 'increase(polkadot_block_height{status="finalized"}[1m]) < 1'
  for: 10m
  labels:
    severity: critical
  annotations:
    message: >-
      Finalized block on instance {{ $labels.instance }} increases by
      less than 1 per minute for more than 10 minutes.
- alert: BlockFinalizationLaggingBehind
  # Fires when the "best" height exceeds the "finalized" height by more than
  # 10 blocks for 8 minutes. `ignoring(status)` lets the two series match
  # even though their `status` label differs.
  #
  # Under the assumption of an average block time of 6 seconds, "best" and
  # "finalized" being more than 10 blocks apart would imply more than a
  # 1 minute delay between block production and finalization.
  #
  # The metric name is `polkadot_block_height`, the same gauge the
  # BlockProductionSlow and BlockFinalizationSlow rules query.
  # NOTE(review): confirm this is the name the node actually exports.
  expr: >-
    (polkadot_block_height{status="best"} - ignoring(status)
    polkadot_block_height{status="finalized"}) > 10
  for: 8m
  labels:
    severity: critical
  annotations:
    message: >-
      Block finalization on instance {{ $labels.instance }} is behind
      block production by {{ $value }} for more than 8 minutes.
##############################################################################
# Transaction queue
##############################################################################
- alert: TransactionQueueSizeIncreasing
  # Warning tier: over the trailing 5 minutes, more transaction-pool
  # validations were scheduled than finished, and that held for 10 minutes.
  expr: >-
    increase(polkadot_sub_txpool_validations_scheduled[5m]) -
    increase(polkadot_sub_txpool_validations_finished[5m]) > 0
  for: 10m
  labels:
    severity: warning
  annotations:
    message: >-
      The transaction pool size on node {{ $labels.instance }} has
      been monotonically increasing for more than 10 minutes.
- alert: TransactionQueueSizeIncreasing
  # Critical tier: same expression as the warning-level rule of the same
  # name, but the condition has to hold for 30 minutes.
  expr: >-
    increase(polkadot_sub_txpool_validations_scheduled[5m]) -
    increase(polkadot_sub_txpool_validations_finished[5m]) > 0
  for: 30m
  labels:
    severity: critical
  annotations:
    message: >-
      The transaction pool size on node {{ $labels.instance }} has
      been monotonically increasing for more than 30 minutes.
- alert: TransactionQueueSizeHigh
  # Fires when scheduled validations exceed finished validations by more
  # than 10000 (the difference is reported as the pool size) for 5 minutes.
  expr: >-
    polkadot_sub_txpool_validations_scheduled -
    polkadot_sub_txpool_validations_finished > 10000
  for: 5m
  labels:
    severity: critical
  annotations:
    message: >-
      The transaction pool size on node {{ $labels.instance }} has
      been above 10_000 for more than 5 minutes.
##############################################################################
# Networking
##############################################################################
- alert: NumberOfPeersLow
  # Warning tier: the libp2p peer count stayed below 3 for 3 minutes.
  expr: 'polkadot_sub_libp2p_peers_count < 3'
  for: 3m
  labels:
    severity: warning
  annotations:
    message: >-
      The node {{ $labels.instance }} has less than 3 peers for more
      than 3 minutes
- alert: NumberOfPeersLow
  # Critical tier: the libp2p peer count stayed below 3 for 15 minutes.
  expr: 'polkadot_sub_libp2p_peers_count < 3'
  for: 15m
  labels:
    severity: critical
  annotations:
    message: >-
      The node {{ $labels.instance }} has less than 3 peers for more
      than 15 minutes
##############################################################################
# System
##############################################################################
- alert: NumberOfFileDescriptorsHigh
  # Fires when `node_filefd_allocated` stays above 10000 for 3 minutes.
  # Only series whose `domain` label is "kusama" or "polkadot" are considered.
  expr: >-
    node_filefd_allocated{domain=~"kusama|polkadot"} > 10000
  for: 3m
  labels:
    severity: warning
  annotations:
    message: >-
      The node {{ $labels.instance }} has more than 10_000 file
      descriptors allocated for more than 3 minutes
##############################################################################
# Others
##############################################################################
- alert: ContinuousTaskEnded
  # Selects tasks whose spawned counter equals 1 and whose ended counter
  # also equals 1, paired on `instance` and `task_name`. The subtraction
  # only yields a series where both sides are present, so any result means
  # such a task has ended. The "basic-authorship-proposer" task is left out.
  expr: >-
    (polkadot_tasks_spawned_total{task_name != "basic-authorship-proposer"} == 1)
    - on(instance, task_name) (polkadot_tasks_ended_total == 1)
  for: 5m
  labels:
    severity: warning
  annotations:
    message: >-
      Continuous task {{ $labels.task_name }} on node
      {{ $labels.instance }} ended unexpectedly.
- alert: AuthorityDiscoveryDiscoveryFailureHigh
  # Ratio of value-found handling failures to received DHT events with
  # name="value_found"; fires when it stays above 0.5 for 2 hours.
  # `ignoring(name)` lets the two series match despite the `name` label.
  expr: >-
    polkadot_authority_discovery_handle_value_found_event_failure /
    ignoring(name)
    polkadot_authority_discovery_dht_event_received{name="value_found"} > 0.5
  for: 2h
  labels:
    severity: warning
  annotations:
    message: >-
      Authority discovery on node {{ $labels.instance }} fails to
      process more than 50 % of the values found on the DHT for more than 2
      hours.
- alert: UnboundedChannelPersistentlyLarge
  # Channel backlog taken as "send" minus "received" (matched while
  # ignoring the `action` label). The `or on(instance)` branch adds the raw
  # "send" series for instances that have no such difference. Fires when
  # the value stays at or above 200 for 5 minutes.
  expr: >-
    (
    (polkadot_unbounded_channel_len{action = "send"} -
    ignoring(action) polkadot_unbounded_channel_len{action = "received"})
    or on(instance) polkadot_unbounded_channel_len{action = "send"}
    ) >= 200
  for: 5m
  labels:
    severity: warning
  annotations:
    message: >-
      Channel {{ $labels.entity }} on node {{ $labels.instance }} contains
      more than 200 items for more than 5 minutes. Node might be frozen.
- alert: UnboundedChannelVeryLarge
  # Same backlog expression as UnboundedChannelPersistentlyLarge, compared
  # against a threshold of 5000. No `for` clause is set on this rule.
  expr: >-
    (
    (polkadot_unbounded_channel_len{action = "send"} -
    ignoring(action) polkadot_unbounded_channel_len{action = "received"})
    or on(instance) polkadot_unbounded_channel_len{action = "send"}
    ) > 5000
  labels:
    severity: warning
  annotations:
    message: >-
      Channel {{ $labels.entity }} on node {{ $labels.instance }} contains more than
      5000 items.