# alerting-rules.yaml
# Rule groups. The layout (groups -> name / rules -> alert, expr, for, labels,
# annotations) matches the Prometheus rule-file format.
groups:
- name: polkadot.rules
  # Every list item below is one alert rule of this group.
  rules:

  ##############################################################################
  # Block production
  ##############################################################################

  - alert: BlockProductionSlow
    # Warning tier: increase() of the best block height over a 1m window is
    # below 1, and that condition has held for 3 minutes.
    annotations:
      message: 'Best block on instance {{ $labels.instance }} increases by
      less than 1 per minute for more than 3 minutes.'
    expr: increase(polkadot_block_height{status="best"}[1m]) < 1
    for: 3m
    labels:
      severity: warning
  - alert: BlockProductionSlow
    # Critical tier: same expression as the warning rule of the same name,
    # but the condition must hold for 10 minutes.
    annotations:
      message: 'Best block on instance {{ $labels.instance }} increases by
      less than 1 per minute for more than 10 minutes.'
    expr: increase(polkadot_block_height{status="best"}[1m]) < 1
    for: 10m
    labels:
      severity: critical

  ##############################################################################
  # Block finalization
  ##############################################################################

  - alert: BlockFinalizationSlow
    # Warning tier: increase() of the finalized block height over a 1m window
    # is below 1, sustained for 3 minutes.
    expr: 'increase(polkadot_block_height{status="finalized"}[1m]) < 1'
    for: 3m
    labels:
      severity: warning
    annotations:
      message: >-
        Finalized block on instance {{ $labels.instance }} increases by
        less than 1 per minute for more than 3 minutes.
  - alert: BlockFinalizationSlow
    # Critical tier: same expression as the warning rule of the same name,
    # sustained for 10 minutes.
    expr: 'increase(polkadot_block_height{status="finalized"}[1m]) < 1'
    for: 10m
    labels:
      severity: critical
    annotations:
      message: >-
        Finalized block on instance {{ $labels.instance }} increases by
        less than 1 per minute for more than 10 minutes.
  - alert: BlockFinalizationLaggingBehind
    # Under the assumption of an average block production of 6 seconds,
    # "best" and "finalized" being more than 10 blocks apart would imply
    # more than a 1 minute delay between block production and finalization.
    #
    # Uses the same `polkadot_block_height` metric as the other block-height
    # rules in this file; `ignoring(status)` lets the "best" and "finalized"
    # series be matched against each other despite their differing label.
    expr: '(polkadot_block_height{status="best"} - ignoring(status)
    polkadot_block_height{status="finalized"}) > 10'
    for: 8m
    labels:
      severity: critical
    annotations:
      message: "Block finalization on instance {{ $labels.instance }} is behind
      block production by {{ $value }} for more than 8 minutes."

  ##############################################################################
  # Transaction queue
  ##############################################################################

  - alert: TransactionQueueSizeIncreasing
    # Warning tier: over a 5m window the scheduled-validations counter grew by
    # more than the finished-validations counter, sustained for 10 minutes.
    expr: >-
      increase(polkadot_sub_txpool_validations_scheduled[5m]) -
      increase(polkadot_sub_txpool_validations_finished[5m]) > 0
    for: 10m
    labels:
      severity: warning
    annotations:
      message: >-
        The transaction pool size on node {{ $labels.instance }} has
        been monotonically increasing for more than 10 minutes.
  - alert: TransactionQueueSizeIncreasing
    # Critical tier: same expression as the warning rule of the same name,
    # sustained for 30 minutes.
    expr: >-
      increase(polkadot_sub_txpool_validations_scheduled[5m]) -
      increase(polkadot_sub_txpool_validations_finished[5m]) > 0
    for: 30m
    labels:
      severity: critical
    annotations:
      message: >-
        The transaction pool size on node {{ $labels.instance }} has
        been monotonically increasing for more than 30 minutes.
  - alert: TransactionQueueSizeHigh
    # Scheduled minus finished validations exceeds 10000, sustained for
    # 5 minutes.
    expr: >-
      polkadot_sub_txpool_validations_scheduled -
      polkadot_sub_txpool_validations_finished > 10000
    for: 5m
    labels:
      severity: critical
    annotations:
      message: >-
        The transaction pool size on node {{ $labels.instance }} has
        been above 10_000 for more than 5 minutes.

  ##############################################################################
  # Networking
  ##############################################################################

  - alert: NumberOfPeersLow
    # Warning tier: peer count below 3, sustained for 3 minutes.
    expr: 'polkadot_sub_libp2p_peers_count < 3'
    for: 3m
    labels:
      severity: warning
    annotations:
      message: >-
        The node {{ $labels.instance }} has less than 3 peers for more
        than 3 minutes
  - alert: NumberOfPeersLow
    # Critical tier: peer count below 3, sustained for 15 minutes.
    expr: 'polkadot_sub_libp2p_peers_count < 3'
    for: 15m
    labels:
      severity: critical
    annotations:
      message: >-
        The node {{ $labels.instance }} has less than 3 peers for more
        than 15 minutes
  ##############################################################################
  # System
  ##############################################################################

  - alert: NumberOfFileDescriptorsHigh
    # Allocated file descriptors above 10000 on series whose `domain` label
    # is exactly "kusama" or "polkadot", sustained for 3 minutes.
    expr: >-
      node_filefd_allocated{domain=~"kusama|polkadot"} > 10000
    for: 3m
    labels:
      severity: warning
    annotations:
      message: >-
        The node {{ $labels.instance }} has more than 10_000 file
        descriptors allocated for more than 3 minutes

  ##############################################################################
  # Others
  ##############################################################################

  - alert: ContinuousTaskEnded
    # Pairs, per (instance, task_name), a spawned-total of exactly 1 with an
    # ended-total of exactly 1; the task "basic-authorship-proposer" is
    # excluded. Any resulting series that persists for 5 minutes alerts.
    expr: >-
      (polkadot_tasks_spawned_total{task_name != "basic-authorship-proposer"} == 1)
      - on(instance, task_name) (polkadot_tasks_ended_total == 1)
    for: 5m
    labels:
      severity: warning
    annotations:
      message: >-
        Continuous task {{ $labels.task_name }} on node
        {{ $labels.instance }} ended unexpectedly.

  - alert: AuthorityDiscoveryDiscoveryFailureHigh
    # Ratio of value-found handling failures to received "value_found" DHT
    # events; `ignoring(name)` lets the two series match even though only the
    # denominator is selected by a `name` label. Fires above 0.5 (50 %).
    expr: 'polkadot_authority_discovery_handle_value_found_event_failure /
    ignoring(name)
    polkadot_authority_discovery_dht_event_received{name="value_found"} > 0.5'
    # Duration taken from the message text ("for more than 2 hours").
    for: 2h
    labels:
      # NOTE(review): severity was absent from this rule; "warning" is an
      # assumption - confirm the intended tier.
      severity: warning
    annotations:
      message: 'Authority discovery on node {{ $labels.instance }} fails to
      process more than 50 % of the values found on the DHT for more than 2
      hours.'

  - alert: UnboundedChannelPersistentlyLarge
    # Channel length is taken as the "send" series minus the "received"
    # series; the `or on(instance)` arm adds the raw "send" series for
    # instances that produced no such difference. Threshold: >= 200 for 5m.
    # All lines of the folded scalar share one indent so they join with
    # single spaces.
    expr: >-
      (
      (polkadot_unbounded_channel_len{action = "send"} -
      ignoring(action) polkadot_unbounded_channel_len{action = "received"})
      or on(instance) polkadot_unbounded_channel_len{action = "send"}
      ) >= 200
    for: 5m
    labels:
      severity: warning
    annotations:
      message: >-
        Channel {{ $labels.entity }} on node {{ $labels.instance }} contains
        more than 200 items for more than 5 minutes. Node might be frozen.

  - alert: UnboundedChannelVeryLarge
    # Same length expression as UnboundedChannelPersistentlyLarge ("send"
    # minus "received", or the raw "send" series per instance), compared
    # against > 5000. No `for:` is set on this rule.
    # All lines of the folded scalar share one indent so they join with
    # single spaces.
    expr: >-
      (
      (polkadot_unbounded_channel_len{action = "send"} -
      ignoring(action) polkadot_unbounded_channel_len{action = "received"})
      or on(instance) polkadot_unbounded_channel_len{action = "send"}
      ) > 5000
    labels:
      severity: warning
    annotations:
      message: >-
        Channel {{ $labels.entity }} on node {{ $labels.instance }} contains more than
        5000 items.