# Prometheus alerting rules for Polkadot nodes.
---
groups:
# Single rule group holding every Polkadot node alert defined in this file.
# Most alerts come in pairs sharing one name and expression: a `warning` that
# fires after a short `for` window and a `critical` that fires after a longer
# one.
- name: polkadot.rules
  rules:

  ##############################################################################
  # Resource usage
  ##############################################################################

  # CPU usage metric at or above 100 for 5 minutes.
  - alert: HighCPUUsage
    expr: polkadot_cpu_usage_percentage >= 100
    for: 5m
    labels:
      severity: warning
    annotations:
      message: 'The node {{ $labels.instance }} has a CPU usage higher than 100% for more than 5 minutes'

  ##############################################################################
  # Block production
  ##############################################################################

  # Best block height grew by less than 1 over the last minute,
  # sustained for 3 minutes.
  - alert: LowNumberOfNewBlocks
    expr: increase(polkadot_block_height{status="best"}[1m]) < 1
    for: 3m
    labels:
      severity: warning
    annotations:
      message: 'Less than one new block per minute on instance {{ $labels.instance }}.'

  # Same condition as above, escalated once it has held for 10 minutes.
  - alert: LowNumberOfNewBlocks
    expr: increase(polkadot_block_height{status="best"}[1m]) < 1
    for: 10m
    labels:
      severity: critical
    annotations:
      message: 'Less than one new block per minute on instance {{ $labels.instance }}.'

  ##############################################################################
  # Block finalization
  ##############################################################################

  # Finalized block height grew by less than 1 over the last minute,
  # sustained for 3 minutes.
  - alert: BlockFinalizationSlow
    expr: increase(polkadot_block_height{status="finalized"}[1m]) < 1
    for: 3m
    labels:
      severity: warning
    annotations:
      message: 'Finalized block on instance {{ $labels.instance }} increases by less than 1 per minute.'

  # Same condition as above, escalated once it has held for 10 minutes.
  - alert: BlockFinalizationSlow
    expr: increase(polkadot_block_height{status="finalized"}[1m]) < 1
    for: 10m
    labels:
      severity: critical
    annotations:
      message: 'Finalized block on instance {{ $labels.instance }} increases by less than 1 per minute.'

  # Under the assumption of an average block production of 6 seconds,
  # "best" and "finalized" being more than 10 blocks apart would imply
  # more than a 1 minute delay between block production and finalization.
  #
  # `ignoring(status)` lets the "best" and "finalized" series of the same
  # instance be matched even though their `status` labels differ.
  # The metric name is the same `polkadot_block_height` used by the
  # block production / finalization rules in this group.
  - alert: BlockFinalizationLaggingBehind
    expr: >-
      (polkadot_block_height{status="best"} - ignoring(status)
      polkadot_block_height{status="finalized"}) > 10
    for: 8m
    labels:
      severity: critical
    annotations:
      message: >-
        Block finalization on instance {{ $labels.instance }} is behind block
        production by {{ $value }} for more than 8m

  ##############################################################################
  # Transaction queue
  ##############################################################################

  # Scheduled minus finished transaction-pool validations above 10
  # for 10 minutes.
  - alert: TransactionQueueSize
    expr: polkadot_sub_txpool_validations_scheduled - polkadot_sub_txpool_validations_finished > 10
    for: 10m
    labels:
      severity: warning
    annotations:
      message: 'The node {{ $labels.instance }} has more than 10 transactions in the queue for more than 10 minutes'

  # Same condition as above, escalated once it has held for 30 minutes.
  - alert: TransactionQueueSize
    expr: polkadot_sub_txpool_validations_scheduled - polkadot_sub_txpool_validations_finished > 10
    for: 30m
    labels:
      severity: critical
    annotations:
      message: 'The node {{ $labels.instance }} has more than 10 transactions in the queue for more than 30 minutes'

  ##############################################################################
  # Networking
  ##############################################################################

  # Peer count below 3 for 3 minutes.
  - alert: LowNumberOfPeers
    expr: polkadot_sub_libp2p_peers_count < 3
    for: 3m
    labels:
      severity: warning
    annotations:
      message: 'The node {{ $labels.instance }} has less than 3 peers for more than 3 minutes'

  # Same condition as above, escalated once it has held for 15 minutes.
  - alert: LowNumberOfPeers
    expr: polkadot_sub_libp2p_peers_count < 3
    for: 15m
    labels:
      severity: critical
    annotations:
      message: 'The node {{ $labels.instance }} has less than 3 peers for more than 15 minutes'

  ##############################################################################
  # Others
  ##############################################################################

  # Failure metric divided by the number of `value_found` DHT events received
  # stays above 0.5 for 2 hours. `ignoring(name)` is needed because only the
  # right-hand series is selected by a `name` label.
  - alert: AuthorityDiscoveryHighDiscoveryFailure
    expr: >-
      polkadot_authority_discovery_handle_value_found_event_failure
      / ignoring(name)
      polkadot_authority_discovery_dht_event_received{name="value_found"} > 0.5
    for: 2h
    labels:
      severity: warning
    annotations:
      message: >-
        Authority discovery on node {{ $labels.instance }} fails to process more
        than 50 % of the values found on the DHT.