-template: 5min_cpu_pcent
+template: 10min_cpu_usage
on: system.cpu
- lookup: average -5m unaligned of user,system,nice,softirq,irq,guest,guest_nice
+ lookup: average -10m unaligned of user,system,nice,softirq,irq,guest,guest_nice
every: 1m
- warn: $this > 90
+ warn: $this > 80
+ crit: $this > 90
units: %
- info: average cpu utilization for the last 5 minutes
+ info: average cpu utilization for the last 10 minutes
to: sysadmin
-template: 5min_iowait_cpu_pcent
+template: 10min_cpu_iowait
on: system.cpu
- lookup: average -5m unaligned of iowait
+ lookup: average -10m unaligned of iowait
every: 1m
warn: $this > 10
+ crit: $this > 30
units: %
- info: average wait I/O for the last 5 minutes
+ info: average CPU wait I/O for the last 10 minutes
to: sysadmin
-template: 20min_steal_cpu_pcent
+template: 20min_steal_cpu
on: system.cpu
lookup: average -20m unaligned of steal
every: 5m
warn: $this > 10
+ crit: $this > 30
units: %
- info: average stolen CPU time for the last 20 minutes
+ info: average CPU steal time for the last 20 minutes
to: sysadmin
# raise an alarm if the disk is low on
# available disk space
-template: disk_full_percent
+template: disk_space_usage
on: disk.space
calc: $used * 100 / ($avail + $used)
every: 1m
template: disk_fill_rate
on: disk.space
- lookup: max -2m at -30m unaligned of avail
+ lookup: max -5m at -30m unaligned of avail
calc: ($this - $avail) / (($now - $after) / 60)
- every: 15s
+ every: 1m
units: GB/min
info: average rate the disk fills up (positive), or frees up (negative) space, for the last 30 minutes
# if the disk continues to fill
# at this rate
-template: disk_full_after_hours
+template: out_of_disk_space_in
on: disk.space
calc: $avail / ($disk_fill_rate * 60)
every: 10s
# the alarm is checked every 1 minute
# and examines the last 30 minutes of data
- alarm: min_30min_entropy
+ alarm: 30min_lowest_entropy
on: system.entropy
lookup: min -30m unaligned
every: 1m
- warn: $this < 200
- crit: $this < 100
+ warn: $this < 100
units: entries
- info: minimum entries in the random numbers pool (entropy), for the last 30 minutes
+ info: minimum entries in the random numbers pool in the last 30 minutes
to: sysadmin
# detect if memcached cache is full
-template: cache_full_pcent
+template: memcached_cache_memory_usage
on: memcached.cache
calc: $used * 100 / ($used + $available)
every: 10s
# find the rate memcached cache is filling
-template: cache_fill_rate
+template: memcached_cache_fill_rate
on: memcached.cache
- lookup: max -1s at -30m unaligned of available
- calc: ($this - $available) / ($now - $after)
- every: 15s
- units: KB/s
+ lookup: max -5m at -30m unaligned of available
+ calc: ($this - $available) / (($now - $after) / 60)
+ every: 1m
+ units: KB/min
info: average rate the cache fills up (positive), or frees up (negative) space, for the last 30 minutes
# find the hours remaining until memcached cache is full
-template: cache_full_after_hours
+template: memcached_out_of_cache_in
on: memcached.cache
- calc: $available / $cache_fill_rate / 3600
+ calc: $available / ($memcached_cache_fill_rate * 60)
every: 10s
warn: $this > 0 and $this < 48
crit: $this > 0 and $this < 24
# check if an interface is dropping packets
# the alarm is checked every minute
-# and examines the last 30 minutes of data
+# and examines the last hour of data
-template: 30min_packet_drops
+template: 1hour_packet_drops
on: net.drops
- lookup: sum -30m unaligned absolute
+ lookup: sum -1h unaligned absolute
every: 1m
warn: $this > 0
units: packets
- info: dropped packets in the last 30 minutes
+ info: interface dropped packets in the last hour
to: sysadmin
# check if an interface is having FIFO
# buffer errors
# the alarm is checked every minute
-# and examines the last 30 minutes of data
+# and examines the last hour of data
-template: 30min_fifo_errors
+template: 1hour_fifo_errors
on: net.fifo
- lookup: sum -30m unaligned absolute
+ lookup: sum -1h unaligned absolute
every: 1m
warn: $this > 0
units: errors
- info: network interface fifo errors in the last 30 minutes
+ info: interface fifo errors in the last hour
to: sysadmin
on: system.swap
calc: $used * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
every: 10s
- warn: $this > 10
+ warn: $this > 20
crit: $this > 50
units: % of RAM
info: the swap memory used, as a percentage of the system RAM
char *path = health_config_dir();
+ // free all running alarms
rrdhost_rwlock(&localhost);
health_free_all_nolock(&localhost);
rrdhost_unlock(&localhost);
+ // invalidate all previous entries in the alarm log
+ ALARM_ENTRY *t;
+ for(t = localhost.health_log.alarms ; t ; t = t->next) {
+ t->notifications |= HEALTH_ENTRY_NOTIFICATIONS_UPDATED;
+ }
+
+ // reset all thresholds to all charts
RRDSET *st;
for(st = localhost.rrdset_root; st ; st = st->next) {
st->green = NAN;
st->red = NAN;
}
+ // load the new alarms
rrdhost_rwlock(&localhost);
health_readdir(path);
rrdhost_unlock(&localhost);
+ // link the loaded alarms to their charts
for(st = localhost.rrdset_root; st ; st = st->next) {
rrdhost_rwlock(&localhost);