more elastic alarms; better names; properly invalidate all alarm log entries on alarms reload...

author Costa Tsaousis (ktsaou) <costa@tsaousis.gr>

Wed, 7 Sep 2016 00:28:05 +0000 (03:28 +0300)

committer Costa Tsaousis (ktsaou) <costa@tsaousis.gr>

Wed, 7 Sep 2016 00:28:05 +0000 (03:28 +0300)
author Costa Tsaousis (ktsaou) <costa@tsaousis.gr>
Wed, 7 Sep 2016 00:28:05 +0000 (03:28 +0300)
committer Costa Tsaousis (ktsaou) <costa@tsaousis.gr>
Wed, 7 Sep 2016 00:28:05 +0000 (03:28 +0300)
diff --git a/conf.d/health.d/cpu.conf b/conf.d/health.d/cpu.conf

index 8caee3259cc61e4f38f97ad55362e9860da44894..3d98be407b33adffbe40655feceb2b578abe96f2 100644 (file)
--- a/conf.d/health.d/cpu.conf
+++ b/conf.d/health.d/cpu.conf
@@ -1,27 +1,30 @@
  
-template: 5min_cpu_pcent
+template: 10min_cpu_usage
        on: system.cpu
-  lookup: average -5m unaligned of user,system,nice,softirq,irq,guest,guest_nice
+  lookup: average -10m unaligned of user,system,nice,softirq,irq,guest,guest_nice
     every: 1m
-    warn: $this > 90
+    warn: $this > 80
+    crit: $this > 90
     units: %
-    info: average cpu utilization for the last 5 minutes
+    info: average cpu utilization for the last 10 minutes
        to: sysadmin
  
-template: 5min_iowait_cpu_pcent
+template: 10min_cpu_iowait
        on: system.cpu
-  lookup: average -5m unaligned of iowait
+  lookup: average -10m unaligned of iowait
     every: 1m
      warn: $this > 10
+    crit: $this > 30
     units: %
-    info: average wait I/O for the last 5 minutes
+    info: average CPU wait I/O for the last 10 minutes
        to: sysadmin
  
-template: 20min_steal_cpu_pcent
+template: 20min_steal_cpu
        on: system.cpu
    lookup: average -20m unaligned of steal
     every: 5m
      warn: $this > 10
+    crit: $this > 30
     units: %
-    info: average stolen CPU time for the last 20 minutes
+    info: average CPU steal time for the last 20 minutes
        to: sysadmin
diff --git a/conf.d/health.d/disks.conf b/conf.d/health.d/disks.conf

index 7e98511feb4a7a5e14f9e3da62a15376873b0ee3..9e31e295c9545d8bb4131de9e6228e988c5eb28e 100644 (file)
--- a/conf.d/health.d/disks.conf
+++ b/conf.d/health.d/disks.conf
@@ -5,7 +5,7 @@
  # raise an alarm if the disk is low on
  # available disk space
  
-template: disk_full_percent
+template: disk_space_usage
        on: disk.space
      calc: $used * 100 / ($avail + $used)
     every: 1m
@@ -29,9 +29,9 @@ template: disk_full_percent
  
  template: disk_fill_rate
        on: disk.space
-  lookup: max -2m at -30m unaligned of avail
+  lookup: max -5m at -30m unaligned of avail
      calc: ($this - $avail) / (($now - $after) / 60)
-   every: 15s
+   every: 1m
     units: GB/min
      info: average rate the disk fills up (positive), or frees up (negative) space, for the last 30 minutes
  
@@ -40,7 +40,7 @@ template: disk_fill_rate
  # if the disk continues to fill
  # in this rate
  
-template: disk_full_after_hours
+template: out_of_disk_space_in
        on: disk.space
      calc: $avail / ($disk_fill_rate * 60)
     every: 10s
diff --git a/conf.d/health.d/entropy.conf b/conf.d/health.d/entropy.conf

index fcbfb4cb54a93325dea342e5c0e760556e94cb31..b807d554f57dceee7694fcb67a1d20fc18c01933 100644 (file)
--- a/conf.d/health.d/entropy.conf
+++ b/conf.d/health.d/entropy.conf
@@ -3,12 +3,11 @@
  # the alarm is checked every 1 minute
  # and examines the last 30 minutes of data
  
-   alarm: min_30min_entropy
+   alarm: 30min_lowest_entropy
        on: system.entropy
    lookup: min -30m unaligned
     every: 1m
-    warn: $this < 200
-    crit: $this < 100
+    warn: $this < 100
     units: entries
-    info: minimum entries in the random numbers pool (entropy), for the last 30 minutes
+    info: minimum entries in the random numbers pool in the last 30 minutes
        to: sysadmin
diff --git a/conf.d/health.d/memcached.conf b/conf.d/health.d/memcached.conf

index 6e1acac2f5120e3b99e54cfeefb571589556f1d5..efbdc9a51ac41eb4ba47f1a9e8ca1842a044dcf7 100644 (file)
--- a/conf.d/health.d/memcached.conf
+++ b/conf.d/health.d/memcached.conf
@@ -14,7 +14,7 @@ template: memcached_last_collected_secs
  
  # detect if memcached cache is full
  
-template: cache_full_pcent
+template: memcached_cache_memory_usage
        on: memcached.cache
      calc: $used * 100 / ($used + $available)
     every: 10s
@@ -27,20 +27,20 @@ template: cache_full_pcent
  
  # find the rate memcached cache is filling
  
-template: cache_fill_rate
+template: memcached_cache_fill_rate
        on: memcached.cache
-  lookup: max -1s at -30m unaligned of available
-    calc: ($this - $available) / ($now - $after)
-   every: 15s
-   units: KB/s
+  lookup: max -5m at -30m unaligned of available
+    calc: ($this - $available) / (($now - $after) / 60)
+   every: 1m
+   units: KB/min
      info: average rate the cache fills up (positive), or frees up (negative) space, for the last 30 minutes
  
  
  # find the hours remaining until memcached cache is full
  
-template: cache_full_after_hours
+template: memcached_out_of_cache_in
        on: memcached.cache
-    calc: $available / $cache_fill_rate / 3600
+    calc: $available / ($memcached_cache_fill_rate * 60)
     every: 10s
      warn: $this > 0 and $this < 48
      crit: $this > 0 and $this < 24
diff --git a/conf.d/health.d/net.conf b/conf.d/health.d/net.conf

index 8ed713758d156e2d53a033aeefef349b79038192..5047d25978ac9271415670649265cad045998149 100644 (file)
--- a/conf.d/health.d/net.conf
+++ b/conf.d/health.d/net.conf
@@ -1,29 +1,29 @@
  
  # check if an interface is dropping packets
  # the alarm is checked every 10 seconds
-# and examines the last 30 minutes of data
+# and examines the last hour of data
  
-template: 30min_packet_drops
+template: 1hour_packet_drops
        on: net.drops
-  lookup: sum -30m unaligned absolute
+  lookup: sum -1h unaligned absolute
     every: 1m
      warn: $this > 0
     units: packets
-    info: dropped packets in the last 30 minutes
+    info: interface dropped packets in the last hour
        to: sysadmin
  
  
  # check if an interface is having FIFO
  # buffer errors
  # the alarm is checked every 10 seconds
-# and examines the last 30 minutes of data
+# and examines the last hour of data
  
-template: 30min_fifo_errors
+template: 1hour_fifo_errors
        on: net.fifo
-  lookup: sum -30m unaligned absolute
+  lookup: sum -1h unaligned absolute
     every: 1m
      warn: $this > 0
     units: errors
-    info: network interface fifo errors in the last 30 minutes
+    info: interface fifo errors in the last hour
        to: sysadmin
  
diff --git a/conf.d/health.d/swap.conf b/conf.d/health.d/swap.conf

index f762b03079371a135dfc4db953d4a10cfc8f8119..d5965a9b09c57305ca8ce6d6f6ad33f18c7627e8 100644 (file)
--- a/conf.d/health.d/swap.conf
+++ b/conf.d/health.d/swap.conf
@@ -15,7 +15,7 @@
        on: system.swap
      calc: $used * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
     every: 10s
-    warn: $this > 10
+    warn: $this > 20
      crit: $this > 50
     units: % of RAM
      info: the swap memory used, as a percentage of the system RAM
diff --git a/src/health.c b/src/health.c

index 1381e81c5dbaf8a793ed395edccf97ba70cc7fb5..e62852af0f205448a8a913f0c843b963b661a462 100644 (file)
--- a/src/health.c
+++ b/src/health.c
@@ -1742,20 +1742,30 @@ void health_reload(void) {
  
      char *path = health_config_dir();
  
+    // free all running alarms
      rrdhost_rwlock(&localhost);
      health_free_all_nolock(&localhost);
      rrdhost_unlock(&localhost);
  
+    // invalidate all previous entries in the alarm log
+    ALARM_ENTRY *t;
+    for(t = localhost.health_log.alarms ; t ; t = t->next) {
+        t->notifications |= HEALTH_ENTRY_NOTIFICATIONS_UPDATED;
+    }
+
+    // reset all thresholds to all charts
      RRDSET *st;
      for(st = localhost.rrdset_root; st ; st = st->next) {
          st->green = NAN;
          st->red = NAN;
      }
  
+    // load the new alarms
      rrdhost_rwlock(&localhost);
      health_readdir(path);
      rrdhost_unlock(&localhost);
  
+    // link the loaded alarms to their charts
      for(st = localhost.rrdset_root; st ; st = st->next) {
          rrdhost_rwlock(&localhost);
author	Costa Tsaousis (ktsaou) <costa@tsaousis.gr>
	Wed, 7 Sep 2016 00:28:05 +0000 (03:28 +0300)
committer	Costa Tsaousis (ktsaou) <costa@tsaousis.gr>
	Wed, 7 Sep 2016 00:28:05 +0000 (03:28 +0300)
conf.d/health.d/cpu.conf		patch | blob | history
conf.d/health.d/disks.conf		patch | blob | history
conf.d/health.d/entropy.conf		patch | blob | history
conf.d/health.d/memcached.conf		patch | blob | history
conf.d/health.d/net.conf		patch | blob | history
conf.d/health.d/swap.conf		patch | blob | history
src/health.c		patch | blob | history