From: Costa Tsaousis (ktsaou) <costa@tsaousis.gr>
Date: Wed, 14 Sep 2016 22:31:08 +0000 (+0300)
Subject: updated alarms to take into account the alarm status examination
X-Git-Tag: v1.4.0~55^2~1
X-Git-Url: https://arthur.barton.de/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b7261c269139a576e98c3c03a5dc4d2337044f8f;p=netdata.git

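updated alarms to take into account the alarm status examination

The warn/crit expressions of the stock health.d alarms now examine the
alarm's current $status, so the threshold that raises an alarm differs
from the one that keeps it raised (hysteresis): once an alarm is in
WARNING or CRITICAL, it stays there until the value recovers past a
second, easier-to-trigger threshold. The "delay" lines were retuned as
well; most of the changed ones drop the explicit "up" delay and adjust
the "down" duration.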
---

diff --git a/conf.d/health.d/apache.conf b/conf.d/health.d/apache.conf
index 58bb863d..0aaf0e00 100644
--- a/conf.d/health.d/apache.conf
+++ b/conf.d/health.d/apache.conf
@@ -6,9 +6,9 @@ template: apache_last_collected_secs
     calc: $now - $last_collected_t
    units: seconds ago
    every: 10s
-    warn: $this > ( 5 * $update_every)
-    crit: $this > (60 * $update_every)
-   delay: up 0 down 15m multiplier 1.5 max 1h
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
       to: webmaster
 
diff --git a/conf.d/health.d/cpu.conf b/conf.d/health.d/cpu.conf
index c5efc5a1..4d79fc79 100644
--- a/conf.d/health.d/cpu.conf
+++ b/conf.d/health.d/cpu.conf
@@ -4,9 +4,9 @@ template: 10min_cpu_usage
   lookup: average -10m unaligned of user,system,nice,softirq,irq,guest,guest_nice
    units: %
    every: 1m
-    warn: $this > 80
-    crit: $this > 90
-   delay: up 0 down 15m multiplier 1.5 max 1h
+    warn: $this > (($status >= $WARNING)  ? (70) : (80))
+    crit: $this > (($status == $CRITICAL) ? (80) : (90))
+   delay: down 15m multiplier 1.5 max 1h
     info: average cpu utilization for the last 10 minutes
       to: sysadmin
 
@@ -15,9 +15,9 @@ template: 10min_cpu_iowait
   lookup: average -10m unaligned of iowait
    units: %
    every: 1m
-    warn: $this > 10
-    crit: $this > 30
-   delay: up 0 down 15m multiplier 1.5 max 1h
+    warn: $this > (($status >= $WARNING)  ? (5)  : (10))
+    crit: $this > (($status == $CRITICAL) ? (20) : (30))
+   delay: down 15m multiplier 1.5 max 1h
     info: average CPU wait I/O for the last 10 minutes
       to: sysadmin
 
@@ -26,8 +26,8 @@ template: 20min_steal_cpu
   lookup: average -20m unaligned of steal
    units: %
    every: 5m
-    warn: $this > 10
-    crit: $this > 30
-   delay: up 0 down 15m multiplier 1.5 max 1h
+    warn: $this > (($status >= $WARNING)  ? (5)  : (10))
+    crit: $this > (($status == $CRITICAL) ? (20) : (30))
+   delay: down 15m multiplier 1.5 max 1h
     info: average CPU steal time for the last 20 minutes
       to: sysadmin
diff --git a/conf.d/health.d/disks.conf b/conf.d/health.d/disks.conf
index 428a4331..cc7a4766 100644
--- a/conf.d/health.d/disks.conf
+++ b/conf.d/health.d/disks.conf
@@ -7,9 +7,9 @@ template: disk_space_last_collected_secs
     calc: $now - $last_collected_t
    units: seconds ago
    every: 10s
-    warn: $this > ( 5 * $update_every)
-    crit: $this > (60 * $update_every)
-   delay: up 0 down 15m multiplier 1.5 max 1h
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection of the mount point
       to: sysadmin
 
@@ -19,9 +19,9 @@ template: disk_last_collected_secs
     calc: $now - $last_collected_t
    units: seconds ago
    every: 10s
-    warn: $this > ( 5 * $update_every)
-    crit: $this > (60 * $update_every)
-   delay: up 0 down 15m multiplier 1.5 max 1h
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection of the block device
       to: sysadmin
 
@@ -38,8 +38,8 @@ template: disk_space_usage
     calc: $used * 100 / ($avail + $used)
    units: %
    every: 1m
-    warn: $this > 80
-    crit: $this > 95
+    warn: $this > (($status >= $WARNING ) ? (70) : (80))
+    crit: $this > (($status == $CRITICAL) ? (85) : (95))
    delay: up 1m down 15m multiplier 1.5 max 1h
     info: current disk space usage
       to: sysadmin
@@ -49,8 +49,8 @@ template: disk_inode_usage
     calc: $used * 100 / ($avail + $used)
    units: %
    every: 1m
-    warn: $this > 80
-    crit: $this > 95
+    warn: $this > (($status >= $WARNING)  ? (75) : (80))
+    crit: $this > (($status == $CRITICAL) ? (90) : (95))
    delay: up 1m down 15m multiplier 1.5 max 1h
     info: current disk inode usage
       to: sysadmin
@@ -85,9 +85,9 @@ template: out_of_disk_space_time
     calc: $avail / $disk_fill_rate
    units: hours
    every: 10s
-    warn: $this > 0 and $this < 8
-    crit: $this > 0 and $this < 2
-   delay: up 0 down 15m multiplier 1.5 max 1h
+    warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
+    crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+   delay: down 15m multiplier 1.2 max 1h
     info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour
       to: sysadmin
 
@@ -106,9 +106,9 @@ template: 10min_disk_utilization
    every: 1m
    green: 90
      red: 98
-    warn: $this > $green
-    crit: $this > $red
-   delay: up 0 down 30m multiplier 1.5 max 1h
+    warn: $this > $green * (($status >= $WARNING)  ? (0.7) : (1))
+    crit: $this > $red   * (($status == $CRITICAL) ? (0.7) : (1))
+   delay: down 15m multiplier 1.2 max 1h
     info: the percentage of time the disk was busy, during the last 10 minutes
       to: sysadmin
 
@@ -125,8 +125,8 @@ template: 10min_disk_backlog
    every: 1m
    green: 2000
      red: 5000
-    warn: $this > $green
-    crit: $this > $red
-   delay: up 1m down 30m multiplier 1.5 max 1h
+    warn: $this > $green * (($status >= $WARNING)  ? (0.7) : (1))
+    crit: $this > $red   * (($status == $CRITICAL) ? (0.7) : (1))
+   delay: down 15m multiplier 1.2 max 1h
     info: average of the kernel estimated disk backlog, for the last 10 minutes
       to: sysadmin
diff --git a/conf.d/health.d/entropy.conf b/conf.d/health.d/entropy.conf
index bee77c36..bb77ad48 100644
--- a/conf.d/health.d/entropy.conf
+++ b/conf.d/health.d/entropy.conf
@@ -8,7 +8,7 @@
   lookup: min -1h unaligned
    units: entries
    every: 5m
-    warn: $this < 100
-   delay: up 0 down 1h multiplier 1.5 max 1h
+    warn: $this < (($status >= $WARNING) ? (200) : (100))
+   delay: down 1h multiplier 1.5 max 1h
     info: minimum entries in the random numbers pool in the last 30 minutes
       to: sysadmin
diff --git a/conf.d/health.d/memcached.conf b/conf.d/health.d/memcached.conf
index 77524331..46a8ca0e 100644
--- a/conf.d/health.d/memcached.conf
+++ b/conf.d/health.d/memcached.conf
@@ -6,9 +6,9 @@ template: memcached_last_collected_secs
     calc: $now - $last_collected_t
    units: seconds ago
    every: 10s
-    warn: $this > ( 5 * $update_every)
-    crit: $this > (60 * $update_every)
-   delay: up 0 down 15m multiplier 1.5 max 1h
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
       to: dba
 
@@ -20,8 +20,8 @@ template: memcached_cache_memory_usage
     calc: $used * 100 / ($used + $available)
    units: %
    every: 10s
-    warn: $this > 80
-    crit: $this > 90
+    warn: $this > (($status >= $WARNING)  ? (70) : (80))
+    crit: $this > (($status == $CRITICAL) ? (80) : (90))
    delay: up 0 down 15m multiplier 1.5 max 1h
     info: current cache memory usage
       to: dba
@@ -45,8 +45,8 @@ template: out_of_cache_space_time
     calc: $available / $cache_fill_rate
    units: hours
    every: 10s
-    warn: $this > 0 and $this < 8
-    crit: $this > 0 and $this < 2
-   delay: up 0 down 15m multiplier 1.5 max 1h
+    warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
+    crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+   delay: down 15m multiplier 1.5 max 1h
     info: estimated time the cache will run out of space, if the system continues to add data with the rate of the last hour
       to: dba
diff --git a/conf.d/health.d/mysql.conf b/conf.d/health.d/mysql.conf
index 6d84bfa4..a2cfa3ec 100644
--- a/conf.d/health.d/mysql.conf
+++ b/conf.d/health.d/mysql.conf
@@ -6,8 +6,8 @@ template: mysql_last_collected_secs
     calc: $now - $last_collected_t
    units: seconds ago
    every: 10s
-    warn: $this > ( 5 * $update_every)
-    crit: $this > (60 * $update_every)
-   delay: up 0 down 15m multiplier 1.5 max 1h
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
       to: dba
diff --git a/conf.d/health.d/named.conf b/conf.d/health.d/named.conf
index 09739675..f2eaa83c 100644
--- a/conf.d/health.d/named.conf
+++ b/conf.d/health.d/named.conf
@@ -6,9 +6,9 @@ template: named_last_collected_secs
     calc: $now - $last_collected_t
    units: seconds ago
    every: 10s
-    warn: $this > ( 5 * $update_every)
-    crit: $this > (60 * $update_every)
-   delay: up 0 down 15m multiplier 1.5 max 1h
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
       to: domainadmin
 
diff --git a/conf.d/health.d/net.conf b/conf.d/health.d/net.conf
index dc54d937..7753aa18 100644
--- a/conf.d/health.d/net.conf
+++ b/conf.d/health.d/net.conf
@@ -6,9 +6,9 @@ template: interface_last_collected_secs
     calc: $now - $last_collected_t
    units: seconds ago
    every: 10s
-    warn: $this > ( 5 * $update_every)
-    crit: $this > (60 * $update_every)
-   delay: up 0 down 15m multiplier 1.5 max 1h
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
       to: sysadmin
 
@@ -25,7 +25,7 @@ template: 1hour_packet_drops
    units: packets
    every: 1m
     warn: $this > 0
-   delay: up 0 down 15m multiplier 1.5 max 1h
+   delay: down 30m multiplier 1.5 max 1h
     info: interface dropped packets in the last hour
       to: sysadmin
 
@@ -43,6 +43,6 @@ template: 1hour_fifo_errors
    units: errors
    every: 1m
     warn: $this > 0
-   delay: up 0 down 15m multiplier 1.5 max 1h
+   delay: down 30m multiplier 1.5 max 1h
     info: interface fifo errors in the last hour
       to: sysadmin
diff --git a/conf.d/health.d/nginx.conf b/conf.d/health.d/nginx.conf
index 47e288f3..d70d6a59 100644
--- a/conf.d/health.d/nginx.conf
+++ b/conf.d/health.d/nginx.conf
@@ -6,9 +6,9 @@ template: nginx_last_collected_secs
     calc: $now - $last_collected_t
    units: seconds ago
    every: 10s
-    warn: $this > ( 5 * $update_every)
-    crit: $this > (60 * $update_every)
-   delay: up 0 down 15m multiplier 1.5 max 1h
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
       to: webmaster
 
diff --git a/conf.d/health.d/qos.conf b/conf.d/health.d/qos.conf
index af03d831..9e5939fd 100644
--- a/conf.d/health.d/qos.conf
+++ b/conf.d/health.d/qos.conf
@@ -8,7 +8,7 @@
 #  lookup: sum -10m unaligned absolute
 #   every: 30s
 #    warn: $this > 0
-#   delay: up 0 down 15m multiplier 1.5 max 1h
+#   delay: up 0 down 30m multiplier 1.5 max 1h
 #   units: packets
 #    info: dropped packets in the last 30 minutes
 #      to: sysadmin
diff --git a/conf.d/health.d/ram.conf b/conf.d/health.d/ram.conf
index c461480a..216b82fe 100644
--- a/conf.d/health.d/ram.conf
+++ b/conf.d/health.d/ram.conf
@@ -4,8 +4,8 @@
     calc: $used * 100 / ($used + $cached + $free)
    units: %
    every: 10s
-    warn: $this > 80
-    crit: $this > 90
-   delay: up 1m down 15m multiplier 1.5 max 1h
+    warn: $this > (($status >= $WARNING)  ? (70) : (80))
+    crit: $this > (($status == $CRITICAL) ? (80) : (90))
+   delay: down 15m multiplier 1.5 max 1h
     info: system RAM usage
       to: sysadmin
diff --git a/conf.d/health.d/redis.conf b/conf.d/health.d/redis.conf
index d03dfc4e..3e648d85 100644
--- a/conf.d/health.d/redis.conf
+++ b/conf.d/health.d/redis.conf
@@ -6,9 +6,9 @@ template: redis_last_collected_secs
     calc: $now - $last_collected_t
    units: seconds ago
    every: 10s
-    warn: $this > ( 5 * $update_every)
-    crit: $this > (60 * $update_every)
-   delay: up 0 down 15m multiplier 1.5 max 1h
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
       to: dba
 
diff --git a/conf.d/health.d/retroshare.conf b/conf.d/health.d/retroshare.conf
index b699dc96..1af7b468 100644
--- a/conf.d/health.d/retroshare.conf
+++ b/conf.d/health.d/retroshare.conf
@@ -5,9 +5,9 @@ template: retroshare_last_collected_secs
     calc: $now - $last_collected_t
    units: seconds ago
    every: 10s
-    warn: $this > ( 5 * $update_every)
-    crit: $this > (60 * $update_every)
-   delay: up 0 down 15m multiplier 1.5 max 1h
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
       to: sysadmin
 
@@ -18,8 +18,8 @@ template: retroshare_dht_working
     calc: $dht_size_all
    units: peers
    every: 1m
-    warn: $this < 100
-    crit: $this == 0
+    warn: $this < (($status >= $WARNING)  ? (120) : (100))
+    crit: $this < (($status == $CRITICAL) ? (10)  : (1))
    delay: up 0 down 15m multiplier 1.5 max 1h
     info: Checks if the DHT has enough peers to operate
       to: sysadmin
diff --git a/conf.d/health.d/squid.conf b/conf.d/health.d/squid.conf
index 7d2b434f..76143c5d 100644
--- a/conf.d/health.d/squid.conf
+++ b/conf.d/health.d/squid.conf
@@ -6,9 +6,9 @@ template: squid_last_collected_secs
     calc: $now - $last_collected_t
    units: seconds ago
    every: 10s
-    warn: $this > ( 5 * $update_every)
-    crit: $this > (60 * $update_every)
-   delay: up 0 down 15m multiplier 1.5 max 1h
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
       to: proxyadmin
 
diff --git a/conf.d/health.d/swap.conf b/conf.d/health.d/swap.conf
index a5819624..0cfa888c 100644
--- a/conf.d/health.d/swap.conf
+++ b/conf.d/health.d/swap.conf
@@ -6,8 +6,8 @@
     calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
    units: % of RAM
    every: 1m
-    warn: $this > 10
-    crit: $this > 20
+    warn: $this > (($status >= $WARNING)  ? (5)  : (10))
+    crit: $this > (($status == $CRITICAL) ? (15) : (20))
    delay: up 0 down 15m multiplier 1.5 max 1h
     info: the amount of memory swapped in the last 30 minutes, as a percentage of the system RAM
       to: sysadmin
@@ -17,8 +17,8 @@
     calc: $used * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
    units: % of RAM
    every: 10s
-    warn: $this > 20
-    crit: $this > 50
+    warn: $this > (($status >= $WARNING)  ? (15) : (20))
+    crit: $this > (($status == $CRITICAL) ? (40) : (50))
    delay: up 0 down 15m multiplier 1.5 max 1h
     info: the swap memory used, as a percentage of the system RAM
       to: sysadmin