From f1f4652c5c8cd7717b62d27e6bd8f0d412733299 Mon Sep 17 00:00:00 2001 From: "Costa Tsaousis (ktsaou)" Date: Wed, 14 Sep 2016 00:13:36 +0300 Subject: [PATCH] added support for notifications delays; fixes #945 --- conf.d/health.d/apache.conf | 3 +- conf.d/health.d/cpu.conf | 9 +- conf.d/health.d/disks.conf | 21 ++- conf.d/health.d/entropy.conf | 11 +- conf.d/health.d/memcached.conf | 11 +- conf.d/health.d/mysql.conf | 3 +- conf.d/health.d/named.conf | 3 +- conf.d/health.d/net.conf | 10 +- conf.d/health.d/nginx.conf | 3 +- conf.d/health.d/qos.conf | 1 + conf.d/health.d/ram.conf | 3 +- conf.d/health.d/redis.conf | 3 +- conf.d/health.d/retroshare.conf | 6 +- conf.d/health.d/squid.conf | 3 +- conf.d/health.d/swap.conf | 6 +- src/health.c | 309 ++++++++++++++++++++++---------- src/health.h | 136 +++++++++----- 17 files changed, 373 insertions(+), 168 deletions(-) diff --git a/conf.d/health.d/apache.conf b/conf.d/health.d/apache.conf index 72d6be4c..58bb863d 100644 --- a/conf.d/health.d/apache.conf +++ b/conf.d/health.d/apache.conf @@ -4,10 +4,11 @@ template: apache_last_collected_secs on: apache.requests calc: $now - $last_collected_t + units: seconds ago every: 10s warn: $this > ( 5 * $update_every) crit: $this > (60 * $update_every) - units: seconds ago + delay: up 0 down 15m multiplier 1.5 max 1h info: number of seconds since the last successful data collection to: webmaster diff --git a/conf.d/health.d/cpu.conf b/conf.d/health.d/cpu.conf index 3d98be40..c5efc5a1 100644 --- a/conf.d/health.d/cpu.conf +++ b/conf.d/health.d/cpu.conf @@ -2,29 +2,32 @@ template: 10min_cpu_usage on: system.cpu lookup: average -10m unaligned of user,system,nice,softirq,irq,guest,guest_nice + units: % every: 1m warn: $this > 80 crit: $this > 90 - units: % + delay: up 0 down 15m multiplier 1.5 max 1h info: average cpu utilization for the last 10 minutes to: sysadmin template: 10min_cpu_iowait on: system.cpu lookup: average -10m unaligned of iowait + units: % every: 1m warn: $this > 10 crit: 
$this > 30 - units: % + delay: up 0 down 15m multiplier 1.5 max 1h info: average CPU wait I/O for the last 10 minutes to: sysadmin template: 20min_steal_cpu on: system.cpu lookup: average -20m unaligned of steal + units: % every: 5m warn: $this > 10 crit: $this > 30 - units: % + delay: up 0 down 15m multiplier 1.5 max 1h info: average CPU steal time for the last 20 minutes to: sysadmin diff --git a/conf.d/health.d/disks.conf b/conf.d/health.d/disks.conf index 3f202fc1..428a4331 100644 --- a/conf.d/health.d/disks.conf +++ b/conf.d/health.d/disks.conf @@ -5,10 +5,11 @@ template: disk_space_last_collected_secs on: disk.space calc: $now - $last_collected_t + units: seconds ago every: 10s warn: $this > ( 5 * $update_every) crit: $this > (60 * $update_every) - units: seconds ago + delay: up 0 down 15m multiplier 1.5 max 1h info: number of seconds since the last successful data collection of the mount point to: sysadmin @@ -16,10 +17,11 @@ template: disk_space_last_collected_secs template: disk_last_collected_secs on: disk.io calc: $now - $last_collected_t + units: seconds ago every: 10s warn: $this > ( 5 * $update_every) crit: $this > (60 * $update_every) - units: seconds ago + delay: up 0 down 15m multiplier 1.5 max 1h info: number of seconds since the last successful data collection of the block device to: sysadmin @@ -34,20 +36,22 @@ template: disk_last_collected_secs template: disk_space_usage on: disk.space calc: $used * 100 / ($avail + $used) + units: % every: 1m warn: $this > 80 crit: $this > 95 - units: % + delay: up 1m down 15m multiplier 1.5 max 1h info: current disk space usage to: sysadmin template: disk_inode_usage on: disk.inodes calc: $used * 100 / ($avail + $used) + units: % every: 1m warn: $this > 80 crit: $this > 95 - units: % + delay: up 1m down 15m multiplier 1.5 max 1h info: current disk inode usage to: sysadmin @@ -79,10 +83,11 @@ template: disk_fill_rate template: out_of_disk_space_time on: disk.space calc: $avail / $disk_fill_rate + units: hours 
every: 10s warn: $this > 0 and $this < 8 crit: $this > 0 and $this < 2 - units: hours + delay: up 0 down 15m multiplier 1.5 max 1h info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour to: sysadmin @@ -97,12 +102,13 @@ template: out_of_disk_space_time template: 10min_disk_utilization on: disk.util lookup: average -10m unaligned + units: % every: 1m green: 90 red: 98 warn: $this > $green crit: $this > $red - units: % + delay: up 0 down 30m multiplier 1.5 max 1h info: the percentage of time the disk was busy, during the last 10 minutes to: sysadmin @@ -115,11 +121,12 @@ template: 10min_disk_utilization template: 10min_disk_backlog on: disk.backlog lookup: average -10m unaligned + units: ms every: 1m green: 2000 red: 5000 warn: $this > $green crit: $this > $red - units: ms + delay: up 1m down 30m multiplier 1.5 max 1h info: average of the kernel estimated disk backlog, for the last 10 minutes to: sysadmin diff --git a/conf.d/health.d/entropy.conf b/conf.d/health.d/entropy.conf index b807d554..bee77c36 100644 --- a/conf.d/health.d/entropy.conf +++ b/conf.d/health.d/entropy.conf @@ -1,13 +1,14 @@ # check if entropy is too low -# the alarm is checked every 1 minute -# and examines the last 30 minutes of data +# the alarm is checked every 5 minutes +# and examines the last hour of data - alarm: 30min_lowest_entropy + alarm: 1hour_lowest_entropy on: system.entropy - lookup: min -30m unaligned - every: 1m - warn: $this < 100 + lookup: min -1h unaligned units: entries + every: 5m + warn: $this < 100 + delay: up 0 down 1h multiplier 1.5 max 1h - info: minimum entries in the random numbers pool in the last 30 minutes + info: minimum entries in the random numbers pool in the last hour to: sysadmin diff --git a/conf.d/health.d/memcached.conf b/conf.d/health.d/memcached.conf index 573e0339..77524331 100644 --- a/conf.d/health.d/memcached.conf +++ b/conf.d/health.d/memcached.conf @@ -4,10 +4,11 @@ template: memcached_last_collected_secs on: memcached.cache calc: $now - $last_collected_t + units: seconds ago every: 10s warn:
$this > ( 5 * $update_every) crit: $this > (60 * $update_every) - units: seconds ago + delay: up 0 down 15m multiplier 1.5 max 1h info: number of seconds since the last successful data collection to: dba @@ -17,10 +18,11 @@ template: memcached_last_collected_secs template: memcached_cache_memory_usage on: memcached.cache calc: $used * 100 / ($used + $available) + units: % every: 10s warn: $this > 80 crit: $this > 90 - units: % + delay: up 0 down 15m multiplier 1.5 max 1h info: current cache memory usage to: dba @@ -31,8 +33,8 @@ template: cache_fill_rate on: memcached.cache lookup: min -10m at -50m unaligned of available calc: ($this - $available) / (($now - $after) / 3600) - every: 1m units: KB/hour + every: 1m info: average rate the cache fills up (positive), or frees up (negative) space, for the last hour @@ -41,9 +43,10 @@ template: cache_fill_rate template: out_of_cache_space_time on: memcached.cache calc: $available / $cache_fill_rate + units: hours every: 10s warn: $this > 0 and $this < 8 crit: $this > 0 and $this < 2 - units: hours + delay: up 0 down 15m multiplier 1.5 max 1h info: estimated time the cache will run out of space, if the system continues to add data with the rate of the last hour to: dba diff --git a/conf.d/health.d/mysql.conf b/conf.d/health.d/mysql.conf index 77c21aa0..6d84bfa4 100644 --- a/conf.d/health.d/mysql.conf +++ b/conf.d/health.d/mysql.conf @@ -4,9 +4,10 @@ template: mysql_last_collected_secs on: mysql.queries calc: $now - $last_collected_t + units: seconds ago every: 10s warn: $this > ( 5 * $update_every) crit: $this > (60 * $update_every) - units: seconds ago + delay: up 0 down 15m multiplier 1.5 max 1h info: number of seconds since the last successful data collection to: dba diff --git a/conf.d/health.d/named.conf b/conf.d/health.d/named.conf index 997c6b94..09739675 100644 --- a/conf.d/health.d/named.conf +++ b/conf.d/health.d/named.conf @@ -4,10 +4,11 @@ template: named_last_collected_secs on: named.global_queries calc: $now - 
$last_collected_t + units: seconds ago every: 10s warn: $this > ( 5 * $update_every) crit: $this > (60 * $update_every) - units: seconds ago + delay: up 0 down 15m multiplier 1.5 max 1h info: number of seconds since the last successful data collection to: domainadmin diff --git a/conf.d/health.d/net.conf b/conf.d/health.d/net.conf index b96677b2..dc54d937 100644 --- a/conf.d/health.d/net.conf +++ b/conf.d/health.d/net.conf @@ -4,10 +4,11 @@ template: interface_last_collected_secs on: net.net calc: $now - $last_collected_t + units: seconds ago every: 10s warn: $this > ( 5 * $update_every) crit: $this > (60 * $update_every) - units: seconds ago + delay: up 0 down 15m multiplier 1.5 max 1h info: number of seconds since the last successful data collection to: sysadmin @@ -21,9 +22,10 @@ template: interface_last_collected_secs template: 1hour_packet_drops on: net.drops lookup: sum -1h unaligned absolute + units: packets every: 1m warn: $this > 0 - units: packets + delay: up 0 down 15m multiplier 1.5 max 1h info: interface dropped packets in the last hour to: sysadmin @@ -38,9 +40,9 @@ template: 1hour_packet_drops template: 1hour_fifo_errors on: net.fifo lookup: sum -1h unaligned absolute + units: errors every: 1m warn: $this > 0 - units: errors + delay: up 0 down 15m multiplier 1.5 max 1h info: interface fifo errors in the last hour to: sysadmin - diff --git a/conf.d/health.d/nginx.conf b/conf.d/health.d/nginx.conf index 880777d7..47e288f3 100644 --- a/conf.d/health.d/nginx.conf +++ b/conf.d/health.d/nginx.conf @@ -4,10 +4,11 @@ template: nginx_last_collected_secs on: nginx.requests calc: $now - $last_collected_t + units: seconds ago every: 10s warn: $this > ( 5 * $update_every) crit: $this > (60 * $update_every) - units: seconds ago + delay: up 0 down 15m multiplier 1.5 max 1h info: number of seconds since the last successful data collection to: webmaster diff --git a/conf.d/health.d/qos.conf b/conf.d/health.d/qos.conf index bd438aa0..af03d831 100644 --- 
a/conf.d/health.d/qos.conf +++ b/conf.d/health.d/qos.conf @@ -8,6 +8,7 @@ # lookup: sum -10m unaligned absolute # every: 30s # warn: $this > 0 +# delay: up 0 down 15m multiplier 1.5 max 1h # units: packets # info: dropped packets in the last 30 minutes # to: sysadmin diff --git a/conf.d/health.d/ram.conf b/conf.d/health.d/ram.conf index 85e52540..c461480a 100644 --- a/conf.d/health.d/ram.conf +++ b/conf.d/health.d/ram.conf @@ -2,9 +2,10 @@ alarm: ram_in_use on: system.ram calc: $used * 100 / ($used + $cached + $free) + units: % every: 10s warn: $this > 80 crit: $this > 90 - units: % + delay: up 1m down 15m multiplier 1.5 max 1h info: system RAM usage to: sysadmin diff --git a/conf.d/health.d/redis.conf b/conf.d/health.d/redis.conf index cdeae4a5..d03dfc4e 100644 --- a/conf.d/health.d/redis.conf +++ b/conf.d/health.d/redis.conf @@ -4,10 +4,11 @@ template: redis_last_collected_secs on: redis.operations calc: $now - $last_collected_t + units: seconds ago every: 10s warn: $this > ( 5 * $update_every) crit: $this > (60 * $update_every) - units: seconds ago + delay: up 0 down 15m multiplier 1.5 max 1h info: number of seconds since the last successful data collection to: dba diff --git a/conf.d/health.d/retroshare.conf b/conf.d/health.d/retroshare.conf index a2f2ac21..b699dc96 100644 --- a/conf.d/health.d/retroshare.conf +++ b/conf.d/health.d/retroshare.conf @@ -3,10 +3,11 @@ template: retroshare_last_collected_secs on: retroshare.peers calc: $now - $last_collected_t + units: seconds ago every: 10s warn: $this > ( 5 * $update_every) crit: $this > (60 * $update_every) - units: seconds ago + delay: up 0 down 15m multiplier 1.5 max 1h info: number of seconds since the last successful data collection to: sysadmin @@ -15,9 +16,10 @@ template: retroshare_last_collected_secs template: retroshare_dht_working on: retroshare.dht calc: $dht_size_all + units: peers every: 1m warn: $this < 100 crit: $this == 0 - units: peers + delay: up 0 down 15m multiplier 1.5 max 1h info: Checks if 
the DHT has enough peers to operate to: sysadmin diff --git a/conf.d/health.d/squid.conf b/conf.d/health.d/squid.conf index 8fecce23..7d2b434f 100644 --- a/conf.d/health.d/squid.conf +++ b/conf.d/health.d/squid.conf @@ -4,10 +4,11 @@ template: squid_last_collected_secs on: squid.clients_requests calc: $now - $last_collected_t + units: seconds ago every: 10s warn: $this > ( 5 * $update_every) crit: $this > (60 * $update_every) - units: seconds ago + delay: up 0 down 15m multiplier 1.5 max 1h info: number of seconds since the last successful data collection to: proxyadmin diff --git a/conf.d/health.d/swap.conf b/conf.d/health.d/swap.conf index 2ccde48e..a5819624 100644 --- a/conf.d/health.d/swap.conf +++ b/conf.d/health.d/swap.conf @@ -4,19 +4,21 @@ lookup: sum -30m unaligned absolute of out # we have to convert KB to MB by dividing $this (i.e. the result of the lookup) with 1024 calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free ) + units: % of RAM every: 1m warn: $this > 10 crit: $this > 20 - units: % of RAM + delay: up 0 down 15m multiplier 1.5 max 1h info: the amount of memory swapped in the last 30 minutes, as a percentage of the system RAM to: sysadmin alarm: used_swap_space on: system.swap calc: $used * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free ) + units: % of RAM every: 10s warn: $this > 20 crit: $this > 50 - units: % of RAM + delay: up 0 down 15m multiplier 1.5 max 1h info: the swap memory used, as a percentage of the system RAM to: sysadmin diff --git a/src/health.c b/src/health.c index 71c8d97a..b8dae0a7 100644 --- a/src/health.c +++ b/src/health.c @@ -18,7 +18,8 @@ static inline void health_alarm_log(RRDHOST *host, int old_status, int new_status, const char *source, const char *units, - const char *info + const char *info, + int delay ) { debug(D_HEALTH, "Health adding alarm log entry with id: %u", host->health_log.next_log_id); @@ -49,6 +50,8 @@ static inline void health_alarm_log(RRDHOST *host, 
ae->old_status = old_status; ae->new_status = new_status; ae->duration = duration; + ae->delay = delay; + ae->delay_up_to_timestamp = when + delay; if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL) ae->non_clear_duration += ae->duration; @@ -557,7 +560,7 @@ static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) { { time_t now = time(NULL); - health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_UNINITIALIZED, rc->source, rc->units, rc->info); + health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_UNINITIALIZED, rc->source, rc->units, rc->info, 0); } } @@ -595,7 +598,7 @@ inline void rrdsetcalc_unlink(RRDCALC *rc) { { time_t now = time(NULL); - health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info); + health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0); } RRDHOST *host = st->rrdhost; @@ -668,7 +671,7 @@ static inline int rrdcalc_exists(RRDHOST *host, const char *chart, const char *n return 0; } -static inline uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const char *name) { +static inline uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const char *name, uint32_t *next_event_id) { if(chart && name) { uint32_t hash_chart = simple_hash(chart); uint32_t hash_name = 
simple_hash(name); @@ -676,8 +679,10 @@ static inline uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, c // re-use old IDs, by looking them up in the alarm log ALARM_ENTRY *ae; for(ae = host->health_log.alarms; ae ;ae = ae->next) { - if(unlikely(ae->hash_name == hash_name && ae->hash_chart == hash_chart && !strcmp(name, ae->name) && !strcmp(chart, ae->chart))) + if(unlikely(ae->hash_name == hash_name && ae->hash_chart == hash_chart && !strcmp(name, ae->name) && !strcmp(chart, ae->chart))) { + if(next_event_id) *next_event_id = ae->alarm_event_id + 1; return ae->alarm_id; + } } } @@ -729,62 +734,62 @@ static inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) { } } -static inline RRDCALC *rrdcalc_create(RRDHOST *host, const char *name, const char *chart, const char *dimensions, - const char *units, const char *info, - int group_method, int after, int before, int update_every, uint32_t options, - calculated_number green, calculated_number red, - const char *exec, const char *recipient, const char *source, - const char *calc, const char *warn, const char *crit) { +static inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart) { - debug(D_HEALTH, "Health creating dynamic alarm (from template) '%s.%s'", chart, name); + debug(D_HEALTH, "Health creating dynamic alarm (from template) '%s.%s'", chart, rt->name); - if(rrdcalc_exists(host, chart, name, 0, 0)) + if(rrdcalc_exists(host, chart, rt->name, 0, 0)) return NULL; RRDCALC *rc = callocz(1, sizeof(RRDCALC)); - rc->id = rrdcalc_get_unique_id(host, chart, name); rc->next_event_id = 1; - rc->name = strdupz(name); + rc->id = rrdcalc_get_unique_id(host, chart, rt->name, &rc->next_event_id); + rc->name = strdupz(rt->name); rc->hash = simple_hash(rc->name); rc->chart = strdupz(chart); rc->hash_chart = simple_hash(rc->chart); - if(dimensions) rc->dimensions = strdupz(dimensions); + if(rt->dimensions) rc->dimensions = strdupz(rt->dimensions); - rc->green = green; - rc->red = 
red; + rc->green = rt->green; + rc->red = rt->red; rc->value = NAN; rc->old_value = NAN; - rc->group = group_method; - rc->after = after; - rc->before = before; - rc->update_every = update_every; - rc->options = options; - - if(exec) rc->exec = strdupz(exec); - if(recipient) rc->recipient = strdupz(recipient); - if(source) rc->source = strdupz(source); - if(units) rc->units = strdupz(units); - if(info) rc->info = strdupz(info); - - if(calc) { - rc->calculation = expression_parse(calc, NULL, NULL); + rc->delay_up_duration = rt->delay_up_duration; + rc->delay_down_duration = rt->delay_down_duration; + rc->delay_max_duration = rt->delay_max_duration; + rc->delay_multiplier = rt->delay_multiplier; + + rc->group = rt->group; + rc->after = rt->after; + rc->before = rt->before; + rc->update_every = rt->update_every; + rc->options = rt->options; + + if(rt->exec) rc->exec = strdupz(rt->exec); + if(rt->recipient) rc->recipient = strdupz(rt->recipient); + if(rt->source) rc->source = strdupz(rt->source); + if(rt->units) rc->units = strdupz(rt->units); + if(rt->info) rc->info = strdupz(rt->info); + + if(rt->calculation) { + rc->calculation = expression_parse(rt->calculation->source, NULL, NULL); if(!rc->calculation) - error("Health alarm '%s.%s': failed to parse calculation expression '%s'", chart, name, calc); + error("Health alarm '%s.%s': failed to parse calculation expression '%s'", chart, rt->name, rt->calculation->source); } - if(warn) { - rc->warning = expression_parse(warn, NULL, NULL); + if(rt->warning) { + rc->warning = expression_parse(rt->warning->source, NULL, NULL); if(!rc->warning) - error("Health alarm '%s.%s': failed to re-parse warning expression '%s'", chart, name, warn); + error("Health alarm '%s.%s': failed to re-parse warning expression '%s'", chart, rt->name, rt->warning->source); } - if(crit) { - rc->critical = expression_parse(crit, NULL, NULL); + if(rt->critical) { + rc->critical = expression_parse(rt->critical->source, NULL, NULL); if(!rc->critical) - 
error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", chart, name, crit); + error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", chart, rt->name, rt->critical->source); } - debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s", + debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f", (rc->chart)?rc->chart:"NOCHART", rc->name, (rc->exec)?rc->exec:"DEFAULT", @@ -800,7 +805,11 @@ static inline RRDCALC *rrdcalc_create(RRDHOST *host, const char *name, const cha (rc->calculation)?rc->calculation->parsed_as:"NONE", (rc->warning)?rc->warning->parsed_as:"NONE", (rc->critical)?rc->critical->parsed_as:"NONE", - rc->source + rc->source, + rc->delay_up_duration, + rc->delay_down_duration, + rc->delay_max_duration, + rc->delay_multiplier ); rrdcalc_create_part2(host, rc); @@ -854,14 +863,7 @@ void rrdcalctemplate_link_matching(RRDSET *st) { for(rt = st->rrdhost->templates; rt ; rt = rt->next) { if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context)) { - RRDCALC *rc = rrdcalc_create(st->rrdhost, rt->name, st->id, - rt->dimensions, rt->units, rt->info, rt->group, rt->after, rt->before, rt->update_every, rt->options, - rt->green, rt->red, - rt->exec, rt->recipient, rt->source, - (rt->calculation)?rt->calculation->source:NULL, - (rt->warning)?rt->warning->source:NULL, - (rt->critical)?rt->critical->source:NULL); - + RRDCALC *rc = rrdcalc_create(st->rrdhost, rt, st->id); if(unlikely(!rc)) error("Health tried to create alarm from template '%s', but it failed", rt->name); @@ -926,6 
+928,7 @@ static inline void rrdcalctemplate_free(RRDHOST *host, RRDCALCTEMPLATE *rt) { #define HEALTH_RECIPIENT_KEY "to" #define HEALTH_UNITS_KEY "units" #define HEALTH_INFO_KEY "info" +#define HEALTH_DELAY_KEY "delay" static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) { if(!rc->chart) { @@ -946,9 +949,9 @@ static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) { if (rrdcalc_exists(host, rc->chart, rc->name, rc->hash_chart, rc->hash)) return 0; - rc->id = rrdcalc_get_unique_id(&localhost, rc->chart, rc->name); + rc->id = rrdcalc_get_unique_id(&localhost, rc->chart, rc->name, &rc->next_event_id); - debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s", + debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f", rc->chart?rc->chart:"NOCHART", rc->name, rc->id, @@ -965,7 +968,11 @@ static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) { (rc->calculation)?rc->calculation->parsed_as:"NONE", (rc->warning)?rc->warning->parsed_as:"NONE", (rc->critical)?rc->critical->parsed_as:"NONE", - rc->source + rc->source, + rc->delay_up_duration, + rc->delay_down_duration, + rc->delay_max_duration, + rc->delay_multiplier ); rrdcalc_create_part2(host, rc); @@ -996,7 +1003,7 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL } } - debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every 
%d, calculation '%s', warning '%s', critical '%s', source '%s'", + debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f", rt->name, (rt->context)?rt->context:"NONE", (rt->exec)?rt->exec:"DEFAULT", @@ -1012,7 +1019,11 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL (rt->calculation)?rt->calculation->parsed_as:"NONE", (rt->warning)?rt->warning->parsed_as:"NONE", (rt->critical)?rt->critical->parsed_as:"NONE", - rt->source + rt->source, + rt->delay_up_duration, + rt->delay_down_duration, + rt->delay_max_duration, + rt->delay_multiplier ); if(likely(last)) { @@ -1068,6 +1079,60 @@ static inline int health_parse_duration(char *string, int *result) { return 1; } +static inline int health_parse_delay( + size_t line, const char *path, const char *file, char *string, + int *delay_up_duration, + int *delay_down_duration, + int *delay_max_duration, + float *delay_multiplier) { + + char *s = string; + while(*s) { + char *key = s; + + while(*s && !isspace(*s)) s++; + while(*s && isspace(*s)) *s++ = '\0'; + + if(!*key) break; + + char *value = s; + while(*s && !isspace(*s)) s++; + while(*s && isspace(*s)) *s++ = '\0'; + + if(!strcasecmp(key, "up")) { + if (!health_parse_duration(value, delay_up_duration)) { + error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword", + line, path, file, value, key); + } + } + else if(!strcasecmp(key, "down")) { + if (!health_parse_duration(value, delay_down_duration)) { + error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword", + line, path, file, value, key); + } + } + else if(!strcasecmp(key, "multiplier")) { + *delay_multiplier = strtof(value, NULL); + 
if(isnan(*delay_multiplier) || isinf(*delay_multiplier) || islessequal(*delay_multiplier, 0)) { + error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword", + line, path, file, value, key); + } + } + else if(!strcasecmp(key, "max")) { + if (!health_parse_duration(value, delay_max_duration)) { + error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword", + line, path, file, value, key); + } + } + else { + error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'", + line, path, file, key); + } + } + + return 1; +} + static inline int health_parse_db_lookup( size_t line, const char *path, const char *file, char *string, int *group_method, int *after, int *before, int *every, @@ -1186,7 +1251,7 @@ static inline void strip_quotes(char *s) { int health_readfile(const char *path, const char *filename) { debug(D_HEALTH, "Health configuration reading file '%s/%s'", path, filename); - static uint32_t hash_alarm = 0, hash_template = 0, hash_on = 0, hash_calc = 0, hash_green = 0, hash_red = 0, hash_warn = 0, hash_crit = 0, hash_exec = 0, hash_every = 0, hash_lookup = 0, hash_units = 0, hash_info = 0, hash_recipient = 0; + static uint32_t hash_alarm = 0, hash_template = 0, hash_on = 0, hash_calc = 0, hash_green = 0, hash_red = 0, hash_warn = 0, hash_crit = 0, hash_exec = 0, hash_every = 0, hash_lookup = 0, hash_units = 0, hash_info = 0, hash_recipient = 0, hash_delay = 0; char buffer[HEALTH_CONF_MAX_LINE + 1]; if(unlikely(!hash_alarm)) { @@ -1204,6 +1269,7 @@ int health_readfile(const char *path, const char *filename) { hash_units = simple_hash(HEALTH_UNITS_KEY); hash_info = simple_hash(HEALTH_INFO_KEY); hash_recipient = simple_hash(HEALTH_RECIPIENT_KEY); + hash_delay = simple_hash(HEALTH_DELAY_KEY); } snprintfz(buffer, HEALTH_CONF_MAX_LINE, "%s/%s", path, filename); @@ -1283,6 +1349,7 @@ int health_readfile(const char *path, const char *filename) { rc->red = NAN; rc->value = NAN;
rc->old_value = NAN; + rc->delay_multiplier = 1.0; if(rrdvar_fix_name(rc->name)) error("Health configuration renamed alarm '%s' to '%s'", value, rc->name); @@ -1303,6 +1370,7 @@ int health_readfile(const char *path, const char *filename) { rt->source = health_source_file(line, path, filename); rt->green = NAN; rt->red = NAN; + rt->delay_multiplier = 1.0; if(rrdvar_fix_name(rt->name)) error("Health configuration renamed template '%s' to '%s'", value, rt->name); @@ -1414,6 +1482,9 @@ int health_readfile(const char *path, const char *filename) { rc->info = strdupz(value); strip_quotes(rc->info); } + else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) { + health_parse_delay(line, path, filename, value, &rc->delay_up_duration, &rc->delay_down_duration, &rc->delay_max_duration, &rc->delay_multiplier); + } else { error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has unknown key '%s'.", line, path, filename, rc->name, key); @@ -1526,6 +1597,9 @@ int health_readfile(const char *path, const char *filename) { rt->info = strdupz(value); strip_quotes(rt->info); } + else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) { + health_parse_delay(line, path, filename, value, &rt->delay_up_duration, &rt->delay_down_duration, &rt->delay_max_duration, &rt->delay_multiplier); + } else { error("Health configuration at line %zu of file '%s/%s' for template '%s' has unknown key '%s'.", line, path, filename, rt->name, key); @@ -1615,7 +1689,7 @@ void health_init(void) { } long n = config_get_number("health", "in memory max health log entries", (long)localhost.health_log.max); - if(n < 2) { + if(n < 10) { error("Health configuration has invalid max log entries %ld. 
Using default %u", n, localhost.health_log.max); config_set_number("health", "in memory max health log entries", (long)localhost.health_log.max); } @@ -1638,28 +1712,30 @@ static inline void health_string2json(BUFFER *wb, const char *prefix, const char static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) { buffer_sprintf(wb, "\n\t{\n" - "\t\t\"hostname\":\"%s\",\n" - "\t\t\"unique_id\":%u,\n" - "\t\t\"alarm_id\":%u,\n" - "\t\t\"alarm_event_id\":%u,\n" - "\t\t\"name\":\"%s\",\n" - "\t\t\"chart\":\"%s\",\n" - "\t\t\"family\":\"%s\",\n" - "\t\t\"processed\":%s,\n" - "\t\t\"updated\":%s,\n" - "\t\t\"exec_run\":%s,\n" - "\t\t\"exec_failed\":%s,\n" - "\t\t\"exec\":\"%s\",\n" - "\t\t\"recipient\":\"%s\",\n" - "\t\t\"exec_code\":%d,\n" - "\t\t\"source\":\"%s\",\n" - "\t\t\"units\":\"%s\",\n" - "\t\t\"info\":\"%s\",\n" - "\t\t\"when\":%lu,\n" - "\t\t\"duration\":%lu,\n" - "\t\t\"non_clear_duration\":%lu,\n" - "\t\t\"status\":\"%s\",\n" - "\t\t\"old_status\":\"%s\",\n", + "\t\t\"hostname\": \"%s\",\n" + "\t\t\"unique_id\": %u,\n" + "\t\t\"alarm_id\": %u,\n" + "\t\t\"alarm_event_id\": %u,\n" + "\t\t\"name\": \"%s\",\n" + "\t\t\"chart\": \"%s\",\n" + "\t\t\"family\": \"%s\",\n" + "\t\t\"processed\": %s,\n" + "\t\t\"updated\": %s,\n" + "\t\t\"exec_run\": %lu,\n" + "\t\t\"exec_failed\": %s,\n" + "\t\t\"exec\": \"%s\",\n" + "\t\t\"recipient\": \"%s\",\n" + "\t\t\"exec_code\": %d,\n" + "\t\t\"source\": \"%s\",\n" + "\t\t\"units\": \"%s\",\n" + "\t\t\"info\": \"%s\",\n" + "\t\t\"when\": %lu,\n" + "\t\t\"duration\": %lu,\n" + "\t\t\"non_clear_duration\": %lu,\n" + "\t\t\"status\": \"%s\",\n" + "\t\t\"old_status\": \"%s\",\n" + "\t\t\"delay\": %d,\n" + "\t\t\"delay_up_to_timestamp\": %lu,\n", host->hostname, ae->unique_id, ae->alarm_id, @@ -1669,7 +1745,7 @@ static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, R ae->family, (ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_PROCESSED)?"true":"false", (ae->notifications & 
HEALTH_ENTRY_NOTIFICATIONS_UPDATED)?"true":"false", - (ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_EXEC_RUN)?"true":"false", + (unsigned long)ae->exec_run_timestamp, (ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_EXEC_FAILED)?"true":"false", ae->exec?ae->exec:health_default_exec, ae->recipient?ae->recipient:health_default_recipient, @@ -1681,7 +1757,9 @@ static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, R (unsigned long)ae->duration, (unsigned long)ae->non_clear_duration, rrdcalc_status2string(ae->new_status), - rrdcalc_status2string(ae->old_status) + rrdcalc_status2string(ae->old_status), + ae->delay, + (unsigned long)ae->delay_up_to_timestamp ); buffer_strcat(wb, "\t\t\"value\":"); @@ -1732,6 +1810,12 @@ static inline void health_rrdcalc2json_nolock(BUFFER *wb, RRDCALC *rc) { "\t\t\t\"last_updated\": %lu,\n" "\t\t\t\"next_update\": %lu,\n" "\t\t\t\"update_every\": %d,\n" + "\t\t\t\"delay_up_duration\": %d,\n" + "\t\t\t\"delay_down_duration\": %d,\n" + "\t\t\t\"delay_max_duration\": %d,\n" + "\t\t\t\"delay_multiplier\": %f,\n" + "\t\t\t\"delay\": %d,\n" + "\t\t\t\"delay_up_to_timestamp\": %lu,\n" , rc->chart, rc->name , rc->name , rc->chart @@ -1747,6 +1831,12 @@ static inline void health_rrdcalc2json_nolock(BUFFER *wb, RRDCALC *rc) { , (unsigned long)rc->last_updated , (unsigned long)rc->next_update , rc->update_every + , rc->delay_up_duration + , rc->delay_down_duration + , rc->delay_max_duration + , rc->delay_multiplier + , rc->delay_last + , (unsigned long)rc->delay_up_to_timestamp ); if(RRDCALC_HAS_DB_LOOKUP(rc)) { @@ -1906,6 +1996,8 @@ static inline int rrdcalc_value2status(calculated_number n) { } static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { + ae->notifications |= HEALTH_ENTRY_NOTIFICATIONS_PROCESSED; + if(ae->old_status == RRDCALC_STATUS_UNINITIALIZED && ae->new_status == RRDCALC_STATUS_CLEAR) return; @@ -1941,6 +2033,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { );
ae->notifications |= HEALTH_ENTRY_NOTIFICATIONS_EXEC_RUN; + ae->exec_run_timestamp = time(NULL); debug(D_HEALTH, "executing command '%s'", buffer); FILE *fp = mypopen(buffer, &command_pid); @@ -1971,23 +2064,29 @@ static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) } static inline void health_alarm_log_process(RRDHOST *host) { - static uint32_t last_processed = 0; - ALARM_ENTRY *ae; + static uint32_t stop_at_id = 0; + uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0; + time_t now = time(NULL); pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock); - for(ae = host->health_log.alarms; ae ;ae = ae->next) { - if(last_processed >= ae->unique_id) break; + ALARM_ENTRY *ae; + for(ae = host->health_log.alarms; ae && ae->unique_id >= stop_at_id ; ae = ae->next) { + if(unlikely( + !(ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_PROCESSED) && + !(ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_UPDATED) + )) { - if(!(ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_PROCESSED) && - !(ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_UPDATED)) { - ae->notifications |= HEALTH_ENTRY_NOTIFICATIONS_PROCESSED; - health_process_notifications(host, ae); + if(unlikely(ae->unique_id < first_waiting)) + first_waiting = ae->unique_id; + + if(likely(now > ae->delay_up_to_timestamp)) + health_process_notifications(host, ae); } } - if(host->health_log.alarms) - last_processed = host->health_log.alarms->unique_id; + // remember this for the next iteration + stop_at_id = first_waiting; pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock); @@ -1998,7 +2097,7 @@ static inline void health_alarm_log_process(RRDHOST *host) { pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock); ALARM_ENTRY *last = NULL; - unsigned int count = host->health_log.max; + unsigned int count = host->health_log.max * 2 / 3; for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ; if(ae && last && last->next == ae) @@ 
-2293,7 +2392,33 @@ void *health_main(void *ptr) { } if(status != rc->status) { - health_alarm_log(&localhost, rc->id, rc->next_event_id++, time(NULL), rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info); + int delay = 0; + + if(now > rc->delay_up_to_timestamp) { + rc->delay_up_current = rc->delay_up_duration; + rc->delay_down_current = rc->delay_down_duration; + rc->delay_last = 0; + rc->delay_up_to_timestamp = 0; + } + else { + rc->delay_up_current = (int)(rc->delay_up_current * rc->delay_multiplier); + if(rc->delay_up_current > rc->delay_max_duration) rc->delay_up_current = rc->delay_max_duration; + + rc->delay_down_current = (int)(rc->delay_down_current * rc->delay_multiplier); + if(rc->delay_down_current > rc->delay_max_duration) rc->delay_down_current = rc->delay_max_duration; + } + + if(status > rc->status) + delay = rc->delay_up_current; + else + delay = rc->delay_down_current; + + if(now + delay < rc->delay_up_to_timestamp) + delay = rc->delay_up_to_timestamp - now; + + rc->delay_last = delay; + rc->delay_up_to_timestamp = now + delay; + health_alarm_log(&localhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, rc->delay_last); rc->last_status_change = now; rc->status = status; } diff --git a/src/health.h b/src/health.h index e493078a..17a0f75c 100644 --- a/src/health.h +++ b/src/health.h @@ -121,56 +121,89 @@ typedef struct rrddimvar { #define RRDCALC_FLAG_CRIT_ERROR 0x00000020 typedef struct rrdcalc { - uint32_t id; - uint32_t next_event_id; + uint32_t id; // the unique id of this alarm + uint32_t next_event_id; // the next event id that will be used for this alarm - char *name; - uint32_t hash; + char *name; // the name of this alarm + uint32_t hash; - char 
*exec; - char *recipient; + char *exec; // the command to execute when this alarm switches state + char *recipient; // the recipient of the alarm (the first parameter to exec) - char *chart; // the chart id this should be linked to + char *chart; // the chart id this should be linked to uint32_t hash_chart; - char *source; // the source of this calculation - char *units; - char *info; + char *source; // the source of this alarm + char *units; // the units of the alarm + char *info; // a short description of the alarm - char *dimensions; // the chart dimensions + int update_every; // update frequency for the alarm - int group; // grouping method: average, max, etc. - int before; // ending point in time-series - int after; // starting point in time-series - uint32_t options; // calculation options - int update_every; // update frequency for the calculation + // the red and green threshold of this alarm (to be set to the chart) + calculated_number green; + calculated_number red; - time_t last_updated; - time_t next_update; + // ------------------------------------------------------------------------ + // database lookup settings - EVAL_EXPRESSION *calculation; - EVAL_EXPRESSION *warning; - EVAL_EXPRESSION *critical; + char *dimensions; // the chart dimensions + int group; // grouping method: average, max, etc. 
+ int before; // ending point in time-series + int after; // starting point in time-series + uint32_t options; // calculation options - uint32_t rrdcalc_flags; - int status; + // ------------------------------------------------------------------------ + // expressions related to the alarm - time_t db_after; - time_t db_before; - time_t last_status_change; + EVAL_EXPRESSION *calculation; // expression to calculate the value of the alarm + EVAL_EXPRESSION *warning; // expression to check the warning condition + EVAL_EXPRESSION *critical; // expression to check the critical condition - calculated_number value; - calculated_number old_value; + // ------------------------------------------------------------------------ + // notification delay settings - calculated_number green; - calculated_number red; + int delay_up_duration; // duration to delay notifications when alarm raises + int delay_down_duration; // duration to delay notifications when alarm lowers + int delay_max_duration; // the absolute max delay to apply to this alarm + float delay_multiplier; // multiplier for all delays when alarms switch status + // while now < delay_up_to + + // ------------------------------------------------------------------------ + // runtime information + + int status; // the current status of the alarm + + calculated_number value; // the current value of the alarm + calculated_number old_value; // the previous value of the alarm + + uint32_t rrdcalc_flags; // check RRDCALC_FLAG_* + + time_t last_updated; // the last update timestamp of the alarm + time_t next_update; // the next update timestamp of the alarm + time_t last_status_change; // the timestamp of the last time this alarm changed status + + time_t db_after; // the first timestamp evaluated by the db lookup + time_t db_before; // the last timestamp evaluated by the db lookup + + time_t delay_up_to_timestamp; // the timestamp up to which we should delay notifications + int delay_up_current; // the current up notification 
delay duration + int delay_down_current; // the current down notification delay duration + int delay_last; // the last delay we used + + // ------------------------------------------------------------------------ + // variables this alarm exposes to the rest of the alarms RRDVAR *local; RRDVAR *family; RRDVAR *hostid; RRDVAR *hostname; + // ------------------------------------------------------------------------ + // the chart this alarm it is linked to + struct rrdset *rrdset; + + // linking of this alarm on its chart struct rrdcalc *rrdset_next; struct rrdcalc *rrdset_prev; @@ -192,25 +225,40 @@ typedef struct rrdcalctemplate { char *context; uint32_t hash_context; - char *source; // the source of this template - char *units; - char *info; + char *source; // the source of this alarm + char *units; // the units of the alarm + char *info; // a short description of the alarm - char *dimensions; + int update_every; // update frequency for the alarm - int group; // grouping method: average, max, etc. - int before; // ending point in time-series - int after; // starting point in time-series - uint32_t options; // calculation options - int update_every; // update frequency for the calculation + // the red and green threshold of this alarm (to be set to the chart) + calculated_number green; + calculated_number red; + + // ------------------------------------------------------------------------ + // database lookup settings + + char *dimensions; // the chart dimensions + int group; // grouping method: average, max, etc. 
+ int before; // ending point in time-series + int after; // starting point in time-series + uint32_t options; // calculation options + + // ------------------------------------------------------------------------ + // notification delay settings + + int delay_up_duration; // duration to delay notifications when alarm raises + int delay_down_duration; // duration to delay notifications when alarm lowers + int delay_max_duration; // the absolute max delay to apply to this alarm + float delay_multiplier; // multiplier for all delays when alarms switch status + + // ------------------------------------------------------------------------ + // expressions related to the alarm EVAL_EXPRESSION *calculation; EVAL_EXPRESSION *warning; EVAL_EXPRESSION *critical; - calculated_number green; - calculated_number red; - struct rrdcalctemplate *next; } RRDCALCTEMPLATE; @@ -240,6 +288,7 @@ typedef struct alarm_entry { char *exec; char *recipient; + time_t exec_run_timestamp; int exec_code; char *source; @@ -253,6 +302,9 @@ typedef struct alarm_entry { uint32_t notifications; + int delay; + time_t delay_up_to_timestamp; + struct alarm_entry *updated_by; struct alarm_entry *next; } ALARM_ENTRY; -- 2.39.2