From 28a441fb4034d81ce9e0cebad354199ac5b8250a Mon Sep 17 00:00:00 2001 From: "Costa Tsaousis (ktsaou)" Date: Thu, 23 Feb 2017 21:33:03 +0200 Subject: [PATCH] do not log failed expressions - fix for hibernation delay --- src/health.c | 250 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 169 insertions(+), 81 deletions(-) diff --git a/src/health.c b/src/health.c index d8bc9db5..9e02b544 100644 --- a/src/health.c +++ b/src/health.c @@ -333,20 +333,28 @@ void *health_main(void *ptr) { BUFFER *wb = buffer_create(100); + time_t now = now_realtime_sec(); + time_t now_boottime = now_boottime_sec(); + time_t last_now = now; + time_t last_now_boottime = now_boottime; + time_t hibernation_delay = config_get_number("health", "postpone alarms during hibernation for seconds", 60); + unsigned int loop = 0; while(!netdata_exit) { loop++; debug(D_HEALTH, "Health monitoring iteration no %u started", loop); - int oldstate, runnable = 0; - time_t now = now_realtime_sec(); - time_t now_boot = now_boottime_sec(); + int oldstate, runnable = 0, apply_hibernation_delay = 0; time_t next_run = now + min_run_every; RRDCALC *rc; - time_t last_now = now; - time_t last_now_boot = now_boot; - time_t hibernation_delay = config_get_number("health", "postpone alarms during hibernation for seconds", 60); + // detect if boottime and realtime have twice the difference + // in which case we assume the system was just waken from hibernation + if(unlikely(now - last_now > 2 * (now_boottime - last_now_boottime))) + apply_hibernation_delay = 1; + + last_now = now; + last_now_boottime = now_boottime; if(unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0)) error("Cannot set pthread cancel state to DISABLE."); @@ -355,15 +363,20 @@ void *health_main(void *ptr) { RRDHOST *host; rrdhost_foreach_read(host) { - if(now - last_now > 2 * (now_boot - last_now_boot)) { - info("Postponing alarm checks for %ld seconds, on host '%s', due to boottime discrepancy (realtime dt: %ld, boottime dt: %ld).", - hibernation_delay, host->hostname, (long)(now - last_now), (long)(now_boot - last_now_boot)); + if(unlikely(apply_hibernation_delay)) { + + info("Postponing alarm checks for %ld seconds, on host '%s', due to boottime discrepancy (realtime dt: %ld, boottime dt: %ld)." + , hibernation_delay + , host->hostname + , (long)(now - last_now) + , (long)(now_boottime - last_now_boottime) + ); + host->health_delay_up_to = now + hibernation_delay; } - last_now = now; - last_now_boot = now_boot; - if(unlikely(!host->health_enabled || now < host->health_delay_up_to)) continue; + if(unlikely(!host->health_enabled || now < host->health_delay_up_to)) + continue; rrdhost_rdlock(host); @@ -379,27 +392,41 @@ void *health_main(void *ptr) { rc->old_value = rc->value; rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE; - // 1. if there is database lookup, do it - // 2. if there is calculation expression, run it + // ------------------------------------------------------------ + // if there is database lookup, do it if(unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) { /* time_t old_db_timestamp = rc->db_before; */ int value_is_null = 0; - int ret = rrd2value(rc->rrdset, wb, &rc->value, rc->dimensions, 1, rc->after, rc->before, rc->group, rc->options, &rc->db_after, &rc->db_before, &value_is_null); + int ret = rrd2value(rc->rrdset + , wb + , &rc->value + , rc->dimensions + , 1 + , rc->after + , rc->before + , rc->group + , rc->options + , &rc->db_after + , &rc->db_before + , &value_is_null + ); if(unlikely(ret != 200)) { // database lookup failed rc->value = NAN; - - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret); - - if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) { - rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR; - error("Health on host '%s', alarm '%s.%s': database lookup returned error %d", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret); - } + rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR; + + debug(D_HEALTH + , "Health on host '%s', alarm '%s.%s': database lookup returned error %d" + , host->hostname + , rc->chart ? rc->chart : "NOCHART" + , rc->name + , ret + ); } - else if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR)) + else rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR; /* - RRDCALC_FLAG_DB_STALE not currently used @@ -419,44 +446,57 @@ void *health_main(void *ptr) { if(unlikely(value_is_null)) { // collected value is null - rc->value = NAN; + rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN; - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name); - - if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) { - rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN; - error("Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name); - } + debug(D_HEALTH + , "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)" + , host->hostname + , rc->chart ? rc->chart : "NOCHART" + , rc->name + ); } - else if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN)) + else rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN; - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " CALCULATED_NUMBER_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->value); + debug(D_HEALTH + , "Health on host '%s', alarm '%s.%s': database lookup gave value " CALCULATED_NUMBER_FORMAT + , host->hostname + , rc->chart ? rc->chart : "NOCHART" + , rc->name + , rc->value + ); } + // ------------------------------------------------------------ + // if there is calculation expression, run it + if(unlikely(rc->calculation)) { if(unlikely(!expression_evaluate(rc->calculation))) { // calculation failed - rc->value = NAN; - - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg)); - - if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) { - rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR; - error("Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", rc->chart ? rc->chart : "NOCHART", host->hostname, rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg)); - } + rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR; + + debug(D_HEALTH + , "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s" + , host->hostname + , rc->chart ? rc->chart : "NOCHART" + , rc->name + , rc->calculation->parsed_as + , buffer_tostring(rc->calculation->error_msg) + ); } else { - if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR)) - rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR; - - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value " - CALCULATED_NUMBER_FORMAT - ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name - , rc->calculation->parsed_as, rc->calculation->result, - buffer_tostring(rc->calculation->error_msg), rc->source + rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR; + + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)" + , host->hostname + , rc->chart ? rc->chart : "NOCHART" + , rc->name + , rc->calculation->parsed_as + , rc->calculation->result + , buffer_tostring(rc->calculation->error_msg) + , rc->source ); rc->value = rc->calculation->result; @@ -472,51 +512,74 @@ void *health_main(void *ptr) { if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))) continue; - int warning_status = RRDCALC_STATUS_UNDEFINED; + int warning_status = RRDCALC_STATUS_UNDEFINED; int critical_status = RRDCALC_STATUS_UNDEFINED; + // -------------------------------------------------------- + // check the warning expression + if(likely(rc->warning)) { if(unlikely(!expression_evaluate(rc->warning))) { // calculation failed - - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->warning->error_msg)); - - if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) { - rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR; - error("Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->warning->error_msg)); - } + rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR; + + debug(D_HEALTH + , "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s" + , host->hostname + , rc->chart ? rc->chart : "NOCHART" + , rc->name + , buffer_tostring(rc->warning->error_msg) + ); } else { - if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR)) - rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR; - - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source); - + rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR; + debug(D_HEALTH + , "Health on host '%s', alarm '%s.%s': warning expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)" + , host->hostname + , rc->chart ? rc->chart : "NOCHART" + , rc->name + , rc->warning->result + , buffer_tostring(rc->warning->error_msg) + , rc->source + ); warning_status = rrdcalc_value2status(rc->warning->result); } } + // -------------------------------------------------------- + // check the critical expression + if(likely(rc->critical)) { if(unlikely(!expression_evaluate(rc->critical))) { // calculation failed - - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->critical->error_msg)); - - if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) { - rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR; - error("Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->critical->error_msg)); - } + rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR; + + debug(D_HEALTH + , "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s" + , host->hostname + , rc->chart ? rc->chart : "NOCHART" + , rc->name + , buffer_tostring(rc->critical->error_msg) + ); } else { - if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR)) - rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR; - - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name , rc->critical->result, buffer_tostring(rc->critical->error_msg), rc->source); - + rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR; + debug(D_HEALTH + , "Health on host '%s', alarm '%s.%s': critical expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)" + , host->hostname + , rc->chart ? rc->chart : "NOCHART" + , rc->name + , rc->critical->result + , buffer_tostring(rc->critical->error_msg) + , rc->source + ); critical_status = rrdcalc_value2status(rc->critical->result); } } + // -------------------------------------------------------- + // decide the final alarm status + int status = RRDCALC_STATUS_UNDEFINED; switch(warning_status) { @@ -546,9 +609,14 @@ void *health_main(void *ptr) { break; } + // -------------------------------------------------------- + // check if the new status and the old differ + if(status != rc->status) { int delay = 0; + // apply trigger hysteresis + if(now > rc->delay_up_to_timestamp) { rc->delay_up_current = rc->delay_up_duration; rc->delay_down_current = rc->delay_down_duration; @@ -576,13 +644,31 @@ void *health_main(void *ptr) { rc->delay_last = delay; rc->delay_up_to_timestamp = now + delay; + + // add the alarm into the log + health_alarm_log( - host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id - , rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change - , rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info - , rc->delay_last, (rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION) - ? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0 + host + , rc->id + , rc->next_event_id++ + , now + , rc->name + , rc->rrdset->id + , rc->rrdset->family + , rc->exec + , rc->recipient + , now - rc->last_status_change + , rc->old_value + , rc->value + , rc->status + , status + , rc->source + , rc->units + , rc->info + , rc->delay_last + , (rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION) ? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0 ); + rc->last_status_change = now; rc->status = status; } @@ -607,7 +693,7 @@ void *health_main(void *ptr) { if(unlikely(netdata_exit)) break; - } /* host loop */ + } /* rrdhost_foreach */ rrd_unlock(); @@ -621,12 +707,14 @@ void *health_main(void *ptr) { if(now < next_run) { debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now)); sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now)); + now = now_realtime_sec(); } else debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop); - now_boot = now_boottime_sec(); - } + now_boottime = now_boottime_sec(); + + } // forever buffer_free(wb); -- 2.39.2