X-Git-Url: https://arthur.barton.de/gitweb/?a=blobdiff_plain;f=src%2Fhealth.c;h=46b27db6fbfdacc871ceb13ab63cbc9f4b75e4e6;hb=8679670bdbe3c5928ec2e266d9c72e1a758fdf37;hp=494904c097434dbd821b55641e4c797b75402222;hpb=16cb55cb76cb6ef4d8540b62fe51ae6d11f80795;p=netdata.git diff --git a/src/health.c b/src/health.c index 494904c0..46b27db6 100644 --- a/src/health.c +++ b/src/health.c @@ -9,22 +9,15 @@ int default_health_enabled = 1; inline char *health_config_dir(void) { char buffer[FILENAME_MAX + 1]; snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_config_dir); - return config_get("health", "health configuration directory", buffer); + return config_get(CONFIG_SECTION_HEALTH, "health configuration directory", buffer); } void health_init(void) { debug(D_HEALTH, "Health configuration initializing"); - if(!central_netdata_to_push_data) { - if(!(default_health_enabled = config_get_boolean("health", "enabled", 1))) { - debug(D_HEALTH, "Health is disabled."); - return; - } - } - else { - info("Health is disabled - setup alarms at the central netdata."); - config_set_boolean("health", "enabled", 0); - default_health_enabled = 0; + if(!(default_health_enabled = config_get_boolean(CONFIG_SECTION_HEALTH, "enabled", 1))) { + debug(D_HEALTH, "Health is disabled."); + return; } } @@ -32,6 +25,9 @@ void health_init(void) { // re-load health configuration void health_reload_host(RRDHOST *host) { + if(unlikely(!host->health_enabled)) + return; + char *path = health_config_dir(); // free all running alarms @@ -212,7 +208,7 @@ static inline void health_alarm_log_process(RRDHOST *host) { uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0; time_t now = now_realtime_sec(); - pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock); + netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock); ALARM_ENTRY *ae; for(ae = host->health_log.alarms; ae && ae->unique_id >= stop_at_id ; ae = ae->next) { @@ -232,13 +228,13 @@ static inline void health_alarm_log_process(RRDHOST *host) { // remember this for the next iteration stop_at_id = first_waiting; - pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock); + netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); if(host->health_log.count <= host->health_log.max) return; // cleanup excess entries in the log - pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock); + netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock); ALARM_ENTRY *last = NULL; unsigned int count = host->health_log.max * 2 / 3; @@ -260,7 +256,7 @@ static inline void health_alarm_log_process(RRDHOST *host) { host->health_log.count--; } - pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock); + netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); } static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) { @@ -285,6 +281,16 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) return 0; } + if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE))) { + debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rc->chart?rc->chart:"NOCHART", rc->name); + return 0; + } + + if(unlikely(!rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ENABLED))) { + debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart is not enabled", rc->chart?rc->chart:"NOCHART", rc->name); + return 0; + } + if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) { debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name); return 0; @@ -328,21 +334,34 @@ void *health_main(void *ptr) { if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0) error("Cannot set pthread cancel state to ENABLE."); - int min_run_every = (int)config_get_number("health", "run at least every seconds", 10); + int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10); if(min_run_every < 1) min_run_every = 1; BUFFER *wb = buffer_create(100); + time_t now = now_realtime_sec(); + time_t now_boottime = now_boottime_sec(); + time_t last_now = now; + time_t last_now_boottime = now_boottime; + time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60); + unsigned int loop = 0; while(!netdata_exit) { loop++; debug(D_HEALTH, "Health monitoring iteration no %u started", loop); - int oldstate, runnable = 0; - time_t now = now_realtime_sec(); + int oldstate, runnable = 0, apply_hibernation_delay = 0; time_t next_run = now + min_run_every; RRDCALC *rc; + // detect if boottime and realtime have twice the difference + // in which case we assume the system was just waken from hibernation + if(unlikely(now - last_now > 2 * (now_boottime - last_now_boottime))) + apply_hibernation_delay = 1; + + last_now = now; + last_now_boottime = now_boottime; + if(unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0)) error("Cannot set pthread cancel state to DISABLE."); @@ -350,7 +369,23 @@ void *health_main(void *ptr) { RRDHOST *host; rrdhost_foreach_read(host) { - if(unlikely(!host->health_enabled)) continue; + if(unlikely(!host->health_enabled)) + continue; + + if(unlikely(apply_hibernation_delay)) { + + info("Postponing alarm checks for %ld seconds, on host '%s', due to boottime discrepancy (realtime dt: %ld, boottime dt: %ld)." + , hibernation_delay + , host->hostname + , (long)(now - last_now) + , (long)(now_boottime - last_now_boottime) + ); + + host->health_delay_up_to = now + hibernation_delay; + } + + if(unlikely(!host->health_enabled || now < host->health_delay_up_to)) + continue; rrdhost_rdlock(host); @@ -366,27 +401,41 @@ void *health_main(void *ptr) { rc->old_value = rc->value; rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE; - // 1. if there is database lookup, do it - // 2. if there is calculation expression, run it + // ------------------------------------------------------------ + // if there is database lookup, do it if(unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) { /* time_t old_db_timestamp = rc->db_before; */ int value_is_null = 0; - int ret = rrd2value(rc->rrdset, wb, &rc->value, rc->dimensions, 1, rc->after, rc->before, rc->group, rc->options, &rc->db_after, &rc->db_before, &value_is_null); + int ret = rrdset2value_api_v1(rc->rrdset + , wb + , &rc->value + , rc->dimensions + , 1 + , rc->after + , rc->before + , rc->group + , rc->options + , &rc->db_after + , &rc->db_before + , &value_is_null + ); if(unlikely(ret != 200)) { // database lookup failed rc->value = NAN; - - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret); - - if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) { - rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR; - error("Health on host '%s', alarm '%s.%s': database lookup returned error %d", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret); - } + rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR; + + debug(D_HEALTH + , "Health on host '%s', alarm '%s.%s': database lookup returned error %d" + , host->hostname + , rc->chart ? rc->chart : "NOCHART" + , rc->name + , ret + ); } - else if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR)) + else rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR; /* - RRDCALC_FLAG_DB_STALE not currently used @@ -406,44 +455,57 @@ void *health_main(void *ptr) { if(unlikely(value_is_null)) { // collected value is null - rc->value = NAN; + rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN; - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name); - - if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) { - rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN; - error("Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name); - } + debug(D_HEALTH + , "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)" + , host->hostname + , rc->chart ? rc->chart : "NOCHART" + , rc->name + ); } - else if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN)) + else rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN; - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " CALCULATED_NUMBER_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->value); + debug(D_HEALTH + , "Health on host '%s', alarm '%s.%s': database lookup gave value " CALCULATED_NUMBER_FORMAT + , host->hostname + , rc->chart ? rc->chart : "NOCHART" + , rc->name + , rc->value + ); } + // ------------------------------------------------------------ + // if there is calculation expression, run it + if(unlikely(rc->calculation)) { if(unlikely(!expression_evaluate(rc->calculation))) { // calculation failed - rc->value = NAN; - - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg)); - - if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) { - rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR; - error("Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", rc->chart ? rc->chart : "NOCHART", host->hostname, rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg)); - } + rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR; + + debug(D_HEALTH + , "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s" + , host->hostname + , rc->chart ? rc->chart : "NOCHART" + , rc->name + , rc->calculation->parsed_as + , buffer_tostring(rc->calculation->error_msg) + ); } else { - if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR)) - rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR; - - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value " - CALCULATED_NUMBER_FORMAT - ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name - , rc->calculation->parsed_as, rc->calculation->result, - buffer_tostring(rc->calculation->error_msg), rc->source + rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR; + + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)" + , host->hostname + , rc->chart ? rc->chart : "NOCHART" + , rc->name + , rc->calculation->parsed_as + , rc->calculation->result + , buffer_tostring(rc->calculation->error_msg) + , rc->source ); rc->value = rc->calculation->result; @@ -459,51 +521,74 @@ void *health_main(void *ptr) { if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))) continue; - int warning_status = RRDCALC_STATUS_UNDEFINED; + int warning_status = RRDCALC_STATUS_UNDEFINED; int critical_status = RRDCALC_STATUS_UNDEFINED; + // -------------------------------------------------------- + // check the warning expression + if(likely(rc->warning)) { if(unlikely(!expression_evaluate(rc->warning))) { // calculation failed - - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->warning->error_msg)); - - if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) { - rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR; - error("Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->warning->error_msg)); - } + rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR; + + debug(D_HEALTH + , "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s" + , host->hostname + , rc->chart ? rc->chart : "NOCHART" + , rc->name + , buffer_tostring(rc->warning->error_msg) + ); } else { - if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR)) - rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR; - - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source); - + rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR; + debug(D_HEALTH + , "Health on host '%s', alarm '%s.%s': warning expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)" + , host->hostname + , rc->chart ? rc->chart : "NOCHART" + , rc->name + , rc->warning->result + , buffer_tostring(rc->warning->error_msg) + , rc->source + ); warning_status = rrdcalc_value2status(rc->warning->result); } } + // -------------------------------------------------------- + // check the critical expression + if(likely(rc->critical)) { if(unlikely(!expression_evaluate(rc->critical))) { // calculation failed - - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->critical->error_msg)); - - if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) { - rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR; - error("Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->critical->error_msg)); - } + rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR; + + debug(D_HEALTH + , "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s" + , host->hostname + , rc->chart ? rc->chart : "NOCHART" + , rc->name + , buffer_tostring(rc->critical->error_msg) + ); } else { - if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR)) - rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR; - - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name , rc->critical->result, buffer_tostring(rc->critical->error_msg), rc->source); - + rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR; + debug(D_HEALTH + , "Health on host '%s', alarm '%s.%s': critical expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)" + , host->hostname + , rc->chart ? rc->chart : "NOCHART" + , rc->name + , rc->critical->result + , buffer_tostring(rc->critical->error_msg) + , rc->source + ); critical_status = rrdcalc_value2status(rc->critical->result); } } + // -------------------------------------------------------- + // decide the final alarm status + int status = RRDCALC_STATUS_UNDEFINED; switch(warning_status) { @@ -533,9 +618,14 @@ void *health_main(void *ptr) { break; } + // -------------------------------------------------------- + // check if the new status and the old differ + if(status != rc->status) { int delay = 0; + // apply trigger hysteresis + if(now > rc->delay_up_to_timestamp) { rc->delay_up_current = rc->delay_up_duration; rc->delay_down_current = rc->delay_down_duration; @@ -563,13 +653,31 @@ void *health_main(void *ptr) { rc->delay_last = delay; rc->delay_up_to_timestamp = now + delay; + + // add the alarm into the log + health_alarm_log( - host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id - , rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change - , rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info - , rc->delay_last, (rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION) - ? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0 + host + , rc->id + , rc->next_event_id++ + , now + , rc->name + , rc->rrdset->id + , rc->rrdset->family + , rc->exec + , rc->recipient + , now - rc->last_status_change + , rc->old_value + , rc->value + , rc->status + , status + , rc->source + , rc->units + , rc->info + , rc->delay_last + , (rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION) ? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0 ); + rc->last_status_change = now; rc->status = status; } @@ -594,7 +702,7 @@ void *health_main(void *ptr) { if(unlikely(netdata_exit)) break; - } /* host loop */ + } /* rrdhost_foreach */ rrd_unlock(); @@ -608,10 +716,14 @@ void *health_main(void *ptr) { if(now < next_run) { debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now)); sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now)); + now = now_realtime_sec(); } else debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop); - } + + now_boottime = now_boottime_sec(); + + } // forever buffer_free(wb);