// Returns the value of the "health configuration directory" option.
// The default handed to config_get() is "<netdata_configured_config_dir>/health.d",
// formatted into a local buffer.
// This change replaces the literal "health" section name with CONFIG_SECTION_HEALTH.
// NOTE(review): buffer is a stack array; this presumably relies on config_get()
// never returning the default pointer itself - confirm.
inline char *health_config_dir(void) {
char buffer[FILENAME_MAX + 1];
snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_config_dir);
- return config_get("health", "health configuration directory", buffer);
+ return config_get(CONFIG_SECTION_HEALTH, "health configuration directory", buffer);
}
// Initializes the global health default from the configuration.
// After this change: reads the boolean option "enabled" (default 1) from
// CONFIG_SECTION_HEALTH into default_health_enabled and, when it is false,
// emits a debug message and returns.
void health_init(void) {
debug(D_HEALTH, "Health configuration initializing");
// removed below: the central_netdata_to_push_data special case, which wrote
// "enabled" = 0 into the configuration and cleared default_health_enabled
- if(!central_netdata_to_push_data) {
- if(!(default_health_enabled = config_get_boolean("health", "enabled", 1))) {
- debug(D_HEALTH, "Health is disabled.");
- return;
- }
- }
- else {
- info("Health is disabled - setup alarms at the central netdata.");
- config_set_boolean("health", "enabled", 0);
- default_health_enabled = 0;
+ if(!(default_health_enabled = config_get_boolean(CONFIG_SECTION_HEALTH, "enabled", 1))) {
+ debug(D_HEALTH, "Health is disabled.");
+ return;
}
// removed below: creation of "<netdata_configured_varlib_dir>/health" with
// mode 0770, which called fatal() on any mkdir() error other than EEXIST
-
- char pathname[FILENAME_MAX + 1];
- snprintfz(pathname, FILENAME_MAX, "%s/health", netdata_configured_varlib_dir);
- if(mkdir(pathname, 0770) == -1 && errno != EEXIST)
- fatal("Cannot create directory '%s'.", pathname);
}
// ----------------------------------------------------------------------------
// re-load health configuration
void health_reload_host(RRDHOST *host) {
+ if(unlikely(!host->health_enabled))
+ return;
+
char *path = health_config_dir();
// free all running alarms
return 0;
}
+ if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE))) {
+ debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rc->chart?rc->chart:"NOCHART", rc->name);
+ return 0;
+ }
+
+ if(unlikely(!rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ENABLED))) {
+ debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart is not enabled", rc->chart?rc->chart:"NOCHART", rc->name);
+ return 0;
+ }
+
if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
return 0;
if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
error("Cannot set pthread cancel state to ENABLE.");
- int min_run_every = (int)config_get_number("health", "run at least every seconds", 10);
+ int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10);
if(min_run_every < 1) min_run_every = 1;
BUFFER *wb = buffer_create(100);
+ time_t now = now_realtime_sec();
+ time_t now_boottime = now_boottime_sec();
+ time_t last_now = now;
+ time_t last_now_boottime = now_boottime;
+ time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60);
+
unsigned int loop = 0;
while(!netdata_exit) {
loop++;
debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
- int oldstate, runnable = 0;
- time_t now = now_realtime_sec();
+ int oldstate, runnable = 0, apply_hibernation_delay = 0;
time_t next_run = now + min_run_every;
RRDCALC *rc;
+ // detect whether realtime advanced more than twice as much as boottime since the
+ // previous iteration, in which case we assume the system has just woken up from hibernation
+ if(unlikely(now - last_now > 2 * (now_boottime - last_now_boottime)))
+ apply_hibernation_delay = 1;
+
+ last_now = now;
+ last_now_boottime = now_boottime;
+
if(unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
error("Cannot set pthread cancel state to DISABLE.");
RRDHOST *host;
rrdhost_foreach_read(host) {
- if(unlikely(!host->health_enabled)) continue;
+ if(unlikely(!host->health_enabled))
+ continue;
+
+ if(unlikely(apply_hibernation_delay)) {
+
+ info("Postponing alarm checks for %ld seconds, on host '%s', due to boottime discrepancy (realtime dt: %ld, boottime dt: %ld)."
+ , hibernation_delay
+ , host->hostname
+ , (long)(now - last_now)
+ , (long)(now_boottime - last_now_boottime)
+ );
+
+ host->health_delay_up_to = now + hibernation_delay;
+ }
+
+ if(unlikely(!host->health_enabled || now < host->health_delay_up_to))
+ continue;
rrdhost_rdlock(host);
rc->old_value = rc->value;
rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE;
- // 1. if there is database lookup, do it
- // 2. if there is calculation expression, run it
+ // ------------------------------------------------------------
+ // if there is database lookup, do it
if(unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
/* time_t old_db_timestamp = rc->db_before; */
int value_is_null = 0;
- int ret = rrd2value(rc->rrdset, wb, &rc->value, rc->dimensions, 1, rc->after, rc->before, rc->group, rc->options, &rc->db_after, &rc->db_before, &value_is_null);
+ int ret = rrdset2value_api_v1(rc->rrdset
+ , wb
+ , &rc->value
+ , rc->dimensions
+ , 1
+ , rc->after
+ , rc->before
+ , rc->group
+ , rc->options
+ , &rc->db_after
+ , &rc->db_before
+ , &value_is_null
+ );
if(unlikely(ret != 200)) {
// database lookup failed
rc->value = NAN;
-
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret);
-
- if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) {
- rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
- error("Health on host '%s', alarm '%s.%s': database lookup returned error %d", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret);
- }
+ rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
+
+ debug(D_HEALTH
+ , "Health on host '%s', alarm '%s.%s': database lookup returned error %d"
+ , host->hostname
+ , rc->chart ? rc->chart : "NOCHART"
+ , rc->name
+ , ret
+ );
}
- else if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))
+ else
rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
/* - RRDCALC_FLAG_DB_STALE not currently used
if(unlikely(value_is_null)) {
// collected value is null
-
rc->value = NAN;
+ rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name);
-
- if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) {
- rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
- error("Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name);
- }
+ debug(D_HEALTH
+ , "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)"
+ , host->hostname
+ , rc->chart ? rc->chart : "NOCHART"
+ , rc->name
+ );
}
- else if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))
+ else
rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " CALCULATED_NUMBER_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->value);
+ debug(D_HEALTH
+ , "Health on host '%s', alarm '%s.%s': database lookup gave value " CALCULATED_NUMBER_FORMAT
+ , host->hostname
+ , rc->chart ? rc->chart : "NOCHART"
+ , rc->name
+ , rc->value
+ );
}
+ // ------------------------------------------------------------
+ // if there is calculation expression, run it
+
if(unlikely(rc->calculation)) {
if(unlikely(!expression_evaluate(rc->calculation))) {
// calculation failed
-
rc->value = NAN;
-
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
-
- if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) {
- rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
- error("Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", rc->chart ? rc->chart : "NOCHART", host->hostname, rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
- }
+ rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
+
+ debug(D_HEALTH
+ , "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s"
+ , host->hostname
+ , rc->chart ? rc->chart : "NOCHART"
+ , rc->name
+ , rc->calculation->parsed_as
+ , buffer_tostring(rc->calculation->error_msg)
+ );
}
else {
- if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
-
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
- CALCULATED_NUMBER_FORMAT
- ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name
- , rc->calculation->parsed_as, rc->calculation->result,
- buffer_tostring(rc->calculation->error_msg), rc->source
+ rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
+
+ debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)"
+ , host->hostname
+ , rc->chart ? rc->chart : "NOCHART"
+ , rc->name
+ , rc->calculation->parsed_as
+ , rc->calculation->result
+ , buffer_tostring(rc->calculation->error_msg)
+ , rc->source
);
rc->value = rc->calculation->result;
if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
continue;
- int warning_status = RRDCALC_STATUS_UNDEFINED;
+ int warning_status = RRDCALC_STATUS_UNDEFINED;
int critical_status = RRDCALC_STATUS_UNDEFINED;
+ // --------------------------------------------------------
+ // check the warning expression
+
if(likely(rc->warning)) {
if(unlikely(!expression_evaluate(rc->warning))) {
// calculation failed
-
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
-
- if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) {
- rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
- error("Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
- }
+ rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
+
+ debug(D_HEALTH
+ , "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s"
+ , host->hostname
+ , rc->chart ? rc->chart : "NOCHART"
+ , rc->name
+ , buffer_tostring(rc->warning->error_msg)
+ );
}
else {
- if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
-
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source);
-
+ rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
+ debug(D_HEALTH
+ , "Health on host '%s', alarm '%s.%s': warning expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)"
+ , host->hostname
+ , rc->chart ? rc->chart : "NOCHART"
+ , rc->name
+ , rc->warning->result
+ , buffer_tostring(rc->warning->error_msg)
+ , rc->source
+ );
warning_status = rrdcalc_value2status(rc->warning->result);
}
}
+ // --------------------------------------------------------
+ // check the critical expression
+
if(likely(rc->critical)) {
if(unlikely(!expression_evaluate(rc->critical))) {
// calculation failed
-
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
-
- if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) {
- rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
- error("Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
- }
+ rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
+
+ debug(D_HEALTH
+ , "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s"
+ , host->hostname
+ , rc->chart ? rc->chart : "NOCHART"
+ , rc->name
+ , buffer_tostring(rc->critical->error_msg)
+ );
}
else {
- if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
-
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name , rc->critical->result, buffer_tostring(rc->critical->error_msg), rc->source);
-
+ rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
+ debug(D_HEALTH
+ , "Health on host '%s', alarm '%s.%s': critical expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)"
+ , host->hostname
+ , rc->chart ? rc->chart : "NOCHART"
+ , rc->name
+ , rc->critical->result
+ , buffer_tostring(rc->critical->error_msg)
+ , rc->source
+ );
critical_status = rrdcalc_value2status(rc->critical->result);
}
}
+ // --------------------------------------------------------
+ // decide the final alarm status
+
int status = RRDCALC_STATUS_UNDEFINED;
switch(warning_status) {
break;
}
+ // --------------------------------------------------------
+ // check if the new status and the old differ
+
if(status != rc->status) {
int delay = 0;
+ // apply trigger hysteresis
+
if(now > rc->delay_up_to_timestamp) {
rc->delay_up_current = rc->delay_up_duration;
rc->delay_down_current = rc->delay_down_duration;
rc->delay_last = delay;
rc->delay_up_to_timestamp = now + delay;
+
+ // add the alarm into the log
+
health_alarm_log(
- host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id
- , rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change
- , rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info
- , rc->delay_last, (rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)
- ? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0
+ host
+ , rc->id
+ , rc->next_event_id++
+ , now
+ , rc->name
+ , rc->rrdset->id
+ , rc->rrdset->family
+ , rc->exec
+ , rc->recipient
+ , now - rc->last_status_change
+ , rc->old_value
+ , rc->value
+ , rc->status
+ , status
+ , rc->source
+ , rc->units
+ , rc->info
+ , rc->delay_last
+ , (rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION) ? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0
);
+
rc->last_status_change = now;
rc->status = status;
}
if(unlikely(netdata_exit))
break;
- } /* host loop */
+ } /* rrdhost_foreach */
rrd_unlock();
if(now < next_run) {
debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now));
+ now = now_realtime_sec();
}
else
debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
- }
+
+ now_boottime = now_boottime_sec();
+
+ } // forever
buffer_free(wb);