X-Git-Url: https://arthur.barton.de/gitweb/?a=blobdiff_plain;f=src%2Fhealth.c;h=bd62fbcf8b7fbb8d3ead594422b15df7a61909d4;hb=35c1abbdc2833089667ffde39813d103e65dd54d;hp=dd2d82644e481b60ec6870dc7f9137b039021232;hpb=1c663765f4c9ee9ef25f932909b18b52f08ca1c6;p=netdata.git diff --git a/src/health.c b/src/health.c old mode 100755 new mode 100644 index dd2d8264..bd62fbcf --- a/src/health.c +++ b/src/health.c @@ -1,442 +1,14 @@ +#define NETDATA_HEALTH_INTERNALS #include "common.h" #define RRDVAR_MAX_LENGTH 1024 -struct health_options { - const char *health_default_exec; - const char *health_default_recipient; - const char *log_filename; - size_t log_entries_written; - FILE *log_fp; -}; - -static struct health_options health = { - .health_default_exec = PLUGINS_DIR "/alarm-notify.sh", - .health_default_recipient = "root", - .log_filename = VARLIB_DIR "/health/alarm_log.db", - .log_entries_written = 0, - .log_fp = NULL -}; - -int health_enabled = 1; - -// ---------------------------------------------------------------------------- -// health alarm log load/save -// no need for locking - only one thread is reading / writing the alarms log - -static inline int health_alarm_log_open(void) { - if(health.log_fp) - fclose(health.log_fp); - - health.log_fp = fopen(health.log_filename, "a"); - - if(health.log_fp) { - if (setvbuf(health.log_fp, NULL, _IOLBF, 0) != 0) - error("Health: cannot set line buffering on health log file."); - return 0; - } - - error("Health: cannot open health log file '%s'. Health data will be lost in case of netdata or server crash.", health.log_filename); - return -1; -} - -static inline void health_alarm_log_close(void) { - if(health.log_fp) { - fclose(health.log_fp); - health.log_fp = NULL; - } -} - -static inline void health_log_rotate(void) { - static size_t rotate_every = 0; - - if(unlikely(rotate_every == 0)) { - rotate_every = (size_t)config_get_number("health", "rotate log every lines", 2000); - if(rotate_every < 100) rotate_every = 100; - } - - if(unlikely(health.log_entries_written > rotate_every)) { - health_alarm_log_close(); - - char old_filename[FILENAME_MAX + 1]; - snprintfz(old_filename, FILENAME_MAX, "%s.old", health.log_filename); - - if(unlink(old_filename) == -1 && errno != ENOENT) - error("Health: cannot remove old alarms log file '%s'", old_filename); - - if(link(health.log_filename, old_filename) == -1 && errno != ENOENT) - error("Health: cannot move file '%s' to '%s'.", health.log_filename, old_filename); - - if(unlink(health.log_filename) == -1 && errno != ENOENT) - error("Health: cannot remove old alarms log file '%s'", health.log_filename); - - // open it with truncate - health.log_fp = fopen(health.log_filename, "w"); - - if(health.log_fp) - fclose(health.log_fp); - else - error("Health: cannot truncate health log '%s'", health.log_filename); - - health.log_fp = NULL; - - health.log_entries_written = 0; - health_alarm_log_open(); - } -} - -static inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) { - health_log_rotate(); - - if(likely(health.log_fp)) { - if(unlikely(fprintf(health.log_fp - , "%c\t%s" - "\t%08x\t%08x\t%08x\t%08x\t%08x" - "\t%08x\t%08x\t%08x" - "\t%08x\t%08x\t%08x" - "\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" - "\t%d\t%d\t%d\t%d" - "\t%Lf\t%Lf" - "\n" - , (ae->flags & HEALTH_ENTRY_FLAG_SAVED)?'U':'A' - , host->hostname - - , ae->unique_id - , ae->alarm_id - , ae->alarm_event_id - , ae->updated_by_id - , ae->updates_id - - , (uint32_t)ae->when - , (uint32_t)ae->duration - , (uint32_t)ae->non_clear_duration - , (uint32_t)ae->flags - , (uint32_t)ae->exec_run_timestamp - , (uint32_t)ae->delay_up_to_timestamp - - , (ae->name)?ae->name:"" - , (ae->chart)?ae->chart:"" - , (ae->family)?ae->family:"" - , (ae->exec)?ae->exec:"" - , (ae->recipient)?ae->recipient:"" - , (ae->source)?ae->source:"" - , (ae->units)?ae->units:"" - , (ae->info)?ae->info:"" - - , ae->exec_code - , ae->new_status - , ae->old_status - , ae->delay - - , (long double)ae->new_value - , (long double)ae->old_value - ) < 0)) - error("Health: failed to save alarm log entry. Health data may be lost in case of abnormal restart."); - else { - ae->flags |= HEALTH_ENTRY_FLAG_SAVED; - health.log_entries_written++; - } - } -} - -static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filename) { - static uint32_t max_unique_id = 0, max_alarm_id = 0; - - errno = 0; - - char *s, *buf = mallocz(65536 + 1); - size_t line = 0, len = 0; - ssize_t loaded = 0, updated = 0, errored = 0, duplicate = 0; - - pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock); - - while((s = fgets_trim_len(buf, 65536, fp, &len))) { - health.log_entries_written++; - line++; - - int max_entries = 30, entries = 0; - char *pointers[max_entries]; - - pointers[entries++] = s++; - while(*s) { - if(unlikely(*s == '\t')) { - *s = '\0'; - pointers[entries++] = ++s; - if(entries >= max_entries) { - error("Health: line %zu of file '%s' has more than %d entries. Ignoring excessive entries.", line, filename, max_entries); - break; - } - } - else s++; - } - - if(likely(*pointers[0] == 'U' || *pointers[0] == 'A')) { - ALARM_ENTRY *ae = NULL; - - if(entries < 26) { - error("Health: line %zu of file '%s' should have at least 26 entries, but it has %d. Ignoring it.", line, filename, entries); - errored++; - continue; - } - - // check that we have valid ids - uint32_t unique_id = (uint32_t)strtoul(pointers[2], NULL, 16); - if(!unique_id) { - error("Health: line %zu of file '%s' states alarm entry with invalid unique id %u (%s). Ignoring it.", line, filename, unique_id, pointers[2]); - errored++; - continue; - } - - uint32_t alarm_id = (uint32_t)strtoul(pointers[3], NULL, 16); - if(!alarm_id) { - error("Health: line %zu of file '%s' states alarm entry for invalid alarm id %u (%s). Ignoring it.", line, filename, alarm_id, pointers[3]); - errored++; - continue; - } - - if(unlikely(*pointers[0] == 'A')) { - // make sure it is properly numbered - if(unlikely(host->health_log.alarms && unique_id < host->health_log.alarms->unique_id)) { - error("Health: line %zu of file '%s' has alarm log entry with %u in wrong order. Ignoring it.", line, filename, unique_id); - errored++; - continue; - } - - ae = callocz(1, sizeof(ALARM_ENTRY)); - } - else if(unlikely(*pointers[0] == 'U')) { - // find the original - for(ae = host->health_log.alarms; ae; ae = ae->next) { - if(unlikely(unique_id == ae->unique_id)) { - if(unlikely(*pointers[0] == 'A')) { - error("Health: line %zu of file '%s' adds duplicate alarm log entry with unique id %u. Using the later." - , line, filename, unique_id); - *pointers[0] = 'U'; - duplicate++; - } - break; - } - else if(unlikely(unique_id > ae->unique_id)) { - // no need to continue - // the linked list is sorted - ae = NULL; - break; - } - } - - // if not found, skip this line - if(!ae) { - // error("Health: line %zu of file '%s' updates alarm log entry with unique id %u, but it is not found.", line, filename, unique_id); - continue; - } - } - - // check for a possible host missmatch - //if(strcmp(pointers[1], host->hostname)) - // error("Health: line %zu of file '%s' provides an alarm for host '%s' but this is named '%s'.", line, filename, pointers[1], host->hostname); - - ae->unique_id = unique_id; - ae->alarm_id = alarm_id; - ae->alarm_event_id = (uint32_t)strtoul(pointers[4], NULL, 16); - ae->updated_by_id = (uint32_t)strtoul(pointers[5], NULL, 16); - ae->updates_id = (uint32_t)strtoul(pointers[6], NULL, 16); - - ae->when = (uint32_t)strtoul(pointers[7], NULL, 16); - ae->duration = (uint32_t)strtoul(pointers[8], NULL, 16); - ae->non_clear_duration = (uint32_t)strtoul(pointers[9], NULL, 16); - - ae->flags = (uint32_t)strtoul(pointers[10], NULL, 16); - ae->flags |= HEALTH_ENTRY_FLAG_SAVED; - - ae->exec_run_timestamp = (uint32_t)strtoul(pointers[11], NULL, 16); - ae->delay_up_to_timestamp = (uint32_t)strtoul(pointers[12], NULL, 16); - - if(unlikely(ae->name)) freez(ae->name); - ae->name = strdupz(pointers[13]); - ae->hash_name = simple_hash(ae->name); - - if(unlikely(ae->chart)) freez(ae->chart); - ae->chart = strdupz(pointers[14]); - ae->hash_chart = simple_hash(ae->chart); - - if(unlikely(ae->family)) freez(ae->family); - ae->family = strdupz(pointers[15]); - - if(unlikely(ae->exec)) freez(ae->exec); - ae->exec = strdupz(pointers[16]); - if(!*ae->exec) { freez(ae->exec); ae->exec = NULL; } - - if(unlikely(ae->recipient)) freez(ae->recipient); - ae->recipient = strdupz(pointers[17]); - if(!*ae->recipient) { freez(ae->recipient); ae->recipient = NULL; } - - if(unlikely(ae->source)) freez(ae->source); - ae->source = strdupz(pointers[18]); - if(!*ae->source) { freez(ae->source); ae->source = NULL; } - - if(unlikely(ae->units)) freez(ae->units); - ae->units = strdupz(pointers[19]); - if(!*ae->units) { freez(ae->units); ae->units = NULL; } - - if(unlikely(ae->info)) freez(ae->info); - ae->info = strdupz(pointers[20]); - if(!*ae->info) { freez(ae->info); ae->info = NULL; } - - ae->exec_code = atoi(pointers[21]); - ae->new_status = atoi(pointers[22]); - ae->old_status = atoi(pointers[23]); - ae->delay = atoi(pointers[24]); - - ae->new_value = strtold(pointers[25], NULL); - ae->old_value = strtold(pointers[26], NULL); - - // add it to host if not already there - if(unlikely(*pointers[0] == 'A')) { - ae->next = host->health_log.alarms; - host->health_log.alarms = ae; - loaded++; - } - else updated++; - - if(unlikely(ae->unique_id > max_unique_id)) - max_unique_id = ae->unique_id; - - if(unlikely(ae->alarm_id >= max_alarm_id)) - max_alarm_id = ae->alarm_id; - } - else { - error("Health: line %zu of file '%s' is invalid (unrecognized entry type '%s').", line, filename, pointers[0]); - errored++; - } - } - - pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock); - - freez(buf); - - if(!max_unique_id) max_unique_id = (uint32_t)now_realtime_sec(); - if(!max_alarm_id) max_alarm_id = (uint32_t)now_realtime_sec(); - - host->health_log.next_log_id = max_unique_id + 1; - host->health_log.next_alarm_id = max_alarm_id + 1; - - debug(D_HEALTH, "Health: loaded file '%s' with %zd new alarm entries, updated %zd alarms, errors %zd entries, duplicate %zd", filename, loaded, updated, errored, duplicate); - return loaded; -} - -static inline void health_alarm_log_load(RRDHOST *host) { - health_alarm_log_close(); - - char filename[FILENAME_MAX + 1]; - snprintfz(filename, FILENAME_MAX, "%s.old", health.log_filename); - FILE *fp = fopen(filename, "r"); - if(!fp) - error("Health: cannot open health file: %s", filename); - else { - health_alarm_log_read(host, fp, filename); - fclose(fp); - } - - health.log_entries_written = 0; - fp = fopen(health.log_filename, "r"); - if(!fp) - error("Health: cannot open health file: %s", health.log_filename); - else { - health_alarm_log_read(host, fp, health.log_filename); - fclose(fp); - } - - health_alarm_log_open(); -} - - -// ---------------------------------------------------------------------------- -// health alarm log management - -static inline void health_alarm_log(RRDHOST *host, - uint32_t alarm_id, uint32_t alarm_event_id, - time_t when, - const char *name, const char *chart, const char *family, - const char *exec, const char *recipient, time_t duration, - calculated_number old_value, calculated_number new_value, - int old_status, int new_status, - const char *source, - const char *units, - const char *info, - int delay -) { - debug(D_HEALTH, "Health adding alarm log entry with id: %u", host->health_log.next_log_id); - - ALARM_ENTRY *ae = callocz(1, sizeof(ALARM_ENTRY)); - ae->name = strdupz(name); - ae->hash_name = simple_hash(ae->name); - - if(chart) { - ae->chart = strdupz(chart); - ae->hash_chart = simple_hash(ae->chart); - } - - if(family) - ae->family = strdupz(family); - - if(exec) ae->exec = strdupz(exec); - if(recipient) ae->recipient = strdupz(recipient); - if(source) ae->source = strdupz(source); - if(units) ae->units = strdupz(units); - if(info) ae->info = strdupz(info); - - ae->unique_id = host->health_log.next_log_id++; - ae->alarm_id = alarm_id; - ae->alarm_event_id = alarm_event_id; - ae->when = when; - ae->old_value = old_value; - ae->new_value = new_value; - ae->old_status = old_status; - ae->new_status = new_status; - ae->duration = duration; - ae->delay = delay; - ae->delay_up_to_timestamp = when + delay; - - if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL) - ae->non_clear_duration += ae->duration; - - // link it - pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock); - ae->next = host->health_log.alarms; - host->health_log.alarms = ae; - host->health_log.count++; - pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock); - - // match previous alarms - pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock); - ALARM_ENTRY *t; - for(t = host->health_log.alarms ; t ; t = t->next) { - if(t != ae && t->alarm_id == ae->alarm_id) { - if(!(t->flags & HEALTH_ENTRY_FLAG_UPDATED) && !t->updated_by_id) { - t->flags |= HEALTH_ENTRY_FLAG_UPDATED; - t->updated_by_id = ae->unique_id; - ae->updates_id = t->unique_id; - - if((t->new_status == RRDCALC_STATUS_WARNING || t->new_status == RRDCALC_STATUS_CRITICAL) && - (t->old_status == RRDCALC_STATUS_WARNING || t->old_status == RRDCALC_STATUS_CRITICAL)) - ae->non_clear_duration += t->non_clear_duration; - - health_alarm_log_save(host, t); - } - - // no need to continue - break; - } - } - pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock); - - health_alarm_log_save(host, ae); -} +int default_localhost_health_enabled = 1; // ---------------------------------------------------------------------------- // RRDVAR management -static inline int rrdvar_fix_name(char *variable) { +inline int rrdvar_fix_name(char *variable) { int fixed = 0; while(*variable) { if (!isalnum(*variable) && *variable != '.' && *variable != '_') { @@ -1095,7 +667,27 @@ static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) { { time_t now = now_realtime_sec(); - health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_UNINITIALIZED, rc->source, rc->units, rc->info, 0); + health_alarm_log( + st->rrdhost, + rc->id, + rc->next_event_id++, + now, + rc->name, + rc->rrdset->id, + rc->rrdset->family, + rc->exec, + rc->recipient, + now - rc->last_status_change, + rc->old_value, + rc->value, + rc->status, + RRDCALC_STATUS_UNINITIALIZED, + rc->source, + rc->units, + rc->info, + 0, + 0 + ); } } @@ -1133,7 +725,27 @@ inline void rrdsetcalc_unlink(RRDCALC *rc) { { time_t now = now_realtime_sec(); - health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0); + health_alarm_log( + st->rrdhost, + rc->id, + rc->next_event_id++, + now, + rc->name, + rc->rrdset->id, + rc->rrdset->family, + rc->exec, + rc->recipient, + now - rc->last_status_change, + rc->old_value, + rc->value, + rc->status, + RRDCALC_STATUS_REMOVED, + rc->source, + rc->units, + rc->info, + 0, + 0 + ); } RRDHOST *host = st->rrdhost; @@ -1183,7 +795,7 @@ RRDCALC *rrdcalc_find(RRDSET *st, const char *name) { return NULL; } -static inline int rrdcalc_exists(RRDHOST *host, const char *chart, const char *name, uint32_t hash_chart, uint32_t hash_name) { +inline int rrdcalc_exists(RRDHOST *host, const char *chart, const char *name, uint32_t hash_chart, uint32_t hash_name) { RRDCALC *rc; if(unlikely(!chart)) { @@ -1206,7 +818,7 @@ static inline int rrdcalc_exists(RRDHOST *host, const char *chart, const char *n return 0; } -static inline uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const char *name, uint32_t *next_event_id) { +inline uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const char *name, uint32_t *next_event_id) { if(chart && name) { uint32_t hash_chart = simple_hash(chart); uint32_t hash_name = simple_hash(name); @@ -1224,7 +836,7 @@ static inline uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, c return host->health_log.next_alarm_id++; } -static inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) { +inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) { rrdhost_check_rdlock(host); if(rc->calculation) { @@ -1400,7 +1012,8 @@ void rrdcalctemplate_link_matching(RRDSET *st) { RRDCALCTEMPLATE *rt; for(rt = st->rrdhost->templates; rt ; rt = rt->next) { - if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context)) { + if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context) + && (!rt->family_pattern || simple_pattern_matches(rt->family_pattern, st->family))) { RRDCALC *rc = rrdcalc_create(st->rrdhost, rt, st->id); if(unlikely(!rc)) error("Health tried to create alarm from template '%s', but it failed", rt->name); @@ -1413,7 +1026,7 @@ void rrdcalctemplate_link_matching(RRDSET *st) { } } -static inline void rrdcalctemplate_free(RRDHOST *host, RRDCALCTEMPLATE *rt) { +inline void rrdcalctemplate_free(RRDHOST *host, RRDCALCTEMPLATE *rt) { debug(D_HEALTH, "Health removing template '%s' of host '%s'", rt->name, host->hostname); if(host->templates) { @@ -1436,6 +1049,9 @@ static inline void rrdcalctemplate_free(RRDHOST *host, RRDCALCTEMPLATE *rt) { expression_free(rt->warning); expression_free(rt->critical); + freez(rt->family_match); + simple_pattern_free(rt->family_pattern); + freez(rt->name); freez(rt->exec); freez(rt->recipient); @@ -1448,1167 +1064,129 @@ static inline void rrdcalctemplate_free(RRDHOST *host, RRDCALCTEMPLATE *rt) { } // ---------------------------------------------------------------------------- -// load health configuration - -#define HEALTH_CONF_MAX_LINE 4096 - -#define HEALTH_ALARM_KEY "alarm" -#define HEALTH_TEMPLATE_KEY "template" -#define HEALTH_ON_KEY "on" -#define HEALTH_LOOKUP_KEY "lookup" -#define HEALTH_CALC_KEY "calc" -#define HEALTH_EVERY_KEY "every" -#define HEALTH_GREEN_KEY "green" -#define HEALTH_RED_KEY "red" -#define HEALTH_WARN_KEY "warn" -#define HEALTH_CRIT_KEY "crit" -#define HEALTH_EXEC_KEY "exec" -#define HEALTH_RECIPIENT_KEY "to" -#define HEALTH_UNITS_KEY "units" -#define HEALTH_INFO_KEY "info" -#define HEALTH_DELAY_KEY "delay" - -static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) { - if(!rc->chart) { - error("Health configuration for alarm '%s' does not have a chart", rc->name); - return 0; - } +// health initialization - if(!rc->update_every) { - error("Health configuration for alarm '%s.%s' has no frequency (parameter 'every'). Ignoring it.", rc->chart?rc->chart:"NOCHART", rc->name); - return 0; - } +inline char *health_config_dir(void) { + char buffer[FILENAME_MAX + 1]; + snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_config_dir); + return config_get("health", "health configuration directory", buffer); +} - if(!RRDCALC_HAS_DB_LOOKUP(rc) && !rc->warning && !rc->critical) { - error("Health configuration for alarm '%s.%s' is useless (no calculation, no warning and no critical evaluation)", rc->chart?rc->chart:"NOCHART", rc->name); - return 0; +void health_init(void) { + debug(D_HEALTH, "Health configuration initializing"); + + if(!(default_localhost_health_enabled = config_get_boolean("health", "enabled", 1))) { + debug(D_HEALTH, "Health is disabled."); + return; } - if (rrdcalc_exists(host, rc->chart, rc->name, rc->hash_chart, rc->hash)) - return 0; + char pathname[FILENAME_MAX + 1]; + snprintfz(pathname, FILENAME_MAX, "%s/health", netdata_configured_varlib_dir); + if(mkdir(pathname, 0770) == -1 && errno != EEXIST) + fatal("Cannot create directory '%s'.", pathname); +} - rc->id = rrdcalc_get_unique_id(&localhost, rc->chart, rc->name, &rc->next_event_id); +// ---------------------------------------------------------------------------- +// re-load health configuration - debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f", - rc->chart?rc->chart:"NOCHART", - rc->name, - rc->id, - (rc->exec)?rc->exec:"DEFAULT", - (rc->recipient)?rc->recipient:"DEFAULT", - rc->green, - rc->red, - rc->group, - rc->after, - rc->before, - rc->options, - (rc->dimensions)?rc->dimensions:"NONE", - rc->update_every, - (rc->calculation)?rc->calculation->parsed_as:"NONE", - (rc->warning)?rc->warning->parsed_as:"NONE", - (rc->critical)?rc->critical->parsed_as:"NONE", - rc->source, - rc->delay_up_duration, - rc->delay_down_duration, - rc->delay_max_duration, - rc->delay_multiplier - ); +inline void health_free_host_nolock(RRDHOST *host) { + while(host->templates) + rrdcalctemplate_free(host, host->templates); - rrdcalc_create_part2(host, rc); - return 1; + while(host->alarms) + rrdcalc_free(host, host->alarms); } -static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCALCTEMPLATE *rt) { - if(unlikely(!rt->context)) { - error("Health configuration for template '%s' does not have a context", rt->name); - return 0; - } +void health_reload_host(RRDHOST *host) { + char *path = health_config_dir(); - if(unlikely(!rt->update_every)) { - error("Health configuration for template '%s' has no frequency (parameter 'every'). Ignoring it.", rt->name); - return 0; - } + // free all running alarms + rrdhost_wrlock(host); + health_free_host_nolock(host); + rrdhost_unlock(host); - if(unlikely(!RRDCALCTEMPLATE_HAS_CALCULATION(rt) && !rt->warning && !rt->critical)) { - error("Health configuration for template '%s' is useless (no calculation, no warning and no critical evaluation)", rt->name); - return 0; + // invalidate all previous entries in the alarm log + ALARM_ENTRY *t; + for(t = host->health_log.alarms ; t ; t = t->next) { + if(t->new_status != RRDCALC_STATUS_REMOVED) + t->flags |= HEALTH_ENTRY_FLAG_UPDATED; } - RRDCALCTEMPLATE *t, *last = NULL; - for (t = host->templates; t ; last = t, t = t->next) { - if(unlikely(t->hash_name == rt->hash_name && !strcmp(t->name, rt->name))) { - error("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname); - return 0; - } + // reset all thresholds to all charts + RRDSET *st; + for(st = host->rrdset_root; st ; st = st->next) { + st->green = NAN; + st->red = NAN; } - debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f", - rt->name, - (rt->context)?rt->context:"NONE", - (rt->exec)?rt->exec:"DEFAULT", - (rt->recipient)?rt->recipient:"DEFAULT", - rt->green, - rt->red, - rt->group, - rt->after, - rt->before, - rt->options, - (rt->dimensions)?rt->dimensions:"NONE", - rt->update_every, - (rt->calculation)?rt->calculation->parsed_as:"NONE", - (rt->warning)?rt->warning->parsed_as:"NONE", - (rt->critical)?rt->critical->parsed_as:"NONE", - rt->source, - rt->delay_up_duration, - rt->delay_down_duration, - rt->delay_max_duration, - rt->delay_multiplier - ); + // load the new alarms + rrdhost_wrlock(host); + health_readdir(host, path); + rrdhost_unlock(host); - if(likely(last)) { - last->next = rt; - } - else { - rt->next = host->templates; - host->templates = rt; - } + // link the loaded alarms to their charts + for(st = host->rrdset_root; st ; st = st->next) { + rrdhost_wrlock(host); - return 1; -} + rrdsetcalc_link_matching(st); + rrdcalctemplate_link_matching(st); -static inline int health_parse_duration(char *string, int *result) { - // make sure it is a number - if(!*string || !(isdigit(*string) || *string == '+' || *string == '-')) { - *result = 0; - return 0; + rrdhost_unlock(host); } +} - char *e = NULL; - calculated_number n = strtold(string, &e); - if(e && *e) { - switch (*e) { - case 'Y': - *result = (int) (n * 86400 * 365); - break; - case 'M': - *result = (int) (n * 86400 * 30); - break; - case 'w': - *result = (int) (n * 86400 * 7); - break; - case 'd': - *result = (int) (n * 86400); - break; - case 'h': - *result = (int) (n * 3600); - break; - case 'm': - *result = (int) (n * 60); - break; - - default: - case 's': - *result = (int) (n); - break; - } - } - else - *result = (int)(n); +void health_reload(void) { + RRDHOST *host; - return 1; + for(host = localhost; host ; host = host->next) + health_reload_host(host); } -static inline int health_parse_delay( - size_t line, const char *path, const char *file, char *string, - int *delay_up_duration, - int *delay_down_duration, - int *delay_max_duration, - float *delay_multiplier) { +// ---------------------------------------------------------------------------- +// health main thread and friends - char given_up = 0; - char given_down = 0; - char given_max = 0; - char given_multiplier = 0; +static inline int rrdcalc_value2status(calculated_number n) { + if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED; + if(n) return RRDCALC_STATUS_RAISED; + return RRDCALC_STATUS_CLEAR; +} - char *s = string; - while(*s) { - char *key = s; +#define ALARM_EXEC_COMMAND_LENGTH 8192 - while(*s && !isspace(*s)) s++; - while(*s && isspace(*s)) *s++ = '\0'; +static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { + ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED; - if(!*key) break; + if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) { + // do not send notifications for internal statuses + debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status)); + goto done; + } - char *value = s; - while(*s && !isspace(*s)) s++; - while(*s && isspace(*s)) *s++ = '\0'; + if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) { + // do not send notifications for disabled statuses + debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status)); + // mark it as run, so that we will send the same alarm if it happens again + goto done; + } - if(!strcasecmp(key, "up")) { - if (!health_parse_duration(value, delay_up_duration)) { - error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword", - line, path, file, value, key); - } - else given_up = 1; - } - else if(!strcasecmp(key, "down")) { - if (!health_parse_duration(value, delay_down_duration)) { - error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword", - line, path, file, value, key); - } - else given_down = 1; - } - else if(!strcasecmp(key, "multiplier")) { - *delay_multiplier = strtof(value, NULL); - if(isnan(*delay_multiplier) || isinf(*delay_multiplier) || islessequal(*delay_multiplier, 0)) { - error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword", - line, path, file, value, key); - } - else given_multiplier = 1; + // find the previous notification for the same alarm + // which we have run the exec script + // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set + if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) { + uint32_t id = ae->alarm_id; + ALARM_ENTRY *t; + for(t = ae->next; t ; t = t->next) { + if(t->alarm_id == id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN) + break; } - else if(!strcasecmp(key, "max")) { - if (!health_parse_duration(value, delay_max_duration)) { - error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword", - line, path, file, value, key); - } - else given_max = 1; - } - else { - error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'", - line, path, file, key); - } - } - - if(!given_up) - *delay_up_duration = 0; - - if(!given_down) - *delay_down_duration = 0; - - if(!given_multiplier) - *delay_multiplier = 1.0; - - if(!given_max) { - if((*delay_max_duration) < (*delay_up_duration) * (*delay_multiplier)) - *delay_max_duration = (*delay_up_duration) * (*delay_multiplier); - - if((*delay_max_duration) < (*delay_down_duration) * (*delay_multiplier)) - *delay_max_duration = (*delay_down_duration) * (*delay_multiplier); - } - - return 1; -} - -static inline int health_parse_db_lookup( - size_t line, const char *path, const char *file, char *string, - int *group_method, int *after, int *before, int *every, - uint32_t *options, char **dimensions -) { - debug(D_HEALTH, "Health configuration parsing database lookup %zu@%s/%s: %s", line, path, file, string); - - if(*dimensions) freez(*dimensions); - *dimensions = NULL; - *after = 0; - *before = 0; - *every = 0; - *options = 0; - - char *s = string, *key; - - // first is the group method - key = s; - while(*s && !isspace(*s)) s++; - while(*s && isspace(*s)) *s++ = '\0'; - if(!*s) { - error("Health configuration invalid chart calculation at line %zu of file '%s/%s': expected group method followed by the 'after' time, but got '%s'", - line, path, file, key); - return 0; - } - - if((*group_method = web_client_api_request_v1_data_group(key, -1)) == -1) { - error("Health configuration at line %zu of file '%s/%s': invalid group method '%s'", - line, path, file, key); - return 0; - } - - // then is the 'after' time - key = s; - while(*s && !isspace(*s)) s++; - while(*s && isspace(*s)) *s++ = '\0'; - - if(!health_parse_duration(key, after)) { - error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' after group method", - line, path, file, key); - return 0; - } - - // sane defaults - *every = abs(*after); - - // now we may have optional parameters - while(*s) { - key = s; - while(*s && !isspace(*s)) s++; - while(*s && isspace(*s)) *s++ = '\0'; - if(!*key) break; - - if(!strcasecmp(key, "at")) { - char *value = s; - while(*s && !isspace(*s)) s++; - while(*s && isspace(*s)) *s++ = '\0'; - - if (!health_parse_duration(value, before)) { - error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword", - line, path, file, value, key); - } - } - else if(!strcasecmp(key, HEALTH_EVERY_KEY)) { - char *value = s; - while(*s && !isspace(*s)) s++; - while(*s && isspace(*s)) *s++ = '\0'; - - if (!health_parse_duration(value, every)) { - error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword", - line, path, file, value, key); - } - } - else if(!strcasecmp(key, "absolute") || !strcasecmp(key, "abs") || !strcasecmp(key, "absolute_sum")) { - *options |= RRDR_OPTION_ABSOLUTE; - } - else if(!strcasecmp(key, "min2max")) { - *options |= RRDR_OPTION_MIN2MAX; - } - else if(!strcasecmp(key, "null2zero")) { - *options |= RRDR_OPTION_NULL2ZERO; - } - else if(!strcasecmp(key, "percentage")) { - *options |= RRDR_OPTION_PERCENTAGE; - } - else if(!strcasecmp(key, "unaligned")) { - *options |= RRDR_OPTION_NOT_ALIGNED; - } - else if(!strcasecmp(key, "of")) { - if(*s && strcasecmp(s, "all")) - *dimensions = strdupz(s); - break; - } - else { - error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'", - line, path, file, key); - } - } - - return 1; -} - -static inline char *tabs2spaces(char *s) { - char *t = s; - while(*t) { - if(unlikely(*t == '\t')) *t = ' '; - t++; - } - - return s; -} - -static inline char *health_source_file(size_t line, const char *path, const char *filename) { - char buffer[FILENAME_MAX + 1]; - snprintfz(buffer, FILENAME_MAX, "%zu@%s/%s", line, path, filename); - return strdupz(buffer); -} - -static inline void strip_quotes(char *s) { - while(*s) { - if(*s == '\'' || *s == '"') *s = ' '; - s++; - } -} - -int health_readfile(const char *path, const char *filename) { - debug(D_HEALTH, "Health configuration reading file '%s/%s'", path, filename); - - static uint32_t hash_alarm = 0, hash_template = 0, hash_on = 0, hash_calc = 0, hash_green = 0, hash_red = 0, hash_warn = 0, hash_crit = 0, hash_exec = 0, hash_every = 0, hash_lookup = 0, hash_units = 0, hash_info = 0, hash_recipient = 0, hash_delay = 0; - char buffer[HEALTH_CONF_MAX_LINE + 1]; - - if(unlikely(!hash_alarm)) { - hash_alarm = simple_uhash(HEALTH_ALARM_KEY); - hash_template = simple_uhash(HEALTH_TEMPLATE_KEY); - hash_on = simple_uhash(HEALTH_ON_KEY); - hash_calc = simple_uhash(HEALTH_CALC_KEY); - hash_lookup = simple_uhash(HEALTH_LOOKUP_KEY); - hash_green = simple_uhash(HEALTH_GREEN_KEY); - hash_red = simple_uhash(HEALTH_RED_KEY); - hash_warn = simple_uhash(HEALTH_WARN_KEY); - hash_crit = simple_uhash(HEALTH_CRIT_KEY); - hash_exec = simple_uhash(HEALTH_EXEC_KEY); - hash_every = simple_uhash(HEALTH_EVERY_KEY); - hash_units = simple_hash(HEALTH_UNITS_KEY); - hash_info = simple_hash(HEALTH_INFO_KEY); - hash_recipient = simple_hash(HEALTH_RECIPIENT_KEY); - hash_delay = simple_uhash(HEALTH_DELAY_KEY); - } - - snprintfz(buffer, HEALTH_CONF_MAX_LINE, "%s/%s", path, filename); - FILE *fp = fopen(buffer, "r"); - if(!fp) { - error("Health configuration cannot read file '%s'.", buffer); - return 0; - } - - RRDCALC *rc = NULL; - RRDCALCTEMPLATE *rt = NULL; - - size_t line = 0, append = 0; - char *s; - while((s = fgets(&buffer[append], (int)(HEALTH_CONF_MAX_LINE - append), fp)) || append) { - int stop_appending = !s; - line++; - s = trim(buffer); - if(!s) continue; - - append = strlen(s); - if(!stop_appending && s[append - 1] == '\\') { - s[append - 1] = ' '; - append = &s[append] - buffer; - if(append < HEALTH_CONF_MAX_LINE) - continue; - else { - error("Health configuration has too long muli-line at line %zu of file '%s/%s'.", line, path, filename); - } - } - append = 0; - - char *key = s; - while(*s && *s != ':') s++; - if(!*s) { - error("Health configuration has invalid line %zu of file '%s/%s'. It does not contain a ':'. Ignoring it.", line, path, filename); - continue; - } - *s = '\0'; - s++; - - char *value = s; - key = trim(key); - value = trim(value); - - if(!key) { - error("Health configuration has invalid line %zu of file '%s/%s'. Keyword is empty. Ignoring it.", line, path, filename); - continue; - } - - if(!value) { - error("Health configuration has invalid line %zu of file '%s/%s'. value is empty. Ignoring it.", line, path, filename); - continue; - } - - uint32_t hash = simple_uhash(key); - - if(hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) { - if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc)) - rrdcalc_free(&localhost, rc); - - if(rt) { - if (!rrdcalctemplate_add_template_from_config(&localhost, rt)) - rrdcalctemplate_free(&localhost, rt); - rt = NULL; - } - - rc = callocz(1, sizeof(RRDCALC)); - rc->next_event_id = 1; - rc->name = tabs2spaces(strdupz(value)); - rc->hash = simple_hash(rc->name); - rc->source = health_source_file(line, path, filename); - rc->green = NAN; - rc->red = NAN; - rc->value = NAN; - rc->old_value = NAN; - rc->delay_multiplier = 1.0; - - if(rrdvar_fix_name(rc->name)) - error("Health configuration renamed alarm '%s' to '%s'", value, rc->name); - } - else if(hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY)) { - if(rc) { - if(!rrdcalc_add_alarm_from_config(&localhost, rc)) - rrdcalc_free(&localhost, rc); - rc = NULL; - } - - if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt)) - rrdcalctemplate_free(&localhost, rt); - - rt = callocz(1, sizeof(RRDCALCTEMPLATE)); - rt->name = tabs2spaces(strdupz(value)); - rt->hash_name = simple_hash(rt->name); - rt->source = health_source_file(line, path, filename); - rt->green = NAN; - rt->red = NAN; - rt->delay_multiplier = 1.0; - - if(rrdvar_fix_name(rt->name)) - error("Health configuration renamed template '%s' to '%s'", value, rt->name); - } - else if(rc) { - if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) { - if(rc->chart) { - if(strcmp(rc->chart, value)) - error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, path, filename, rc->name, key, rc->chart, value, value); - - freez(rc->chart); - } - rc->chart = tabs2spaces(strdupz(value)); - rc->hash_chart = simple_hash(rc->chart); - } - else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) { - health_parse_db_lookup(line, path, filename, value, &rc->group, &rc->after, &rc->before, - &rc->update_every, - &rc->options, &rc->dimensions); - } - else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) { - if(!health_parse_duration(value, &rc->update_every)) - error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.", - line, path, filename, rc->name, key, value); - } - else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) { - char *e; - rc->green = strtold(value, &e); - if(e && *e) { - error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.", - line, path, filename, rc->name, key, e); - } - } - else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) { - char *e; - rc->red = strtold(value, &e); - if(e && *e) { - error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.", - line, path, filename, rc->name, key, e); - } - } - else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) { - const char *failed_at = NULL; - int error = 0; - rc->calculation = expression_parse(value, &failed_at, &error); - if(!rc->calculation) { - error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'", - line, path, filename, rc->name, key, value, expression_strerror(error), failed_at); - } - } - else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) { - const char *failed_at = NULL; - int error = 0; - rc->warning = expression_parse(value, &failed_at, &error); - if(!rc->warning) { - error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'", - line, path, filename, rc->name, key, value, expression_strerror(error), failed_at); - } - } - else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) { - const char *failed_at = NULL; - int error = 0; - rc->critical = expression_parse(value, &failed_at, &error); - if(!rc->critical) { - error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'", - line, path, filename, rc->name, key, value, expression_strerror(error), failed_at); - } - } - else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) { - if(rc->exec) { - if(strcmp(rc->exec, value)) - error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, path, filename, rc->name, key, rc->exec, value, value); - - freez(rc->exec); - } - rc->exec = tabs2spaces(strdupz(value)); - } - else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) { - if(rc->recipient) { - if(strcmp(rc->recipient, value)) - error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, path, filename, rc->name, key, rc->recipient, value, value); - - freez(rc->recipient); - } - rc->recipient = tabs2spaces(strdupz(value)); - } - else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) { - if(rc->units) { - if(strcmp(rc->units, value)) - error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, path, filename, rc->name, key, rc->units, value, value); - - freez(rc->units); - } - rc->units = tabs2spaces(strdupz(value)); - strip_quotes(rc->units); - } - else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) { - if(rc->info) { - if(strcmp(rc->info, value)) - error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, path, filename, rc->name, key, rc->info, value, value); - - freez(rc->info); - } - rc->info = tabs2spaces(strdupz(value)); - strip_quotes(rc->info); - } - else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) { - health_parse_delay(line, path, filename, value, &rc->delay_up_duration, &rc->delay_down_duration, &rc->delay_max_duration, &rc->delay_multiplier); - } - else { - error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has unknown key '%s'.", - line, path, filename, rc->name, key); - } - } - else if(rt) { - if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) { - if(rt->context) { - if(strcmp(rt->context, value)) - error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, path, filename, rt->name, key, rt->context, value, value); - - freez(rt->context); - } - rt->context = tabs2spaces(strdupz(value)); - rt->hash_context = simple_hash(rt->context); - } - else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) { - health_parse_db_lookup(line, path, filename, value, &rt->group, &rt->after, &rt->before, - &rt->update_every, - &rt->options, &rt->dimensions); - } - else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) { - if(!health_parse_duration(value, &rt->update_every)) - error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' cannot parse duration: '%s'.", - line, path, filename, rt->name, key, value); - } - else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) { - char *e; - rt->green = strtold(value, &e); - if(e && *e) { - error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.", - line, path, filename, rt->name, key, e); - } - } - else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) { - char *e; - rt->red = strtold(value, &e); - if(e && *e) { - error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.", - line, path, filename, rt->name, key, e); - } - } - else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) { - const char *failed_at = NULL; - int error = 0; - rt->calculation = expression_parse(value, &failed_at, &error); - if(!rt->calculation) { - error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'", - line, path, filename, rt->name, key, value, expression_strerror(error), failed_at); - } - } - else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) { - const char *failed_at = NULL; - int error = 0; - rt->warning = expression_parse(value, &failed_at, &error); - if(!rt->warning) { - error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'", - line, path, filename, rt->name, key, value, expression_strerror(error), failed_at); - } - } - else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) { - const char *failed_at = NULL; - int error = 0; - rt->critical = expression_parse(value, &failed_at, &error); - if(!rt->critical) { - error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'", - line, path, filename, rt->name, key, value, expression_strerror(error), failed_at); - } - } - else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) { - if(rt->exec) { - if(strcmp(rt->exec, value)) - error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, path, filename, rt->name, key, rt->exec, value, value); - - freez(rt->exec); - } - rt->exec = tabs2spaces(strdupz(value)); - } - else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) { - if(rt->recipient) { - if(strcmp(rt->recipient, value)) - error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, path, filename, rt->name, key, rt->recipient, value, value); - - freez(rt->recipient); - } - rt->recipient = tabs2spaces(strdupz(value)); - } - else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) { - if(rt->units) { - if(strcmp(rt->units, value)) - error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, path, filename, rt->name, key, rt->units, value, value); - - freez(rt->units); - } - rt->units = tabs2spaces(strdupz(value)); - strip_quotes(rt->units); - } - else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) { - if(rt->info) { - if(strcmp(rt->info, value)) - error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, path, filename, rt->name, key, rt->info, value, value); - - freez(rt->info); - } - rt->info = tabs2spaces(strdupz(value)); - strip_quotes(rt->info); - } - else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) { - health_parse_delay(line, path, filename, value, &rt->delay_up_duration, &rt->delay_down_duration, &rt->delay_max_duration, &rt->delay_multiplier); - } - else { - error("Health configuration at line %zu of file '%s/%s' for template '%s' has unknown key '%s'.", - line, path, filename, rt->name, key); - } - } - else { - error("Health configuration at line %zu of file '%s/%s' has unknown key '%s'. Expected either '" HEALTH_ALARM_KEY "' or '" HEALTH_TEMPLATE_KEY "'.", - line, path, filename, key); - } - } - - if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc)) - rrdcalc_free(&localhost, rc); - - if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt)) - rrdcalctemplate_free(&localhost, rt); - - fclose(fp); - return 1; -} - -void health_readdir(const char *path) { - size_t pathlen = strlen(path); - - debug(D_HEALTH, "Health configuration reading directory '%s'", path); - - DIR *dir = opendir(path); - if (!dir) { - error("Health configuration cannot open directory '%s'.", path); - return; - } - - struct dirent *de = NULL; - while ((de = readdir(dir))) { - size_t len = strlen(de->d_name); - - if(de->d_type == DT_DIR - && ( - (de->d_name[0] == '.' && de->d_name[1] == '\0') - || (de->d_name[0] == '.' && de->d_name[1] == '.' && de->d_name[2] == '\0') - )) { - debug(D_HEALTH, "Ignoring directory '%s'", de->d_name); - continue; - } - - else if(de->d_type == DT_DIR) { - char *s = mallocz(pathlen + strlen(de->d_name) + 2); - strcpy(s, path); - strcat(s, "/"); - strcat(s, de->d_name); - health_readdir(s); - freez(s); - continue; - } - - else if((de->d_type == DT_LNK || de->d_type == DT_REG || de->d_type == DT_UNKNOWN) && - len > 5 && !strcmp(&de->d_name[len - 5], ".conf")) { - health_readfile(path, de->d_name); - } - - else debug(D_HEALTH, "Ignoring file '%s'", de->d_name); - } - - closedir(dir); -} - -static inline char *health_config_dir(void) { - char buffer[FILENAME_MAX + 1]; - snprintfz(buffer, FILENAME_MAX, "%s/health.d", config_get("global", "config directory", CONFIG_DIR)); - return config_get("health", "health configuration directory", buffer); -} - -void health_init(void) { - debug(D_HEALTH, "Health configuration initializing"); - - if(!(health_enabled = config_get_boolean("health", "enabled", 1))) { - debug(D_HEALTH, "Health is disabled."); - return; - } - - char *pathname = config_get("health", "health db directory", VARLIB_DIR "/health"); - if(mkdir(pathname, 0770) == -1 && errno != EEXIST) - fatal("Cannot create directory '%s'.", pathname); - - char filename[FILENAME_MAX + 1]; - snprintfz(filename, FILENAME_MAX, "%s/health-log.db", pathname); - health.log_filename = config_get("health", "health db file", filename); - - health_alarm_log_load(&localhost); - health_alarm_log_open(); - - char *path = health_config_dir(); - - { - char buffer[FILENAME_MAX + 1]; - snprintfz(buffer, FILENAME_MAX, "%s/alarm-notify.sh", config_get("global", "plugins directory", PLUGINS_DIR)); - health.health_default_exec = config_get("health", "script to execute on alarm", buffer); - } - - long n = config_get_number("health", "in memory max health log entries", (long)localhost.health_log.max); - if(n < 10) { - error("Health configuration has invalid max log entries %ld. Using default %u", n, localhost.health_log.max); - config_set_number("health", "in memory max health log entries", (long)localhost.health_log.max); - } - else localhost.health_log.max = (unsigned int)n; - - rrdhost_rwlock(&localhost); - health_readdir(path); - rrdhost_unlock(&localhost); -} - -// ---------------------------------------------------------------------------- -// JSON generation - -static inline void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) { - if(value && *value) - buffer_sprintf(wb, "%s\"%s\":\"%s\"%s", prefix, label, value, suffix); - else - buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix); -} - -static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) { - buffer_sprintf(wb, "\n\t{\n" - "\t\t\"hostname\": \"%s\",\n" - "\t\t\"unique_id\": %u,\n" - "\t\t\"alarm_id\": %u,\n" - "\t\t\"alarm_event_id\": %u,\n" - "\t\t\"name\": \"%s\",\n" - "\t\t\"chart\": \"%s\",\n" - "\t\t\"family\": \"%s\",\n" - "\t\t\"processed\": %s,\n" - "\t\t\"updated\": %s,\n" - "\t\t\"exec_run\": %lu,\n" - "\t\t\"exec_failed\": %s,\n" - "\t\t\"exec\": \"%s\",\n" - "\t\t\"recipient\": \"%s\",\n" - "\t\t\"exec_code\": %d,\n" - "\t\t\"source\": \"%s\",\n" - "\t\t\"units\": \"%s\",\n" - "\t\t\"info\": \"%s\",\n" - "\t\t\"when\": %lu,\n" - "\t\t\"duration\": %lu,\n" - "\t\t\"non_clear_duration\": %lu,\n" - "\t\t\"status\": \"%s\",\n" - "\t\t\"old_status\": \"%s\",\n" - "\t\t\"delay\": %d,\n" - "\t\t\"delay_up_to_timestamp\": %lu,\n" - "\t\t\"updated_by_id\": %u,\n" - "\t\t\"updates_id\": %u,\n", - host->hostname, - ae->unique_id, - ae->alarm_id, - ae->alarm_event_id, - ae->name, - ae->chart, - ae->family, - (ae->flags & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false", - (ae->flags & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false", - (unsigned long)ae->exec_run_timestamp, - (ae->flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false", - ae->exec?ae->exec:health.health_default_exec, - ae->recipient?ae->recipient:health.health_default_recipient, - ae->exec_code, - ae->source, - ae->units?ae->units:"", - ae->info?ae->info:"", - (unsigned long)ae->when, - (unsigned long)ae->duration, - (unsigned long)ae->non_clear_duration, - rrdcalc_status2string(ae->new_status), - rrdcalc_status2string(ae->old_status), - ae->delay, - (unsigned long)ae->delay_up_to_timestamp, - ae->updated_by_id, - ae->updates_id - ); - - buffer_strcat(wb, "\t\t\"value\":"); - buffer_rrd_value(wb, ae->new_value); - buffer_strcat(wb, ",\n"); - - buffer_strcat(wb, "\t\t\"old_value\":"); - buffer_rrd_value(wb, ae->old_value); - buffer_strcat(wb, "\n"); - - buffer_strcat(wb, "\t}"); -} - -void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after) { - pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock); - - buffer_strcat(wb, "["); - - unsigned int max = host->health_log.max; - unsigned int count = 0; - ALARM_ENTRY *ae; - for(ae = host->health_log.alarms; ae && count < max ; count++, ae = ae->next) { - if(ae->unique_id > after) { - if(likely(count)) buffer_strcat(wb, ","); - health_alarm_entry2json_nolock(wb, ae, host); - } - } - - buffer_strcat(wb, "\n]\n"); - - pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock); -} - -static inline void health_rrdcalc2json_nolock(BUFFER *wb, RRDCALC *rc) { - buffer_sprintf(wb, - "\t\t\"%s.%s\": {\n" - "\t\t\t\"id\": %lu,\n" - "\t\t\t\"name\": \"%s\",\n" - "\t\t\t\"chart\": \"%s\",\n" - "\t\t\t\"family\": \"%s\",\n" - "\t\t\t\"active\": %s,\n" - "\t\t\t\"exec\": \"%s\",\n" - "\t\t\t\"recipient\": \"%s\",\n" - "\t\t\t\"source\": \"%s\",\n" - "\t\t\t\"units\": \"%s\",\n" - "\t\t\t\"info\": \"%s\",\n" - "\t\t\t\"status\": \"%s\",\n" - "\t\t\t\"last_status_change\": %lu,\n" - "\t\t\t\"last_updated\": %lu,\n" - "\t\t\t\"next_update\": %lu,\n" - "\t\t\t\"update_every\": %d,\n" - "\t\t\t\"delay_up_duration\": %d,\n" - "\t\t\t\"delay_down_duration\": %d,\n" - "\t\t\t\"delay_max_duration\": %d,\n" - "\t\t\t\"delay_multiplier\": %f,\n" - "\t\t\t\"delay\": %d,\n" - "\t\t\t\"delay_up_to_timestamp\": %lu,\n" - , rc->chart, rc->name - , (unsigned long)rc->id - , rc->name - , rc->chart - , (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:"" - , (rc->rrdset)?"true":"false" - , rc->exec?rc->exec:health.health_default_exec - , rc->recipient?rc->recipient:health.health_default_recipient - , rc->source - , rc->units?rc->units:"" - , rc->info?rc->info:"" - , rrdcalc_status2string(rc->status) - , (unsigned long)rc->last_status_change - , (unsigned long)rc->last_updated - , (unsigned long)rc->next_update - , rc->update_every - , rc->delay_up_duration - , rc->delay_down_duration - , rc->delay_max_duration - , rc->delay_multiplier - , rc->delay_last - , (unsigned long)rc->delay_up_to_timestamp - ); - - if(RRDCALC_HAS_DB_LOOKUP(rc)) { - if(rc->dimensions && *rc->dimensions) - health_string2json(wb, "\t\t\t", "lookup_dimensions", rc->dimensions, ",\n"); - - buffer_sprintf(wb, - "\t\t\t\"db_after\": %lu,\n" - "\t\t\t\"db_before\": %lu,\n" - "\t\t\t\"lookup_method\": \"%s\",\n" - "\t\t\t\"lookup_after\": %d,\n" - "\t\t\t\"lookup_before\": %d,\n" - "\t\t\t\"lookup_options\": \"", - (unsigned long) rc->db_after, - (unsigned long) rc->db_before, - group_method2string(rc->group), - rc->after, - rc->before - ); - buffer_data_options2string(wb, rc->options); - buffer_strcat(wb, "\",\n"); - } - - if(rc->calculation) { - health_string2json(wb, "\t\t\t", "calc", rc->calculation->source, ",\n"); - health_string2json(wb, "\t\t\t", "calc_parsed", rc->calculation->parsed_as, ",\n"); - } - - if(rc->warning) { - health_string2json(wb, "\t\t\t", "warn", rc->warning->source, ",\n"); - health_string2json(wb, "\t\t\t", "warn_parsed", rc->warning->parsed_as, ",\n"); - } - - if(rc->critical) { - health_string2json(wb, "\t\t\t", "crit", rc->critical->source, ",\n"); - health_string2json(wb, "\t\t\t", "crit_parsed", rc->critical->parsed_as, ",\n"); - } - - buffer_strcat(wb, "\t\t\t\"green\":"); - buffer_rrd_value(wb, rc->green); - buffer_strcat(wb, ",\n"); - - buffer_strcat(wb, "\t\t\t\"red\":"); - buffer_rrd_value(wb, rc->red); - buffer_strcat(wb, ",\n"); - - buffer_strcat(wb, "\t\t\t\"value\":"); - buffer_rrd_value(wb, rc->value); - buffer_strcat(wb, "\n"); - - buffer_strcat(wb, "\t\t}"); -} - -//void health_rrdcalctemplate2json_nolock(BUFFER *wb, RRDCALCTEMPLATE *rt) { -// -//} - -void health_alarms2json(RRDHOST *host, BUFFER *wb, int all) { - int i; - - rrdhost_rdlock(&localhost); - buffer_sprintf(wb, "{\n\t\"hostname\": \"%s\"," - "\n\t\"latest_alarm_log_unique_id\": %u," - "\n\t\"status\": %s," - "\n\t\"now\": %lu," - "\n\t\"alarms\": {\n", - host->hostname, - (host->health_log.next_log_id > 0)?(host->health_log.next_log_id - 1):0, - health_enabled?"true":"false", - (unsigned long)now_realtime_sec()); - - RRDCALC *rc; - for(i = 0, rc = host->alarms; rc ; rc = rc->next) { - if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec)) - continue; - - if(likely(!all && !(rc->status == RRDCALC_STATUS_WARNING || rc->status == RRDCALC_STATUS_CRITICAL))) - continue; - - if(likely(i)) buffer_strcat(wb, ",\n"); - health_rrdcalc2json_nolock(wb, rc); - i++; - } - -// buffer_strcat(wb, "\n\t},\n\t\"templates\": {"); -// RRDCALCTEMPLATE *rt; -// for(rt = host->templates; rt ; rt = rt->next) -// health_rrdcalctemplate2json_nolock(wb, rt); - - buffer_strcat(wb, "\n\t}\n}\n"); - rrdhost_unlock(&localhost); -} - - -// ---------------------------------------------------------------------------- -// re-load health configuration - -static inline void health_free_all_nolock(RRDHOST *host) { - while(host->templates) - rrdcalctemplate_free(host, host->templates); - - while(host->alarms) - rrdcalc_free(host, host->alarms); -} - -void health_reload(void) { - if(!health_enabled) { - error("Health reload is requested, but health is not enabled."); - return; - } - - char *path = health_config_dir(); - - // free all running alarms - rrdhost_rwlock(&localhost); - health_free_all_nolock(&localhost); - rrdhost_unlock(&localhost); - - // invalidate all previous entries in the alarm log - ALARM_ENTRY *t; - for(t = localhost.health_log.alarms ; t ; t = t->next) { - if(t->new_status != RRDCALC_STATUS_REMOVED) - t->flags |= HEALTH_ENTRY_FLAG_UPDATED; - } - - // reset all thresholds to all charts - RRDSET *st; - for(st = localhost.rrdset_root; st ; st = st->next) { - st->green = NAN; - st->red = NAN; - } - - // load the new alarms - rrdhost_rwlock(&localhost); - health_readdir(path); - rrdhost_unlock(&localhost); - - // link the loaded alarms to their charts - for(st = localhost.rrdset_root; st ; st = st->next) { - rrdhost_rwlock(&localhost); - - rrdsetcalc_link_matching(st); - rrdcalctemplate_link_matching(st); - - rrdhost_unlock(&localhost); - } -} - -// ---------------------------------------------------------------------------- -// health main thread and friends - -static inline int rrdcalc_value2status(calculated_number n) { - if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED; - if(n) return RRDCALC_STATUS_RAISED; - return RRDCALC_STATUS_CLEAR; -} - -#define ALARM_EXEC_COMMAND_LENGTH 8192 - -static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { - ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED; - - if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) { - // do not send notifications for internal statuses - goto done; - } - - // find the previous notification for the same alarm - // which we have run the exec script - { - uint32_t id = ae->alarm_id; - ALARM_ENTRY *t; - for(t = ae->next; t ; t = t->next) { - if(t->alarm_id == id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN) - break; - } - - if(likely(t)) { - // we have executed this alarm notification in the past - if(t && t->new_status == ae->new_status) { - // don't send the notification for the same status again - debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name - , rrdcalc_status2string(ae->new_status)); - goto done; + + if(likely(t)) { + // we have executed this alarm notification in the past + if(t && t->new_status == ae->new_status) { + // don't send the notification for the same status again + debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name + , rrdcalc_status2string(ae->new_status)); + goto done; } } else { @@ -2625,13 +1203,10 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1]; pid_t command_pid; - const char *exec = ae->exec; - if(!exec) exec = health.health_default_exec; - - const char *recipient = ae->recipient; - if(!recipient) recipient = health.health_default_recipient; + const char *exec = (ae->exec) ? ae->exec : host->health_default_exec; + const char *recipient = (ae->recipient) ? ae->recipient : host->health_default_recipient; - snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s'", + snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s' '%s' '%s'", exec, recipient, host->hostname, @@ -2650,7 +1225,9 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { (uint32_t)ae->duration, (uint32_t)ae->non_clear_duration, ae->units?ae->units:"", - ae->info?ae->info:"" + ae->info?ae->info:"", + ae->new_value_string, + ae->old_value_string ); ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN; @@ -2742,6 +1319,8 @@ static inline void health_alarm_log_process(RRDHOST *host) { freez(ae->source); freez(ae->units); freez(ae->info); + freez(ae->old_value_string); + freez(ae->new_value_string); freez(ae); ae = t; @@ -2806,7 +1385,7 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) } void *health_main(void *ptr) { - (void)ptr; + struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr; info("HEALTH thread created with task id %d", gettid()); @@ -2822,7 +1401,7 @@ void *health_main(void *ptr) { BUFFER *wb = buffer_create(100); unsigned int loop = 0; - while(health_enabled && !netdata_exit) { + while(!netdata_exit) { loop++; debug(D_HEALTH, "Health monitoring iteration no %u started", loop); @@ -2834,285 +1413,274 @@ void *health_main(void *ptr) { if(unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0)) error("Cannot set pthread cancel state to DISABLE."); - rrdhost_rdlock(&localhost); + RRDHOST *host; + for(host = localhost; host ; host = host->next) { + if(unlikely(!host->health_enabled)) continue; - // the first loop is to lookup values from the db - for(rc = localhost.alarms; rc; rc = rc->next) { - if(unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) { - if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)) - rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE; - continue; - } + rrdhost_rdlock(host); - runnable++; - rc->old_value = rc->value; - rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE; + // the first loop is to lookup values from the db + for(rc = host->alarms; rc; rc = rc->next) { + if(unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) { + if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)) + rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE; + continue; + } + + runnable++; + rc->old_value = rc->value; + rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE; - // 1. if there is database lookup, do it - // 2. if there is calculation expression, run it + // 1. if there is database lookup, do it + // 2. if there is calculation expression, run it - if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) { - /* time_t old_db_timestamp = rc->db_before; */ - int value_is_null = 0; + if(unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) { + /* time_t old_db_timestamp = rc->db_before; */ + int value_is_null = 0; - int ret = rrd2value(rc->rrdset, wb, &rc->value, - rc->dimensions, 1, rc->after, rc->before, rc->group, - rc->options, &rc->db_after, &rc->db_before, &value_is_null); + int ret = rrd2value(rc->rrdset, wb, &rc->value, rc->dimensions, 1, rc->after, rc->before, rc->group, rc->options, &rc->db_after, &rc->db_before, &value_is_null); - if (unlikely(ret != 200)) { - // database lookup failed - rc->value = NAN; + if(unlikely(ret != 200)) { + // database lookup failed + rc->value = NAN; - debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret); + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret); - if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) { - rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR; - error("Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret); + if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) { + rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR; + error("Health on host '%s', alarm '%s.%s': database lookup returned error %d", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret); + } } - } - else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR)) - rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR; + else if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR)) + rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR; - /* - RRDCALC_FLAG_DB_STALE not currently used - if (unlikely(old_db_timestamp == rc->db_before)) { - // database is stale + /* - RRDCALC_FLAG_DB_STALE not currently used + if (unlikely(old_db_timestamp == rc->db_before)) { + // database is stale - debug(D_HEALTH, "Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name); + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name); - if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) { - rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE; - error("Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name); + if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) { + rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE; + error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name); + } } - } - else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE)) - rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE; - */ + else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE)) + rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE; + */ - if (unlikely(value_is_null)) { - // collected value is null + if(unlikely(value_is_null)) { + // collected value is null - rc->value = NAN; + rc->value = NAN; - debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", - rc->chart?rc->chart:"NOCHART", rc->name); + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name); - if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) { - rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN; - error("Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", - rc->chart?rc->chart:"NOCHART", rc->name); + if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) { + rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN; + error("Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name); + } } + else if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN)) + rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN; + + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " CALCULATED_NUMBER_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->value); } - else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN)) - rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN; - debug(D_HEALTH, "Health alarm '%s.%s': database lookup gave value " - CALCULATED_NUMBER_FORMAT, rc->chart?rc->chart:"NOCHART", rc->name, rc->value); - } + if(unlikely(rc->calculation)) { + if(unlikely(!expression_evaluate(rc->calculation))) { + // calculation failed - if(unlikely(rc->calculation)) { - if (unlikely(!expression_evaluate(rc->calculation))) { - // calculation failed + rc->value = NAN; - rc->value = NAN; + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg)); - debug(D_HEALTH, "Health alarm '%s.%s': expression '%s' failed: %s", - rc->chart?rc->chart:"NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg)); + if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) { + rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR; + error("Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", rc->chart ? rc->chart : "NOCHART", host->hostname, rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg)); + } + } + else { + if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR)) + rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR; + + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value " + CALCULATED_NUMBER_FORMAT + ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name + , rc->calculation->parsed_as, rc->calculation->result, + buffer_tostring(rc->calculation->error_msg), rc->source + ); - if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) { - rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR; - error("Health alarm '%s.%s': expression '%s' failed: %s", - rc->chart?rc->chart:"NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg)); + rc->value = rc->calculation->result; } } - else { - if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR)) - rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR; - - debug(D_HEALTH, "Health alarm '%s.%s': expression '%s' gave value " - CALCULATED_NUMBER_FORMAT - ": %s (source: %s)", - rc->chart?rc->chart:"NOCHART", rc->name, - rc->calculation->parsed_as, - rc->calculation->result, - buffer_tostring(rc->calculation->error_msg), - rc->source - ); - - rc->value = rc->calculation->result; - } } - } - rrdhost_unlock(&localhost); + rrdhost_unlock(host); - if(unlikely(runnable && !netdata_exit)) { - rrdhost_rdlock(&localhost); + if(unlikely(runnable && !netdata_exit)) { + rrdhost_rdlock(host); - for(rc = localhost.alarms; rc; rc = rc->next) { - if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))) - continue; + for(rc = host->alarms; rc; rc = rc->next) { + if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))) + continue; - int warning_status = RRDCALC_STATUS_UNDEFINED; - int critical_status = RRDCALC_STATUS_UNDEFINED; + int warning_status = RRDCALC_STATUS_UNDEFINED; + int critical_status = RRDCALC_STATUS_UNDEFINED; - if(likely(rc->warning)) { - if(unlikely(!expression_evaluate(rc->warning))) { - // calculation failed + if(likely(rc->warning)) { + if(unlikely(!expression_evaluate(rc->warning))) { + // calculation failed - debug(D_HEALTH, "Health alarm '%s.%s': warning expression failed with error: %s", - rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg)); + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->warning->error_msg)); - if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) { - rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR; - error("Health alarm '%s.%s': warning expression failed with error: %s", - rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg)); + if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) { + rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR; + error("Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->warning->error_msg)); + } } - } - else { - if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR)) - rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR; + else { + if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR)) + rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR; - debug(D_HEALTH, "Health alarm '%s.%s': warning expression gave value " - CALCULATED_NUMBER_FORMAT - ": %s (source: %s)", - rc->chart?rc->chart:"NOCHART", rc->name, - rc->warning->result, - buffer_tostring(rc->warning->error_msg), - rc->source - ); + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source); - warning_status = rrdcalc_value2status(rc->warning->result); + warning_status = rrdcalc_value2status(rc->warning->result); + } } - } - if(likely(rc->critical)) { - if(unlikely(!expression_evaluate(rc->critical))) { - // calculation failed + if(likely(rc->critical)) { + if(unlikely(!expression_evaluate(rc->critical))) { + // calculation failed - debug(D_HEALTH, "Health alarm '%s.%s': critical expression failed with error: %s", - rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg)); + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->critical->error_msg)); - if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) { - rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR; - error("Health alarm '%s.%s': critical expression failed with error: %s", - rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg)); + if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) { + rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR; + error("Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->critical->error_msg)); + } } - } - else { - if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR)) - rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR; + else { + if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR)) + rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR; - debug(D_HEALTH, "Health alarm '%s.%s': critical expression gave value " - CALCULATED_NUMBER_FORMAT - ": %s (source: %s)", - rc->chart?rc->chart:"NOCHART", rc->name, - rc->critical->result, - buffer_tostring(rc->critical->error_msg), - rc->source - ); + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name , rc->critical->result, buffer_tostring(rc->critical->error_msg), rc->source); - critical_status = rrdcalc_value2status(rc->critical->result); + critical_status = rrdcalc_value2status(rc->critical->result); + } } - } - int status = RRDCALC_STATUS_UNDEFINED; + int status = RRDCALC_STATUS_UNDEFINED; - switch(warning_status) { - case RRDCALC_STATUS_CLEAR: - status = RRDCALC_STATUS_CLEAR; - break; + switch(warning_status) { + case RRDCALC_STATUS_CLEAR: + status = RRDCALC_STATUS_CLEAR; + break; - case RRDCALC_STATUS_RAISED: - status = RRDCALC_STATUS_WARNING; - break; + case RRDCALC_STATUS_RAISED: + status = RRDCALC_STATUS_WARNING; + break; - default: - break; - } + default: + break; + } - switch(critical_status) { - case RRDCALC_STATUS_CLEAR: - if(status == RRDCALC_STATUS_UNDEFINED) - status = RRDCALC_STATUS_CLEAR; - break; + switch(critical_status) { + case RRDCALC_STATUS_CLEAR: + if(status == RRDCALC_STATUS_UNDEFINED) + status = RRDCALC_STATUS_CLEAR; + break; - case RRDCALC_STATUS_RAISED: - status = RRDCALC_STATUS_CRITICAL; - break; + case RRDCALC_STATUS_RAISED: + status = RRDCALC_STATUS_CRITICAL; + break; - default: - break; - } + default: + break; + } - if(status != rc->status) { - int delay = 0; + if(status != rc->status) { + int delay = 0; - if(now > rc->delay_up_to_timestamp) { - rc->delay_up_current = rc->delay_up_duration; - rc->delay_down_current = rc->delay_down_duration; - rc->delay_last = 0; - rc->delay_up_to_timestamp = 0; - } - else { - rc->delay_up_current = (int)(rc->delay_up_current * rc->delay_multiplier); - if(rc->delay_up_current > rc->delay_max_duration) rc->delay_up_current = rc->delay_max_duration; + if(now > rc->delay_up_to_timestamp) { + rc->delay_up_current = rc->delay_up_duration; + rc->delay_down_current = rc->delay_down_duration; + rc->delay_last = 0; + rc->delay_up_to_timestamp = 0; + } + else { + rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier); + if(rc->delay_up_current > rc->delay_max_duration) + rc->delay_up_current = rc->delay_max_duration; + + rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier); + if(rc->delay_down_current > rc->delay_max_duration) + rc->delay_down_current = rc->delay_max_duration; + } - rc->delay_down_current = (int)(rc->delay_down_current * rc->delay_multiplier); - if(rc->delay_down_current > rc->delay_max_duration) rc->delay_down_current = rc->delay_max_duration; + if(status > rc->status) + delay = rc->delay_up_current; + else + delay = rc->delay_down_current; + + // COMMENTED: because we do need to send raising alarms + // if(now + delay < rc->delay_up_to_timestamp) + // delay = (int)(rc->delay_up_to_timestamp - now); + + rc->delay_last = delay; + rc->delay_up_to_timestamp = now + delay; + health_alarm_log( + host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id + , rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change + , rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info + , rc->delay_last, (rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION) + ? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0 + ); + rc->last_status_change = now; + rc->status = status; } - if(status > rc->status) - delay = rc->delay_up_current; - else - delay = rc->delay_down_current; + rc->last_updated = now; + rc->next_update = now + rc->update_every; - // COMMENTED: because we do need to send raising alarms - // if(now + delay < rc->delay_up_to_timestamp) - // delay = (int)(rc->delay_up_to_timestamp - now); - - rc->delay_last = delay; - rc->delay_up_to_timestamp = now + delay; - health_alarm_log(&localhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, rc->delay_last); - rc->last_status_change = now; - rc->status = status; + if(next_run > rc->next_update) + next_run = rc->next_update; } - rc->last_updated = now; - rc->next_update = now + rc->update_every; - - if (next_run > rc->next_update) - next_run = rc->next_update; + rrdhost_unlock(host); } - rrdhost_unlock(&localhost); - } + if(unlikely(netdata_exit)) + break; - if (unlikely(pthread_setcancelstate(oldstate, NULL) != 0)) - error("Cannot set pthread cancel state to RESTORE (%d).", oldstate); + // execute notifications + // and cleanup + health_alarm_log_process(host); - if(unlikely(netdata_exit)) - break; + if(unlikely(netdata_exit)) + break; + + } /* host loop */ - // execute notifications - // and cleanup - health_alarm_log_process(&localhost); + if(unlikely(pthread_setcancelstate(oldstate, NULL) != 0)) + error("Cannot set pthread cancel state to RESTORE (%d).", oldstate); if(unlikely(netdata_exit)) break; - + now = now_realtime_sec(); if(now < next_run) { - debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", - loop, (int) (next_run - now)); + debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now)); sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now)); } - else { + else debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop); - } } buffer_free(wb); info("HEALTH thread exiting"); + + static_thread->enabled = 0; pthread_exit(NULL); return NULL; }