3 #define RRDVAR_MAX_LENGTH 1024
// Settings and runtime state shared by the health alarm log code in this file.
5 struct health_options {
6 const char *health_default_exec;
7 const char *health_default_recipient;
// path of the alarm log file (opened by health_alarm_log_open())
8 const char *log_filename;
// entries counted since the counter was last reset; health_log_rotate()
// compares it against the rotation threshold
9 size_t log_entries_written;
// the single file-local instance, with its built-in defaults
13 static struct health_options health = {
14 .health_default_exec = PLUGINS_DIR "/alarm-notify.sh",
15 .health_default_recipient = "root",
16 .log_filename = VARLIB_DIR "/health/alarm_log.db",
17 .log_entries_written = 0,
21 int health_enabled = 1;
23 // ----------------------------------------------------------------------------
24 // health alarm log load/save
25 // no need for locking - only one thread is reading / writing the alarms log
27 static inline int health_alarm_log_open(void) {
29 fclose(health.log_fp);
31 health.log_fp = fopen(health.log_filename, "a");
34 if (setvbuf(health.log_fp, NULL, _IOLBF, 0) != 0)
35 error("Health: cannot set line buffering on health log file.");
39 error("Health: cannot open health log file '%s'. Health data will be lost in case of netdata or server crash.", health.log_filename);
43 static inline void health_alarm_log_close(void) {
45 fclose(health.log_fp);
// Rotate the alarm log when more entries than the configured threshold have
// been written: the current log becomes "<log>.old" and a fresh log is started.
50 static inline void health_log_rotate(void) {
// threshold cached across calls; 0 means "not loaded yet"
51 static size_t rotate_every = 0;
53 if(unlikely(rotate_every == 0)) {
// looked up once via config_get_number(), with 2000 passed as its last argument
54 rotate_every = (size_t)config_get_number("health", "rotate log every lines", 2000);
// never rotate more often than every 100 entries
55 if(rotate_every < 100) rotate_every = 100;
58 if(unlikely(health.log_entries_written > rotate_every)) {
59 health_alarm_log_close();
61 char old_filename[FILENAME_MAX + 1];
62 snprintfz(old_filename, FILENAME_MAX, "%s.old", health.log_filename);
// drop the previous ".old" file; a missing file (ENOENT) is not an error
64 if(unlink(old_filename) == -1 && errno != ENOENT)
65 error("Health: cannot remove old alarms log file '%s'", old_filename);
// give the current log the ".old" name with a hard link ...
67 if(link(health.log_filename, old_filename) == -1 && errno != ENOENT)
68 error("Health: cannot move file '%s' to '%s'.", health.log_filename, old_filename);
// ... and then remove its original name
70 if(unlink(health.log_filename) == -1 && errno != ENOENT)
71 error("Health: cannot remove old alarms log file '%s'", health.log_filename);
73 // open it with truncate
74 health.log_fp = fopen(health.log_filename, "w");
77 fclose(health.log_fp);
79 error("Health: cannot truncate health log '%s'", health.log_filename);
// restart the counter and reopen the log in append mode
83 health.log_entries_written = 0;
84 health_alarm_log_open();
// Append one alarm entry to the alarm log as a single tab-separated line.
// Nothing is written when the log stream is not open.
88 static inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
91 if(likely(health.log_fp)) {
92 if(unlikely(fprintf(health.log_fp
// numeric fields are written as 8-digit hexadecimal
94 "\t%08x\t%08x\t%08x\t%08x\t%08x"
97 "\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s"
// record type: 'U' (update) when the entry was saved before, 'A' (addition) otherwise
101 , (ae->flags & HEALTH_ENTRY_FLAG_SAVED)?'U':'A'
111 , (uint32_t)ae->duration
112 , (uint32_t)ae->non_clear_duration
113 , (uint32_t)ae->flags
114 , (uint32_t)ae->exec_run_timestamp
115 , (uint32_t)ae->delay_up_to_timestamp
// NULL strings are written as empty fields
117 , (ae->name)?ae->name:""
118 , (ae->chart)?ae->chart:""
119 , (ae->family)?ae->family:""
120 , (ae->exec)?ae->exec:""
121 , (ae->recipient)?ae->recipient:""
122 , (ae->source)?ae->source:""
123 , (ae->units)?ae->units:""
124 , (ae->info)?ae->info:""
131 , (long double)ae->new_value
132 , (long double)ae->old_value
134 error("Health: failed to save alarm log entry. Health data may be lost in case of abnormal restart.");
// remember that the entry is on disk (so the next save is a 'U' record) and count it for rotation
// NOTE(review): presumably reached only when fprintf() succeeded - confirm the branch structure
136 ae->flags |= HEALTH_ENTRY_FLAG_SAVED;
137 health.log_entries_written++;
// Load alarm log entries from an already opened log file into host->health_log.
//
// Each line is split on tabs. Lines starting with 'A' add an entry, lines
// starting with 'U' update the entry with the same unique id. The highest
// unique id and alarm id seen are used to seed host->health_log.next_log_id
// and host->health_log.next_alarm_id.
//
// host     - the host whose alarm log list receives the entries
// fp       - stream to read from
// filename - used only in log messages
142 static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filename) {
// static: the maxima persist across calls
143 static uint32_t max_unique_id = 0, max_alarm_id = 0;
144 ssize_t loaded = -1, updated = -1, errored = -1, duplicate = -1;
// line buffer: lines are read in chunks of at most 65536 bytes
148 char *s, *buf = mallocz(65536 + 1);
149 size_t line = 0, len = 0;
150 loaded = updated = errored = duplicate = 0;
// NOTE(review): entries are linked into host->health_log.alarms further down
// while only the read lock is held - confirm whether a write lock is needed
152 pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
154 while((s = fgets_trim_len(buf, 65536, fp, &len))) {
// every line read counts towards the rotation threshold
155 health.log_entries_written++;
// split the line in place: pointers[i] is the start of the i-th tab-separated field
158 int max_entries = 30, entries = 0;
159 char *pointers[max_entries];
161 pointers[entries++] = s++;
163 if(unlikely(*s == '\t')) {
165 pointers[entries++] = ++s;
166 if(entries >= max_entries) {
167 error("Health: line %zu of file '%s' has more than %d entries. Ignoring excessive entries.", line, filename, max_entries);
// only 'U' (update) and 'A' (addition) records are understood
174 if(likely(*pointers[0] == 'U' || *pointers[0] == 'A')) {
175 ALARM_ENTRY *ae = NULL;
// NOTE(review): pointers[26] is read below, which needs 27 fields, while this
// message speaks of 26 - confirm the minimum-entries check
178 error("Health: line %zu of file '%s' should have at least 26 entries, but it has %d. Ignoring line.", line, filename, entries);
183 // check that we have valid ids
184 uint32_t unique_id = (uint32_t)strtoul(pointers[2], NULL, 16);
186 error("Health: line %zu of file '%s' states alarm entry with unique id %u (%s). Ignoring line.", line, filename, unique_id, pointers[2]);
191 uint32_t alarm_id = (uint32_t)strtoul(pointers[3], NULL, 16);
193 error("Health: line %zu of file '%s' states alarm entry for alarm id %u (%s). Ignoring line.", line, filename, alarm_id, pointers[3]);
198 // find a possible overwrite
199 for(ae = host->health_log.alarms; ae; ae = ae->next) {
200 if(unlikely(ae->unique_id == unique_id)) {
// an 'A' record for an id that is already loaded is a duplicate
201 if(unlikely(*pointers[0] == 'A')) {
202 error("Health: line %zu of file '%s' adds duplicate alarm log entry with unique id %u."
203 , line, filename, unique_id);
211 // if not found, create a new one
214 // if it is an update, but we haven't found it, make it an addition
215 if(unlikely(*pointers[0] == 'U')) {
217 error("Health: line %zu of file '%s' updates alarm log entry with unique id %u, but it is not found.", line, filename, unique_id);
220 // alarms should be added in the right order
221 if(unlikely(unique_id < max_unique_id)) {
222 error("Health: line %zu of file '%s' has alarm log entry with %u in wrong order.", line, filename, unique_id);
225 ae = callocz(1, sizeof(ALARM_ENTRY));
228 // check for a possible host mismatch
229 if(strcmp(pointers[1], host->hostname))
230 error("Health: line %zu of file '%s' provides an alarm for host '%s' but this is named '%s'.", line, filename, pointers[1], host->hostname);
// numeric fields are stored in the file as hexadecimal
232 ae->unique_id = unique_id;
233 ae->alarm_id = alarm_id;
234 ae->alarm_event_id = (uint32_t)strtoul(pointers[4], NULL, 16);
235 ae->updated_by_id = (uint32_t)strtoul(pointers[5], NULL, 16);
236 ae->updates_id = (uint32_t)strtoul(pointers[6], NULL, 16);
238 ae->when = (uint32_t)strtoul(pointers[7], NULL, 16);
239 ae->duration = (uint32_t)strtoul(pointers[8], NULL, 16);
240 ae->non_clear_duration = (uint32_t)strtoul(pointers[9], NULL, 16);
// anything loaded from disk is, by definition, already saved
242 ae->flags = (uint32_t)strtoul(pointers[10], NULL, 16);
243 ae->flags |= HEALTH_ENTRY_FLAG_SAVED;
245 ae->exec_run_timestamp = (uint32_t)strtoul(pointers[11], NULL, 16);
246 ae->delay_up_to_timestamp = (uint32_t)strtoul(pointers[12], NULL, 16);
// string fields: an existing value (when updating an entry) is released before it is replaced
248 if(unlikely(ae->name)) freez(ae->name);
249 ae->name = strdupz(pointers[13]);
250 ae->hash_name = simple_hash(ae->name);
252 if(unlikely(ae->chart)) freez(ae->chart);
253 ae->chart = strdupz(pointers[14]);
254 ae->hash_chart = simple_hash(ae->chart);
256 if(unlikely(ae->family)) freez(ae->family);
257 ae->family = strdupz(pointers[15]);
// the optional strings below are kept as NULL when the field is empty
259 if(unlikely(ae->exec)) freez(ae->exec);
260 ae->exec = strdupz(pointers[16]);
261 if(!*ae->exec) { freez(ae->exec); ae->exec = NULL; }
263 if(unlikely(ae->recipient)) freez(ae->recipient);
264 ae->recipient = strdupz(pointers[17]);
265 if(!*ae->recipient) { freez(ae->recipient); ae->recipient = NULL; }
267 if(unlikely(ae->source)) freez(ae->source);
268 ae->source = strdupz(pointers[18]);
269 if(!*ae->source) { freez(ae->source); ae->source = NULL; }
271 if(unlikely(ae->units)) freez(ae->units);
272 ae->units = strdupz(pointers[19]);
273 if(!*ae->units) { freez(ae->units); ae->units = NULL; }
275 if(unlikely(ae->info)) freez(ae->info);
276 ae->info = strdupz(pointers[20]);
277 if(!*ae->info) { freez(ae->info); ae->info = NULL; }
// these fields are parsed as decimal
279 ae->exec_code = atoi(pointers[21]);
280 ae->new_status = atoi(pointers[22]);
281 ae->old_status = atoi(pointers[23]);
282 ae->delay = atoi(pointers[24]);
284 ae->new_value = strtold(pointers[25], NULL);
285 ae->old_value = strtold(pointers[26], NULL);
287 // add it to host if not already there
288 if(unlikely(*pointers[0] == 'A')) {
// new entries are pushed at the head of the list
289 ae->next = host->health_log.alarms;
290 host->health_log.alarms = ae;
// track the highest ids seen so far
295 if(unlikely(ae->unique_id > max_unique_id))
296 max_unique_id = ae->unique_id;
298 if(unlikely(ae->alarm_id >= max_alarm_id))
299 max_alarm_id = ae->alarm_id;
302 error("Health: line %zu of file '%s' is invalid (unrecognized entry type '%s').", line, filename, pointers[0]);
307 pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
// when no id was seen, seed the counters from the current time
311 if(!max_unique_id) max_unique_id = (uint32_t)time(NULL);
312 if(!max_alarm_id) max_alarm_id = (uint32_t)time(NULL);
314 host->health_log.next_log_id = max_unique_id + 1;
315 host->health_log.next_alarm_id = max_alarm_id + 1;
317 info("Health: loaded file '%s' with %zd new alarm entries, updated %zd alarms, errors %zd entries, duplicate %zd", filename, loaded, updated, errored, duplicate);
321 static inline void health_alarm_log_load(RRDHOST *host) {
322 health_alarm_log_close();
324 char buffer[FILENAME_MAX + 1];
325 snprintfz(buffer, FILENAME_MAX, "%s.old", health.log_filename);
326 FILE *fp = fopen(buffer, "r");
328 error("Health: cannot open health file: %s", health.log_filename);
330 health_alarm_log_read(host, fp, health.log_filename);
334 health.log_entries_written = 0;
335 fp = fopen(health.log_filename, "r");
337 error("Health: cannot open health file: %s", health.log_filename);
339 health_alarm_log_read(host, fp, health.log_filename);
343 health_alarm_log_open();
347 // ----------------------------------------------------------------------------
348 // health alarm log management
// Create a new alarm log entry, link it to the host's alarm log, connect it
// with the previous entry of the same alarm and write both to the log file.
// The strings passed in are duplicated, so the caller keeps ownership of its own.
350 static inline void health_alarm_log(RRDHOST *host,
351 uint32_t alarm_id, uint32_t alarm_event_id,
353 const char *name, const char *chart, const char *family,
354 const char *exec, const char *recipient, time_t duration,
355 calculated_number old_value, calculated_number new_value,
356 int old_status, int new_status,
362 debug(D_HEALTH, "Health adding alarm log entry with id: %u", host->health_log.next_log_id);
364 ALARM_ENTRY *ae = callocz(1, sizeof(ALARM_ENTRY));
365 ae->name = strdupz(name);
366 ae->hash_name = simple_hash(ae->name);
369 ae->chart = strdupz(chart);
370 ae->hash_chart = simple_hash(ae->chart);
374 ae->family = strdupz(family);
// optional strings stay NULL when not given
376 if(exec) ae->exec = strdupz(exec);
377 if(recipient) ae->recipient = strdupz(recipient);
378 if(source) ae->source = strdupz(source);
379 if(units) ae->units = strdupz(units);
380 if(info) ae->info = strdupz(info);
// every entry takes the next unique id of the host's log
382 ae->unique_id = host->health_log.next_log_id++;
383 ae->alarm_id = alarm_id;
384 ae->alarm_event_id = alarm_event_id;
386 ae->old_value = old_value;
387 ae->new_value = new_value;
388 ae->old_status = old_status;
389 ae->new_status = new_status;
390 ae->duration = duration;
392 ae->delay_up_to_timestamp = when + delay;
// time spent in WARNING or CRITICAL counts as "non clear" time
394 if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL)
395 ae->non_clear_duration += ae->duration;
// push the entry at the head of the host's alarm log
398 pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
399 ae->next = host->health_log.alarms;
400 host->health_log.alarms = ae;
401 host->health_log.count++;
402 pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
404 // match previous alarms
// NOTE(review): the matched entry is modified below while only the read lock is held - confirm
405 pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
407 for(t = host->health_log.alarms ; t ; t = t->next) {
408 if(t != ae && t->alarm_id == ae->alarm_id) {
// cross-link the two entries, unless the older one was already superseded
409 if(!(t->flags & HEALTH_ENTRY_FLAG_UPDATED) && !t->updated_by_id) {
410 t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
411 t->updated_by_id = ae->unique_id;
412 ae->updates_id = t->unique_id;
// carry over the accumulated non-clear time when the older entry was a
// transition between WARNING/CRITICAL states
414 if((t->new_status == RRDCALC_STATUS_WARNING || t->new_status == RRDCALC_STATUS_CRITICAL) &&
415 (t->old_status == RRDCALC_STATUS_WARNING || t->old_status == RRDCALC_STATUS_CRITICAL))
416 ae->non_clear_duration += t->non_clear_duration;
// persist the change made to the older entry
418 health_alarm_log_save(host, t);
421 // no need to continue
425 pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
// finally persist the new entry
427 health_alarm_log_save(host, ae);
430 // ----------------------------------------------------------------------------
// Sanitize a variable name in place.
// Every character that is not alphanumeric, '.' or '_' is replaced with '_'.
// Returns the number of characters replaced (0 when the name was already valid).
static inline int rrdvar_fix_name(char *variable) {
    int fixed = 0;

    for(; *variable; variable++) {
        // <ctype.h> functions need a value representable as unsigned char:
        // passing a plain (possibly negative) char is undefined behaviour
        unsigned char c = (unsigned char)*variable;

        if(!isalnum(c) && c != '.' && c != '_') {
            *variable = '_';
            fixed++;
        }
    }

    return fixed;
}
447 int rrdvar_compare(void* a, void* b) {
448 if(((RRDVAR *)a)->hash < ((RRDVAR *)b)->hash) return -1;
449 else if(((RRDVAR *)a)->hash > ((RRDVAR *)b)->hash) return 1;
450 else return strcmp(((RRDVAR *)a)->name, ((RRDVAR *)b)->name);
453 static inline RRDVAR *rrdvar_index_add(avl_tree_lock *tree, RRDVAR *rv) {
454 RRDVAR *ret = (RRDVAR *)avl_insert_lock(tree, (avl *)(rv));
456 debug(D_VARIABLES, "Request to insert RRDVAR '%s' into index failed. Already exists.", rv->name);
461 static inline RRDVAR *rrdvar_index_del(avl_tree_lock *tree, RRDVAR *rv) {
462 RRDVAR *ret = (RRDVAR *)avl_remove_lock(tree, (avl *)(rv));
464 error("Request to remove RRDVAR '%s' from index failed. Not Found.", rv->name);
469 static inline RRDVAR *rrdvar_index_find(avl_tree_lock *tree, const char *name, uint32_t hash) {
471 tmp.name = (char *)name;
472 tmp.hash = (hash)?hash:simple_hash(tmp.name);
474 return (RRDVAR *)avl_search_lock(tree, (avl *)&tmp);
// Release a variable, removing it from the given index.
// NOTE(review): callers in this file pass rv values that may be NULL and, in
// one case, a NULL tree - confirm both are guarded before the call below.
477 static inline void rrdvar_free(RRDHOST *host, avl_tree_lock *tree, RRDVAR *rv) {
// unlink the variable from the index it was registered in
483 rrdvar_index_del(tree, rv);
// Create a variable named 'name' (after sanitizing the name) and register it
// in 'tree'. 'scope' is used only in debug messages.
489 static inline RRDVAR *rrdvar_create_and_index(const char *scope, avl_tree_lock *tree, const char *name, int type, calculated_number *value) {
// work on a private copy: rrdvar_fix_name() rewrites the string in place
490 char *variable = strdupz(name);
491 rrdvar_fix_name(variable);
492 uint32_t hash = simple_hash(variable);
// check whether the sanitized name is already indexed
494 RRDVAR *rv = rrdvar_index_find(tree, variable, hash);
496 debug(D_VARIABLES, "Variable '%s' not found in scope '%s'. Creating a new one.", variable, scope);
498 rv = callocz(1, sizeof(RRDVAR));
504 RRDVAR *ret = rrdvar_index_add(tree, rv);
// the index did not accept our entry
505 if(unlikely(ret != rv)) {
506 debug(D_VARIABLES, "Variable '%s' in scope '%s' already exists", variable, scope);
// NULL tree: the entry was never indexed, so it must only be released
507 rrdvar_free(NULL, NULL, rv);
511 debug(D_VARIABLES, "Variable '%s' created in scope '%s'", variable, scope);
522 // ----------------------------------------------------------------------------
// Read the current value of a variable as a calculated_number.
// rv->value is an untyped pointer; rv->type selects the type it is read as.
525 calculated_number rrdvar2number(RRDVAR *rv) {
527 case RRDVAR_TYPE_CALCULATED: {
528 calculated_number *n = (calculated_number *)rv->value;
532 case RRDVAR_TYPE_TIME_T: {
533 time_t *n = (time_t *)rv->value;
537 case RRDVAR_TYPE_COLLECTED: {
538 collected_number *n = (collected_number *)rv->value;
542 case RRDVAR_TYPE_TOTAL: {
543 total_number *n = (total_number *)rv->value;
547 case RRDVAR_TYPE_INT: {
548 int *n = (int *)rv->value;
// unknown types are reported
553 error("I don't know how to convert RRDVAR type %d to calculated_number", rv->type);
558 void dump_variable(void *data) {
559 RRDVAR *rv = (RRDVAR *)data;
560 debug(D_HEALTH, "%50s : %20.5Lf", rv->name, rrdvar2number(rv));
// Resolve a variable name for an alarm and store its value in *result.
// The lookup goes from the most specific scope to the most generic one:
// the chart of the alarm, then the family of the chart, then the host.
563 int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, calculated_number *result) {
564 RRDSET *st = rc->rrdset;
// 1. chart (local) variables
569 rv = rrdvar_index_find(&st->variables_root_index, variable, hash);
571 *result = rrdvar2number(rv);
// 2. family variables
575 rv = rrdvar_index_find(&st->rrdfamily->variables_root_index, variable, hash);
577 *result = rrdvar2number(rv);
// 3. host variables
581 rv = rrdvar_index_find(&st->rrdhost->variables_root_index, variable, hash);
583 *result = rrdvar2number(rv);
// debugging aid: dump every variable of all three scopes
587 debug(D_HEALTH, "Available local chart '%s' variables:", st->id);
588 avl_traverse_lock(&st->variables_root_index, dump_variable);
590 debug(D_HEALTH, "Available family '%s' variables:", st->rrdfamily->family);
591 avl_traverse_lock(&st->rrdfamily->variables_root_index, dump_variable);
593 debug(D_HEALTH, "Available host '%s' variables:", st->rrdhost->hostname);
594 avl_traverse_lock(&st->rrdhost->variables_root_index, dump_variable);
599 // ----------------------------------------------------------------------------
600 // RRDSETVAR management
// Create a chart-level variable and register it in every scope it is visible in:
//   - chart (local) scope, as "<variable>"
//   - family and host scopes, as "<chart id>.<variable>" and "<chart name>.<variable>"
602 RRDSETVAR *rrdsetvar_create(RRDSET *st, const char *variable, int type, void *value, uint32_t options) {
603 debug(D_VARIABLES, "RRDVARSET create for chart id '%s' name '%s' with variable name '%s'", st->id, st->name, variable);
604 RRDSETVAR *rs = (RRDSETVAR *)callocz(1, sizeof(RRDSETVAR));
// build the two fully qualified names
606 char buffer[RRDVAR_MAX_LENGTH + 1];
607 snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->id, variable);
608 rs->fullid = strdupz(buffer);
610 snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, variable);
611 rs->fullname = strdupz(buffer);
613 rs->variable = strdupz(variable);
617 rs->options = options;
// register the variable under each name / scope combination
620 rs->local = rrdvar_create_and_index("local", &st->variables_root_index, rs->variable, rs->type, rs->value);
621 rs->family = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullid, rs->type, rs->value);
622 rs->host = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullid, rs->type, rs->value);
623 rs->family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullname, rs->type, rs->value);
624 rs->host_name = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullname, rs->type, rs->value);
// prepend to the chart's list of variables
626 rs->next = st->variables;
632 void rrdsetvar_rename_all(RRDSET *st) {
633 debug(D_VARIABLES, "RRDSETVAR rename for chart id '%s' name '%s'", st->id, st->name);
635 // only these 2 can change name
639 char buffer[RRDVAR_MAX_LENGTH + 1];
640 RRDSETVAR *rs, *next = st->variables;
644 snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rs->variable);
646 if (strcmp(buffer, rs->fullname)) {
648 rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
649 rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_name);
652 rs->fullname = strdupz(st->name);
653 rs->family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullname, rs->type, rs->value);
654 rs->host_name = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullname, rs->type, rs->value);
658 rrdsetcalc_link_matching(st);
// Release a chart-level variable: drop all its registered variables and
// unlink it from the chart's list of variables.
661 void rrdsetvar_free(RRDSETVAR *rs) {
662 RRDSET *st = rs->rrdset;
663 debug(D_VARIABLES, "RRDSETVAR free for chart id '%s' name '%s', variable '%s'", st->id, st->name, rs->variable);
// one call per scope / name combination registered by rrdsetvar_create()
665 rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local);
666 rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family);
667 rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host);
668 rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
669 rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_name);
// unlink from the singly linked list: either it is the head ...
671 if(st->variables == rs) {
672 st->variables = rs->next;
// ... or its predecessor has to be found first
676 for (t = st->variables; t && t->next != rs; t = t->next);
677 if(!t) error("RRDSETVAR '%s' not found in chart '%s' variables linked list", rs->fullname, st->id);
678 else t->next = rs->next;
687 // ----------------------------------------------------------------------------
688 // RRDDIMVAR management
690 #define RRDDIMVAR_ID_MAX 1024
// Create a dimension-level variable and register it in every scope it is
// visible in. The variable is named "<prefix><dimension><suffix>", where
// <dimension> is both the dimension id and the dimension name; at host level
// it is additionally qualified with the chart id and the chart name.
// A NULL prefix or suffix is treated as an empty string.
692 RRDDIMVAR *rrddimvar_create(RRDDIM *rd, int type, const char *prefix, const char *suffix, void *value, uint32_t options) {
693 RRDSET *st = rd->rrdset;
695 debug(D_VARIABLES, "RRDDIMSET create for chart id '%s' name '%s', dimension id '%s', name '%s%s%s'", st->id, st->name, rd->id, (prefix)?prefix:"", rd->name, (suffix)?suffix:"");
697 if(!prefix) prefix = "";
698 if(!suffix) suffix = "";
700 char buffer[RRDDIMVAR_ID_MAX + 1];
701 RRDDIMVAR *rs = (RRDDIMVAR *)callocz(1, sizeof(RRDDIMVAR));
703 rs->prefix = strdupz(prefix);
704 rs->suffix = strdupz(suffix);
// the two short names: based on the dimension id and on the dimension name
706 snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->id, rs->suffix);
707 rs->id = strdupz(buffer);
709 snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
710 rs->name = strdupz(buffer);
// the four fully qualified names: {chart id, chart name} x {short id, short name}
712 snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->id, rs->id);
713 rs->fullidid = strdupz(buffer);
715 snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->id, rs->name);
716 rs->fullidname = strdupz(buffer);
718 snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->name, rs->id);
719 rs->fullnameid = strdupz(buffer);
721 snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->name, rs->name);
722 rs->fullnamename = strdupz(buffer);
726 rs->options = options;
// chart (local) scope: short names
729 rs->local_id = rrdvar_create_and_index("local", &st->variables_root_index, rs->id, rs->type, rs->value);
730 rs->local_name = rrdvar_create_and_index("local", &st->variables_root_index, rs->name, rs->type, rs->value);
// family scope: short names
732 rs->family_id = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->id, rs->type, rs->value);
733 rs->family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->name, rs->type, rs->value);
// host scope: fully qualified names
735 rs->host_fullidid = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullidid, rs->type, rs->value);
736 rs->host_fullidname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullidname, rs->type, rs->value);
737 rs->host_fullnameid = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullnameid, rs->type, rs->value);
738 rs->host_fullnamename = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullnamename, rs->type, rs->value);
// prepend to the dimension's list of variables
740 rs->next = rd->variables;
// Re-register the name-based variables of a dimension: for every variable of
// the dimension whose stored name differs from rd->name, the variables that
// contain the dimension name or the chart name are released and re-created.
// NOTE(review): rs->name is built as "<prefix><name><suffix>" but compared with
// the bare rd->name below - confirm the comparison for non-empty prefix/suffix.
746 void rrddimvar_rename_all(RRDDIM *rd) {
747 RRDSET *st = rd->rrdset;
748 debug(D_VARIABLES, "RRDDIMSET rename for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
750 RRDDIMVAR *rs, *next = rd->variables;
754 if (strcmp(rd->name, rs->name)) {
755 char buffer[RRDDIMVAR_ID_MAX + 1];
// chart (local) scope, short name
// NOTE(review): rs->name is reassigned below; confirm the previous string is
// released first, as is done with freez() for the full* strings
759 rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_name);
761 snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
762 rs->name = strdupz(buffer);
763 rs->local_name = rrdvar_create_and_index("local", &st->variables_root_index, rs->name, rs->type, rs->value);
// NOTE(review): rs->family_name is also indexed from rs->name by
// rrddimvar_create(), but it is not refreshed here - confirm
// host scope: "<chart id>.<short name>"
765 rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidname);
766 freez(rs->fullidname);
767 snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->name);
768 rs->fullidname = strdupz(buffer);
769 rs->host_fullidname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index,
770 rs->fullidname, rs->type, rs->value);
// host scope: "<chart name>.<short id>"
773 rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnameid);
774 freez(rs->fullnameid);
775 snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->id);
776 rs->fullnameid = strdupz(buffer);
777 rs->host_fullnameid = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index,
778 rs->fullnameid, rs->type, rs->value);
// host scope: "<chart name>.<short name>"
781 rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnamename);
782 freez(rs->fullnamename);
783 snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->name);
784 rs->fullnamename = strdupz(buffer);
785 rs->host_fullnamename = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index,
786 rs->fullnamename, rs->type, rs->value);
// Release a dimension-level variable: drop all its registered variables,
// unlink it from the dimension's list of variables and free its strings.
791 void rrddimvar_free(RRDDIMVAR *rs) {
792 RRDDIM *rd = rs->rrddim;
793 RRDSET *st = rd->rrdset;
794 debug(D_VARIABLES, "RRDDIMSET free for chart id '%s' name '%s', dimension id '%s', name '%s', prefix='%s', suffix='%s'", st->id, st->name, rd->id, rd->name, rs->prefix, rs->suffix);
// chart (local) scope
796 rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_id);
797 rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_name);
// family scope
799 rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_id);
800 rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
// host scope
802 rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidid);
803 rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidname);
804 rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnameid);
805 rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnamename);
// unlink from the singly linked list: either it is the head ...
807 if(rd->variables == rs) {
808 debug(D_VARIABLES, "RRDDIMSET removing first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
809 rd->variables = rs->next;
// ... or its predecessor has to be found first
812 debug(D_VARIABLES, "RRDDIMSET removing non-first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
814 for (t = rd->variables; t && t->next != rs; t = t->next) ;
815 if(!t) error("RRDDIMVAR '%s' not found in dimension '%s/%s' variables linked list", rs->name, st->id, rd->id);
816 else t->next = rs->next;
// release the strings owned by the variable
824 freez(rs->fullidname);
825 freez(rs->fullnameid);
826 freez(rs->fullnamename);
830 // ----------------------------------------------------------------------------
831 // RRDCALC management
// Map an RRDCALC_STATUS_* value to a constant string.
// A status that is not one of the cases below is reported with error().
833 static inline const char *rrdcalc_status2string(int status) {
835 case RRDCALC_STATUS_REMOVED:
838 case RRDCALC_STATUS_UNDEFINED:
841 case RRDCALC_STATUS_UNINITIALIZED:
842 return "UNINITIALIZED";
844 case RRDCALC_STATUS_CLEAR:
847 case RRDCALC_STATUS_RAISED:
850 case RRDCALC_STATUS_WARNING:
853 case RRDCALC_STATUS_CRITICAL:
857 error("Unknown alarm status %d", status);
// Attach an alarm to a chart: put it on the chart's list of alarms, align its
// settings with the chart, expose its value as a variable in the chart,
// family and host scopes, and log the event in the alarm log.
862 static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
863 debug(D_HEALTH, "Health linking alarm '%s.%s' to chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, st->rrdhost->hostname);
865 rc->last_status_change = time(NULL);
// insert at the head of the chart's doubly linked list of alarms
868 rc->rrdset_next = st->alarms;
869 rc->rrdset_prev = NULL;
872 rc->rrdset_next->rrdset_prev = rc;
// an alarm cannot run more often than its chart is updated
876 if(rc->update_every < rc->rrdset->update_every) {
877 error("Health alarm '%s.%s' has update every %d, less than chart update every %d. Setting alarm update frequency to %d.", rc->rrdset->id, rc->name, rc->update_every, rc->rrdset->update_every, rc->rrdset->update_every);
878 rc->update_every = rc->rrdset->update_every;
// the chart takes the alarm's green threshold, but only if it has none of its own
881 if(!isnan(rc->green) && isnan(st->green)) {
882 debug(D_HEALTH, "Health alarm '%s.%s' green threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->green, rc->green);
883 st->green = rc->green;
// same condition for the red threshold
886 if(!isnan(rc->red) && isnan(st->red)) {
887 debug(D_HEALTH, "Health alarm '%s.%s' red threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->red, rc->red);
// expose the alarm value as "<alarm name>" in the chart and family scopes ...
891 rc->local = rrdvar_create_and_index("local", &st->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
892 rc->family = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
// ... and as "<chart id>.<alarm name>" and "<chart name>.<alarm name>" in the host scope
894 char fullname[RRDVAR_MAX_LENGTH + 1];
895 snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->id, rc->name);
896 rc->hostid = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
898 snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rc->name);
899 rc->hostname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
// an alarm without units inherits the units of its chart
901 if(!rc->units) rc->units = strdupz(st->units);
// log the transition from the current status to UNINITIALIZED
904 time_t now = time(NULL);
905 health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_UNINITIALIZED, rc->source, rc->units, rc->info, 0);
909 static inline int rrdcalc_is_matching_this_rrdset(RRDCALC *rc, RRDSET *st) {
910 if( (rc->hash_chart == st->hash && !strcmp(rc->chart, st->id)) ||
911 (rc->hash_chart == st->hash_name && !strcmp(rc->chart, st->name)))
917 // this has to be called while the RRDHOST is locked
918 inline void rrdsetcalc_link_matching(RRDSET *st) {
919 // debug(D_HEALTH, "find matching alarms for chart '%s'", st->id);
922 for(rc = st->rrdhost->alarms; rc ; rc = rc->next) {
923 if(unlikely(rc->rrdset))
926 if(unlikely(rrdcalc_is_matching_this_rrdset(rc, st)))
927 rrdsetcalc_link(st, rc);
931 // this has to be called while the RRDHOST is locked
// Detach an alarm from its chart: log the removal in the alarm log, take the
// alarm off the chart's list of alarms and release the variables that exposed
// its value.
932 inline void rrdsetcalc_unlink(RRDCALC *rc) {
933 RRDSET *st = rc->rrdset;
// an alarm that is not linked to a chart is reported
936 debug(D_HEALTH, "Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
937 error("Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
// log the transition from the current status to REMOVED
942 time_t now = time(NULL);
943 health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0);
946 RRDHOST *host = st->rrdhost;
948 debug(D_HEALTH, "Health unlinking alarm '%s.%s' from chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, host->hostname);
// take the alarm out of the chart's doubly linked list
952 rc->rrdset_prev->rrdset_next = rc->rrdset_next;
955 rc->rrdset_next->rrdset_prev = rc->rrdset_prev;
958 st->alarms = rc->rrdset_next;
960 rc->rrdset_prev = rc->rrdset_next = NULL;
// release the variables created by rrdsetcalc_link()
962 rrdvar_free(st->rrdhost, &st->variables_root_index, rc->local);
965 rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rc->family);
968 rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostid);
971 rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostname);
976 // RRDCALC will remain in RRDHOST
977 // so that if the matching chart is found in the future
978 // it will be applied automatically
981 RRDCALC *rrdcalc_find(RRDSET *st, const char *name) {
983 uint32_t hash = simple_hash(name);
985 for( rc = st->alarms; rc ; rc = rc->rrdset_next ) {
986 if(unlikely(rc->hash == hash && !strcmp(rc->name, name)))
// Check whether the host already has an alarm with this chart and name.
// hash_chart and hash_name may be 0, in which case they are computed here.
993 static inline int rrdcalc_exists(RRDHOST *host, const char *chart, const char *name, uint32_t hash_chart, uint32_t hash_name) {
// a chart is required for the lookup
996 if(unlikely(!chart)) {
997 error("attempt to find RRDCALC '%s' without giving a chart name", name);
1001 if(unlikely(!hash_chart)) hash_chart = simple_hash(chart);
1002 if(unlikely(!hash_name)) hash_name = simple_hash(name);
1004 // make sure it does not already exist
1005 for(rc = host->alarms; rc ; rc = rc->next) {
// alarms without a chart are skipped; hashes are compared before the strings
1006 if (unlikely(rc->chart && rc->hash == hash_name && rc->hash_chart == hash_chart && !strcmp(name, rc->name) && !strcmp(chart, rc->chart))) {
1007 debug(D_HEALTH, "Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
1008 error("Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
// Get the alarm id for the alarm 'name' of 'chart'.
// When the alarm log has an entry for the same chart and name, its alarm id is
// reused and *next_event_id (if given) is set to continue after that entry's
// event id. Otherwise a new alarm id is allocated from the host's counter.
1016 static inline uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const char *name, uint32_t *next_event_id) {
1018 uint32_t hash_chart = simple_hash(chart);
1019 uint32_t hash_name = simple_hash(name);
1021 // re-use old IDs, by looking them up in the alarm log
// NOTE(review): ae->chart is passed to strcmp() without a NULL check - confirm
// alarm log entries always have a chart
1023 for(ae = host->health_log.alarms; ae ;ae = ae->next) {
1024 if(unlikely(ae->hash_name == hash_name && ae->hash_chart == hash_chart && !strcmp(name, ae->name) && !strcmp(chart, ae->chart))) {
1025 if(next_event_id) *next_event_id = ae->alarm_event_id + 1;
1026 return ae->alarm_id;
// not in the log: allocate a new alarm id
1031 return host->health_log.next_alarm_id++;
// Second step of alarm creation: wire the alarm's expressions to the alarm,
// append the alarm to the host's list and link it to its chart, if the chart
// already exists.
1034 static inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) {
1035 rrdhost_check_rdlock(host);
// give each expression pointers to the alarm's status, value and time window
1037 if(rc->calculation) {
1038 rc->calculation->status = &rc->status;
1039 rc->calculation->this = &rc->value;
1040 rc->calculation->after = &rc->db_after;
1041 rc->calculation->before = &rc->db_before;
1042 rc->calculation->rrdcalc = rc;
1046 rc->warning->status = &rc->status;
1047 rc->warning->this = &rc->value;
1048 rc->warning->after = &rc->db_after;
1049 rc->warning->before = &rc->db_before;
1050 rc->warning->rrdcalc = rc;
1054 rc->critical->status = &rc->status;
1055 rc->critical->this = &rc->value;
1056 rc->critical->after = &rc->db_after;
1057 rc->critical->before = &rc->db_before;
1058 rc->critical->rrdcalc = rc;
1061 // link it to the host
1062 if(likely(host->alarms)) {
// walk to the last alarm of the host's list
1065 for(t = host->alarms; t && t->next ; t = t->next) ;
1072 // link it to its chart
1074 for(st = host->rrdset_root; st ; st = st->next) {
1075 if(rrdcalc_is_matching_this_rrdset(rc, st)) {
1076 rrdsetcalc_link(st, rc);
// Create a new alarm (RRDCALC) for chart 'chart' of 'host' from the template 'rt'.
//
// The alarm is allocated zeroed, gets an id from rrdcalc_get_unique_id(), owns
// private copies (strdupz) of every string it takes from the template, and has
// its expressions re-parsed from the template's expression sources, so that
// the alarm and the template do not share parsed expressions.
// It is finally handed to rrdcalc_create_part2() to be linked to the host and
// to its chart.
//
// NOTE(review): the statement executed when rrdcalc_exists() reports a
// duplicate is not visible here; the caller in rrdcalctemplate_link_matching()
// logs a failure to create the alarm, so presumably NULL is returned - confirm.
1082 static inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart) {
1084 debug(D_HEALTH, "Health creating dynamic alarm (from template) '%s.%s'", chart, rt->name);
// hashes are passed as 0 here (this function has not computed them yet)
1086 if(rrdcalc_exists(host, chart, rt->name, 0, 0))
1089 RRDCALC *rc = callocz(1, sizeof(RRDCALC));
// default first event id; rrdcalc_get_unique_id() overwrites it when the
// alarm is found in the alarm log
1090 rc->next_event_id = 1;
1091 rc->id = rrdcalc_get_unique_id(host, chart, rt->name, &rc->next_event_id);
1092 rc->name = strdupz(rt->name);
1093 rc->hash = simple_hash(rc->name);
1094 rc->chart = strdupz(chart);
1095 rc->hash_chart = simple_hash(rc->chart);
1097 if(rt->dimensions) rc->dimensions = strdupz(rt->dimensions);
1099 rc->green = rt->green;
1102 rc->old_value = NAN;
// notification delay settings are copied from the template as they are
1104 rc->delay_up_duration = rt->delay_up_duration;
1105 rc->delay_down_duration = rt->delay_down_duration;
1106 rc->delay_max_duration = rt->delay_max_duration;
1107 rc->delay_multiplier = rt->delay_multiplier;
// database lookup parameters
1109 rc->group = rt->group;
1110 rc->after = rt->after;
1111 rc->before = rt->before;
1112 rc->update_every = rt->update_every;
1113 rc->options = rt->options;
// optional strings: copied only when the template has them
1115 if(rt->exec) rc->exec = strdupz(rt->exec);
1116 if(rt->recipient) rc->recipient = strdupz(rt->recipient);
1117 if(rt->source) rc->source = strdupz(rt->source);
1118 if(rt->units) rc->units = strdupz(rt->units);
1119 if(rt->info) rc->info = strdupz(rt->info);
// expressions are re-parsed from their source text; a parse failure is only
// logged, the alarm is still created without that expression
1121 if(rt->calculation) {
1122 rc->calculation = expression_parse(rt->calculation->source, NULL, NULL);
1123 if(!rc->calculation)
1124 error("Health alarm '%s.%s': failed to parse calculation expression '%s'", chart, rt->name, rt->calculation->source);
1127 rc->warning = expression_parse(rt->warning->source, NULL, NULL);
1129 error("Health alarm '%s.%s': failed to re-parse warning expression '%s'", chart, rt->name, rt->warning->source);
1132 rc->critical = expression_parse(rt->critical->source, NULL, NULL);
1134 error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", chart, rt->name, rt->critical->source);
1137 debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1138 (rc->chart)?rc->chart:"NOCHART",
1140 (rc->exec)?rc->exec:"DEFAULT",
1141 (rc->recipient)?rc->recipient:"DEFAULT",
1148 (rc->dimensions)?rc->dimensions:"NONE",
1150 (rc->calculation)?rc->calculation->parsed_as:"NONE",
1151 (rc->warning)?rc->warning->parsed_as:"NONE",
1152 (rc->critical)?rc->critical->parsed_as:"NONE",
1154 rc->delay_up_duration,
1155 rc->delay_down_duration,
1156 rc->delay_max_duration,
1157 rc->delay_multiplier
// link the new alarm to the host and to its chart
1160 rrdcalc_create_part2(host, rc);
// Unlink the alarm 'rc' from its chart and from the alarm list of 'host',
// then release its expressions and the strings it owns.
//
// An error is logged (and the alarm is still released) when the alarm cannot
// be found in host->alarms, or when the host has no alarms at all.
//
// The lines of this listing are kept as they are; the only code change is the
// wording of the last error message, which used to read "Cannot unlink unlink".
1164 void rrdcalc_free(RRDHOST *host, RRDCALC *rc) {
1167 debug(D_HEALTH, "Health removing alarm '%s.%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1169 // unlink it from RRDSET
1170 if(rc->rrdset) rrdsetcalc_unlink(rc);
1172 // unlink it from RRDHOST
1173 if(unlikely(rc == host->alarms))
1174 host->alarms = rc->next;
1176 else if(likely(host->alarms)) {
// walk the list keeping the previous node in 'last'; the loop stops either
// on rc (then last->next == rc) or at the end of the list
1177 RRDCALC *t, *last = host->alarms;
1178 for(t = last->next; t && t != rc; last = t, t = t->next) ;
1179 if(last->next == rc)
1180 last->next = rc->next;
1182 error("Cannot unlink alarm '%s.%s' from host '%s': not found", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1185 error("Cannot unlink alarm '%s.%s' from host '%s': This host does not have any calculations", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
// release everything the alarm owns
1187 expression_free(rc->calculation);
1188 expression_free(rc->warning);
1189 expression_free(rc->critical);
1194 freez(rc->dimensions);
1196 freez(rc->recipient);
1203 // ----------------------------------------------------------------------------
1204 // RRDCALCTEMPLATE management
1206 void rrdcalctemplate_link_matching(RRDSET *st) {
1207 RRDCALCTEMPLATE *rt;
1209 for(rt = st->rrdhost->templates; rt ; rt = rt->next) {
1210 if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context)) {
1211 RRDCALC *rc = rrdcalc_create(st->rrdhost, rt, st->id);
1213 error("Health tried to create alarm from template '%s', but it failed", rt->name);
1215 #ifdef NETDATA_INTERNAL_CHECKS
1216 else if(rc->rrdset != st)
1217 error("Health alarm '%s.%s' should be linked to chart '%s', but it is not", rc->chart?rc->chart:"NOCHART", rc->name, st->id);
1223 static inline void rrdcalctemplate_free(RRDHOST *host, RRDCALCTEMPLATE *rt) {
1224 debug(D_HEALTH, "Health removing template '%s' of host '%s'", rt->name, host->hostname);
1226 if(host->templates) {
1227 if(host->templates == rt) {
1228 host->templates = rt->next;
1231 RRDCALCTEMPLATE *t, *last = host->templates;
1232 for (t = last->next; t && t != rt; last = t, t = t->next ) ;
1233 if(last && last->next == rt) {
1234 last->next = rt->next;
1238 error("Cannot find RRDCALCTEMPLATE '%s' linked in host '%s'", rt->name, host->hostname);
1242 expression_free(rt->calculation);
1243 expression_free(rt->warning);
1244 expression_free(rt->critical);
1248 freez(rt->recipient);
1253 freez(rt->dimensions);
1257 // ----------------------------------------------------------------------------
1258 // load health configuration
// size of the line buffer of health_readfile(); a configuration line,
// including all its continuation lines, has to fit in it
1260 #define HEALTH_CONF_MAX_LINE 4096
// keywords of the health configuration files ("key: value" lines);
// health_readfile() compares them with strcasecmp(), so case does not matter.
// 'alarm' and 'template' start a new definition, the rest set its attributes.
1262 #define HEALTH_ALARM_KEY "alarm"
1263 #define HEALTH_TEMPLATE_KEY "template"
1264 #define HEALTH_ON_KEY "on"
1265 #define HEALTH_LOOKUP_KEY "lookup"
1266 #define HEALTH_CALC_KEY "calc"
1267 #define HEALTH_EVERY_KEY "every"
1268 #define HEALTH_GREEN_KEY "green"
1269 #define HEALTH_RED_KEY "red"
1270 #define HEALTH_WARN_KEY "warn"
1271 #define HEALTH_CRIT_KEY "crit"
1272 #define HEALTH_EXEC_KEY "exec"
1273 #define HEALTH_RECIPIENT_KEY "to"
1274 #define HEALTH_UNITS_KEY "units"
1275 #define HEALTH_INFO_KEY "info"
1276 #define HEALTH_DELAY_KEY "delay"
1278 static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
1280 error("Health configuration for alarm '%s' does not have a chart", rc->name);
1284 if(!rc->update_every) {
1285 error("Health configuration for alarm '%s.%s' has no frequency (parameter 'every'). Ignoring it.", rc->chart?rc->chart:"NOCHART", rc->name);
1289 if(!RRDCALC_HAS_DB_LOOKUP(rc) && !rc->warning && !rc->critical) {
1290 error("Health configuration for alarm '%s.%s' is useless (no calculation, no warning and no critical evaluation)", rc->chart?rc->chart:"NOCHART", rc->name);
1294 if (rrdcalc_exists(host, rc->chart, rc->name, rc->hash_chart, rc->hash))
1297 rc->id = rrdcalc_get_unique_id(&localhost, rc->chart, rc->name, &rc->next_event_id);
1299 debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1300 rc->chart?rc->chart:"NOCHART",
1303 (rc->exec)?rc->exec:"DEFAULT",
1304 (rc->recipient)?rc->recipient:"DEFAULT",
1311 (rc->dimensions)?rc->dimensions:"NONE",
1313 (rc->calculation)?rc->calculation->parsed_as:"NONE",
1314 (rc->warning)?rc->warning->parsed_as:"NONE",
1315 (rc->critical)?rc->critical->parsed_as:"NONE",
1317 rc->delay_up_duration,
1318 rc->delay_down_duration,
1319 rc->delay_max_duration,
1320 rc->delay_multiplier
1323 rrdcalc_create_part2(host, rc);
// Validate a template that was read from the configuration and add it to the
// templates of 'host'.
//
// A template is reported as invalid when it has no context, no update
// frequency, nothing to evaluate (no calculation, no warning and no critical
// expression), or when the host already has a template with the same name.
//
// NOTE(review): the return statements are not visible in this listing; the
// callers in health_readfile() free the template when this function returns 0,
// so rejected templates presumably return 0 - confirm.
1327 static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCALCTEMPLATE *rt) {
1328 if(unlikely(!rt->context)) {
1329 error("Health configuration for template '%s' does not have a context", rt->name);
1333 if(unlikely(!rt->update_every)) {
1334 error("Health configuration for template '%s' has no frequency (parameter 'every'). Ignoring it.", rt->name);
1338 if(unlikely(!RRDCALCTEMPLATE_HAS_CALCULATION(rt) && !rt->warning && !rt->critical)) {
1339 error("Health configuration for template '%s' is useless (no calculation, no warning and no critical evaluation)", rt->name);
// look for a template with the same name; 'last' follows one node behind 't',
// so after a full walk it is the last template of the list (NULL when empty)
1343 RRDCALCTEMPLATE *t, *last = NULL;
1344 for (t = host->templates; t ; last = t, t = t->next) {
1345 if(unlikely(t->hash_name == rt->hash_name && !strcmp(t->name, rt->name))) {
1346 error("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname);
1351 debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1353 (rt->context)?rt->context:"NONE",
1354 (rt->exec)?rt->exec:"DEFAULT",
1355 (rt->recipient)?rt->recipient:"DEFAULT",
1362 (rt->dimensions)?rt->dimensions:"NONE",
1364 (rt->calculation)?rt->calculation->parsed_as:"NONE",
1365 (rt->warning)?rt->warning->parsed_as:"NONE",
1366 (rt->critical)?rt->critical->parsed_as:"NONE",
1368 rt->delay_up_duration,
1369 rt->delay_down_duration,
1370 rt->delay_max_duration,
1371 rt->delay_multiplier
// make the template the head of the host's list
// NOTE(review): the condition guarding these two statements is not visible
1378 rt->next = host->templates;
1379 host->templates = rt;
// Parse a duration such as "10", "-5m" or "1.5h" into seconds.
//
// The string has to start with a digit, '+' or '-'. The number may be
// followed by a unit:
//   Y (365 days), M (30 days), w (weeks), d (days), h (hours), m (minutes),
//   s (seconds).
// No unit, or any other character after the number, means seconds.
// The result is truncated to int.
//
// Returns 1 on success with the seconds in *result,
// 0 when the string does not start like a number (*result is set to 0).
static inline int health_parse_duration(char *string, int *result) {
    // make sure it is a number
    // (the cast is needed: isdigit() on a negative char is undefined behavior)
    if(!*string || !(isdigit((unsigned char)*string) || *string == '+' || *string == '-')) {
        *result = 0;
        return 0;
    }

    char *e = NULL;
    long double n = strtold(string, &e);
    if(e && *e) {
        // *e is the first character strtold() could not use: the unit
        switch (*e) {
            case 'Y':
                *result = (int) (n * 86400 * 365);
                break;
            case 'M':
                *result = (int) (n * 86400 * 30);
                break;
            case 'w':
                *result = (int) (n * 86400 * 7);
                break;
            case 'd':
                *result = (int) (n * 86400);
                break;
            case 'h':
                *result = (int) (n * 3600);
                break;
            case 'm':
                *result = (int) (n * 60);
                break;

            default:
            case 's':
                *result = (int) (n);
                break;
        }
    }
    else
        *result = (int) (n);

    return 1;
}
// Parse the value of a 'delay' configuration line.
//
// 'string' is a whitespace separated list of "keyword value" pairs and is
// modified in place (the whitespace is overwritten with NULs). Keywords are
// matched ignoring case:
//   up DURATION, down DURATION, max DURATION   (see health_parse_duration())
//   multiplier NUMBER                           (has to be a finite number > 0)
//
// Invalid values and unknown keywords are logged, using line/path/file to
// identify the configuration line, and are otherwise ignored.
// Anything not given (or given with an invalid value) gets a default:
// up 0, down 0, multiplier 1.0, and max = the largest of up * multiplier and
// down * multiplier (max is never lowered below the value it came in with).
//
// Always returns 1.
static inline int health_parse_delay(
        size_t line, const char *path, const char *file, char *string,
        int *delay_up_duration,
        int *delay_down_duration,
        int *delay_max_duration,
        float *delay_multiplier) {

    char given_up = 0;
    char given_down = 0;
    char given_max = 0;
    char given_multiplier = 0;

    char *s = string;
    while(*s) {
        // the <ctype.h> functions need unsigned char values:
        // a negative char is undefined behavior
        char *key = s;

        while(*s && !isspace((unsigned char)*s)) s++;
        while(*s && isspace((unsigned char)*s)) *s++ = '\0';

        if(!*key) break;

        char *value = s;
        while(*s && !isspace((unsigned char)*s)) s++;
        while(*s && isspace((unsigned char)*s)) *s++ = '\0';

        if(!strcasecmp(key, "up")) {
            if (!health_parse_duration(value, delay_up_duration)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_up = 1;
        }
        else if(!strcasecmp(key, "down")) {
            if (!health_parse_duration(value, delay_down_duration)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_down = 1;
        }
        else if(!strcasecmp(key, "multiplier")) {
            *delay_multiplier = strtof(value, NULL);
            if(isnan(*delay_multiplier) || isinf(*delay_multiplier) || islessequal(*delay_multiplier, 0)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_multiplier = 1;
        }
        else if(!strcasecmp(key, "max")) {
            if (!health_parse_duration(value, delay_max_duration)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_max = 1;
        }
        else {
            error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
                  line, path, file, key);
        }
    }

    if(!given_up)
        *delay_up_duration = 0;

    if(!given_down)
        *delay_down_duration = 0;

    // also replaces a multiplier that was given but was invalid
    if(!given_multiplier)
        *delay_multiplier = 1.0;

    if(!given_max) {
        if((*delay_max_duration) < (*delay_up_duration) * (*delay_multiplier))
            *delay_max_duration = (*delay_up_duration) * (*delay_multiplier);

        if((*delay_max_duration) < (*delay_down_duration) * (*delay_multiplier))
            *delay_max_duration = (*delay_down_duration) * (*delay_multiplier);
    }

    return 1;
}
// Parse the value of a 'lookup' configuration line:
//
//   METHOD AFTER [at BEFORE] [every DURATION] [OPTIONS...] [of DIMENSIONS]
//
// 'string' is tokenized in place on whitespace (the whitespace is overwritten
// with NULs). METHOD is resolved with web_client_api_request_v1_data_group(),
// AFTER / BEFORE / DURATION with health_parse_duration(). The optional
// keywords are matched ignoring case. line/path/file are used only in log
// messages.
//
// Any previous *dimensions string is released on entry.
//
// NOTE(review): isspace() is called on plain char below; for bytes above 0x7f
// on platforms where char is signed this is undefined behavior - consider
// casting to unsigned char.
1507 static inline int health_parse_db_lookup(
1508 size_t line, const char *path, const char *file, char *string,
1509 int *group_method, int *after, int *before, int *every,
1510 uint32_t *options, char **dimensions
1512 debug(D_HEALTH, "Health configuration parsing database lookup %zu@%s/%s: %s", line, path, file, string);
1514 if(*dimensions) freez(*dimensions);
1521 char *s = string, *key;
1523 // first is the group method
1525 while(*s && !isspace(*s)) s++;
1526 while(*s && isspace(*s)) *s++ = '\0';
1528 error("Health configuration invalid chart calculation at line %zu of file '%s/%s': expected group method followed by the 'after' time, but got '%s'",
1529 line, path, file, key);
// -1 is given as the default, so -1 comes back for an unknown method name
1533 if((*group_method = web_client_api_request_v1_data_group(key, -1)) == -1) {
1534 error("Health configuration at line %zu of file '%s/%s': invalid group method '%s'",
1535 line, path, file, key);
1539 // then is the 'after' time
1541 while(*s && !isspace(*s)) s++;
1542 while(*s && isspace(*s)) *s++ = '\0';
1544 if(!health_parse_duration(key, after)) {
1545 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' after group method",
1546 line, path, file, key);
// default frequency: the absolute value of the 'after' duration;
// an 'every' keyword below replaces it
1551 *every = abs(*after);
1553 // now we may have optional parameters
1556 while(*s && !isspace(*s)) s++;
1557 while(*s && isspace(*s)) *s++ = '\0';
1560 if(!strcasecmp(key, "at")) {
1562 while(*s && !isspace(*s)) s++;
1563 while(*s && isspace(*s)) *s++ = '\0';
1565 if (!health_parse_duration(value, before)) {
1566 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1567 line, path, file, value, key);
1570 else if(!strcasecmp(key, HEALTH_EVERY_KEY)) {
1572 while(*s && !isspace(*s)) s++;
1573 while(*s && isspace(*s)) *s++ = '\0';
1575 if (!health_parse_duration(value, every)) {
1576 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1577 line, path, file, value, key);
// the keywords below take no value; each one sets a flag in *options
1580 else if(!strcasecmp(key, "absolute") || !strcasecmp(key, "abs") || !strcasecmp(key, "absolute_sum")) {
1581 *options |= RRDR_OPTION_ABSOLUTE;
1583 else if(!strcasecmp(key, "min2max")) {
1584 *options |= RRDR_OPTION_MIN2MAX;
1586 else if(!strcasecmp(key, "null2zero")) {
1587 *options |= RRDR_OPTION_NULL2ZERO;
1589 else if(!strcasecmp(key, "percentage")) {
1590 *options |= RRDR_OPTION_PERCENTAGE;
1592 else if(!strcasecmp(key, "unaligned")) {
1593 *options |= RRDR_OPTION_NOT_ALIGNED;
// 'of' takes the whole rest of the line as the dimensions, unless it is "all"
1595 else if(!strcasecmp(key, "of")) {
1596 if(*s && strcasecmp(s, "all"))
1597 *dimensions = strdupz(s);
1601 error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
1602 line, path, file, key);
// Replace every tab of the string with a space, in place.
// Returns the string it was given, so calls can be nested in expressions.
static inline char *tabs2spaces(char *s) {
    char *p;

    for(p = s; *p != '\0'; p++) {
        if(*p == '\t')
            *p = ' ';
    }

    return s;
}
// Build the "LINE@PATH/FILENAME" string that identifies the configuration
// line an alarm or template was defined at.
// Returns a copy made with strdupz(); the caller owns it.
static inline char *health_source_file(size_t line, const char *path, const char *filename) {
    char source[FILENAME_MAX + 1];

    snprintfz(source, FILENAME_MAX, "%zu@%s/%s", line, path, filename);

    char *copy = strdupz(source);
    return copy;
}
// Replace every single and double quote of the string with a space, in place.
static inline void strip_quotes(char *s) {
    for( ; *s != '\0'; s++) {
        switch(*s) {
            case '\'':
            case '"':
                *s = ' ';
                break;

            default:
                break;
        }
    }
}
// Read the health configuration file 'path/filename' and add the alarms and
// templates it defines to localhost.
//
// The file is a list of "key: value" lines. Keys are matched ignoring case:
// a hash of the key is compared first, strcasecmp() confirms the match.
// A line ending with a backslash is continued on the next line.
//
//  - 'alarm' starts a new alarm (RRDCALC), 'template' a new template
//    (RRDCALCTEMPLATE). Both first submit the definition that was being built
//    (rrdcalc_add_alarm_from_config() /
//    rrdcalctemplate_add_template_from_config()) and free it if it is rejected.
//  - every other key sets an attribute of the alarm or template being built;
//    the first chain of keys below works on 'rc', the second one on 'rt'.
//  - the definition still pending at the end of the file is submitted too.
//
// Malformed lines (no ':', empty key, empty value), unknown keys and values
// that cannot be parsed are logged.
1632 int health_readfile(const char *path, const char *filename) {
1633 debug(D_HEALTH, "Health configuration reading file '%s/%s'", path, filename);
// hashes of the keywords; static, so they are computed only on the first call
// (hash_alarm doubles as the 'already initialized' flag)
1635 static uint32_t hash_alarm = 0, hash_template = 0, hash_on = 0, hash_calc = 0, hash_green = 0, hash_red = 0, hash_warn = 0, hash_crit = 0, hash_exec = 0, hash_every = 0, hash_lookup = 0, hash_units = 0, hash_info = 0, hash_recipient = 0, hash_delay = 0;
1636 char buffer[HEALTH_CONF_MAX_LINE + 1];
1638 if(unlikely(!hash_alarm)) {
1639 hash_alarm = simple_uhash(HEALTH_ALARM_KEY);
1640 hash_template = simple_uhash(HEALTH_TEMPLATE_KEY);
1641 hash_on = simple_uhash(HEALTH_ON_KEY);
1642 hash_calc = simple_uhash(HEALTH_CALC_KEY);
1643 hash_lookup = simple_uhash(HEALTH_LOOKUP_KEY);
1644 hash_green = simple_uhash(HEALTH_GREEN_KEY);
1645 hash_red = simple_uhash(HEALTH_RED_KEY);
1646 hash_warn = simple_uhash(HEALTH_WARN_KEY);
1647 hash_crit = simple_uhash(HEALTH_CRIT_KEY);
1648 hash_exec = simple_uhash(HEALTH_EXEC_KEY);
1649 hash_every = simple_uhash(HEALTH_EVERY_KEY);
// NOTE(review): the next three use simple_hash() while the key of every line
// is hashed with simple_uhash() further down; this matches only if both give
// the same value for these lowercase keywords - confirm, or use simple_uhash()
1650 hash_units = simple_hash(HEALTH_UNITS_KEY);
1651 hash_info = simple_hash(HEALTH_INFO_KEY);
1652 hash_recipient = simple_hash(HEALTH_RECIPIENT_KEY);
1653 hash_delay = simple_uhash(HEALTH_DELAY_KEY);
// the line buffer is used temporarily to build the pathname of the file
1656 snprintfz(buffer, HEALTH_CONF_MAX_LINE, "%s/%s", path, filename);
1657 FILE *fp = fopen(buffer, "r");
1659 error("Health configuration cannot read file '%s'.", buffer);
1664 RRDCALCTEMPLATE *rt = NULL;
// 'append' is the offset in buffer where the next fgets() stores its data;
// it is non-zero while a continued (backslash terminated) line is collected
1666 size_t line = 0, append = 0;
1668 while((s = fgets(&buffer[append], (int)(HEALTH_CONF_MAX_LINE - append), fp)) || append) {
// end of file while a continued line is pending: process what was collected
1669 int stop_appending = !s;
1671 // info("Line %zu of file '%s/%s': '%s'", line, path, filename, s);
1674 // info("Trimmed line %zu of file '%s/%s': '%s'", line, path, filename, s);
// a trailing backslash is replaced with a space and the line is continued
1677 if(!stop_appending && s[append - 1] == '\\') {
1678 s[append - 1] = ' ';
1679 append = &s[append] - buffer;
1680 if(append < HEALTH_CONF_MAX_LINE)
1683 error("Health configuration has too long muli-line at line %zu of file '%s/%s'.", line, path, filename);
// the key ends at the first ':' of the line
1689 while(*s && *s != ':') s++;
1691 error("Health configuration has invalid line %zu of file '%s/%s'. It does not contain a ':'. Ignoring it.", line, path, filename);
1699 value = trim(value);
1702 error("Health configuration has invalid line %zu of file '%s/%s'. Keyword is empty. Ignoring it.", line, path, filename);
1707 error("Health configuration has invalid line %zu of file '%s/%s'. value is empty. Ignoring it.", line, path, filename);
1711 // info("Health file '%s/%s', key '%s', value '%s'", path, filename, key, value);
1712 uint32_t hash = simple_uhash(key);
// 'alarm': submit the pending alarm / template and start a new alarm
1714 if(hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) {
1715 if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
1716 rrdcalc_free(&localhost, rc);
1719 if (!rrdcalctemplate_add_template_from_config(&localhost, rt))
1720 rrdcalctemplate_free(&localhost, rt);
1724 rc = callocz(1, sizeof(RRDCALC));
1725 rc->next_event_id = 1;
1726 rc->name = tabs2spaces(strdupz(value));
1727 rc->hash = simple_hash(rc->name);
1728 rc->source = health_source_file(line, path, filename);
1732 rc->old_value = NAN;
1733 rc->delay_multiplier = 1.0;
1735 if(rrdvar_fix_name(rc->name))
1736 error("Health configuration renamed alarm '%s' to '%s'", value, rc->name);
// 'template': submit the pending alarm / template and start a new template
1738 else if(hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY)) {
1740 if(!rrdcalc_add_alarm_from_config(&localhost, rc))
1741 rrdcalc_free(&localhost, rc);
1745 if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
1746 rrdcalctemplate_free(&localhost, rt);
1748 rt = callocz(1, sizeof(RRDCALCTEMPLATE));
1749 rt->name = tabs2spaces(strdupz(value));
1750 rt->hash_name = simple_hash(rt->name);
1751 rt->source = health_source_file(line, path, filename);
1754 rt->delay_multiplier = 1.0;
1756 if(rrdvar_fix_name(rt->name))
1757 error("Health configuration renamed template '%s' to '%s'", value, rt->name);
// attributes of the alarm being built (rc)
1760 if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
1762 if(strcmp(rc->chart, value))
1763 info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1764 line, path, filename, rc->name, key, rc->chart, value, value);
1768 rc->chart = tabs2spaces(strdupz(value));
1769 rc->hash_chart = simple_hash(rc->chart);
1771 else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
1772 health_parse_db_lookup(line, path, filename, value, &rc->group, &rc->after, &rc->before,
1774 &rc->options, &rc->dimensions);
1776 else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
1777 if(!health_parse_duration(value, &rc->update_every))
1778 info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.",
1779 line, path, filename, rc->name, key, value);
1781 else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
1783 rc->green = strtold(value, &e);
1785 info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1786 line, path, filename, rc->name, key, e);
1789 else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
1791 rc->red = strtold(value, &e);
1793 info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1794 line, path, filename, rc->name, key, e);
1797 else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
1798 const char *failed_at = NULL;
1800 rc->calculation = expression_parse(value, &failed_at, &error);
1801 if(!rc->calculation) {
1802 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1803 line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1806 else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
1807 const char *failed_at = NULL;
1809 rc->warning = expression_parse(value, &failed_at, &error);
1811 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1812 line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1815 else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
1816 const char *failed_at = NULL;
1818 rc->critical = expression_parse(value, &failed_at, &error);
1820 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1821 line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1824 else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
1826 if(strcmp(rc->exec, value))
1827 info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1828 line, path, filename, rc->name, key, rc->exec, value, value);
1832 rc->exec = tabs2spaces(strdupz(value));
1834 else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
1836 if(strcmp(rc->recipient, value))
1837 info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1838 line, path, filename, rc->name, key, rc->recipient, value, value);
1840 freez(rc->recipient);
1842 rc->recipient = tabs2spaces(strdupz(value));
1844 else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
1846 if(strcmp(rc->units, value))
1847 info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1848 line, path, filename, rc->name, key, rc->units, value, value);
1852 rc->units = tabs2spaces(strdupz(value));
1853 strip_quotes(rc->units);
1855 else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
1857 if(strcmp(rc->info, value))
1858 info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1859 line, path, filename, rc->name, key, rc->info, value, value);
1863 rc->info = tabs2spaces(strdupz(value));
1864 strip_quotes(rc->info);
1866 else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
1867 health_parse_delay(line, path, filename, value, &rc->delay_up_duration, &rc->delay_down_duration, &rc->delay_max_duration, &rc->delay_multiplier);
1870 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has unknown key '%s'.",
1871 line, path, filename, rc->name, key);
// attributes of the template being built (rt); for a template 'on' sets the
// context to match, not a chart
1875 if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
1877 if(strcmp(rt->context, value))
1878 info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1879 line, path, filename, rt->name, key, rt->context, value, value);
1883 rt->context = tabs2spaces(strdupz(value));
1884 rt->hash_context = simple_hash(rt->context);
1886 else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
1887 health_parse_db_lookup(line, path, filename, value, &rt->group, &rt->after, &rt->before,
1889 &rt->options, &rt->dimensions);
1891 else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
1892 if(!health_parse_duration(value, &rt->update_every))
1893 info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' cannot parse duration: '%s'.",
1894 line, path, filename, rt->name, key, value);
1896 else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
1898 rt->green = strtold(value, &e);
1900 info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
1901 line, path, filename, rt->name, key, e);
1904 else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
1906 rt->red = strtold(value, &e);
1908 info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
1909 line, path, filename, rt->name, key, e);
1912 else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
1913 const char *failed_at = NULL;
1915 rt->calculation = expression_parse(value, &failed_at, &error);
1916 if(!rt->calculation) {
1917 error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1918 line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1921 else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
1922 const char *failed_at = NULL;
1924 rt->warning = expression_parse(value, &failed_at, &error);
1926 error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1927 line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1930 else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
1931 const char *failed_at = NULL;
1933 rt->critical = expression_parse(value, &failed_at, &error);
1935 error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1936 line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1939 else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
1941 if(strcmp(rt->exec, value))
1942 info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1943 line, path, filename, rt->name, key, rt->exec, value, value);
1947 rt->exec = tabs2spaces(strdupz(value));
1949 else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
1951 if(strcmp(rt->recipient, value))
1952 info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1953 line, path, filename, rt->name, key, rt->recipient, value, value);
1955 freez(rt->recipient);
1957 rt->recipient = tabs2spaces(strdupz(value));
1959 else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
1961 if(strcmp(rt->units, value))
1962 info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1963 line, path, filename, rt->name, key, rt->units, value, value);
1967 rt->units = tabs2spaces(strdupz(value));
1968 strip_quotes(rt->units);
1970 else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
1972 if(strcmp(rt->info, value))
1973 info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1974 line, path, filename, rt->name, key, rt->info, value, value);
1978 rt->info = tabs2spaces(strdupz(value));
1979 strip_quotes(rt->info);
1981 else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
1982 health_parse_delay(line, path, filename, value, &rt->delay_up_duration, &rt->delay_down_duration, &rt->delay_max_duration, &rt->delay_multiplier);
1985 error("Health configuration at line %zu of file '%s/%s' for template '%s' has unknown key '%s'.",
1986 line, path, filename, rt->name, key);
// an attribute key without a preceding 'alarm' or 'template' line
1990 error("Health configuration at line %zu of file '%s/%s' has unknown key '%s'. Expected either '" HEALTH_ALARM_KEY "' or '" HEALTH_TEMPLATE_KEY "'.",
1991 line, path, filename, key);
// end of file: submit the definition that was still being built
1995 if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
1996 rrdcalc_free(&localhost, rc);
1998 if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
1999 rrdcalctemplate_free(&localhost, rt);
2005 void health_readdir(const char *path) {
2006 size_t pathlen = strlen(path);
2008 debug(D_HEALTH, "Health configuration reading directory '%s'", path);
2010 DIR *dir = opendir(path);
2012 error("Health configuration cannot open directory '%s'.", path);
2016 struct dirent *de = NULL;
2017 while ((de = readdir(dir))) {
2018 size_t len = strlen(de->d_name);
2020 if(de->d_type == DT_DIR
2022 (de->d_name[0] == '.' && de->d_name[1] == '\0')
2023 || (de->d_name[0] == '.' && de->d_name[1] == '.' && de->d_name[2] == '\0')
2025 debug(D_HEALTH, "Ignoring directory '%s'", de->d_name);
2029 else if(de->d_type == DT_DIR) {
2030 char *s = mallocz(pathlen + strlen(de->d_name) + 2);
2033 strcat(s, de->d_name);
2039 else if((de->d_type == DT_LNK || de->d_type == DT_REG || de->d_type == DT_UNKNOWN) &&
2040 len > 5 && !strcmp(&de->d_name[len - 5], ".conf")) {
2041 health_readfile(path, de->d_name);
2044 else debug(D_HEALTH, "Ignoring file '%s'", de->d_name);
2050 static inline char *health_config_dir(void) {
2051 char buffer[FILENAME_MAX + 1];
2052 snprintfz(buffer, FILENAME_MAX, "%s/health.d", config_get("global", "config directory", CONFIG_DIR));
2053 return config_get("health", "health configuration directory", buffer);
// Initialize health monitoring for localhost.
//
// Reads the [health] section of the netdata configuration, creates the health
// db directory (the process is aborted with fatal() if that fails), loads the
// saved alarm log, opens the alarm log, and finally reads all the health
// configuration files of the health configuration directory.
//
// When health is disabled in the configuration, only health_enabled is updated.
2056 void health_init(void) {
2057 debug(D_HEALTH, "Health configuration initializing");
2059 if(!(health_enabled = config_get_boolean("health", "enabled", 1))) {
2060 debug(D_HEALTH, "Health is disabled.");
// an already existing directory is not an error
2064 char *pathname = config_get("health", "health db directory", VARLIB_DIR "/health");
2065 if(mkdir(pathname, 0770) == -1 && errno != EEXIST)
2066 fatal("Cannot create directory '%s'.", pathname);
// the alarm log file defaults to 'health-log.db' in the health db directory
2068 char filename[FILENAME_MAX + 1];
2069 snprintfz(filename, FILENAME_MAX, "%s/health-log.db", pathname);
2070 health.log_filename = config_get("health", "health db file", filename);
// load the saved alarm log before opening the log file
2072 health_alarm_log_load(&localhost);
2073 health_alarm_log_open();
2075 char *path = health_config_dir();
// the default notification script is 'alarm-notify.sh' in the plugins directory
2078 char buffer[FILENAME_MAX + 1];
2079 snprintfz(buffer, FILENAME_MAX, "%s/alarm-notify.sh", config_get("global", "plugins directory", PLUGINS_DIR));
2080 health.health_default_exec = config_get("health", "script to execute on alarm", buffer);
// an invalid number of in-memory log entries is replaced, also in the
// configuration, with the current default
// NOTE(review): the condition that decides what is invalid is not visible here
2083 long n = config_get_number("health", "in memory max health log entries", (long)localhost.health_log.max);
2085 error("Health configuration has invalid max log entries %ld. Using default %u", n, localhost.health_log.max);
2086 config_set_number("health", "in memory max health log entries", (long)localhost.health_log.max);
2088 else localhost.health_log.max = (unsigned int)n;
// the configuration is loaded with the host locked
2090 rrdhost_rwlock(&localhost);
2091 health_readdir(path);
2092 rrdhost_unlock(&localhost);
2095 // ----------------------------------------------------------------------------
// Append a JSON member to 'wb': either
//     prefix "label":"value" suffix
// or
//     prefix "label":null suffix
// NOTE(review): the condition selecting between the two is not visible in
// this listing (presumably whether 'value' is set) - confirm.
// NOTE(review): 'value' is written as it is; a value containing '"' or '\'
// would produce invalid JSON.
2098 static inline void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) {
2100 buffer_sprintf(wb, "%s\"%s\":\"%s\"%s", prefix, label, value, suffix);
2102 buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix);
// Append the alarm log entry 'ae' to 'wb' as one JSON object.
//
// The object starts on a new line with "\n\t{" and ends with "\t}", without a
// trailing comma or newline, so the caller adds the separators.
// As the name says, no lock is taken here.
//
// NOTE(review): strings are written as they are; a string containing '"' or
// '\' would produce invalid JSON.
2105 static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) {
2106 buffer_sprintf(wb, "\n\t{\n"
2107 "\t\t\"hostname\": \"%s\",\n"
2108 "\t\t\"unique_id\": %u,\n"
2109 "\t\t\"alarm_id\": %u,\n"
2110 "\t\t\"alarm_event_id\": %u,\n"
2111 "\t\t\"name\": \"%s\",\n"
2112 "\t\t\"chart\": \"%s\",\n"
2113 "\t\t\"family\": \"%s\",\n"
2114 "\t\t\"processed\": %s,\n"
2115 "\t\t\"updated\": %s,\n"
2116 "\t\t\"exec_run\": %lu,\n"
2117 "\t\t\"exec_failed\": %s,\n"
2118 "\t\t\"exec\": \"%s\",\n"
2119 "\t\t\"recipient\": \"%s\",\n"
2120 "\t\t\"exec_code\": %d,\n"
2121 "\t\t\"source\": \"%s\",\n"
2122 "\t\t\"units\": \"%s\",\n"
2123 "\t\t\"info\": \"%s\",\n"
2124 "\t\t\"when\": %lu,\n"
2125 "\t\t\"duration\": %lu,\n"
2126 "\t\t\"non_clear_duration\": %lu,\n"
2127 "\t\t\"status\": \"%s\",\n"
2128 "\t\t\"old_status\": \"%s\",\n"
2129 "\t\t\"delay\": %d,\n"
2130 "\t\t\"delay_up_to_timestamp\": %lu,\n"
2131 "\t\t\"updated_by_id\": %u,\n"
2132 "\t\t\"updates_id\": %u,\n",
2140 (ae->flags & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false",
2141 (ae->flags & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false",
2142 (unsigned long)ae->exec_run_timestamp,
2143 (ae->flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false",
2144 ae->exec?ae->exec:health.health_default_exec,
2145 ae->recipient?ae->recipient:health.health_default_recipient,
2148 ae->units?ae->units:"",
2149 ae->info?ae->info:"",
2150 (unsigned long)ae->when,
2151 (unsigned long)ae->duration,
2152 (unsigned long)ae->non_clear_duration,
2153 rrdcalc_status2string(ae->new_status),
2154 rrdcalc_status2string(ae->old_status),
2156 (unsigned long)ae->delay_up_to_timestamp,
// the values are formatted by buffer_rrd_value(), not by the printf above;
// "old_value" is the last member, so it is not followed by a comma
2161 buffer_strcat(wb, "\t\t\"value\":");
2162 buffer_rrd_value(wb, ae->new_value);
2163 buffer_strcat(wb, ",\n");
2165 buffer_strcat(wb, "\t\t\"old_value\":");
2166 buffer_rrd_value(wb, ae->old_value);
2167 buffer_strcat(wb, "\n");
2169 buffer_strcat(wb, "\t}");
// Writes the host's in-memory alarm log to wb as a JSON array, including only
// entries whose unique_id is greater than `after`.
// The alarm log read lock is held for the whole traversal.
2172 void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after) {
2173 pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
2175 buffer_strcat(wb, "[");
2177 unsigned int max = host->health_log.max;
2178 unsigned int count = 0;
// visit at most `max` entries; count advances for every entry visited,
// whether or not it passes the `after` filter
2180 for(ae = host->health_log.alarms; ae && count < max ; count++, ae = ae->next) {
2181 if(ae->unique_id > after) {
// comma separator before every emitted entry except the one at position 0
// NOTE(review): if the entry at position 0 is filtered out and a later one is emitted,
// the array would start with a comma - presumably the list is ordered by descending
// unique_id so this cannot happen; confirm.
2182 if(likely(count)) buffer_strcat(wb, ",");
2183 health_alarm_entry2json_nolock(wb, ae, host);
2187 buffer_strcat(wb, "\n]\n");
2189 pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
// Appends one alarm (RRDCALC) to wb as a JSON object member keyed "<chart>.<name>".
// The function takes no lock itself; health_alarms2json() calls it between
// rrdhost_rdlock() and rrdhost_unlock().
// NOTE(review): string fields are inserted with %s as-is (no JSON escaping) - confirm inputs are safe.
2192 static inline void health_rrdcalc2json_nolock(BUFFER *wb, RRDCALC *rc) {
2194 "\t\t\"%s.%s\": {\n"
2195 "\t\t\t\"id\": %lu,\n"
2196 "\t\t\t\"name\": \"%s\",\n"
2197 "\t\t\t\"chart\": \"%s\",\n"
2198 "\t\t\t\"family\": \"%s\",\n"
2199 "\t\t\t\"active\": %s,\n"
2200 "\t\t\t\"exec\": \"%s\",\n"
2201 "\t\t\t\"recipient\": \"%s\",\n"
2202 "\t\t\t\"source\": \"%s\",\n"
2203 "\t\t\t\"units\": \"%s\",\n"
2204 "\t\t\t\"info\": \"%s\",\n"
2205 "\t\t\t\"status\": \"%s\",\n"
2206 "\t\t\t\"last_status_change\": %lu,\n"
2207 "\t\t\t\"last_updated\": %lu,\n"
2208 "\t\t\t\"next_update\": %lu,\n"
2209 "\t\t\t\"update_every\": %d,\n"
2210 "\t\t\t\"delay_up_duration\": %d,\n"
2211 "\t\t\t\"delay_down_duration\": %d,\n"
2212 "\t\t\t\"delay_max_duration\": %d,\n"
2213 "\t\t\t\"delay_multiplier\": %f,\n"
2214 "\t\t\t\"delay\": %d,\n"
2215 "\t\t\t\"delay_up_to_timestamp\": %lu,\n"
2216 , rc->chart, rc->name
2217 , (unsigned long)rc->id
// family comes from the linked chart; empty when the alarm is not linked or the chart has none
2220 , (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:""
// "active" is true when the alarm is linked to a chart (rc->rrdset is set)
2221 , (rc->rrdset)?"true":"false"
// alarms without their own exec / recipient report the health-wide defaults
2222 , rc->exec?rc->exec:health.health_default_exec
2223 , rc->recipient?rc->recipient:health.health_default_recipient
2225 , rc->units?rc->units:""
2226 , rc->info?rc->info:""
2227 , rrdcalc_status2string(rc->status)
2228 , (unsigned long)rc->last_status_change
2229 , (unsigned long)rc->last_updated
2230 , (unsigned long)rc->next_update
2232 , rc->delay_up_duration
2233 , rc->delay_down_duration
2234 , rc->delay_max_duration
2235 , rc->delay_multiplier
2237 , (unsigned long)rc->delay_up_to_timestamp
// database lookup details are only emitted for alarms that have a lookup
2240 if(RRDCALC_HAS_DB_LOOKUP(rc)) {
2241 if(rc->dimensions && *rc->dimensions)
2242 health_string2json(wb, "\t\t\t", "lookup_dimensions", rc->dimensions, ",\n");
2245 "\t\t\t\"db_after\": %lu,\n"
2246 "\t\t\t\"db_before\": %lu,\n"
2247 "\t\t\t\"lookup_method\": \"%s\",\n"
2248 "\t\t\t\"lookup_after\": %d,\n"
2249 "\t\t\t\"lookup_before\": %d,\n"
2250 "\t\t\t\"lookup_options\": \"",
2251 (unsigned long) rc->db_after,
2252 (unsigned long) rc->db_before,
2253 group_method2string(rc->group),
// the "lookup_options" string was left open by the format above:
// append the options, then close the quote and the member
2257 buffer_data_options2string(wb, rc->options);
2258 buffer_strcat(wb, "\",\n");
// for each expression both its source and its parsed form are emitted
2261 if(rc->calculation) {
2262 health_string2json(wb, "\t\t\t", "calc", rc->calculation->source, ",\n");
2263 health_string2json(wb, "\t\t\t", "calc_parsed", rc->calculation->parsed_as, ",\n");
2267 health_string2json(wb, "\t\t\t", "warn", rc->warning->source, ",\n");
2268 health_string2json(wb, "\t\t\t", "warn_parsed", rc->warning->parsed_as, ",\n");
2272 health_string2json(wb, "\t\t\t", "crit", rc->critical->source, ",\n");
2273 health_string2json(wb, "\t\t\t", "crit_parsed", rc->critical->parsed_as, ",\n");
2276 buffer_strcat(wb, "\t\t\t\"green\":");
2277 buffer_rrd_value(wb, rc->green);
2278 buffer_strcat(wb, ",\n");
2280 buffer_strcat(wb, "\t\t\t\"red\":");
2281 buffer_rrd_value(wb, rc->red);
2282 buffer_strcat(wb, ",\n");
// "value" is the last member of the object, so it gets no trailing comma
2284 buffer_strcat(wb, "\t\t\t\"value\":");
2285 buffer_rrd_value(wb, rc->value);
2286 buffer_strcat(wb, "\n");
2288 buffer_strcat(wb, "\t\t}");
2291 //void health_rrdcalctemplate2json_nolock(BUFFER *wb, RRDCALCTEMPLATE *rt) {
// Writes a JSON object describing the host's alarms to wb: a header (hostname,
// latest alarm log unique id, health enabled status, a timestamp taken from
// time(NULL)) followed by an "alarms" object with one member per reported alarm.
// When `all` is zero only alarms in WARNING or CRITICAL status pass the filter below.
// NOTE(review): the lock is taken on the global localhost, while the data is read
// from the `host` argument - confirm callers only ever pass &localhost.
2295 void health_alarms2json(RRDHOST *host, BUFFER *wb, int all) {
2298 rrdhost_rdlock(&localhost);
2299 buffer_sprintf(wb, "{\n\t\"hostname\": \"%s\","
2300 "\n\t\"latest_alarm_log_unique_id\": %u,"
2301 "\n\t\"status\": %s,"
2303 "\n\t\"alarms\": {\n",
// next_log_id is the id the next log entry will get, so the latest one is next_log_id - 1 (0 when none)
2305 (host->health_log.next_log_id > 0)?(host->health_log.next_log_id - 1):0,
2306 health_enabled?"true":"false",
2307 (unsigned long)time(NULL));
2310 for(i = 0, rc = host->alarms; rc ; rc = rc->next) {
// alarms not linked to a chart, or whose chart has no collected data yet
// (presumably skipped - confirm)
2311 if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
// unless all alarms were requested, only WARNING / CRITICAL ones are of interest
2314 if(likely(!all && !(rc->status == RRDCALC_STATUS_WARNING || rc->status == RRDCALC_STATUS_CRITICAL)))
// separator between members; i is presumably the number of alarms emitted so far - confirm
2317 if(likely(i)) buffer_strcat(wb, ",\n");
2318 health_rrdcalc2json_nolock(wb, rc);
2322 // buffer_strcat(wb, "\n\t},\n\t\"templates\": {");
2323 // RRDCALCTEMPLATE *rt;
2324 // for(rt = host->templates; rt ; rt = rt->next)
2325 // health_rrdcalctemplate2json_nolock(wb, rt);
// close the "alarms" object and the top-level object
2327 buffer_strcat(wb, "\n\t}\n}\n");
2328 rrdhost_unlock(&localhost);
2332 // ----------------------------------------------------------------------------
2333 // re-load health configuration
// Frees the alarm templates and the alarms of a host.
// The function takes no lock itself; health_reload() calls it between
// rrdhost_rwlock() and rrdhost_unlock().
2335 static inline void health_free_all_nolock(RRDHOST *host) {
// always free the current head of the list
// NOTE(review): this terminates only if rrdcalctemplate_free() unlinks the
// template from host->templates - confirm.
2336 while(host->templates)
2337 rrdcalctemplate_free(host, host->templates);
// same head-of-list pattern for the alarms
2340 rrdcalc_free(host, host->alarms);
// Reloads the health configuration of localhost:
// frees all alarms and templates, flags existing alarm log entries as updated,
// re-reads the configuration directory and links the loaded alarms and
// templates to every chart. Logs an error instead when health is disabled.
2343 void health_reload(void) {
2344 if(!health_enabled) {
2345 error("Health reload is requested, but health is not enabled.");
2349 char *path = health_config_dir();
2351 // free all running alarms
2352 rrdhost_rwlock(&localhost);
2353 health_free_all_nolock(&localhost);
2354 rrdhost_unlock(&localhost);
2356 // invalidate all previous entries in the alarm log
// NOTE(review): no alarm_log_rwlock call is visible around this walk of the
// alarm log - confirm this cannot race with the health thread.
2358 for(t = localhost.health_log.alarms ; t ; t = t->next) {
// entries whose new status is REMOVED are left untouched
2359 if(t->new_status != RRDCALC_STATUS_REMOVED)
2360 t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
2363 // reset all thresholds to all charts
2365 for(st = localhost.rrdset_root; st ; st = st->next) {
2370 // load the new alarms
2371 rrdhost_rwlock(&localhost);
2372 health_readdir(path);
2373 rrdhost_unlock(&localhost);
2375 // link the loaded alarms to their charts
// the host lock is taken and released once per chart, not around the whole loop
2376 for(st = localhost.rrdset_root; st ; st = st->next) {
2377 rrdhost_rwlock(&localhost);
2379 rrdsetcalc_link_matching(st);
2380 rrdcalctemplate_link_matching(st);
2382 rrdhost_unlock(&localhost);
2386 // ----------------------------------------------------------------------------
2387 // health main thread and friends
// Maps the result of a warning / critical expression to a status:
//   NaN            -> RRDCALC_STATUS_UNDEFINED
//   non-zero value -> RRDCALC_STATUS_RAISED
//   zero           -> RRDCALC_STATUS_CLEAR
2389 static inline int rrdcalc_value2status(calculated_number n) {
// NaN must be tested first: a NaN is also "non-zero" for the test below
2390 if(isnan(n)) return RRDCALC_STATUS_UNDEFINED;
2391 if(n) return RRDCALC_STATUS_RAISED;
2392 return RRDCALC_STATUS_CLEAR;
// Handles the notification of one alarm log entry.
// The entry is always flagged PROCESSED. When a notification is due, the
// notification command is built and run through mypopen(), its exit code is
// stored in ae->exec_code (non-zero also sets EXEC_FAILED), and the entry is
// written to the health log with health_alarm_log_save().
// According to the comments and log messages below, no notification is sent for
// internal statuses (below CLEAR), for a repeat of the status of the previous
// notification of the same alarm, or for a first initialization to CLEAR.
2395 static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
2396 ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
2398 if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
2399 // do not send notifications for internal statuses
2403 // find the previous notification for the same alarm
2404 // which we have run the exec script
// the search starts at ae->next and follows the next pointers
2406 for(t = ae->next; t ;t = t->next) {
2407 if(t->alarm_id == ae->alarm_id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
2412 // we have executed this alarm notification in the past
2413 if (t && t->new_status == ae->new_status) {
2414 // don't send the same notification again
2415 info("Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name,
2416 rrdcalc_status2string(ae->new_status));
2421 // we have not executed this alarm notification in the past
2422 if(unlikely(ae->old_status == RRDCALC_STATUS_UNINITIALIZED && ae->new_status == RRDCALC_STATUS_CLEAR)) {
2423 info("Health not sending notification for first initialization of alarm '%s.%s' status %s", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
// buffer holds the command line first and is reused for the command's output afterwards
2428 char buffer[FILENAME_MAX + 1];
// fall back to the health-wide defaults when the entry has no exec / recipient
2431 const char *exec = ae->exec;
2432 if(!exec) exec = health.health_default_exec;
2434 const char *recipient = ae->recipient;
2435 if(!recipient) recipient = health.health_default_recipient;
// NOTE(review): the arguments are wrapped in single quotes without escaping; a value
// containing a single quote would break the command line - confirm inputs are trusted.
2437 snprintfz(buffer, FILENAME_MAX, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s'",
2444 (unsigned long)ae->when,
// NOTE(review): "NOCAHRT" looks like a typo of the "NOCHART" placeholder used
// elsewhere in this file - confirm before changing, scripts may match on it.
2446 ae->chart?ae->chart:"NOCAHRT",
2447 ae->family?ae->family:"NOFAMILY",
2448 rrdcalc_status2string(ae->new_status),
2449 rrdcalc_status2string(ae->old_status),
2452 ae->source?ae->source:"UNKNOWN",
2453 (uint32_t)ae->duration,
2454 (uint32_t)ae->non_clear_duration,
2455 ae->units?ae->units:"",
2456 ae->info?ae->info:""
// the entry is flagged as executed before the command is actually started
2459 ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
2460 ae->exec_run_timestamp = time(NULL);
2462 debug(D_HEALTH, "executing command '%s'", buffer);
2463 FILE *fp = mypopen(buffer, &command_pid);
2465 error("HEALTH: Cannot popen(\"%s\", \"r\").", buffer);
2468 debug(D_HEALTH, "HEALTH reading from command");
// a single fgets() call reads from the command's output before it is closed
2469 char *s = fgets(buffer, FILENAME_MAX, fp);
2471 ae->exec_code = mypclose(fp, command_pid);
2472 debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
2474 if(ae->exec_code != 0)
2475 ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
2478 health_alarm_log_save(host, ae);
// Logs the status transition recorded in an alarm log entry
// and hands the entry to health_alarm_execute().
2482 static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
2483 info("Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
2484 ae->chart?ae->chart:"NOCHART", ae->name,
2486 rrdcalc_status2string(ae->old_status),
2487 rrdcalc_status2string(ae->new_status)
2490 health_alarm_execute(host, ae);
// Processes the pending entries of a host's alarm log and then trims the log.
// Phase 1 (read lock): walks the log from its head and, for every entry that is
// neither PROCESSED nor UPDATED, runs the notifications once its
// delay_up_to_timestamp has been reached.
// Phase 2 (write lock): runs only when health_log.count exceeds health_log.max;
// skips the first max * 2 / 3 entries and removes entries after that point.
2493 static inline void health_alarm_log_process(RRDHOST *host) {
// id at which the walk stops; kept across calls
// NOTE(review): being static, this value is shared by all hosts; in this file
// the function is only called with &localhost.
2494 static uint32_t stop_at_id = 0;
// NOTE(review): the list head is read here before the read lock is taken - confirm this is safe.
2495 uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
2496 time_t now = time(NULL);
2498 pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
// entries with a unique_id below stop_at_id end the walk
2501 for(ae = host->health_log.alarms; ae && ae->unique_id >= stop_at_id ; ae = ae->next) {
2503 !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
2504 !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
// track the lowest unique_id among the pending entries
2507 if(unlikely(ae->unique_id < first_waiting))
2508 first_waiting = ae->unique_id;
// notify only once the entry's delay has expired
2510 if(likely(now >= ae->delay_up_to_timestamp))
2511 health_process_notifications(host, ae);
2515 // remember this for the next iteration
2516 stop_at_id = first_waiting;
2518 pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
// nothing to trim while the log is within its limit (presumably returns here - confirm)
2520 if(host->health_log.count <= host->health_log.max)
2523 // cleanup excess entries in the log
2524 pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
2526 ALARM_ENTRY *last = NULL;
// step over the first two thirds of the maximum; `last` trails one entry behind `ae`
2527 unsigned int count = host->health_log.max * 2 / 3;
2528 for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
2530 if(ae && last && last->next == ae)
2536 debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
// keep the successor before the current entry is released
2538 ALARM_ENTRY *t = ae->next;
2544 freez(ae->recipient);
2551 host->health_log.count--;
2554 pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
// Decides whether an alarm should be evaluated at time `now`.
// The checks below log a debug message when the alarm is not linked to a chart,
// its chart has not been collected yet, it has no update frequency, or its
// next_update is still in the future. In the last case *next_run is lowered to
// rc->next_update when that is earlier.
// Callers in this file treat a zero result as "not runnable"
// (presumably what each failing check returns - confirm).
2557 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
2558 if (unlikely(!rc->rrdset)) {
2559 debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
2563 if (unlikely(!rc->rrdset->last_collected_time.tv_sec)) {
2564 debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not yet collected.", rc->chart?rc->chart:"NOCHART", rc->name);
2568 if (unlikely(!rc->update_every)) {
2569 debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
2573 if (unlikely(rc->next_update > now)) {
// let the caller wake up in time for this alarm
2574 if (unlikely(*next_run > rc->next_update))
2575 *next_run = rc->next_update;
2577 debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
2582 // we should check that the DB lookup is possible
2584 // - the duration of the chart includes the required timeframe
2585 // we SHOULD NOT check the dimensions - there might be alarms that refer to non-existing dimensions (e.g. cpu steal)
// Entry point of the health monitoring thread.
// Loops while health is enabled and netdata is not exiting. Each iteration:
//   1. refreshes the value of every runnable alarm (database lookup and/or calculation expression),
//   2. evaluates the warning / critical expressions and records status changes in the alarm log,
//   3. processes the alarm log (notifications) with health_alarm_log_process(),
//   4. sleeps until next_run, which starts at now + min_run_every and is lowered to
//      the earliest next_update seen.
// Failures are reported with error() only the first time (tracked by a RRDCALC_FLAG_* bit
// that is cleared once the condition goes away) and with debug() every time.
2590 void *health_main(void *ptr) {
2593 info("HEALTH thread created with task id %d", gettid());
2595 if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
2596 error("Cannot set pthread cancel type to DEFERRED.");
2598 if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
2599 error("Cannot set pthread cancel state to ENABLE.");
// longest pause between two iterations, in seconds (at least 1)
2601 int min_run_every = (int)config_get_number("health", "run at least every seconds", 10);
2602 if(min_run_every < 1) min_run_every = 1;
// buffer handed to rrd2value() for the database lookups
2604 BUFFER *wb = buffer_create(100);
2606 unsigned int loop = 0;
2607 while(health_enabled && !netdata_exit) {
2609 debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
2611 int oldstate, runnable = 0;
2612 time_t now = time(NULL);
2613 time_t next_run = now + min_run_every;
// thread cancellation is disabled while alarms are evaluated; the previous
// state is restored after both passes
2616 if (unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
2617 error("Cannot set pthread cancel state to DISABLE.");
2619 rrdhost_rdlock(&localhost);
2621 // the first loop is to lookup values from the db
2622 for (rc = localhost.alarms; rc; rc = rc->next) {
2623 if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run)))
// keep the previous value; it is passed to health_alarm_log() on a status change
2627 rc->old_value = rc->value;
2629 // 1. if there is database lookup, do it
2630 // 2. if there is calculation expression, run it
2632 if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
2633 time_t old_db_timestamp = rc->db_before;
2634 int value_is_null = 0;
2636 int ret = rrd2value(rc->rrdset, wb, &rc->value,
2637 rc->dimensions, 1, rc->after, rc->before, rc->group,
2638 rc->options, &rc->db_after, &rc->db_before, &value_is_null);
// any return code other than 200 is treated as a failed lookup
2640 if (unlikely(ret != 200)) {
2641 // database lookup failed
2644 debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
2646 if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) {
2647 rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
2648 error("Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
2651 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))
2652 rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
// stale: the lookup's db_before is the same as before this lookup
2654 if (unlikely(old_db_timestamp == rc->db_before)) {
2655 // database is stale
2657 debug(D_HEALTH, "Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
2659 if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
2660 rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
2661 error("Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
2664 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
2665 rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
2667 if (unlikely(value_is_null)) {
2668 // collected value is null
2672 debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
2673 rc->chart?rc->chart:"NOCHART", rc->name);
2675 if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) {
2676 rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
2677 error("Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
2678 rc->chart?rc->chart:"NOCHART", rc->name);
2681 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))
2682 rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
2684 debug(D_HEALTH, "Health alarm '%s.%s': database lookup gave value "
2685 CALCULATED_NUMBER_FORMAT, rc->chart?rc->chart:"NOCHART", rc->name, rc->value);
2688 if(unlikely(rc->calculation)) {
2689 if (unlikely(!expression_evaluate(rc->calculation))) {
2690 // calculation failed
2694 debug(D_HEALTH, "Health alarm '%s.%s': failed to evaluate calculation with error: %s",
2695 rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->calculation->error_msg));
2697 if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) {
2698 rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
2699 error("Health alarm '%s.%s': failed to evaluate calculation with error: %s",
2700 rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->calculation->error_msg));
2704 if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))
2705 rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
2707 debug(D_HEALTH, "Health alarm '%s.%s': calculation expression gave value "
2708 CALCULATED_NUMBER_FORMAT
2709 ": %s (source: %s)",
2710 rc->chart?rc->chart:"NOCHART", rc->name,
2711 rc->calculation->result,
2712 buffer_tostring(rc->calculation->error_msg),
// the result of the calculation becomes the value of the alarm
2716 rc->value = rc->calculation->result;
2720 rrdhost_unlock(&localhost);
// second pass: evaluate the warning / critical expressions; entered only when
// `runnable` is non-zero (presumably counted in the first pass - confirm)
2722 if (unlikely(runnable && !netdata_exit)) {
2723 rrdhost_rdlock(&localhost);
2725 for (rc = localhost.alarms; rc; rc = rc->next) {
2726 if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run)))
2729 int warning_status = RRDCALC_STATUS_UNDEFINED;
2730 int critical_status = RRDCALC_STATUS_UNDEFINED;
2732 if(likely(rc->warning)) {
2733 if(unlikely(!expression_evaluate(rc->warning))) {
2734 // calculation failed
2736 debug(D_HEALTH, "Health alarm '%s.%s': warning expression failed with error: %s",
2737 rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
2739 if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) {
2740 rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
2741 error("Health alarm '%s.%s': warning expression failed with error: %s",
2742 rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
2746 if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))
2747 rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
2749 debug(D_HEALTH, "Health alarm '%s.%s': warning expression gave value "
2750 CALCULATED_NUMBER_FORMAT
2751 ": %s (source: %s)",
2752 rc->chart?rc->chart:"NOCHART", rc->name,
2753 rc->warning->result,
2754 buffer_tostring(rc->warning->error_msg),
2758 warning_status = rrdcalc_value2status(rc->warning->result);
2762 if(likely(rc->critical)) {
2763 if(unlikely(!expression_evaluate(rc->critical))) {
2764 // calculation failed
2766 debug(D_HEALTH, "Health alarm '%s.%s': critical expression failed with error: %s",
2767 rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
2769 if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) {
2770 rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
2771 error("Health alarm '%s.%s': critical expression failed with error: %s",
2772 rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
2776 if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))
2777 rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
2779 debug(D_HEALTH, "Health alarm '%s.%s': critical expression gave value "
2780 CALCULATED_NUMBER_FORMAT
2781 ": %s (source: %s)",
2782 rc->chart?rc->chart:"NOCHART", rc->name,
2783 rc->critical->result,
2784 buffer_tostring(rc->critical->error_msg),
2788 critical_status = rrdcalc_value2status(rc->critical->result);
// combine the two results into the alarm status:
// the warning expression yields CLEAR or WARNING
2792 int status = RRDCALC_STATUS_UNDEFINED;
2794 switch(warning_status) {
2795 case RRDCALC_STATUS_CLEAR:
2796 status = RRDCALC_STATUS_CLEAR;
2799 case RRDCALC_STATUS_RAISED:
2800 status = RRDCALC_STATUS_WARNING;
// a raised critical expression always wins; a clear one only applies
// when the warning expression left the status undefined
2807 switch(critical_status) {
2808 case RRDCALC_STATUS_CLEAR:
2809 if(status == RRDCALC_STATUS_UNDEFINED)
2810 status = RRDCALC_STATUS_CLEAR;
2813 case RRDCALC_STATUS_RAISED:
2814 status = RRDCALC_STATUS_CRITICAL;
// status change: work out the notification delay and record the event
2821 if(status != rc->status) {
// the previous delay window has passed: start again from the configured durations
2824 if(now > rc->delay_up_to_timestamp) {
2825 rc->delay_up_current = rc->delay_up_duration;
2826 rc->delay_down_current = rc->delay_down_duration;
2828 rc->delay_up_to_timestamp = 0;
// scale the current delays by the multiplier, capped at delay_max_duration
// (presumably the branch taken while still inside the delay window - confirm)
2831 rc->delay_up_current = (int)(rc->delay_up_current * rc->delay_multiplier);
2832 if(rc->delay_up_current > rc->delay_max_duration) rc->delay_up_current = rc->delay_max_duration;
2834 rc->delay_down_current = (int)(rc->delay_down_current * rc->delay_multiplier);
2835 if(rc->delay_down_current > rc->delay_max_duration) rc->delay_down_current = rc->delay_max_duration;
// a status numerically higher than the current one uses the "up" delay, otherwise the "down" delay
2838 if(status > rc->status)
2839 delay = rc->delay_up_current;
2841 delay = rc->delay_down_current;
2843 // COMMENTED: because we do need to send raising alarms
2844 // if(now + delay < rc->delay_up_to_timestamp)
2845 // delay = (int)(rc->delay_up_to_timestamp - now);
2847 rc->delay_last = delay;
2848 rc->delay_up_to_timestamp = now + delay;
2849 health_alarm_log(&localhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, rc->delay_last);
2850 rc->last_status_change = now;
2851 rc->status = status;
// schedule the next evaluation of this alarm and wake up in time for it
2854 rc->last_updated = now;
2855 rc->next_update = now + rc->update_every;
2857 if (next_run > rc->next_update)
2858 next_run = rc->next_update;
2861 rrdhost_unlock(&localhost);
2864 if (unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
2865 error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
2867 if(unlikely(netdata_exit))
2870 // execute notifications
2872 health_alarm_log_process(&localhost);
2874 if(unlikely(netdata_exit))
// sleep until the next scheduled run; when it is already due, iterate again immediately
2878 if(now < next_run) {
2879 debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs",
2880 loop, (int) (next_run - now));
2881 sleep_usec(1000000 * (unsigned long long) (next_run - now));
2884 debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
2890 info("HEALTH thread exiting");