// Size bound used for the stack buffers that hold composed variable names
// (e.g. "<chart>.<variable>") further down in this file.
3 #define RRDVAR_MAX_LENGTH 1024
// Settings and bookkeeping shared by the health alarm log functions below.
// NOTE(review): those functions also use a `log_fp` member whose declaration
// is not visible in this excerpt.
5 struct health_options {
// default program for alarm notifications (the instance below points it at alarm-notify.sh)
6 const char *health_default_exec;
// default notification recipient (the instance below uses "root")
7 const char *health_default_recipient;
// path of the alarm log file that is opened, rotated and loaded below
8 const char *log_filename;
// number of log lines counted so far; compared against the rotation threshold
9 size_t log_entries_written;
// File-scope instance of the health options, pre-filled with built-in defaults.
// PLUGINS_DIR and VARLIB_DIR are string macros defined outside this excerpt.
13 static struct health_options health = {
14 .health_default_exec = PLUGINS_DIR "/alarm-notify.sh",
15 .health_default_recipient = "root",
16 .log_filename = VARLIB_DIR "/health/alarm_log.db",
17 .log_entries_written = 0,
// Non-static global, initialised to 1.
// NOTE(review): presumably the on/off flag for health monitoring; its readers
// are outside this excerpt - confirm.
21 int health_enabled = 1;
23 // ----------------------------------------------------------------------------
24 // health alarm log load/save
25 // no need for locking - only one thread is reading / writing the alarms log
// (Re)opens the alarm log file (health.log_filename) in append mode and asks
// for line buffering on it. Failures are reported through error().
// NOTE(review): the return statements are not visible in this excerpt, so the
// meaning of the int result must be confirmed against the full file.
27 static inline int health_alarm_log_open(void) {
// NOTE(review): fclose(NULL) is undefined behaviour - confirm that the line
// preceding this one guards on health.log_fp being non-NULL.
29 fclose(health.log_fp);
31 health.log_fp = fopen(health.log_filename, "a");
// _IOLBF: flush on every newline, so each log record reaches the file promptly
34 if (setvbuf(health.log_fp, NULL, _IOLBF, 0) != 0)
35 error("Health: cannot set line buffering on health log file.");
39 error("Health: cannot open health log file '%s'. Health data will be lost in case of netdata or server crash.", health.log_filename);
// Closes the alarm log stream held in health.log_fp.
// NOTE(review): any NULL guard / pointer reset around the fclose() is not
// visible in this excerpt - confirm.
43 static inline void health_alarm_log_close(void) {
45 fclose(health.log_fp);
// Rotates the alarm log once more than `rotate_every` entries have been
// counted: the current log is hard-linked to "<log_filename>.old", removed,
// recreated empty, and then reopened for appending.
50 static inline void health_log_rotate(void) {
// cached across calls; 0 means "not read from the configuration yet"
51 static size_t rotate_every = 0;
53 if(unlikely(rotate_every == 0)) {
// config section "health", key "rotate log every lines", default 2000
54 rotate_every = (size_t)config_get_number("health", "rotate log every lines", 2000);
// enforce a floor of 100 lines
55 if(rotate_every < 100) rotate_every = 100;
58 if(unlikely(health.log_entries_written > rotate_every)) {
59 health_alarm_log_close();
61 char old_filename[FILENAME_MAX + 1];
62 snprintfz(old_filename, FILENAME_MAX, "%s.old", health.log_filename);
// in the three steps below a missing file (ENOENT) is not treated as an error
64 if(unlink(old_filename) == -1 && errno != ENOENT)
65 error("Health: cannot remove old alarms log file '%s'", old_filename);
// keep the current contents reachable under the ".old" name
67 if(link(health.log_filename, old_filename) == -1 && errno != ENOENT)
68 error("Health: cannot move file '%s' to '%s'.", health.log_filename, old_filename);
70 if(unlink(health.log_filename) == -1 && errno != ENOENT)
71 error("Health: cannot remove old alarms log file '%s'", health.log_filename);
73 // open it with truncate
74 health.log_fp = fopen(health.log_filename, "w");
77 fclose(health.log_fp);
79 error("Health: cannot truncate health log '%s'", health.log_filename);
// start counting from zero for the new file and resume appending to it
83 health.log_entries_written = 0;
84 health_alarm_log_open();
// Appends one alarm entry to the open alarm log as a single tab-separated
// record. Nothing is written when health.log_fp is NULL.
// NOTE(review): several format-string pieces and arguments are not visible in
// this excerpt; the field order must be read from the full file.
88 static inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
91 if(likely(health.log_fp)) {
92 if(unlikely(fprintf(health.log_fp
94 "\t%08x\t%08x\t%08x\t%08x\t%08x"
97 "\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s"
// record type: 'U' when the entry was already saved once, 'A' otherwise
101 , (ae->flags & HEALTH_ENTRY_FLAG_SAVED)?'U':'A'
111 , (uint32_t)ae->duration
112 , (uint32_t)ae->non_clear_duration
113 , (uint32_t)ae->flags
114 , (uint32_t)ae->exec_run_timestamp
115 , (uint32_t)ae->delay_up_to_timestamp
// NULL strings are written as empty fields
117 , (ae->name)?ae->name:""
118 , (ae->chart)?ae->chart:""
119 , (ae->family)?ae->family:""
120 , (ae->exec)?ae->exec:""
121 , (ae->recipient)?ae->recipient:""
122 , (ae->source)?ae->source:""
123 , (ae->units)?ae->units:""
124 , (ae->info)?ae->info:""
131 , (long double)ae->new_value
132 , (long double)ae->old_value
134 error("Health: failed to save alarm log entry. Health data may be lost in case of abnormal restart.");
// NOTE(review): presumably the success branch (the `else` is not visible here):
// mark the entry as saved and count the written line for log rotation.
136 ae->flags |= HEALTH_ENTRY_FLAG_SAVED;
137 health.log_entries_written++;
// Parses an alarm log stream line by line and rebuilds host->health_log.alarms
// from it. Each line is split on tabs; the first field is the record type
// ('A' or 'U') and the remaining fields populate an ALARM_ENTRY. At the end
// the host's next_log_id / next_alarm_id are set past the largest ids seen.
// NOTE(review): the return statement is not visible in this excerpt.
142 static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filename) {
// static: the maxima persist across calls to this function
143 static uint32_t max_unique_id = 0, max_alarm_id = 0;
144 ssize_t loaded = -1, updated = -1, errored = -1, duplicate = -1;
148 char *s, *buf = mallocz(65536 + 1);
149 size_t line = 0, len = 0;
150 loaded = updated = errored = duplicate = 0;
// NOTE(review): only a read lock is taken here, yet the loop below prepends
// to host->health_log.alarms and rewrites existing entries - confirm whether
// a write lock is required.
152 pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
154 while((s = fgets_trim_len(buf, 65536, fp, &len))) {
// every line read counts towards the rotation threshold
155 health.log_entries_written++;
// split the line on tabs; pointers[i] is the start of field i
158 int max_entries = 30, entries = 0;
159 char *pointers[max_entries];
161 pointers[entries++] = s++;
163 if(unlikely(*s == '\t')) {
165 pointers[entries++] = ++s;
166 if(entries >= max_entries) {
167 error("Health: line %zu of file '%s' has more than %d entries. Ignoring excessive entries.", line, filename, max_entries);
174 if(likely(*pointers[0] == 'U' || *pointers[0] == 'A')) {
175 ALARM_ENTRY *ae = NULL;
// NOTE(review): the message says "at least 26 entries", but pointers[26] is
// read further down, which needs 27 fields - confirm the (elided) check.
178 error("Health: line %zu of file '%s' should have at least 26 entries, but it has %d. Ignoring it.", line, filename, entries);
183 // check that we have valid ids
184 uint32_t unique_id = (uint32_t)strtoul(pointers[2], NULL, 16);
186 error("Health: line %zu of file '%s' states alarm entry with invalid unique id %u (%s). Ignoring it.", line, filename, unique_id, pointers[2]);
191 uint32_t alarm_id = (uint32_t)strtoul(pointers[3], NULL, 16);
193 error("Health: line %zu of file '%s' states alarm entry for invalid alarm id %u (%s). Ignoring it.", line, filename, alarm_id, pointers[3]);
// 'A' lines create a new entry; 'U' lines look up an existing one by unique id
198 if(unlikely(*pointers[0] == 'A')) {
199 // make sure it is properly numbered
200 if(unlikely(host->health_log.alarms && unique_id < host->health_log.alarms->unique_id)) {
201 error("Health: line %zu of file '%s' has alarm log entry with %u in wrong order. Ignoring it.", line, filename, unique_id);
206 ae = callocz(1, sizeof(ALARM_ENTRY));
208 else if(unlikely(*pointers[0] == 'U')) {
210 for(ae = host->health_log.alarms; ae; ae = ae->next) {
211 if(unlikely(unique_id == ae->unique_id)) {
// NOTE(review): if this loop is nested inside the 'U' branch as it appears,
// this 'A' test can never be true - confirm the nesting in the full file.
212 if(unlikely(*pointers[0] == 'A')) {
213 error("Health: line %zu of file '%s' adds duplicate alarm log entry with unique id %u. Using the later."
214 , line, filename, unique_id);
220 else if(unlikely(unique_id > ae->unique_id)) {
221 // no need to continue
222 // the linked list is sorted
228 // if not found, skip this line
230 // error("Health: line %zu of file '%s' updates alarm log entry with unique id %u, but it is not found.", line, filename, unique_id);
235 // check for a possible host mismatch
236 //if(strcmp(pointers[1], host->hostname))
237 // error("Health: line %zu of file '%s' provides an alarm for host '%s' but this is named '%s'.", line, filename, pointers[1], host->hostname);
// numeric fields 2..12 are stored as hexadecimal text
239 ae->unique_id = unique_id;
240 ae->alarm_id = alarm_id;
241 ae->alarm_event_id = (uint32_t)strtoul(pointers[4], NULL, 16);
242 ae->updated_by_id = (uint32_t)strtoul(pointers[5], NULL, 16);
243 ae->updates_id = (uint32_t)strtoul(pointers[6], NULL, 16);
245 ae->when = (uint32_t)strtoul(pointers[7], NULL, 16);
246 ae->duration = (uint32_t)strtoul(pointers[8], NULL, 16);
247 ae->non_clear_duration = (uint32_t)strtoul(pointers[9], NULL, 16);
249 ae->flags = (uint32_t)strtoul(pointers[10], NULL, 16);
// whatever was stored, an entry that came from the file counts as saved
250 ae->flags |= HEALTH_ENTRY_FLAG_SAVED;
252 ae->exec_run_timestamp = (uint32_t)strtoul(pointers[11], NULL, 16);
253 ae->delay_up_to_timestamp = (uint32_t)strtoul(pointers[12], NULL, 16);
// string fields: release any previous value (relevant when an existing entry
// is being updated) before duplicating the new one
255 if(unlikely(ae->name)) freez(ae->name);
256 ae->name = strdupz(pointers[13]);
257 ae->hash_name = simple_hash(ae->name);
259 if(unlikely(ae->chart)) freez(ae->chart);
260 ae->chart = strdupz(pointers[14]);
261 ae->hash_chart = simple_hash(ae->chart);
263 if(unlikely(ae->family)) freez(ae->family);
264 ae->family = strdupz(pointers[15]);
// for the optional strings below, an empty field is stored as NULL
266 if(unlikely(ae->exec)) freez(ae->exec);
267 ae->exec = strdupz(pointers[16]);
268 if(!*ae->exec) { freez(ae->exec); ae->exec = NULL; }
270 if(unlikely(ae->recipient)) freez(ae->recipient);
271 ae->recipient = strdupz(pointers[17]);
272 if(!*ae->recipient) { freez(ae->recipient); ae->recipient = NULL; }
274 if(unlikely(ae->source)) freez(ae->source);
275 ae->source = strdupz(pointers[18]);
276 if(!*ae->source) { freez(ae->source); ae->source = NULL; }
278 if(unlikely(ae->units)) freez(ae->units);
279 ae->units = strdupz(pointers[19]);
280 if(!*ae->units) { freez(ae->units); ae->units = NULL; }
282 if(unlikely(ae->info)) freez(ae->info);
283 ae->info = strdupz(pointers[20]);
284 if(!*ae->info) { freez(ae->info); ae->info = NULL; }
// fields 21..24 are parsed as decimal integers, 25..26 as long doubles
286 ae->exec_code = atoi(pointers[21]);
287 ae->new_status = atoi(pointers[22]);
288 ae->old_status = atoi(pointers[23]);
289 ae->delay = atoi(pointers[24]);
291 ae->new_value = strtold(pointers[25], NULL);
292 ae->old_value = strtold(pointers[26], NULL);
294 // add it to host if not already there
295 if(unlikely(*pointers[0] == 'A')) {
// new entries are prepended, so the list head is the most recently added entry
296 ae->next = host->health_log.alarms;
297 host->health_log.alarms = ae;
// track the largest ids seen, to seed the host's id counters after the loop
302 if(unlikely(ae->unique_id > max_unique_id))
303 max_unique_id = ae->unique_id;
305 if(unlikely(ae->alarm_id >= max_alarm_id))
306 max_alarm_id = ae->alarm_id;
309 error("Health: line %zu of file '%s' is invalid (unrecognized entry type '%s').", line, filename, pointers[0]);
314 pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
// when no ids were loaded, start numbering from the current unix time
318 if(!max_unique_id) max_unique_id = (uint32_t)time(NULL);
319 if(!max_alarm_id) max_alarm_id = (uint32_t)time(NULL);
321 host->health_log.next_log_id = max_unique_id + 1;
322 host->health_log.next_alarm_id = max_alarm_id + 1;
324 debug(D_HEALTH, "Health: loaded file '%s' with %zd new alarm entries, updated %zd alarms, errors %zd entries, duplicate %zd", filename, loaded, updated, errored, duplicate);
// Loads the saved alarm history for a host: first the rotated file
// "<log_filename>.old", then the current log file, and finally reopens the
// current log for appending.
328 static inline void health_alarm_log_load(RRDHOST *host) {
329 health_alarm_log_close();
331 char filename[FILENAME_MAX + 1];
332 snprintfz(filename, FILENAME_MAX, "%s.old", health.log_filename);
333 FILE *fp = fopen(filename, "r");
335 error("Health: cannot open health file: %s", filename);
337 health_alarm_log_read(host, fp, filename);
// the reader counts every line it reads; reset here so that the counter ends
// up reflecting only the lines of the current (non-rotated) file
341 health.log_entries_written = 0;
342 fp = fopen(health.log_filename, "r");
344 error("Health: cannot open health file: %s", health.log_filename);
346 health_alarm_log_read(host, fp, health.log_filename);
350 health_alarm_log_open();
354 // ----------------------------------------------------------------------------
355 // health alarm log management
// Creates a new ALARM_ENTRY for a status transition, prepends it to the host's
// alarm log, links it with the previous not-yet-updated entry of the same
// alarm, and saves the affected entries to the log file.
// NOTE(review): part of the parameter list (`when`, `source`, `units`, `info`,
// `delay`, all used below) is not visible in this excerpt.
357 static inline void health_alarm_log(RRDHOST *host,
358 uint32_t alarm_id, uint32_t alarm_event_id,
360 const char *name, const char *chart, const char *family,
361 const char *exec, const char *recipient, time_t duration,
362 calculated_number old_value, calculated_number new_value,
363 int old_status, int new_status,
369 debug(D_HEALTH, "Health adding alarm log entry with id: %u", host->health_log.next_log_id);
// the entry owns private copies of all strings it is given
371 ALARM_ENTRY *ae = callocz(1, sizeof(ALARM_ENTRY));
372 ae->name = strdupz(name);
373 ae->hash_name = simple_hash(ae->name);
376 ae->chart = strdupz(chart);
377 ae->hash_chart = simple_hash(ae->chart);
381 ae->family = strdupz(family);
// optional strings stay NULL when not provided
383 if(exec) ae->exec = strdupz(exec);
384 if(recipient) ae->recipient = strdupz(recipient);
385 if(source) ae->source = strdupz(source);
386 if(units) ae->units = strdupz(units);
387 if(info) ae->info = strdupz(info);
389 ae->unique_id = host->health_log.next_log_id++;
390 ae->alarm_id = alarm_id;
391 ae->alarm_event_id = alarm_event_id;
393 ae->old_value = old_value;
394 ae->new_value = new_value;
395 ae->old_status = old_status;
396 ae->new_status = new_status;
397 ae->duration = duration;
399 ae->delay_up_to_timestamp = when + delay;
// time spent in WARNING/CRITICAL before this transition counts as non-clear time
401 if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL)
402 ae->non_clear_duration += ae->duration;
// prepend the new entry under the write lock
405 pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
406 ae->next = host->health_log.alarms;
407 host->health_log.alarms = ae;
408 host->health_log.count++;
409 pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
411 // match previous alarms
// NOTE(review): only a read lock is held while `t` and `ae` are modified
// below - confirm that this is intended.
412 pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
414 for(t = host->health_log.alarms ; t ; t = t->next) {
415 if(t != ae && t->alarm_id == ae->alarm_id) {
416 if(!(t->flags & HEALTH_ENTRY_FLAG_UPDATED) && !t->updated_by_id) {
// cross-link the previous entry and the new one
417 t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
418 t->updated_by_id = ae->unique_id;
419 ae->updates_id = t->unique_id;
// carry the accumulated non-clear time forward when the previous entry both
// started and ended in WARNING/CRITICAL
421 if((t->new_status == RRDCALC_STATUS_WARNING || t->new_status == RRDCALC_STATUS_CRITICAL) &&
422 (t->old_status == RRDCALC_STATUS_WARNING || t->old_status == RRDCALC_STATUS_CRITICAL))
423 ae->non_clear_duration += t->non_clear_duration;
// persist the modified previous entry
425 health_alarm_log_save(host, t);
428 // no need to continue
432 pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
434 health_alarm_log_save(host, ae);
437 // ----------------------------------------------------------------------------
// Scans a variable name in place for characters other than alphanumerics,
// '.' and '_'.
// NOTE(review): the loop, the replacement applied to such characters and the
// returned int are not visible in this excerpt.
// NOTE(review): isalnum() is called with a plain char; values outside the
// unsigned char range are undefined behaviour - consider an (unsigned char) cast.
440 static inline int rrdvar_fix_name(char *variable) {
443 if (!isalnum(*variable) && *variable != '.' && *variable != '_') {
// Ordering function for RRDVAR entries: orders primarily by hash and, for
// equal hashes, by strcmp() of the names. Returns <0, 0 or >0.
454 int rrdvar_compare(void* a, void* b) {
455 if(((RRDVAR *)a)->hash < ((RRDVAR *)b)->hash) return -1;
456 else if(((RRDVAR *)a)->hash > ((RRDVAR *)b)->hash) return 1;
457 else return strcmp(((RRDVAR *)a)->name, ((RRDVAR *)b)->name);
// Inserts `rv` into the given AVL index and keeps what avl_insert_lock()
// returned in `ret`.
// NOTE(review): the condition guarding the debug message and the return
// statement are not visible here; the caller in this file treats a result
// different from `rv` as "already exists".
460 static inline RRDVAR *rrdvar_index_add(avl_tree_lock *tree, RRDVAR *rv) {
461 RRDVAR *ret = (RRDVAR *)avl_insert_lock(tree, (avl *)(rv));
463 debug(D_VARIABLES, "Request to insert RRDVAR '%s' into index failed. Already exists.", rv->name);
// Removes `rv` from the given AVL index via avl_remove_lock().
// NOTE(review): the condition guarding the error message and the return
// statement are not visible in this excerpt.
468 static inline RRDVAR *rrdvar_index_del(avl_tree_lock *tree, RRDVAR *rv) {
469 RRDVAR *ret = (RRDVAR *)avl_remove_lock(tree, (avl *)(rv));
471 error("Request to remove RRDVAR '%s' from index failed. Not Found.", rv->name);
// Looks up a variable by name in the given AVL index and returns the result of
// avl_search_lock(). A temporary RRDVAR (`tmp`, declared on a line not shown
// here) is used as the search key.
476 static inline RRDVAR *rrdvar_index_find(avl_tree_lock *tree, const char *name, uint32_t hash) {
// the cast only drops const for the key; the name is not modified in this function
478 tmp.name = (char *)name;
// a hash of 0 means "not supplied": compute it from the name
479 tmp.hash = (hash)?hash:simple_hash(tmp.name);
481 return (RRDVAR *)avl_search_lock(tree, (avl *)&tmp);
// Removes a variable from its index as part of freeing it.
// NOTE(review): the guards (e.g. for a NULL `rv` or `tree`) and the actual
// memory release are not visible in this excerpt; note that this file also
// calls rrdvar_free(NULL, NULL, rv).
484 static inline void rrdvar_free(RRDHOST *host, avl_tree_lock *tree, RRDVAR *rv) {
490 debug(D_VARIABLES, "Deleting variable '%s'", rv->name);
491 rrdvar_index_del(tree, rv);
// Creates a variable named after a sanitised copy of `name` and adds it to
// `tree`, unless a variable with that name is already indexed there.
// `scope` is only used in debug messages.
// NOTE(review): the code that fills the new RRDVAR, releases `variable` and
// returns is not visible here; per the comment at the end, the "already
// exists" case must return NULL.
498 static inline RRDVAR *rrdvar_create_and_index(const char *scope, avl_tree_lock *tree, const char *name, int type, calculated_number *value) {
// work on a private, mutable copy: rrdvar_fix_name() edits the string in place
499 char *variable = strdupz(name);
500 rrdvar_fix_name(variable);
501 uint32_t hash = simple_hash(variable);
503 RRDVAR *rv = rrdvar_index_find(tree, variable, hash);
505 debug(D_VARIABLES, "Variable '%s' not found in scope '%s'. Creating a new one.", variable, scope);
507 rv = callocz(1, sizeof(RRDVAR));
513 RRDVAR *ret = rrdvar_index_add(tree, rv);
// the index returned a different entry: the insertion did not take place
514 if(unlikely(ret != rv)) {
515 debug(D_VARIABLES, "Variable '%s' in scope '%s' already exists", variable, scope);
// rv was never indexed, so it is released without an index
516 rrdvar_free(NULL, NULL, rv);
520 debug(D_VARIABLES, "Variable '%s' created in scope '%s'", variable, scope);
523 debug(D_VARIABLES, "Variable '%s' is already found in scope '%s'.", variable, scope);
529 // it must return NULL - not the existing variable - or double-free will happen
536 // ----------------------------------------------------------------------------
// Converts the value a variable points to into a calculated_number. The
// pointer in rv->value is interpreted according to the variable's type; an
// unknown type is reported through error().
// NOTE(review): the switch header, the per-case return statements and the
// value returned for unknown types are not visible in this excerpt.
539 static calculated_number rrdvar2number(RRDVAR *rv) {
541 case RRDVAR_TYPE_CALCULATED: {
542 calculated_number *n = (calculated_number *)rv->value;
546 case RRDVAR_TYPE_TIME_T: {
547 time_t *n = (time_t *)rv->value;
551 case RRDVAR_TYPE_COLLECTED: {
552 collected_number *n = (collected_number *)rv->value;
556 case RRDVAR_TYPE_TOTAL: {
557 total_number *n = (total_number *)rv->value;
561 case RRDVAR_TYPE_INT: {
562 int *n = (int *)rv->value;
567 error("I don't know how to convert RRDVAR type %d to calculated_number", rv->type);
// Resolves a variable name for an alarm: the chart the alarm is linked to is
// searched first, then the chart's family, then the host. The value found is
// stored in *result.
// NOTE(review): the found/not-found checks and the returned int are not
// visible in this excerpt; neither is any guard for rc->rrdset being NULL.
572 int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, calculated_number *result) {
573 RRDSET *st = rc->rrdset;
// 1. chart-local variables
578 rv = rrdvar_index_find(&st->variables_root_index, variable, hash);
580 *result = rrdvar2number(rv);
// 2. family variables
584 rv = rrdvar_index_find(&st->rrdfamily->variables_root_index, variable, hash);
586 *result = rrdvar2number(rv);
// 3. host variables
590 rv = rrdvar_index_find(&st->rrdhost->variables_root_index, variable, hash);
592 *result = rrdvar2number(rv);
599 // ----------------------------------------------------------------------------
// Traversal state handed to single_variable2json(); that function reads the
// `buf` and `counter` members (the member declarations are not shown here).
602 struct variable2json_helper {
// AVL traversal callback: appends one `"name": value` JSON member for the
// variable `entry` to helper->buf. NaN and infinite values are written as null.
// NOTE(review): the update of helper->counter is not visible in this excerpt.
607 static void single_variable2json(void *entry, void *data) {
608 struct variable2json_helper *helper = (struct variable2json_helper *)data;
609 RRDVAR *rv = (RRDVAR *)entry;
610 calculated_number value = rrdvar2number(rv);
// a non-zero counter means a member was already written, so emit a leading comma
612 if(unlikely(isnan(value) || isinf(value)))
613 buffer_sprintf(helper->buf, "%s\n\t\t\"%s\": null", helper->counter?",":"", rv->name);
615 buffer_sprintf(helper->buf, "%s\n\t\t\"%s\": %0.5Lf", helper->counter?",":"", rv->name, (long double)value);
// Writes a JSON object to `buf` describing the variables visible to chart
// `st`, in three sections: "chart_variables", "family_variables" and
// "host_variables". Each section is produced by traversing the corresponding
// variables index with single_variable2json().
// NOTE(review): the helper initialiser and any reset of helper.counter between
// sections are not visible in this excerpt.
620 void health_api_v1_chart_variables2json(RRDSET *st, BUFFER *buf) {
621 struct variable2json_helper helper = {
626 buffer_sprintf(buf, "{\n\t\"chart\": \"%s.%s\",\n\t\"chart_name\": \"%s.%s\",\n\t\"chart_variables\": {", st->type, st->id, st->type, st->name);
627 avl_traverse_lock(&st->variables_root_index, single_variable2json, (void *)&helper);
628 buffer_sprintf(buf, "\n\t},\n\t\"family\": \"%s\",\n\t\"family_variables\": {", st->family);
630 avl_traverse_lock(&st->rrdfamily->variables_root_index, single_variable2json, (void *)&helper);
631 buffer_sprintf(buf, "\n\t},\n\t\"host\": \"%s\",\n\t\"host_variables\": {", st->rrdhost->hostname);
633 avl_traverse_lock(&st->rrdhost->variables_root_index, single_variable2json, (void *)&helper);
634 buffer_strcat(buf, "\n\t}\n}\n");
638 // ----------------------------------------------------------------------------
639 // RRDDIMVAR management
640 // DIMENSION VARIABLES
// Size bound of the stack buffer used to compose dimension variable keys
// in rrddimvar_create_variables().
642 #define RRDDIMVAR_ID_MAX 1024
// Releases every RRDVAR that was registered for this dimension variable (in
// the chart, family and host indexes) and frees the key strings they were
// registered under. Every released pointer is reset to NULL, so the function
// can run again on the same RRDDIMVAR.
// NOTE(review): the release of key_id / key_name is not visible in this excerpt.
644 static inline void rrddimvar_free_variables(RRDDIMVAR *rs) {
645 RRDDIM *rd = rs->rrddim;
646 RRDSET *st = rd->rrdset;
648 // CHART VARIABLES FOR THIS DIMENSION
650 rrdvar_free(st->rrdhost, &st->variables_root_index, rs->var_local_id);
651 rs->var_local_id = NULL;
653 rrdvar_free(st->rrdhost, &st->variables_root_index, rs->var_local_name);
654 rs->var_local_name = NULL;
656 // FAMILY VARIABLES FOR THIS DIMENSION
658 rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_id);
659 rs->var_family_id = NULL;
661 rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_name);
662 rs->var_family_name = NULL;
664 rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_contextid);
665 rs->var_family_contextid = NULL;
667 rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_contextname);
668 rs->var_family_contextname = NULL;
670 // HOST VARIABLES FOR THIS DIMENSION
672 rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartidid);
673 rs->var_host_chartidid = NULL;
675 rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartidname);
676 rs->var_host_chartidname = NULL;
678 rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartnameid);
679 rs->var_host_chartnameid = NULL;
681 rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartnamename);
682 rs->var_host_chartnamename = NULL;
// the key strings are only released after the variables registered under them
692 freez(rs->key_fullidid);
693 rs->key_fullidid = NULL;
695 freez(rs->key_fullidname);
696 rs->key_fullidname = NULL;
698 freez(rs->key_contextid);
699 rs->key_contextid = NULL;
701 freez(rs->key_contextname);
702 rs->key_contextname = NULL;
704 freez(rs->key_fullnameid);
705 rs->key_fullnameid = NULL;
707 freez(rs->key_fullnamename);
708 rs->key_fullnamename = NULL;
// (Re)creates all variables of a dimension variable: any existing ones are
// released first, the lookup keys are rebuilt from the current dimension and
// chart ids/names, and one RRDVAR per key is registered in the chart, family
// and host indexes. All of them share rs->type and point to rs->value.
711 static inline void rrddimvar_create_variables(RRDDIMVAR *rs) {
712 rrddimvar_free_variables(rs);
714 RRDDIM *rd = rs->rrddim;
715 RRDSET *st = rd->rrdset;
717 char buffer[RRDDIMVAR_ID_MAX + 1];
// key_id / key_name: <prefix><dimension id or name><suffix>
721 snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->id, rs->suffix);
722 rs->key_id = strdupz(buffer);
724 snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
725 rs->key_name = strdupz(buffer);
// the remaining keys qualify key_id / key_name with the chart id, the chart
// context or the chart name
727 snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->key_id);
728 rs->key_fullidid = strdupz(buffer);
730 snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->key_name);
731 rs->key_fullidname = strdupz(buffer);
733 snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->context, rs->key_id);
734 rs->key_contextid = strdupz(buffer);
736 snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->context, rs->key_name);
737 rs->key_contextname = strdupz(buffer);
739 snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->key_id);
740 rs->key_fullnameid = strdupz(buffer);
742 snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->key_name);
743 rs->key_fullnamename = strdupz(buffer);
745 // CHART VARIABLES FOR THIS DIMENSION
746 // -----------------------------------
748 // dimensions are available as:
752 rs->var_local_id = rrdvar_create_and_index("local", &st->variables_root_index, rs->key_id, rs->type, rs->value);
753 rs->var_local_name = rrdvar_create_and_index("local", &st->variables_root_index, rs->key_name, rs->type, rs->value);
755 // FAMILY VARIABLES FOR THIS DIMENSION
756 // -----------------------------------
758 // dimensions are available as:
759 // - $id (only the first, when multiple overlap)
760 // - $name (only the first, when multiple overlap)
761 // - $chart-context.id
762 // - $chart-context.name
764 rs->var_family_id = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_id, rs->type, rs->value);
765 rs->var_family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_name, rs->type, rs->value);
766 rs->var_family_contextid = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_contextid, rs->type, rs->value);
767 rs->var_family_contextname = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_contextname, rs->type, rs->value);
769 // HOST VARIABLES FOR THIS DIMENSION
770 // -----------------------------------
772 // dimensions are available as:
776 // - $chart-name.name
778 rs->var_host_chartidid = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullidid, rs->type, rs->value);
779 rs->var_host_chartidname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullidname, rs->type, rs->value);
780 rs->var_host_chartnameid = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullnameid, rs->type, rs->value);
781 rs->var_host_chartnamename = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullnamename, rs->type, rs->value);
// Allocates a dimension variable for `rd` and registers its variables.
// `prefix` and `suffix` may be NULL and are then treated as empty strings;
// both are copied.
// NOTE(review): the assignments of type/value/rrddim, the update of
// rd->variables and the return statement are not visible in this excerpt.
784 RRDDIMVAR *rrddimvar_create(RRDDIM *rd, int type, const char *prefix, const char *suffix, void *value, uint32_t options) {
785 RRDSET *st = rd->rrdset;
787 debug(D_VARIABLES, "RRDDIMSET create for chart id '%s' name '%s', dimension id '%s', name '%s%s%s'", st->id, st->name, rd->id, (prefix)?prefix:"", rd->name, (suffix)?suffix:"");
789 if(!prefix) prefix = "";
790 if(!suffix) suffix = "";
792 RRDDIMVAR *rs = (RRDDIMVAR *)callocz(1, sizeof(RRDDIMVAR));
794 rs->prefix = strdupz(prefix);
795 rs->suffix = strdupz(suffix);
799 rs->options = options;
// the new entry goes in front of the dimension's existing variables
802 rs->next = rd->variables;
805 rrddimvar_create_variables(rs);
// Rebuilds the variables of every RRDDIMVAR attached to dimension `rd`, so
// that their keys follow the dimension's current id and name.
// NOTE(review): the loop header that walks rd->variables is not visible in
// this excerpt.
810 void rrddimvar_rename_all(RRDDIM *rd) {
811 RRDSET *st = rd->rrdset;
812 debug(D_VARIABLES, "RRDDIMSET rename for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
814 RRDDIMVAR *rs, *next = rd->variables;
817 rrddimvar_create_variables(rs);
// Releases the variables of `rs` and unlinks it from its dimension's
// singly-linked list of variables.
// NOTE(review): the release of rs->prefix, rs->suffix and of `rs` itself is
// not visible in this excerpt.
821 void rrddimvar_free(RRDDIMVAR *rs) {
822 RRDDIM *rd = rs->rrddim;
823 RRDSET *st = rd->rrdset;
824 debug(D_VARIABLES, "RRDDIMSET free for chart id '%s' name '%s', dimension id '%s', name '%s', prefix='%s', suffix='%s'", st->id, st->name, rd->id, rd->name, rs->prefix, rs->suffix);
826 rrddimvar_free_variables(rs);
828 if(rd->variables == rs) {
829 debug(D_VARIABLES, "RRDDIMSET removing first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
830 rd->variables = rs->next;
833 debug(D_VARIABLES, "RRDDIMSET removing non-first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
// find the predecessor of rs; t ends up NULL when rs is not in the list
835 for (t = rd->variables; t && t->next != rs; t = t->next) ;
836 if(!t) error("RRDDIMVAR '%s' not found in dimension '%s/%s' variables linked list", rs->key_name, st->id, rd->id);
837 else t->next = rs->next;
845 // ----------------------------------------------------------------------------
846 // RRDSETVAR management
// Releases every RRDVAR registered for this chart variable (in the chart,
// family and host indexes) and frees the composed key strings.
849 static inline void rrdsetvar_free_variables(RRDSETVAR *rs) {
850 RRDSET *st = rs->rrdset;
854 rrdvar_free(st->rrdhost, &st->variables_root_index, rs->var_local);
855 rs->var_local = NULL;
859 rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family);
860 rs->var_family = NULL;
// NOTE(review): unlike its siblings, no `rs->var_host = NULL;` is visible
// after this call in this excerpt - confirm that it exists in the full file.
862 rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host);
867 rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_name);
868 rs->var_family_name = NULL;
870 rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_name);
871 rs->var_host_name = NULL;
875 freez(rs->key_fullid);
876 rs->key_fullid = NULL;
878 freez(rs->key_fullname);
879 rs->key_fullname = NULL;
// (Re)creates all variables of a chart variable: existing ones are released
// first, the keys "<chart id>.<variable>" and "<chart name>.<variable>" are
// rebuilt, and RRDVARs are registered in the chart index (plain variable
// name) and in the family and host indexes (both composed keys).
882 static inline void rrdsetvar_create_variables(RRDSETVAR *rs) {
883 rrdsetvar_free_variables(rs);
885 RRDSET *st = rs->rrdset;
889 char buffer[RRDVAR_MAX_LENGTH + 1];
890 snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->id, rs->variable);
891 rs->key_fullid = strdupz(buffer);
893 snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rs->variable);
894 rs->key_fullname = strdupz(buffer);
898 rs->var_local = rrdvar_create_and_index("local", &st->variables_root_index, rs->variable, rs->type, rs->value);
902 rs->var_family = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_fullid, rs->type, rs->value);
903 rs->var_family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_fullname, rs->type, rs->value);
907 rs->var_host = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullid, rs->type, rs->value);
908 rs->var_host_name = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullname, rs->type, rs->value);
// Allocates a chart variable named `variable` (the name is copied) for chart
// `st` and registers its variables.
// NOTE(review): the assignments of type/value/rrdset, the update of
// st->variables and the return statement are not visible in this excerpt.
912 RRDSETVAR *rrdsetvar_create(RRDSET *st, const char *variable, int type, void *value, uint32_t options) {
913 debug(D_VARIABLES, "RRDVARSET create for chart id '%s' name '%s' with variable name '%s'", st->id, st->name, variable);
914 RRDSETVAR *rs = (RRDSETVAR *)callocz(1, sizeof(RRDSETVAR));
916 rs->variable = strdupz(variable);
919 rs->options = options;
// the new entry goes in front of the chart's existing variables
922 rs->next = st->variables;
925 rrdsetvar_create_variables(rs);
// Rebuilds the variables of every RRDSETVAR of chart `st`, then calls
// rrdsetcalc_link_matching(st) to link alarms that match the chart.
// NOTE(review): the loop header that walks st->variables is not visible in
// this excerpt.
930 void rrdsetvar_rename_all(RRDSET *st) {
931 debug(D_VARIABLES, "RRDSETVAR rename for chart id '%s' name '%s'", st->id, st->name);
933 RRDSETVAR *rs, *next = st->variables;
936 rrdsetvar_create_variables(rs);
939 rrdsetcalc_link_matching(st);
// Unlinks `rs` from its chart's singly-linked list of variables and releases
// the variables registered for it.
// NOTE(review): the release of rs->variable and of `rs` itself is not visible
// in this excerpt.
942 void rrdsetvar_free(RRDSETVAR *rs) {
943 RRDSET *st = rs->rrdset;
944 debug(D_VARIABLES, "RRDSETVAR free for chart id '%s' name '%s', variable '%s'", st->id, st->name, rs->variable);
946 if(st->variables == rs) {
947 st->variables = rs->next;
// find the predecessor of rs; t ends up NULL when rs is not in the list
951 for (t = st->variables; t && t->next != rs; t = t->next);
952 if(!t) error("RRDSETVAR '%s' not found in chart '%s' variables linked list", rs->key_fullname, st->id);
953 else t->next = rs->next;
956 rrdsetvar_free_variables(rs);
962 // ----------------------------------------------------------------------------
963 // RRDCALC management
// Maps an RRDCALC_STATUS_* value to a constant string; an unknown status is
// reported through error().
// NOTE(review): apart from "UNINITIALIZED", the returned strings (and the
// value returned for unknown statuses) are not visible in this excerpt.
965 static inline const char *rrdcalc_status2string(int status) {
967 case RRDCALC_STATUS_REMOVED:
970 case RRDCALC_STATUS_UNDEFINED:
973 case RRDCALC_STATUS_UNINITIALIZED:
974 return "UNINITIALIZED";
976 case RRDCALC_STATUS_CLEAR:
979 case RRDCALC_STATUS_RAISED:
982 case RRDCALC_STATUS_WARNING:
985 case RRDCALC_STATUS_CRITICAL:
989 error("Unknown alarm status %d", status);
// Links alarm `rc` to chart `st`: puts it at the head of the chart's alarm
// list, raises the alarm's update frequency to at least the chart's, seeds the
// chart's green/red thresholds when the chart has none, registers the alarm
// value as a variable at chart, family and host level, and logs the transition
// to UNINITIALIZED.
// NOTE(review): the assignments `rc->rrdset = st` and `st->alarms = rc` are
// not visible in this excerpt.
994 static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
995 debug(D_HEALTH, "Health linking alarm '%s.%s' to chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, st->rrdhost->hostname);
997 rc->last_status_change = time(NULL);
1000 rc->rrdset_next = st->alarms;
1001 rc->rrdset_prev = NULL;
1004 rc->rrdset_next->rrdset_prev = rc;
// an alarm is not evaluated more frequently than its chart is updated
1008 if(rc->update_every < rc->rrdset->update_every) {
1009 error("Health alarm '%s.%s' has update every %d, less than chart update every %d. Setting alarm update frequency to %d.", rc->rrdset->id, rc->name, rc->update_every, rc->rrdset->update_every, rc->rrdset->update_every);
1010 rc->update_every = rc->rrdset->update_every;
// only set the chart threshold when the alarm has one and the chart has none
1013 if(!isnan(rc->green) && isnan(st->green)) {
1014 debug(D_HEALTH, "Health alarm '%s.%s' green threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->green, rc->green);
1015 st->green = rc->green;
1018 if(!isnan(rc->red) && isnan(st->red)) {
1019 debug(D_HEALTH, "Health alarm '%s.%s' red threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->red, rc->red);
// NOTE(review): the matching `st->red = rc->red;` is not visible in this
// excerpt - confirm that it exists in the full file.
// expose the alarm value as a variable named after the alarm
1023 rc->local = rrdvar_create_and_index("local", &st->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
1024 rc->family = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
// at host level the variable is qualified with the chart id and the chart name
1026 char fullname[RRDVAR_MAX_LENGTH + 1];
1027 snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->id, rc->name);
1028 rc->hostid = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
1030 snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rc->name);
1031 rc->hostname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
// an alarm without units inherits a copy of the chart's units
1033 if(!rc->units) rc->units = strdupz(st->units);
// log the transition from the current status to UNINITIALIZED
1036 time_t now = time(NULL);
1037 health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_UNINITIALIZED, rc->source, rc->units, rc->info, 0);
// Tests whether alarm `rc` targets chart `st`: rc->chart must equal either the
// chart id or the chart name. The hashes are compared first so that strcmp()
// only runs on likely matches.
// NOTE(review): other code in this file treats rc->chart as possibly NULL
// (it prints "NOCHART"); strcmp() on NULL is undefined - confirm that callers
// never reach this with a NULL chart and a matching hash.
// NOTE(review): the return statements are not visible in this excerpt.
1041 static inline int rrdcalc_is_matching_this_rrdset(RRDCALC *rc, RRDSET *st) {
1042 if( (rc->hash_chart == st->hash && !strcmp(rc->chart, st->id)) ||
1043 (rc->hash_chart == st->hash_name && !strcmp(rc->chart, st->name)))
// Walks all alarms of the chart's host and links to `st` those that match it.
// NOTE(review): the statement executed for alarms that already have an rrdset
// is not visible in this excerpt (presumably they are skipped).
1049 // this has to be called while the RRDHOST is locked
1050 inline void rrdsetcalc_link_matching(RRDSET *st) {
1051 // debug(D_HEALTH, "find matching alarms for chart '%s'", st->id);
1054 for(rc = st->rrdhost->alarms; rc ; rc = rc->next) {
1055 if(unlikely(rc->rrdset))
1058 if(unlikely(rrdcalc_is_matching_this_rrdset(rc, st)))
1059 rrdsetcalc_link(st, rc);
// Detaches alarm `rc` from the chart it is linked to: logs the transition to
// REMOVED, removes the alarm from the chart's doubly-linked alarm list and
// releases the variables that exposed the alarm value.
// NOTE(review): the guards around the list updates and the resets of
// rc->local / rc->family / rc->hostid / rc->rrdset are not visible in this
// excerpt.
1063 // this has to be called while the RRDHOST is locked
1064 inline void rrdsetcalc_unlink(RRDCALC *rc) {
1065 RRDSET *st = rc->rrdset;
1068 debug(D_HEALTH, "Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
1069 error("Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
// log the transition from the current status to REMOVED
1074 time_t now = time(NULL);
1075 health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0);
1078 RRDHOST *host = st->rrdhost;
1080 debug(D_HEALTH, "Health unlinking alarm '%s.%s' from chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, host->hostname);
// splice rc out of the chart's doubly-linked alarm list
1084 rc->rrdset_prev->rrdset_next = rc->rrdset_next;
1087 rc->rrdset_next->rrdset_prev = rc->rrdset_prev;
1089 if(st->alarms == rc)
1090 st->alarms = rc->rrdset_next;
1092 rc->rrdset_prev = rc->rrdset_next = NULL;
// release the variables created when the alarm was linked
1094 rrdvar_free(st->rrdhost, &st->variables_root_index, rc->local);
1097 rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rc->family);
1100 rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostid);
1103 rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostname);
1104 rc->hostname = NULL;
1108 // RRDCALC will remain in RRDHOST
1109 // so that if the matching chart is found in the future
1110 // it will be applied automatically
// Searches the alarms linked to chart `st` for one named `name`; the hash is
// compared before the string.
// NOTE(review): the return statements are not visible in this excerpt
// (presumably the matching RRDCALC, or NULL when there is none).
1113 RRDCALC *rrdcalc_find(RRDSET *st, const char *name) {
1115 uint32_t hash = simple_hash(name);
1117 for( rc = st->alarms; rc ; rc = rc->rrdset_next ) {
1118 if(unlikely(rc->hash == hash && !strcmp(rc->name, name)))
// Checks whether the host already has an alarm with the given chart and name.
// A hash passed as 0 is computed here. A NULL `chart` is reported as an error.
// NOTE(review): the return statements are not visible in this excerpt.
1125 static inline int rrdcalc_exists(RRDHOST *host, const char *chart, const char *name, uint32_t hash_chart, uint32_t hash_name) {
1128 if(unlikely(!chart)) {
1129 error("attempt to find RRDCALC '%s' without giving a chart name", name);
1133 if(unlikely(!hash_chart)) hash_chart = simple_hash(chart);
1134 if(unlikely(!hash_name)) hash_name = simple_hash(name);
1136 // make sure it does not already exist
1137 for(rc = host->alarms; rc ; rc = rc->next) {
// alarms without a chart are never considered a match
1138 if (unlikely(rc->chart && rc->hash == hash_name && rc->hash_chart == hash_chart && !strcmp(name, rc->name) && !strcmp(chart, rc->chart))) {
1139 debug(D_HEALTH, "Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
1140 error("Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
// Return the id to use for the alarm `name` on `chart`.
// When the host's in-memory alarm log already holds an entry with the same
// chart and name, that entry's alarm_id is returned and, if next_event_id is
// not NULL, *next_event_id is set to the entry's event id plus one.
// Otherwise the id comes from host->health_log.next_alarm_id, which is
// post-incremented.
// NOTE(review): short lines are missing from this listing; the lookup appears
// to sit inside a condition that is not visible here.
1148 static inline uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const char *name, uint32_t *next_event_id) {
1150 uint32_t hash_chart = simple_hash(chart);
1151 uint32_t hash_name = simple_hash(name);
1153 // re-use old IDs, by looking them up in the alarm log
1155 for(ae = host->health_log.alarms; ae ;ae = ae->next) {
// hashes are compared first, so strcmp() only runs on likely matches
1156 if(unlikely(ae->hash_name == hash_name && ae->hash_chart == hash_chart && !strcmp(name, ae->name) && !strcmp(chart, ae->chart))) {
1157 if(next_event_id) *next_event_id = ae->alarm_event_id + 1;
1158 return ae->alarm_id;
// no reusable id was returned above: hand out the next unused one
1163 return host->health_log.next_alarm_id++;
// Second stage of alarm creation, shared by alarms created from templates and
// alarms read from configuration: point the alarm's expressions at the
// alarm's own fields, append the alarm to the host's list and link it to a
// matching chart of the host.
// NOTE(review): short lines (conditions, braces, assignments) are missing from
// this listing; comments describe only the visible statements.
1166 static inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) {
1167 rrdhost_check_rdlock(host);
// give the calculation expression pointers to the alarm's status, value and
// db_after/db_before fields, plus a back-reference to the alarm itself
1169 if(rc->calculation) {
1170 rc->calculation->status = &rc->status;
1171 rc->calculation->this = &rc->value;
1172 rc->calculation->after = &rc->db_after;
1173 rc->calculation->before = &rc->db_before;
1174 rc->calculation->rrdcalc = rc;
// the same wiring for the warning expression
1178 rc->warning->status = &rc->status;
1179 rc->warning->this = &rc->value;
1180 rc->warning->after = &rc->db_after;
1181 rc->warning->before = &rc->db_before;
1182 rc->warning->rrdcalc = rc;
// the same wiring for the critical expression
1186 rc->critical->status = &rc->status;
1187 rc->critical->this = &rc->value;
1188 rc->critical->after = &rc->db_after;
1189 rc->critical->before = &rc->db_before;
1190 rc->critical->rrdcalc = rc;
1193 // link it to the host
1194 if(likely(host->alarms)) {
// walk to the last alarm of the host's list (the loop body is empty)
1197 for(t = host->alarms; t && t->next ; t = t->next) ;
1204 // link it to its chart
1206 for(st = host->rrdset_root; st ; st = st->next) {
1207 if(rrdcalc_is_matching_this_rrdset(rc, st)) {
1208 rrdsetcalc_link(st, rc);
// Create a runtime alarm for `chart` from the template `rt`.
// Scalar settings are copied from the template, strings are duplicated with
// strdupz(), and the expressions are parsed again from the template's source
// text so the alarm owns its own expression objects. The new alarm is then
// passed to rrdcalc_create_part2().
// NOTE(review): short lines (returns, some assignments and conditions) are
// missing from this listing; comments describe only the visible statements.
1214 static inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart) {
1216 debug(D_HEALTH, "Health creating dynamic alarm (from template) '%s.%s'", chart, rt->name);
// hashes are passed as 0, so rrdcalc_exists() computes them itself
1218 if(rrdcalc_exists(host, chart, rt->name, 0, 0))
1221 RRDCALC *rc = callocz(1, sizeof(RRDCALC));
// event ids start at 1; rrdcalc_get_unique_id() may overwrite this through
// the pointer it is given
1222 rc->next_event_id = 1;
1223 rc->id = rrdcalc_get_unique_id(host, chart, rt->name, &rc->next_event_id);
1224 rc->name = strdupz(rt->name);
1225 rc->hash = simple_hash(rc->name);
1226 rc->chart = strdupz(chart);
1227 rc->hash_chart = simple_hash(rc->chart);
1229 if(rt->dimensions) rc->dimensions = strdupz(rt->dimensions);
1231 rc->green = rt->green;
1234 rc->old_value = NAN;
1236 rc->delay_up_duration = rt->delay_up_duration;
1237 rc->delay_down_duration = rt->delay_down_duration;
1238 rc->delay_max_duration = rt->delay_max_duration;
1239 rc->delay_multiplier = rt->delay_multiplier;
1241 rc->group = rt->group;
1242 rc->after = rt->after;
1243 rc->before = rt->before;
1244 rc->update_every = rt->update_every;
1245 rc->options = rt->options;
// optional strings are copied only when the template has them
1247 if(rt->exec) rc->exec = strdupz(rt->exec);
1248 if(rt->recipient) rc->recipient = strdupz(rt->recipient);
1249 if(rt->source) rc->source = strdupz(rt->source);
1250 if(rt->units) rc->units = strdupz(rt->units);
1251 if(rt->info) rc->info = strdupz(rt->info);
// parse the template's expressions again, from their source text; a parse
// failure is only logged
1253 if(rt->calculation) {
1254 rc->calculation = expression_parse(rt->calculation->source, NULL, NULL);
1255 if(!rc->calculation)
1256 error("Health alarm '%s.%s': failed to parse calculation expression '%s'", chart, rt->name, rt->calculation->source);
1259 rc->warning = expression_parse(rt->warning->source, NULL, NULL);
1261 error("Health alarm '%s.%s': failed to re-parse warning expression '%s'", chart, rt->name, rt->warning->source);
1264 rc->critical = expression_parse(rt->critical->source, NULL, NULL);
1266 error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", chart, rt->name, rt->critical->source);
1269 debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1270 (rc->chart)?rc->chart:"NOCHART",
1272 (rc->exec)?rc->exec:"DEFAULT",
1273 (rc->recipient)?rc->recipient:"DEFAULT",
1280 (rc->dimensions)?rc->dimensions:"NONE",
1282 (rc->calculation)?rc->calculation->parsed_as:"NONE",
1283 (rc->warning)?rc->warning->parsed_as:"NONE",
1284 (rc->critical)?rc->critical->parsed_as:"NONE",
1286 rc->delay_up_duration,
1287 rc->delay_down_duration,
1288 rc->delay_max_duration,
1289 rc->delay_multiplier
1292 rrdcalc_create_part2(host, rc);
// Detach alarm `rc` from its chart (if linked) and from the list of alarms of
// `host`, then release its expressions and strings.
// NOTE(review): short lines are missing from this listing (including some of
// the freez() calls), so only the visible statements are described.
1296 void rrdcalc_free(RRDHOST *host, RRDCALC *rc) {
1299 debug(D_HEALTH, "Health removing alarm '%s.%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1301 // unlink it from RRDSET
1302 if(rc->rrdset) rrdsetcalc_unlink(rc);
1304 // unlink it from RRDHOST
1305 if(unlikely(rc == host->alarms))
1306 host->alarms = rc->next;
1308 else if(likely(host->alarms)) {
// find the alarm that precedes rc in the host's singly-linked list
// (the loop body is empty; `last` trails `t` by one node)
1309 RRDCALC *t, *last = host->alarms;
1310 for(t = last->next; t && t != rc; last = t, t = t->next) ;
1311 if(last->next == rc)
1312 last->next = rc->next;
1314 error("Cannot unlink alarm '%s.%s' from host '%s': not found", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
// NOTE(review): the message below repeats the word "unlink"; presumably
// "Cannot unlink alarm" was intended
1317 error("Cannot unlink unlink '%s.%s' from host '%s': This host does not have any calculations", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1319 expression_free(rc->calculation);
1320 expression_free(rc->warning);
1321 expression_free(rc->critical);
1326 freez(rc->dimensions);
1328 freez(rc->recipient);
1335 // ----------------------------------------------------------------------------
1336 // RRDCALCTEMPLATE management
// For chart `st`, go through the templates of its host and create an alarm
// (via rrdcalc_create) from each template whose context equals the chart's.
// NOTE(review): short lines (conditions, braces) are missing from this listing.
1338 void rrdcalctemplate_link_matching(RRDSET *st) {
1339 RRDCALCTEMPLATE *rt;
1341 for(rt = st->rrdhost->templates; rt ; rt = rt->next) {
// the context hash is compared first, so strcmp() only runs on likely matches
1342 if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context)) {
1343 RRDCALC *rc = rrdcalc_create(st->rrdhost, rt, st->id);
1345 error("Health tried to create alarm from template '%s', but it failed", rt->name);
// with internal checks compiled in, also verify the new alarm was linked to
// this very chart
1347 #ifdef NETDATA_INTERNAL_CHECKS
1348 else if(rc->rrdset != st)
1349 error("Health alarm '%s.%s' should be linked to chart '%s', but it is not", rc->chart?rc->chart:"NOCHART", rc->name, st->id);
// Remove template `rt` from the list of templates of `host` (when it is found
// there) and release its expressions and strings.
// NOTE(review): short lines are missing from this listing (including some of
// the freez() calls), so only the visible statements are described.
1355 static inline void rrdcalctemplate_free(RRDHOST *host, RRDCALCTEMPLATE *rt) {
1356 debug(D_HEALTH, "Health removing template '%s' of host '%s'", rt->name, host->hostname);
1358 if(host->templates) {
// rt is the head of the list
1359 if(host->templates == rt) {
1360 host->templates = rt->next;
// otherwise find the template that precedes rt (the loop body is empty;
// `last` trails `t` by one node)
1363 RRDCALCTEMPLATE *t, *last = host->templates;
1364 for (t = last->next; t && t != rt; last = t, t = t->next ) ;
1365 if(last && last->next == rt) {
1366 last->next = rt->next;
1370 error("Cannot find RRDCALCTEMPLATE '%s' linked in host '%s'", rt->name, host->hostname);
1374 expression_free(rt->calculation);
1375 expression_free(rt->warning);
1376 expression_free(rt->critical);
1380 freez(rt->recipient);
1385 freez(rt->dimensions);
1389 // ----------------------------------------------------------------------------
1390 // load health configuration
// size used for the line buffer of the configuration reader
1392 #define HEALTH_CONF_MAX_LINE 4096
// keywords of the health configuration ("key: value" lines); the first two
// start a new alarm or template definition, the rest set its attributes
1394 #define HEALTH_ALARM_KEY "alarm"
1395 #define HEALTH_TEMPLATE_KEY "template"
1396 #define HEALTH_ON_KEY "on"
1397 #define HEALTH_LOOKUP_KEY "lookup"
1398 #define HEALTH_CALC_KEY "calc"
1399 #define HEALTH_EVERY_KEY "every"
1400 #define HEALTH_GREEN_KEY "green"
1401 #define HEALTH_RED_KEY "red"
1402 #define HEALTH_WARN_KEY "warn"
1403 #define HEALTH_CRIT_KEY "crit"
1404 #define HEALTH_EXEC_KEY "exec"
1405 #define HEALTH_RECIPIENT_KEY "to"
1406 #define HEALTH_UNITS_KEY "units"
1407 #define HEALTH_INFO_KEY "info"
1408 #define HEALTH_DELAY_KEY "delay"
1410 static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
1412 error("Health configuration for alarm '%s' does not have a chart", rc->name);
1416 if(!rc->update_every) {
1417 error("Health configuration for alarm '%s.%s' has no frequency (parameter 'every'). Ignoring it.", rc->chart?rc->chart:"NOCHART", rc->name);
1421 if(!RRDCALC_HAS_DB_LOOKUP(rc) && !rc->warning && !rc->critical) {
1422 error("Health configuration for alarm '%s.%s' is useless (no calculation, no warning and no critical evaluation)", rc->chart?rc->chart:"NOCHART", rc->name);
1426 if (rrdcalc_exists(host, rc->chart, rc->name, rc->hash_chart, rc->hash))
1429 rc->id = rrdcalc_get_unique_id(&localhost, rc->chart, rc->name, &rc->next_event_id);
1431 debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1432 rc->chart?rc->chart:"NOCHART",
1435 (rc->exec)?rc->exec:"DEFAULT",
1436 (rc->recipient)?rc->recipient:"DEFAULT",
1443 (rc->dimensions)?rc->dimensions:"NONE",
1445 (rc->calculation)?rc->calculation->parsed_as:"NONE",
1446 (rc->warning)?rc->warning->parsed_as:"NONE",
1447 (rc->critical)?rc->critical->parsed_as:"NONE",
1449 rc->delay_up_duration,
1450 rc->delay_down_duration,
1451 rc->delay_max_duration,
1452 rc->delay_multiplier
1455 rrdcalc_create_part2(host, rc);
// Validate a template that was read from the configuration and add it to the
// list of templates of `host`. Errors are logged when the template has no
// context, no update frequency, nothing to evaluate, or when a template with
// the same name already exists on the host.
// NOTE(review): short lines (returns, braces, part of the list insertion) are
// missing from this listing, so return values are not visible here.
1459 static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCALCTEMPLATE *rt) {
1460 if(unlikely(!rt->context)) {
1461 error("Health configuration for template '%s' does not have a context", rt->name);
1465 if(unlikely(!rt->update_every)) {
1466 error("Health configuration for template '%s' has no frequency (parameter 'every'). Ignoring it.", rt->name);
1470 if(unlikely(!RRDCALCTEMPLATE_HAS_CALCULATION(rt) && !rt->warning && !rt->critical)) {
1471 error("Health configuration for template '%s' is useless (no calculation, no warning and no critical evaluation)", rt->name);
// look for a template with the same name; `last` trails `t`, so after a full
// pass it points at the final template of the list (or stays NULL when the
// list is empty)
1475 RRDCALCTEMPLATE *t, *last = NULL;
1476 for (t = host->templates; t ; last = t, t = t->next) {
1477 if(unlikely(t->hash_name == rt->hash_name && !strcmp(t->name, rt->name))) {
1478 error("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname);
1483 debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1485 (rt->context)?rt->context:"NONE",
1486 (rt->exec)?rt->exec:"DEFAULT",
1487 (rt->recipient)?rt->recipient:"DEFAULT",
1494 (rt->dimensions)?rt->dimensions:"NONE",
1496 (rt->calculation)?rt->calculation->parsed_as:"NONE",
1497 (rt->warning)?rt->warning->parsed_as:"NONE",
1498 (rt->critical)?rt->critical->parsed_as:"NONE",
1500 rt->delay_up_duration,
1501 rt->delay_down_duration,
1502 rt->delay_max_duration,
1503 rt->delay_multiplier
// make rt the new head of the host's template list
// NOTE(review): presumably the branch taken when the list was empty - confirm
1510 rt->next = host->templates;
1511 host->templates = rt;
// Parse a duration such as "10m", "-2h" or "15" into seconds.
//
// The text must start with a digit or a sign. The number may be followed by
// one unit character: Y (365 days), M (30 days), w (weeks), d (days),
// h (hours), m (minutes) or s (seconds). No unit, or any other character,
// means seconds.
//
// Returns 1 and stores the number of seconds in *result on success.
// Returns 0 and stores 0 in *result when the text is not a usable number.
//
// NOTE: a value whose product does not fit in an int is not detected here.
static inline int health_parse_duration(char *string, int *result) {
    // make sure it is a number
    // (the cast keeps isdigit() well defined when char is signed and the
    // byte is >= 0x80)
    if(!*string || !(isdigit((unsigned char)*string) || *string == '+' || *string == '-')) {
        *result = 0;
        return 0;
    }

    char *e = NULL;
    // strtold() returns long double, so it is kept as such
    long double n = strtold(string, &e);

    // reject text where no number was converted (e.g. a lone "+") and
    // values that cannot be converted to int at all (NaN, infinity)
    if(e == string || isnan(n) || isinf(n)) {
        *result = 0;
        return 0;
    }

    if(e && *e) {
        switch (*e) {
            case 'Y':
                *result = (int) (n * 86400 * 365);
                break;
            case 'M':
                *result = (int) (n * 86400 * 30);
                break;
            case 'w':
                *result = (int) (n * 86400 * 7);
                break;
            case 'd':
                *result = (int) (n * 86400);
                break;
            case 'h':
                *result = (int) (n * 3600);
                break;
            case 'm':
                *result = (int) (n * 60);
                break;

            default:
            case 's':
                *result = (int) (n);
                break;
        }
    }
    else
        *result = (int)(n);

    return 1;
}
// Parse the value of a "delay" configuration line: a list of
// whitespace-separated "keyword value" pairs, where the keyword is one of
// up, down, multiplier or max (case-insensitive). Durations are parsed with
// health_parse_duration(); the multiplier is parsed with strtof() and must be
// a finite number greater than zero. `string` is modified in place (NUL
// characters are written over the whitespace). `line`, `path` and `file` are
// only used in error messages.
// NOTE(review): short lines (some declarations, braces, returns) are missing
// from this listing, so the return value is not visible here.
1559 static inline int health_parse_delay(
1560 size_t line, const char *path, const char *file, char *string,
1561 int *delay_up_duration,
1562 int *delay_down_duration,
1563 int *delay_max_duration,
1564 float *delay_multiplier) {
1567 char given_down = 0;
1569 char given_multiplier = 0;
// skip over one word, then overwrite the whitespace that follows it with NULs
1575 while(*s && !isspace(*s)) s++;
1576 while(*s && isspace(*s)) *s++ = '\0';
// and once more for the next word
1581 while(*s && !isspace(*s)) s++;
1582 while(*s && isspace(*s)) *s++ = '\0';
1584 if(!strcasecmp(key, "up")) {
1585 if (!health_parse_duration(value, delay_up_duration)) {
1586 error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
1587 line, path, file, value, key);
1591 else if(!strcasecmp(key, "down")) {
1592 if (!health_parse_duration(value, delay_down_duration)) {
1593 error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
1594 line, path, file, value, key);
1596 else given_down = 1;
1598 else if(!strcasecmp(key, "multiplier")) {
1599 *delay_multiplier = strtof(value, NULL);
1600 if(isnan(*delay_multiplier) || isinf(*delay_multiplier) || islessequal(*delay_multiplier, 0)) {
1601 error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
1602 line, path, file, value, key);
1604 else given_multiplier = 1;
1606 else if(!strcasecmp(key, "max")) {
1607 if (!health_parse_duration(value, delay_max_duration)) {
1608 error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
1609 line, path, file, value, key);
1614 error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
1615 line, path, file, key);
// defaults for settings that were not given (or not given validly)
1620 *delay_up_duration = 0;
1623 *delay_down_duration = 0;
1625 if(!given_multiplier)
1626 *delay_multiplier = 1.0;
// the maximum delay is raised so it is never below up * multiplier
// or down * multiplier
1629 if((*delay_max_duration) < (*delay_up_duration) * (*delay_multiplier))
1630 *delay_max_duration = (*delay_up_duration) * (*delay_multiplier);
1632 if((*delay_max_duration) < (*delay_down_duration) * (*delay_multiplier))
1633 *delay_max_duration = (*delay_down_duration) * (*delay_multiplier);
// Parse the value of a "lookup" configuration line:
//   <group method> <after duration> [at <duration>] [every <duration>]
//   [absolute|abs|absolute_sum] [min2max] [null2zero] [percentage]
//   [unaligned] [of <dimensions>|all]
// Results are stored through the pointer parameters; option keywords OR their
// RRDR_OPTION_* flag into *options. `string` is modified in place (NUL
// characters are written over the whitespace). `line`, `path` and `file` are
// only used in log messages.
// NOTE(review): short lines (declarations, braces, returns, resets of the
// outputs) are missing from this listing, so the return value is not visible.
1639 static inline int health_parse_db_lookup(
1640 size_t line, const char *path, const char *file, char *string,
1641 int *group_method, int *after, int *before, int *every,
1642 uint32_t *options, char **dimensions
1644 debug(D_HEALTH, "Health configuration parsing database lookup %zu@%s/%s: %s", line, path, file, string);
// release a dimensions string left from an earlier lookup line
1646 if(*dimensions) freez(*dimensions);
1653 char *s = string, *key;
1655 // first is the group method
1657 while(*s && !isspace(*s)) s++;
1658 while(*s && isspace(*s)) *s++ = '\0';
1660 error("Health configuration invalid chart calculation at line %zu of file '%s/%s': expected group method followed by the 'after' time, but got '%s'",
1661 line, path, file, key);
1665 if((*group_method = web_client_api_request_v1_data_group(key, -1)) == -1) {
1666 error("Health configuration at line %zu of file '%s/%s': invalid group method '%s'",
1667 line, path, file, key);
1671 // then is the 'after' time
1673 while(*s && !isspace(*s)) s++;
1674 while(*s && isspace(*s)) *s++ = '\0';
1676 if(!health_parse_duration(key, after)) {
1677 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' after group method",
1678 line, path, file, key);
// the evaluation frequency starts out as the absolute value of 'after';
// an explicit "every" keyword below can replace it
1683 *every = abs(*after);
1685 // now we may have optional parameters
1688 while(*s && !isspace(*s)) s++;
1689 while(*s && isspace(*s)) *s++ = '\0';
// "at" takes one more word, parsed as a duration into *before
1692 if(!strcasecmp(key, "at")) {
1694 while(*s && !isspace(*s)) s++;
1695 while(*s && isspace(*s)) *s++ = '\0';
1697 if (!health_parse_duration(value, before)) {
1698 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1699 line, path, file, value, key);
// "every" takes one more word, parsed as a duration into *every
1702 else if(!strcasecmp(key, HEALTH_EVERY_KEY)) {
1704 while(*s && !isspace(*s)) s++;
1705 while(*s && isspace(*s)) *s++ = '\0';
1707 if (!health_parse_duration(value, every)) {
1708 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1709 line, path, file, value, key);
1712 else if(!strcasecmp(key, "absolute") || !strcasecmp(key, "abs") || !strcasecmp(key, "absolute_sum")) {
1713 *options |= RRDR_OPTION_ABSOLUTE;
1715 else if(!strcasecmp(key, "min2max")) {
1716 *options |= RRDR_OPTION_MIN2MAX;
1718 else if(!strcasecmp(key, "null2zero")) {
1719 *options |= RRDR_OPTION_NULL2ZERO;
1721 else if(!strcasecmp(key, "percentage")) {
1722 *options |= RRDR_OPTION_PERCENTAGE;
1724 else if(!strcasecmp(key, "unaligned")) {
1725 *options |= RRDR_OPTION_NOT_ALIGNED;
// "of": everything left on the line is kept as the dimensions string,
// unless it is empty or equal to "all" (case-insensitive)
1727 else if(!strcasecmp(key, "of")) {
1728 if(*s && strcasecmp(s, "all"))
1729 *dimensions = strdupz(s);
1733 error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
1734 line, path, file, key);
// Replace every TAB character of `s` with a space, in place.
// Returns `s`, so the call can be nested in an expression.
static inline char *tabs2spaces(char *s) {
    char *t = s;

    while(*t) {
        if(*t == '\t') *t = ' ';
        t++;
    }

    return s;
}
// Build the "source" label of a configuration entry, formatted as
// "<line>@<path>/<filename>".
// Returns a copy made with strdupz(); the caller owns it.
static inline char *health_source_file(size_t line, const char *path, const char *filename) {
    char buffer[FILENAME_MAX + 1];
    snprintfz(buffer, FILENAME_MAX, "%zu@%s/%s", line, path, filename);
    return strdupz(buffer);
}
// Replace every single quote and double quote of `s` with a space, in place.
static inline void strip_quotes(char *s) {
    while(*s) {
        if(*s == '\'' || *s == '"') *s = ' ';
        s++;
    }
}
// Parse one health configuration file (path/filename) made of "key: value"
// lines. An "alarm" or "template" key starts a new definition; the keys that
// follow fill in the definition currently being built (rc for alarms, rt for
// templates). Definitions are submitted with rrdcalc_add_alarm_from_config()
// or rrdcalctemplate_add_template_from_config() against localhost, and are
// freed when those calls report failure.
// NOTE(review): this listing omits short lines (braces, returns, small
// declarations and conditions), so comments describe only visible statements.
1764 int health_readfile(const char *path, const char *filename) {
1765 debug(D_HEALTH, "Health configuration reading file '%s/%s'", path, filename);
1767 static uint32_t hash_alarm = 0, hash_template = 0, hash_on = 0, hash_calc = 0, hash_green = 0, hash_red = 0, hash_warn = 0, hash_crit = 0, hash_exec = 0, hash_every = 0, hash_lookup = 0, hash_units = 0, hash_info = 0, hash_recipient = 0, hash_delay = 0;
1768 char buffer[HEALTH_CONF_MAX_LINE + 1];
// compute the keyword hashes once; the statics keep them across calls
1770 if(unlikely(!hash_alarm)) {
1771 hash_alarm = simple_uhash(HEALTH_ALARM_KEY);
1772 hash_template = simple_uhash(HEALTH_TEMPLATE_KEY);
1773 hash_on = simple_uhash(HEALTH_ON_KEY);
1774 hash_calc = simple_uhash(HEALTH_CALC_KEY);
1775 hash_lookup = simple_uhash(HEALTH_LOOKUP_KEY);
1776 hash_green = simple_uhash(HEALTH_GREEN_KEY);
1777 hash_red = simple_uhash(HEALTH_RED_KEY);
1778 hash_warn = simple_uhash(HEALTH_WARN_KEY);
1779 hash_crit = simple_uhash(HEALTH_CRIT_KEY);
1780 hash_exec = simple_uhash(HEALTH_EXEC_KEY);
1781 hash_every = simple_uhash(HEALTH_EVERY_KEY);
// NOTE(review): the next three use simple_hash() while the key read from the
// file is hashed with simple_uhash() below - presumably both give the same
// value for these lowercase keywords; confirm
1782 hash_units = simple_hash(HEALTH_UNITS_KEY);
1783 hash_info = simple_hash(HEALTH_INFO_KEY);
1784 hash_recipient = simple_hash(HEALTH_RECIPIENT_KEY);
1785 hash_delay = simple_uhash(HEALTH_DELAY_KEY);
// the line buffer is first used to build the full pathname of the file
1788 snprintfz(buffer, HEALTH_CONF_MAX_LINE, "%s/%s", path, filename);
1789 FILE *fp = fopen(buffer, "r");
1791 error("Health configuration cannot read file '%s'.", buffer);
1796 RRDCALCTEMPLATE *rt = NULL;
1798 size_t line = 0, append = 0;
// each read lands at offset `append`, so a continued line extends the text
// already in the buffer; the loop also runs once more when fgets() fails
// while a continuation is still pending
1800 while((s = fgets(&buffer[append], (int)(HEALTH_CONF_MAX_LINE - append), fp)) || append) {
1801 int stop_appending = !s;
// a trailing backslash asks for the next line to be appended to this one
1807 if(!stop_appending && s[append - 1] == '\\') {
1808 s[append - 1] = ' ';
1809 append = &s[append] - buffer;
1810 if(append < HEALTH_CONF_MAX_LINE)
1813 error("Health configuration has too long muli-line at line %zu of file '%s/%s'.", line, path, filename);
// find the ':' that separates the key from the value
1819 while(*s && *s != ':') s++;
1821 error("Health configuration has invalid line %zu of file '%s/%s'. It does not contain a ':'. Ignoring it.", line, path, filename);
1829 value = trim(value);
1832 error("Health configuration has invalid line %zu of file '%s/%s'. Keyword is empty. Ignoring it.", line, path, filename);
1837 error("Health configuration has invalid line %zu of file '%s/%s'. value is empty. Ignoring it.", line, path, filename);
// keys are matched by hash first, then confirmed with strcasecmp()
1841 uint32_t hash = simple_uhash(key);
// "alarm": submit the definition that was being built, then start a new alarm
1843 if(hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) {
1844 if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
1845 rrdcalc_free(&localhost, rc);
1848 if (!rrdcalctemplate_add_template_from_config(&localhost, rt))
1849 rrdcalctemplate_free(&localhost, rt);
1853 rc = callocz(1, sizeof(RRDCALC));
1854 rc->next_event_id = 1;
1855 rc->name = tabs2spaces(strdupz(value));
1856 rc->hash = simple_hash(rc->name);
1857 rc->source = health_source_file(line, path, filename);
1861 rc->old_value = NAN;
1862 rc->delay_multiplier = 1.0;
1864 if(rrdvar_fix_name(rc->name))
1865 error("Health configuration renamed alarm '%s' to '%s'", value, rc->name);
// "template": submit the definition that was being built, then start a new
// template
1867 else if(hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY)) {
1869 if(!rrdcalc_add_alarm_from_config(&localhost, rc))
1870 rrdcalc_free(&localhost, rc);
1874 if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
1875 rrdcalctemplate_free(&localhost, rt);
1877 rt = callocz(1, sizeof(RRDCALCTEMPLATE));
1878 rt->name = tabs2spaces(strdupz(value));
1879 rt->hash_name = simple_hash(rt->name);
1880 rt->source = health_source_file(line, path, filename);
1883 rt->delay_multiplier = 1.0;
1885 if(rrdvar_fix_name(rt->name))
1886 error("Health configuration renamed template '%s' to '%s'", value, rt->name);
// attributes of the alarm being built; for an alarm, "on" names the chart
1889 if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
1891 if(strcmp(rc->chart, value))
1892 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1893 line, path, filename, rc->name, key, rc->chart, value, value);
1897 rc->chart = tabs2spaces(strdupz(value));
1898 rc->hash_chart = simple_hash(rc->chart);
1900 else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
1901 health_parse_db_lookup(line, path, filename, value, &rc->group, &rc->after, &rc->before,
1903 &rc->options, &rc->dimensions);
1905 else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
1906 if(!health_parse_duration(value, &rc->update_every))
1907 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.",
1908 line, path, filename, rc->name, key, value);
1910 else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
1912 rc->green = strtold(value, &e);
1914 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1915 line, path, filename, rc->name, key, e);
1918 else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
1920 rc->red = strtold(value, &e);
1922 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1923 line, path, filename, rc->name, key, e);
1926 else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
1927 const char *failed_at = NULL;
1929 rc->calculation = expression_parse(value, &failed_at, &error);
1930 if(!rc->calculation) {
1931 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1932 line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1935 else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
1936 const char *failed_at = NULL;
1938 rc->warning = expression_parse(value, &failed_at, &error);
1940 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1941 line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1944 else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
1945 const char *failed_at = NULL;
1947 rc->critical = expression_parse(value, &failed_at, &error);
1949 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1950 line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1953 else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
1955 if(strcmp(rc->exec, value))
1956 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1957 line, path, filename, rc->name, key, rc->exec, value, value);
1961 rc->exec = tabs2spaces(strdupz(value));
1963 else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
1965 if(strcmp(rc->recipient, value))
1966 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1967 line, path, filename, rc->name, key, rc->recipient, value, value);
1969 freez(rc->recipient);
1971 rc->recipient = tabs2spaces(strdupz(value));
1973 else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
1975 if(strcmp(rc->units, value))
1976 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1977 line, path, filename, rc->name, key, rc->units, value, value);
1981 rc->units = tabs2spaces(strdupz(value));
1982 strip_quotes(rc->units);
1984 else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
1986 if(strcmp(rc->info, value))
1987 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1988 line, path, filename, rc->name, key, rc->info, value, value);
1992 rc->info = tabs2spaces(strdupz(value));
1993 strip_quotes(rc->info);
1995 else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
1996 health_parse_delay(line, path, filename, value, &rc->delay_up_duration, &rc->delay_down_duration, &rc->delay_max_duration, &rc->delay_multiplier);
1999 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has unknown key '%s'.",
2000 line, path, filename, rc->name, key);
// the same attributes for the template being built; for a template, "on"
// names the context
2004 if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
2006 if(strcmp(rt->context, value))
2007 error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2008 line, path, filename, rt->name, key, rt->context, value, value);
2012 rt->context = tabs2spaces(strdupz(value));
2013 rt->hash_context = simple_hash(rt->context);
2015 else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
2016 health_parse_db_lookup(line, path, filename, value, &rt->group, &rt->after, &rt->before,
2018 &rt->options, &rt->dimensions);
2020 else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
2021 if(!health_parse_duration(value, &rt->update_every))
2022 error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' cannot parse duration: '%s'.",
2023 line, path, filename, rt->name, key, value);
2025 else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
2027 rt->green = strtold(value, &e);
2029 error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
2030 line, path, filename, rt->name, key, e);
2033 else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
2035 rt->red = strtold(value, &e);
2037 error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
2038 line, path, filename, rt->name, key, e);
2041 else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
2042 const char *failed_at = NULL;
2044 rt->calculation = expression_parse(value, &failed_at, &error);
2045 if(!rt->calculation) {
2046 error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
2047 line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
2050 else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
2051 const char *failed_at = NULL;
2053 rt->warning = expression_parse(value, &failed_at, &error);
2055 error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
2056 line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
2059 else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
2060 const char *failed_at = NULL;
2062 rt->critical = expression_parse(value, &failed_at, &error);
2064 error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
2065 line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
2068 else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
2070 if(strcmp(rt->exec, value))
2071 error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2072 line, path, filename, rt->name, key, rt->exec, value, value);
2076 rt->exec = tabs2spaces(strdupz(value));
2078 else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
2080 if(strcmp(rt->recipient, value))
2081 error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2082 line, path, filename, rt->name, key, rt->recipient, value, value);
2084 freez(rt->recipient);
2086 rt->recipient = tabs2spaces(strdupz(value));
2088 else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
2090 if(strcmp(rt->units, value))
2091 error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2092 line, path, filename, rt->name, key, rt->units, value, value);
2096 rt->units = tabs2spaces(strdupz(value));
2097 strip_quotes(rt->units);
2099 else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
2101 if(strcmp(rt->info, value))
2102 error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2103 line, path, filename, rt->name, key, rt->info, value, value);
2107 rt->info = tabs2spaces(strdupz(value));
2108 strip_quotes(rt->info);
2110 else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
2111 health_parse_delay(line, path, filename, value, &rt->delay_up_duration, &rt->delay_down_duration, &rt->delay_max_duration, &rt->delay_multiplier);
2114 error("Health configuration at line %zu of file '%s/%s' for template '%s' has unknown key '%s'.",
2115 line, path, filename, rt->name, key);
2119 error("Health configuration at line %zu of file '%s/%s' has unknown key '%s'. Expected either '" HEALTH_ALARM_KEY "' or '" HEALTH_TEMPLATE_KEY "'.",
2120 line, path, filename, key);
// submit the definition that was still being built
2124 if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
2125 rrdcalc_free(&localhost, rc);
2127 if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
2128 rrdcalctemplate_free(&localhost, rt);
// Read the health configuration found in directory `path`: every entry that
// is a regular file, a symbolic link or of unknown type, and whose name is
// longer than 5 characters and ends in ".conf", is passed to
// health_readfile(). The "." and ".." directory entries are ignored.
// NOTE(review): short lines are missing from this listing; for other
// directories a "path/name" string is built below, presumably to descend into
// them - confirm against the full source.
2134 void health_readdir(const char *path) {
2135 size_t pathlen = strlen(path);
2137 debug(D_HEALTH, "Health configuration reading directory '%s'", path);
2139 DIR *dir = opendir(path);
2141 error("Health configuration cannot open directory '%s'.", path);
2145 struct dirent *de = NULL;
2146 while ((de = readdir(dir))) {
2147 size_t len = strlen(de->d_name);
2149 if(de->d_type == DT_DIR
2151 (de->d_name[0] == '.' && de->d_name[1] == '\0')
2152 || (de->d_name[0] == '.' && de->d_name[1] == '.' && de->d_name[2] == '\0')
2154 debug(D_HEALTH, "Ignoring directory '%s'", de->d_name);
2158 else if(de->d_type == DT_DIR) {
// room for path, one separator character, the entry name and the final NUL
2159 char *s = mallocz(pathlen + strlen(de->d_name) + 2);
2162 strcat(s, de->d_name);
2168 else if((de->d_type == DT_LNK || de->d_type == DT_REG || de->d_type == DT_UNKNOWN) &&
2169 len > 5 && !strcmp(&de->d_name[len - 5], ".conf")) {
2170 health_readfile(path, de->d_name);
2173 else debug(D_HEALTH, "Ignoring file '%s'", de->d_name);
2179 static inline char *health_config_dir(void) {
2180 char buffer[FILENAME_MAX + 1];
2181 snprintfz(buffer, FILENAME_MAX, "%s/health.d", config_get("global", "config directory", CONFIG_DIR));
2182 return config_get("health", "health configuration directory", buffer);
// Initialize health monitoring: read the "enabled" switch, create the health
// db directory, select the alarm log file, load and open the alarm log, pick
// the default notification script, apply the in-memory log size limit and
// finally read the health configuration directory while holding the
// localhost lock.
// NOTE(review): short lines (returns, braces, some conditions) are missing
// from this listing; comments describe only the visible statements.
2185 void health_init(void) {
2186 debug(D_HEALTH, "Health configuration initializing");
// the setting is also stored in the global health_enabled
2188 if(!(health_enabled = config_get_boolean("health", "enabled", 1))) {
2189 debug(D_HEALTH, "Health is disabled.");
// an already existing directory is not an error; any other failure is fatal
2193 char *pathname = config_get("health", "health db directory", VARLIB_DIR "/health");
2194 if(mkdir(pathname, 0770) == -1 && errno != EEXIST)
2195 fatal("Cannot create directory '%s'.", pathname);
2197 char filename[FILENAME_MAX + 1];
2198 snprintfz(filename, FILENAME_MAX, "%s/health-log.db", pathname);
2199 health.log_filename = config_get("health", "health db file", filename);
2201 health_alarm_log_load(&localhost);
2202 health_alarm_log_open();
2204 char *path = health_config_dir();
2207 char buffer[FILENAME_MAX + 1];
2208 snprintfz(buffer, FILENAME_MAX, "%s/alarm-notify.sh", config_get("global", "plugins directory", PLUGINS_DIR));
2209 health.health_default_exec = config_get("health", "script to execute on alarm", buffer);
// the current limit is offered as the default; a rejected value is reported
// and the setting is written back with the current limit
2212 long n = config_get_number("health", "in memory max health log entries", (long)localhost.health_log.max);
2214 error("Health configuration has invalid max log entries %ld. Using default %u", n, localhost.health_log.max);
2215 config_set_number("health", "in memory max health log entries", (long)localhost.health_log.max);
2217 else localhost.health_log.max = (unsigned int)n;
2219 rrdhost_rwlock(&localhost);
2220 health_readdir(path);
2221 rrdhost_unlock(&localhost);
2224 // ----------------------------------------------------------------------------
// Appends one JSON member to wb, wrapped in prefix and suffix:
// either  "label":"value"  or  "label":null .
// NOTE(review): the condition selecting between the two forms is not visible
// in this excerpt; presumably null is emitted when value is NULL or empty.
// NOTE(review): label and value are inserted verbatim through %s - nothing
// here escapes double quotes or backslashes.
2227 static inline void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) {
2229 buffer_sprintf(wb, "%s\"%s\":\"%s\"%s", prefix, label, value, suffix);
2231 buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix);
// Appends one alarm log entry (ae) to wb as a JSON object.
// The object starts with a newline and is closed with "\t}" without a
// trailing comma or newline, so the caller is responsible for separators.
// health_alarm_log2json() calls this with the alarm log read lock held.
// NOTE(review): string fields are inserted verbatim through %s - nothing
// here escapes double quotes or backslashes.
2234 static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) {
2235 buffer_sprintf(wb, "\n\t{\n"
2236 "\t\t\"hostname\": \"%s\",\n"
2237 "\t\t\"unique_id\": %u,\n"
2238 "\t\t\"alarm_id\": %u,\n"
2239 "\t\t\"alarm_event_id\": %u,\n"
2240 "\t\t\"name\": \"%s\",\n"
2241 "\t\t\"chart\": \"%s\",\n"
2242 "\t\t\"family\": \"%s\",\n"
2243 "\t\t\"processed\": %s,\n"
2244 "\t\t\"updated\": %s,\n"
2245 "\t\t\"exec_run\": %lu,\n"
2246 "\t\t\"exec_failed\": %s,\n"
2247 "\t\t\"exec\": \"%s\",\n"
2248 "\t\t\"recipient\": \"%s\",\n"
2249 "\t\t\"exec_code\": %d,\n"
2250 "\t\t\"source\": \"%s\",\n"
2251 "\t\t\"units\": \"%s\",\n"
2252 "\t\t\"info\": \"%s\",\n"
2253 "\t\t\"when\": %lu,\n"
2254 "\t\t\"duration\": %lu,\n"
2255 "\t\t\"non_clear_duration\": %lu,\n"
2256 "\t\t\"status\": \"%s\",\n"
2257 "\t\t\"old_status\": \"%s\",\n"
2258 "\t\t\"delay\": %d,\n"
2259 "\t\t\"delay_up_to_timestamp\": %lu,\n"
2260 "\t\t\"updated_by_id\": %u,\n"
2261 "\t\t\"updates_id\": %u,\n",
// NOTE(review): several arguments (the ones for hostname .. family,
// exec_code, source, delay and the two trailing ids) are not visible in
// this excerpt.
// flag bits are rendered as JSON booleans
2269 (ae->flags & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false",
2270 (ae->flags & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false",
// timestamps and durations are cast to unsigned long to match %lu
2271 (unsigned long)ae->exec_run_timestamp,
2272 (ae->flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false",
// entries without their own exec / recipient report the global defaults
2273 ae->exec?ae->exec:health.health_default_exec,
2274 ae->recipient?ae->recipient:health.health_default_recipient,
// missing units / info are reported as empty strings
2277 ae->units?ae->units:"",
2278 ae->info?ae->info:"",
2279 (unsigned long)ae->when,
2280 (unsigned long)ae->duration,
2281 (unsigned long)ae->non_clear_duration,
2282 rrdcalc_status2string(ae->new_status),
2283 rrdcalc_status2string(ae->old_status),
2285 (unsigned long)ae->delay_up_to_timestamp,
// the numeric values are written through buffer_rrd_value()
2290 buffer_strcat(wb, "\t\t\"value\":");
2291 buffer_rrd_value(wb, ae->new_value);
2292 buffer_strcat(wb, ",\n");
// "old_value" is the last member, so no comma follows it
2294 buffer_strcat(wb, "\t\t\"old_value\":");
2295 buffer_rrd_value(wb, ae->old_value);
2296 buffer_strcat(wb, "\n");
2298 buffer_strcat(wb, "\t}");
2301 void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after) {
2302 pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
2304 buffer_strcat(wb, "[");
2306 unsigned int max = host->health_log.max;
2307 unsigned int count = 0;
2309 for(ae = host->health_log.alarms; ae && count < max ; count++, ae = ae->next) {
2310 if(ae->unique_id > after) {
2311 if(likely(count)) buffer_strcat(wb, ",");
2312 health_alarm_entry2json_nolock(wb, ae, host);
2316 buffer_strcat(wb, "\n]\n");
2318 pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
// Appends one alarm (rc) to wb as a JSON member keyed "<chart>.<name>".
// The object is closed with "\t\t}" without a trailing comma or newline, so
// the caller is responsible for separators.
// NOTE(review): string fields are inserted verbatim through %s - nothing
// here escapes double quotes or backslashes.
2321 static inline void health_rrdcalc2json_nolock(BUFFER *wb, RRDCALC *rc) {
// NOTE(review): the head of this call is not visible in this excerpt;
// presumably buffer_sprintf(wb, ...).
2323 "\t\t\"%s.%s\": {\n"
2324 "\t\t\t\"id\": %lu,\n"
2325 "\t\t\t\"name\": \"%s\",\n"
2326 "\t\t\t\"chart\": \"%s\",\n"
2327 "\t\t\t\"family\": \"%s\",\n"
2328 "\t\t\t\"active\": %s,\n"
2329 "\t\t\t\"exec\": \"%s\",\n"
2330 "\t\t\t\"recipient\": \"%s\",\n"
2331 "\t\t\t\"source\": \"%s\",\n"
2332 "\t\t\t\"units\": \"%s\",\n"
2333 "\t\t\t\"info\": \"%s\",\n"
2334 "\t\t\t\"status\": \"%s\",\n"
2335 "\t\t\t\"last_status_change\": %lu,\n"
2336 "\t\t\t\"last_updated\": %lu,\n"
2337 "\t\t\t\"next_update\": %lu,\n"
2338 "\t\t\t\"update_every\": %d,\n"
2339 "\t\t\t\"delay_up_duration\": %d,\n"
2340 "\t\t\t\"delay_down_duration\": %d,\n"
2341 "\t\t\t\"delay_max_duration\": %d,\n"
2342 "\t\t\t\"delay_multiplier\": %f,\n"
2343 "\t\t\t\"delay\": %d,\n"
2344 "\t\t\t\"delay_up_to_timestamp\": %lu,\n"
// NOTE(review): the arguments for name, chart, source, update_every and
// delay are not visible in this excerpt.
2345 , rc->chart, rc->name
2346 , (unsigned long)rc->id
// an alarm that is not linked to a chart has an empty family and is
// reported as not active
2349 , (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:""
2350 , (rc->rrdset)?"true":"false"
// alarms without their own exec / recipient report the global defaults
2351 , rc->exec?rc->exec:health.health_default_exec
2352 , rc->recipient?rc->recipient:health.health_default_recipient
2354 , rc->units?rc->units:""
2355 , rc->info?rc->info:""
2356 , rrdcalc_status2string(rc->status)
// timestamps are cast to unsigned long to match %lu
2357 , (unsigned long)rc->last_status_change
2358 , (unsigned long)rc->last_updated
2359 , (unsigned long)rc->next_update
2361 , rc->delay_up_duration
2362 , rc->delay_down_duration
2363 , rc->delay_max_duration
2364 , rc->delay_multiplier
2366 , (unsigned long)rc->delay_up_to_timestamp
// database lookup details are only emitted for alarms that have a lookup
2369 if(RRDCALC_HAS_DB_LOOKUP(rc)) {
2370 if(rc->dimensions && *rc->dimensions)
2371 health_string2json(wb, "\t\t\t", "lookup_dimensions", rc->dimensions, ",\n");
// NOTE(review): the head of this call and the arguments for lookup_after
// and lookup_before are not visible in this excerpt.
2374 "\t\t\t\"db_after\": %lu,\n"
2375 "\t\t\t\"db_before\": %lu,\n"
2376 "\t\t\t\"lookup_method\": \"%s\",\n"
2377 "\t\t\t\"lookup_after\": %d,\n"
2378 "\t\t\t\"lookup_before\": %d,\n"
2379 "\t\t\t\"lookup_options\": \"",
2380 (unsigned long) rc->db_after,
2381 (unsigned long) rc->db_before,
2382 group_method2string(rc->group),
// the format string above leaves the "lookup_options" string open; the
// options are appended and the string is closed here
2386 buffer_data_options2string(wb, rc->options);
2387 buffer_strcat(wb, "\",\n");
// each expression is reported both as written (source) and as parsed
2390 if(rc->calculation) {
2391 health_string2json(wb, "\t\t\t", "calc", rc->calculation->source, ",\n");
2392 health_string2json(wb, "\t\t\t", "calc_parsed", rc->calculation->parsed_as, ",\n");
// NOTE(review): the guards around the warning and critical members are not
// visible in this excerpt; presumably if(rc->warning) / if(rc->critical).
2396 health_string2json(wb, "\t\t\t", "warn", rc->warning->source, ",\n");
2397 health_string2json(wb, "\t\t\t", "warn_parsed", rc->warning->parsed_as, ",\n");
2401 health_string2json(wb, "\t\t\t", "crit", rc->critical->source, ",\n");
2402 health_string2json(wb, "\t\t\t", "crit_parsed", rc->critical->parsed_as, ",\n");
// the numeric values are written through buffer_rrd_value()
2405 buffer_strcat(wb, "\t\t\t\"green\":");
2406 buffer_rrd_value(wb, rc->green);
2407 buffer_strcat(wb, ",\n");
2409 buffer_strcat(wb, "\t\t\t\"red\":");
2410 buffer_rrd_value(wb, rc->red);
2411 buffer_strcat(wb, ",\n");
// "value" is the last member, so no comma follows it
2413 buffer_strcat(wb, "\t\t\t\"value\":");
2414 buffer_rrd_value(wb, rc->value);
2415 buffer_strcat(wb, "\n");
2417 buffer_strcat(wb, "\t\t}");
2420 //void health_rrdcalctemplate2json_nolock(BUFFER *wb, RRDCALCTEMPLATE *rt) {
// Writes the alarms of host to wb as a JSON object with a header (hostname,
// latest alarm log id, health status, ...) and an "alarms" member holding
// one entry per alarm.
// all - when zero, the status test below is applied to every alarm;
//       when non-zero that test is bypassed.
// NOTE(review): the lock is taken on &localhost while the data is read from
// host - confirm that host is always &localhost, or lock host instead.
2424 void health_alarms2json(RRDHOST *host, BUFFER *wb, int all) {
2427 rrdhost_rdlock(&localhost);
// NOTE(review): one format line and the hostname argument are not visible
// in this excerpt.
2428 buffer_sprintf(wb, "{\n\t\"hostname\": \"%s\","
2429 "\n\t\"latest_alarm_log_unique_id\": %u,"
2430 "\n\t\"status\": %s,"
2432 "\n\t\"alarms\": {\n",
// next_log_id - 1 is reported as the latest id, or 0 while next_log_id is 0
2434 (host->health_log.next_log_id > 0)?(host->health_log.next_log_id - 1):0,
2435 health_enabled?"true":"false",
2436 (unsigned long)time(NULL));
// NOTE(review): the declarations of i and rc, the statements executed by the
// two tests below (presumably `continue`) and the increment of i are not
// visible in this excerpt.
2439 for(i = 0, rc = host->alarms; rc ; rc = rc->next) {
// alarms not linked to a chart, or whose chart has never been collected
2440 if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
// unless all is set: alarms that are neither WARNING nor CRITICAL
2443 if(likely(!all && !(rc->status == RRDCALC_STATUS_WARNING || rc->status == RRDCALC_STATUS_CRITICAL)))
// a separator precedes every alarm written after i has become non-zero
2446 if(likely(i)) buffer_strcat(wb, ",\n");
2447 health_rrdcalc2json_nolock(wb, rc);
2451 // buffer_strcat(wb, "\n\t},\n\t\"templates\": {");
2452 // RRDCALCTEMPLATE *rt;
2453 // for(rt = host->templates; rt ; rt = rt->next)
2454 // health_rrdcalctemplate2json_nolock(wb, rt);
// close the "alarms" object and the top level object
2456 buffer_strcat(wb, "\n\t}\n}\n");
2457 rrdhost_unlock(&localhost);
2461 // ----------------------------------------------------------------------------
2462 // re-load health configuration
// Frees the alarm templates and the alarms attached to host.
// health_reload() calls this between rrdhost_rwlock() and rrdhost_unlock().
2464 static inline void health_free_all_nolock(RRDHOST *host) {
// always frees the list head; this only terminates if rrdcalctemplate_free()
// advances host->templates
2465 while(host->templates)
2466 rrdcalctemplate_free(host, host->templates);
// NOTE(review): the loop header guarding this call is not visible in this
// excerpt; presumably while(host->alarms), mirroring the loop above.
2469 rrdcalc_free(host, host->alarms);
// Re-reads the health configuration of localhost at runtime: drops all
// loaded alarms and templates, flags the existing alarm log entries, parses
// the health configuration directory again and links the new alarms to the
// charts of localhost.
2472 void health_reload(void) {
2473 if(!health_enabled) {
2474 error("Health reload is requested, but health is not enabled.");
// NOTE(review): the rest of this branch is not visible in this excerpt;
// presumably it returns here.
2478 char *path = health_config_dir();
2480 // free all running alarms
2481 rrdhost_rwlock(&localhost);
2482 health_free_all_nolock(&localhost);
2483 rrdhost_unlock(&localhost);
2485 // invalidate all previous entries in the alarm log
// every entry whose new status is not REMOVED gets the UPDATED flag
// NOTE(review): the declaration of t is not visible in this excerpt, and no
// alarm log lock is visibly taken around this traversal - confirm.
2487 for(t = localhost.health_log.alarms ; t ; t = t->next) {
2488 if(t->new_status != RRDCALC_STATUS_REMOVED)
2489 t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
2492 // reset all thresholds to all charts
// NOTE(review): the declaration of st and the body of this loop are not
// visible in this excerpt.
2494 for(st = localhost.rrdset_root; st ; st = st->next) {
2499 // load the new alarms
2500 rrdhost_rwlock(&localhost);
2501 health_readdir(path);
2502 rrdhost_unlock(&localhost);
2504 // link the loaded alarms to their charts
// the host lock is taken and released once per chart
2505 for(st = localhost.rrdset_root; st ; st = st->next) {
2506 rrdhost_rwlock(&localhost);
2508 rrdsetcalc_link_matching(st);
2509 rrdcalctemplate_link_matching(st);
2511 rrdhost_unlock(&localhost);
2515 // ----------------------------------------------------------------------------
2516 // health main thread and friends
2518 static inline int rrdcalc_value2status(calculated_number n) {
2519 if(isnan(n)) return RRDCALC_STATUS_UNDEFINED;
2520 if(n) return RRDCALC_STATUS_RAISED;
2521 return RRDCALC_STATUS_CLEAR;
// Runs the notification command for one alarm log entry (ae) and records the
// outcome on the entry: the entry is always flagged PROCESSED; when the
// command is run it is also flagged EXEC_RUN, exec_run_timestamp and
// exec_code are set, and EXEC_FAILED is added on a non-zero exit code.
// NOTE(review): many lines of this function (early exits, some arguments,
// declarations) are not visible in this excerpt.
2524 static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
2525 ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
2527 if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
2528 // do not send notifications for internal statuses
2532 // find the previous notification for the same alarm
2533 // which we have run the exec script
// the search starts at ae->next and stops at the first entry with the same
// alarm_id that has the EXEC_RUN flag
// NOTE(review): the declaration of t and the statement executed on a match
// (presumably `break`) are not visible in this excerpt.
2535 for(t = ae->next; t ;t = t->next) {
2536 if(t->alarm_id == ae->alarm_id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
2541 // we have executed this alarm notification in the past
2542 if (t && t->new_status == ae->new_status) {
2543 // don't send the same notification again
2544 debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name,
2545 rrdcalc_status2string(ae->new_status));
2550 // we have not executed this alarm notification in the past
2551 if(unlikely(ae->old_status == RRDCALC_STATUS_UNINITIALIZED && ae->new_status == RRDCALC_STATUS_CLEAR)) {
2552 debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
// buffer first holds the command line and is later reused for its output
2557 char buffer[FILENAME_MAX + 1];
// entries without their own exec / recipient use the global defaults
2560 const char *exec = ae->exec;
2561 if(!exec) exec = health.health_default_exec;
2563 const char *recipient = ae->recipient;
2564 if(!recipient) recipient = health.health_default_recipient;
// the command line is limited to FILENAME_MAX characters
// NOTE(review): the values are placed between single quotes in a command
// string; nothing visible here escapes quotes embedded in them - confirm
// that all of them come from trusted configuration.
2566 snprintfz(buffer, FILENAME_MAX, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s'",
2573 (unsigned long)ae->when,
// NOTE(review): "NOCAHRT" looks like a misspelling of the "NOCHART"
// placeholder used elsewhere in this file; it is passed to the script as is.
2575 ae->chart?ae->chart:"NOCAHRT",
2576 ae->family?ae->family:"NOFAMILY",
2577 rrdcalc_status2string(ae->new_status),
2578 rrdcalc_status2string(ae->old_status),
2581 ae->source?ae->source:"UNKNOWN",
// durations are cast to uint32_t to match %u
2582 (uint32_t)ae->duration,
2583 (uint32_t)ae->non_clear_duration,
2584 ae->units?ae->units:"",
2585 ae->info?ae->info:""
// the entry is flagged and timestamped before the command is started
2588 ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
2589 ae->exec_run_timestamp = time(NULL);
2591 debug(D_HEALTH, "executing command '%s'", buffer);
2592 FILE *fp = mypopen(buffer, &command_pid);
// NOTE(review): the test guarding this error is not visible in this excerpt.
2594 error("HEALTH: Cannot popen(\"%s\", \"r\").", buffer);
2597 debug(D_HEALTH, "HEALTH reading from command");
// a single fgets() call reads from the command, overwriting buffer
2598 char *s = fgets(buffer, FILENAME_MAX, fp);
// the value returned by mypclose() is kept as the exit code of the command
2600 ae->exec_code = mypclose(fp, command_pid);
2601 debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
2603 if(ae->exec_code != 0)
2604 ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
// hand the entry, with its updated flags, to health_alarm_log_save()
2607 health_alarm_log_save(host, ae);
// Logs the status transition of an alarm log entry at debug level and then
// passes the entry to health_alarm_execute().
2611 static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
// NOTE(review): the argument for the %0.2Lf conversion is not visible in
// this excerpt.
2612 debug(D_HEALTH, "Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
2613 ae->chart?ae->chart:"NOCHART", ae->name,
2615 rrdcalc_status2string(ae->old_status),
2616 rrdcalc_status2string(ae->new_status)
2619 health_alarm_execute(host, ae);
// Processes the pending entries of the alarm log of host and then trims the
// log when it holds more than host->health_log.max entries.
// NOTE(review): several lines of this function (declarations, the early
// return and most of the purge loop) are not visible in this excerpt.
2622 static inline void health_alarm_log_process(RRDHOST *host) {
// static: the id at which the scan stops is remembered between calls and is
// shared by every host this function is called for
2623 static uint32_t stop_at_id = 0;
// starts as the id of the list head (0 for an empty log); read before the
// lock below is taken
2624 uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
2625 time_t now = time(NULL);
2627 pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
// walk from the list head until an entry with an id below stop_at_id
// NOTE(review): the declaration of ae and the head of the condition below
// are not visible in this excerpt.
2630 for(ae = host->health_log.alarms; ae && ae->unique_id >= stop_at_id ; ae = ae->next) {
// only entries that are neither PROCESSED nor UPDATED are considered
2632 !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
2633 !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
// track the lowest id among the entries considered
2636 if(unlikely(ae->unique_id < first_waiting))
2637 first_waiting = ae->unique_id;
// entries still inside their delay window are left for a later call
2639 if(likely(now >= ae->delay_up_to_timestamp))
2640 health_process_notifications(host, ae);
2644 // remember this for the next iteration
2645 stop_at_id = first_waiting;
2647 pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
// NOTE(review): the statement executed here is not visible in this excerpt;
// presumably `return`, since nothing needs trimming.
2649 if(host->health_log.count <= host->health_log.max)
2652 // cleanup excess entries in the log
2653 pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
// skip the first two thirds of max entries from the list head; afterwards
// last is the final skipped entry and ae the first entry beyond it
2655 ALARM_ENTRY *last = NULL;
2656 unsigned int count = host->health_log.max * 2 / 3;
2657 for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
// NOTE(review): the statement executed here is not visible in this excerpt;
// presumably it terminates the list at last.
2659 if(ae && last && last->next == ae)
// NOTE(review): the loop around the following lines and the other release
// calls are not visible in this excerpt; presumably every entry from ae
// onwards is released, using t to step to the next one.
2665 debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
2667 ALARM_ENTRY *t = ae->next;
2673 freez(ae->recipient);
2680 host->health_log.count--;
2683 pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
// Decides whether the alarm rc can be evaluated at time now.
// rc       - the alarm to check
// now      - the time of the current health iteration
// next_run - in/out: lowered to rc->next_update when the alarm is due
//            earlier than the currently planned next run
// health_main() treats a zero result as "not runnable".
// NOTE(review): the return statements are not visible in this excerpt;
// presumably every test below returns 0 and the end of the function
// returns non-zero.
2686 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
2687 if(unlikely(!rc->rrdset)) {
2688 debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
// the alarm is not due yet
2692 if(unlikely(rc->next_update > now)) {
2693 if (unlikely(*next_run > rc->next_update)) {
2694 // update the next_run time of the main loop
2695 // to run this alarm precisely the time required
2696 *next_run = rc->next_update;
2699 debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
2703 if(unlikely(!rc->update_every)) {
2704 debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
// the chart must have a collection time and counter_done of at least 2
2708 if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
2709 debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
// update_every here is the one of the chart, not rc->update_every
2713 int update_every = rc->rrdset->update_every;
2714 time_t first = rrdset_first_entry_t(rc->rrdset);
2715 time_t last = rrdset_last_entry_t(rc->rrdset);
// only the lower bound is enforced; the upper bound test is commented out
2717 if(unlikely(now + update_every < first /* || now - update_every > last */)) {
// NOTE(review): the head of this call is not visible in this excerpt;
// presumably debug(D_HEALTH ...).
2719 , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
2720 , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first
2721 , (unsigned long) last);
// alarms with a database lookup also need the looked-up time to fall within
// the first/last entries of the chart, with one update_every of tolerance
// on each side
2725 if(RRDCALC_HAS_DB_LOOKUP(rc)) {
2726 time_t needed = now + rc->before + rc->after;
2728 if(needed + update_every < first || needed - update_every > last) {
2730 , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
2731 , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first
2732 , (unsigned long) last);
2740 void *health_main(void *ptr) {
2743 info("HEALTH thread created with task id %d", gettid());
2745 if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
2746 error("Cannot set pthread cancel type to DEFERRED.");
2748 if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
2749 error("Cannot set pthread cancel state to ENABLE.");
2751 int min_run_every = (int)config_get_number("health", "run at least every seconds", 10);
2752 if(min_run_every < 1) min_run_every = 1;
2754 BUFFER *wb = buffer_create(100);
2756 unsigned int loop = 0;
2757 while(health_enabled && !netdata_exit) {
2759 debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
2761 int oldstate, runnable = 0;
2762 time_t now = time(NULL);
2763 time_t next_run = now + min_run_every;
2766 if(unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
2767 error("Cannot set pthread cancel state to DISABLE.");
2769 rrdhost_rdlock(&localhost);
2771 // the first loop is to lookup values from the db
2772 for(rc = localhost.alarms; rc; rc = rc->next) {
2773 if(unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
2774 if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))
2775 rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE;
2780 rc->old_value = rc->value;
2781 rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE;
2783 // 1. if there is database lookup, do it
2784 // 2. if there is calculation expression, run it
2786 if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
2787 /* time_t old_db_timestamp = rc->db_before; */
2788 int value_is_null = 0;
2790 int ret = rrd2value(rc->rrdset, wb, &rc->value,
2791 rc->dimensions, 1, rc->after, rc->before, rc->group,
2792 rc->options, &rc->db_after, &rc->db_before, &value_is_null);
2794 if (unlikely(ret != 200)) {
2795 // database lookup failed
2798 debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
2800 if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) {
2801 rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
2802 error("Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
2805 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))
2806 rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
2808 /* - RRDCALC_FLAG_DB_STALE not currently used
2809 if (unlikely(old_db_timestamp == rc->db_before)) {
2810 // database is stale
2812 debug(D_HEALTH, "Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
2814 if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
2815 rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
2816 error("Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
2819 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
2820 rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
2823 if (unlikely(value_is_null)) {
2824 // collected value is null
2828 debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
2829 rc->chart?rc->chart:"NOCHART", rc->name);
2831 if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) {
2832 rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
2833 error("Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
2834 rc->chart?rc->chart:"NOCHART", rc->name);
2837 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))
2838 rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
2840 debug(D_HEALTH, "Health alarm '%s.%s': database lookup gave value "
2841 CALCULATED_NUMBER_FORMAT, rc->chart?rc->chart:"NOCHART", rc->name, rc->value);
2844 if(unlikely(rc->calculation)) {
2845 if (unlikely(!expression_evaluate(rc->calculation))) {
2846 // calculation failed
2850 debug(D_HEALTH, "Health alarm '%s.%s': expression '%s' failed: %s",
2851 rc->chart?rc->chart:"NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
2853 if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) {
2854 rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
2855 error("Health alarm '%s.%s': expression '%s' failed: %s",
2856 rc->chart?rc->chart:"NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
2860 if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))
2861 rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
2863 debug(D_HEALTH, "Health alarm '%s.%s': expression '%s' gave value "
2864 CALCULATED_NUMBER_FORMAT
2865 ": %s (source: %s)",
2866 rc->chart?rc->chart:"NOCHART", rc->name,
2867 rc->calculation->parsed_as,
2868 rc->calculation->result,
2869 buffer_tostring(rc->calculation->error_msg),
2873 rc->value = rc->calculation->result;
2877 rrdhost_unlock(&localhost);
2879 if(unlikely(runnable && !netdata_exit)) {
2880 rrdhost_rdlock(&localhost);
2882 for(rc = localhost.alarms; rc; rc = rc->next) {
2883 if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
2886 int warning_status = RRDCALC_STATUS_UNDEFINED;
2887 int critical_status = RRDCALC_STATUS_UNDEFINED;
2889 if(likely(rc->warning)) {
2890 if(unlikely(!expression_evaluate(rc->warning))) {
2891 // calculation failed
2893 debug(D_HEALTH, "Health alarm '%s.%s': warning expression failed with error: %s",
2894 rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
2896 if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) {
2897 rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
2898 error("Health alarm '%s.%s': warning expression failed with error: %s",
2899 rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
2903 if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))
2904 rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
2906 debug(D_HEALTH, "Health alarm '%s.%s': warning expression gave value "
2907 CALCULATED_NUMBER_FORMAT
2908 ": %s (source: %s)",
2909 rc->chart?rc->chart:"NOCHART", rc->name,
2910 rc->warning->result,
2911 buffer_tostring(rc->warning->error_msg),
2915 warning_status = rrdcalc_value2status(rc->warning->result);
2919 if(likely(rc->critical)) {
2920 if(unlikely(!expression_evaluate(rc->critical))) {
2921 // calculation failed
2923 debug(D_HEALTH, "Health alarm '%s.%s': critical expression failed with error: %s",
2924 rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
2926 if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) {
2927 rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
2928 error("Health alarm '%s.%s': critical expression failed with error: %s",
2929 rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
2933 if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))
2934 rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
2936 debug(D_HEALTH, "Health alarm '%s.%s': critical expression gave value "
2937 CALCULATED_NUMBER_FORMAT
2938 ": %s (source: %s)",
2939 rc->chart?rc->chart:"NOCHART", rc->name,
2940 rc->critical->result,
2941 buffer_tostring(rc->critical->error_msg),
2945 critical_status = rrdcalc_value2status(rc->critical->result);
2949 int status = RRDCALC_STATUS_UNDEFINED;
2951 switch(warning_status) {
2952 case RRDCALC_STATUS_CLEAR:
2953 status = RRDCALC_STATUS_CLEAR;
2956 case RRDCALC_STATUS_RAISED:
2957 status = RRDCALC_STATUS_WARNING;
2964 switch(critical_status) {
2965 case RRDCALC_STATUS_CLEAR:
2966 if(status == RRDCALC_STATUS_UNDEFINED)
2967 status = RRDCALC_STATUS_CLEAR;
2970 case RRDCALC_STATUS_RAISED:
2971 status = RRDCALC_STATUS_CRITICAL;
2978 if(status != rc->status) {
2981 if(now > rc->delay_up_to_timestamp) {
2982 rc->delay_up_current = rc->delay_up_duration;
2983 rc->delay_down_current = rc->delay_down_duration;
2985 rc->delay_up_to_timestamp = 0;
2988 rc->delay_up_current = (int)(rc->delay_up_current * rc->delay_multiplier);
2989 if(rc->delay_up_current > rc->delay_max_duration) rc->delay_up_current = rc->delay_max_duration;
2991 rc->delay_down_current = (int)(rc->delay_down_current * rc->delay_multiplier);
2992 if(rc->delay_down_current > rc->delay_max_duration) rc->delay_down_current = rc->delay_max_duration;
2995 if(status > rc->status)
2996 delay = rc->delay_up_current;
2998 delay = rc->delay_down_current;
3000 // COMMENTED: because we do need to send raising alarms
3001 // if(now + delay < rc->delay_up_to_timestamp)
3002 // delay = (int)(rc->delay_up_to_timestamp - now);
3004 rc->delay_last = delay;
3005 rc->delay_up_to_timestamp = now + delay;
3006 health_alarm_log(&localhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, rc->delay_last);
3007 rc->last_status_change = now;
3008 rc->status = status;
3011 rc->last_updated = now;
3012 rc->next_update = now + rc->update_every;
3014 if (next_run > rc->next_update)
3015 next_run = rc->next_update;
3018 rrdhost_unlock(&localhost);
3021 if (unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
3022 error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
3024 if(unlikely(netdata_exit))
3027 // execute notifications
3029 health_alarm_log_process(&localhost);
3031 if(unlikely(netdata_exit))
3035 if(now < next_run) {
3036 debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs",
3037 loop, (int) (next_run - now));
3038 sleep_usec(1000000 * (unsigned long long) (next_run - now));
3041 debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
3047 info("HEALTH thread exiting");