// File-level switches and globals for the health code in this file.
1 #define NETDATA_HEALTH_INTERNALS
// Length limit passed to snprintfz() when composing "<chart>.<variable>" names
// into stack buffers declared as RRDVAR_MAX_LENGTH + 1 bytes.
4 #define RRDVAR_MAX_LENGTH 1024
// Initialised to 1; health_init() overwrites it with the boolean
// "enabled" option of the "health" configuration section.
6 int default_localhost_health_enabled = 1;
8 // ----------------------------------------------------------------------------
// Checks the characters of `variable` against the allowed set:
// alphanumerics, '.' and '_'.
// NOTE(review): only the test is visible in this excerpt; what is done with a
// rejected character and what the function returns are not shown - confirm
// against the full file.
// NOTE(review): `*variable` is a plain char; <ctype.h> functions require a
// value representable as unsigned char (or EOF), so a cast to unsigned char
// would be safer for bytes >= 0x80.
11 inline int rrdvar_fix_name(char *variable) {
14 if (!isalnum(*variable) && *variable != '.' && *variable != '_') {
// Ordering function for RRDVAR entries: both arguments are treated as
// RRDVAR pointers. Entries are ordered by `hash` first; only when the hashes
// are equal is the (more expensive) strcmp() on `name` used as tie-breaker.
// Returns <0, 0 or >0 in the usual comparator convention.
25 int rrdvar_compare(void* a, void* b) {
26 if(((RRDVAR *)a)->hash < ((RRDVAR *)b)->hash) return -1;
27 else if(((RRDVAR *)a)->hash > ((RRDVAR *)b)->hash) return 1;
28 else return strcmp(((RRDVAR *)a)->name, ((RRDVAR *)b)->name);
// Inserts `rv` into the locked AVL index `tree` via avl_insert_lock().
// A debug message is emitted for the "already exists" case.
// NOTE(review): the condition guarding the message and the return statement
// are not visible in this excerpt; rrdvar_create_and_index() compares the
// result against `rv`, so presumably `ret` is returned - confirm.
31 static inline RRDVAR *rrdvar_index_add(avl_tree_lock *tree, RRDVAR *rv) {
32 RRDVAR *ret = (RRDVAR *)avl_insert_lock(tree, (avl *)(rv));
34 debug(D_VARIABLES, "Request to insert RRDVAR '%s' into index failed. Already exists.", rv->name);
// Removes `rv` from the locked AVL index `tree` via avl_remove_lock().
// An error is logged for the "not found" case.
// NOTE(review): the guarding condition and the return statement are not
// visible in this excerpt; callers test the result for NULL, so presumably
// `ret` is returned - confirm.
39 static inline RRDVAR *rrdvar_index_del(avl_tree_lock *tree, RRDVAR *rv) {
40 RRDVAR *ret = (RRDVAR *)avl_remove_lock(tree, (avl *)(rv));
42 error("Request to remove RRDVAR '%s' from index failed. Not Found.", rv->name);
// Looks up a variable by name in the locked AVL index `tree`.
//   name - variable name to search for (not modified; the const is cast away
//          only to fill the temporary search key)
//   hash - precomputed hash of `name`, or 0 to have it computed here with
//          simple_hash()
// Returns whatever avl_search_lock() yields for the temporary key, cast to
// RRDVAR *.
47 static inline RRDVAR *rrdvar_index_find(avl_tree_lock *tree, const char *name, uint32_t hash) {
49 tmp.name = (char *)name;
50 tmp.hash = (hash)?hash:simple_hash(tmp.name);
52 return (RRDVAR *)avl_search_lock(tree, (avl *)&tmp);
// Releases a variable: the visible part removes `rv` from `tree` with
// rrdvar_index_del() and logs an error (naming the host) when it was not
// indexed.
// NOTE(review): rrdvar_create_and_index() calls this with host == NULL and
// tree == NULL, yet the error path reads host->hostname; the guards and the
// actual memory release are in lines not visible in this excerpt - confirm.
55 static inline void rrdvar_free(RRDHOST *host, avl_tree_lock *tree, RRDVAR *rv) {
61 debug(D_VARIABLES, "Deleting variable '%s'", rv->name);
62 if(unlikely(!rrdvar_index_del(tree, rv)))
63 error("Attempted to delete variable '%s' from host '%s', but it is not found.", rv->name, host->hostname);
// Creates a variable named `name` in the index `tree` unless it already exists.
//   scope - label used only in debug messages ("local", "family", "host")
//   tree  - index to search and insert into
//   name  - requested name; a private copy is made and passed through
//           rrdvar_fix_name() before hashing and lookup
//   type, value - stored in the new RRDVAR (assignment lines not visible here)
// Flow visible in this excerpt: look the fixed name up; when absent, allocate
// a zeroed RRDVAR and insert it; if the insert hands back a different entry,
// the fresh RRDVAR is released again with rrdvar_free(NULL, NULL, rv).
// Per the in-body comment below, an already-existing variable must yield NULL
// so that callers never free an RRDVAR they do not own.
70 static inline RRDVAR *rrdvar_create_and_index(const char *scope, avl_tree_lock *tree, const char *name, int type, void *value) {
71 char *variable = strdupz(name);
72 rrdvar_fix_name(variable);
73 uint32_t hash = simple_hash(variable);
75 RRDVAR *rv = rrdvar_index_find(tree, variable, hash);
77 debug(D_VARIABLES, "Variable '%s' not found in scope '%s'. Creating a new one.", variable, scope);
79 rv = callocz(1, sizeof(RRDVAR));
85 RRDVAR *ret = rrdvar_index_add(tree, rv);
86 if(unlikely(ret != rv)) {
87 debug(D_VARIABLES, "Variable '%s' in scope '%s' already exists", variable, scope);
88 rrdvar_free(NULL, NULL, rv);
92 debug(D_VARIABLES, "Variable '%s' created in scope '%s'", variable, scope);
95 debug(D_VARIABLES, "Variable '%s' is already found in scope '%s'.", variable, scope);
101 // it must return NULL - not the existing variable - or double-free will happen
108 // ----------------------------------------------------------------------------
// Creates a custom host-level variable backed by a freshly allocated, zeroed
// calculated_number, registered in host->variables_root_index with type
// RRDVAR_TYPE_CALCULATED_ALLOCATED.
// When the name is already taken an error is logged and the existing entry is
// looked up again under the fixed-up name.
// NOTE(review): what happens to `v` and to the temporary `variable` copy on
// the already-exists path, and the return statement, are not visible in this
// excerpt - confirm nothing leaks.
111 RRDVAR *rrdvar_custom_host_variable_create(RRDHOST *host, const char *name) {
112 calculated_number *v = callocz(1, sizeof(calculated_number));
114 RRDVAR *rv = rrdvar_create_and_index("host", &host->variables_root_index, name, RRDVAR_TYPE_CALCULATED_ALLOCATED, v);
117 error("Requested variable '%s' already exists - possibly 2 plugins will be updating it at the same time", name);
119 char *variable = strdupz(name);
120 rrdvar_fix_name(variable);
121 uint32_t hash = simple_hash(variable);
123 rv = rrdvar_index_find(&host->variables_root_index, variable, hash);
// Removes a custom host variable by name.
// The name is copied and normalised with rrdvar_fix_name() before lookup in
// host->variables_root_index. Three failure cases are reported with error():
// the variable does not exist, it is not of type
// RRDVAR_TYPE_CALCULATED_ALLOCATED, or it cannot be removed from the index.
// NOTE(review): the early exits and the release of `variable` / the RRDVAR
// are in lines not visible in this excerpt.
129 void rrdvar_custom_host_variable_destroy(RRDHOST *host, const char *name) {
130 char *variable = strdupz(name);
131 rrdvar_fix_name(variable);
132 uint32_t hash = simple_hash(variable);
134 RRDVAR *rv = rrdvar_index_find(&host->variables_root_index, variable, hash);
138 error("Attempted to remove variable '%s' from host '%s', but it does not exist.", name, host->hostname);
142 if(rv->type != RRDVAR_TYPE_CALCULATED_ALLOCATED) {
143 error("Attempted to remove variable '%s' from host '%s', but it does not a custom allocated variable.", name, host->hostname);
147 if(!rrdvar_index_del(&host->variables_root_index, rv)) {
148 error("Attempted to remove variable '%s' from host '%s', but it cannot be found.", name, host->hostname);
// Sets the value of a custom host variable.
// Variables whose type is not RRDVAR_TYPE_CALCULATED_ALLOCATED are rejected
// with an error message; otherwise rv->value is taken as a pointer to the
// calculated_number storage.
// NOTE(review): the branch structure and the store through `v` are in lines
// not visible in this excerpt - presumably `*v = value`; confirm.
157 void rrdvar_custom_host_variable_set(RRDVAR *rv, calculated_number value) {
158 if(rv->type != RRDVAR_TYPE_CALCULATED_ALLOCATED)
159 error("requested to set variable '%s' to value " CALCULATED_NUMBER_FORMAT " but the variable is not a custom one.", rv->name, value);
161 calculated_number *v = rv->value;
166 // ----------------------------------------------------------------------------
// Reads the current value of a variable as a calculated_number.
// rv->value is an untyped pointer; each case casts it to the storage type
// that matches the variable's type tag (calculated_number, time_t,
// collected_number, total_number or int). Unknown type tags are reported
// with error().
// NOTE(review): the switch header, the per-case return statements and the
// value returned for unknown types are not visible in this excerpt.
169 static calculated_number rrdvar2number(RRDVAR *rv) {
171 case RRDVAR_TYPE_CALCULATED_ALLOCATED:
172 case RRDVAR_TYPE_CALCULATED: {
173 calculated_number *n = (calculated_number *)rv->value;
177 case RRDVAR_TYPE_TIME_T: {
178 time_t *n = (time_t *)rv->value;
182 case RRDVAR_TYPE_COLLECTED: {
183 collected_number *n = (collected_number *)rv->value;
187 case RRDVAR_TYPE_TOTAL: {
188 total_number *n = (total_number *)rv->value;
192 case RRDVAR_TYPE_INT: {
193 int *n = (int *)rv->value;
198 error("I don't know how to convert RRDVAR type %d to calculated_number", rv->type);
// Resolves `variable` for the alarm `rc` and stores its numeric value in
// *result.
// The lookup walks three indexes in order of increasing scope: the chart the
// alarm is linked to (rc->rrdset), that chart's family, and finally the host.
//   hash - precomputed hash of `variable` (rrdvar_index_find() computes it
//          when 0 is passed)
// NOTE(review): the found/not-found tests and the return values are in lines
// not visible in this excerpt; it is also not visible whether a NULL
// rc->rrdset is handled before `st` is dereferenced.
203 int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, calculated_number *result) {
204 RRDSET *st = rc->rrdset;
// 1. chart scope
209 rv = rrdvar_index_find(&st->variables_root_index, variable, hash);
211 *result = rrdvar2number(rv);
// 2. family scope
215 rv = rrdvar_index_find(&st->rrdfamily->variables_root_index, variable, hash);
217 *result = rrdvar2number(rv);
// 3. host scope
221 rv = rrdvar_index_find(&st->rrdhost->variables_root_index, variable, hash);
223 *result = rrdvar2number(rv);
230 // ----------------------------------------------------------------------------
// State handed to single_variable2json() through the AVL traversal callback's
// `data` pointer. The callback uses its `buf` member as the output buffer and
// its `counter` member to decide whether a leading comma is needed.
// NOTE(review): the member declarations are not visible in this excerpt.
233 struct variable2json_helper {
// AVL traversal callback that appends one `"name": value` JSON member to
// helper->buf.
//   entry - the RRDVAR being visited
//   data  - struct variable2json_helper * carrying the buffer and a counter
// A comma is emitted before the member whenever helper->counter is non-zero.
// NaN and infinite values are written as JSON null, since JSON has no
// representation for them; everything else is printed with 5 decimals.
// NOTE(review): the counter update and the return value are in lines not
// visible in this excerpt.
238 static int single_variable2json(void *entry, void *data) {
239 struct variable2json_helper *helper = (struct variable2json_helper *)data;
240 RRDVAR *rv = (RRDVAR *)entry;
241 calculated_number value = rrdvar2number(rv);
243 if(unlikely(isnan(value) || isinf(value)))
244 buffer_sprintf(helper->buf, "%s\n\t\t\"%s\": null", helper->counter?",":"", rv->name);
246 buffer_sprintf(helper->buf, "%s\n\t\t\"%s\": %0.5Lf", helper->counter?",":"", rv->name, (long double)value);
// Writes a JSON document into `buf` describing every variable visible from
// chart `st`, grouped in three objects: "chart_variables",
// "family_variables" and "host_variables". Each group is produced by
// traversing the corresponding index with single_variable2json().
// NOTE(review): chart id/name/context, family and hostname are inserted into
// the JSON with plain %s - no escaping is visible here; confirm these strings
// cannot contain '"' or '\'.
// NOTE(review): the helper initialiser and any counter reset between groups
// are in lines not visible in this excerpt.
253 void health_api_v1_chart_variables2json(RRDSET *st, BUFFER *buf) {
254 struct variable2json_helper helper = {
259 buffer_sprintf(buf, "{\n\t\"chart\": \"%s\",\n\t\"chart_name\": \"%s\",\n\t\"chart_context\": \"%s\",\n\t\"chart_variables\": {", st->id, st->name, st->context);
260 avl_traverse_lock(&st->variables_root_index, single_variable2json, (void *)&helper);
261 buffer_sprintf(buf, "\n\t},\n\t\"family\": \"%s\",\n\t\"family_variables\": {", st->family);
263 avl_traverse_lock(&st->rrdfamily->variables_root_index, single_variable2json, (void *)&helper);
264 buffer_sprintf(buf, "\n\t},\n\t\"host\": \"%s\",\n\t\"host_variables\": {", st->rrdhost->hostname);
266 avl_traverse_lock(&st->rrdhost->variables_root_index, single_variable2json, (void *)&helper);
267 buffer_strcat(buf, "\n\t}\n}\n");
271 // ----------------------------------------------------------------------------
272 // RRDDIMVAR management
273 // DIMENSION VARIABLES
// Length limit passed to snprintfz() when rrddimvar_create_variables()
// composes dimension variable keys; its stack buffer is RRDDIMVAR_ID_MAX + 1
// bytes.
275 #define RRDDIMVAR_ID_MAX 1024
// Tears down everything rrddimvar_create_variables() built for `rs`:
// the RRDVARs registered in the chart, family and host indexes, followed by
// the key strings. Every pointer is reset to NULL right after it is released
// so the function can be called again on the same RRDDIMVAR (it is invoked at
// the start of every re-creation).
// NOTE(review): the release of key_id / key_name is not visible in this
// excerpt - confirm it happens in the omitted lines.
277 static inline void rrddimvar_free_variables(RRDDIMVAR *rs) {
278 RRDDIM *rd = rs->rrddim;
279 RRDSET *st = rd->rrdset;
281 // CHART VARIABLES FOR THIS DIMENSION
283 rrdvar_free(st->rrdhost, &st->variables_root_index, rs->var_local_id);
284 rs->var_local_id = NULL;
286 rrdvar_free(st->rrdhost, &st->variables_root_index, rs->var_local_name);
287 rs->var_local_name = NULL;
289 // FAMILY VARIABLES FOR THIS DIMENSION
291 rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_id);
292 rs->var_family_id = NULL;
294 rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_name);
295 rs->var_family_name = NULL;
297 rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_contextid);
298 rs->var_family_contextid = NULL;
300 rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_contextname);
301 rs->var_family_contextname = NULL;
303 // HOST VARIABLES FOR THIS DIMENSION
305 rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartidid);
306 rs->var_host_chartidid = NULL;
308 rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartidname);
309 rs->var_host_chartidname = NULL;
311 rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartnameid);
312 rs->var_host_chartnameid = NULL;
314 rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartnamename);
315 rs->var_host_chartnamename = NULL;
// KEY STRINGS
325 freez(rs->key_fullidid);
326 rs->key_fullidid = NULL;
328 freez(rs->key_fullidname);
329 rs->key_fullidname = NULL;
331 freez(rs->key_contextid);
332 rs->key_contextid = NULL;
334 freez(rs->key_contextname);
335 rs->key_contextname = NULL;
337 freez(rs->key_fullnameid);
338 rs->key_fullnameid = NULL;
340 freez(rs->key_fullnamename);
341 rs->key_fullnamename = NULL;
// (Re)builds every key string and RRDVAR of the dimension variable `rs`.
// Any previous state is dropped first via rrddimvar_free_variables(), which
// makes this usable both for initial creation and for renames.
//
// Keys are composed in a RRDDIMVAR_ID_MAX-limited stack buffer and then
// duplicated on the heap:
//   key_id           = prefix + dimension id   + suffix
//   key_name         = prefix + dimension name + suffix
//   key_fullidid     = chart id      . key_id
//   key_fullidname   = chart id      . key_name
//   key_contextid    = chart context . key_id
//   key_contextname  = chart context . key_name
//   key_fullnameid   = chart name    . key_id
//   key_fullnamename = chart name    . key_name
//
// All RRDVARs share rs->type and point at the same rs->value. A var_* member
// ends up NULL when rrdvar_create_and_index() reports the name as taken.
344 static inline void rrddimvar_create_variables(RRDDIMVAR *rs) {
345 rrddimvar_free_variables(rs);
347 RRDDIM *rd = rs->rrddim;
348 RRDSET *st = rd->rrdset;
350 char buffer[RRDDIMVAR_ID_MAX + 1];
354 snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->id, rs->suffix);
355 rs->key_id = strdupz(buffer);
357 snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
358 rs->key_name = strdupz(buffer);
360 snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->key_id);
361 rs->key_fullidid = strdupz(buffer);
363 snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->key_name);
364 rs->key_fullidname = strdupz(buffer);
366 snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->context, rs->key_id);
367 rs->key_contextid = strdupz(buffer);
369 snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->context, rs->key_name);
370 rs->key_contextname = strdupz(buffer);
372 snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->key_id);
373 rs->key_fullnameid = strdupz(buffer);
375 snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->key_name);
376 rs->key_fullnamename = strdupz(buffer);
378 // CHART VARIABLES FOR THIS DIMENSION
379 // -----------------------------------
381 // dimensions are available as:
385 rs->var_local_id = rrdvar_create_and_index("local", &st->variables_root_index, rs->key_id, rs->type, rs->value);
386 rs->var_local_name = rrdvar_create_and_index("local", &st->variables_root_index, rs->key_name, rs->type, rs->value);
388 // FAMILY VARIABLES FOR THIS DIMENSION
389 // -----------------------------------
391 // dimensions are available as:
392 // - $id (only the first, when multiple overlap)
393 // - $name (only the first, when multiple overlap)
394 // - $chart-context.id
395 // - $chart-context.name
397 rs->var_family_id = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_id, rs->type, rs->value);
398 rs->var_family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_name, rs->type, rs->value);
399 rs->var_family_contextid = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_contextid, rs->type, rs->value);
400 rs->var_family_contextname = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_contextname, rs->type, rs->value);
402 // HOST VARIABLES FOR THIS DIMENSION
403 // -----------------------------------
405 // dimensions are available as:
409 // - $chart-name.name
411 rs->var_host_chartidid = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullidid, rs->type, rs->value);
412 rs->var_host_chartidname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullidname, rs->type, rs->value);
413 rs->var_host_chartnameid = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullnameid, rs->type, rs->value);
414 rs->var_host_chartnamename = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullnamename, rs->type, rs->value);
// Allocates a dimension variable for `rd` and registers its RRDVARs.
//   type    - RRDVAR type tag describing what `value` points to
//   prefix  - text placed before the dimension id/name in the key; NULL is
//             treated as ""
//   suffix  - text placed after the dimension id/name in the key; NULL is
//             treated as ""
//   value   - pointer to the storage the variables expose
//   options - stored as-is in rs->options
// The new entry takes private copies of prefix/suffix and has its `next`
// pointed at the current head of rd->variables.
// NOTE(review): the assignments of type/value/rrddim, the update of
// rd->variables and the return statement are in lines not visible in this
// excerpt.
417 RRDDIMVAR *rrddimvar_create(RRDDIM *rd, int type, const char *prefix, const char *suffix, void *value, uint32_t options) {
418 RRDSET *st = rd->rrdset;
420 debug(D_VARIABLES, "RRDDIMSET create for chart id '%s' name '%s', dimension id '%s', name '%s%s%s'", st->id, st->name, rd->id, (prefix)?prefix:"", rd->name, (suffix)?suffix:"");
422 if(!prefix) prefix = "";
423 if(!suffix) suffix = "";
425 RRDDIMVAR *rs = (RRDDIMVAR *)callocz(1, sizeof(RRDDIMVAR));
427 rs->prefix = strdupz(prefix);
428 rs->suffix = strdupz(suffix);
432 rs->options = options;
435 rs->next = rd->variables;
438 rrddimvar_create_variables(rs);
// Rebuilds the keys and RRDVARs of the dimension variables attached to `rd`
// by calling rrddimvar_create_variables(), starting from rd->variables.
// NOTE(review): the loop header that advances rs/next is not visible in this
// excerpt - presumably every entry of the list is visited; confirm.
443 void rrddimvar_rename_all(RRDDIM *rd) {
444 RRDSET *st = rd->rrdset;
445 debug(D_VARIABLES, "RRDDIMSET rename for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
447 RRDDIMVAR *rs, *next = rd->variables;
450 rrddimvar_create_variables(rs);
// Destroys one dimension variable: releases its RRDVARs and keys, then
// unlinks it from the singly-linked list rd->variables.
// Unlinking handles two cases: `rs` is the list head, or the predecessor is
// found by walking the list; an error is logged when `rs` is not in the list.
// NOTE(review): the not-found error message prints rs->key_name after
// rrddimvar_free_variables(rs) has already run; whether key_name is still
// valid at that point depends on lines not visible in this excerpt - confirm.
// NOTE(review): the final release of prefix/suffix and of `rs` itself is not
// visible here.
454 void rrddimvar_free(RRDDIMVAR *rs) {
455 RRDDIM *rd = rs->rrddim;
456 RRDSET *st = rd->rrdset;
457 debug(D_VARIABLES, "RRDDIMSET free for chart id '%s' name '%s', dimension id '%s', name '%s', prefix='%s', suffix='%s'", st->id, st->name, rd->id, rd->name, rs->prefix, rs->suffix);
459 rrddimvar_free_variables(rs);
461 if(rd->variables == rs) {
462 debug(D_VARIABLES, "RRDDIMSET removing first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
463 rd->variables = rs->next;
466 debug(D_VARIABLES, "RRDDIMSET removing non-first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
// walk to the entry whose `next` is rs (t becomes NULL when rs is not linked)
468 for (t = rd->variables; t && t->next != rs; t = t->next) ;
469 if(!t) error("RRDDIMVAR '%s' not found in dimension '%s/%s' variables linked list", rs->key_name, st->id, rd->id);
470 else t->next = rs->next;
478 // ----------------------------------------------------------------------------
479 // RRDSETVAR management
// Tears down what rrdsetvar_create_variables() built for the chart variable
// `rs`: its RRDVARs in the chart, family and host indexes, and the two
// composed key strings. Pointers are reset to NULL after release so the
// function is safe to call before every re-creation.
// NOTE(review): the reset of rs->var_host after it is freed is not visible in
// this excerpt - confirm it exists in the omitted lines.
482 static inline void rrdsetvar_free_variables(RRDSETVAR *rs) {
483 RRDSET *st = rs->rrdset;
487 rrdvar_free(st->rrdhost, &st->variables_root_index, rs->var_local);
488 rs->var_local = NULL;
492 rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family);
493 rs->var_family = NULL;
495 rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host);
500 rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_name);
501 rs->var_family_name = NULL;
503 rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_name);
504 rs->var_host_name = NULL;
508 freez(rs->key_fullid);
509 rs->key_fullid = NULL;
511 freez(rs->key_fullname);
512 rs->key_fullname = NULL;
// (Re)builds the keys and RRDVARs of the chart variable `rs`, after dropping
// any previous state with rrdsetvar_free_variables().
//   key_fullid   = chart id   . variable
//   key_fullname = chart name . variable
// The variable is then published as:
//   - rs->variable in the chart ("local") index
//   - key_fullid and key_fullname in the family index
//   - key_fullid and key_fullname in the host index
// All RRDVARs share rs->type and rs->value.
515 static inline void rrdsetvar_create_variables(RRDSETVAR *rs) {
516 rrdsetvar_free_variables(rs);
518 RRDSET *st = rs->rrdset;
522 char buffer[RRDVAR_MAX_LENGTH + 1];
523 snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->id, rs->variable);
524 rs->key_fullid = strdupz(buffer);
526 snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rs->variable);
527 rs->key_fullname = strdupz(buffer);
531 rs->var_local = rrdvar_create_and_index("local", &st->variables_root_index, rs->variable, rs->type, rs->value);
535 rs->var_family = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_fullid, rs->type, rs->value);
536 rs->var_family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_fullname, rs->type, rs->value);
540 rs->var_host = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullid, rs->type, rs->value);
541 rs->var_host_name = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullname, rs->type, rs->value);
// Allocates a chart-level variable for `st` and registers its RRDVARs.
//   variable - base name; a private copy is stored in rs->variable
//   type     - RRDVAR type tag describing what `value` points to
//   value    - pointer to the storage the variables expose
//   options  - stored as-is in rs->options
// The new entry has its `next` pointed at the current head of st->variables.
// NOTE(review): the assignments of type/value/rrdset, the update of
// st->variables and the return statement are in lines not visible in this
// excerpt.
545 RRDSETVAR *rrdsetvar_create(RRDSET *st, const char *variable, int type, void *value, uint32_t options) {
546 debug(D_VARIABLES, "RRDVARSET create for chart id '%s' name '%s' with variable name '%s'", st->id, st->name, variable);
547 RRDSETVAR *rs = (RRDSETVAR *)callocz(1, sizeof(RRDSETVAR));
549 rs->variable = strdupz(variable);
552 rs->options = options;
555 rs->next = st->variables;
558 rrdsetvar_create_variables(rs);
// Rebuilds the keys and RRDVARs of the chart variables attached to `st`
// (starting from st->variables), then re-runs alarm matching for the chart
// with rrdsetcalc_link_matching().
// NOTE(review): the loop header that advances rs/next is not visible in this
// excerpt - presumably every entry of the list is visited; confirm.
// NOTE(review): rrdsetcalc_link_matching() is commented as requiring the
// RRDHOST to be locked; no locking is visible here, so the caller presumably
// holds the lock - confirm.
563 void rrdsetvar_rename_all(RRDSET *st) {
564 debug(D_VARIABLES, "RRDSETVAR rename for chart id '%s' name '%s'", st->id, st->name);
566 RRDSETVAR *rs, *next = st->variables;
569 rrdsetvar_create_variables(rs);
572 rrdsetcalc_link_matching(st);
// Destroys one chart variable: unlinks it from the singly-linked list
// st->variables (head case or predecessor search, with an error logged when
// it is not in the list) and then releases its RRDVARs and keys.
// Unlinking happens before rrdsetvar_free_variables() so that
// rs->key_fullname is still valid for the error message.
// NOTE(review): the final release of rs->variable and of `rs` itself is not
// visible in this excerpt.
575 void rrdsetvar_free(RRDSETVAR *rs) {
576 RRDSET *st = rs->rrdset;
577 debug(D_VARIABLES, "RRDSETVAR free for chart id '%s' name '%s', variable '%s'", st->id, st->name, rs->variable);
579 if(st->variables == rs) {
580 st->variables = rs->next;
// walk to the entry whose `next` is rs (t becomes NULL when rs is not linked)
584 for (t = st->variables; t && t->next != rs; t = t->next);
585 if(!t) error("RRDSETVAR '%s' not found in chart '%s' variables linked list", rs->key_fullname, st->id);
586 else t->next = rs->next;
589 rrdsetvar_free_variables(rs);
595 // ----------------------------------------------------------------------------
596 // RRDCALC management
// Maps an RRDCALC_STATUS_* value to a constant string for logs and
// notifications. Handled values: REMOVED, UNDEFINED, UNINITIALIZED, CLEAR,
// RAISED, WARNING and CRITICAL; any other value is reported with error().
// NOTE(review): most of the returned strings, and the value returned for an
// unknown status, are in lines not visible in this excerpt.
598 inline const char *rrdcalc_status2string(int status) {
600 case RRDCALC_STATUS_REMOVED:
603 case RRDCALC_STATUS_UNDEFINED:
606 case RRDCALC_STATUS_UNINITIALIZED:
607 return "UNINITIALIZED";
609 case RRDCALC_STATUS_CLEAR:
612 case RRDCALC_STATUS_RAISED:
615 case RRDCALC_STATUS_WARNING:
618 case RRDCALC_STATUS_CRITICAL:
622 error("Unknown alarm status %d", status);
// Attaches alarm `rc` to chart `st`.
// Steps visible in this excerpt:
//   1. stamp rc->last_status_change with the current time
//   2. push rc at the head of the chart's doubly-linked alarm list
//   3. raise rc->update_every to the chart's update_every when it is smaller
//   4. copy the alarm's green/red thresholds to the chart when the chart's
//      own threshold is still NaN
//   5. publish rc->value as a variable named after the alarm in the chart and
//      family indexes, and as "<chart id>.<alarm>" / "<chart name>.<alarm>"
//      in the host index
//   6. default rc->units to a copy of the chart units when the alarm has none
// NOTE(review): the assignment of rc->rrdset and st->alarms, the store of
// st->red, and the call that consumes `now` and RRDCALC_STATUS_UNINITIALIZED
// at the end are in lines not visible here - presumably an alarm-log entry is
// created; confirm.
// NOTE(review): the debug messages in step 4 print rc->rrdset->green/red,
// which rely on rc->rrdset having been set earlier in the omitted lines.
627 static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
628 debug(D_HEALTH, "Health linking alarm '%s.%s' to chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, st->rrdhost->hostname);
630 rc->last_status_change = now_realtime_sec();
633 rc->rrdset_next = st->alarms;
634 rc->rrdset_prev = NULL;
637 rc->rrdset_next->rrdset_prev = rc;
641 if(rc->update_every < rc->rrdset->update_every) {
642 error("Health alarm '%s.%s' has update every %d, less than chart update every %d. Setting alarm update frequency to %d.", rc->rrdset->id, rc->name, rc->update_every, rc->rrdset->update_every, rc->rrdset->update_every);
643 rc->update_every = rc->rrdset->update_every;
646 if(!isnan(rc->green) && isnan(st->green)) {
647 debug(D_HEALTH, "Health alarm '%s.%s' green threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->green, rc->green);
648 st->green = rc->green;
651 if(!isnan(rc->red) && isnan(st->red)) {
652 debug(D_HEALTH, "Health alarm '%s.%s' red threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->red, rc->red);
656 rc->local = rrdvar_create_and_index("local", &st->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
657 rc->family = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
659 char fullname[RRDVAR_MAX_LENGTH + 1];
660 snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->id, rc->name);
661 rc->hostid = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
663 snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rc->name);
664 rc->hostname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
666 if(!rc->units) rc->units = strdupz(st->units);
669 time_t now = now_realtime_sec();
680 now - rc->last_status_change,
684 RRDCALC_STATUS_UNINITIALIZED,
// Tests whether alarm `rc` targets chart `st`: rc->chart must equal either
// the chart id or the chart name. The hash comparison comes first so strcmp()
// only runs on hash matches.
// NOTE(review): other code in this file treats rc->chart as possibly NULL
// (it prints "NOCHART"); strcmp() here would be reached with a NULL chart
// only on a hash match - confirm rc->hash_chart cannot match in that case.
// NOTE(review): the return statements are in lines not visible in this
// excerpt.
694 static inline int rrdcalc_is_matching_this_rrdset(RRDCALC *rc, RRDSET *st) {
695 if( (rc->hash_chart == st->hash && !strcmp(rc->chart, st->id)) ||
696 (rc->hash_chart == st->hash_name && !strcmp(rc->chart, st->name)))
// Scans every alarm of the chart's host and links to `st` those that match
// it (see rrdcalc_is_matching_this_rrdset()). Alarms that already have an
// rrdset are tested separately first.
// NOTE(review): the statement executed for already-linked alarms is not
// visible in this excerpt - presumably `continue`; confirm.
702 // this has to be called while the RRDHOST is locked
703 inline void rrdsetcalc_link_matching(RRDSET *st) {
704 // debug(D_HEALTH, "find matching alarms for chart '%s'", st->id);
707 for(rc = st->rrdhost->alarms; rc ; rc = rc->next) {
708 if(unlikely(rc->rrdset))
711 if(unlikely(rrdcalc_is_matching_this_rrdset(rc, st)))
712 rrdsetcalc_link(st, rc);
// Detaches alarm `rc` from the chart it is linked to.
// Steps visible in this excerpt:
//   - an unlinked alarm is reported via debug() and error()
//   - `now` and RRDCALC_STATUS_REMOVED feed a call whose name is not visible
//     here (presumably an alarm-log entry recording the removal - confirm)
//   - rc is spliced out of the chart's doubly-linked alarm list, with
//     st->alarms updated for the head case
//   - the four RRDVARs published by rrdsetcalc_link() are released
// The alarm itself is kept in the host list, as the trailing comment explains.
// NOTE(review): the NULL checks around the list splicing, the reset of
// rc->rrdset and of rc->local/family/hostid/hostname are in lines not visible
// in this excerpt.
716 // this has to be called while the RRDHOST is locked
717 inline void rrdsetcalc_unlink(RRDCALC *rc) {
718 RRDSET *st = rc->rrdset;
721 debug(D_HEALTH, "Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
722 error("Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
727 time_t now = now_realtime_sec();
738 now - rc->last_status_change,
742 RRDCALC_STATUS_REMOVED,
751 RRDHOST *host = st->rrdhost;
753 debug(D_HEALTH, "Health unlinking alarm '%s.%s' from chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, host->hostname);
757 rc->rrdset_prev->rrdset_next = rc->rrdset_next;
760 rc->rrdset_next->rrdset_prev = rc->rrdset_prev;
763 st->alarms = rc->rrdset_next;
765 rc->rrdset_prev = rc->rrdset_next = NULL;
767 rrdvar_free(st->rrdhost, &st->variables_root_index, rc->local);
770 rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rc->family);
773 rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostid);
776 rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostname);
781 // RRDCALC will remain in RRDHOST
782 // so that if the matching chart is found in the future
783 // it will be applied automatically
// Searches the alarms linked to chart `st` for one called `name`.
// The name is hashed once up front; strcmp() runs only on hash matches.
// NOTE(review): the return statements are in lines not visible in this
// excerpt - presumably the matching RRDCALC, or NULL when none; confirm.
786 RRDCALC *rrdcalc_find(RRDSET *st, const char *name) {
788 uint32_t hash = simple_hash(name);
790 for( rc = st->alarms; rc ; rc = rc->rrdset_next ) {
791 if(unlikely(rc->hash == hash && !strcmp(rc->name, name)))
// Checks whether `host` already has an alarm `name` defined for `chart`.
//   hash_chart / hash_name - precomputed hashes, or 0 to compute them here
// A NULL `chart` is reported with error(). A match requires both hashes and
// both strings to be equal, and is reported with debug() and error().
// Alarms whose own chart is NULL never match.
// NOTE(review): the return values are in lines not visible in this excerpt;
// rrdcalc_create() treats a non-zero result as "exists".
798 inline int rrdcalc_exists(RRDHOST *host, const char *chart, const char *name, uint32_t hash_chart, uint32_t hash_name) {
801 if(unlikely(!chart)) {
802 error("attempt to find RRDCALC '%s' without giving a chart name", name);
806 if(unlikely(!hash_chart)) hash_chart = simple_hash(chart);
807 if(unlikely(!hash_name)) hash_name = simple_hash(name);
809 // make sure it does not already exist
810 for(rc = host->alarms; rc ; rc = rc->next) {
811 if (unlikely(rc->chart && rc->hash == hash_name && rc->hash_chart == hash_chart && !strcmp(name, rc->name) && !strcmp(chart, rc->chart))) {
812 debug(D_HEALTH, "Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
813 error("Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
// Picks the numeric id for the alarm `chart`.`name` on `host`.
// The host's alarm log is searched for an earlier entry of the same alarm;
// when one is found and `next_event_id` is non-NULL, *next_event_id is set to
// that entry's alarm_event_id + 1. When the log has no such entry, a fresh id
// is taken from host->health_log.next_alarm_id (post-incremented).
// NOTE(review): the return inside the loop is not visible in this excerpt -
// presumably the logged entry's alarm id is reused; confirm.
821 inline uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const char *name, uint32_t *next_event_id) {
823 uint32_t hash_chart = simple_hash(chart);
824 uint32_t hash_name = simple_hash(name);
826 // re-use old IDs, by looking them up in the alarm log
828 for(ae = host->health_log.alarms; ae ;ae = ae->next) {
829 if(unlikely(ae->hash_name == hash_name && ae->hash_chart == hash_chart && !strcmp(name, ae->name) && !strcmp(chart, ae->chart))) {
830 if(next_event_id) *next_event_id = ae->alarm_event_id + 1;
836 return host->health_log.next_alarm_id++;
// Second stage of alarm creation, run once `rc` is fully populated.
//   1. rrdhost_check_rdlock(host) is called first
//   2. wires each parsed expression (calculation, warning, critical) to the
//      alarm's own status/value/db_after/db_before fields and back-pointer,
//      so expression evaluation reads and writes this alarm's state
//   3. links rc into host->alarms (the visible loop walks to the last entry)
//   4. links rc to the first chart of the host that matches it
// NOTE(review): the NULL guards for rc->warning / rc->critical, the actual
// list append and the loop exit after linking are in lines not visible in
// this excerpt.
839 inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) {
840 rrdhost_check_rdlock(host);
842 if(rc->calculation) {
843 rc->calculation->status = &rc->status;
844 rc->calculation->this = &rc->value;
845 rc->calculation->after = &rc->db_after;
846 rc->calculation->before = &rc->db_before;
847 rc->calculation->rrdcalc = rc;
851 rc->warning->status = &rc->status;
852 rc->warning->this = &rc->value;
853 rc->warning->after = &rc->db_after;
854 rc->warning->before = &rc->db_before;
855 rc->warning->rrdcalc = rc;
859 rc->critical->status = &rc->status;
860 rc->critical->this = &rc->value;
861 rc->critical->after = &rc->db_after;
862 rc->critical->before = &rc->db_before;
863 rc->critical->rrdcalc = rc;
866 // link it to the host
867 if(likely(host->alarms)) {
870 for(t = host->alarms; t && t->next ; t = t->next) ;
877 // link it to its chart
879 for(st = host->rrdset_root; st ; st = st->next) {
880 if(rrdcalc_is_matching_this_rrdset(rc, st)) {
881 rrdsetcalc_link(st, rc);
// Instantiates an alarm for `chart` from template `rt`.
// Nothing is created when rrdcalc_exists() reports the alarm as present.
// Otherwise a zeroed RRDCALC is filled in:
//   - id / next_event_id come from rrdcalc_get_unique_id()
//   - name and chart are private copies, each with its hash
//   - thresholds, delays, lookup settings and options are copied by value
//   - optional strings (dimensions, exec, recipient, source, units, info)
//     are duplicated only when the template has them
//   - expressions are re-parsed from the template's expression source text,
//     with an error logged when parsing fails
// The alarm is then handed to rrdcalc_create_part2() for linking.
// NOTE(review): the guards around the warning/critical re-parse, the
// remaining debug() arguments and the return statements are in lines not
// visible in this excerpt.
887 static inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart) {
889 debug(D_HEALTH, "Health creating dynamic alarm (from template) '%s.%s'", chart, rt->name);
891 if(rrdcalc_exists(host, chart, rt->name, 0, 0))
894 RRDCALC *rc = callocz(1, sizeof(RRDCALC));
895 rc->next_event_id = 1;
896 rc->id = rrdcalc_get_unique_id(host, chart, rt->name, &rc->next_event_id);
897 rc->name = strdupz(rt->name);
898 rc->hash = simple_hash(rc->name);
899 rc->chart = strdupz(chart);
900 rc->hash_chart = simple_hash(rc->chart);
902 if(rt->dimensions) rc->dimensions = strdupz(rt->dimensions);
904 rc->green = rt->green;
909 rc->delay_up_duration = rt->delay_up_duration;
910 rc->delay_down_duration = rt->delay_down_duration;
911 rc->delay_max_duration = rt->delay_max_duration;
912 rc->delay_multiplier = rt->delay_multiplier;
914 rc->group = rt->group;
915 rc->after = rt->after;
916 rc->before = rt->before;
917 rc->update_every = rt->update_every;
918 rc->options = rt->options;
920 if(rt->exec) rc->exec = strdupz(rt->exec);
921 if(rt->recipient) rc->recipient = strdupz(rt->recipient);
922 if(rt->source) rc->source = strdupz(rt->source);
923 if(rt->units) rc->units = strdupz(rt->units);
924 if(rt->info) rc->info = strdupz(rt->info);
926 if(rt->calculation) {
927 rc->calculation = expression_parse(rt->calculation->source, NULL, NULL);
929 error("Health alarm '%s.%s': failed to parse calculation expression '%s'", chart, rt->name, rt->calculation->source);
932 rc->warning = expression_parse(rt->warning->source, NULL, NULL);
934 error("Health alarm '%s.%s': failed to re-parse warning expression '%s'", chart, rt->name, rt->warning->source);
937 rc->critical = expression_parse(rt->critical->source, NULL, NULL);
939 error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", chart, rt->name, rt->critical->source);
942 debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
943 (rc->chart)?rc->chart:"NOCHART",
945 (rc->exec)?rc->exec:"DEFAULT",
946 (rc->recipient)?rc->recipient:"DEFAULT",
953 (rc->dimensions)?rc->dimensions:"NONE",
955 (rc->calculation)?rc->calculation->parsed_as:"NONE",
956 (rc->warning)?rc->warning->parsed_as:"NONE",
957 (rc->critical)?rc->critical->parsed_as:"NONE",
959 rc->delay_up_duration,
960 rc->delay_down_duration,
961 rc->delay_max_duration,
965 rrdcalc_create_part2(host, rc);
// Destroys alarm `rc` of `host`.
//   1. unlinks it from its chart, when linked
//   2. unlinks it from the singly-linked list host->alarms (head case, or
//      predecessor search); errors are logged when rc is not in the list or
//      the host has no alarms at all
//   3. releases the three expressions and the alarm's strings
// NOTE(review): the condition between the predecessor search and
// `last->next = rc->next`, several freez() calls and the release of `rc`
// itself are in lines not visible in this excerpt.
969 void rrdcalc_free(RRDHOST *host, RRDCALC *rc) {
972 debug(D_HEALTH, "Health removing alarm '%s.%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
974 // unlink it from RRDSET
975 if(rc->rrdset) rrdsetcalc_unlink(rc);
977 // unlink it from RRDHOST
978 if(unlikely(rc == host->alarms))
979 host->alarms = rc->next;
981 else if(likely(host->alarms)) {
982 RRDCALC *t, *last = host->alarms;
// advance until t is rc (last is then its predecessor) or the list ends
983 for(t = last->next; t && t != rc; last = t, t = t->next) ;
985 last->next = rc->next;
987 error("Cannot unlink alarm '%s.%s' from host '%s': not found", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
990 error("Cannot unlink unlink '%s.%s' from host '%s': This host does not have any calculations", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
992 expression_free(rc->calculation);
993 expression_free(rc->warning);
994 expression_free(rc->critical);
999 freez(rc->dimensions);
1001 freez(rc->recipient);
1008 // ----------------------------------------------------------------------------
1009 // RRDCALCTEMPLATE management
// Applies every matching alarm template of the chart's host to chart `st`.
// A template matches when its context equals the chart context (hash first,
// then strcmp) and, if the template has a family pattern, the chart family
// matches that pattern. For each match an alarm is created with
// rrdcalc_create(); failures are logged. With NETDATA_INTERNAL_CHECKS
// defined, an additional check verifies the new alarm is linked to `st`.
// NOTE(review): the test on `rc` preceding the error and the closing #endif
// are in lines not visible in this excerpt.
1011 void rrdcalctemplate_link_matching(RRDSET *st) {
1012 RRDCALCTEMPLATE *rt;
1014 for(rt = st->rrdhost->templates; rt ; rt = rt->next) {
1015 if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context)
1016 && (!rt->family_pattern || simple_pattern_matches(rt->family_pattern, st->family))) {
1017 RRDCALC *rc = rrdcalc_create(st->rrdhost, rt, st->id);
1019 error("Health tried to create alarm from template '%s', but it failed", rt->name);
1021 #ifdef NETDATA_INTERNAL_CHECKS
1022 else if(rc->rrdset != st)
1023 error("Health alarm '%s.%s' should be linked to chart '%s', but it is not", rc->chart?rc->chart:"NOCHART", rc->name, st->id);
// Destroys alarm template `rt` of `host`.
//   1. unlinks it from the singly-linked list host->templates (head case, or
//      predecessor search); an error is logged when it is not found
//   2. releases its expressions, the family match string and compiled
//      pattern, and its other strings
// NOTE(review): several freez() calls and the release of `rt` itself are in
// lines not visible in this excerpt.
1029 inline void rrdcalctemplate_free(RRDHOST *host, RRDCALCTEMPLATE *rt) {
1030 debug(D_HEALTH, "Health removing template '%s' of host '%s'", rt->name, host->hostname);
1032 if(host->templates) {
1033 if(host->templates == rt) {
1034 host->templates = rt->next;
1037 RRDCALCTEMPLATE *t, *last = host->templates;
// advance until t is rt (last is then its predecessor) or the list ends
1038 for (t = last->next; t && t != rt; last = t, t = t->next ) ;
1039 if(last && last->next == rt) {
1040 last->next = rt->next;
1044 error("Cannot find RRDCALCTEMPLATE '%s' linked in host '%s'", rt->name, host->hostname);
1048 expression_free(rt->calculation);
1049 expression_free(rt->warning);
1050 expression_free(rt->critical);
1052 freez(rt->family_match);
1053 simple_pattern_free(rt->family_pattern);
1057 freez(rt->recipient);
1062 freez(rt->dimensions);
1066 // ----------------------------------------------------------------------------
1067 // health initialization
// Returns the directory holding health configuration files.
// The default is "<netdata_configured_config_dir>/health.d"; it is passed to
// config_get() as the fallback for option "health configuration directory"
// in section "health", and config_get()'s result is returned.
// The local buffer is only the default argument; the returned pointer is
// whatever config_get() hands back (ownership not visible here - presumably
// owned by the config subsystem; confirm before freeing).
1069 inline char *health_config_dir(void) {
1070 char buffer[FILENAME_MAX + 1];
1071 snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_config_dir);
1072 return config_get("health", "health configuration directory", buffer);
// One-time health initialisation.
// Reads the boolean "enabled" option of section "health" (default 1) into
// default_localhost_health_enabled; a debug message is emitted when health is
// disabled. It also creates "<netdata_configured_varlib_dir>/health" with
// mode 0770; an already-existing directory is fine, any other mkdir()
// failure is fatal.
// NOTE(review): whether the disabled branch returns before the directory is
// created is in lines not visible in this excerpt.
1075 void health_init(void) {
1076 debug(D_HEALTH, "Health configuration initializing");
1078 if(!(default_localhost_health_enabled = config_get_boolean("health", "enabled", 1))) {
1079 debug(D_HEALTH, "Health is disabled.");
1083 char pathname[FILENAME_MAX + 1];
1084 snprintfz(pathname, FILENAME_MAX, "%s/health", netdata_configured_varlib_dir);
1085 if(mkdir(pathname, 0770) == -1 && errno != EEXIST)
1086 fatal("Cannot create directory '%s'.", pathname);
1089 // ----------------------------------------------------------------------------
1090 // re-load health configuration
// Releases all alarm templates and alarms of `host`.
// Templates are freed head-first until host->templates is empty
// (rrdcalctemplate_free() advances the head); alarms are freed with
// rrdcalc_free() on host->alarms.
// The "_nolock" suffix suggests the caller must already hold the host lock;
// health_reload_host() calls it between rrdhost_wrlock() and
// rrdhost_unlock().
// NOTE(review): the loop around the rrdcalc_free() call is not visible in
// this excerpt - presumably `while(host->alarms)`; confirm.
1092 inline void health_free_host_nolock(RRDHOST *host) {
1093 while(host->templates)
1094 rrdcalctemplate_free(host, host->templates);
1097 rrdcalc_free(host, host->alarms);
// Reloads the health configuration of one host.
// Sequence visible in this excerpt:
//   1. free all current templates and alarms (under the host write lock)
//   2. flag every alarm-log entry whose new_status is not REMOVED with
//      HEALTH_ENTRY_FLAG_UPDATED
//   3. walk all charts to reset their thresholds (loop body not visible)
//   4. read the configuration directory again (under the host write lock)
//   5. for every chart, link matching alarms and templates; the write lock is
//      taken and released once per chart
// NOTE(review): steps 2 and 3 and the chart iteration in step 5 read
// host->health_log.alarms / host->rrdset_root with no locking visible in this
// excerpt - confirm this is safe against concurrent modification.
1100 void health_reload_host(RRDHOST *host) {
1101 char *path = health_config_dir();
1103 // free all running alarms
1104 rrdhost_wrlock(host);
1105 health_free_host_nolock(host);
1106 rrdhost_unlock(host);
1108 // invalidate all previous entries in the alarm log
1110 for(t = host->health_log.alarms ; t ; t = t->next) {
1111 if(t->new_status != RRDCALC_STATUS_REMOVED)
1112 t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
1115 // reset all thresholds to all charts
1117 for(st = host->rrdset_root; st ; st = st->next) {
1122 // load the new alarms
1123 rrdhost_wrlock(host);
1124 health_readdir(host, path);
1125 rrdhost_unlock(host);
1127 // link the loaded alarms to their charts
1128 for(st = host->rrdset_root; st ; st = st->next) {
1129 rrdhost_wrlock(host);
1131 rrdsetcalc_link_matching(st);
1132 rrdcalctemplate_link_matching(st);
1134 rrdhost_unlock(host);
// Reloads the health configuration of every host, walking the host list
// from `localhost` through the `next` pointers and calling
// health_reload_host() on each.
1138 void health_reload(void) {
1141 for(host = localhost; host ; host = host->next)
1142 health_reload_host(host);
1145 // ----------------------------------------------------------------------------
1146 // health main thread and friends
// Converts the numeric result of an expression into an alarm status:
//   NaN or infinite -> RRDCALC_STATUS_UNDEFINED
//   non-zero        -> RRDCALC_STATUS_RAISED
//   zero            -> RRDCALC_STATUS_CLEAR
// The NaN/inf test must come first: NaN is non-zero in a boolean context and
// would otherwise be reported as RAISED.
1148 static inline int rrdcalc_value2status(calculated_number n) {
1149 if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED;
1150 if(n) return RRDCALC_STATUS_RAISED;
1151 return RRDCALC_STATUS_CLEAR;
// Length limit for the notification command line composed by
// health_alarm_execute(); its static buffer is ALARM_EXEC_COMMAND_LENGTH + 1
// bytes.
1154 #define ALARM_EXEC_COMMAND_LENGTH 8192
// Decide whether the alarm log entry `ae` should trigger a notification and,
// if so, run the notification command for it.
//
//   host - supplies the default exec command, the default recipient and the
//          hostname passed to the command
//   ae   - the alarm log entry; its flags, exec_run_timestamp and exec_code
//          are updated here
//
// What the visible lines do:
//   - the entry is always flagged HEALTH_ENTRY_FLAG_PROCESSED
//   - statuses below RRDCALC_STATUS_CLEAR, and CLEAR-or-lower statuses of
//     entries flagged HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION, only produce a
//     debug message
//   - unless NO_CLEAR_NOTIFICATION is set, older entries (ae->next, ...) are
//     searched for the same alarm_id with HEALTH_ENTRY_FLAG_EXEC_RUN set, to
//     detect a repeated status or a first-time CLEAR
//   - the command line is formatted into a static buffer, started with
//     mypopen(), one line of its output is read and discarded, and the exit
//     code returned by mypclose() is stored in ae->exec_code (non-zero also
//     sets HEALTH_ENTRY_FLAG_EXEC_FAILED)
//   - health_alarm_log_save(host, ae) is called at the end
//
// NOTE(review): this excerpt omits short original lines (the embedded numbers
// skip), including the statements that leave each "not sending" branch and
// the declarations of `t` and `command_pid`; presumably those branches skip
// the command execution - confirm against the complete file.
// NOTE(review): command_to_run has static storage, so the buffer is shared
// between calls.
// NOTE(review): the values are interpolated between single quotes into a
// command string; confirm that none of them can contain a single quote.
1156 static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
1157 ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
1159 if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
1160 // do not send notifications for internal statuses
// ae->chart may be unset (see the fallback used for the command line below),
// so never hand a possibly-NULL pointer to %s
1161 debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae->chart?ae->chart:"NOCHART", ae->name, rrdcalc_status2string(ae->new_status));
1165 if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
1166 // do not send notifications for disabled statuses
1167 debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae->chart?ae->chart:"NOCHART", ae->name, rrdcalc_status2string(ae->new_status));
1168 // mark it as run, so that we will send the same alarm if it happens again
1172 // find the previous notification for the same alarm
1173 // which we have run the exec script
1174 // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set
1175 if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
1176 uint32_t id = ae->alarm_id;
// walk towards older entries; stops on the first one of this alarm that ran the script
1178 for(t = ae->next; t ; t = t->next) {
1179 if(t->alarm_id == id && (t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN))
1184 // we have executed this alarm notification in the past
1185 if(t && t->new_status == ae->new_status) {
1186 // don't send the notification for the same status again
1187 debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart?ae->chart:"NOCHART", ae->name
1188 , rrdcalc_status2string(ae->new_status));
1193 // we have not executed this alarm notification in the past
1194 // so, don't send CLEAR notifications
1195 if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
1196 debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
1197 , ae->chart?ae->chart:"NOCHART", ae->name, rrdcalc_status2string(ae->new_status));
1203 static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1];
// per-alarm settings win over the host defaults
1206 const char *exec = (ae->exec) ? ae->exec : host->health_default_exec;
1207 const char *recipient = (ae->recipient) ? ae->recipient : host->health_default_recipient;
// NOTE(review): several argument lines of this call are not shown in this excerpt
1209 snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s' '%s' '%s'",
1216 (unsigned long)ae->when,
// same fallback spelling as the rest of the file ("NOCHART")
1218 ae->chart?ae->chart:"NOCHART",
1219 ae->family?ae->family:"NOFAMILY",
1220 rrdcalc_status2string(ae->new_status),
1221 rrdcalc_status2string(ae->old_status),
1224 ae->source?ae->source:"UNKNOWN",
1225 (uint32_t)ae->duration,
1226 (uint32_t)ae->non_clear_duration,
1227 ae->units?ae->units:"",
1228 ae->info?ae->info:"",
1229 ae->new_value_string,
1230 ae->old_value_string
// record that the script was started, and when
1233 ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
1234 ae->exec_run_timestamp = now_realtime_sec();
1236 debug(D_HEALTH, "executing command '%s'", command_to_run);
1237 FILE *fp = mypopen(command_to_run, &command_pid);
1239 error("HEALTH: Cannot popen(\"%s\", \"r\").", command_to_run);
1242 debug(D_HEALTH, "HEALTH reading from command");
// the command buffer is reused for the output; the read is bounded by the
// buffer's own length rather than by an unrelated constant
1243 char *s = fgets(command_to_run, ALARM_EXEC_COMMAND_LENGTH, fp);
1245 ae->exec_code = mypclose(fp, command_pid);
1246 debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
1248 if(ae->exec_code != 0)
1249 ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
// persist the updated entry
1252 health_alarm_log_save(host, ae);
// Emit a debug line describing the status transition recorded in `ae`
// (the chart name falls back to "NOCHART" when unset) and then hand the
// entry to health_alarm_execute().
// NOTE(review): one argument line of the debug() call and its closing
// parenthesis are not shown in this excerpt (the numbering skips 1259 and 1262).
1256 static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
1257 debug(D_HEALTH, "Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
1258 ae->chart?ae->chart:"NOCHART", ae->name,
1260 rrdcalc_status2string(ae->old_status),
1261 rrdcalc_status2string(ae->new_status)
1264 health_alarm_execute(host, ae);
// Process the alarm log of `host` in two phases:
//   1. under the read lock, walk the log from its head and call
//      health_process_notifications() for entries that are neither
//      PROCESSED nor UPDATED and whose delay_up_to_timestamp has passed
//   2. when the log holds more than health_log.max entries, take the write
//      lock and release entries beyond roughly two thirds of `max`
// NOTE(review): this excerpt omits short original lines (the embedded numbers
// skip), e.g. the declaration of `ae`, the `if(` that opens the flag test,
// the statement after the count check and parts of the cleanup loop.
1267 static inline void health_alarm_log_process(RRDHOST *host) {
// NOTE(review): function-static, so the same value is used for every host
// this function is called with - confirm this is intended
1268 static uint32_t stop_at_id = 0;
// NOTE(review): the list head is read here, before the read lock is taken
1269 uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
1270 time_t now = now_realtime_sec();
1272 pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
// the walk stops at the first entry whose unique_id is below stop_at_id
1275 for(ae = host->health_log.alarms; ae && ae->unique_id >= stop_at_id ; ae = ae->next) {
1277 !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
1278 !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
// track the lowest unique_id among the entries that pass the flag test
1281 if(unlikely(ae->unique_id < first_waiting))
1282 first_waiting = ae->unique_id;
// entries still inside their delay window are left for a later call
1284 if(likely(now >= ae->delay_up_to_timestamp))
1285 health_process_notifications(host, ae);
1289 // remember this for the next iteration
1290 stop_at_id = first_waiting;
1292 pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
// NOTE(review): the statement controlled by this test is not shown;
// presumably an early return when the log is within its limit
1294 if(host->health_log.count <= host->health_log.max)
1297 // cleanup excess entries in the log
1298 pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
1300 ALARM_ENTRY *last = NULL;
// number of entries, counted from the head, that are stepped over before cleanup starts
1301 unsigned int count = host->health_log.max * 2 / 3;
// empty-bodied loop: afterwards `ae` is the first entry past the stepped-over
// ones and `last` is the entry just before it
1302 for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
1304 if(ae && last && last->next == ae)
1310 debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
// keep the successor before the current entry's members are released
1312 ALARM_ENTRY *t = ae->next;
1318 freez(ae->recipient);
1322 freez(ae->old_value_string);
1323 freez(ae->new_value_string);
1327 host->health_log.count--;
1330 pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
// Check whether alarm `rc` can be evaluated at time `now`.
//
//   rc       - the alarm to check
//   now      - the time of the current health iteration
//   next_run - in/out: lowered to rc->next_update when the alarm is not due
//              yet and is scheduled earlier than the current *next_run
//
// Each test below logs a debug message naming the reason the alarm is not
// examined: not linked to a chart, not due yet, no update frequency, chart
// not fully collected, or the wanted time outside the data the chart holds.
// NOTE(review): this excerpt omits short original lines (the embedded numbers
// skip), including every return statement; presumably each failed test
// returns 0 and the function returns non-zero otherwise - confirm against
// the complete file.
1333 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
1334 if(unlikely(!rc->rrdset)) {
1335 debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
1339 if(unlikely(rc->next_update > now)) {
1340 if (unlikely(*next_run > rc->next_update)) {
1341 // update the next_run time of the main loop
1342 // to run this alarm precisely the time required
1343 *next_run = rc->next_update;
1346 debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
1350 if(unlikely(!rc->update_every)) {
1351 debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
// the chart needs a collection timestamp and a counter_done of at least 2
1355 if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
1356 debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
// time range currently covered by the chart, and the chart's own update interval
1360 int update_every = rc->rrdset->update_every;
1361 time_t first = rrdset_first_entry_t(rc->rrdset);
1362 time_t last = rrdset_last_entry_t(rc->rrdset);
// only the lower bound is checked here; the upper-bound test is commented out
1364 if(unlikely(now + update_every < first /* || now - update_every > last */)) {
1366 , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
1367 , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first
1368 , (unsigned long) last);
1372 if(RRDCALC_HAS_DB_LOOKUP(rc)) {
// the lookup is offset from `now` by rc->before and rc->after; the resulting
// time must be inside the chart's range, with one update interval of slack
// on each side
1373 time_t needed = now + rc->before + rc->after;
1375 if(needed + update_every < first || needed - update_every > last) {
1377 , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
1378 , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first
1379 , (unsigned long) last);
// Entry point of the health monitoring thread.
//
//   ptr - the thread's struct netdata_static_thread; its `enabled` member is
//         cleared when the loop ends
//
// Setup: sets the cancel type to DEFERRED and the cancel state to ENABLE,
// reads "run at least every seconds" from the "health" configuration section
// (default 10, clamped to at least 1) and creates a scratch BUFFER.
//
// Each iteration, until netdata_exit is set:
//   - cancellation is disabled for the duration of the work
//   - for every host with health enabled:
//       pass 1 (host read lock): alarms that rrdcalc_isrunnable() accepts are
//         flagged RRDCALC_FLAG_RUNNABLE, their database lookup is performed
//         with rrd2value() and their calculation expression is evaluated
//       pass 2 (host read lock, only when something is runnable): warning and
//         critical expressions are evaluated and combined into a status; on
//         a status change the notification delay is (re)computed; the next
//         update time of the alarm is scheduled
//       health_alarm_log_process() then runs the pending notifications
//   - the previous cancel state is restored and the thread sleeps until the
//     earliest scheduled run
//
// Each error() call is guarded by a per-alarm RRDCALC_FLAG_*_ERROR flag that
// is set on failure and cleared on success, so a persistent failure is
// reported through error() once while debug() reports it every time.
//
// NOTE(review): this excerpt omits short original lines (the embedded numbers
// skip): closing braces, `continue`/`break` statements, several declarations
// (`host`, `rc`, `delay`), counters, `else` keywords, the terminator of the
// disabled block comment and the first line of the call that records a status
// change. The comments below describe only what the visible lines establish.
1387 void *health_main(void *ptr) {
1388 struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
1390 info("HEALTH thread created with task id %d", gettid());
1392 if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
1393 error("Cannot set pthread cancel type to DEFERRED.");
1395 if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
1396 error("Cannot set pthread cancel state to ENABLE.");
// upper bound, in seconds, on the time between two iterations
1398 int min_run_every = (int)config_get_number("health", "run at least every seconds", 10);
1399 if(min_run_every < 1) min_run_every = 1;
// scratch buffer handed to rrd2value() for every database lookup
1401 BUFFER *wb = buffer_create(100);
1403 unsigned int loop = 0;
1404 while(!netdata_exit) {
1406 debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
1408 int oldstate, runnable = 0;
1409 time_t now = now_realtime_sec();
// latest acceptable start of the next iteration; alarms may pull it earlier
1410 time_t next_run = now + min_run_every;
// no cancellation while alarms are being evaluated
1413 if(unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
1414 error("Cannot set pthread cancel state to DISABLE.");
1417 for(host = localhost; host ; host = host->next) {
1418 if(unlikely(!host->health_enabled)) continue;
1420 rrdhost_rdlock(host);
1422 // the first loop is to lookup values from the db
1423 for(rc = host->alarms; rc; rc = rc->next) {
1424 if(unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
// not runnable now: drop a RUNNABLE flag left over from an earlier iteration
1425 if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))
1426 rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE;
// keep the previous value so a status change can report old and new values
1431 rc->old_value = rc->value;
1432 rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE;
1434 // 1. if there is database lookup, do it
1435 // 2. if there is calculation expression, run it
1437 if(unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
1438 /* time_t old_db_timestamp = rc->db_before; */
1439 int value_is_null = 0;
1441 int ret = rrd2value(rc->rrdset, wb, &rc->value, rc->dimensions, 1, rc->after, rc->before, rc->group, rc->options, &rc->db_after, &rc->db_before, &value_is_null);
// any return code other than 200 is treated as a failed lookup
1443 if(unlikely(ret != 200)) {
1444 // database lookup failed
1447 debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret);
1449 if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) {
1450 rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
1451 error("Health on host '%s', alarm '%s.%s': database lookup returned error %d", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret);
1454 else if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))
1455 rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
1457 /* - RRDCALC_FLAG_DB_STALE not currently used
1458 if (unlikely(old_db_timestamp == rc->db_before)) {
1459 // database is stale
1461 debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
1463 if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
1464 rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
1465 error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
1468 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
1469 rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
1472 if(unlikely(value_is_null)) {
1473 // collected value is null
1477 debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name);
1479 if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) {
1480 rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
1481 error("Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name);
1484 else if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))
1485 rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
1487 debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " CALCULATED_NUMBER_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->value);
1490 if(unlikely(rc->calculation)) {
1491 if(unlikely(!expression_evaluate(rc->calculation))) {
1492 // calculation failed
1496 debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
1498 if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) {
1499 rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
// arguments follow the format string: host first, then chart, then alarm name
1500 error("Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
1504 if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))
1505 rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
1507 debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
1508 CALCULATED_NUMBER_FORMAT
1509 ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name
1510 , rc->calculation->parsed_as, rc->calculation->result,
1511 buffer_tostring(rc->calculation->error_msg), rc->source
// the result of the calculation replaces the looked-up value
1514 rc->value = rc->calculation->result;
1518 rrdhost_unlock(host);
// second pass: evaluate warning / critical expressions of the runnable alarms
1520 if(unlikely(runnable && !netdata_exit)) {
1521 rrdhost_rdlock(host);
1523 for(rc = host->alarms; rc; rc = rc->next) {
1524 if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
1527 int warning_status = RRDCALC_STATUS_UNDEFINED;
1528 int critical_status = RRDCALC_STATUS_UNDEFINED;
1530 if(likely(rc->warning)) {
1531 if(unlikely(!expression_evaluate(rc->warning))) {
1532 // calculation failed
1534 debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
1536 if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) {
1537 rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
1538 error("Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
1542 if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))
1543 rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
1545 debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source);
1547 warning_status = rrdcalc_value2status(rc->warning->result);
1551 if(likely(rc->critical)) {
1552 if(unlikely(!expression_evaluate(rc->critical))) {
1553 // calculation failed
1555 debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
1557 if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) {
1558 rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
1559 error("Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
1563 if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))
1564 rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
1566 debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name , rc->critical->result, buffer_tostring(rc->critical->error_msg), rc->source);
1568 critical_status = rrdcalc_value2status(rc->critical->result);
// combine the two results: the warning result is applied first, then the
// critical one; a raised critical overrides, a clear critical only fills in
// an otherwise undefined status
1572 int status = RRDCALC_STATUS_UNDEFINED;
1574 switch(warning_status) {
1575 case RRDCALC_STATUS_CLEAR:
1576 status = RRDCALC_STATUS_CLEAR;
1579 case RRDCALC_STATUS_RAISED:
1580 status = RRDCALC_STATUS_WARNING;
1587 switch(critical_status) {
1588 case RRDCALC_STATUS_CLEAR:
1589 if(status == RRDCALC_STATUS_UNDEFINED)
1590 status = RRDCALC_STATUS_CLEAR;
1593 case RRDCALC_STATUS_RAISED:
1594 status = RRDCALC_STATUS_CRITICAL;
1601 if(status != rc->status) {
// past the previous delay window: start again from the configured durations
1604 if(now > rc->delay_up_to_timestamp) {
1605 rc->delay_up_current = rc->delay_up_duration;
1606 rc->delay_down_current = rc->delay_down_duration;
1608 rc->delay_up_to_timestamp = 0;
// scale both delays by the multiplier, capped at the maximum duration
// (NOTE(review): the keyword introducing this branch is not shown here)
1611 rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier);
1612 if(rc->delay_up_current > rc->delay_max_duration)
1613 rc->delay_up_current = rc->delay_max_duration;
1615 rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier);
1616 if(rc->delay_down_current > rc->delay_max_duration)
1617 rc->delay_down_current = rc->delay_max_duration;
// a higher status value uses the "up" delay, anything else the "down" delay
1620 if(status > rc->status)
1621 delay = rc->delay_up_current;
1623 delay = rc->delay_down_current;
1625 // COMMENTED: because we do need to send raising alarms
1626 // if(now + delay < rc->delay_up_to_timestamp)
1627 // delay = (int)(rc->delay_up_to_timestamp - now);
1629 rc->delay_last = delay;
1630 rc->delay_up_to_timestamp = now + delay;
// arguments of the call that records the status change
// (NOTE(review): the line naming the called function is not shown here)
1632 host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id
1633 , rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change
1634 , rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info
1635 , rc->delay_last, (rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)
1636 ? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0
1638 rc->last_status_change = now;
1639 rc->status = status;
// schedule the next evaluation and wake the main loop in time for it
1642 rc->last_updated = now;
1643 rc->next_update = now + rc->update_every;
1645 if(next_run > rc->next_update)
1646 next_run = rc->next_update;
1649 rrdhost_unlock(host);
1652 if(unlikely(netdata_exit))
1655 // execute notifications
1657 health_alarm_log_process(host);
1659 if(unlikely(netdata_exit))
// work done: restore the cancel state saved at the top of the iteration
1664 if(unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
1665 error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
1667 if(unlikely(netdata_exit))
// re-read the clock: the evaluation above may have taken a while
1670 now = now_realtime_sec();
1671 if(now < next_run) {
1672 debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
1673 sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now));
1676 debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
1681 info("HEALTH thread exiting");
// mark this static thread as no longer running
1683 static_thread->enabled = 0;