// NOTE(review): presumably consumed by the health header to expose its internal
// declarations to this file - confirm against the header, it is not visible here
1 #define NETDATA_HEALTH_INTERNALS
// process-wide default for health monitoring: starts enabled (1) and is
// overwritten by health_init() with the value of the "enabled" option
4 int default_health_enabled = 1;
6 // ----------------------------------------------------------------------------
7 // health initialization
// Returns the directory that holds the health configuration files.
// The default is <netdata_configured_config_dir>/health.d; the effective value is
// whatever config_get() returns for the "health configuration directory" option
// of CONFIG_SECTION_HEALTH.
// NOTE(review): buffer lives on the stack of this function, so the returned pointer
// is only valid if config_get() never hands back the default pointer itself - confirm
9 inline char *health_config_dir(void) {
10 char buffer[FILENAME_MAX + 1];
// build the default path, used when the option is not configured
11 snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_config_dir);
12 return config_get(CONFIG_SECTION_HEALTH, "health configuration directory", buffer);
// Initializes health monitoring: reads the boolean option "enabled" of
// CONFIG_SECTION_HEALTH (default 1) and stores it in default_health_enabled.
15 void health_init(void) {
16 debug(D_HEALTH, "Health configuration initializing");
// the assignment inside the condition is intentional: store the setting, then test it
18 if(!(default_health_enabled = config_get_boolean(CONFIG_SECTION_HEALTH, "enabled", 1))) {
19 debug(D_HEALTH, "Health is disabled.");
24 // ----------------------------------------------------------------------------
25 // re-load health configuration
// Re-loads the health configuration of one host:
//  - frees the alarm templates and alarms currently attached to the host
//  - flags the existing alarm log entries as updated
//  - reads the configuration again from the directory given by health_config_dir()
//  - links the freshly loaded alarms and templates to the charts of the host
27 void health_reload_host(RRDHOST *host) {
// check: health monitoring is disabled for this host
28 if(unlikely(!host->health_enabled))
31 char *path = health_config_dir();
33 // free all running alarms
// rrdcalctemplate_free() is called on the list head until the list is empty
36 while(host->templates)
37 rrdcalctemplate_free(host, host->templates);
40 rrdcalc_free(host, host->alarms);
44 // invalidate all previous entries in the alarm log
// every entry whose new status is not RRDCALC_STATUS_REMOVED gets HEALTH_ENTRY_FLAG_UPDATED
46 for(t = host->health_log.alarms ; t ; t = t->next) {
47 if(t->new_status != RRDCALC_STATUS_REMOVED)
48 t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
52 // reset all thresholds to all charts
54 rrdset_foreach_read(st, host) {
60 // load the new alarms
62 health_readdir(host, path);
64 // link the loaded alarms to their charts
// each chart is matched against both the alarms and the alarm templates
65 rrdset_foreach_write(st, host) {
66 rrdsetcalc_link_matching(st);
67 rrdcalctemplate_link_matching(st);
// Re-loads the health configuration of all hosts, by calling
// health_reload_host() for every host visited by rrdhost_foreach_read().
73 void health_reload(void) {
78 rrdhost_foreach_read(host)
79 health_reload_host(host);
84 // ----------------------------------------------------------------------------
85 // health main thread and friends
// Maps the numeric result of an expression to an alarm status:
//   NaN or infinite     -> RRDCALC_STATUS_UNDEFINED
//   any other non-zero  -> RRDCALC_STATUS_RAISED
//   zero                -> RRDCALC_STATUS_CLEAR
87 static inline int rrdcalc_value2status(calculated_number n) {
// NaN / infinity must be tested first, since both are non-zero
88 if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED;
89 if(n) return RRDCALC_STATUS_RAISED;
90 return RRDCALC_STATUS_CLEAR;
// maximum length of the notification command line built by health_alarm_execute();
// the buffer that holds it is declared one byte larger than this
93 #define ALARM_EXEC_COMMAND_LENGTH 8192
// Decides whether a notification has to be sent for alarm log entry ae and, if so,
// runs the notification command and records its outcome on the entry.
//   host - supplies the default exec command and the default recipient
//   ae   - the alarm log entry to notify about; its flags, exec_run_timestamp and
//          exec_code are updated here
// The entry is always flagged HEALTH_ENTRY_FLAG_PROCESSED, whatever the decision.
95 static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
96 ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
// statuses ordered below RRDCALC_STATUS_CLEAR are treated as internal
98 if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
99 // do not send notifications for internal statuses
100 debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
// CLEAR (or lower) on an alarm flagged HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION
104 if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
105 // do not send notifications for disabled statuses
106 debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
107 // mark it as run, so that we will send the same alarm if it happens again
111 // find the previous notification for the same alarm
112 // which we have run the exec script
113 // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set
114 if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
115 uint32_t id = ae->alarm_id;
// walk the entries that follow ae in the log, looking for one with the same
// alarm id that has HEALTH_ENTRY_FLAG_EXEC_RUN set
// NOTE(review): presumably the entries after ae are the older ones - confirm
117 for(t = ae->next; t ; t = t->next) {
118 if(t->alarm_id == id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
123 // we have executed this alarm notification in the past
124 if(t && t->new_status == ae->new_status) {
125 // don't send the notification for the same status again
126 debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name
127 , rrdcalc_status2string(ae->new_status));
132 // we have not executed this alarm notification in the past
133 // so, don't send CLEAR notifications
134 if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
135 debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
136 , ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
// static: one buffer reused by every call of this function
142 static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1];
// values set on the entry take precedence over the defaults of the host
145 const char *exec = (ae->exec) ? ae->exec : host->health_default_exec;
146 const char *recipient = (ae->recipient) ? ae->recipient : host->health_default_recipient;
// the command line: every argument after the executable is wrapped in single quotes
148 snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s' '%s' '%s'",
155 (unsigned long)ae->when,
// NOTE(review): "NOCAHRT" looks like a typo of "NOCHART", the placeholder used by the
// other messages of this file; it is passed to the notification command, so fix deliberately
157 ae->chart?ae->chart:"NOCAHRT",
158 ae->family?ae->family:"NOFAMILY",
159 rrdcalc_status2string(ae->new_status),
160 rrdcalc_status2string(ae->old_status),
163 ae->source?ae->source:"UNKNOWN",
164 (uint32_t)ae->duration,
165 (uint32_t)ae->non_clear_duration,
166 ae->units?ae->units:"",
167 ae->info?ae->info:"",
168 ae->new_value_string,
// flag the entry and timestamp it before the command is started
172 ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
173 ae->exec_run_timestamp = now_realtime_sec();
175 debug(D_HEALTH, "executing command '%s'", command_to_run);
176 FILE *fp = mypopen(command_to_run, &command_pid);
178 error("HEALTH: Cannot popen(\"%s\", \"r\").", command_to_run);
181 debug(D_HEALTH, "HEALTH reading from command");
// the command buffer is reused to receive the output of the command
// NOTE(review): the limit given to fgets() is FILENAME_MAX while the buffer holds
// ALARM_EXEC_COMMAND_LENGTH + 1 bytes - confirm FILENAME_MAX can never exceed that
182 char *s = fgets(command_to_run, FILENAME_MAX, fp);
// keep the exit code of the command on the entry; non-zero marks the run as failed
184 ae->exec_code = mypclose(fp, command_pid);
185 debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
187 if(ae->exec_code != 0)
188 ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
// hand the updated entry to health_alarm_log_save()
191 health_alarm_log_save(host, ae);
// Logs the status transition of alarm log entry ae (debug level) and then
// passes the entry to health_alarm_execute().
195 static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
196 debug(D_HEALTH, "Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
197 ae->chart?ae->chart:"NOCHART", ae->name,
199 rrdcalc_status2string(ae->old_status),
200 rrdcalc_status2string(ae->new_status)
203 health_alarm_execute(host, ae);
// Processes the alarm log of a host in two steps:
//  1. under the read lock, entries that are neither PROCESSED nor UPDATED are passed to
//     health_process_notifications() once their delay_up_to_timestamp has been reached
//  2. when the log holds more than health_log.max entries, the write lock is taken and
//     entries beyond the first 2/3 of health_log.max are freed
206 static inline void health_alarm_log_process(RRDHOST *host) {
// id at which the scan stops; it is kept across calls
// NOTE(review): being a function-level static, this value is shared by all hosts this
// function is called for - confirm this is intended when more than one host exists
207 static uint32_t stop_at_id = 0;
// start from the id of the first entry of the log (0 when the log is empty)
208 uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
209 time_t now = now_realtime_sec();
211 pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
// scan from the head of the log, for as long as the ids are not below stop_at_id
214 for(ae = host->health_log.alarms; ae && ae->unique_id >= stop_at_id ; ae = ae->next) {
216 !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
217 !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
// track the lowest id among the entries still waiting to be processed
220 if(unlikely(ae->unique_id < first_waiting))
221 first_waiting = ae->unique_id;
// notify only when the delay of the entry has expired
223 if(likely(now >= ae->delay_up_to_timestamp))
224 health_process_notifications(host, ae);
228 // remember this for the next iteration
229 stop_at_id = first_waiting;
231 pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
// check: the log is still within its configured size
233 if(host->health_log.count <= host->health_log.max)
236 // cleanup excess entries in the log
237 pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
239 ALARM_ENTRY *last = NULL;
// number of entries to keep: two thirds of the maximum
240 unsigned int count = host->health_log.max * 2 / 3;
// the loop has no body: it only advances, leaving last on the final entry to keep
// and ae on the first entry to remove
241 for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
243 if(ae && last && last->next == ae)
249 debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
// take the successor before the entry is freed
251 ALARM_ENTRY *t = ae->next;
253 health_alarm_log_free_one_nochecks_nounlink(ae);
256 host->health_log.count--;
259 pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
// Checks whether alarm rc can be evaluated at time now.
//   rc       - the alarm to check
//   now      - the current time, in seconds
//   next_run - in/out: lowered to rc->next_update when the alarm is due earlier than
//              the time currently stored in it
// The caller treats a zero result as "not runnable". Each check below logs the
// reason it matched at debug level.
262 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
// check: the alarm is not linked to a chart
263 if(unlikely(!rc->rrdset)) {
264 debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
// check: the alarm is not due yet
268 if(unlikely(rc->next_update > now)) {
269 if (unlikely(*next_run > rc->next_update)) {
270 // update the next_run time of the main loop
271 // to run this alarm precisely the time required
272 *next_run = rc->next_update;
275 debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
// check: the alarm has no update frequency
279 if(unlikely(!rc->update_every)) {
280 debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
// check: the chart is flagged obsolete
284 if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE))) {
285 debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rc->chart?rc->chart:"NOCHART", rc->name);
// check: the chart is not flagged enabled
289 if(unlikely(!rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ENABLED))) {
290 debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart is not enabled", rc->chart?rc->chart:"NOCHART", rc->name);
// check: the chart has never been collected, or has completed fewer than 2 collections
294 if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
295 debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
// time range currently covered by the chart, and its update frequency
299 int update_every = rc->rrdset->update_every;
300 time_t first = rrdset_first_entry_t(rc->rrdset);
301 time_t last = rrdset_last_entry_t(rc->rrdset);
// check: now (with one update_every of tolerance) is before the first entry;
// the matching test against the last entry is commented out
303 if(unlikely(now + update_every < first /* || now - update_every > last */)) {
305 , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
306 , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first
307 , (unsigned long) last);
// alarms with a database lookup also need the looked up time to be in the chart
311 if(RRDCALC_HAS_DB_LOOKUP(rc)) {
// NOTE(review): presumably before and after are offsets relative to now - confirm
312 time_t needed = now + rc->before + rc->after;
// check: the needed time (with one update_every of tolerance) is outside first..last
314 if(needed + update_every < first || needed - update_every > last) {
316 , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
317 , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first
318 , (unsigned long) last);
// Entry point of the health thread.
//   ptr - the struct netdata_static_thread of this thread; its enabled member is
//         set to 0 when the thread exits
// Until netdata_exit is set, every iteration visits each host with health enabled and:
//   1. first pass over the alarms: database lookup and calculation expression
//   2. second pass: warning and critical expressions, final status, notification delay
//      and alarm log entry on status change
//   3. health_alarm_log_process() to send the notifications
// and then sleeps until next_run.
326 void *health_main(void *ptr) {
327 struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
329 info("HEALTH thread created with task id %d", gettid());
331 if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
332 error("Cannot set pthread cancel type to DEFERRED.");
334 if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
335 error("Cannot set pthread cancel state to ENABLE.");
// upper bound, in seconds, of the interval between two iterations (at least 1)
337 int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10);
338 if(min_run_every < 1) min_run_every = 1;
340 BUFFER *wb = buffer_create(100);
// realtime and boottime clocks are sampled together, to detect hibernation below
342 time_t now = now_realtime_sec();
343 time_t now_boottime = now_boottime_sec();
344 time_t last_now = now;
345 time_t last_now_boottime = now_boottime;
// how long, in seconds, alarm checks are postponed after a detected hibernation
346 time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60);
348 unsigned int loop = 0;
349 while(!netdata_exit) {
351 debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
353 int oldstate, runnable = 0, apply_hibernation_delay = 0;
// latest time of the next iteration; alarms that are due earlier lower it
354 time_t next_run = now + min_run_every;
357 // detect if boottime and realtime have twice the difference
358 // in which case we assume the system was just woken up from hibernation
359 if(unlikely(now - last_now > 2 * (now_boottime - last_now_boottime)))
360 apply_hibernation_delay = 1;
363 last_now_boottime = now_boottime;
// thread cancellation stays disabled while the hosts are being processed
365 if(unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
366 error("Cannot set pthread cancel state to DISABLE.");
371 rrdhost_foreach_read(host) {
372 if(unlikely(!host->health_enabled))
375 if(unlikely(apply_hibernation_delay)) {
// NOTE(review): last_now_boottime has already been set to now_boottime above, so the
// boottime dt printed by this message is always 0 - confirm and log the values
// captured before the refresh
377 info("Postponing alarm checks for %ld seconds, on host '%s', due to boottime discrepancy (realtime dt: %ld, boottime dt: %ld)."
380 , (long)(now - last_now)
381 , (long)(now_boottime - last_now_boottime)
// no alarm of this host is checked before this time
384 host->health_delay_up_to = now + hibernation_delay;
// check: health disabled, or the alarm checks of the host are still postponed
387 if(unlikely(!host->health_enabled || now < host->health_delay_up_to))
390 rrdhost_rdlock(host);
392 // the first loop is to lookup values from the db
393 for(rc = host->alarms; rc; rc = rc->next) {
// alarms that cannot run now lose the RUNNABLE flag
394 if(unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
395 if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))
396 rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE;
// keep the previous value and flag the alarm for the second pass
401 rc->old_value = rc->value;
402 rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE;
404 // ------------------------------------------------------------
405 // if there is database lookup, do it
407 if(unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
408 /* time_t old_db_timestamp = rc->db_before; */
409 int value_is_null = 0;
411 int ret = rrdset2value_api_v1(rc->rrdset
// any result other than 200 is treated as a failed lookup
425 if(unlikely(ret != 200)) {
426 // database lookup failed
428 rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
431 , "Health on host '%s', alarm '%s.%s': database lookup returned error %d"
433 , rc->chart ? rc->chart : "NOCHART"
439 rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
441 /* - RRDCALC_FLAG_DB_STALE not currently used
442 if (unlikely(old_db_timestamp == rc->db_before)) {
445 debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
447 if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
448 rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
449 error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
452 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
453 rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
456 if(unlikely(value_is_null)) {
457 // collected value is null
459 rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
462 , "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)"
464 , rc->chart ? rc->chart : "NOCHART"
469 rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
472 , "Health on host '%s', alarm '%s.%s': database lookup gave value " CALCULATED_NUMBER_FORMAT
474 , rc->chart ? rc->chart : "NOCHART"
480 // ------------------------------------------------------------
481 // if there is calculation expression, run it
483 if(unlikely(rc->calculation)) {
484 if(unlikely(!expression_evaluate(rc->calculation))) {
485 // calculation failed
487 rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
490 , "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s"
492 , rc->chart ? rc->chart : "NOCHART"
494 , rc->calculation->parsed_as
495 , buffer_tostring(rc->calculation->error_msg)
499 rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
501 debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)"
503 , rc->chart ? rc->chart : "NOCHART"
505 , rc->calculation->parsed_as
506 , rc->calculation->result
507 , buffer_tostring(rc->calculation->error_msg)
// the result of the calculation becomes the value of the alarm
511 rc->value = rc->calculation->result;
515 rrdhost_unlock(host);
// second pass: only when there is something runnable and netdata is not exiting
517 if(unlikely(runnable && !netdata_exit)) {
518 rrdhost_rdlock(host);
520 for(rc = host->alarms; rc; rc = rc->next) {
// check: the alarm was not flagged RUNNABLE by the first pass
521 if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
// both stay UNDEFINED when the expression is missing or fails
524 int warning_status = RRDCALC_STATUS_UNDEFINED;
525 int critical_status = RRDCALC_STATUS_UNDEFINED;
527 // --------------------------------------------------------
528 // check the warning expression
530 if(likely(rc->warning)) {
531 if(unlikely(!expression_evaluate(rc->warning))) {
532 // calculation failed
533 rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
536 , "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s"
538 , rc->chart ? rc->chart : "NOCHART"
540 , buffer_tostring(rc->warning->error_msg)
544 rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
546 , "Health on host '%s', alarm '%s.%s': warning expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)"
548 , rc->chart ? rc->chart : "NOCHART"
550 , rc->warning->result
551 , buffer_tostring(rc->warning->error_msg)
554 warning_status = rrdcalc_value2status(rc->warning->result);
558 // --------------------------------------------------------
559 // check the critical expression
561 if(likely(rc->critical)) {
562 if(unlikely(!expression_evaluate(rc->critical))) {
563 // calculation failed
564 rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
567 , "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s"
569 , rc->chart ? rc->chart : "NOCHART"
571 , buffer_tostring(rc->critical->error_msg)
575 rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
577 , "Health on host '%s', alarm '%s.%s': critical expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)"
579 , rc->chart ? rc->chart : "NOCHART"
581 , rc->critical->result
582 , buffer_tostring(rc->critical->error_msg)
585 critical_status = rrdcalc_value2status(rc->critical->result);
589 // --------------------------------------------------------
590 // decide the final alarm status
592 int status = RRDCALC_STATUS_UNDEFINED;
// the warning expression is applied first: CLEAR maps to CLEAR, RAISED to WARNING
594 switch(warning_status) {
595 case RRDCALC_STATUS_CLEAR:
596 status = RRDCALC_STATUS_CLEAR;
599 case RRDCALC_STATUS_RAISED:
600 status = RRDCALC_STATUS_WARNING;
// the critical expression is applied second: RAISED always becomes CRITICAL,
// while CLEAR only applies when the warning expression left the status UNDEFINED
607 switch(critical_status) {
608 case RRDCALC_STATUS_CLEAR:
609 if(status == RRDCALC_STATUS_UNDEFINED)
610 status = RRDCALC_STATUS_CLEAR;
613 case RRDCALC_STATUS_RAISED:
614 status = RRDCALC_STATUS_CRITICAL;
621 // --------------------------------------------------------
622 // check if the new status and the old differ
624 if(status != rc->status) {
627 // apply trigger hysteresis
// the previous delay has expired: start again from the configured durations
629 if(now > rc->delay_up_to_timestamp) {
630 rc->delay_up_current = rc->delay_up_duration;
631 rc->delay_down_current = rc->delay_down_duration;
633 rc->delay_up_to_timestamp = 0;
// grow the current delays by delay_multiplier, capped at delay_max_duration
636 rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier);
637 if(rc->delay_up_current > rc->delay_max_duration)
638 rc->delay_up_current = rc->delay_max_duration;
640 rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier);
641 if(rc->delay_down_current > rc->delay_max_duration)
642 rc->delay_down_current = rc->delay_max_duration;
// a status that goes up uses the up delay, any other change uses the down delay
645 if(status > rc->status)
646 delay = rc->delay_up_current;
648 delay = rc->delay_down_current;
650 // COMMENTED: because we do need to send raising alarms
651 // if(now + delay < rc->delay_up_to_timestamp)
652 // delay = (int)(rc->delay_up_to_timestamp - now);
654 rc->delay_last = delay;
655 rc->delay_up_to_timestamp = now + delay;
657 // add the alarm into the log
// the current event id is used for this entry, then incremented for the next one
662 , rc->next_event_id++
// how long the alarm has been in the status it is leaving
669 , now - rc->last_status_change
678 , (rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION) ? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0
681 rc->last_status_change = now;
// schedule the next evaluation of this alarm
685 rc->last_updated = now;
686 rc->next_update = now + rc->update_every;
// wake up the main loop in time for it
688 if(next_run > rc->next_update)
689 next_run = rc->next_update;
692 rrdhost_unlock(host);
695 if(unlikely(netdata_exit))
698 // execute notifications
700 health_alarm_log_process(host);
702 if(unlikely(netdata_exit))
705 } /* rrdhost_foreach */
// put back the cancel state saved before the hosts were processed
709 if(unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
710 error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
712 if(unlikely(netdata_exit))
// the clock is read again, since processing the hosts took time
715 now = now_realtime_sec();
717 debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
// sleep for the whole seconds that remain until next_run
718 sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now));
719 now = now_realtime_sec();
722 debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
// sample boottime for the hibernation check of the next iteration
724 now_boottime = now_boottime_sec();
730 info("HEALTH thread exiting");
// report that this thread is no longer running
732 static_thread->enabled = 0;