// Defined ahead of the rest of the file.
// NOTE(review): presumably tells a project header to expose health internals - no consumer is visible here, confirm.
1 #define NETDATA_HEALTH_INTERNALS
// Process-wide default for whether health monitoring is enabled (starts as 1).
// health_init() below overwrites it from the [health] "enabled" option or forces it to 0.
4 int default_health_enabled = 1;
6 // ----------------------------------------------------------------------------
7 // health initialization
9 inline char *health_config_dir(void) {
10 char buffer[FILENAME_MAX + 1];
11 snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_config_dir);
12 return config_get("health", "health configuration directory", buffer);
// Initialize the health subsystem.
//
// What the visible code does:
//  - when central_netdata_to_push_data is not set, default_health_enabled is loaded from
//    the [health] "enabled" option (default 1), and a debug line is logged if that is 0;
//  - the "Health is disabled - setup alarms at the central netdata." path forces the
//    [health] "enabled" option and default_health_enabled to 0;
//  - the directory <netdata_configured_varlib_dir>/health is created with mode 0770;
//    any mkdir() failure other than EEXIST is fatal.
//
// NOTE(review): the embedded line numbers jump (20 -> 25, 27 -> 30, 33 -> 36), so the
// closing braces, the else keyword and any early return are not visible in this view.
15 void health_init(void) {
16 debug(D_HEALTH, "Health configuration initializing");
// NOTE(review): central_netdata_to_push_data is declared elsewhere; presumably it is set
// when this agent streams its metrics to a central netdata - confirm.
18 if(!central_netdata_to_push_data) {
// the assignment inside the condition stores the configured value in the global
19 if(!(default_health_enabled = config_get_boolean("health", "enabled", 1))) {
20 debug(D_HEALTH, "Health is disabled.");
// NOTE(review): presumably the else branch of the outer if - the else line is not visible
25 info("Health is disabled - setup alarms at the central netdata.");
26 config_set_boolean("health", "enabled", 0);
27 default_health_enabled = 0;
// make sure the health directory under the varlib directory exists
30 char pathname[FILENAME_MAX + 1];
31 snprintfz(pathname, FILENAME_MAX, "%s/health", netdata_configured_varlib_dir);
// an already existing directory is fine; every other mkdir() error aborts via fatal()
32 if(mkdir(pathname, 0770) == -1 && errno != EEXIST)
33 fatal("Cannot create directory '%s'.", pathname);
36 // ----------------------------------------------------------------------------
37 // re-load health configuration
// Re-read the health configuration of one host and re-link the alarms to its charts.
//
// Visible steps, in order:
//  1. resolve the configuration directory with health_config_dir();
//  2. free every alarm template (host->templates) and the running alarms (host->alarms);
//  3. flag every alarm log entry that is not in REMOVED status as UPDATED;
//  4. walk all charts of the host (the per-chart reset statements are not visible here);
//  5. load the alarms again with health_readdir();
//  6. walk all charts again and link matching alarms and templates to each one.
//
// NOTE(review): several original lines (the declarations of t and st, some loop headers
// and bodies, closing braces and whatever else sat on the skipped lines) are missing
// from this view - the embedded numbering skips them.
39 void health_reload_host(RRDHOST *host) {
40 char *path = health_config_dir();
42 // free all running alarms
// loops until the template list is empty; rrdcalctemplate_free() is expected to unlink the head
// NOTE(review): that unlinking happens inside rrdcalctemplate_free(), not visible here - confirm.
45 while(host->templates)
46 rrdcalctemplate_free(host, host->templates);
// NOTE(review): the loop header for this call is not visible (presumably while(host->alarms))
49 rrdcalc_free(host, host->alarms);
53 // invalidate all previous entries in the alarm log
55 for(t = host->health_log.alarms ; t ; t = t->next) {
// entries already in REMOVED status are left untouched
56 if(t->new_status != RRDCALC_STATUS_REMOVED)
57 t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
60 // reset all thresholds to all charts
// NOTE(review): the body of this loop is on lines missing from this view
62 for(st = host->rrdset_root; st ; st = st->next) {
67 // load the new alarms
69 health_readdir(host, path);
72 // link the loaded alarms to their charts
73 for(st = host->rrdset_root; st ; st = st->next) {
76 rrdsetcalc_link_matching(st);
77 rrdcalctemplate_link_matching(st);
// Reload the health configuration of every host: walks the host list starting at
// localhost (following ->next) and calls health_reload_host() on each entry.
// NOTE(review): the declaration of host and the closing brace are not visible in this view.
83 void health_reload(void) {
86 for(host = localhost; host ; host = host->next)
87 health_reload_host(host);
90 // ----------------------------------------------------------------------------
91 // health main thread and friends
93 static inline int rrdcalc_value2status(calculated_number n) {
94 if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED;
95 if(n) return RRDCALC_STATUS_RAISED;
96 return RRDCALC_STATUS_CLEAR;
// Maximum length of the notification command line built by health_alarm_execute().
99 #define ALARM_EXEC_COMMAND_LENGTH 8192
// Run the notification command for one alarm log entry and record the outcome on it.
//
// host supplies the default exec script and recipient; ae is the alarm log entry.
//
// Visible behavior:
//  - the entry is always flagged HEALTH_ENTRY_FLAG_PROCESSED first;
//  - a chain of checks logs why a notification is not sent (internal status,
//    no-clear-notification, same status as the previous executed notification,
//    CLEAR with no previous executed notification);
//  - a command line is formatted into a static buffer and run through mypopen();
//    one line of its output is read, the exit code from mypclose() is stored in
//    ae->exec_code and a non-zero code sets HEALTH_ENTRY_FLAG_EXEC_FAILED;
//  - health_alarm_log_save(host, ae) is called at the end of the visible code.
//
// NOTE(review): the statements that leave the function after each "not sending" debug
// line, several snprintfz() arguments, the declarations of t and command_pid and the
// closing braces are on lines missing from this view (the embedded numbering skips
// them) - read the control flow with that in mind.
101 static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
102 ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
// statuses ordered below RRDCALC_STATUS_CLEAR are treated as internal
104 if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
105 // do not send notifications for internal statuses
106 debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
// entries flagged no-clear-notification are silenced for CLEAR (and anything below it)
110 if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
111 // do not send notifications for disabled statuses
112 debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
113 // mark it as run, so that we will send the same alarm if it happens again
117 // find the previous notification for the same alarm
118 // which we have run the exec script
119 // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set
120 if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
121 uint32_t id = ae->alarm_id;
// the search starts at ae->next and follows ->next, looking for an entry of the same
// alarm_id that has HEALTH_ENTRY_FLAG_EXEC_RUN set
// NOTE(review): the statement executed on a match is not visible (presumably a break)
123 for(t = ae->next; t ; t = t->next) {
124 if(t->alarm_id == id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
129 // we have executed this alarm notification in the past
130 if(t && t->new_status == ae->new_status) {
131 // don't send the notification for the same status again
132 debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name
133 , rrdcalc_status2string(ae->new_status));
138 // we have not executed this alarm notification in the past
139 // so, don't send CLEAR notifications
140 if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
141 debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
142 , ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
// static storage: the buffer is shared by every call of this function, so the function
// is not reentrant
148 static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1];
// per-alarm exec and recipient take precedence over the host defaults
151 const char *exec = (ae->exec) ? ae->exec : host->health_default_exec;
152 const char *recipient = (ae->recipient) ? ae->recipient : host->health_default_recipient;
// NOTE(review): every string argument is placed between single quotes without any
// visible escaping; a value containing a single quote would break the quoting if the
// command is interpreted by a shell - confirm how mypopen() runs it.
154 snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s' '%s' '%s'",
161 (unsigned long)ae->when,
// NOTE(review): "NOCAHRT" looks like a typo of "NOCHART" (the spelling used by the
// other functions of this file); it is passed to the script, so it is left untouched here.
163 ae->chart?ae->chart:"NOCAHRT",
164 ae->family?ae->family:"NOFAMILY",
165 rrdcalc_status2string(ae->new_status),
166 rrdcalc_status2string(ae->old_status),
169 ae->source?ae->source:"UNKNOWN",
170 (uint32_t)ae->duration,
171 (uint32_t)ae->non_clear_duration,
172 ae->units?ae->units:"",
173 ae->info?ae->info:"",
174 ae->new_value_string,
// the entry is marked as executed, with the time of execution, before the command runs
178 ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
179 ae->exec_run_timestamp = now_realtime_sec();
181 debug(D_HEALTH, "executing command '%s'", command_to_run);
182 FILE *fp = mypopen(command_to_run, &command_pid);
// NOTE(review): the check of fp that guards this error() is not visible
184 error("HEALTH: Cannot popen(\"%s\", \"r\").", command_to_run);
187 debug(D_HEALTH, "HEALTH reading from command");
// the command buffer is reused to receive the output of the command
// NOTE(review): the read is capped at FILENAME_MAX although the buffer holds
// ALARM_EXEC_COMMAND_LENGTH + 1 bytes; safe only while FILENAME_MAX does not exceed
// that size - confirm. What is done with s afterwards is not visible.
188 char *s = fgets(command_to_run, FILENAME_MAX, fp);
190 ae->exec_code = mypclose(fp, command_pid);
191 debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
193 if(ae->exec_code != 0)
194 ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
// persist the updated entry
197 health_alarm_log_save(host, ae);
// Log the status transition of an alarm log entry at debug level and then hand the
// entry to health_alarm_execute().
// NOTE(review): one argument line of the debug() call (the value for %0.2Lf), its closing
// parenthesis and the closing brace are on lines missing from this view.
201 static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
202 debug(D_HEALTH, "Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
203 ae->chart?ae->chart:"NOCHART", ae->name,
205 rrdcalc_status2string(ae->old_status),
206 rrdcalc_status2string(ae->new_status)
209 health_alarm_execute(host, ae);
// Process pending entries of the host alarm log, then trim the log if it grew too large.
//
// Pass 1 (under the read lock): walks the log from its head while unique_id >= stop_at_id.
// Entries that are neither PROCESSED nor UPDATED are candidates: the smallest such
// unique_id is remembered in first_waiting and, once now >= delay_up_to_timestamp, the
// entry is handed to health_process_notifications().
// Pass 2 (under the write lock, reached when count > max): skips the first max * 2 / 3
// entries and frees entries after that point, decrementing health_log.count for each.
//
// NOTE(review): the declaration of ae, the if keyword of the candidate test, the early
// return, the loop that frees entries and the closing braces are on lines missing from
// this view (the embedded numbering skips them).
212 static inline void health_alarm_log_process(RRDHOST *host) {
// NOTE(review): function-static, hence shared by every host this function is called
// for; if unique ids are per host, one host can move the stop point past pending
// entries of another host - confirm.
213 static uint32_t stop_at_id = 0;
// NOTE(review): the list head is read here, before the read lock below is taken
214 uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
215 time_t now = now_realtime_sec();
217 pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
// entries with unique_id below stop_at_id end the walk
220 for(ae = host->health_log.alarms; ae && ae->unique_id >= stop_at_id ; ae = ae->next) {
222 !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
223 !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
// track the lowest unique_id among the entries still waiting
226 if(unlikely(ae->unique_id < first_waiting))
227 first_waiting = ae->unique_id;
// entries still inside their notification delay are left for a later call
229 if(likely(now >= ae->delay_up_to_timestamp))
230 health_process_notifications(host, ae);
234 // remember this for the next iteration
235 stop_at_id = first_waiting;
237 pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
// NOTE(review): the statement under this condition is not visible (presumably a return)
239 if(host->health_log.count <= host->health_log.max)
242 // cleanup excess entries in the log
243 pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
245 ALARM_ENTRY *last = NULL;
// keep two thirds of the configured maximum
246 unsigned int count = host->health_log.max * 2 / 3;
// empty-bodied loop: advances ae past the entries to keep, last trails one entry behind
247 for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
// NOTE(review): the statement under this condition is not visible (presumably it cuts
// the list after last)
249 if(ae && last && last->next == ae)
255 debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
// the successor is saved before the entry is freed
257 ALARM_ENTRY *t = ae->next;
259 health_alarm_log_free_one_nochecks_nounlink(ae);
262 host->health_log.count--;
265 pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
// Decide whether alarm rc can be evaluated at time now.
//
// Checks, in the order they appear:
//  1. the alarm is linked to a chart (rc->rrdset);
//  2. its next_update time has been reached - if not, *next_run is lowered to
//     rc->next_update when that is earlier, so the main loop wakes up in time;
//  3. it has a non-zero update_every;
//  4. the chart has a last collection time and counter_done >= 2;
//  5. now + the chart update_every is not before the first entry of the chart;
//  6. for alarms with a database lookup, the wanted time (now + before + after) lies
//     within [first - update_every, last + update_every].
//
// NOTE(review): the return statements, the debug( openers of the last two messages and
// the closing braces are on lines missing from this view; health_main() treats a zero
// result as "not runnable".
268 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
269 if(unlikely(!rc->rrdset)) {
270 debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
274 if(unlikely(rc->next_update > now)) {
275 if (unlikely(*next_run > rc->next_update)) {
276 // update the next_run time of the main loop
277 // to run this alarm precisely the time required
278 *next_run = rc->next_update;
281 debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
285 if(unlikely(!rc->update_every)) {
286 debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
290 if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
291 debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
// from here on update_every is the one of the chart, not the one of the alarm
295 int update_every = rc->rrdset->update_every;
296 time_t first = rrdset_first_entry_t(rc->rrdset);
297 time_t last = rrdset_last_entry_t(rc->rrdset);
// only the lower bound is enforced; the upper-bound test is disabled in the condition
299 if(unlikely(now + update_every < first /* || now - update_every > last */)) {
301 , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
302 , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first
303 , (unsigned long) last);
307 if(RRDCALC_HAS_DB_LOOKUP(rc)) {
// NOTE(review): before and after are presumably offsets relative to now (typically
// zero or negative) - confirm against the alarm configuration parser.
308 time_t needed = now + rc->before + rc->after;
// one chart update_every of slack is allowed on both sides
310 if(needed + update_every < first || needed - update_every > last) {
312 , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
313 , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first
314 , (unsigned long) last);
// Entry point of the health monitoring thread.
//
// ptr is the netdata_static_thread descriptor of this thread; its enabled field is
// cleared at the end of the visible code.
//
// Each iteration of the main loop (until netdata_exit is set):
//  1. disables thread cancellation;
//  2. for every host with health enabled, under the host read lock, refreshes the value
//     of each runnable alarm (database lookup via rrd2value() and/or the calculation
//     expression);
//  3. evaluates the warning and critical expressions of the runnable alarms, derives the
//     new status and, when it differs from rc->status, computes the notification delay
//     and (per the visible argument list) records the transition;
//  4. calls health_alarm_log_process() for the host;
//  5. restores the cancel state and sleeps until next_run.
//
// The minimum loop period comes from the [health] option "run at least every seconds"
// (default 10, raised to 1 when lower).
//
// NOTE(review): many original lines (declarations of host, rc and delay, else / continue /
// break statements, the call that takes the argument list at 567-571, the increment of
// loop and runnable, closing braces, the return) are missing from this view - the
// embedded numbering skips them.
322 void *health_main(void *ptr) {
323 struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
325 info("HEALTH thread created with task id %d", gettid());
// allow deferred cancellation of this thread; failures are only logged
327 if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
328 error("Cannot set pthread cancel type to DEFERRED.");
330 if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
331 error("Cannot set pthread cancel state to ENABLE.");
333 int min_run_every = (int)config_get_number("health", "run at least every seconds", 10);
334 if(min_run_every < 1) min_run_every = 1;
// scratch buffer handed to rrd2value() for the database lookups below
// NOTE(review): where it is released is not visible in this view
336 BUFFER *wb = buffer_create(100);
338 unsigned int loop = 0;
339 while(!netdata_exit) {
341 debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
// runnable gates the second pass below
// NOTE(review): the place where runnable is incremented is not visible here
343 int oldstate, runnable = 0;
344 time_t now = now_realtime_sec();
// latest allowed time for the next iteration; alarms may pull it earlier
345 time_t next_run = now + min_run_every;
// cancellation is disabled while alarms are evaluated; it is restored from oldstate further down
348 if(unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
349 error("Cannot set pthread cancel state to DISABLE.");
352 for(host = localhost; host ; host = host->next) {
353 if(unlikely(!host->health_enabled)) continue;
355 rrdhost_rdlock(host);
357 // the first loop is to lookup values from the db
358 for(rc = host->alarms; rc; rc = rc->next) {
// alarms that cannot run now lose RRDCALC_FLAG_RUNNABLE
// NOTE(review): the statement that skips to the next alarm is not visible
359 if(unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
360 if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))
361 rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE;
// keep the previous value, then mark the alarm as runnable for the second pass
366 rc->old_value = rc->value;
367 rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE;
369 // 1. if there is database lookup, do it
370 // 2. if there is calculation expression, run it
372 if(unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
373 /* time_t old_db_timestamp = rc->db_before; */
374 int value_is_null = 0;
// the lookup writes the value into rc->value and the covered time range into
// rc->db_after / rc->db_before; 200 is the success code tested below
376 int ret = rrd2value(rc->rrdset, wb, &rc->value, rc->dimensions, 1, rc->after, rc->before, rc->group, rc->options, &rc->db_after, &rc->db_before, &value_is_null);
378 if(unlikely(ret != 200)) {
379 // database lookup failed
382 debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret);
// RRDCALC_FLAG_DB_ERROR makes error() fire only on the first failure of a streak
384 if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) {
385 rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
386 error("Health on host '%s', alarm '%s.%s': database lookup returned error %d", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret);
// a successful lookup clears the error flag again
389 else if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))
390 rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
// the lines below are a disabled stale-database check, kept as a comment in the original
// NOTE(review): the line with the closing comment delimiter is not visible in this view
392 /* - RRDCALC_FLAG_DB_STALE not currently used
393 if (unlikely(old_db_timestamp == rc->db_before)) {
396 debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
398 if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
399 rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
400 error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
403 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
404 rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
407 if(unlikely(value_is_null)) {
408 // collected value is null
412 debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name);
// same once-per-streak logging scheme, tracked by RRDCALC_FLAG_DB_NAN
414 if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) {
415 rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
416 error("Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name);
419 else if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))
420 rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
422 debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " CALCULATED_NUMBER_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->value);
// optional calculation expression of the alarm
425 if(unlikely(rc->calculation)) {
426 if(unlikely(!expression_evaluate(rc->calculation))) {
427 // calculation failed
431 debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
433 if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) {
434 rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
// NOTE(review): unlike the debug() call above, this passes rc->chart before
// host->hostname, so the host and chart names end up swapped in the logged
// message - looks like a bug.
435 error("Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", rc->chart ? rc->chart : "NOCHART", host->hostname, rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
439 if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))
440 rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
442 debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
443 CALCULATED_NUMBER_FORMAT
444 ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name
445 , rc->calculation->parsed_as, rc->calculation->result,
446 buffer_tostring(rc->calculation->error_msg), rc->source
// the result of the expression becomes the value of the alarm
449 rc->value = rc->calculation->result;
453 rrdhost_unlock(host);
// second pass: only when something was runnable and netdata is not exiting
455 if(unlikely(runnable && !netdata_exit)) {
456 rrdhost_rdlock(host);
458 for(rc = host->alarms; rc; rc = rc->next) {
// NOTE(review): the statement under this condition is not visible (presumably continue)
459 if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
// both stay UNDEFINED when the alarm has no such expression or its evaluation fails
462 int warning_status = RRDCALC_STATUS_UNDEFINED;
463 int critical_status = RRDCALC_STATUS_UNDEFINED;
465 if(likely(rc->warning)) {
466 if(unlikely(!expression_evaluate(rc->warning))) {
467 // calculation failed
469 debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
471 if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) {
472 rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
473 error("Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
477 if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))
478 rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
480 debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source);
482 warning_status = rrdcalc_value2status(rc->warning->result);
486 if(likely(rc->critical)) {
487 if(unlikely(!expression_evaluate(rc->critical))) {
488 // calculation failed
490 debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
492 if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) {
493 rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
494 error("Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
498 if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))
499 rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
501 debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name , rc->critical->result, buffer_tostring(rc->critical->error_msg), rc->source);
503 critical_status = rrdcalc_value2status(rc->critical->result);
// combine the two results: the warning result sets the base status first
507 int status = RRDCALC_STATUS_UNDEFINED;
509 switch(warning_status) {
510 case RRDCALC_STATUS_CLEAR:
511 status = RRDCALC_STATUS_CLEAR;
514 case RRDCALC_STATUS_RAISED:
515 status = RRDCALC_STATUS_WARNING;
// a raised critical overrides whatever the warning produced; a clear critical only
// fills in a status that is still UNDEFINED
522 switch(critical_status) {
523 case RRDCALC_STATUS_CLEAR:
524 if(status == RRDCALC_STATUS_UNDEFINED)
525 status = RRDCALC_STATUS_CLEAR;
528 case RRDCALC_STATUS_RAISED:
529 status = RRDCALC_STATUS_CRITICAL;
// status transition: work out the notification delay and record the change
536 if(status != rc->status) {
// the previous delay window has passed: start again from the configured durations
539 if(now > rc->delay_up_to_timestamp) {
540 rc->delay_up_current = rc->delay_up_duration;
541 rc->delay_down_current = rc->delay_down_duration;
543 rc->delay_up_to_timestamp = 0;
// NOTE(review): presumably the else branch (status changed again inside the delay
// window): both current delays grow by delay_multiplier, capped at delay_max_duration
546 rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier);
547 if(rc->delay_up_current > rc->delay_max_duration)
548 rc->delay_up_current = rc->delay_max_duration;
550 rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier);
551 if(rc->delay_down_current > rc->delay_max_duration)
552 rc->delay_down_current = rc->delay_max_duration;
// a status moving to a higher value uses the up delay
// NOTE(review): the else keyword before the down delay assignment is not visible
555 if(status > rc->status)
556 delay = rc->delay_up_current;
558 delay = rc->delay_down_current;
560 // COMMENTED: because we do need to send raising alarms
561 // if(now + delay < rc->delay_up_to_timestamp)
562 // delay = (int)(rc->delay_up_to_timestamp - now);
564 rc->delay_last = delay;
565 rc->delay_up_to_timestamp = now + delay;
// NOTE(review): argument list of a call whose opening line is not visible (presumably
// the function that appends an entry to the alarm log - confirm)
567 host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id
568 , rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change
569 , rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info
// NOTE(review): this tests rc->options, while the other RRDCALC_FLAG_* bits in this
// function are kept in rc->rrdcalc_flags - confirm which field carries
// NO_CLEAR_NOTIFICATION.
570 , rc->delay_last, (rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)
571 ? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0
573 rc->last_status_change = now;
// schedule the next evaluation of this alarm and wake the main loop in time for it
577 rc->last_updated = now;
578 rc->next_update = now + rc->update_every;
580 if(next_run > rc->next_update)
581 next_run = rc->next_update;
584 rrdhost_unlock(host);
// NOTE(review): the statements under the netdata_exit checks below are not visible
// (presumably break)
587 if(unlikely(netdata_exit))
590 // execute notifications
592 health_alarm_log_process(host);
594 if(unlikely(netdata_exit))
599 if(unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
600 error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
602 if(unlikely(netdata_exit))
// the clock is read again because the evaluation above took time
605 now = now_realtime_sec();
// NOTE(review): the condition guarding this sleep (presumably now < next_run, given
// the "Next iteration now" message below) is not visible; a negative difference cast
// to usec_t would ask for a very long sleep.
607 debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
608 sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now));
611 debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
616 info("HEALTH thread exiting");
// tell the owner of the static thread table that this thread is no longer running
618 static_thread->enabled = 0;