]> arthur.barton.de Git - netdata.git/blobdiff - src/health.c
Merge branch 'master' into ab-debian
[netdata.git] / src / health.c
index 494904c097434dbd821b55641e4c797b75402222..46b27db6fbfdacc871ceb13ab63cbc9f4b75e4e6 100644 (file)
@@ -9,22 +9,15 @@ int default_health_enabled = 1;
 inline char *health_config_dir(void) {
     char buffer[FILENAME_MAX + 1];
     snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_config_dir);
-    return config_get("health", "health configuration directory", buffer);
+    return config_get(CONFIG_SECTION_HEALTH, "health configuration directory", buffer);
 }
 
 void health_init(void) {
     debug(D_HEALTH, "Health configuration initializing");
 
-    if(!central_netdata_to_push_data) {
-        if(!(default_health_enabled = config_get_boolean("health", "enabled", 1))) {
-            debug(D_HEALTH, "Health is disabled.");
-            return;
-        }
-    }
-    else {
-        info("Health is disabled - setup alarms at the central netdata.");
-        config_set_boolean("health", "enabled", 0);
-        default_health_enabled = 0;
+    if(!(default_health_enabled = config_get_boolean(CONFIG_SECTION_HEALTH, "enabled", 1))) {
+        debug(D_HEALTH, "Health is disabled.");
+        return;
     }
 }
 
@@ -32,6 +25,9 @@ void health_init(void) {
 // re-load health configuration
 
 void health_reload_host(RRDHOST *host) {
+    if(unlikely(!host->health_enabled))
+        return;
+
     char *path = health_config_dir();
 
     // free all running alarms
@@ -212,7 +208,7 @@ static inline void health_alarm_log_process(RRDHOST *host) {
     uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
     time_t now = now_realtime_sec();
 
-    pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
+    netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
 
     ALARM_ENTRY *ae;
     for(ae = host->health_log.alarms; ae && ae->unique_id >= stop_at_id ; ae = ae->next) {
@@ -232,13 +228,13 @@ static inline void health_alarm_log_process(RRDHOST *host) {
     // remember this for the next iteration
     stop_at_id = first_waiting;
 
-    pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
+    netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
 
     if(host->health_log.count <= host->health_log.max)
         return;
 
     // cleanup excess entries in the log
-    pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
+    netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
 
     ALARM_ENTRY *last = NULL;
     unsigned int count = host->health_log.max * 2 / 3;
@@ -260,7 +256,7 @@ static inline void health_alarm_log_process(RRDHOST *host) {
         host->health_log.count--;
     }
 
-    pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
+    netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
 }
 
 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
@@ -285,6 +281,16 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run)
         return 0;
     }
 
+    if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE))) {
+        debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rc->chart?rc->chart:"NOCHART", rc->name);
+        return 0;
+    }
+
+    if(unlikely(!rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ENABLED))) {
+        debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart is not enabled", rc->chart?rc->chart:"NOCHART", rc->name);
+        return 0;
+    }
+
     if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
         debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
         return 0;
@@ -328,21 +334,34 @@ void *health_main(void *ptr) {
     if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
         error("Cannot set pthread cancel state to ENABLE.");
 
-    int min_run_every = (int)config_get_number("health", "run at least every seconds", 10);
+    int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10);
     if(min_run_every < 1) min_run_every = 1;
 
     BUFFER *wb = buffer_create(100);
 
+    time_t now               = now_realtime_sec();
+    time_t now_boottime      = now_boottime_sec();
+    time_t last_now          = now;
+    time_t last_now_boottime = now_boottime;
+    time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60);
+
     unsigned int loop = 0;
     while(!netdata_exit) {
         loop++;
         debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
 
-        int oldstate, runnable = 0;
-        time_t now = now_realtime_sec();
+        int oldstate, runnable = 0, apply_hibernation_delay = 0;
         time_t next_run = now + min_run_every;
         RRDCALC *rc;
 
+        // detect if boottime and realtime have twice the difference
+        // in which case we assume the system was just waken from hibernation
+        if(unlikely(now - last_now > 2 * (now_boottime - last_now_boottime)))
+            apply_hibernation_delay = 1;
+
+        last_now = now;
+        last_now_boottime = now_boottime;
+
         if(unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
             error("Cannot set pthread cancel state to DISABLE.");
 
@@ -350,7 +369,23 @@ void *health_main(void *ptr) {
 
         RRDHOST *host;
         rrdhost_foreach_read(host) {
-            if(unlikely(!host->health_enabled)) continue;
+            if(unlikely(!host->health_enabled))
+                continue;
+
+            if(unlikely(apply_hibernation_delay)) {
+
+                info("Postponing alarm checks for %ld seconds, on host '%s', due to boottime discrepancy (realtime dt: %ld, boottime dt: %ld)."
+                     , hibernation_delay
+                     , host->hostname
+                     , (long)(now - last_now)
+                     , (long)(now_boottime - last_now_boottime)
+                );
+
+                host->health_delay_up_to = now + hibernation_delay;
+            }
+
+            if(unlikely(!host->health_enabled || now < host->health_delay_up_to))
+                continue;
 
             rrdhost_rdlock(host);
 
@@ -366,27 +401,41 @@ void *health_main(void *ptr) {
                 rc->old_value = rc->value;
                 rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE;
 
-                // 1. if there is database lookup, do it
-                // 2. if there is calculation expression, run it
+                // ------------------------------------------------------------
+                // if there is database lookup, do it
 
                 if(unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
                     /* time_t old_db_timestamp = rc->db_before; */
                     int value_is_null = 0;
 
-                    int ret = rrd2value(rc->rrdset, wb, &rc->value, rc->dimensions, 1, rc->after, rc->before, rc->group, rc->options, &rc->db_after, &rc->db_before, &value_is_null);
+                    int ret = rrdset2value_api_v1(rc->rrdset
+                                                  , wb
+                                                  , &rc->value
+                                                  , rc->dimensions
+                                                  , 1
+                                                  , rc->after
+                                                  , rc->before
+                                                  , rc->group
+                                                  , rc->options
+                                                  , &rc->db_after
+                                                  , &rc->db_before
+                                                  , &value_is_null
+                    );
 
                     if(unlikely(ret != 200)) {
                         // database lookup failed
                         rc->value = NAN;
-
-                        debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret);
-
-                        if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) {
-                            rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
-                            error("Health on host '%s', alarm '%s.%s': database lookup returned error %d", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret);
-                        }
+                        rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
+
+                        debug(D_HEALTH
+                              , "Health on host '%s', alarm '%s.%s': database lookup returned error %d"
+                              , host->hostname
+                              , rc->chart ? rc->chart : "NOCHART"
+                              , rc->name
+                              , ret
+                        );
                     }
-                    else if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))
+                    else
                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
 
                     /* - RRDCALC_FLAG_DB_STALE not currently used
@@ -406,44 +455,57 @@ void *health_main(void *ptr) {
 
                     if(unlikely(value_is_null)) {
                         // collected value is null
-
                         rc->value = NAN;
+                        rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
 
-                        debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name);
-
-                        if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) {
-                            rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
-                            error("Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", host->hostname,  rc->chart ? rc->chart : "NOCHART", rc->name);
-                        }
+                        debug(D_HEALTH
+                              , "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)"
+                              , host->hostname
+                              , rc->chart ? rc->chart : "NOCHART"
+                              , rc->name
+                        );
                     }
-                    else if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))
+                    else
                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
 
-                    debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " CALCULATED_NUMBER_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->value);
+                    debug(D_HEALTH
+                          , "Health on host '%s', alarm '%s.%s': database lookup gave value " CALCULATED_NUMBER_FORMAT
+                          , host->hostname
+                          , rc->chart ? rc->chart : "NOCHART"
+                          , rc->name
+                          , rc->value
+                    );
                 }
 
+                // ------------------------------------------------------------
+                // if there is calculation expression, run it
+
                 if(unlikely(rc->calculation)) {
                     if(unlikely(!expression_evaluate(rc->calculation))) {
                         // calculation failed
-
                         rc->value = NAN;
-
-                        debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
-
-                        if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) {
-                            rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
-                            error("Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", rc->chart ? rc->chart : "NOCHART", host->hostname,  rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
-                        }
+                        rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
+
+                        debug(D_HEALTH
+                              , "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s"
+                              , host->hostname
+                              , rc->chart ? rc->chart : "NOCHART"
+                              , rc->name
+                              , rc->calculation->parsed_as
+                              , buffer_tostring(rc->calculation->error_msg)
+                        );
                     }
                     else {
-                        if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))
-                            rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
-
-                        debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
-                                CALCULATED_NUMBER_FORMAT
-                                ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name
-                              , rc->calculation->parsed_as, rc->calculation->result,
-                                buffer_tostring(rc->calculation->error_msg), rc->source
+                        rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
+
+                        debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)"
+                              , host->hostname
+                              , rc->chart ? rc->chart : "NOCHART"
+                              , rc->name
+                              , rc->calculation->parsed_as
+                              , rc->calculation->result
+                              , buffer_tostring(rc->calculation->error_msg)
+                              , rc->source
                         );
 
                         rc->value = rc->calculation->result;
@@ -459,51 +521,74 @@ void *health_main(void *ptr) {
                     if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
                         continue;
 
-                    int warning_status = RRDCALC_STATUS_UNDEFINED;
+                    int warning_status  = RRDCALC_STATUS_UNDEFINED;
                     int critical_status = RRDCALC_STATUS_UNDEFINED;
 
+                    // --------------------------------------------------------
+                    // check the warning expression
+
                     if(likely(rc->warning)) {
                         if(unlikely(!expression_evaluate(rc->warning))) {
                             // calculation failed
-
-                            debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
-
-                            if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) {
-                                rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
-                                error("Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
-                            }
+                            rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
+
+                            debug(D_HEALTH
+                                  , "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s"
+                                  , host->hostname
+                                  , rc->chart ? rc->chart : "NOCHART"
+                                  , rc->name
+                                  , buffer_tostring(rc->warning->error_msg)
+                            );
                         }
                         else {
-                            if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))
-                                rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
-
-                            debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source);
-
+                            rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
+                            debug(D_HEALTH
+                                  , "Health on host '%s', alarm '%s.%s': warning expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)"
+                                  , host->hostname
+                                  , rc->chart ? rc->chart : "NOCHART"
+                                  , rc->name
+                                  , rc->warning->result
+                                  , buffer_tostring(rc->warning->error_msg)
+                                  , rc->source
+                            );
                             warning_status = rrdcalc_value2status(rc->warning->result);
                         }
                     }
 
+                    // --------------------------------------------------------
+                    // check the critical expression
+
                     if(likely(rc->critical)) {
                         if(unlikely(!expression_evaluate(rc->critical))) {
                             // calculation failed
-
-                            debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
-
-                            if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) {
-                                rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
-                                error("Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
-                            }
+                            rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
+
+                            debug(D_HEALTH
+                                  , "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s"
+                                  , host->hostname
+                                  , rc->chart ? rc->chart : "NOCHART"
+                                  , rc->name
+                                  , buffer_tostring(rc->critical->error_msg)
+                            );
                         }
                         else {
-                            if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))
-                                rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
-
-                            debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name , rc->critical->result, buffer_tostring(rc->critical->error_msg), rc->source);
-
+                            rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
+                            debug(D_HEALTH
+                                  , "Health on host '%s', alarm '%s.%s': critical expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)"
+                                  , host->hostname
+                                  , rc->chart ? rc->chart : "NOCHART"
+                                  , rc->name
+                                  , rc->critical->result
+                                  , buffer_tostring(rc->critical->error_msg)
+                                  , rc->source
+                            );
                             critical_status = rrdcalc_value2status(rc->critical->result);
                         }
                     }
 
+                    // --------------------------------------------------------
+                    // decide the final alarm status
+
                     int status = RRDCALC_STATUS_UNDEFINED;
 
                     switch(warning_status) {
@@ -533,9 +618,14 @@ void *health_main(void *ptr) {
                             break;
                     }
 
+                    // --------------------------------------------------------
+                    // check if the new status and the old differ
+
                     if(status != rc->status) {
                         int delay = 0;
 
+                        // apply trigger hysteresis
+
                         if(now > rc->delay_up_to_timestamp) {
                             rc->delay_up_current = rc->delay_up_duration;
                             rc->delay_down_current = rc->delay_down_duration;
@@ -563,13 +653,31 @@ void *health_main(void *ptr) {
 
                         rc->delay_last = delay;
                         rc->delay_up_to_timestamp = now + delay;
+
+                        // add the alarm into the log
+
                         health_alarm_log(
-                                host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id
-                                , rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change
-                                , rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info
-                                , rc->delay_last, (rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)
-                                                  ? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0
+                                host
+                                , rc->id
+                                , rc->next_event_id++
+                                , now
+                                , rc->name
+                                , rc->rrdset->id
+                                , rc->rrdset->family
+                                , rc->exec
+                                , rc->recipient
+                                , now - rc->last_status_change
+                                , rc->old_value
+                                , rc->value
+                                , rc->status
+                                , status
+                                , rc->source
+                                , rc->units
+                                , rc->info
+                                , rc->delay_last
+                                , (rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION) ? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0
                         );
+
                         rc->last_status_change = now;
                         rc->status = status;
                     }
@@ -594,7 +702,7 @@ void *health_main(void *ptr) {
             if(unlikely(netdata_exit))
                 break;
 
-        } /* host loop */
+        } /* rrdhost_foreach */
 
         rrd_unlock();
 
@@ -608,10 +716,14 @@ void *health_main(void *ptr) {
         if(now < next_run) {
             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
             sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now));
+            now = now_realtime_sec();
         }
         else
             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
-    }
+
+        now_boottime = now_boottime_sec();
+
+    } // forever
 
     buffer_free(wb);