arthur.barton.de Git - netdata.git/commitdiff
allow delaying alarm checks when remote netdata starts streaming metrics or when...
author Costa Tsaousis <costa@tsaousis.gr>
Thu, 23 Feb 2017 16:22:47 +0000 (18:22 +0200)
committer Costa Tsaousis <costa@tsaousis.gr>
Thu, 23 Feb 2017 16:22:47 +0000 (18:22 +0200)
src/health.c
src/rrd.h
src/rrdpush.c

index 0fdedfb4c32b0ca256383f9f9636772aa49d8a4f..d8bc9db59ace4306c08d83e24fc7c118016969fc 100644 (file)
--- a/src/health.c
+++ b/src/health.c
@@ -340,9 +340,14 @@ void *health_main(void *ptr) {
 
         int oldstate, runnable = 0;
         time_t now = now_realtime_sec();
+        time_t now_boot = now_boottime_sec();
         time_t next_run = now + min_run_every;
         RRDCALC *rc;
 
+        time_t last_now = now;
+        time_t last_now_boot = now_boot;
+        time_t hibernation_delay = config_get_number("health", "postpone alarms during hibernation for seconds", 60);
+
         if(unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
             error("Cannot set pthread cancel state to DISABLE.");
 
@@ -350,7 +355,15 @@ void *health_main(void *ptr) {
 
         RRDHOST *host;
         rrdhost_foreach_read(host) {
-            if(unlikely(!host->health_enabled)) continue;
+            if(now - last_now > 2 * (now_boot - last_now_boot)) {
+                info("Postponing alarm checks for %ld seconds, on host '%s', due to boottime discrepancy (realtime dt: %ld, boottime dt: %ld).",
+                     hibernation_delay, host->hostname, (long)(now - last_now), (long)(now_boot - last_now_boot));
+                host->health_delay_up_to = now + hibernation_delay;
+            }
+            last_now = now;
+            last_now_boot = now_boot;
+
+            if(unlikely(!host->health_enabled || now < host->health_delay_up_to)) continue;
 
             rrdhost_rdlock(host);
 
@@ -611,6 +624,8 @@ void *health_main(void *ptr) {
         }
         else
             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
+
+        now_boot = now_boottime_sec();
     }
 
     buffer_free(wb);
index 0e7a1efa17719fdbca5376d78c797cca624b0185..7840da865d0c8f02f26512fef5e6e5253d6fd85c 100644 (file)
--- a/src/rrd.h
+++ b/src/rrd.h
@@ -340,6 +340,7 @@ struct rrdhost {
     int rrd_history_entries;                        // the number of history entries for the host's charts
 
     int health_enabled;                             // 1 when this host has health enabled
+    time_t health_delay_up_to;                      // a timestamp to delay alarms processing up to
     RRD_MEMORY_MODE rrd_memory_mode;                // the memory more for the charts of this host
 
     RRDSET *rrdset_root;                            // the host charts
index 1ee410ec633922952efb87afcffd56ecf6614417..214aa0f81938ed876898751782334ad9d13033c1 100644 (file)
--- a/src/rrdpush.c
+++ b/src/rrdpush.c
@@ -404,6 +404,7 @@ int rrdpush_receive(int fd, const char *key, const char *hostname, const char *m
     int history = default_rrd_history_entries;
     RRD_MEMORY_MODE mode = default_rrd_memory_mode;
     int health_enabled = default_health_enabled;
+    time_t alarms_delay = 60;
 
     update_every = (int)appconfig_get_number(&stream_config, machine_guid, "update every", update_every);
     if(update_every < 0) update_every = 1;
@@ -418,6 +419,9 @@ int rrdpush_receive(int fd, const char *key, const char *hostname, const char *m
     health_enabled = appconfig_get_boolean_ondemand(&stream_config, key, "health enabled by default", health_enabled);
     health_enabled = appconfig_get_boolean_ondemand(&stream_config, machine_guid, "health enabled", health_enabled);
 
+    alarms_delay = appconfig_get_number(&stream_config, key, "default postpone alarms on connect seconds", alarms_delay);
+    alarms_delay = appconfig_get_number(&stream_config, machine_guid, "postpone alarms on connect seconds", alarms_delay);
+
     if(!strcmp(machine_guid, "localhost"))
         host = localhost;
     else
@@ -468,6 +472,8 @@ int rrdpush_receive(int fd, const char *key, const char *hostname, const char *m
 
     rrdhost_wrlock(host);
     host->use_counter++;
+    if(health_enabled != CONFIG_BOOLEAN_NO)
+        host->health_delay_up_to = now_realtime_sec() + alarms_delay;
     rrdhost_unlock(host);
 
     // call the plugins.d processor to receive the metrics