]> arthur.barton.de Git - netdata.git/blob - src/health.c
9e02b5449b7b45d1bf78223028350b15a4d125ca
[netdata.git] / src / health.c
1 #define NETDATA_HEALTH_INTERNALS
2 #include "common.h"
3
// Global default for whether health monitoring is enabled (starts at 1).
// health_init() overwrites it from the [health] "enabled" setting, or forces
// it to 0 when rrdpush_exclusive is set.
4 int default_health_enabled = 1;
5
6 // ----------------------------------------------------------------------------
7 // health initialization
8
9 inline char *health_config_dir(void) {
10     char buffer[FILENAME_MAX + 1];
11     snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_config_dir);
12     return config_get("health", "health configuration directory", buffer);
13 }
14
15 void health_init(void) {
16     debug(D_HEALTH, "Health configuration initializing");
17
18     if(!rrdpush_exclusive) {
19         if(!(default_health_enabled = config_get_boolean("health", "enabled", 1))) {
20             debug(D_HEALTH, "Health is disabled.");
21             return;
22         }
23     }
24     else {
25         info("Health is disabled - setup alarms at the central netdata.");
26         config_set_boolean("health", "enabled", 0);
27         default_health_enabled = 0;
28     }
29 }
30
31 // ----------------------------------------------------------------------------
32 // re-load health configuration
33
34 void health_reload_host(RRDHOST *host) {
35     char *path = health_config_dir();
36
37     // free all running alarms
38     rrdhost_wrlock(host);
39
40     while(host->templates)
41         rrdcalctemplate_free(host, host->templates);
42
43     while(host->alarms)
44         rrdcalc_free(host, host->alarms);
45
46     rrdhost_unlock(host);
47
48     // invalidate all previous entries in the alarm log
49     ALARM_ENTRY *t;
50     for(t = host->health_log.alarms ; t ; t = t->next) {
51         if(t->new_status != RRDCALC_STATUS_REMOVED)
52             t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
53     }
54
55     rrdhost_rdlock(host);
56     // reset all thresholds to all charts
57     RRDSET *st;
58     rrdset_foreach_read(st, host) {
59         st->green = NAN;
60         st->red = NAN;
61     }
62     rrdhost_unlock(host);
63
64     // load the new alarms
65     rrdhost_wrlock(host);
66     health_readdir(host, path);
67
68     // link the loaded alarms to their charts
69     rrdset_foreach_write(st, host) {
70         rrdsetcalc_link_matching(st);
71         rrdcalctemplate_link_matching(st);
72     }
73
74     rrdhost_unlock(host);
75 }
76
77 void health_reload(void) {
78
79     rrd_rdlock();
80
81     RRDHOST *host;
82     rrdhost_foreach_read(host)
83         health_reload_host(host);
84
85     rrd_unlock();
86 }
87
88 // ----------------------------------------------------------------------------
89 // health main thread and friends
90
91 static inline int rrdcalc_value2status(calculated_number n) {
92     if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED;
93     if(n) return RRDCALC_STATUS_RAISED;
94     return RRDCALC_STATUS_CLEAR;
95 }
96
97 #define ALARM_EXEC_COMMAND_LENGTH 8192
98
99 static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
100     ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
101
102     if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
103         // do not send notifications for internal statuses
104         debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
105         goto done;
106     }
107
108     if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
109         // do not send notifications for disabled statuses
110         debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
111         // mark it as run, so that we will send the same alarm if it happens again
112         goto done;
113     }
114
115     // find the previous notification for the same alarm
116     // which we have run the exec script
117     // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set
118     if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
119         uint32_t id = ae->alarm_id;
120         ALARM_ENTRY *t;
121         for(t = ae->next; t ; t = t->next) {
122             if(t->alarm_id == id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
123                 break;
124         }
125
126         if(likely(t)) {
127             // we have executed this alarm notification in the past
128             if(t && t->new_status == ae->new_status) {
129                 // don't send the notification for the same status again
130                 debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name
131                       , rrdcalc_status2string(ae->new_status));
132                 goto done;
133             }
134         }
135         else {
136             // we have not executed this alarm notification in the past
137             // so, don't send CLEAR notifications
138             if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
139                 debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
140                       , ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
141                 goto done;
142             }
143         }
144     }
145
146     static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1];
147     pid_t command_pid;
148
149     const char *exec      = (ae->exec)      ? ae->exec      : host->health_default_exec;
150     const char *recipient = (ae->recipient) ? ae->recipient : host->health_default_recipient;
151
152     snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s' '%s' '%s'",
153               exec,
154               recipient,
155               host->hostname,
156               ae->unique_id,
157               ae->alarm_id,
158               ae->alarm_event_id,
159               (unsigned long)ae->when,
160               ae->name,
161               ae->chart?ae->chart:"NOCAHRT",
162               ae->family?ae->family:"NOFAMILY",
163               rrdcalc_status2string(ae->new_status),
164               rrdcalc_status2string(ae->old_status),
165               ae->new_value,
166               ae->old_value,
167               ae->source?ae->source:"UNKNOWN",
168               (uint32_t)ae->duration,
169               (uint32_t)ae->non_clear_duration,
170               ae->units?ae->units:"",
171               ae->info?ae->info:"",
172               ae->new_value_string,
173               ae->old_value_string
174     );
175
176     ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
177     ae->exec_run_timestamp = now_realtime_sec();
178
179     debug(D_HEALTH, "executing command '%s'", command_to_run);
180     FILE *fp = mypopen(command_to_run, &command_pid);
181     if(!fp) {
182         error("HEALTH: Cannot popen(\"%s\", \"r\").", command_to_run);
183         goto done;
184     }
185     debug(D_HEALTH, "HEALTH reading from command");
186     char *s = fgets(command_to_run, FILENAME_MAX, fp);
187     (void)s;
188     ae->exec_code = mypclose(fp, command_pid);
189     debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
190
191     if(ae->exec_code != 0)
192         ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
193
194 done:
195     health_alarm_log_save(host, ae);
196     return;
197 }
198
199 static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
200     debug(D_HEALTH, "Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
201          ae->chart?ae->chart:"NOCHART", ae->name,
202          ae->new_value,
203          rrdcalc_status2string(ae->old_status),
204          rrdcalc_status2string(ae->new_status)
205     );
206
207     health_alarm_execute(host, ae);
208 }
209
210 static inline void health_alarm_log_process(RRDHOST *host) {
211     static uint32_t stop_at_id = 0;
212     uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
213     time_t now = now_realtime_sec();
214
215     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
216
217     ALARM_ENTRY *ae;
218     for(ae = host->health_log.alarms; ae && ae->unique_id >= stop_at_id ; ae = ae->next) {
219         if(unlikely(
220             !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
221             !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
222             )) {
223
224             if(unlikely(ae->unique_id < first_waiting))
225                 first_waiting = ae->unique_id;
226
227             if(likely(now >= ae->delay_up_to_timestamp))
228                 health_process_notifications(host, ae);
229         }
230     }
231
232     // remember this for the next iteration
233     stop_at_id = first_waiting;
234
235     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
236
237     if(host->health_log.count <= host->health_log.max)
238         return;
239
240     // cleanup excess entries in the log
241     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
242
243     ALARM_ENTRY *last = NULL;
244     unsigned int count = host->health_log.max * 2 / 3;
245     for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
246
247     if(ae && last && last->next == ae)
248         last->next = NULL;
249     else
250         ae = NULL;
251
252     while(ae) {
253         debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
254
255         ALARM_ENTRY *t = ae->next;
256
257         health_alarm_log_free_one_nochecks_nounlink(ae);
258
259         ae = t;
260         host->health_log.count--;
261     }
262
263     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
264 }
265
266 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
267     if(unlikely(!rc->rrdset)) {
268         debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
269         return 0;
270     }
271
272     if(unlikely(rc->next_update > now)) {
273         if (unlikely(*next_run > rc->next_update)) {
274             // update the next_run time of the main loop
275             // to run this alarm precisely the time required
276             *next_run = rc->next_update;
277         }
278
279         debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
280         return 0;
281     }
282
283     if(unlikely(!rc->update_every)) {
284         debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
285         return 0;
286     }
287
288     if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
289         debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
290         return 0;
291     }
292
293     int update_every = rc->rrdset->update_every;
294     time_t first = rrdset_first_entry_t(rc->rrdset);
295     time_t last = rrdset_last_entry_t(rc->rrdset);
296
297     if(unlikely(now + update_every < first /* || now - update_every > last */)) {
298         debug(D_HEALTH
299               , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
300               , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first
301               , (unsigned long) last);
302         return 0;
303     }
304
305     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
306         time_t needed = now + rc->before + rc->after;
307
308         if(needed + update_every < first || needed - update_every > last) {
309             debug(D_HEALTH
310                   , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
311                   , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first
312                   , (unsigned long) last);
313             return 0;
314         }
315     }
316
317     return 1;
318 }
319
320 void *health_main(void *ptr) {
321     struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
322
323     info("HEALTH thread created with task id %d", gettid());
324
325     if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
326         error("Cannot set pthread cancel type to DEFERRED.");
327
328     if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
329         error("Cannot set pthread cancel state to ENABLE.");
330
331     int min_run_every = (int)config_get_number("health", "run at least every seconds", 10);
332     if(min_run_every < 1) min_run_every = 1;
333
334     BUFFER *wb = buffer_create(100);
335
336     time_t now               = now_realtime_sec();
337     time_t now_boottime      = now_boottime_sec();
338     time_t last_now          = now;
339     time_t last_now_boottime = now_boottime;
340     time_t hibernation_delay = config_get_number("health", "postpone alarms during hibernation for seconds", 60);
341
342     unsigned int loop = 0;
343     while(!netdata_exit) {
344         loop++;
345         debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
346
347         int oldstate, runnable = 0, apply_hibernation_delay = 0;
348         time_t next_run = now + min_run_every;
349         RRDCALC *rc;
350
351         // detect if boottime and realtime have twice the difference
352         // in which case we assume the system was just waken from hibernation
353         if(unlikely(now - last_now > 2 * (now_boottime - last_now_boottime)))
354             apply_hibernation_delay = 1;
355
356         last_now = now;
357         last_now_boottime = now_boottime;
358
359         if(unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
360             error("Cannot set pthread cancel state to DISABLE.");
361
362         rrd_rdlock();
363
364         RRDHOST *host;
365         rrdhost_foreach_read(host) {
366             if(unlikely(apply_hibernation_delay)) {
367
368                 info("Postponing alarm checks for %ld seconds, on host '%s', due to boottime discrepancy (realtime dt: %ld, boottime dt: %ld)."
369                      , hibernation_delay
370                      , host->hostname
371                      , (long)(now - last_now)
372                      , (long)(now_boottime - last_now_boottime)
373                 );
374
375                 host->health_delay_up_to = now + hibernation_delay;
376             }
377
378             if(unlikely(!host->health_enabled || now < host->health_delay_up_to))
379                 continue;
380
381             rrdhost_rdlock(host);
382
383             // the first loop is to lookup values from the db
384             for(rc = host->alarms; rc; rc = rc->next) {
385                 if(unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
386                     if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))
387                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE;
388                     continue;
389                 }
390
391                 runnable++;
392                 rc->old_value = rc->value;
393                 rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE;
394
395                 // ------------------------------------------------------------
396                 // if there is database lookup, do it
397
398                 if(unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
399                     /* time_t old_db_timestamp = rc->db_before; */
400                     int value_is_null = 0;
401
402                     int ret = rrd2value(rc->rrdset
403                                         , wb
404                                         , &rc->value
405                                         , rc->dimensions
406                                         , 1
407                                         , rc->after
408                                         , rc->before
409                                         , rc->group
410                                         , rc->options
411                                         , &rc->db_after
412                                         , &rc->db_before
413                                         , &value_is_null
414                     );
415
416                     if(unlikely(ret != 200)) {
417                         // database lookup failed
418                         rc->value = NAN;
419                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
420
421                         debug(D_HEALTH
422                               , "Health on host '%s', alarm '%s.%s': database lookup returned error %d"
423                               , host->hostname
424                               , rc->chart ? rc->chart : "NOCHART"
425                               , rc->name
426                               , ret
427                         );
428                     }
429                     else
430                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
431
432                     /* - RRDCALC_FLAG_DB_STALE not currently used
433                     if (unlikely(old_db_timestamp == rc->db_before)) {
434                         // database is stale
435
436                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
437
438                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
439                             rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
440                             error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
441                         }
442                     }
443                     else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
444                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
445                     */
446
447                     if(unlikely(value_is_null)) {
448                         // collected value is null
449                         rc->value = NAN;
450                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
451
452                         debug(D_HEALTH
453                               , "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)"
454                               , host->hostname
455                               , rc->chart ? rc->chart : "NOCHART"
456                               , rc->name
457                         );
458                     }
459                     else
460                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
461
462                     debug(D_HEALTH
463                           , "Health on host '%s', alarm '%s.%s': database lookup gave value " CALCULATED_NUMBER_FORMAT
464                           , host->hostname
465                           , rc->chart ? rc->chart : "NOCHART"
466                           , rc->name
467                           , rc->value
468                     );
469                 }
470
471                 // ------------------------------------------------------------
472                 // if there is calculation expression, run it
473
474                 if(unlikely(rc->calculation)) {
475                     if(unlikely(!expression_evaluate(rc->calculation))) {
476                         // calculation failed
477                         rc->value = NAN;
478                         rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
479
480                         debug(D_HEALTH
481                               , "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s"
482                               , host->hostname
483                               , rc->chart ? rc->chart : "NOCHART"
484                               , rc->name
485                               , rc->calculation->parsed_as
486                               , buffer_tostring(rc->calculation->error_msg)
487                         );
488                     }
489                     else {
490                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
491
492                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)"
493                               , host->hostname
494                               , rc->chart ? rc->chart : "NOCHART"
495                               , rc->name
496                               , rc->calculation->parsed_as
497                               , rc->calculation->result
498                               , buffer_tostring(rc->calculation->error_msg)
499                               , rc->source
500                         );
501
502                         rc->value = rc->calculation->result;
503                     }
504                 }
505             }
506             rrdhost_unlock(host);
507
508             if(unlikely(runnable && !netdata_exit)) {
509                 rrdhost_rdlock(host);
510
511                 for(rc = host->alarms; rc; rc = rc->next) {
512                     if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
513                         continue;
514
515                     int warning_status  = RRDCALC_STATUS_UNDEFINED;
516                     int critical_status = RRDCALC_STATUS_UNDEFINED;
517
518                     // --------------------------------------------------------
519                     // check the warning expression
520
521                     if(likely(rc->warning)) {
522                         if(unlikely(!expression_evaluate(rc->warning))) {
523                             // calculation failed
524                             rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
525
526                             debug(D_HEALTH
527                                   , "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s"
528                                   , host->hostname
529                                   , rc->chart ? rc->chart : "NOCHART"
530                                   , rc->name
531                                   , buffer_tostring(rc->warning->error_msg)
532                             );
533                         }
534                         else {
535                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
536                             debug(D_HEALTH
537                                   , "Health on host '%s', alarm '%s.%s': warning expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)"
538                                   , host->hostname
539                                   , rc->chart ? rc->chart : "NOCHART"
540                                   , rc->name
541                                   , rc->warning->result
542                                   , buffer_tostring(rc->warning->error_msg)
543                                   , rc->source
544                             );
545                             warning_status = rrdcalc_value2status(rc->warning->result);
546                         }
547                     }
548
549                     // --------------------------------------------------------
550                     // check the critical expression
551
552                     if(likely(rc->critical)) {
553                         if(unlikely(!expression_evaluate(rc->critical))) {
554                             // calculation failed
555                             rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
556
557                             debug(D_HEALTH
558                                   , "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s"
559                                   , host->hostname
560                                   , rc->chart ? rc->chart : "NOCHART"
561                                   , rc->name
562                                   , buffer_tostring(rc->critical->error_msg)
563                             );
564                         }
565                         else {
566                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
567                             debug(D_HEALTH
568                                   , "Health on host '%s', alarm '%s.%s': critical expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)"
569                                   , host->hostname
570                                   , rc->chart ? rc->chart : "NOCHART"
571                                   , rc->name
572                                   , rc->critical->result
573                                   , buffer_tostring(rc->critical->error_msg)
574                                   , rc->source
575                             );
576                             critical_status = rrdcalc_value2status(rc->critical->result);
577                         }
578                     }
579
580                     // --------------------------------------------------------
581                     // decide the final alarm status
582
583                     int status = RRDCALC_STATUS_UNDEFINED;
584
585                     switch(warning_status) {
586                         case RRDCALC_STATUS_CLEAR:
587                             status = RRDCALC_STATUS_CLEAR;
588                             break;
589
590                         case RRDCALC_STATUS_RAISED:
591                             status = RRDCALC_STATUS_WARNING;
592                             break;
593
594                         default:
595                             break;
596                     }
597
598                     switch(critical_status) {
599                         case RRDCALC_STATUS_CLEAR:
600                             if(status == RRDCALC_STATUS_UNDEFINED)
601                                 status = RRDCALC_STATUS_CLEAR;
602                             break;
603
604                         case RRDCALC_STATUS_RAISED:
605                             status = RRDCALC_STATUS_CRITICAL;
606                             break;
607
608                         default:
609                             break;
610                     }
611
612                     // --------------------------------------------------------
613                     // check if the new status and the old differ
614
615                     if(status != rc->status) {
616                         int delay = 0;
617
618                         // apply trigger hysteresis
619
620                         if(now > rc->delay_up_to_timestamp) {
621                             rc->delay_up_current = rc->delay_up_duration;
622                             rc->delay_down_current = rc->delay_down_duration;
623                             rc->delay_last = 0;
624                             rc->delay_up_to_timestamp = 0;
625                         }
626                         else {
627                             rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier);
628                             if(rc->delay_up_current > rc->delay_max_duration)
629                                 rc->delay_up_current = rc->delay_max_duration;
630
631                             rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier);
632                             if(rc->delay_down_current > rc->delay_max_duration)
633                                 rc->delay_down_current = rc->delay_max_duration;
634                         }
635
636                         if(status > rc->status)
637                             delay = rc->delay_up_current;
638                         else
639                             delay = rc->delay_down_current;
640
641                         // COMMENTED: because we do need to send raising alarms
642                         // if(now + delay < rc->delay_up_to_timestamp)
643                         //    delay = (int)(rc->delay_up_to_timestamp - now);
644
645                         rc->delay_last = delay;
646                         rc->delay_up_to_timestamp = now + delay;
647
648                         // add the alarm into the log
649
650                         health_alarm_log(
651                                 host
652                                 , rc->id
653                                 , rc->next_event_id++
654                                 , now
655                                 , rc->name
656                                 , rc->rrdset->id
657                                 , rc->rrdset->family
658                                 , rc->exec
659                                 , rc->recipient
660                                 , now - rc->last_status_change
661                                 , rc->old_value
662                                 , rc->value
663                                 , rc->status
664                                 , status
665                                 , rc->source
666                                 , rc->units
667                                 , rc->info
668                                 , rc->delay_last
669                                 , (rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION) ? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0
670                         );
671
672                         rc->last_status_change = now;
673                         rc->status = status;
674                     }
675
676                     rc->last_updated = now;
677                     rc->next_update = now + rc->update_every;
678
679                     if(next_run > rc->next_update)
680                         next_run = rc->next_update;
681                 }
682
683                 rrdhost_unlock(host);
684             }
685
686             if(unlikely(netdata_exit))
687                 break;
688
689             // execute notifications
690             // and cleanup
691             health_alarm_log_process(host);
692
693             if(unlikely(netdata_exit))
694                 break;
695
696         } /* rrdhost_foreach */
697
698         rrd_unlock();
699
700         if(unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
701             error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
702
703         if(unlikely(netdata_exit))
704             break;
705
706         now = now_realtime_sec();
707         if(now < next_run) {
708             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
709             sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now));
710             now = now_realtime_sec();
711         }
712         else
713             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
714
715         now_boottime = now_boottime_sec();
716
717     } // forever
718
719     buffer_free(wb);
720
721     info("HEALTH thread exiting");
722
723     static_thread->enabled = 0;
724     pthread_exit(NULL);
725     return NULL;
726 }