arthur.barton.de Git - netdata.git/blob - src/health.c
properly lock all sensitive linked lists
[netdata.git] / src / health.c
1 #define NETDATA_HEALTH_INTERNALS
2 #include "common.h"
3
4 int default_health_enabled = 1;
5
6 // ----------------------------------------------------------------------------
7 // health initialization
8
9 inline char *health_config_dir(void) {
10     char buffer[FILENAME_MAX + 1];
11     snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_config_dir);
12     return config_get("health", "health configuration directory", buffer);
13 }
14
15 void health_init(void) {
16     debug(D_HEALTH, "Health configuration initializing");
17
18     if(!central_netdata_to_push_data) {
19         if(!(default_health_enabled = config_get_boolean("health", "enabled", 1))) {
20             debug(D_HEALTH, "Health is disabled.");
21             return;
22         }
23     }
24     else {
25         info("Health is disabled - setup alarms at the central netdata.");
26         config_set_boolean("health", "enabled", 0);
27         default_health_enabled = 0;
28     }
29
30     char pathname[FILENAME_MAX + 1];
31     snprintfz(pathname, FILENAME_MAX, "%s/health", netdata_configured_varlib_dir);
32     if(mkdir(pathname, 0770) == -1 && errno != EEXIST)
33         fatal("Cannot create directory '%s'.", pathname);
34 }
35
36 // ----------------------------------------------------------------------------
37 // re-load health configuration
38
39 void health_reload_host(RRDHOST *host) {
40     char *path = health_config_dir();
41
42     // free all running alarms
43     rrdhost_wrlock(host);
44
45     while(host->templates)
46         rrdcalctemplate_free(host, host->templates);
47
48     while(host->alarms)
49         rrdcalc_free(host, host->alarms);
50
51     rrdhost_unlock(host);
52
53     // invalidate all previous entries in the alarm log
54     ALARM_ENTRY *t;
55     for(t = host->health_log.alarms ; t ; t = t->next) {
56         if(t->new_status != RRDCALC_STATUS_REMOVED)
57             t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
58     }
59
60     rrdhost_rdlock(host);
61     // reset all thresholds to all charts
62     RRDSET *st;
63     rrdset_foreach_read(st, host) {
64         st->green = NAN;
65         st->red = NAN;
66     }
67     rrdhost_unlock(host);
68
69     // load the new alarms
70     rrdhost_wrlock(host);
71     health_readdir(host, path);
72
73     // link the loaded alarms to their charts
74     rrdset_foreach_write(st, host) {
75         rrdsetcalc_link_matching(st);
76         rrdcalctemplate_link_matching(st);
77     }
78
79     rrdhost_unlock(host);
80 }
81
82 void health_reload(void) {
83
84     rrd_rdlock();
85
86     RRDHOST *host;
87     rrdhost_foreach_read(host)
88         health_reload_host(host);
89
90     rrd_unlock();
91 }
92
93 // ----------------------------------------------------------------------------
94 // health main thread and friends
95
96 static inline int rrdcalc_value2status(calculated_number n) {
97     if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED;
98     if(n) return RRDCALC_STATUS_RAISED;
99     return RRDCALC_STATUS_CLEAR;
100 }
101
102 #define ALARM_EXEC_COMMAND_LENGTH 8192
103
104 static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
105     ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
106
107     if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
108         // do not send notifications for internal statuses
109         debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
110         goto done;
111     }
112
113     if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
114         // do not send notifications for disabled statuses
115         debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
116         // mark it as run, so that we will send the same alarm if it happens again
117         goto done;
118     }
119
120     // find the previous notification for the same alarm
121     // which we have run the exec script
122     // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set
123     if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
124         uint32_t id = ae->alarm_id;
125         ALARM_ENTRY *t;
126         for(t = ae->next; t ; t = t->next) {
127             if(t->alarm_id == id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
128                 break;
129         }
130
131         if(likely(t)) {
132             // we have executed this alarm notification in the past
133             if(t && t->new_status == ae->new_status) {
134                 // don't send the notification for the same status again
135                 debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name
136                       , rrdcalc_status2string(ae->new_status));
137                 goto done;
138             }
139         }
140         else {
141             // we have not executed this alarm notification in the past
142             // so, don't send CLEAR notifications
143             if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
144                 debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
145                       , ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
146                 goto done;
147             }
148         }
149     }
150
151     static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1];
152     pid_t command_pid;
153
154     const char *exec      = (ae->exec)      ? ae->exec      : host->health_default_exec;
155     const char *recipient = (ae->recipient) ? ae->recipient : host->health_default_recipient;
156
157     snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s' '%s' '%s'",
158               exec,
159               recipient,
160               host->hostname,
161               ae->unique_id,
162               ae->alarm_id,
163               ae->alarm_event_id,
164               (unsigned long)ae->when,
165               ae->name,
166               ae->chart?ae->chart:"NOCAHRT",
167               ae->family?ae->family:"NOFAMILY",
168               rrdcalc_status2string(ae->new_status),
169               rrdcalc_status2string(ae->old_status),
170               ae->new_value,
171               ae->old_value,
172               ae->source?ae->source:"UNKNOWN",
173               (uint32_t)ae->duration,
174               (uint32_t)ae->non_clear_duration,
175               ae->units?ae->units:"",
176               ae->info?ae->info:"",
177               ae->new_value_string,
178               ae->old_value_string
179     );
180
181     ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
182     ae->exec_run_timestamp = now_realtime_sec();
183
184     debug(D_HEALTH, "executing command '%s'", command_to_run);
185     FILE *fp = mypopen(command_to_run, &command_pid);
186     if(!fp) {
187         error("HEALTH: Cannot popen(\"%s\", \"r\").", command_to_run);
188         goto done;
189     }
190     debug(D_HEALTH, "HEALTH reading from command");
191     char *s = fgets(command_to_run, FILENAME_MAX, fp);
192     (void)s;
193     ae->exec_code = mypclose(fp, command_pid);
194     debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
195
196     if(ae->exec_code != 0)
197         ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
198
199 done:
200     health_alarm_log_save(host, ae);
201     return;
202 }
203
204 static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
205     debug(D_HEALTH, "Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
206          ae->chart?ae->chart:"NOCHART", ae->name,
207          ae->new_value,
208          rrdcalc_status2string(ae->old_status),
209          rrdcalc_status2string(ae->new_status)
210     );
211
212     health_alarm_execute(host, ae);
213 }
214
215 static inline void health_alarm_log_process(RRDHOST *host) {
216     static uint32_t stop_at_id = 0;
217     uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
218     time_t now = now_realtime_sec();
219
220     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
221
222     ALARM_ENTRY *ae;
223     for(ae = host->health_log.alarms; ae && ae->unique_id >= stop_at_id ; ae = ae->next) {
224         if(unlikely(
225             !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
226             !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
227             )) {
228
229             if(unlikely(ae->unique_id < first_waiting))
230                 first_waiting = ae->unique_id;
231
232             if(likely(now >= ae->delay_up_to_timestamp))
233                 health_process_notifications(host, ae);
234         }
235     }
236
237     // remember this for the next iteration
238     stop_at_id = first_waiting;
239
240     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
241
242     if(host->health_log.count <= host->health_log.max)
243         return;
244
245     // cleanup excess entries in the log
246     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
247
248     ALARM_ENTRY *last = NULL;
249     unsigned int count = host->health_log.max * 2 / 3;
250     for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
251
252     if(ae && last && last->next == ae)
253         last->next = NULL;
254     else
255         ae = NULL;
256
257     while(ae) {
258         debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
259
260         ALARM_ENTRY *t = ae->next;
261
262         health_alarm_log_free_one_nochecks_nounlink(ae);
263
264         ae = t;
265         host->health_log.count--;
266     }
267
268     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
269 }
270
271 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
272     if(unlikely(!rc->rrdset)) {
273         debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
274         return 0;
275     }
276
277     if(unlikely(rc->next_update > now)) {
278         if (unlikely(*next_run > rc->next_update)) {
279             // update the next_run time of the main loop
280             // to run this alarm precisely the time required
281             *next_run = rc->next_update;
282         }
283
284         debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
285         return 0;
286     }
287
288     if(unlikely(!rc->update_every)) {
289         debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
290         return 0;
291     }
292
293     if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
294         debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
295         return 0;
296     }
297
298     int update_every = rc->rrdset->update_every;
299     time_t first = rrdset_first_entry_t(rc->rrdset);
300     time_t last = rrdset_last_entry_t(rc->rrdset);
301
302     if(unlikely(now + update_every < first /* || now - update_every > last */)) {
303         debug(D_HEALTH
304               , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
305               , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first
306               , (unsigned long) last);
307         return 0;
308     }
309
310     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
311         time_t needed = now + rc->before + rc->after;
312
313         if(needed + update_every < first || needed - update_every > last) {
314             debug(D_HEALTH
315                   , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
316                   , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first
317                   , (unsigned long) last);
318             return 0;
319         }
320     }
321
322     return 1;
323 }
324
325 void *health_main(void *ptr) {
326     struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
327
328     info("HEALTH thread created with task id %d", gettid());
329
330     if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
331         error("Cannot set pthread cancel type to DEFERRED.");
332
333     if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
334         error("Cannot set pthread cancel state to ENABLE.");
335
336     int min_run_every = (int)config_get_number("health", "run at least every seconds", 10);
337     if(min_run_every < 1) min_run_every = 1;
338
339     BUFFER *wb = buffer_create(100);
340
341     unsigned int loop = 0;
342     while(!netdata_exit) {
343         loop++;
344         debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
345
346         int oldstate, runnable = 0;
347         time_t now = now_realtime_sec();
348         time_t next_run = now + min_run_every;
349         RRDCALC *rc;
350
351         if(unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
352             error("Cannot set pthread cancel state to DISABLE.");
353
354         rrd_rdlock();
355
356         RRDHOST *host;
357         rrdhost_foreach_read(host) {
358             if(unlikely(!host->health_enabled)) continue;
359
360             rrdhost_rdlock(host);
361
362             // the first loop is to lookup values from the db
363             for(rc = host->alarms; rc; rc = rc->next) {
364                 if(unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
365                     if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))
366                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE;
367                     continue;
368                 }
369
370                 runnable++;
371                 rc->old_value = rc->value;
372                 rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE;
373
374                 // 1. if there is database lookup, do it
375                 // 2. if there is calculation expression, run it
376
377                 if(unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
378                     /* time_t old_db_timestamp = rc->db_before; */
379                     int value_is_null = 0;
380
381                     int ret = rrd2value(rc->rrdset, wb, &rc->value, rc->dimensions, 1, rc->after, rc->before, rc->group, rc->options, &rc->db_after, &rc->db_before, &value_is_null);
382
383                     if(unlikely(ret != 200)) {
384                         // database lookup failed
385                         rc->value = NAN;
386
387                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret);
388
389                         if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) {
390                             rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
391                             error("Health on host '%s', alarm '%s.%s': database lookup returned error %d", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret);
392                         }
393                     }
394                     else if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))
395                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
396
397                     /* - RRDCALC_FLAG_DB_STALE not currently used
398                     if (unlikely(old_db_timestamp == rc->db_before)) {
399                         // database is stale
400
401                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
402
403                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
404                             rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
405                             error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
406                         }
407                     }
408                     else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
409                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
410                     */
411
412                     if(unlikely(value_is_null)) {
413                         // collected value is null
414
415                         rc->value = NAN;
416
417                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name);
418
419                         if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) {
420                             rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
421                             error("Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", host->hostname,  rc->chart ? rc->chart : "NOCHART", rc->name);
422                         }
423                     }
424                     else if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))
425                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
426
427                     debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " CALCULATED_NUMBER_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->value);
428                 }
429
430                 if(unlikely(rc->calculation)) {
431                     if(unlikely(!expression_evaluate(rc->calculation))) {
432                         // calculation failed
433
434                         rc->value = NAN;
435
436                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
437
438                         if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) {
439                             rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
440                             error("Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", rc->chart ? rc->chart : "NOCHART", host->hostname,  rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
441                         }
442                     }
443                     else {
444                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))
445                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
446
447                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
448                                 CALCULATED_NUMBER_FORMAT
449                                 ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name
450                               , rc->calculation->parsed_as, rc->calculation->result,
451                                 buffer_tostring(rc->calculation->error_msg), rc->source
452                         );
453
454                         rc->value = rc->calculation->result;
455                     }
456                 }
457             }
458             rrdhost_unlock(host);
459
460             if(unlikely(runnable && !netdata_exit)) {
461                 rrdhost_rdlock(host);
462
463                 for(rc = host->alarms; rc; rc = rc->next) {
464                     if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
465                         continue;
466
467                     int warning_status = RRDCALC_STATUS_UNDEFINED;
468                     int critical_status = RRDCALC_STATUS_UNDEFINED;
469
470                     if(likely(rc->warning)) {
471                         if(unlikely(!expression_evaluate(rc->warning))) {
472                             // calculation failed
473
474                             debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
475
476                             if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) {
477                                 rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
478                                 error("Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
479                             }
480                         }
481                         else {
482                             if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))
483                                 rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
484
485                             debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source);
486
487                             warning_status = rrdcalc_value2status(rc->warning->result);
488                         }
489                     }
490
491                     if(likely(rc->critical)) {
492                         if(unlikely(!expression_evaluate(rc->critical))) {
493                             // calculation failed
494
495                             debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
496
497                             if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) {
498                                 rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
499                                 error("Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
500                             }
501                         }
502                         else {
503                             if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))
504                                 rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
505
506                             debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name , rc->critical->result, buffer_tostring(rc->critical->error_msg), rc->source);
507
508                             critical_status = rrdcalc_value2status(rc->critical->result);
509                         }
510                     }
511
512                     int status = RRDCALC_STATUS_UNDEFINED;
513
514                     switch(warning_status) {
515                         case RRDCALC_STATUS_CLEAR:
516                             status = RRDCALC_STATUS_CLEAR;
517                             break;
518
519                         case RRDCALC_STATUS_RAISED:
520                             status = RRDCALC_STATUS_WARNING;
521                             break;
522
523                         default:
524                             break;
525                     }
526
527                     switch(critical_status) {
528                         case RRDCALC_STATUS_CLEAR:
529                             if(status == RRDCALC_STATUS_UNDEFINED)
530                                 status = RRDCALC_STATUS_CLEAR;
531                             break;
532
533                         case RRDCALC_STATUS_RAISED:
534                             status = RRDCALC_STATUS_CRITICAL;
535                             break;
536
537                         default:
538                             break;
539                     }
540
541                     if(status != rc->status) {
542                         int delay = 0;
543
544                         if(now > rc->delay_up_to_timestamp) {
545                             rc->delay_up_current = rc->delay_up_duration;
546                             rc->delay_down_current = rc->delay_down_duration;
547                             rc->delay_last = 0;
548                             rc->delay_up_to_timestamp = 0;
549                         }
550                         else {
551                             rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier);
552                             if(rc->delay_up_current > rc->delay_max_duration)
553                                 rc->delay_up_current = rc->delay_max_duration;
554
555                             rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier);
556                             if(rc->delay_down_current > rc->delay_max_duration)
557                                 rc->delay_down_current = rc->delay_max_duration;
558                         }
559
560                         if(status > rc->status)
561                             delay = rc->delay_up_current;
562                         else
563                             delay = rc->delay_down_current;
564
565                         // COMMENTED: because we do need to send raising alarms
566                         // if(now + delay < rc->delay_up_to_timestamp)
567                         //    delay = (int)(rc->delay_up_to_timestamp - now);
568
569                         rc->delay_last = delay;
570                         rc->delay_up_to_timestamp = now + delay;
571                         health_alarm_log(
572                                 host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id
573                                 , rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change
574                                 , rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info
575                                 , rc->delay_last, (rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)
576                                                   ? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0
577                         );
578                         rc->last_status_change = now;
579                         rc->status = status;
580                     }
581
582                     rc->last_updated = now;
583                     rc->next_update = now + rc->update_every;
584
585                     if(next_run > rc->next_update)
586                         next_run = rc->next_update;
587                 }
588
589                 rrdhost_unlock(host);
590             }
591
592             if(unlikely(netdata_exit))
593                 break;
594
595             // execute notifications
596             // and cleanup
597             health_alarm_log_process(host);
598
599             if(unlikely(netdata_exit))
600                 break;
601
602         } /* host loop */
603
604         rrd_unlock();
605
606         if(unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
607             error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
608
609         if(unlikely(netdata_exit))
610             break;
611
612         now = now_realtime_sec();
613         if(now < next_run) {
614             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
615             sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now));
616         }
617         else
618             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
619     }
620
621     buffer_free(wb);
622
623     info("HEALTH thread exiting");
624
625     static_thread->enabled = 0;
626     pthread_exit(NULL);
627     return NULL;
628 }