]> arthur.barton.de Git - netdata.git/blob - src/health.c
7dbf7292c13c886b4e9da97314c7fa5fd3c1ef66
[netdata.git] / src / health.c
1 #define NETDATA_HEALTH_INTERNALS
2 #include "common.h"
3
4 int default_health_enabled = 1;
5
6 // ----------------------------------------------------------------------------
7 // health initialization
8
9 inline char *health_config_dir(void) {
10     char buffer[FILENAME_MAX + 1];
11     snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_config_dir);
12     return config_get("health", "health configuration directory", buffer);
13 }
14
15 void health_init(void) {
16     debug(D_HEALTH, "Health configuration initializing");
17
18     if(!central_netdata_to_push_data) {
19         if(!(default_health_enabled = config_get_boolean("health", "enabled", 1))) {
20             debug(D_HEALTH, "Health is disabled.");
21             return;
22         }
23     }
24     else {
25         info("Health is disabled - setup alarms at the central netdata.");
26         config_set_boolean("health", "enabled", 0);
27         default_health_enabled = 0;
28     }
29
30     char pathname[FILENAME_MAX + 1];
31     snprintfz(pathname, FILENAME_MAX, "%s/health", netdata_configured_varlib_dir);
32     if(mkdir(pathname, 0770) == -1 && errno != EEXIST)
33         fatal("Cannot create directory '%s'.", pathname);
34 }
35
36 // ----------------------------------------------------------------------------
37 // re-load health configuration
38
39 void health_reload_host(RRDHOST *host) {
40     char *path = health_config_dir();
41
42     // free all running alarms
43     rrdhost_wrlock(host);
44
45     while(host->templates)
46         rrdcalctemplate_free(host, host->templates);
47
48     while(host->alarms)
49         rrdcalc_free(host, host->alarms);
50
51     rrdhost_unlock(host);
52
53     // invalidate all previous entries in the alarm log
54     ALARM_ENTRY *t;
55     for(t = host->health_log.alarms ; t ; t = t->next) {
56         if(t->new_status != RRDCALC_STATUS_REMOVED)
57             t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
58     }
59
60     // reset all thresholds to all charts
61     RRDSET *st;
62     for(st = host->rrdset_root; st ; st = st->next) {
63         st->green = NAN;
64         st->red = NAN;
65     }
66
67     // load the new alarms
68     rrdhost_wrlock(host);
69     health_readdir(host, path);
70     rrdhost_unlock(host);
71
72     // link the loaded alarms to their charts
73     for(st = host->rrdset_root; st ; st = st->next) {
74         rrdhost_wrlock(host);
75
76         rrdsetcalc_link_matching(st);
77         rrdcalctemplate_link_matching(st);
78
79         rrdhost_unlock(host);
80     }
81 }
82
83 void health_reload(void) {
84     RRDHOST *host;
85
86     for(host = localhost; host ; host = host->next)
87         health_reload_host(host);
88 }
89
90 // ----------------------------------------------------------------------------
91 // health main thread and friends
92
93 static inline int rrdcalc_value2status(calculated_number n) {
94     if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED;
95     if(n) return RRDCALC_STATUS_RAISED;
96     return RRDCALC_STATUS_CLEAR;
97 }
98
99 #define ALARM_EXEC_COMMAND_LENGTH 8192
100
101 static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
102     ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
103
104     if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
105         // do not send notifications for internal statuses
106         debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
107         goto done;
108     }
109
110     if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
111         // do not send notifications for disabled statuses
112         debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
113         // mark it as run, so that we will send the same alarm if it happens again
114         goto done;
115     }
116
117     // find the previous notification for the same alarm
118     // which we have run the exec script
119     // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set
120     if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
121         uint32_t id = ae->alarm_id;
122         ALARM_ENTRY *t;
123         for(t = ae->next; t ; t = t->next) {
124             if(t->alarm_id == id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
125                 break;
126         }
127
128         if(likely(t)) {
129             // we have executed this alarm notification in the past
130             if(t && t->new_status == ae->new_status) {
131                 // don't send the notification for the same status again
132                 debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name
133                       , rrdcalc_status2string(ae->new_status));
134                 goto done;
135             }
136         }
137         else {
138             // we have not executed this alarm notification in the past
139             // so, don't send CLEAR notifications
140             if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
141                 debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
142                       , ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
143                 goto done;
144             }
145         }
146     }
147
148     static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1];
149     pid_t command_pid;
150
151     const char *exec      = (ae->exec)      ? ae->exec      : host->health_default_exec;
152     const char *recipient = (ae->recipient) ? ae->recipient : host->health_default_recipient;
153
154     snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s' '%s' '%s'",
155               exec,
156               recipient,
157               host->hostname,
158               ae->unique_id,
159               ae->alarm_id,
160               ae->alarm_event_id,
161               (unsigned long)ae->when,
162               ae->name,
163               ae->chart?ae->chart:"NOCAHRT",
164               ae->family?ae->family:"NOFAMILY",
165               rrdcalc_status2string(ae->new_status),
166               rrdcalc_status2string(ae->old_status),
167               ae->new_value,
168               ae->old_value,
169               ae->source?ae->source:"UNKNOWN",
170               (uint32_t)ae->duration,
171               (uint32_t)ae->non_clear_duration,
172               ae->units?ae->units:"",
173               ae->info?ae->info:"",
174               ae->new_value_string,
175               ae->old_value_string
176     );
177
178     ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
179     ae->exec_run_timestamp = now_realtime_sec();
180
181     debug(D_HEALTH, "executing command '%s'", command_to_run);
182     FILE *fp = mypopen(command_to_run, &command_pid);
183     if(!fp) {
184         error("HEALTH: Cannot popen(\"%s\", \"r\").", command_to_run);
185         goto done;
186     }
187     debug(D_HEALTH, "HEALTH reading from command");
188     char *s = fgets(command_to_run, FILENAME_MAX, fp);
189     (void)s;
190     ae->exec_code = mypclose(fp, command_pid);
191     debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
192
193     if(ae->exec_code != 0)
194         ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
195
196 done:
197     health_alarm_log_save(host, ae);
198     return;
199 }
200
201 static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
202     debug(D_HEALTH, "Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
203          ae->chart?ae->chart:"NOCHART", ae->name,
204          ae->new_value,
205          rrdcalc_status2string(ae->old_status),
206          rrdcalc_status2string(ae->new_status)
207     );
208
209     health_alarm_execute(host, ae);
210 }
211
212 static inline void health_alarm_log_process(RRDHOST *host) {
213     static uint32_t stop_at_id = 0;
214     uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
215     time_t now = now_realtime_sec();
216
217     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
218
219     ALARM_ENTRY *ae;
220     for(ae = host->health_log.alarms; ae && ae->unique_id >= stop_at_id ; ae = ae->next) {
221         if(unlikely(
222             !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
223             !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
224             )) {
225
226             if(unlikely(ae->unique_id < first_waiting))
227                 first_waiting = ae->unique_id;
228
229             if(likely(now >= ae->delay_up_to_timestamp))
230                 health_process_notifications(host, ae);
231         }
232     }
233
234     // remember this for the next iteration
235     stop_at_id = first_waiting;
236
237     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
238
239     if(host->health_log.count <= host->health_log.max)
240         return;
241
242     // cleanup excess entries in the log
243     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
244
245     ALARM_ENTRY *last = NULL;
246     unsigned int count = host->health_log.max * 2 / 3;
247     for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
248
249     if(ae && last && last->next == ae)
250         last->next = NULL;
251     else
252         ae = NULL;
253
254     while(ae) {
255         debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
256
257         ALARM_ENTRY *t = ae->next;
258
259         health_alarm_log_free_one_nochecks_nounlink(ae);
260
261         ae = t;
262         host->health_log.count--;
263     }
264
265     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
266 }
267
268 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
269     if(unlikely(!rc->rrdset)) {
270         debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
271         return 0;
272     }
273
274     if(unlikely(rc->next_update > now)) {
275         if (unlikely(*next_run > rc->next_update)) {
276             // update the next_run time of the main loop
277             // to run this alarm precisely the time required
278             *next_run = rc->next_update;
279         }
280
281         debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
282         return 0;
283     }
284
285     if(unlikely(!rc->update_every)) {
286         debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
287         return 0;
288     }
289
290     if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
291         debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
292         return 0;
293     }
294
295     int update_every = rc->rrdset->update_every;
296     time_t first = rrdset_first_entry_t(rc->rrdset);
297     time_t last = rrdset_last_entry_t(rc->rrdset);
298
299     if(unlikely(now + update_every < first /* || now - update_every > last */)) {
300         debug(D_HEALTH
301               , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
302               , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first
303               , (unsigned long) last);
304         return 0;
305     }
306
307     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
308         time_t needed = now + rc->before + rc->after;
309
310         if(needed + update_every < first || needed - update_every > last) {
311             debug(D_HEALTH
312                   , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
313                   , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first
314                   , (unsigned long) last);
315             return 0;
316         }
317     }
318
319     return 1;
320 }
321
322 void *health_main(void *ptr) {
323     struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
324
325     info("HEALTH thread created with task id %d", gettid());
326
327     if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
328         error("Cannot set pthread cancel type to DEFERRED.");
329
330     if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
331         error("Cannot set pthread cancel state to ENABLE.");
332
333     int min_run_every = (int)config_get_number("health", "run at least every seconds", 10);
334     if(min_run_every < 1) min_run_every = 1;
335
336     BUFFER *wb = buffer_create(100);
337
338     unsigned int loop = 0;
339     while(!netdata_exit) {
340         loop++;
341         debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
342
343         int oldstate, runnable = 0;
344         time_t now = now_realtime_sec();
345         time_t next_run = now + min_run_every;
346         RRDCALC *rc;
347
348         if(unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
349             error("Cannot set pthread cancel state to DISABLE.");
350
351         RRDHOST *host;
352         for(host = localhost; host ; host = host->next) {
353             if(unlikely(!host->health_enabled)) continue;
354
355             rrdhost_rdlock(host);
356
357             // the first loop is to lookup values from the db
358             for(rc = host->alarms; rc; rc = rc->next) {
359                 if(unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
360                     if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))
361                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE;
362                     continue;
363                 }
364
365                 runnable++;
366                 rc->old_value = rc->value;
367                 rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE;
368
369                 // 1. if there is database lookup, do it
370                 // 2. if there is calculation expression, run it
371
372                 if(unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
373                     /* time_t old_db_timestamp = rc->db_before; */
374                     int value_is_null = 0;
375
376                     int ret = rrd2value(rc->rrdset, wb, &rc->value, rc->dimensions, 1, rc->after, rc->before, rc->group, rc->options, &rc->db_after, &rc->db_before, &value_is_null);
377
378                     if(unlikely(ret != 200)) {
379                         // database lookup failed
380                         rc->value = NAN;
381
382                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret);
383
384                         if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) {
385                             rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
386                             error("Health on host '%s', alarm '%s.%s': database lookup returned error %d", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret);
387                         }
388                     }
389                     else if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))
390                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
391
392                     /* - RRDCALC_FLAG_DB_STALE not currently used
393                     if (unlikely(old_db_timestamp == rc->db_before)) {
394                         // database is stale
395
396                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
397
398                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
399                             rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
400                             error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
401                         }
402                     }
403                     else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
404                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
405                     */
406
407                     if(unlikely(value_is_null)) {
408                         // collected value is null
409
410                         rc->value = NAN;
411
412                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name);
413
414                         if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) {
415                             rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
416                             error("Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", host->hostname,  rc->chart ? rc->chart : "NOCHART", rc->name);
417                         }
418                     }
419                     else if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))
420                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
421
422                     debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " CALCULATED_NUMBER_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->value);
423                 }
424
425                 if(unlikely(rc->calculation)) {
426                     if(unlikely(!expression_evaluate(rc->calculation))) {
427                         // calculation failed
428
429                         rc->value = NAN;
430
431                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
432
433                         if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) {
434                             rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
435                             error("Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", rc->chart ? rc->chart : "NOCHART", host->hostname,  rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
436                         }
437                     }
438                     else {
439                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))
440                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
441
442                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
443                                 CALCULATED_NUMBER_FORMAT
444                                 ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name
445                               , rc->calculation->parsed_as, rc->calculation->result,
446                                 buffer_tostring(rc->calculation->error_msg), rc->source
447                         );
448
449                         rc->value = rc->calculation->result;
450                     }
451                 }
452             }
453             rrdhost_unlock(host);
454
455             if(unlikely(runnable && !netdata_exit)) {
456                 rrdhost_rdlock(host);
457
458                 for(rc = host->alarms; rc; rc = rc->next) {
459                     if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
460                         continue;
461
462                     int warning_status = RRDCALC_STATUS_UNDEFINED;
463                     int critical_status = RRDCALC_STATUS_UNDEFINED;
464
465                     if(likely(rc->warning)) {
466                         if(unlikely(!expression_evaluate(rc->warning))) {
467                             // calculation failed
468
469                             debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
470
471                             if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) {
472                                 rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
473                                 error("Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
474                             }
475                         }
476                         else {
477                             if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))
478                                 rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
479
480                             debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source);
481
482                             warning_status = rrdcalc_value2status(rc->warning->result);
483                         }
484                     }
485
486                     if(likely(rc->critical)) {
487                         if(unlikely(!expression_evaluate(rc->critical))) {
488                             // calculation failed
489
490                             debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
491
492                             if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) {
493                                 rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
494                                 error("Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
495                             }
496                         }
497                         else {
498                             if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))
499                                 rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
500
501                             debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name , rc->critical->result, buffer_tostring(rc->critical->error_msg), rc->source);
502
503                             critical_status = rrdcalc_value2status(rc->critical->result);
504                         }
505                     }
506
507                     int status = RRDCALC_STATUS_UNDEFINED;
508
509                     switch(warning_status) {
510                         case RRDCALC_STATUS_CLEAR:
511                             status = RRDCALC_STATUS_CLEAR;
512                             break;
513
514                         case RRDCALC_STATUS_RAISED:
515                             status = RRDCALC_STATUS_WARNING;
516                             break;
517
518                         default:
519                             break;
520                     }
521
522                     switch(critical_status) {
523                         case RRDCALC_STATUS_CLEAR:
524                             if(status == RRDCALC_STATUS_UNDEFINED)
525                                 status = RRDCALC_STATUS_CLEAR;
526                             break;
527
528                         case RRDCALC_STATUS_RAISED:
529                             status = RRDCALC_STATUS_CRITICAL;
530                             break;
531
532                         default:
533                             break;
534                     }
535
536                     if(status != rc->status) {
537                         int delay = 0;
538
539                         if(now > rc->delay_up_to_timestamp) {
540                             rc->delay_up_current = rc->delay_up_duration;
541                             rc->delay_down_current = rc->delay_down_duration;
542                             rc->delay_last = 0;
543                             rc->delay_up_to_timestamp = 0;
544                         }
545                         else {
546                             rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier);
547                             if(rc->delay_up_current > rc->delay_max_duration)
548                                 rc->delay_up_current = rc->delay_max_duration;
549
550                             rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier);
551                             if(rc->delay_down_current > rc->delay_max_duration)
552                                 rc->delay_down_current = rc->delay_max_duration;
553                         }
554
555                         if(status > rc->status)
556                             delay = rc->delay_up_current;
557                         else
558                             delay = rc->delay_down_current;
559
560                         // COMMENTED: because we do need to send raising alarms
561                         // if(now + delay < rc->delay_up_to_timestamp)
562                         //    delay = (int)(rc->delay_up_to_timestamp - now);
563
564                         rc->delay_last = delay;
565                         rc->delay_up_to_timestamp = now + delay;
566                         health_alarm_log(
567                                 host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id
568                                 , rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change
569                                 , rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info
570                                 , rc->delay_last, (rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)
571                                                   ? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0
572                         );
573                         rc->last_status_change = now;
574                         rc->status = status;
575                     }
576
577                     rc->last_updated = now;
578                     rc->next_update = now + rc->update_every;
579
580                     if(next_run > rc->next_update)
581                         next_run = rc->next_update;
582                 }
583
584                 rrdhost_unlock(host);
585             }
586
587             if(unlikely(netdata_exit))
588                 break;
589
590             // execute notifications
591             // and cleanup
592             health_alarm_log_process(host);
593
594             if(unlikely(netdata_exit))
595                 break;
596
597         } /* host loop */
598
599         if(unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
600             error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
601
602         if(unlikely(netdata_exit))
603             break;
604
605         now = now_realtime_sec();
606         if(now < next_run) {
607             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
608             sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now));
609         }
610         else
611             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
612     }
613
614     buffer_free(wb);
615
616     info("HEALTH thread exiting");
617
618     static_thread->enabled = 0;
619     pthread_exit(NULL);
620     return NULL;
621 }