]> arthur.barton.de Git - netdata.git/blob - src/health.c
494904c097434dbd821b55641e4c797b75402222
[netdata.git] / src / health.c
1 #define NETDATA_HEALTH_INTERNALS
2 #include "common.h"
3
4 int default_health_enabled = 1;
5
6 // ----------------------------------------------------------------------------
7 // health initialization
8
9 inline char *health_config_dir(void) {
10     char buffer[FILENAME_MAX + 1];
11     snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_config_dir);
12     return config_get("health", "health configuration directory", buffer);
13 }
14
15 void health_init(void) {
16     debug(D_HEALTH, "Health configuration initializing");
17
18     if(!central_netdata_to_push_data) {
19         if(!(default_health_enabled = config_get_boolean("health", "enabled", 1))) {
20             debug(D_HEALTH, "Health is disabled.");
21             return;
22         }
23     }
24     else {
25         info("Health is disabled - setup alarms at the central netdata.");
26         config_set_boolean("health", "enabled", 0);
27         default_health_enabled = 0;
28     }
29 }
30
31 // ----------------------------------------------------------------------------
32 // re-load health configuration
33
34 void health_reload_host(RRDHOST *host) {
35     char *path = health_config_dir();
36
37     // free all running alarms
38     rrdhost_wrlock(host);
39
40     while(host->templates)
41         rrdcalctemplate_free(host, host->templates);
42
43     while(host->alarms)
44         rrdcalc_free(host, host->alarms);
45
46     rrdhost_unlock(host);
47
48     // invalidate all previous entries in the alarm log
49     ALARM_ENTRY *t;
50     for(t = host->health_log.alarms ; t ; t = t->next) {
51         if(t->new_status != RRDCALC_STATUS_REMOVED)
52             t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
53     }
54
55     rrdhost_rdlock(host);
56     // reset all thresholds to all charts
57     RRDSET *st;
58     rrdset_foreach_read(st, host) {
59         st->green = NAN;
60         st->red = NAN;
61     }
62     rrdhost_unlock(host);
63
64     // load the new alarms
65     rrdhost_wrlock(host);
66     health_readdir(host, path);
67
68     // link the loaded alarms to their charts
69     rrdset_foreach_write(st, host) {
70         rrdsetcalc_link_matching(st);
71         rrdcalctemplate_link_matching(st);
72     }
73
74     rrdhost_unlock(host);
75 }
76
77 void health_reload(void) {
78
79     rrd_rdlock();
80
81     RRDHOST *host;
82     rrdhost_foreach_read(host)
83         health_reload_host(host);
84
85     rrd_unlock();
86 }
87
88 // ----------------------------------------------------------------------------
89 // health main thread and friends
90
91 static inline int rrdcalc_value2status(calculated_number n) {
92     if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED;
93     if(n) return RRDCALC_STATUS_RAISED;
94     return RRDCALC_STATUS_CLEAR;
95 }
96
97 #define ALARM_EXEC_COMMAND_LENGTH 8192
98
99 static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
100     ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
101
102     if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
103         // do not send notifications for internal statuses
104         debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
105         goto done;
106     }
107
108     if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
109         // do not send notifications for disabled statuses
110         debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
111         // mark it as run, so that we will send the same alarm if it happens again
112         goto done;
113     }
114
115     // find the previous notification for the same alarm
116     // which we have run the exec script
117     // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set
118     if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
119         uint32_t id = ae->alarm_id;
120         ALARM_ENTRY *t;
121         for(t = ae->next; t ; t = t->next) {
122             if(t->alarm_id == id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
123                 break;
124         }
125
126         if(likely(t)) {
127             // we have executed this alarm notification in the past
128             if(t && t->new_status == ae->new_status) {
129                 // don't send the notification for the same status again
130                 debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name
131                       , rrdcalc_status2string(ae->new_status));
132                 goto done;
133             }
134         }
135         else {
136             // we have not executed this alarm notification in the past
137             // so, don't send CLEAR notifications
138             if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
139                 debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
140                       , ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
141                 goto done;
142             }
143         }
144     }
145
146     static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1];
147     pid_t command_pid;
148
149     const char *exec      = (ae->exec)      ? ae->exec      : host->health_default_exec;
150     const char *recipient = (ae->recipient) ? ae->recipient : host->health_default_recipient;
151
152     snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s' '%s' '%s'",
153               exec,
154               recipient,
155               host->hostname,
156               ae->unique_id,
157               ae->alarm_id,
158               ae->alarm_event_id,
159               (unsigned long)ae->when,
160               ae->name,
161               ae->chart?ae->chart:"NOCAHRT",
162               ae->family?ae->family:"NOFAMILY",
163               rrdcalc_status2string(ae->new_status),
164               rrdcalc_status2string(ae->old_status),
165               ae->new_value,
166               ae->old_value,
167               ae->source?ae->source:"UNKNOWN",
168               (uint32_t)ae->duration,
169               (uint32_t)ae->non_clear_duration,
170               ae->units?ae->units:"",
171               ae->info?ae->info:"",
172               ae->new_value_string,
173               ae->old_value_string
174     );
175
176     ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
177     ae->exec_run_timestamp = now_realtime_sec();
178
179     debug(D_HEALTH, "executing command '%s'", command_to_run);
180     FILE *fp = mypopen(command_to_run, &command_pid);
181     if(!fp) {
182         error("HEALTH: Cannot popen(\"%s\", \"r\").", command_to_run);
183         goto done;
184     }
185     debug(D_HEALTH, "HEALTH reading from command");
186     char *s = fgets(command_to_run, FILENAME_MAX, fp);
187     (void)s;
188     ae->exec_code = mypclose(fp, command_pid);
189     debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
190
191     if(ae->exec_code != 0)
192         ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
193
194 done:
195     health_alarm_log_save(host, ae);
196     return;
197 }
198
199 static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
200     debug(D_HEALTH, "Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
201          ae->chart?ae->chart:"NOCHART", ae->name,
202          ae->new_value,
203          rrdcalc_status2string(ae->old_status),
204          rrdcalc_status2string(ae->new_status)
205     );
206
207     health_alarm_execute(host, ae);
208 }
209
210 static inline void health_alarm_log_process(RRDHOST *host) {
211     static uint32_t stop_at_id = 0;
212     uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
213     time_t now = now_realtime_sec();
214
215     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
216
217     ALARM_ENTRY *ae;
218     for(ae = host->health_log.alarms; ae && ae->unique_id >= stop_at_id ; ae = ae->next) {
219         if(unlikely(
220             !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
221             !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
222             )) {
223
224             if(unlikely(ae->unique_id < first_waiting))
225                 first_waiting = ae->unique_id;
226
227             if(likely(now >= ae->delay_up_to_timestamp))
228                 health_process_notifications(host, ae);
229         }
230     }
231
232     // remember this for the next iteration
233     stop_at_id = first_waiting;
234
235     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
236
237     if(host->health_log.count <= host->health_log.max)
238         return;
239
240     // cleanup excess entries in the log
241     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
242
243     ALARM_ENTRY *last = NULL;
244     unsigned int count = host->health_log.max * 2 / 3;
245     for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
246
247     if(ae && last && last->next == ae)
248         last->next = NULL;
249     else
250         ae = NULL;
251
252     while(ae) {
253         debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
254
255         ALARM_ENTRY *t = ae->next;
256
257         health_alarm_log_free_one_nochecks_nounlink(ae);
258
259         ae = t;
260         host->health_log.count--;
261     }
262
263     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
264 }
265
266 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
267     if(unlikely(!rc->rrdset)) {
268         debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
269         return 0;
270     }
271
272     if(unlikely(rc->next_update > now)) {
273         if (unlikely(*next_run > rc->next_update)) {
274             // update the next_run time of the main loop
275             // to run this alarm precisely the time required
276             *next_run = rc->next_update;
277         }
278
279         debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
280         return 0;
281     }
282
283     if(unlikely(!rc->update_every)) {
284         debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
285         return 0;
286     }
287
288     if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
289         debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
290         return 0;
291     }
292
293     int update_every = rc->rrdset->update_every;
294     time_t first = rrdset_first_entry_t(rc->rrdset);
295     time_t last = rrdset_last_entry_t(rc->rrdset);
296
297     if(unlikely(now + update_every < first /* || now - update_every > last */)) {
298         debug(D_HEALTH
299               , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
300               , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first
301               , (unsigned long) last);
302         return 0;
303     }
304
305     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
306         time_t needed = now + rc->before + rc->after;
307
308         if(needed + update_every < first || needed - update_every > last) {
309             debug(D_HEALTH
310                   , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
311                   , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first
312                   , (unsigned long) last);
313             return 0;
314         }
315     }
316
317     return 1;
318 }
319
320 void *health_main(void *ptr) {
321     struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
322
323     info("HEALTH thread created with task id %d", gettid());
324
325     if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
326         error("Cannot set pthread cancel type to DEFERRED.");
327
328     if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
329         error("Cannot set pthread cancel state to ENABLE.");
330
331     int min_run_every = (int)config_get_number("health", "run at least every seconds", 10);
332     if(min_run_every < 1) min_run_every = 1;
333
334     BUFFER *wb = buffer_create(100);
335
336     unsigned int loop = 0;
337     while(!netdata_exit) {
338         loop++;
339         debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
340
341         int oldstate, runnable = 0;
342         time_t now = now_realtime_sec();
343         time_t next_run = now + min_run_every;
344         RRDCALC *rc;
345
346         if(unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
347             error("Cannot set pthread cancel state to DISABLE.");
348
349         rrd_rdlock();
350
351         RRDHOST *host;
352         rrdhost_foreach_read(host) {
353             if(unlikely(!host->health_enabled)) continue;
354
355             rrdhost_rdlock(host);
356
357             // the first loop is to lookup values from the db
358             for(rc = host->alarms; rc; rc = rc->next) {
359                 if(unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
360                     if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))
361                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE;
362                     continue;
363                 }
364
365                 runnable++;
366                 rc->old_value = rc->value;
367                 rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE;
368
369                 // 1. if there is database lookup, do it
370                 // 2. if there is calculation expression, run it
371
372                 if(unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
373                     /* time_t old_db_timestamp = rc->db_before; */
374                     int value_is_null = 0;
375
376                     int ret = rrd2value(rc->rrdset, wb, &rc->value, rc->dimensions, 1, rc->after, rc->before, rc->group, rc->options, &rc->db_after, &rc->db_before, &value_is_null);
377
378                     if(unlikely(ret != 200)) {
379                         // database lookup failed
380                         rc->value = NAN;
381
382                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret);
383
384                         if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) {
385                             rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
386                             error("Health on host '%s', alarm '%s.%s': database lookup returned error %d", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret);
387                         }
388                     }
389                     else if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))
390                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
391
392                     /* - RRDCALC_FLAG_DB_STALE not currently used
393                     if (unlikely(old_db_timestamp == rc->db_before)) {
394                         // database is stale
395
396                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
397
398                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
399                             rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
400                             error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
401                         }
402                     }
403                     else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
404                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
405                     */
406
407                     if(unlikely(value_is_null)) {
408                         // collected value is null
409
410                         rc->value = NAN;
411
412                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name);
413
414                         if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) {
415                             rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
416                             error("Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", host->hostname,  rc->chart ? rc->chart : "NOCHART", rc->name);
417                         }
418                     }
419                     else if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))
420                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
421
422                     debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " CALCULATED_NUMBER_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->value);
423                 }
424
425                 if(unlikely(rc->calculation)) {
426                     if(unlikely(!expression_evaluate(rc->calculation))) {
427                         // calculation failed
428
429                         rc->value = NAN;
430
431                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
432
433                         if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) {
434                             rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
435                             error("Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", rc->chart ? rc->chart : "NOCHART", host->hostname,  rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
436                         }
437                     }
438                     else {
439                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))
440                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
441
442                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
443                                 CALCULATED_NUMBER_FORMAT
444                                 ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name
445                               , rc->calculation->parsed_as, rc->calculation->result,
446                                 buffer_tostring(rc->calculation->error_msg), rc->source
447                         );
448
449                         rc->value = rc->calculation->result;
450                     }
451                 }
452             }
453             rrdhost_unlock(host);
454
455             if(unlikely(runnable && !netdata_exit)) {
456                 rrdhost_rdlock(host);
457
458                 for(rc = host->alarms; rc; rc = rc->next) {
459                     if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
460                         continue;
461
462                     int warning_status = RRDCALC_STATUS_UNDEFINED;
463                     int critical_status = RRDCALC_STATUS_UNDEFINED;
464
465                     if(likely(rc->warning)) {
466                         if(unlikely(!expression_evaluate(rc->warning))) {
467                             // calculation failed
468
469                             debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
470
471                             if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) {
472                                 rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
473                                 error("Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
474                             }
475                         }
476                         else {
477                             if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))
478                                 rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
479
480                             debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source);
481
482                             warning_status = rrdcalc_value2status(rc->warning->result);
483                         }
484                     }
485
486                     if(likely(rc->critical)) {
487                         if(unlikely(!expression_evaluate(rc->critical))) {
488                             // calculation failed
489
490                             debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
491
492                             if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) {
493                                 rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
494                                 error("Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
495                             }
496                         }
497                         else {
498                             if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))
499                                 rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
500
501                             debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name , rc->critical->result, buffer_tostring(rc->critical->error_msg), rc->source);
502
503                             critical_status = rrdcalc_value2status(rc->critical->result);
504                         }
505                     }
506
507                     int status = RRDCALC_STATUS_UNDEFINED;
508
509                     switch(warning_status) {
510                         case RRDCALC_STATUS_CLEAR:
511                             status = RRDCALC_STATUS_CLEAR;
512                             break;
513
514                         case RRDCALC_STATUS_RAISED:
515                             status = RRDCALC_STATUS_WARNING;
516                             break;
517
518                         default:
519                             break;
520                     }
521
522                     switch(critical_status) {
523                         case RRDCALC_STATUS_CLEAR:
524                             if(status == RRDCALC_STATUS_UNDEFINED)
525                                 status = RRDCALC_STATUS_CLEAR;
526                             break;
527
528                         case RRDCALC_STATUS_RAISED:
529                             status = RRDCALC_STATUS_CRITICAL;
530                             break;
531
532                         default:
533                             break;
534                     }
535
536                     if(status != rc->status) {
537                         int delay = 0;
538
539                         if(now > rc->delay_up_to_timestamp) {
540                             rc->delay_up_current = rc->delay_up_duration;
541                             rc->delay_down_current = rc->delay_down_duration;
542                             rc->delay_last = 0;
543                             rc->delay_up_to_timestamp = 0;
544                         }
545                         else {
546                             rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier);
547                             if(rc->delay_up_current > rc->delay_max_duration)
548                                 rc->delay_up_current = rc->delay_max_duration;
549
550                             rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier);
551                             if(rc->delay_down_current > rc->delay_max_duration)
552                                 rc->delay_down_current = rc->delay_max_duration;
553                         }
554
555                         if(status > rc->status)
556                             delay = rc->delay_up_current;
557                         else
558                             delay = rc->delay_down_current;
559
560                         // COMMENTED: because we do need to send raising alarms
561                         // if(now + delay < rc->delay_up_to_timestamp)
562                         //    delay = (int)(rc->delay_up_to_timestamp - now);
563
564                         rc->delay_last = delay;
565                         rc->delay_up_to_timestamp = now + delay;
566                         health_alarm_log(
567                                 host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id
568                                 , rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change
569                                 , rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info
570                                 , rc->delay_last, (rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)
571                                                   ? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0
572                         );
573                         rc->last_status_change = now;
574                         rc->status = status;
575                     }
576
577                     rc->last_updated = now;
578                     rc->next_update = now + rc->update_every;
579
580                     if(next_run > rc->next_update)
581                         next_run = rc->next_update;
582                 }
583
584                 rrdhost_unlock(host);
585             }
586
587             if(unlikely(netdata_exit))
588                 break;
589
590             // execute notifications
591             // and cleanup
592             health_alarm_log_process(host);
593
594             if(unlikely(netdata_exit))
595                 break;
596
597         } /* host loop */
598
599         rrd_unlock();
600
601         if(unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
602             error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
603
604         if(unlikely(netdata_exit))
605             break;
606
607         now = now_realtime_sec();
608         if(now < next_run) {
609             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
610             sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now));
611         }
612         else
613             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
614     }
615
616     buffer_free(wb);
617
618     info("HEALTH thread exiting");
619
620     static_thread->enabled = 0;
621     pthread_exit(NULL);
622     return NULL;
623 }