]> arthur.barton.de Git - netdata.git/blob - src/health.c
RRDHOSTs free all their memory (all substructures) when they are deallocated
[netdata.git] / src / health.c
1 #define NETDATA_HEALTH_INTERNALS
2 #include "common.h"
3
// Health monitoring on/off flag. Starts as 1 (enabled); health_init()
// overwrites it with the boolean config option "enabled" of section
// [health] (whose default is also 1).
4 int default_localhost_health_enabled = 1;
5
6 // ----------------------------------------------------------------------------
7 // health initialization
8
9 inline char *health_config_dir(void) {
10     char buffer[FILENAME_MAX + 1];
11     snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_config_dir);
12     return config_get("health", "health configuration directory", buffer);
13 }
14
15 void health_init(void) {
16     debug(D_HEALTH, "Health configuration initializing");
17
18     if(!(default_localhost_health_enabled = config_get_boolean("health", "enabled", 1))) {
19         debug(D_HEALTH, "Health is disabled.");
20         return;
21     }
22
23     char pathname[FILENAME_MAX + 1];
24     snprintfz(pathname, FILENAME_MAX, "%s/health", netdata_configured_varlib_dir);
25     if(mkdir(pathname, 0770) == -1 && errno != EEXIST)
26         fatal("Cannot create directory '%s'.", pathname);
27 }
28
29 // ----------------------------------------------------------------------------
30 // re-load health configuration
31
32 void health_reload_host(RRDHOST *host) {
33     char *path = health_config_dir();
34
35     // free all running alarms
36     rrdhost_wrlock(host);
37
38     while(host->templates)
39         rrdcalctemplate_free(host, host->templates);
40
41     while(host->alarms)
42         rrdcalc_free(host, host->alarms);
43
44     rrdhost_unlock(host);
45
46     // invalidate all previous entries in the alarm log
47     ALARM_ENTRY *t;
48     for(t = host->health_log.alarms ; t ; t = t->next) {
49         if(t->new_status != RRDCALC_STATUS_REMOVED)
50             t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
51     }
52
53     // reset all thresholds to all charts
54     RRDSET *st;
55     for(st = host->rrdset_root; st ; st = st->next) {
56         st->green = NAN;
57         st->red = NAN;
58     }
59
60     // load the new alarms
61     rrdhost_wrlock(host);
62     health_readdir(host, path);
63     rrdhost_unlock(host);
64
65     // link the loaded alarms to their charts
66     for(st = host->rrdset_root; st ; st = st->next) {
67         rrdhost_wrlock(host);
68
69         rrdsetcalc_link_matching(st);
70         rrdcalctemplate_link_matching(st);
71
72         rrdhost_unlock(host);
73     }
74 }
75
76 void health_reload(void) {
77     RRDHOST *host;
78
79     for(host = localhost; host ; host = host->next)
80         health_reload_host(host);
81 }
82
83 // ----------------------------------------------------------------------------
84 // health main thread and friends
85
86 static inline int rrdcalc_value2status(calculated_number n) {
87     if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED;
88     if(n) return RRDCALC_STATUS_RAISED;
89     return RRDCALC_STATUS_CLEAR;
90 }
91
92 #define ALARM_EXEC_COMMAND_LENGTH 8192
93
94 static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
95     ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
96
97     if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
98         // do not send notifications for internal statuses
99         debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
100         goto done;
101     }
102
103     if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
104         // do not send notifications for disabled statuses
105         debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
106         // mark it as run, so that we will send the same alarm if it happens again
107         goto done;
108     }
109
110     // find the previous notification for the same alarm
111     // which we have run the exec script
112     // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set
113     if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
114         uint32_t id = ae->alarm_id;
115         ALARM_ENTRY *t;
116         for(t = ae->next; t ; t = t->next) {
117             if(t->alarm_id == id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
118                 break;
119         }
120
121         if(likely(t)) {
122             // we have executed this alarm notification in the past
123             if(t && t->new_status == ae->new_status) {
124                 // don't send the notification for the same status again
125                 debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name
126                       , rrdcalc_status2string(ae->new_status));
127                 goto done;
128             }
129         }
130         else {
131             // we have not executed this alarm notification in the past
132             // so, don't send CLEAR notifications
133             if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
134                 debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
135                       , ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
136                 goto done;
137             }
138         }
139     }
140
141     static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1];
142     pid_t command_pid;
143
144     const char *exec      = (ae->exec)      ? ae->exec      : host->health_default_exec;
145     const char *recipient = (ae->recipient) ? ae->recipient : host->health_default_recipient;
146
147     snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s' '%s' '%s'",
148               exec,
149               recipient,
150               host->hostname,
151               ae->unique_id,
152               ae->alarm_id,
153               ae->alarm_event_id,
154               (unsigned long)ae->when,
155               ae->name,
156               ae->chart?ae->chart:"NOCAHRT",
157               ae->family?ae->family:"NOFAMILY",
158               rrdcalc_status2string(ae->new_status),
159               rrdcalc_status2string(ae->old_status),
160               ae->new_value,
161               ae->old_value,
162               ae->source?ae->source:"UNKNOWN",
163               (uint32_t)ae->duration,
164               (uint32_t)ae->non_clear_duration,
165               ae->units?ae->units:"",
166               ae->info?ae->info:"",
167               ae->new_value_string,
168               ae->old_value_string
169     );
170
171     ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
172     ae->exec_run_timestamp = now_realtime_sec();
173
174     debug(D_HEALTH, "executing command '%s'", command_to_run);
175     FILE *fp = mypopen(command_to_run, &command_pid);
176     if(!fp) {
177         error("HEALTH: Cannot popen(\"%s\", \"r\").", command_to_run);
178         goto done;
179     }
180     debug(D_HEALTH, "HEALTH reading from command");
181     char *s = fgets(command_to_run, FILENAME_MAX, fp);
182     (void)s;
183     ae->exec_code = mypclose(fp, command_pid);
184     debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
185
186     if(ae->exec_code != 0)
187         ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
188
189 done:
190     health_alarm_log_save(host, ae);
191     return;
192 }
193
194 static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
195     debug(D_HEALTH, "Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
196          ae->chart?ae->chart:"NOCHART", ae->name,
197          ae->new_value,
198          rrdcalc_status2string(ae->old_status),
199          rrdcalc_status2string(ae->new_status)
200     );
201
202     health_alarm_execute(host, ae);
203 }
204
205 static inline void health_alarm_log_process(RRDHOST *host) {
206     static uint32_t stop_at_id = 0;
207     uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
208     time_t now = now_realtime_sec();
209
210     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
211
212     ALARM_ENTRY *ae;
213     for(ae = host->health_log.alarms; ae && ae->unique_id >= stop_at_id ; ae = ae->next) {
214         if(unlikely(
215             !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
216             !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
217             )) {
218
219             if(unlikely(ae->unique_id < first_waiting))
220                 first_waiting = ae->unique_id;
221
222             if(likely(now >= ae->delay_up_to_timestamp))
223                 health_process_notifications(host, ae);
224         }
225     }
226
227     // remember this for the next iteration
228     stop_at_id = first_waiting;
229
230     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
231
232     if(host->health_log.count <= host->health_log.max)
233         return;
234
235     // cleanup excess entries in the log
236     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
237
238     ALARM_ENTRY *last = NULL;
239     unsigned int count = host->health_log.max * 2 / 3;
240     for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
241
242     if(ae && last && last->next == ae)
243         last->next = NULL;
244     else
245         ae = NULL;
246
247     while(ae) {
248         debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
249
250         ALARM_ENTRY *t = ae->next;
251
252         health_alarm_log_free_one_nochecks_nounlink(ae);
253
254         ae = t;
255         host->health_log.count--;
256     }
257
258     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
259 }
260
261 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
262     if(unlikely(!rc->rrdset)) {
263         debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
264         return 0;
265     }
266
267     if(unlikely(rc->next_update > now)) {
268         if (unlikely(*next_run > rc->next_update)) {
269             // update the next_run time of the main loop
270             // to run this alarm precisely the time required
271             *next_run = rc->next_update;
272         }
273
274         debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
275         return 0;
276     }
277
278     if(unlikely(!rc->update_every)) {
279         debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
280         return 0;
281     }
282
283     if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
284         debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
285         return 0;
286     }
287
288     int update_every = rc->rrdset->update_every;
289     time_t first = rrdset_first_entry_t(rc->rrdset);
290     time_t last = rrdset_last_entry_t(rc->rrdset);
291
292     if(unlikely(now + update_every < first /* || now - update_every > last */)) {
293         debug(D_HEALTH
294               , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
295               , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first
296               , (unsigned long) last);
297         return 0;
298     }
299
300     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
301         time_t needed = now + rc->before + rc->after;
302
303         if(needed + update_every < first || needed - update_every > last) {
304             debug(D_HEALTH
305                   , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
306                   , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first
307                   , (unsigned long) last);
308             return 0;
309         }
310     }
311
312     return 1;
313 }
314
315 void *health_main(void *ptr) {
316     struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
317
318     info("HEALTH thread created with task id %d", gettid());
319
320     if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
321         error("Cannot set pthread cancel type to DEFERRED.");
322
323     if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
324         error("Cannot set pthread cancel state to ENABLE.");
325
326     int min_run_every = (int)config_get_number("health", "run at least every seconds", 10);
327     if(min_run_every < 1) min_run_every = 1;
328
329     BUFFER *wb = buffer_create(100);
330
331     unsigned int loop = 0;
332     while(!netdata_exit) {
333         loop++;
334         debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
335
336         int oldstate, runnable = 0;
337         time_t now = now_realtime_sec();
338         time_t next_run = now + min_run_every;
339         RRDCALC *rc;
340
341         if(unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
342             error("Cannot set pthread cancel state to DISABLE.");
343
344         RRDHOST *host;
345         for(host = localhost; host ; host = host->next) {
346             if(unlikely(!host->health_enabled)) continue;
347
348             rrdhost_rdlock(host);
349
350             // the first loop is to lookup values from the db
351             for(rc = host->alarms; rc; rc = rc->next) {
352                 if(unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
353                     if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))
354                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE;
355                     continue;
356                 }
357
358                 runnable++;
359                 rc->old_value = rc->value;
360                 rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE;
361
362                 // 1. if there is database lookup, do it
363                 // 2. if there is calculation expression, run it
364
365                 if(unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
366                     /* time_t old_db_timestamp = rc->db_before; */
367                     int value_is_null = 0;
368
369                     int ret = rrd2value(rc->rrdset, wb, &rc->value, rc->dimensions, 1, rc->after, rc->before, rc->group, rc->options, &rc->db_after, &rc->db_before, &value_is_null);
370
371                     if(unlikely(ret != 200)) {
372                         // database lookup failed
373                         rc->value = NAN;
374
375                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret);
376
377                         if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) {
378                             rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
379                             error("Health on host '%s', alarm '%s.%s': database lookup returned error %d", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret);
380                         }
381                     }
382                     else if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))
383                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
384
385                     /* - RRDCALC_FLAG_DB_STALE not currently used
386                     if (unlikely(old_db_timestamp == rc->db_before)) {
387                         // database is stale
388
389                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
390
391                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
392                             rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
393                             error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
394                         }
395                     }
396                     else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
397                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
398                     */
399
400                     if(unlikely(value_is_null)) {
401                         // collected value is null
402
403                         rc->value = NAN;
404
405                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name);
406
407                         if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) {
408                             rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
409                             error("Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", host->hostname,  rc->chart ? rc->chart : "NOCHART", rc->name);
410                         }
411                     }
412                     else if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))
413                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
414
415                     debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " CALCULATED_NUMBER_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->value);
416                 }
417
418                 if(unlikely(rc->calculation)) {
419                     if(unlikely(!expression_evaluate(rc->calculation))) {
420                         // calculation failed
421
422                         rc->value = NAN;
423
424                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
425
426                         if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) {
427                             rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
428                             error("Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", rc->chart ? rc->chart : "NOCHART", host->hostname,  rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
429                         }
430                     }
431                     else {
432                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))
433                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
434
435                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
436                                 CALCULATED_NUMBER_FORMAT
437                                 ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name
438                               , rc->calculation->parsed_as, rc->calculation->result,
439                                 buffer_tostring(rc->calculation->error_msg), rc->source
440                         );
441
442                         rc->value = rc->calculation->result;
443                     }
444                 }
445             }
446             rrdhost_unlock(host);
447
448             if(unlikely(runnable && !netdata_exit)) {
449                 rrdhost_rdlock(host);
450
451                 for(rc = host->alarms; rc; rc = rc->next) {
452                     if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
453                         continue;
454
455                     int warning_status = RRDCALC_STATUS_UNDEFINED;
456                     int critical_status = RRDCALC_STATUS_UNDEFINED;
457
458                     if(likely(rc->warning)) {
459                         if(unlikely(!expression_evaluate(rc->warning))) {
460                             // calculation failed
461
462                             debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
463
464                             if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) {
465                                 rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
466                                 error("Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
467                             }
468                         }
469                         else {
470                             if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))
471                                 rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
472
473                             debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source);
474
475                             warning_status = rrdcalc_value2status(rc->warning->result);
476                         }
477                     }
478
479                     if(likely(rc->critical)) {
480                         if(unlikely(!expression_evaluate(rc->critical))) {
481                             // calculation failed
482
483                             debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
484
485                             if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) {
486                                 rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
487                                 error("Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
488                             }
489                         }
490                         else {
491                             if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))
492                                 rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
493
494                             debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name , rc->critical->result, buffer_tostring(rc->critical->error_msg), rc->source);
495
496                             critical_status = rrdcalc_value2status(rc->critical->result);
497                         }
498                     }
499
500                     int status = RRDCALC_STATUS_UNDEFINED;
501
502                     switch(warning_status) {
503                         case RRDCALC_STATUS_CLEAR:
504                             status = RRDCALC_STATUS_CLEAR;
505                             break;
506
507                         case RRDCALC_STATUS_RAISED:
508                             status = RRDCALC_STATUS_WARNING;
509                             break;
510
511                         default:
512                             break;
513                     }
514
515                     switch(critical_status) {
516                         case RRDCALC_STATUS_CLEAR:
517                             if(status == RRDCALC_STATUS_UNDEFINED)
518                                 status = RRDCALC_STATUS_CLEAR;
519                             break;
520
521                         case RRDCALC_STATUS_RAISED:
522                             status = RRDCALC_STATUS_CRITICAL;
523                             break;
524
525                         default:
526                             break;
527                     }
528
529                     if(status != rc->status) {
530                         int delay = 0;
531
532                         if(now > rc->delay_up_to_timestamp) {
533                             rc->delay_up_current = rc->delay_up_duration;
534                             rc->delay_down_current = rc->delay_down_duration;
535                             rc->delay_last = 0;
536                             rc->delay_up_to_timestamp = 0;
537                         }
538                         else {
539                             rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier);
540                             if(rc->delay_up_current > rc->delay_max_duration)
541                                 rc->delay_up_current = rc->delay_max_duration;
542
543                             rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier);
544                             if(rc->delay_down_current > rc->delay_max_duration)
545                                 rc->delay_down_current = rc->delay_max_duration;
546                         }
547
548                         if(status > rc->status)
549                             delay = rc->delay_up_current;
550                         else
551                             delay = rc->delay_down_current;
552
553                         // COMMENTED: because we do need to send raising alarms
554                         // if(now + delay < rc->delay_up_to_timestamp)
555                         //    delay = (int)(rc->delay_up_to_timestamp - now);
556
557                         rc->delay_last = delay;
558                         rc->delay_up_to_timestamp = now + delay;
559                         health_alarm_log(
560                                 host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id
561                                 , rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change
562                                 , rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info
563                                 , rc->delay_last, (rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)
564                                                   ? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0
565                         );
566                         rc->last_status_change = now;
567                         rc->status = status;
568                     }
569
570                     rc->last_updated = now;
571                     rc->next_update = now + rc->update_every;
572
573                     if(next_run > rc->next_update)
574                         next_run = rc->next_update;
575                 }
576
577                 rrdhost_unlock(host);
578             }
579
580             if(unlikely(netdata_exit))
581                 break;
582
583             // execute notifications
584             // and cleanup
585             health_alarm_log_process(host);
586
587             if(unlikely(netdata_exit))
588                 break;
589
590         } /* host loop */
591
592         if(unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
593             error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
594
595         if(unlikely(netdata_exit))
596             break;
597
598         now = now_realtime_sec();
599         if(now < next_run) {
600             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
601             sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now));
602         }
603         else
604             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
605     }
606
607     buffer_free(wb);
608
609     info("HEALTH thread exiting");
610
611     static_thread->enabled = 0;
612     pthread_exit(NULL);
613     return NULL;
614 }