]> arthur.barton.de Git - netdata.git/blob - src/health.c
Merge branch 'master' into ab-debian
[netdata.git] / src / health.c
1 #define NETDATA_HEALTH_INTERNALS
2 #include "common.h"
3
4 int default_health_enabled = 1;
5
6 // ----------------------------------------------------------------------------
7 // health initialization
8
9 inline char *health_config_dir(void) {
10     char buffer[FILENAME_MAX + 1];
11     snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_config_dir);
12     return config_get(CONFIG_SECTION_HEALTH, "health configuration directory", buffer);
13 }
14
15 void health_init(void) {
16     debug(D_HEALTH, "Health configuration initializing");
17
18     if(!(default_health_enabled = config_get_boolean(CONFIG_SECTION_HEALTH, "enabled", 1))) {
19         debug(D_HEALTH, "Health is disabled.");
20         return;
21     }
22 }
23
24 // ----------------------------------------------------------------------------
25 // re-load health configuration
26
27 void health_reload_host(RRDHOST *host) {
28     if(unlikely(!host->health_enabled))
29         return;
30
31     char *path = health_config_dir();
32
33     // free all running alarms
34     rrdhost_wrlock(host);
35
36     while(host->templates)
37         rrdcalctemplate_free(host, host->templates);
38
39     while(host->alarms)
40         rrdcalc_free(host, host->alarms);
41
42     rrdhost_unlock(host);
43
44     // invalidate all previous entries in the alarm log
45     ALARM_ENTRY *t;
46     for(t = host->health_log.alarms ; t ; t = t->next) {
47         if(t->new_status != RRDCALC_STATUS_REMOVED)
48             t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
49     }
50
51     rrdhost_rdlock(host);
52     // reset all thresholds to all charts
53     RRDSET *st;
54     rrdset_foreach_read(st, host) {
55         st->green = NAN;
56         st->red = NAN;
57     }
58     rrdhost_unlock(host);
59
60     // load the new alarms
61     rrdhost_wrlock(host);
62     health_readdir(host, path);
63
64     // link the loaded alarms to their charts
65     rrdset_foreach_write(st, host) {
66         rrdsetcalc_link_matching(st);
67         rrdcalctemplate_link_matching(st);
68     }
69
70     rrdhost_unlock(host);
71 }
72
73 void health_reload(void) {
74
75     rrd_rdlock();
76
77     RRDHOST *host;
78     rrdhost_foreach_read(host)
79         health_reload_host(host);
80
81     rrd_unlock();
82 }
83
84 // ----------------------------------------------------------------------------
85 // health main thread and friends
86
87 static inline int rrdcalc_value2status(calculated_number n) {
88     if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED;
89     if(n) return RRDCALC_STATUS_RAISED;
90     return RRDCALC_STATUS_CLEAR;
91 }
92
93 #define ALARM_EXEC_COMMAND_LENGTH 8192
94
95 static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
96     ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
97
98     if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
99         // do not send notifications for internal statuses
100         debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
101         goto done;
102     }
103
104     if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
105         // do not send notifications for disabled statuses
106         debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
107         // mark it as run, so that we will send the same alarm if it happens again
108         goto done;
109     }
110
111     // find the previous notification for the same alarm
112     // which we have run the exec script
113     // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set
114     if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
115         uint32_t id = ae->alarm_id;
116         ALARM_ENTRY *t;
117         for(t = ae->next; t ; t = t->next) {
118             if(t->alarm_id == id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
119                 break;
120         }
121
122         if(likely(t)) {
123             // we have executed this alarm notification in the past
124             if(t && t->new_status == ae->new_status) {
125                 // don't send the notification for the same status again
126                 debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name
127                       , rrdcalc_status2string(ae->new_status));
128                 goto done;
129             }
130         }
131         else {
132             // we have not executed this alarm notification in the past
133             // so, don't send CLEAR notifications
134             if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
135                 debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
136                       , ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
137                 goto done;
138             }
139         }
140     }
141
142     static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1];
143     pid_t command_pid;
144
145     const char *exec      = (ae->exec)      ? ae->exec      : host->health_default_exec;
146     const char *recipient = (ae->recipient) ? ae->recipient : host->health_default_recipient;
147
148     snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s' '%s' '%s'",
149               exec,
150               recipient,
151               host->hostname,
152               ae->unique_id,
153               ae->alarm_id,
154               ae->alarm_event_id,
155               (unsigned long)ae->when,
156               ae->name,
157               ae->chart?ae->chart:"NOCAHRT",
158               ae->family?ae->family:"NOFAMILY",
159               rrdcalc_status2string(ae->new_status),
160               rrdcalc_status2string(ae->old_status),
161               ae->new_value,
162               ae->old_value,
163               ae->source?ae->source:"UNKNOWN",
164               (uint32_t)ae->duration,
165               (uint32_t)ae->non_clear_duration,
166               ae->units?ae->units:"",
167               ae->info?ae->info:"",
168               ae->new_value_string,
169               ae->old_value_string
170     );
171
172     ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
173     ae->exec_run_timestamp = now_realtime_sec();
174
175     debug(D_HEALTH, "executing command '%s'", command_to_run);
176     FILE *fp = mypopen(command_to_run, &command_pid);
177     if(!fp) {
178         error("HEALTH: Cannot popen(\"%s\", \"r\").", command_to_run);
179         goto done;
180     }
181     debug(D_HEALTH, "HEALTH reading from command");
182     char *s = fgets(command_to_run, FILENAME_MAX, fp);
183     (void)s;
184     ae->exec_code = mypclose(fp, command_pid);
185     debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
186
187     if(ae->exec_code != 0)
188         ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
189
190 done:
191     health_alarm_log_save(host, ae);
192     return;
193 }
194
195 static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
196     debug(D_HEALTH, "Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
197          ae->chart?ae->chart:"NOCHART", ae->name,
198          ae->new_value,
199          rrdcalc_status2string(ae->old_status),
200          rrdcalc_status2string(ae->new_status)
201     );
202
203     health_alarm_execute(host, ae);
204 }
205
206 static inline void health_alarm_log_process(RRDHOST *host) {
207     static uint32_t stop_at_id = 0;
208     uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
209     time_t now = now_realtime_sec();
210
211     netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
212
213     ALARM_ENTRY *ae;
214     for(ae = host->health_log.alarms; ae && ae->unique_id >= stop_at_id ; ae = ae->next) {
215         if(unlikely(
216             !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
217             !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
218             )) {
219
220             if(unlikely(ae->unique_id < first_waiting))
221                 first_waiting = ae->unique_id;
222
223             if(likely(now >= ae->delay_up_to_timestamp))
224                 health_process_notifications(host, ae);
225         }
226     }
227
228     // remember this for the next iteration
229     stop_at_id = first_waiting;
230
231     netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
232
233     if(host->health_log.count <= host->health_log.max)
234         return;
235
236     // cleanup excess entries in the log
237     netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
238
239     ALARM_ENTRY *last = NULL;
240     unsigned int count = host->health_log.max * 2 / 3;
241     for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
242
243     if(ae && last && last->next == ae)
244         last->next = NULL;
245     else
246         ae = NULL;
247
248     while(ae) {
249         debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
250
251         ALARM_ENTRY *t = ae->next;
252
253         health_alarm_log_free_one_nochecks_nounlink(ae);
254
255         ae = t;
256         host->health_log.count--;
257     }
258
259     netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
260 }
261
262 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
263     if(unlikely(!rc->rrdset)) {
264         debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
265         return 0;
266     }
267
268     if(unlikely(rc->next_update > now)) {
269         if (unlikely(*next_run > rc->next_update)) {
270             // update the next_run time of the main loop
271             // to run this alarm precisely the time required
272             *next_run = rc->next_update;
273         }
274
275         debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
276         return 0;
277     }
278
279     if(unlikely(!rc->update_every)) {
280         debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
281         return 0;
282     }
283
284     if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE))) {
285         debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rc->chart?rc->chart:"NOCHART", rc->name);
286         return 0;
287     }
288
289     if(unlikely(!rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ENABLED))) {
290         debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart is not enabled", rc->chart?rc->chart:"NOCHART", rc->name);
291         return 0;
292     }
293
294     if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
295         debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
296         return 0;
297     }
298
299     int update_every = rc->rrdset->update_every;
300     time_t first = rrdset_first_entry_t(rc->rrdset);
301     time_t last = rrdset_last_entry_t(rc->rrdset);
302
303     if(unlikely(now + update_every < first /* || now - update_every > last */)) {
304         debug(D_HEALTH
305               , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
306               , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first
307               , (unsigned long) last);
308         return 0;
309     }
310
311     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
312         time_t needed = now + rc->before + rc->after;
313
314         if(needed + update_every < first || needed - update_every > last) {
315             debug(D_HEALTH
316                   , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
317                   , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first
318                   , (unsigned long) last);
319             return 0;
320         }
321     }
322
323     return 1;
324 }
325
326 void *health_main(void *ptr) {
327     struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
328
329     info("HEALTH thread created with task id %d", gettid());
330
331     if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
332         error("Cannot set pthread cancel type to DEFERRED.");
333
334     if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
335         error("Cannot set pthread cancel state to ENABLE.");
336
337     int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10);
338     if(min_run_every < 1) min_run_every = 1;
339
340     BUFFER *wb = buffer_create(100);
341
342     time_t now               = now_realtime_sec();
343     time_t now_boottime      = now_boottime_sec();
344     time_t last_now          = now;
345     time_t last_now_boottime = now_boottime;
346     time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60);
347
348     unsigned int loop = 0;
349     while(!netdata_exit) {
350         loop++;
351         debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
352
353         int oldstate, runnable = 0, apply_hibernation_delay = 0;
354         time_t next_run = now + min_run_every;
355         RRDCALC *rc;
356
357         // detect if boottime and realtime have twice the difference
358         // in which case we assume the system was just waken from hibernation
359         if(unlikely(now - last_now > 2 * (now_boottime - last_now_boottime)))
360             apply_hibernation_delay = 1;
361
362         last_now = now;
363         last_now_boottime = now_boottime;
364
365         if(unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
366             error("Cannot set pthread cancel state to DISABLE.");
367
368         rrd_rdlock();
369
370         RRDHOST *host;
371         rrdhost_foreach_read(host) {
372             if(unlikely(!host->health_enabled))
373                 continue;
374
375             if(unlikely(apply_hibernation_delay)) {
376
377                 info("Postponing alarm checks for %ld seconds, on host '%s', due to boottime discrepancy (realtime dt: %ld, boottime dt: %ld)."
378                      , hibernation_delay
379                      , host->hostname
380                      , (long)(now - last_now)
381                      , (long)(now_boottime - last_now_boottime)
382                 );
383
384                 host->health_delay_up_to = now + hibernation_delay;
385             }
386
387             if(unlikely(!host->health_enabled || now < host->health_delay_up_to))
388                 continue;
389
390             rrdhost_rdlock(host);
391
392             // the first loop is to lookup values from the db
393             for(rc = host->alarms; rc; rc = rc->next) {
394                 if(unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
395                     if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))
396                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE;
397                     continue;
398                 }
399
400                 runnable++;
401                 rc->old_value = rc->value;
402                 rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE;
403
404                 // ------------------------------------------------------------
405                 // if there is database lookup, do it
406
407                 if(unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
408                     /* time_t old_db_timestamp = rc->db_before; */
409                     int value_is_null = 0;
410
411                     int ret = rrdset2value_api_v1(rc->rrdset
412                                                   , wb
413                                                   , &rc->value
414                                                   , rc->dimensions
415                                                   , 1
416                                                   , rc->after
417                                                   , rc->before
418                                                   , rc->group
419                                                   , rc->options
420                                                   , &rc->db_after
421                                                   , &rc->db_before
422                                                   , &value_is_null
423                     );
424
425                     if(unlikely(ret != 200)) {
426                         // database lookup failed
427                         rc->value = NAN;
428                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
429
430                         debug(D_HEALTH
431                               , "Health on host '%s', alarm '%s.%s': database lookup returned error %d"
432                               , host->hostname
433                               , rc->chart ? rc->chart : "NOCHART"
434                               , rc->name
435                               , ret
436                         );
437                     }
438                     else
439                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
440
441                     /* - RRDCALC_FLAG_DB_STALE not currently used
442                     if (unlikely(old_db_timestamp == rc->db_before)) {
443                         // database is stale
444
445                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
446
447                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
448                             rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
449                             error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
450                         }
451                     }
452                     else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
453                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
454                     */
455
456                     if(unlikely(value_is_null)) {
457                         // collected value is null
458                         rc->value = NAN;
459                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
460
461                         debug(D_HEALTH
462                               , "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)"
463                               , host->hostname
464                               , rc->chart ? rc->chart : "NOCHART"
465                               , rc->name
466                         );
467                     }
468                     else
469                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
470
471                     debug(D_HEALTH
472                           , "Health on host '%s', alarm '%s.%s': database lookup gave value " CALCULATED_NUMBER_FORMAT
473                           , host->hostname
474                           , rc->chart ? rc->chart : "NOCHART"
475                           , rc->name
476                           , rc->value
477                     );
478                 }
479
480                 // ------------------------------------------------------------
481                 // if there is calculation expression, run it
482
483                 if(unlikely(rc->calculation)) {
484                     if(unlikely(!expression_evaluate(rc->calculation))) {
485                         // calculation failed
486                         rc->value = NAN;
487                         rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
488
489                         debug(D_HEALTH
490                               , "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s"
491                               , host->hostname
492                               , rc->chart ? rc->chart : "NOCHART"
493                               , rc->name
494                               , rc->calculation->parsed_as
495                               , buffer_tostring(rc->calculation->error_msg)
496                         );
497                     }
498                     else {
499                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
500
501                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)"
502                               , host->hostname
503                               , rc->chart ? rc->chart : "NOCHART"
504                               , rc->name
505                               , rc->calculation->parsed_as
506                               , rc->calculation->result
507                               , buffer_tostring(rc->calculation->error_msg)
508                               , rc->source
509                         );
510
511                         rc->value = rc->calculation->result;
512                     }
513                 }
514             }
515             rrdhost_unlock(host);
516
517             if(unlikely(runnable && !netdata_exit)) {
518                 rrdhost_rdlock(host);
519
520                 for(rc = host->alarms; rc; rc = rc->next) {
521                     if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
522                         continue;
523
524                     int warning_status  = RRDCALC_STATUS_UNDEFINED;
525                     int critical_status = RRDCALC_STATUS_UNDEFINED;
526
527                     // --------------------------------------------------------
528                     // check the warning expression
529
530                     if(likely(rc->warning)) {
531                         if(unlikely(!expression_evaluate(rc->warning))) {
532                             // calculation failed
533                             rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
534
535                             debug(D_HEALTH
536                                   , "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s"
537                                   , host->hostname
538                                   , rc->chart ? rc->chart : "NOCHART"
539                                   , rc->name
540                                   , buffer_tostring(rc->warning->error_msg)
541                             );
542                         }
543                         else {
544                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
545                             debug(D_HEALTH
546                                   , "Health on host '%s', alarm '%s.%s': warning expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)"
547                                   , host->hostname
548                                   , rc->chart ? rc->chart : "NOCHART"
549                                   , rc->name
550                                   , rc->warning->result
551                                   , buffer_tostring(rc->warning->error_msg)
552                                   , rc->source
553                             );
554                             warning_status = rrdcalc_value2status(rc->warning->result);
555                         }
556                     }
557
558                     // --------------------------------------------------------
559                     // check the critical expression
560
561                     if(likely(rc->critical)) {
562                         if(unlikely(!expression_evaluate(rc->critical))) {
563                             // calculation failed
564                             rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
565
566                             debug(D_HEALTH
567                                   , "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s"
568                                   , host->hostname
569                                   , rc->chart ? rc->chart : "NOCHART"
570                                   , rc->name
571                                   , buffer_tostring(rc->critical->error_msg)
572                             );
573                         }
574                         else {
575                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
576                             debug(D_HEALTH
577                                   , "Health on host '%s', alarm '%s.%s': critical expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)"
578                                   , host->hostname
579                                   , rc->chart ? rc->chart : "NOCHART"
580                                   , rc->name
581                                   , rc->critical->result
582                                   , buffer_tostring(rc->critical->error_msg)
583                                   , rc->source
584                             );
585                             critical_status = rrdcalc_value2status(rc->critical->result);
586                         }
587                     }
588
589                     // --------------------------------------------------------
590                     // decide the final alarm status
591
592                     int status = RRDCALC_STATUS_UNDEFINED;
593
594                     switch(warning_status) {
595                         case RRDCALC_STATUS_CLEAR:
596                             status = RRDCALC_STATUS_CLEAR;
597                             break;
598
599                         case RRDCALC_STATUS_RAISED:
600                             status = RRDCALC_STATUS_WARNING;
601                             break;
602
603                         default:
604                             break;
605                     }
606
607                     switch(critical_status) {
608                         case RRDCALC_STATUS_CLEAR:
609                             if(status == RRDCALC_STATUS_UNDEFINED)
610                                 status = RRDCALC_STATUS_CLEAR;
611                             break;
612
613                         case RRDCALC_STATUS_RAISED:
614                             status = RRDCALC_STATUS_CRITICAL;
615                             break;
616
617                         default:
618                             break;
619                     }
620
621                     // --------------------------------------------------------
622                     // check if the new status and the old differ
623
624                     if(status != rc->status) {
625                         int delay = 0;
626
627                         // apply trigger hysteresis
628
629                         if(now > rc->delay_up_to_timestamp) {
630                             rc->delay_up_current = rc->delay_up_duration;
631                             rc->delay_down_current = rc->delay_down_duration;
632                             rc->delay_last = 0;
633                             rc->delay_up_to_timestamp = 0;
634                         }
635                         else {
636                             rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier);
637                             if(rc->delay_up_current > rc->delay_max_duration)
638                                 rc->delay_up_current = rc->delay_max_duration;
639
640                             rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier);
641                             if(rc->delay_down_current > rc->delay_max_duration)
642                                 rc->delay_down_current = rc->delay_max_duration;
643                         }
644
645                         if(status > rc->status)
646                             delay = rc->delay_up_current;
647                         else
648                             delay = rc->delay_down_current;
649
650                         // COMMENTED: because we do need to send raising alarms
651                         // if(now + delay < rc->delay_up_to_timestamp)
652                         //    delay = (int)(rc->delay_up_to_timestamp - now);
653
654                         rc->delay_last = delay;
655                         rc->delay_up_to_timestamp = now + delay;
656
657                         // add the alarm into the log
658
659                         health_alarm_log(
660                                 host
661                                 , rc->id
662                                 , rc->next_event_id++
663                                 , now
664                                 , rc->name
665                                 , rc->rrdset->id
666                                 , rc->rrdset->family
667                                 , rc->exec
668                                 , rc->recipient
669                                 , now - rc->last_status_change
670                                 , rc->old_value
671                                 , rc->value
672                                 , rc->status
673                                 , status
674                                 , rc->source
675                                 , rc->units
676                                 , rc->info
677                                 , rc->delay_last
678                                 , (rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION) ? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0
679                         );
680
681                         rc->last_status_change = now;
682                         rc->status = status;
683                     }
684
685                     rc->last_updated = now;
686                     rc->next_update = now + rc->update_every;
687
688                     if(next_run > rc->next_update)
689                         next_run = rc->next_update;
690                 }
691
692                 rrdhost_unlock(host);
693             }
694
695             if(unlikely(netdata_exit))
696                 break;
697
698             // execute notifications
699             // and cleanup
700             health_alarm_log_process(host);
701
702             if(unlikely(netdata_exit))
703                 break;
704
705         } /* rrdhost_foreach */
706
707         rrd_unlock();
708
709         if(unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
710             error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
711
712         if(unlikely(netdata_exit))
713             break;
714
715         now = now_realtime_sec();
716         if(now < next_run) {
717             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
718             sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now));
719             now = now_realtime_sec();
720         }
721         else
722             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
723
724         now_boottime = now_boottime_sec();
725
726     } // forever
727
728     buffer_free(wb);
729
730     info("HEALTH thread exiting");
731
732     static_thread->enabled = 0;
733     pthread_exit(NULL);
734     return NULL;
735 }