]> arthur.barton.de Git - netdata.git/blob - src/health.c
0c7983ec1b588e68c5b1db38c92cdd0a304a7626
[netdata.git] / src / health.c
1 #define NETDATA_HEALTH_INTERNALS
2 #include "common.h"
3
4 int default_health_enabled = 1;
5
6 // ----------------------------------------------------------------------------
7 // health initialization
8
9 inline char *health_config_dir(void) {
10     char buffer[FILENAME_MAX + 1];
11     snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_config_dir);
12     return config_get(CONFIG_SECTION_HEALTH, "health configuration directory", buffer);
13 }
14
15 void health_init(void) {
16     debug(D_HEALTH, "Health configuration initializing");
17
18     if(!(default_health_enabled = config_get_boolean(CONFIG_SECTION_HEALTH, "enabled", 1))) {
19         debug(D_HEALTH, "Health is disabled.");
20         return;
21     }
22 }
23
24 // ----------------------------------------------------------------------------
25 // re-load health configuration
26
27 void health_reload_host(RRDHOST *host) {
28     if(unlikely(!host->health_enabled))
29         return;
30
31     char *path = health_config_dir();
32
33     // free all running alarms
34     rrdhost_wrlock(host);
35
36     while(host->templates)
37         rrdcalctemplate_free(host, host->templates);
38
39     while(host->alarms)
40         rrdcalc_free(host, host->alarms);
41
42     rrdhost_unlock(host);
43
44     // invalidate all previous entries in the alarm log
45     ALARM_ENTRY *t;
46     for(t = host->health_log.alarms ; t ; t = t->next) {
47         if(t->new_status != RRDCALC_STATUS_REMOVED)
48             t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
49     }
50
51     rrdhost_rdlock(host);
52     // reset all thresholds to all charts
53     RRDSET *st;
54     rrdset_foreach_read(st, host) {
55         st->green = NAN;
56         st->red = NAN;
57     }
58     rrdhost_unlock(host);
59
60     // load the new alarms
61     rrdhost_wrlock(host);
62     health_readdir(host, path);
63
64     // link the loaded alarms to their charts
65     rrdset_foreach_write(st, host) {
66         rrdsetcalc_link_matching(st);
67         rrdcalctemplate_link_matching(st);
68     }
69
70     rrdhost_unlock(host);
71 }
72
73 void health_reload(void) {
74
75     rrd_rdlock();
76
77     RRDHOST *host;
78     rrdhost_foreach_read(host)
79         health_reload_host(host);
80
81     rrd_unlock();
82 }
83
84 // ----------------------------------------------------------------------------
85 // health main thread and friends
86
87 static inline int rrdcalc_value2status(calculated_number n) {
88     if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED;
89     if(n) return RRDCALC_STATUS_RAISED;
90     return RRDCALC_STATUS_CLEAR;
91 }
92
93 #define ALARM_EXEC_COMMAND_LENGTH 8192
94
95 static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
96     ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
97
98     if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
99         // do not send notifications for internal statuses
100         debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
101         goto done;
102     }
103
104     if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
105         // do not send notifications for disabled statuses
106         debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
107         // mark it as run, so that we will send the same alarm if it happens again
108         goto done;
109     }
110
111     // find the previous notification for the same alarm
112     // which we have run the exec script
113     // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set
114     if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
115         uint32_t id = ae->alarm_id;
116         ALARM_ENTRY *t;
117         for(t = ae->next; t ; t = t->next) {
118             if(t->alarm_id == id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
119                 break;
120         }
121
122         if(likely(t)) {
123             // we have executed this alarm notification in the past
124             if(t && t->new_status == ae->new_status) {
125                 // don't send the notification for the same status again
126                 debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name
127                       , rrdcalc_status2string(ae->new_status));
128                 goto done;
129             }
130         }
131         else {
132             // we have not executed this alarm notification in the past
133             // so, don't send CLEAR notifications
134             if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
135                 debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
136                       , ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
137                 goto done;
138             }
139         }
140     }
141
142     static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1];
143     pid_t command_pid;
144
145     const char *exec      = (ae->exec)      ? ae->exec      : host->health_default_exec;
146     const char *recipient = (ae->recipient) ? ae->recipient : host->health_default_recipient;
147
148     snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s' '%s' '%s'",
149               exec,
150               recipient,
151               host->hostname,
152               ae->unique_id,
153               ae->alarm_id,
154               ae->alarm_event_id,
155               (unsigned long)ae->when,
156               ae->name,
157               ae->chart?ae->chart:"NOCAHRT",
158               ae->family?ae->family:"NOFAMILY",
159               rrdcalc_status2string(ae->new_status),
160               rrdcalc_status2string(ae->old_status),
161               ae->new_value,
162               ae->old_value,
163               ae->source?ae->source:"UNKNOWN",
164               (uint32_t)ae->duration,
165               (uint32_t)ae->non_clear_duration,
166               ae->units?ae->units:"",
167               ae->info?ae->info:"",
168               ae->new_value_string,
169               ae->old_value_string
170     );
171
172     ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
173     ae->exec_run_timestamp = now_realtime_sec();
174
175     debug(D_HEALTH, "executing command '%s'", command_to_run);
176     FILE *fp = mypopen(command_to_run, &command_pid);
177     if(!fp) {
178         error("HEALTH: Cannot popen(\"%s\", \"r\").", command_to_run);
179         goto done;
180     }
181     debug(D_HEALTH, "HEALTH reading from command");
182     char *s = fgets(command_to_run, FILENAME_MAX, fp);
183     (void)s;
184     ae->exec_code = mypclose(fp, command_pid);
185     debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
186
187     if(ae->exec_code != 0)
188         ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
189
190 done:
191     health_alarm_log_save(host, ae);
192     return;
193 }
194
195 static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
196     debug(D_HEALTH, "Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
197          ae->chart?ae->chart:"NOCHART", ae->name,
198          ae->new_value,
199          rrdcalc_status2string(ae->old_status),
200          rrdcalc_status2string(ae->new_status)
201     );
202
203     health_alarm_execute(host, ae);
204 }
205
206 static inline void health_alarm_log_process(RRDHOST *host) {
207     static uint32_t stop_at_id = 0;
208     uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
209     time_t now = now_realtime_sec();
210
211     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
212
213     ALARM_ENTRY *ae;
214     for(ae = host->health_log.alarms; ae && ae->unique_id >= stop_at_id ; ae = ae->next) {
215         if(unlikely(
216             !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
217             !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
218             )) {
219
220             if(unlikely(ae->unique_id < first_waiting))
221                 first_waiting = ae->unique_id;
222
223             if(likely(now >= ae->delay_up_to_timestamp))
224                 health_process_notifications(host, ae);
225         }
226     }
227
228     // remember this for the next iteration
229     stop_at_id = first_waiting;
230
231     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
232
233     if(host->health_log.count <= host->health_log.max)
234         return;
235
236     // cleanup excess entries in the log
237     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
238
239     ALARM_ENTRY *last = NULL;
240     unsigned int count = host->health_log.max * 2 / 3;
241     for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
242
243     if(ae && last && last->next == ae)
244         last->next = NULL;
245     else
246         ae = NULL;
247
248     while(ae) {
249         debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
250
251         ALARM_ENTRY *t = ae->next;
252
253         health_alarm_log_free_one_nochecks_nounlink(ae);
254
255         ae = t;
256         host->health_log.count--;
257     }
258
259     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
260 }
261
262 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
263     if(unlikely(!rc->rrdset)) {
264         debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
265         return 0;
266     }
267
268     if(unlikely(rc->next_update > now)) {
269         if (unlikely(*next_run > rc->next_update)) {
270             // update the next_run time of the main loop
271             // to run this alarm precisely the time required
272             *next_run = rc->next_update;
273         }
274
275         debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
276         return 0;
277     }
278
279     if(unlikely(!rc->update_every)) {
280         debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
281         return 0;
282     }
283
284     if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
285         debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
286         return 0;
287     }
288
289     int update_every = rc->rrdset->update_every;
290     time_t first = rrdset_first_entry_t(rc->rrdset);
291     time_t last = rrdset_last_entry_t(rc->rrdset);
292
293     if(unlikely(now + update_every < first /* || now - update_every > last */)) {
294         debug(D_HEALTH
295               , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
296               , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first
297               , (unsigned long) last);
298         return 0;
299     }
300
301     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
302         time_t needed = now + rc->before + rc->after;
303
304         if(needed + update_every < first || needed - update_every > last) {
305             debug(D_HEALTH
306                   , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
307                   , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first
308                   , (unsigned long) last);
309             return 0;
310         }
311     }
312
313     return 1;
314 }
315
316 void *health_main(void *ptr) {
317     struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
318
319     info("HEALTH thread created with task id %d", gettid());
320
321     if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
322         error("Cannot set pthread cancel type to DEFERRED.");
323
324     if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
325         error("Cannot set pthread cancel state to ENABLE.");
326
327     int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10);
328     if(min_run_every < 1) min_run_every = 1;
329
330     BUFFER *wb = buffer_create(100);
331
332     time_t now               = now_realtime_sec();
333     time_t now_boottime      = now_boottime_sec();
334     time_t last_now          = now;
335     time_t last_now_boottime = now_boottime;
336     time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60);
337
338     unsigned int loop = 0;
339     while(!netdata_exit) {
340         loop++;
341         debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
342
343         int oldstate, runnable = 0, apply_hibernation_delay = 0;
344         time_t next_run = now + min_run_every;
345         RRDCALC *rc;
346
347         // detect if boottime and realtime have twice the difference
348         // in which case we assume the system was just waken from hibernation
349         if(unlikely(now - last_now > 2 * (now_boottime - last_now_boottime)))
350             apply_hibernation_delay = 1;
351
352         last_now = now;
353         last_now_boottime = now_boottime;
354
355         if(unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
356             error("Cannot set pthread cancel state to DISABLE.");
357
358         rrd_rdlock();
359
360         RRDHOST *host;
361         rrdhost_foreach_read(host) {
362             if(unlikely(!host->health_enabled))
363                 continue;
364
365             if(unlikely(apply_hibernation_delay)) {
366
367                 info("Postponing alarm checks for %ld seconds, on host '%s', due to boottime discrepancy (realtime dt: %ld, boottime dt: %ld)."
368                      , hibernation_delay
369                      , host->hostname
370                      , (long)(now - last_now)
371                      , (long)(now_boottime - last_now_boottime)
372                 );
373
374                 host->health_delay_up_to = now + hibernation_delay;
375             }
376
377             if(unlikely(!host->health_enabled || now < host->health_delay_up_to))
378                 continue;
379
380             rrdhost_rdlock(host);
381
382             // the first loop is to lookup values from the db
383             for(rc = host->alarms; rc; rc = rc->next) {
384                 if(unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
385                     if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))
386                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE;
387                     continue;
388                 }
389
390                 runnable++;
391                 rc->old_value = rc->value;
392                 rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE;
393
394                 // ------------------------------------------------------------
395                 // if there is database lookup, do it
396
397                 if(unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
398                     /* time_t old_db_timestamp = rc->db_before; */
399                     int value_is_null = 0;
400
401                     int ret = rrd2value(rc->rrdset
402                                         , wb
403                                         , &rc->value
404                                         , rc->dimensions
405                                         , 1
406                                         , rc->after
407                                         , rc->before
408                                         , rc->group
409                                         , rc->options
410                                         , &rc->db_after
411                                         , &rc->db_before
412                                         , &value_is_null
413                     );
414
415                     if(unlikely(ret != 200)) {
416                         // database lookup failed
417                         rc->value = NAN;
418                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
419
420                         debug(D_HEALTH
421                               , "Health on host '%s', alarm '%s.%s': database lookup returned error %d"
422                               , host->hostname
423                               , rc->chart ? rc->chart : "NOCHART"
424                               , rc->name
425                               , ret
426                         );
427                     }
428                     else
429                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
430
431                     /* - RRDCALC_FLAG_DB_STALE not currently used
432                     if (unlikely(old_db_timestamp == rc->db_before)) {
433                         // database is stale
434
435                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
436
437                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
438                             rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
439                             error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
440                         }
441                     }
442                     else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
443                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
444                     */
445
446                     if(unlikely(value_is_null)) {
447                         // collected value is null
448                         rc->value = NAN;
449                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
450
451                         debug(D_HEALTH
452                               , "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)"
453                               , host->hostname
454                               , rc->chart ? rc->chart : "NOCHART"
455                               , rc->name
456                         );
457                     }
458                     else
459                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
460
461                     debug(D_HEALTH
462                           , "Health on host '%s', alarm '%s.%s': database lookup gave value " CALCULATED_NUMBER_FORMAT
463                           , host->hostname
464                           , rc->chart ? rc->chart : "NOCHART"
465                           , rc->name
466                           , rc->value
467                     );
468                 }
469
470                 // ------------------------------------------------------------
471                 // if there is calculation expression, run it
472
473                 if(unlikely(rc->calculation)) {
474                     if(unlikely(!expression_evaluate(rc->calculation))) {
475                         // calculation failed
476                         rc->value = NAN;
477                         rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
478
479                         debug(D_HEALTH
480                               , "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s"
481                               , host->hostname
482                               , rc->chart ? rc->chart : "NOCHART"
483                               , rc->name
484                               , rc->calculation->parsed_as
485                               , buffer_tostring(rc->calculation->error_msg)
486                         );
487                     }
488                     else {
489                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
490
491                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)"
492                               , host->hostname
493                               , rc->chart ? rc->chart : "NOCHART"
494                               , rc->name
495                               , rc->calculation->parsed_as
496                               , rc->calculation->result
497                               , buffer_tostring(rc->calculation->error_msg)
498                               , rc->source
499                         );
500
501                         rc->value = rc->calculation->result;
502                     }
503                 }
504             }
505             rrdhost_unlock(host);
506
507             if(unlikely(runnable && !netdata_exit)) {
508                 rrdhost_rdlock(host);
509
510                 for(rc = host->alarms; rc; rc = rc->next) {
511                     if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
512                         continue;
513
514                     int warning_status  = RRDCALC_STATUS_UNDEFINED;
515                     int critical_status = RRDCALC_STATUS_UNDEFINED;
516
517                     // --------------------------------------------------------
518                     // check the warning expression
519
520                     if(likely(rc->warning)) {
521                         if(unlikely(!expression_evaluate(rc->warning))) {
522                             // calculation failed
523                             rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
524
525                             debug(D_HEALTH
526                                   , "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s"
527                                   , host->hostname
528                                   , rc->chart ? rc->chart : "NOCHART"
529                                   , rc->name
530                                   , buffer_tostring(rc->warning->error_msg)
531                             );
532                         }
533                         else {
534                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
535                             debug(D_HEALTH
536                                   , "Health on host '%s', alarm '%s.%s': warning expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)"
537                                   , host->hostname
538                                   , rc->chart ? rc->chart : "NOCHART"
539                                   , rc->name
540                                   , rc->warning->result
541                                   , buffer_tostring(rc->warning->error_msg)
542                                   , rc->source
543                             );
544                             warning_status = rrdcalc_value2status(rc->warning->result);
545                         }
546                     }
547
548                     // --------------------------------------------------------
549                     // check the critical expression
550
551                     if(likely(rc->critical)) {
552                         if(unlikely(!expression_evaluate(rc->critical))) {
553                             // calculation failed
554                             rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
555
556                             debug(D_HEALTH
557                                   , "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s"
558                                   , host->hostname
559                                   , rc->chart ? rc->chart : "NOCHART"
560                                   , rc->name
561                                   , buffer_tostring(rc->critical->error_msg)
562                             );
563                         }
564                         else {
565                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
566                             debug(D_HEALTH
567                                   , "Health on host '%s', alarm '%s.%s': critical expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)"
568                                   , host->hostname
569                                   , rc->chart ? rc->chart : "NOCHART"
570                                   , rc->name
571                                   , rc->critical->result
572                                   , buffer_tostring(rc->critical->error_msg)
573                                   , rc->source
574                             );
575                             critical_status = rrdcalc_value2status(rc->critical->result);
576                         }
577                     }
578
579                     // --------------------------------------------------------
580                     // decide the final alarm status
581
582                     int status = RRDCALC_STATUS_UNDEFINED;
583
584                     switch(warning_status) {
585                         case RRDCALC_STATUS_CLEAR:
586                             status = RRDCALC_STATUS_CLEAR;
587                             break;
588
589                         case RRDCALC_STATUS_RAISED:
590                             status = RRDCALC_STATUS_WARNING;
591                             break;
592
593                         default:
594                             break;
595                     }
596
597                     switch(critical_status) {
598                         case RRDCALC_STATUS_CLEAR:
599                             if(status == RRDCALC_STATUS_UNDEFINED)
600                                 status = RRDCALC_STATUS_CLEAR;
601                             break;
602
603                         case RRDCALC_STATUS_RAISED:
604                             status = RRDCALC_STATUS_CRITICAL;
605                             break;
606
607                         default:
608                             break;
609                     }
610
611                     // --------------------------------------------------------
612                     // check if the new status and the old differ
613
614                     if(status != rc->status) {
615                         int delay = 0;
616
617                         // apply trigger hysteresis
618
619                         if(now > rc->delay_up_to_timestamp) {
620                             rc->delay_up_current = rc->delay_up_duration;
621                             rc->delay_down_current = rc->delay_down_duration;
622                             rc->delay_last = 0;
623                             rc->delay_up_to_timestamp = 0;
624                         }
625                         else {
626                             rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier);
627                             if(rc->delay_up_current > rc->delay_max_duration)
628                                 rc->delay_up_current = rc->delay_max_duration;
629
630                             rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier);
631                             if(rc->delay_down_current > rc->delay_max_duration)
632                                 rc->delay_down_current = rc->delay_max_duration;
633                         }
634
635                         if(status > rc->status)
636                             delay = rc->delay_up_current;
637                         else
638                             delay = rc->delay_down_current;
639
640                         // COMMENTED: because we do need to send raising alarms
641                         // if(now + delay < rc->delay_up_to_timestamp)
642                         //    delay = (int)(rc->delay_up_to_timestamp - now);
643
644                         rc->delay_last = delay;
645                         rc->delay_up_to_timestamp = now + delay;
646
647                         // add the alarm into the log
648
649                         health_alarm_log(
650                                 host
651                                 , rc->id
652                                 , rc->next_event_id++
653                                 , now
654                                 , rc->name
655                                 , rc->rrdset->id
656                                 , rc->rrdset->family
657                                 , rc->exec
658                                 , rc->recipient
659                                 , now - rc->last_status_change
660                                 , rc->old_value
661                                 , rc->value
662                                 , rc->status
663                                 , status
664                                 , rc->source
665                                 , rc->units
666                                 , rc->info
667                                 , rc->delay_last
668                                 , (rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION) ? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0
669                         );
670
671                         rc->last_status_change = now;
672                         rc->status = status;
673                     }
674
675                     rc->last_updated = now;
676                     rc->next_update = now + rc->update_every;
677
678                     if(next_run > rc->next_update)
679                         next_run = rc->next_update;
680                 }
681
682                 rrdhost_unlock(host);
683             }
684
685             if(unlikely(netdata_exit))
686                 break;
687
688             // execute notifications
689             // and cleanup
690             health_alarm_log_process(host);
691
692             if(unlikely(netdata_exit))
693                 break;
694
695         } /* rrdhost_foreach */
696
697         rrd_unlock();
698
699         if(unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
700             error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
701
702         if(unlikely(netdata_exit))
703             break;
704
705         now = now_realtime_sec();
706         if(now < next_run) {
707             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
708             sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now));
709             now = now_realtime_sec();
710         }
711         else
712             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
713
714         now_boottime = now_boottime_sec();
715
716     } // forever
717
718     buffer_free(wb);
719
720     info("HEALTH thread exiting");
721
722     static_thread->enabled = 0;
723     pthread_exit(NULL);
724     return NULL;
725 }