]> arthur.barton.de Git - netdata.git/blob - src/health.c
rotate health log file
[netdata.git] / src / health.c
1 #include "common.h"
2
3 #define RRDVAR_MAX_LENGTH 1024
4
5 struct health_options {
6     const char *health_default_exec;
7     const char *health_default_recipient;
8     const char *log_filename;
9     size_t log_entries_written;
10     FILE *log_fp;
11 };
12
13 static struct health_options health = {
14     .health_default_exec = PLUGINS_DIR "/alarm-notify.sh",
15     .health_default_recipient = "root",
16     .log_filename = VARLIB_DIR "/health/alarm_log.db",
17     .log_entries_written = 0,
18     .log_fp = NULL
19 };
20
21 int health_enabled = 1;
22
23 // ----------------------------------------------------------------------------
24 // health alarm log load/save
25 // no need for locking - only one thread is reading / writing the alarms log
26
27 static inline int health_alarm_log_open(void) {
28     if(health.log_fp)
29         fclose(health.log_fp);
30
31     health.log_fp = fopen(health.log_filename, "a");
32
33     if(health.log_fp) {
34         if (setvbuf(health.log_fp, NULL, _IOLBF, 0) != 0)
35             error("Health: cannot set line buffering on health log file.");
36         return 0;
37     }
38
39     error("Health: cannot open health log file '%s'. Health data will be lost in case of netdata or server crash.", health.log_filename);
40     return -1;
41 }
42
43 static inline void health_alarm_log_close(void) {
44     if(health.log_fp) {
45         fclose(health.log_fp);
46         health.log_fp = NULL;
47     }
48 }
49
50 static inline void health_log_rotate(void) {
51     static size_t rotate_every = 0;
52
53     if(unlikely(rotate_every == 0)) {
54         rotate_every = (size_t)config_get_number("health", "rotate log every lines", 100);
55         if(rotate_every < 100) rotate_every = 100;
56     }
57
58     if(unlikely(health.log_entries_written > rotate_every)) {
59         health_alarm_log_close();
60
61         char old_filename[FILENAME_MAX + 1];
62         snprintfz(old_filename, FILENAME_MAX, "%s.old", health.log_filename);
63
64         if(unlink(old_filename) == -1 && errno != ENOENT)
65             error("Health: cannot remove old alarms log file '%s'", old_filename);
66
67         if(link(health.log_filename, old_filename) == -1 && errno != ENOENT)
68             error("Health: cannot move file '%s' to '%s'.", health.log_filename, old_filename);
69
70         // open it with truncate
71         health.log_fp = fopen(health.log_filename, "w");
72
73         if(health.log_fp)
74             fclose(health.log_fp);
75         else
76             error("Health: cannot truncate health log '%s'", health.log_filename);
77
78         health.log_fp = NULL;
79
80         health_alarm_log_open();
81     }
82 }
83
84 static inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
85     health_log_rotate();
86
87     if(likely(health.log_fp)) {
88         if(unlikely(fprintf(health.log_fp
89                 , "%c\t%s"
90                   "\t%08x\t%08x\t%08x\t%08x\t%08x"
91                   "\t%08x\t%08x\t%08x"
92                   "\t%08x\t%08x\t%08x"
93                   "\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s"
94                   "\t%d\t%d\t%d\t%d"
95                   "\t%Lf\t%Lf"
96                   "\n"
97                 , (ae->flags & HEALTH_ENTRY_FLAG_SAVED)?'U':'A'
98                 , host->hostname
99
100                 , ae->unique_id
101                 , ae->alarm_id
102                 , ae->alarm_event_id
103                 , ae->updated_by_id
104                 , ae->updates_id
105
106                 , (uint32_t)ae->when
107                 , (uint32_t)ae->duration
108                 , (uint32_t)ae->non_clear_duration
109                 , (uint32_t)ae->flags
110                 , (uint32_t)ae->exec_run_timestamp
111                 , (uint32_t)ae->delay_up_to_timestamp
112
113                 , (ae->name)?ae->name:""
114                 , (ae->chart)?ae->chart:""
115                 , (ae->family)?ae->family:""
116                 , (ae->exec)?ae->exec:""
117                 , (ae->recipient)?ae->recipient:""
118                 , (ae->source)?ae->source:""
119                 , (ae->units)?ae->units:""
120                 , (ae->info)?ae->info:""
121
122                 , ae->exec_code
123                 , ae->new_status
124                 , ae->old_status
125                 , ae->delay
126
127                 , (long double)ae->new_value
128                 , (long double)ae->old_value
129         ) < 0))
130             error("Health: failed to save alarm log entry. Health data may be lost in case of abnormal restart.");
131         else {
132             ae->flags |= HEALTH_ENTRY_FLAG_SAVED;
133             health.log_entries_written++;
134         }
135     }
136 }
137
138 static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filename) {
139     uint32_t max_unique_id = 0, max_alarm_id = 0;
140     ssize_t loaded = -1, updated = -1, errored = -1, duplicate = -1;
141
142     errno = 0;
143
144     char *s, *buf = mallocz(65536 + 1);
145     size_t line = 0, len = 0;
146     loaded = updated = errored = duplicate = 0;
147
148     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
149
150     while((s = fgets_trim_len(buf, 65536, fp, &len))) {
151         health.log_entries_written++;
152         line++;
153
154         int max_entries = 30, entries = 0;
155         char *pointers[max_entries];
156
157         pointers[entries++] = s++;
158         while(*s) {
159             if(unlikely(*s == '\t')) {
160                 *s = '\0';
161                 pointers[entries++] = ++s;
162                 if(entries >= max_entries) {
163                     error("Health: line %zu of file '%s' has more than %d entries. Ignoring excessive entries.", line, filename, max_entries);
164                     break;
165                 }
166             }
167             else s++;
168         }
169
170         if(likely(*pointers[0] == 'U' || *pointers[0] == 'A')) {
171             ALARM_ENTRY *ae = NULL;
172
173             if(entries < 26) {
174                 error("Health: line %zu of file '%s' should have at least 26 entries, but it has %d. Ignoring line.", line, filename, entries);
175                 errored++;
176                 continue;
177             }
178
179             // check that we have valid ids
180             uint32_t unique_id = (uint32_t)strtoul(pointers[2], NULL, 16);
181             if(!unique_id) {
182                 error("Health: line %zu of file '%s' states alarm entry with unique id %u (%s). Ignoring line.", line, filename, unique_id, pointers[2]);
183                 errored++;
184                 continue;
185             }
186
187             uint32_t alarm_id = (uint32_t)strtoul(pointers[3], NULL, 16);
188             if(!alarm_id) {
189                 error("Health: line %zu of file '%s' states alarm entry for alarm id %u (%s). Ignoring line.", line, filename, alarm_id, pointers[3]);
190                 errored++;
191                 continue;
192             }
193
194             // find a possible overwrite
195             for(ae = host->health_log.alarms; ae; ae = ae->next) {
196                 if(unlikely(ae->unique_id == unique_id)) {
197                     if(unlikely(*pointers[0] == 'A')) {
198                         error("Health: line %zu of file '%s' adds duplicate alarm log entry with unique id %u."
199                               , line, filename, unique_id);
200                         *pointers[0] = 'U';
201                         duplicate++;
202                     }
203                     break;
204                 }
205             }
206
207             // if not found, create a new one
208             if(likely(!ae)) {
209
210                 // if it is an update, but we haven't found it, make it an addition
211                 if(unlikely(*pointers[0] == 'U')) {
212                     *pointers[0] = 'A';
213                     error("Health: line %zu of file '%s' updates alarm log entry with unique id %u, but it is not found.", line, filename, unique_id);
214                 }
215
216                 // alarms should be added in the right order
217                 if(unlikely(unique_id < max_unique_id)) {
218                     error("Health: line %zu of file '%s' has alarm log entry with %u in wrong order.", line
219                           , filename, ae->unique_id);
220                 }
221
222                 ae = callocz(1, sizeof(ALARM_ENTRY));
223             }
224
225             // check for a possible host missmatch
226             if(strcmp(pointers[1], host->hostname))
227                 error("Health: line %zu of file '%s' provides an alarm for host '%s' but this is named '%s'.", line, filename, pointers[1], host->hostname);
228
229             ae->unique_id               = unique_id;
230             ae->alarm_id                = alarm_id;
231             ae->alarm_event_id          = (uint32_t)strtoul(pointers[4], NULL, 16);
232             ae->updated_by_id           = (uint32_t)strtoul(pointers[5], NULL, 16);
233             ae->updates_id              = (uint32_t)strtoul(pointers[6], NULL, 16);
234
235             ae->when                    = (uint32_t)strtoul(pointers[7], NULL, 16);
236             ae->duration                = (uint32_t)strtoul(pointers[8], NULL, 16);
237             ae->non_clear_duration      = (uint32_t)strtoul(pointers[9], NULL, 16);
238
239             ae->flags                   = (uint32_t)strtoul(pointers[10], NULL, 16);
240             ae->flags |= HEALTH_ENTRY_FLAG_SAVED;
241
242             ae->exec_run_timestamp      = (uint32_t)strtoul(pointers[11], NULL, 16);
243             ae->delay_up_to_timestamp   = (uint32_t)strtoul(pointers[12], NULL, 16);
244
245             if(unlikely(ae->name)) freez(ae->name);
246             ae->name = strdupz(pointers[13]);
247
248             if(unlikely(ae->chart)) freez(ae->chart);
249             ae->chart = strdupz(pointers[14]);
250
251             if(unlikely(ae->family)) freez(ae->family);
252             ae->family = strdupz(pointers[15]);
253
254             if(unlikely(ae->exec)) freez(ae->exec);
255             ae->exec = strdupz(pointers[16]);
256             if(!*ae->exec) { freez(ae->exec); ae->exec = NULL; }
257
258             if(unlikely(ae->recipient)) freez(ae->recipient);
259             ae->recipient = strdupz(pointers[17]);
260             if(!*ae->recipient) { freez(ae->recipient); ae->recipient = NULL; }
261
262             if(unlikely(ae->source)) freez(ae->source);
263             ae->source = strdupz(pointers[18]);
264             if(!*ae->source) { freez(ae->source); ae->source = NULL; }
265
266             if(unlikely(ae->units)) freez(ae->units);
267             ae->units = strdupz(pointers[19]);
268             if(!*ae->units) { freez(ae->units); ae->units = NULL; }
269
270             if(unlikely(ae->info)) freez(ae->info);
271             ae->info = strdupz(pointers[20]);
272             if(!*ae->info) { freez(ae->info); ae->info = NULL; }
273
274             ae->exec_code   = atoi(pointers[21]);
275             ae->new_status  = atoi(pointers[22]);
276             ae->old_status  = atoi(pointers[23]);
277             ae->delay       = atoi(pointers[24]);
278
279             ae->new_value   = strtold(pointers[25], NULL);
280             ae->old_value   = strtold(pointers[26], NULL);
281
282             // add it to host if not already there
283             if(unlikely(*pointers[0] == 'A')) {
284                 ae->next = host->health_log.alarms;
285                 host->health_log.alarms = ae;
286                 loaded++;
287             }
288             else updated++;
289
290             if(unlikely(ae->unique_id > max_unique_id))
291                 max_unique_id = ae->unique_id;
292
293             if(unlikely(ae->alarm_id >= max_alarm_id))
294                 max_alarm_id = ae->alarm_id;
295         }
296         else {
297             error("Health: line %zu of file '%s' is invalid (unrecognized entry type '%s').", line, filename, pointers[0]);
298             errored++;
299         }
300     }
301
302     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
303
304     freez(buf);
305
306     if(!max_unique_id) max_unique_id = (uint32_t)time(NULL);
307     if(!max_alarm_id)  max_alarm_id  = (uint32_t)time(NULL);
308
309     host->health_log.next_log_id = max_unique_id + 1;
310     host->health_log.next_alarm_id = max_alarm_id + 1;
311
312     info("Health: loaded file '%s' with %zd new alarm entries, updated %zd alarms, errors %zd entries, duplicate %zd", filename, loaded, updated, errored, duplicate);
313     return loaded;
314 }
315
316 static inline void health_alarm_log_load(RRDHOST *host) {
317     health_alarm_log_close();
318
319     char buffer[FILENAME_MAX + 1];
320     snprintfz(buffer, FILENAME_MAX, "%s.old", health.log_filename);
321     FILE *fp = fopen(buffer, "r");
322     if(!fp)
323         error("Health: cannot open health file: %s", health.log_filename);
324     else {
325         health_alarm_log_read(host, fp, health.log_filename);
326         fclose(fp);
327     }
328
329     health.log_entries_written = 0;
330     fp = fopen(health.log_filename, "r");
331     if(!fp)
332         error("Health: cannot open health file: %s", health.log_filename);
333     else {
334         health_alarm_log_read(host, fp, health.log_filename);
335         fclose(fp);
336     }
337
338     health_alarm_log_open();
339 }
340
341
342 // ----------------------------------------------------------------------------
343 // health alarm log management
344
345 static inline void health_alarm_log(RRDHOST *host,
346                 uint32_t alarm_id, uint32_t alarm_event_id,
347                 time_t when,
348                 const char *name, const char *chart, const char *family,
349                 const char *exec, const char *recipient, time_t duration,
350                 calculated_number old_value, calculated_number new_value,
351                 int old_status, int new_status,
352                 const char *source,
353                 const char *units,
354                 const char *info,
355                 int delay
356 ) {
357     debug(D_HEALTH, "Health adding alarm log entry with id: %u", host->health_log.next_log_id);
358
359     ALARM_ENTRY *ae = callocz(1, sizeof(ALARM_ENTRY));
360     ae->name = strdupz(name);
361     ae->hash_name = simple_hash(ae->name);
362
363     if(chart) {
364         ae->chart = strdupz(chart);
365         ae->hash_chart = simple_hash(ae->chart);
366     }
367
368     if(family)
369         ae->family = strdupz(family);
370
371     if(exec) ae->exec = strdupz(exec);
372     if(recipient) ae->recipient = strdupz(recipient);
373     if(source) ae->source = strdupz(source);
374     if(units) ae->units = strdupz(units);
375     if(info) ae->info = strdupz(info);
376
377     ae->unique_id = host->health_log.next_log_id++;
378     ae->alarm_id = alarm_id;
379     ae->alarm_event_id = alarm_event_id;
380     ae->when = when;
381     ae->old_value = old_value;
382     ae->new_value = new_value;
383     ae->old_status = old_status;
384     ae->new_status = new_status;
385     ae->duration = duration;
386     ae->delay = delay;
387     ae->delay_up_to_timestamp = when + delay;
388
389     if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL)
390         ae->non_clear_duration += ae->duration;
391
392     // link it
393     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
394     ae->next = host->health_log.alarms;
395     host->health_log.alarms = ae;
396     host->health_log.count++;
397     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
398
399     // match previous alarms
400     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
401     ALARM_ENTRY *t;
402     for(t = host->health_log.alarms ; t ; t = t->next) {
403         if(t != ae && t->alarm_id == ae->alarm_id) {
404             if(!(t->flags & HEALTH_ENTRY_FLAG_UPDATED) && !t->updated_by_id) {
405                 t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
406                 t->updated_by_id = ae->unique_id;
407                 ae->updates_id = t->unique_id;
408
409                 if((t->new_status == RRDCALC_STATUS_WARNING || t->new_status == RRDCALC_STATUS_CRITICAL) &&
410                    (t->old_status == RRDCALC_STATUS_WARNING || t->old_status == RRDCALC_STATUS_CRITICAL))
411                     ae->non_clear_duration += t->non_clear_duration;
412
413                 health_alarm_log_save(host, t);
414             }
415
416             // no need to continue
417             break;
418         }
419     }
420     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
421
422     health_alarm_log_save(host, ae);
423 }
424
425 // ----------------------------------------------------------------------------
426 // RRDVAR management
427
// Sanitizes a variable name in place: every character that is not
// alphanumeric, '.' or '_' is replaced with '_'.
//
//   variable  NUL terminated, writable string
//
// Returns the number of characters replaced (0 if the name was already valid).
static inline int rrdvar_fix_name(char *variable) {
    int fixed = 0;

    for( ; *variable ; variable++) {
        // <ctype.h> functions are only defined for values representable as
        // unsigned char (or EOF); a plain char may be negative
        unsigned char c = (unsigned char)*variable;

        if(!isalnum(c) && c != '.' && c != '_') {
            *variable = '_';
            fixed++;
        }
    }

    return fixed;
}
441
442 int rrdvar_compare(void* a, void* b) {
443     if(((RRDVAR *)a)->hash < ((RRDVAR *)b)->hash) return -1;
444     else if(((RRDVAR *)a)->hash > ((RRDVAR *)b)->hash) return 1;
445     else return strcmp(((RRDVAR *)a)->name, ((RRDVAR *)b)->name);
446 }
447
448 static inline RRDVAR *rrdvar_index_add(avl_tree_lock *tree, RRDVAR *rv) {
449     RRDVAR *ret = (RRDVAR *)avl_insert_lock(tree, (avl *)(rv));
450     if(ret != rv)
451         debug(D_VARIABLES, "Request to insert RRDVAR '%s' into index failed. Already exists.", rv->name);
452
453     return ret;
454 }
455
456 static inline RRDVAR *rrdvar_index_del(avl_tree_lock *tree, RRDVAR *rv) {
457     RRDVAR *ret = (RRDVAR *)avl_remove_lock(tree, (avl *)(rv));
458     if(!ret)
459         error("Request to remove RRDVAR '%s' from index failed. Not Found.", rv->name);
460
461     return ret;
462 }
463
464 static inline RRDVAR *rrdvar_index_find(avl_tree_lock *tree, const char *name, uint32_t hash) {
465     RRDVAR tmp;
466     tmp.name = (char *)name;
467     tmp.hash = (hash)?hash:simple_hash(tmp.name);
468
469     return (RRDVAR *)avl_search_lock(tree, (avl *)&tmp);
470 }
471
472 static inline void rrdvar_free(RRDHOST *host, avl_tree_lock *tree, RRDVAR *rv) {
473     (void)host;
474
475     if(!rv) return;
476
477     if(tree)
478         rrdvar_index_del(tree, rv);
479
480     freez(rv->name);
481     freez(rv);
482 }
483
484 static inline RRDVAR *rrdvar_create_and_index(const char *scope, avl_tree_lock *tree, const char *name, int type, calculated_number *value) {
485     char *variable = strdupz(name);
486     rrdvar_fix_name(variable);
487     uint32_t hash = simple_hash(variable);
488
489     RRDVAR *rv = rrdvar_index_find(tree, variable, hash);
490     if(unlikely(!rv)) {
491         debug(D_VARIABLES, "Variable '%s' not found in scope '%s'. Creating a new one.", variable, scope);
492
493         rv = callocz(1, sizeof(RRDVAR));
494         rv->name = variable;
495         rv->hash = hash;
496         rv->type = type;
497         rv->value = value;
498
499         RRDVAR *ret = rrdvar_index_add(tree, rv);
500         if(unlikely(ret != rv)) {
501             debug(D_VARIABLES, "Variable '%s' in scope '%s' already exists", variable, scope);
502             rrdvar_free(NULL, NULL, rv);
503             rv = NULL;
504         }
505         else
506             debug(D_VARIABLES, "Variable '%s' created in scope '%s'", variable, scope);
507     }
508     else {
509         // already exists
510         freez(variable);
511         rv = NULL;
512     }
513
514     return rv;
515 }
516
517 // ----------------------------------------------------------------------------
518 // RRDVAR lookup
519
520 calculated_number rrdvar2number(RRDVAR *rv) {
521     switch(rv->type) {
522         case RRDVAR_TYPE_CALCULATED: {
523             calculated_number *n = (calculated_number *)rv->value;
524             return *n;
525         }
526
527         case RRDVAR_TYPE_TIME_T: {
528             time_t *n = (time_t *)rv->value;
529             return *n;
530         }
531
532         case RRDVAR_TYPE_COLLECTED: {
533             collected_number *n = (collected_number *)rv->value;
534             return *n;
535         }
536
537         case RRDVAR_TYPE_TOTAL: {
538             total_number *n = (total_number *)rv->value;
539             return *n;
540         }
541
542         case RRDVAR_TYPE_INT: {
543             int *n = (int *)rv->value;
544             return *n;
545         }
546
547         default:
548             error("I don't know how to convert RRDVAR type %d to calculated_number", rv->type);
549             return NAN;
550     }
551 }
552
553 void dump_variable(void *data) {
554     RRDVAR *rv = (RRDVAR *)data;
555     debug(D_HEALTH, "%50s : %20.5Lf", rv->name, rrdvar2number(rv));
556 }
557
558 int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, calculated_number *result) {
559     RRDSET *st = rc->rrdset;
560     RRDVAR *rv;
561
562     if(!st) return 0;
563
564     rv = rrdvar_index_find(&st->variables_root_index, variable, hash);
565     if(rv) {
566         *result = rrdvar2number(rv);
567         return 1;
568     }
569
570     rv = rrdvar_index_find(&st->rrdfamily->variables_root_index, variable, hash);
571     if(rv) {
572         *result = rrdvar2number(rv);
573         return 1;
574     }
575
576     rv = rrdvar_index_find(&st->rrdhost->variables_root_index, variable, hash);
577     if(rv) {
578         *result = rrdvar2number(rv);
579         return 1;
580     }
581
582     debug(D_HEALTH, "Available local chart '%s' variables:", st->id);
583     avl_traverse_lock(&st->variables_root_index, dump_variable);
584
585     debug(D_HEALTH, "Available family '%s' variables:", st->rrdfamily->family);
586     avl_traverse_lock(&st->rrdfamily->variables_root_index, dump_variable);
587
588     debug(D_HEALTH, "Available host '%s' variables:", st->rrdhost->hostname);
589     avl_traverse_lock(&st->rrdhost->variables_root_index, dump_variable);
590
591     return 0;
592 }
593
594 // ----------------------------------------------------------------------------
595 // RRDSETVAR management
596
597 RRDSETVAR *rrdsetvar_create(RRDSET *st, const char *variable, int type, void *value, uint32_t options) {
598     debug(D_VARIABLES, "RRDVARSET create for chart id '%s' name '%s' with variable name '%s'", st->id, st->name, variable);
599     RRDSETVAR *rs = (RRDSETVAR *)callocz(1, sizeof(RRDSETVAR));
600
601     char buffer[RRDVAR_MAX_LENGTH + 1];
602     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->id, variable);
603     rs->fullid = strdupz(buffer);
604
605     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, variable);
606     rs->fullname = strdupz(buffer);
607
608     rs->variable = strdupz(variable);
609
610     rs->type = type;
611     rs->value = value;
612     rs->options = options;
613     rs->rrdset = st;
614
615     rs->local       = rrdvar_create_and_index("local",  &st->variables_root_index, rs->variable, rs->type, rs->value);
616     rs->family      = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullid, rs->type, rs->value);
617     rs->host        = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullid, rs->type, rs->value);
618     rs->family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullname, rs->type, rs->value);
619     rs->host_name   = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullname, rs->type, rs->value);
620
621     rs->next = st->variables;
622     st->variables = rs;
623
624     return rs;
625 }
626
627 void rrdsetvar_rename_all(RRDSET *st) {
628     debug(D_VARIABLES, "RRDSETVAR rename for chart id '%s' name '%s'", st->id, st->name);
629
630     // only these 2 can change name
631     // rs->family_name
632     // rs->host_name
633
634     char buffer[RRDVAR_MAX_LENGTH + 1];
635     RRDSETVAR *rs, *next = st->variables;
636     while((rs = next)) {
637         next = rs->next;
638
639         snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rs->variable);
640
641         if (strcmp(buffer, rs->fullname)) {
642             // name changed
643             rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
644             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_name);
645
646             freez(rs->fullname);
647             rs->fullname = strdupz(st->name);
648             rs->family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullname, rs->type, rs->value);
649             rs->host_name   = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullname, rs->type, rs->value);
650         }
651     }
652
653     rrdsetcalc_link_matching(st);
654 }
655
656 void rrdsetvar_free(RRDSETVAR *rs) {
657     RRDSET *st = rs->rrdset;
658     debug(D_VARIABLES, "RRDSETVAR free for chart id '%s' name '%s', variable '%s'", st->id, st->name, rs->variable);
659
660     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local);
661     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family);
662     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host);
663     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
664     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_name);
665
666     if(st->variables == rs) {
667         st->variables = rs->next;
668     }
669     else {
670         RRDSETVAR *t;
671         for (t = st->variables; t && t->next != rs; t = t->next);
672         if(!t) error("RRDSETVAR '%s' not found in chart '%s' variables linked list", rs->fullname, st->id);
673         else t->next = rs->next;
674     }
675
676     freez(rs->fullid);
677     freez(rs->fullname);
678     freez(rs->variable);
679     freez(rs);
680 }
681
682 // ----------------------------------------------------------------------------
683 // RRDDIMVAR management
684
685 #define RRDDIMVAR_ID_MAX 1024
686
687 RRDDIMVAR *rrddimvar_create(RRDDIM *rd, int type, const char *prefix, const char *suffix, void *value, uint32_t options) {
688     RRDSET *st = rd->rrdset;
689
690     debug(D_VARIABLES, "RRDDIMSET create for chart id '%s' name '%s', dimension id '%s', name '%s%s%s'", st->id, st->name, rd->id, (prefix)?prefix:"", rd->name, (suffix)?suffix:"");
691
692     if(!prefix) prefix = "";
693     if(!suffix) suffix = "";
694
695     char buffer[RRDDIMVAR_ID_MAX + 1];
696     RRDDIMVAR *rs = (RRDDIMVAR *)callocz(1, sizeof(RRDDIMVAR));
697
698     rs->prefix = strdupz(prefix);
699     rs->suffix = strdupz(suffix);
700
701     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->id, rs->suffix);
702     rs->id = strdupz(buffer);
703
704     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
705     rs->name = strdupz(buffer);
706
707     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->id, rs->id);
708     rs->fullidid = strdupz(buffer);
709
710     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->id, rs->name);
711     rs->fullidname = strdupz(buffer);
712
713     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->name, rs->id);
714     rs->fullnameid = strdupz(buffer);
715
716     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->name, rs->name);
717     rs->fullnamename = strdupz(buffer);
718
719     rs->type = type;
720     rs->value = value;
721     rs->options = options;
722     rs->rrddim = rd;
723
724     rs->local_id     = rrdvar_create_and_index("local", &st->variables_root_index, rs->id, rs->type, rs->value);
725     rs->local_name   = rrdvar_create_and_index("local", &st->variables_root_index, rs->name, rs->type, rs->value);
726
727     rs->family_id    = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->id, rs->type, rs->value);
728     rs->family_name  = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->name, rs->type, rs->value);
729
730     rs->host_fullidid     = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullidid, rs->type, rs->value);
731     rs->host_fullidname   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullidname, rs->type, rs->value);
732     rs->host_fullnameid   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullnameid, rs->type, rs->value);
733     rs->host_fullnamename = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullnamename, rs->type, rs->value);
734
735     rs->next = rd->variables;
736     rd->variables = rs;
737
738     return rs;
739 }
740
741 void rrddimvar_rename_all(RRDDIM *rd) {
742     RRDSET *st = rd->rrdset;
743     debug(D_VARIABLES, "RRDDIMSET rename for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
744
745     RRDDIMVAR *rs, *next = rd->variables;
746     while((rs = next)) {
747         next = rs->next;
748
749         if (strcmp(rd->name, rs->name)) {
750             char buffer[RRDDIMVAR_ID_MAX + 1];
751             // name changed
752
753             // name
754             rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_name);
755             freez(rs->name);
756             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
757             rs->name = strdupz(buffer);
758             rs->local_name = rrdvar_create_and_index("local", &st->variables_root_index, rs->name, rs->type, rs->value);
759
760             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidname);
761             freez(rs->fullidname);
762             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->name);
763             rs->fullidname = strdupz(buffer);
764             rs->host_fullidname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index,
765                                                              rs->fullidname, rs->type, rs->value);
766
767             // fullnameid
768             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnameid);
769             freez(rs->fullnameid);
770             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->id);
771             rs->fullnameid = strdupz(buffer);
772             rs->host_fullnameid = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index,
773                                                           rs->fullnameid, rs->type, rs->value);
774
775             // fullnamename
776             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnamename);
777             freez(rs->fullnamename);
778             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->name);
779             rs->fullnamename = strdupz(buffer);
780             rs->host_fullnamename = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index,
781                                                           rs->fullnamename, rs->type, rs->value);
782         }
783     }
784 }
785
786 void rrddimvar_free(RRDDIMVAR *rs) {
787     RRDDIM *rd = rs->rrddim;
788     RRDSET *st = rd->rrdset;
789     debug(D_VARIABLES, "RRDDIMSET free for chart id '%s' name '%s', dimension id '%s', name '%s', prefix='%s', suffix='%s'", st->id, st->name, rd->id, rd->name, rs->prefix, rs->suffix);
790
791     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_id);
792     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_name);
793
794     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_id);
795     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
796
797     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidid);
798     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidname);
799     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnameid);
800     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnamename);
801
802     if(rd->variables == rs) {
803         debug(D_VARIABLES, "RRDDIMSET removing first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
804         rd->variables = rs->next;
805     }
806     else {
807         debug(D_VARIABLES, "RRDDIMSET removing non-first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
808         RRDDIMVAR *t;
809         for (t = rd->variables; t && t->next != rs; t = t->next) ;
810         if(!t) error("RRDDIMVAR '%s' not found in dimension '%s/%s' variables linked list", rs->name, st->id, rd->id);
811         else t->next = rs->next;
812     }
813
814     freez(rs->prefix);
815     freez(rs->suffix);
816     freez(rs->id);
817     freez(rs->name);
818     freez(rs->fullidid);
819     freez(rs->fullidname);
820     freez(rs->fullnameid);
821     freez(rs->fullnamename);
822     freez(rs);
823 }
824
825 // ----------------------------------------------------------------------------
826 // RRDCALC management
827
828 static inline const char *rrdcalc_status2string(int status) {
829     switch(status) {
830         case RRDCALC_STATUS_REMOVED:
831             return "REMOVED";
832
833         case RRDCALC_STATUS_UNDEFINED:
834             return "UNDEFINED";
835
836         case RRDCALC_STATUS_UNINITIALIZED:
837             return "UNINITIALIZED";
838
839         case RRDCALC_STATUS_CLEAR:
840             return "CLEAR";
841
842         case RRDCALC_STATUS_RAISED:
843             return "RAISED";
844
845         case RRDCALC_STATUS_WARNING:
846             return "WARNING";
847
848         case RRDCALC_STATUS_CRITICAL:
849             return "CRITICAL";
850
851         default:
852             error("Unknown alarm status %d", status);
853             return "UNKNOWN";
854     }
855 }
856
857 static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
858     debug(D_HEALTH, "Health linking alarm '%s.%s' to chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, st->rrdhost->hostname);
859
860     rc->last_status_change = time(NULL);
861     rc->rrdset = st;
862
863     rc->rrdset_next = st->alarms;
864     rc->rrdset_prev = NULL;
865     
866     if(rc->rrdset_next)
867         rc->rrdset_next->rrdset_prev = rc;
868
869     st->alarms = rc;
870
871     if(rc->update_every < rc->rrdset->update_every) {
872         error("Health alarm '%s.%s' has update every %d, less than chart update every %d. Setting alarm update frequency to %d.", rc->rrdset->id, rc->name, rc->update_every, rc->rrdset->update_every, rc->rrdset->update_every);
873         rc->update_every = rc->rrdset->update_every;
874     }
875
876     if(!isnan(rc->green) && isnan(st->green)) {
877         debug(D_HEALTH, "Health alarm '%s.%s' green threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->green, rc->green);
878         st->green = rc->green;
879     }
880
881     if(!isnan(rc->red) && isnan(st->red)) {
882         debug(D_HEALTH, "Health alarm '%s.%s' red threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->red, rc->red);
883         st->red = rc->red;
884     }
885
886     rc->local  = rrdvar_create_and_index("local",  &st->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
887     rc->family = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
888
889     char fullname[RRDVAR_MAX_LENGTH + 1];
890     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->id, rc->name);
891     rc->hostid   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
892
893     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rc->name);
894     rc->hostname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
895
896         if(!rc->units) rc->units = strdupz(st->units);
897
898     {
899         time_t now = time(NULL);
900         health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_UNINITIALIZED, rc->source, rc->units, rc->info, 0);
901     }
902 }
903
904 static inline int rrdcalc_is_matching_this_rrdset(RRDCALC *rc, RRDSET *st) {
905     if(     (rc->hash_chart == st->hash      && !strcmp(rc->chart, st->id)) ||
906             (rc->hash_chart == st->hash_name && !strcmp(rc->chart, st->name)))
907         return 1;
908
909     return 0;
910 }
911
912 // this has to be called while the RRDHOST is locked
913 inline void rrdsetcalc_link_matching(RRDSET *st) {
914     // debug(D_HEALTH, "find matching alarms for chart '%s'", st->id);
915
916     RRDCALC *rc;
917     for(rc = st->rrdhost->alarms; rc ; rc = rc->next) {
918         if(unlikely(rc->rrdset))
919             continue;
920
921         if(unlikely(rrdcalc_is_matching_this_rrdset(rc, st)))
922             rrdsetcalc_link(st, rc);
923     }
924 }
925
926 // this has to be called while the RRDHOST is locked
927 inline void rrdsetcalc_unlink(RRDCALC *rc) {
928     RRDSET *st = rc->rrdset;
929
930     if(!st) {
931         debug(D_HEALTH, "Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
932         error("Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
933         return;
934     }
935
936     {
937         time_t now = time(NULL);
938         health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0);
939     }
940
941     RRDHOST *host = st->rrdhost;
942
943     debug(D_HEALTH, "Health unlinking alarm '%s.%s' from chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, host->hostname);
944
945     // unlink it
946     if(rc->rrdset_prev)
947         rc->rrdset_prev->rrdset_next = rc->rrdset_next;
948
949     if(rc->rrdset_next)
950         rc->rrdset_next->rrdset_prev = rc->rrdset_prev;
951
952     if(st->alarms == rc)
953         st->alarms = rc->rrdset_next;
954
955     rc->rrdset_prev = rc->rrdset_next = NULL;
956
957     rrdvar_free(st->rrdhost, &st->variables_root_index, rc->local);
958     rc->local = NULL;
959
960     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rc->family);
961     rc->family = NULL;
962
963     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostid);
964     rc->hostid = NULL;
965
966     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostname);
967     rc->hostname = NULL;
968
969     rc->rrdset = NULL;
970
971     // RRDCALC will remain in RRDHOST
972     // so that if the matching chart is found in the future
973     // it will be applied automatically
974 }
975
976 RRDCALC *rrdcalc_find(RRDSET *st, const char *name) {
977     RRDCALC *rc;
978     uint32_t hash = simple_hash(name);
979
980     for( rc = st->alarms; rc ; rc = rc->rrdset_next ) {
981         if(unlikely(rc->hash == hash && !strcmp(rc->name, name)))
982             return rc;
983     }
984
985     return NULL;
986 }
987
988 static inline int rrdcalc_exists(RRDHOST *host, const char *chart, const char *name, uint32_t hash_chart, uint32_t hash_name) {
989     RRDCALC *rc;
990
991     if(unlikely(!chart)) {
992         error("attempt to find RRDCALC '%s' without giving a chart name", name);
993         return 1;
994     }
995
996     if(unlikely(!hash_chart)) hash_chart = simple_hash(chart);
997     if(unlikely(!hash_name))  hash_name  = simple_hash(name);
998
999     // make sure it does not already exist
1000     for(rc = host->alarms; rc ; rc = rc->next) {
1001         if (unlikely(rc->chart && rc->hash == hash_name && rc->hash_chart == hash_chart && !strcmp(name, rc->name) && !strcmp(chart, rc->chart))) {
1002             debug(D_HEALTH, "Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
1003             error("Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
1004             return 1;
1005         }
1006     }
1007
1008     return 0;
1009 }
1010
1011 static inline uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const char *name, uint32_t *next_event_id) {
1012     if(chart && name) {
1013         uint32_t hash_chart = simple_hash(chart);
1014         uint32_t hash_name = simple_hash(name);
1015
1016         // re-use old IDs, by looking them up in the alarm log
1017         ALARM_ENTRY *ae;
1018         for(ae = host->health_log.alarms; ae ;ae = ae->next) {
1019             if(unlikely(ae->hash_name == hash_name && ae->hash_chart == hash_chart && !strcmp(name, ae->name) && !strcmp(chart, ae->chart))) {
1020                 if(next_event_id) *next_event_id = ae->alarm_event_id + 1;
1021                 return ae->alarm_id;
1022             }
1023         }
1024     }
1025
1026     return host->health_log.next_alarm_id++;
1027 }
1028
1029 static inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) {
1030     rrdhost_check_rdlock(host);
1031
1032     if(rc->calculation) {
1033         rc->calculation->status = &rc->status;
1034         rc->calculation->this = &rc->value;
1035         rc->calculation->after = &rc->db_after;
1036         rc->calculation->before = &rc->db_before;
1037         rc->calculation->rrdcalc = rc;
1038     }
1039
1040     if(rc->warning) {
1041         rc->warning->status = &rc->status;
1042         rc->warning->this = &rc->value;
1043         rc->warning->after = &rc->db_after;
1044         rc->warning->before = &rc->db_before;
1045         rc->warning->rrdcalc = rc;
1046     }
1047
1048     if(rc->critical) {
1049         rc->critical->status = &rc->status;
1050         rc->critical->this = &rc->value;
1051         rc->critical->after = &rc->db_after;
1052         rc->critical->before = &rc->db_before;
1053         rc->critical->rrdcalc = rc;
1054     }
1055
1056     // link it to the host
1057     if(likely(host->alarms)) {
1058         // append it
1059         RRDCALC *t;
1060         for(t = host->alarms; t && t->next ; t = t->next) ;
1061         t->next = rc;
1062     }
1063     else {
1064         host->alarms = rc;
1065     }
1066
1067     // link it to its chart
1068     RRDSET *st;
1069     for(st = host->rrdset_root; st ; st = st->next) {
1070         if(rrdcalc_is_matching_this_rrdset(rc, st)) {
1071             rrdsetcalc_link(st, rc);
1072             break;
1073         }
1074     }
1075 }
1076
1077 static inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart) {
1078
1079     debug(D_HEALTH, "Health creating dynamic alarm (from template) '%s.%s'", chart, rt->name);
1080
1081     if(rrdcalc_exists(host, chart, rt->name, 0, 0))
1082         return NULL;
1083
1084     RRDCALC *rc = callocz(1, sizeof(RRDCALC));
1085     rc->next_event_id = 1;
1086     rc->id = rrdcalc_get_unique_id(host, chart, rt->name, &rc->next_event_id);
1087     rc->name = strdupz(rt->name);
1088     rc->hash = simple_hash(rc->name);
1089     rc->chart = strdupz(chart);
1090     rc->hash_chart = simple_hash(rc->chart);
1091
1092     if(rt->dimensions) rc->dimensions = strdupz(rt->dimensions);
1093
1094     rc->green = rt->green;
1095     rc->red = rt->red;
1096     rc->value = NAN;
1097     rc->old_value = NAN;
1098
1099     rc->delay_up_duration = rt->delay_up_duration;
1100     rc->delay_down_duration = rt->delay_down_duration;
1101     rc->delay_max_duration = rt->delay_max_duration;
1102     rc->delay_multiplier = rt->delay_multiplier;
1103
1104     rc->group = rt->group;
1105     rc->after = rt->after;
1106     rc->before = rt->before;
1107     rc->update_every = rt->update_every;
1108     rc->options = rt->options;
1109
1110     if(rt->exec) rc->exec = strdupz(rt->exec);
1111     if(rt->recipient) rc->recipient = strdupz(rt->recipient);
1112     if(rt->source) rc->source = strdupz(rt->source);
1113     if(rt->units) rc->units = strdupz(rt->units);
1114     if(rt->info) rc->info = strdupz(rt->info);
1115
1116     if(rt->calculation) {
1117         rc->calculation = expression_parse(rt->calculation->source, NULL, NULL);
1118         if(!rc->calculation)
1119             error("Health alarm '%s.%s': failed to parse calculation expression '%s'", chart, rt->name, rt->calculation->source);
1120     }
1121     if(rt->warning) {
1122         rc->warning = expression_parse(rt->warning->source, NULL, NULL);
1123         if(!rc->warning)
1124             error("Health alarm '%s.%s': failed to re-parse warning expression '%s'", chart, rt->name, rt->warning->source);
1125     }
1126     if(rt->critical) {
1127         rc->critical = expression_parse(rt->critical->source, NULL, NULL);
1128         if(!rc->critical)
1129             error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", chart, rt->name, rt->critical->source);
1130     }
1131
1132     debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1133           (rc->chart)?rc->chart:"NOCHART",
1134           rc->name,
1135           (rc->exec)?rc->exec:"DEFAULT",
1136           (rc->recipient)?rc->recipient:"DEFAULT",
1137           rc->green,
1138           rc->red,
1139           rc->group,
1140           rc->after,
1141           rc->before,
1142           rc->options,
1143           (rc->dimensions)?rc->dimensions:"NONE",
1144           rc->update_every,
1145           (rc->calculation)?rc->calculation->parsed_as:"NONE",
1146           (rc->warning)?rc->warning->parsed_as:"NONE",
1147           (rc->critical)?rc->critical->parsed_as:"NONE",
1148           rc->source,
1149           rc->delay_up_duration,
1150           rc->delay_down_duration,
1151           rc->delay_max_duration,
1152           rc->delay_multiplier
1153     );
1154
1155     rrdcalc_create_part2(host, rc);
1156     return rc;
1157 }
1158
1159 void rrdcalc_free(RRDHOST *host, RRDCALC *rc) {
1160     if(!rc) return;
1161
1162     debug(D_HEALTH, "Health removing alarm '%s.%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1163
1164     // unlink it from RRDSET
1165     if(rc->rrdset) rrdsetcalc_unlink(rc);
1166
1167     // unlink it from RRDHOST
1168     if(unlikely(rc == host->alarms))
1169         host->alarms = rc->next;
1170
1171     else if(likely(host->alarms)) {
1172         RRDCALC *t, *last = host->alarms;
1173         for(t = last->next; t && t != rc; last = t, t = t->next) ;
1174         if(last->next == rc)
1175             last->next = rc->next;
1176         else
1177             error("Cannot unlink alarm '%s.%s' from host '%s': not found", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1178     }
1179     else
1180         error("Cannot unlink unlink '%s.%s' from host '%s': This host does not have any calculations", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1181
1182     expression_free(rc->calculation);
1183     expression_free(rc->warning);
1184     expression_free(rc->critical);
1185
1186     freez(rc->name);
1187     freez(rc->chart);
1188     freez(rc->family);
1189     freez(rc->dimensions);
1190     freez(rc->exec);
1191     freez(rc->recipient);
1192     freez(rc->source);
1193     freez(rc->units);
1194     freez(rc->info);
1195     freez(rc);
1196 }
1197
1198 // ----------------------------------------------------------------------------
1199 // RRDCALCTEMPLATE management
1200
1201 void rrdcalctemplate_link_matching(RRDSET *st) {
1202     RRDCALCTEMPLATE *rt;
1203
1204     for(rt = st->rrdhost->templates; rt ; rt = rt->next) {
1205         if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context)) {
1206             RRDCALC *rc = rrdcalc_create(st->rrdhost, rt, st->id);
1207             if(unlikely(!rc))
1208                 error("Health tried to create alarm from template '%s', but it failed", rt->name);
1209
1210 #ifdef NETDATA_INTERNAL_CHECKS
1211             else if(rc->rrdset != st)
1212                 error("Health alarm '%s.%s' should be linked to chart '%s', but it is not", rc->chart?rc->chart:"NOCHART", rc->name, st->id);
1213 #endif
1214         }
1215     }
1216 }
1217
1218 static inline void rrdcalctemplate_free(RRDHOST *host, RRDCALCTEMPLATE *rt) {
1219     debug(D_HEALTH, "Health removing template '%s' of host '%s'", rt->name, host->hostname);
1220
1221     if(host->templates) {
1222         if(host->templates == rt) {
1223             host->templates = rt->next;
1224         }
1225         else {
1226             RRDCALCTEMPLATE *t, *last = host->templates;
1227             for (t = last->next; t && t != rt; last = t, t = t->next ) ;
1228             if(last && last->next == rt) {
1229                 last->next = rt->next;
1230                 rt->next = NULL;
1231             }
1232             else
1233                 error("Cannot find RRDCALCTEMPLATE '%s' linked in host '%s'", rt->name, host->hostname);
1234         }
1235     }
1236
1237     expression_free(rt->calculation);
1238     expression_free(rt->warning);
1239     expression_free(rt->critical);
1240
1241     freez(rt->name);
1242     freez(rt->exec);
1243     freez(rt->recipient);
1244     freez(rt->context);
1245     freez(rt->source);
1246     freez(rt->units);
1247     freez(rt->info);
1248     freez(rt->dimensions);
1249     freez(rt);
1250 }
1251
1252 // ----------------------------------------------------------------------------
1253 // load health configuration
1254
// size (without the terminator) of the buffer health_readfile() declares
// for the configuration text it processes
#define HEALTH_CONF_MAX_LINE    4096

// keywords of the health configuration files;
// health_readfile() computes the hash of each of them once

// the two kinds of entries
#define HEALTH_ALARM_KEY        "alarm"
#define HEALTH_TEMPLATE_KEY     "template"

// the settings of an entry
#define HEALTH_ON_KEY           "on"
#define HEALTH_LOOKUP_KEY       "lookup"
#define HEALTH_CALC_KEY         "calc"
#define HEALTH_EVERY_KEY        "every"
#define HEALTH_GREEN_KEY        "green"
#define HEALTH_RED_KEY          "red"
#define HEALTH_WARN_KEY         "warn"
#define HEALTH_CRIT_KEY         "crit"
#define HEALTH_EXEC_KEY         "exec"
#define HEALTH_RECIPIENT_KEY    "to"
#define HEALTH_UNITS_KEY        "units"
#define HEALTH_INFO_KEY         "info"
#define HEALTH_DELAY_KEY        "delay"
1272
1273 static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
1274     if(!rc->chart) {
1275         error("Health configuration for alarm '%s' does not have a chart", rc->name);
1276         return 0;
1277     }
1278
1279     if(!rc->update_every) {
1280         error("Health configuration for alarm '%s.%s' has no frequency (parameter 'every'). Ignoring it.", rc->chart?rc->chart:"NOCHART", rc->name);
1281         return 0;
1282     }
1283
1284     if(!RRDCALC_HAS_DB_LOOKUP(rc) && !rc->warning && !rc->critical) {
1285         error("Health configuration for alarm '%s.%s' is useless (no calculation, no warning and no critical evaluation)", rc->chart?rc->chart:"NOCHART", rc->name);
1286         return 0;
1287     }
1288
1289     if (rrdcalc_exists(host, rc->chart, rc->name, rc->hash_chart, rc->hash))
1290         return 0;
1291
1292     rc->id = rrdcalc_get_unique_id(&localhost, rc->chart, rc->name, &rc->next_event_id);
1293
1294     debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1295           rc->chart?rc->chart:"NOCHART",
1296           rc->name,
1297           rc->id,
1298           (rc->exec)?rc->exec:"DEFAULT",
1299           (rc->recipient)?rc->recipient:"DEFAULT",
1300           rc->green,
1301           rc->red,
1302           rc->group,
1303           rc->after,
1304           rc->before,
1305           rc->options,
1306           (rc->dimensions)?rc->dimensions:"NONE",
1307           rc->update_every,
1308           (rc->calculation)?rc->calculation->parsed_as:"NONE",
1309           (rc->warning)?rc->warning->parsed_as:"NONE",
1310           (rc->critical)?rc->critical->parsed_as:"NONE",
1311           rc->source,
1312           rc->delay_up_duration,
1313           rc->delay_down_duration,
1314           rc->delay_max_duration,
1315           rc->delay_multiplier
1316     );
1317
1318     rrdcalc_create_part2(host, rc);
1319     return 1;
1320 }
1321
1322 static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCALCTEMPLATE *rt) {
1323     if(unlikely(!rt->context)) {
1324         error("Health configuration for template '%s' does not have a context", rt->name);
1325         return 0;
1326     }
1327
1328     if(unlikely(!rt->update_every)) {
1329         error("Health configuration for template '%s' has no frequency (parameter 'every'). Ignoring it.", rt->name);
1330         return 0;
1331     }
1332
1333     if(unlikely(!RRDCALCTEMPLATE_HAS_CALCULATION(rt) && !rt->warning && !rt->critical)) {
1334         error("Health configuration for template '%s' is useless (no calculation, no warning and no critical evaluation)", rt->name);
1335         return 0;
1336     }
1337
1338     RRDCALCTEMPLATE *t, *last = NULL;
1339     for (t = host->templates; t ; last = t, t = t->next) {
1340         if(unlikely(t->hash_name == rt->hash_name && !strcmp(t->name, rt->name))) {
1341             error("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname);
1342             return 0;
1343         }
1344     }
1345
1346     debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1347           rt->name,
1348           (rt->context)?rt->context:"NONE",
1349           (rt->exec)?rt->exec:"DEFAULT",
1350           (rt->recipient)?rt->recipient:"DEFAULT",
1351           rt->green,
1352           rt->red,
1353           rt->group,
1354           rt->after,
1355           rt->before,
1356           rt->options,
1357           (rt->dimensions)?rt->dimensions:"NONE",
1358           rt->update_every,
1359           (rt->calculation)?rt->calculation->parsed_as:"NONE",
1360           (rt->warning)?rt->warning->parsed_as:"NONE",
1361           (rt->critical)?rt->critical->parsed_as:"NONE",
1362           rt->source,
1363           rt->delay_up_duration,
1364           rt->delay_down_duration,
1365           rt->delay_max_duration,
1366           rt->delay_multiplier
1367     );
1368
1369     if(likely(last)) {
1370         last->next = rt;
1371     }
1372     else {
1373         rt->next = host->templates;
1374         host->templates = rt;
1375     }
1376
1377     return 1;
1378 }
1379
// Parses a duration: a number optionally followed by a unit, giving seconds.
//
// The units are: Y (365 days), M (30 days), w (weeks), d (days), h (hours),
// m (minutes) and s (seconds). Without a unit, or with any other character
// after the number, the number is taken as seconds. Fractions are truncated.
//
// string - the text to parse; it has to start with a digit, '+' or '-'
// result - receives the duration in seconds, or 0 when the text is rejected
//
// Returns 1 on success, 0 when the text does not start with a number.
static inline int health_parse_duration(char *string, int *result) {
    // make sure it is a number
    // (the cast keeps isdigit() defined for bytes with the high bit set)
    if(!*string || !(isdigit((unsigned char)*string) || *string == '+' || *string == '-')) {
        *result = 0;
        return 0;
    }

    // strtold() returns long double, so the value is kept in that type
    char *e = NULL;
    long double n = strtold(string, &e);

    // a sign followed by something that is not a number (e.g. "+" or "-x")
    // passes the check above, but strtold() converts nothing
    if(e == string) {
        *result = 0;
        return 0;
    }

    // e points to the first character after the number
    switch (*e) {
        case 'Y':
            *result = (int) (n * 86400 * 365);
            break;
        case 'M':
            *result = (int) (n * 86400 * 30);
            break;
        case 'w':
            *result = (int) (n * 86400 * 7);
            break;
        case 'd':
            *result = (int) (n * 86400);
            break;
        case 'h':
            *result = (int) (n * 3600);
            break;
        case 'm':
            *result = (int) (n * 60);
            break;

        default:    // no unit at all, or an unknown one
        case 's':
            *result = (int) (n);
            break;
    }

    return 1;
}
1421
// Parses the value of a 'delay' configuration line: a whitespace separated
// list of "keyword value" pairs, where the keyword is one of
// 'up', 'down', 'max' (durations) and 'multiplier' (a positive number).
//
// line, path, file - where the text came from; used only in error messages
// string           - the text to parse; it is modified (split at whitespace)
// delay_*          - receive the parsed values
//
// Settings not given, or given with an invalid value, get their defaults:
// 0 for 'up' and 'down', 1.0 for 'multiplier'. When 'max' is not given
// successfully, *delay_max_duration is only raised, if needed, to cover
// 'up' and 'down' multiplied by the multiplier.
// Invalid values and unknown keywords are logged. Always returns 1.
static inline int health_parse_delay(
        size_t line, const char *path, const char *file, char *string,
        int *delay_up_duration,
        int *delay_down_duration,
        int *delay_max_duration,
        float *delay_multiplier) {

    char given_up = 0;
    char given_down = 0;
    char given_max = 0;
    char given_multiplier = 0;

    char *s = string;
    while(*s) {
        // the keyword - terminate it by overwriting the whitespace after it
        // (the casts keep isspace() defined for bytes with the high bit set)
        char *key = s;

        while(*s && !isspace((unsigned char)*s)) s++;
        while(*s && isspace((unsigned char)*s)) *s++ = '\0';

        if(!*key) break;

        // the value of the keyword - empty when the text ends after the keyword
        char *value = s;
        while(*s && !isspace((unsigned char)*s)) s++;
        while(*s && isspace((unsigned char)*s)) *s++ = '\0';

        if(!strcasecmp(key, "up")) {
            if (!health_parse_duration(value, delay_up_duration)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_up = 1;
        }
        else if(!strcasecmp(key, "down")) {
            if (!health_parse_duration(value, delay_down_duration)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_down = 1;
        }
        else if(!strcasecmp(key, "multiplier")) {
            *delay_multiplier = strtof(value, NULL);
            if(isnan(*delay_multiplier) || isinf(*delay_multiplier) || islessequal(*delay_multiplier, 0)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_multiplier = 1;
        }
        else if(!strcasecmp(key, "max")) {
            if (!health_parse_duration(value, delay_max_duration)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_max = 1;
        }
        else {
            error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
                  line, path, file, key);
        }
    }

    // defaults for whatever has not been given (this also overwrites
    // a multiplier that has been rejected above)
    if(!given_up)
        *delay_up_duration = 0;

    if(!given_down)
        *delay_down_duration = 0;

    if(!given_multiplier)
        *delay_multiplier = 1.0;

    // without an explicit maximum, make sure it is
    // not lower than the multiplied up and down delays
    if(!given_max) {
        if((*delay_max_duration) < (*delay_up_duration) * (*delay_multiplier))
            *delay_max_duration = (int)((*delay_up_duration) * (*delay_multiplier));

        if((*delay_max_duration) < (*delay_down_duration) * (*delay_multiplier))
            *delay_max_duration = (int)((*delay_down_duration) * (*delay_multiplier));
    }

    return 1;
}
1502 static inline int health_parse_db_lookup(
1503         size_t line, const char *path, const char *file, char *string,
1504         int *group_method, int *after, int *before, int *every,
1505         uint32_t *options, char **dimensions
1506 ) {
1507     debug(D_HEALTH, "Health configuration parsing database lookup %zu@%s/%s: %s", line, path, file, string);
1508
1509     if(*dimensions) freez(*dimensions);
1510     *dimensions = NULL;
1511     *after = 0;
1512     *before = 0;
1513     *every = 0;
1514     *options = 0;
1515
1516     char *s = string, *key;
1517
1518     // first is the group method
1519     key = s;
1520     while(*s && !isspace(*s)) s++;
1521     while(*s && isspace(*s)) *s++ = '\0';
1522     if(!*s) {
1523         error("Health configuration invalid chart calculation at line %zu of file '%s/%s': expected group method followed by the 'after' time, but got '%s'",
1524               line, path, file, key);
1525         return 0;
1526     }
1527
1528     if((*group_method = web_client_api_request_v1_data_group(key, -1)) == -1) {
1529         error("Health configuration at line %zu of file '%s/%s': invalid group method '%s'",
1530               line, path, file, key);
1531         return 0;
1532     }
1533
1534     // then is the 'after' time
1535     key = s;
1536     while(*s && !isspace(*s)) s++;
1537     while(*s && isspace(*s)) *s++ = '\0';
1538
1539     if(!health_parse_duration(key, after)) {
1540         error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' after group method",
1541               line, path, file, key);
1542         return 0;
1543     }
1544
1545     // sane defaults
1546     *every = abs(*after);
1547
1548     // now we may have optional parameters
1549     while(*s) {
1550         key = s;
1551         while(*s && !isspace(*s)) s++;
1552         while(*s && isspace(*s)) *s++ = '\0';
1553         if(!*key) break;
1554
1555         if(!strcasecmp(key, "at")) {
1556             char *value = s;
1557             while(*s && !isspace(*s)) s++;
1558             while(*s && isspace(*s)) *s++ = '\0';
1559
1560             if (!health_parse_duration(value, before)) {
1561                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1562                       line, path, file, value, key);
1563             }
1564         }
1565         else if(!strcasecmp(key, HEALTH_EVERY_KEY)) {
1566             char *value = s;
1567             while(*s && !isspace(*s)) s++;
1568             while(*s && isspace(*s)) *s++ = '\0';
1569
1570             if (!health_parse_duration(value, every)) {
1571                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1572                       line, path, file, value, key);
1573             }
1574         }
1575         else if(!strcasecmp(key, "absolute") || !strcasecmp(key, "abs") || !strcasecmp(key, "absolute_sum")) {
1576             *options |= RRDR_OPTION_ABSOLUTE;
1577         }
1578         else if(!strcasecmp(key, "min2max")) {
1579             *options |= RRDR_OPTION_MIN2MAX;
1580         }
1581         else if(!strcasecmp(key, "null2zero")) {
1582             *options |= RRDR_OPTION_NULL2ZERO;
1583         }
1584         else if(!strcasecmp(key, "percentage")) {
1585             *options |= RRDR_OPTION_PERCENTAGE;
1586         }
1587         else if(!strcasecmp(key, "unaligned")) {
1588             *options |= RRDR_OPTION_NOT_ALIGNED;
1589         }
1590         else if(!strcasecmp(key, "of")) {
1591             if(*s && strcasecmp(s, "all"))
1592                *dimensions = strdupz(s);
1593             break;
1594         }
1595         else {
1596             error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
1597                   line, path, file, key);
1598         }
1599     }
1600
1601     return 1;
1602 }
1603
// Replace every TAB character of the NUL-terminated string 's' with a single
// space, in place. Returns 's' so the call can be nested in an expression.
static inline char *tabs2spaces(char *s) {
    char *tab;

    // jump from one TAB to the next instead of inspecting every character
    for(tab = strchr(s, '\t'); tab ; tab = strchr(tab + 1, '\t'))
        *tab = ' ';

    return s;
}
1613
// Build the "<line>@<path>/<filename>" string that identifies where an alarm
// or template was defined, and return a copy of it made with strdupz().
static inline char *health_source_file(size_t line, const char *path, const char *filename) {
    char source[FILENAME_MAX + 1];

    snprintfz(source, FILENAME_MAX, "%zu@%s/%s", line, path, filename);

    char *copy = strdupz(source);
    return copy;
}
1619
// Overwrite every single quote (') and double quote (") found in the
// NUL-terminated string 's' with a space, in place.
static inline void strip_quotes(char *s) {
    char *quote;

    for(quote = strpbrk(s, "'\""); quote ; quote = strpbrk(quote + 1, "'\""))
        *quote = ' ';
}
1626
1627 int health_readfile(const char *path, const char *filename) {
1628     debug(D_HEALTH, "Health configuration reading file '%s/%s'", path, filename);
1629
1630     static uint32_t hash_alarm = 0, hash_template = 0, hash_on = 0, hash_calc = 0, hash_green = 0, hash_red = 0, hash_warn = 0, hash_crit = 0, hash_exec = 0, hash_every = 0, hash_lookup = 0, hash_units = 0, hash_info = 0, hash_recipient = 0, hash_delay = 0;
1631     char buffer[HEALTH_CONF_MAX_LINE + 1];
1632
1633     if(unlikely(!hash_alarm)) {
1634         hash_alarm = simple_uhash(HEALTH_ALARM_KEY);
1635         hash_template = simple_uhash(HEALTH_TEMPLATE_KEY);
1636         hash_on = simple_uhash(HEALTH_ON_KEY);
1637         hash_calc = simple_uhash(HEALTH_CALC_KEY);
1638         hash_lookup = simple_uhash(HEALTH_LOOKUP_KEY);
1639         hash_green = simple_uhash(HEALTH_GREEN_KEY);
1640         hash_red = simple_uhash(HEALTH_RED_KEY);
1641         hash_warn = simple_uhash(HEALTH_WARN_KEY);
1642         hash_crit = simple_uhash(HEALTH_CRIT_KEY);
1643         hash_exec = simple_uhash(HEALTH_EXEC_KEY);
1644         hash_every = simple_uhash(HEALTH_EVERY_KEY);
1645         hash_units = simple_hash(HEALTH_UNITS_KEY);
1646         hash_info = simple_hash(HEALTH_INFO_KEY);
1647         hash_recipient = simple_hash(HEALTH_RECIPIENT_KEY);
1648         hash_delay = simple_uhash(HEALTH_DELAY_KEY);
1649     }
1650
1651     snprintfz(buffer, HEALTH_CONF_MAX_LINE, "%s/%s", path, filename);
1652     FILE *fp = fopen(buffer, "r");
1653     if(!fp) {
1654         error("Health configuration cannot read file '%s'.", buffer);
1655         return 0;
1656     }
1657
1658     RRDCALC *rc = NULL;
1659     RRDCALCTEMPLATE *rt = NULL;
1660
1661     size_t line = 0, append = 0;
1662     char *s;
1663     while((s = fgets(&buffer[append], (int)(HEALTH_CONF_MAX_LINE - append), fp)) || append) {
1664         int stop_appending = !s;
1665         line++;
1666         // info("Line %zu of file '%s/%s': '%s'", line, path, filename, s);
1667         s = trim(buffer);
1668         if(!s) continue;
1669         // info("Trimmed line %zu of file '%s/%s': '%s'", line, path, filename, s);
1670
1671         append = strlen(s);
1672         if(!stop_appending && s[append - 1] == '\\') {
1673             s[append - 1] = ' ';
1674             append = &s[append] - buffer;
1675             if(append < HEALTH_CONF_MAX_LINE)
1676                 continue;
1677             else {
1678                 error("Health configuration has too long muli-line at line %zu of file '%s/%s'.", line, path, filename);
1679             }
1680         }
1681         append = 0;
1682
1683         char *key = s;
1684         while(*s && *s != ':') s++;
1685         if(!*s) {
1686             error("Health configuration has invalid line %zu of file '%s/%s'. It does not contain a ':'. Ignoring it.", line, path, filename);
1687             continue;
1688         }
1689         *s = '\0';
1690         s++;
1691
1692         char *value = s;
1693         key = trim(key);
1694         value = trim(value);
1695
1696         if(!key) {
1697             error("Health configuration has invalid line %zu of file '%s/%s'. Keyword is empty. Ignoring it.", line, path, filename);
1698             continue;
1699         }
1700
1701         if(!value) {
1702             error("Health configuration has invalid line %zu of file '%s/%s'. value is empty. Ignoring it.", line, path, filename);
1703             continue;
1704         }
1705
1706         // info("Health file '%s/%s', key '%s', value '%s'", path, filename, key, value);
1707         uint32_t hash = simple_uhash(key);
1708
1709         if(hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) {
1710             if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
1711                 rrdcalc_free(&localhost, rc);
1712
1713             if(rt) {
1714                 if (!rrdcalctemplate_add_template_from_config(&localhost, rt))
1715                     rrdcalctemplate_free(&localhost, rt);
1716                 rt = NULL;
1717             }
1718
1719             rc = callocz(1, sizeof(RRDCALC));
1720             rc->next_event_id = 1;
1721             rc->name = tabs2spaces(strdupz(value));
1722             rc->hash = simple_hash(rc->name);
1723             rc->source = health_source_file(line, path, filename);
1724             rc->green = NAN;
1725             rc->red = NAN;
1726             rc->value = NAN;
1727             rc->old_value = NAN;
1728             rc->delay_multiplier = 1.0;
1729
1730             if(rrdvar_fix_name(rc->name))
1731                 error("Health configuration renamed alarm '%s' to '%s'", value, rc->name);
1732         }
1733         else if(hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY)) {
1734             if(rc) {
1735                 if(!rrdcalc_add_alarm_from_config(&localhost, rc))
1736                     rrdcalc_free(&localhost, rc);
1737                 rc = NULL;
1738             }
1739
1740             if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
1741                 rrdcalctemplate_free(&localhost, rt);
1742
1743             rt = callocz(1, sizeof(RRDCALCTEMPLATE));
1744             rt->name = tabs2spaces(strdupz(value));
1745             rt->hash_name = simple_hash(rt->name);
1746             rt->source = health_source_file(line, path, filename);
1747             rt->green = NAN;
1748             rt->red = NAN;
1749             rt->delay_multiplier = 1.0;
1750
1751             if(rrdvar_fix_name(rt->name))
1752                 error("Health configuration renamed template '%s' to '%s'", value, rt->name);
1753         }
1754         else if(rc) {
1755             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
1756                 if(rc->chart) {
1757                     if(strcmp(rc->chart, value))
1758                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1759                              line, path, filename, rc->name, key, rc->chart, value, value);
1760
1761                     freez(rc->chart);
1762                 }
1763                 rc->chart = tabs2spaces(strdupz(value));
1764                 rc->hash_chart = simple_hash(rc->chart);
1765             }
1766             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
1767                 health_parse_db_lookup(line, path, filename, value, &rc->group, &rc->after, &rc->before,
1768                                        &rc->update_every,
1769                                        &rc->options, &rc->dimensions);
1770             }
1771             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
1772                 if(!health_parse_duration(value, &rc->update_every))
1773                     info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.",
1774                          line, path, filename, rc->name, key, value);
1775             }
1776             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
1777                 char *e;
1778                 rc->green = strtold(value, &e);
1779                 if(e && *e) {
1780                     info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1781                          line, path, filename, rc->name, key, e);
1782                 }
1783             }
1784             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
1785                 char *e;
1786                 rc->red = strtold(value, &e);
1787                 if(e && *e) {
1788                     info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1789                          line, path, filename, rc->name, key, e);
1790                 }
1791             }
1792             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
1793                 const char *failed_at = NULL;
1794                 int error = 0;
1795                 rc->calculation = expression_parse(value, &failed_at, &error);
1796                 if(!rc->calculation) {
1797                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1798                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1799                 }
1800             }
1801             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
1802                 const char *failed_at = NULL;
1803                 int error = 0;
1804                 rc->warning = expression_parse(value, &failed_at, &error);
1805                 if(!rc->warning) {
1806                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1807                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1808                 }
1809             }
1810             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
1811                 const char *failed_at = NULL;
1812                 int error = 0;
1813                 rc->critical = expression_parse(value, &failed_at, &error);
1814                 if(!rc->critical) {
1815                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1816                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1817                 }
1818             }
1819             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
1820                 if(rc->exec) {
1821                     if(strcmp(rc->exec, value))
1822                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1823                              line, path, filename, rc->name, key, rc->exec, value, value);
1824
1825                     freez(rc->exec);
1826                 }
1827                 rc->exec = tabs2spaces(strdupz(value));
1828             }
1829             else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
1830                 if(rc->recipient) {
1831                     if(strcmp(rc->recipient, value))
1832                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1833                              line, path, filename, rc->name, key, rc->recipient, value, value);
1834
1835                     freez(rc->recipient);
1836                 }
1837                 rc->recipient = tabs2spaces(strdupz(value));
1838             }
1839             else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
1840                 if(rc->units) {
1841                     if(strcmp(rc->units, value))
1842                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1843                              line, path, filename, rc->name, key, rc->units, value, value);
1844
1845                     freez(rc->units);
1846                 }
1847                 rc->units = tabs2spaces(strdupz(value));
1848                 strip_quotes(rc->units);
1849             }
1850             else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
1851                 if(rc->info) {
1852                     if(strcmp(rc->info, value))
1853                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1854                              line, path, filename, rc->name, key, rc->info, value, value);
1855
1856                     freez(rc->info);
1857                 }
1858                 rc->info = tabs2spaces(strdupz(value));
1859                 strip_quotes(rc->info);
1860             }
1861             else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
1862                 health_parse_delay(line, path, filename, value, &rc->delay_up_duration, &rc->delay_down_duration, &rc->delay_max_duration, &rc->delay_multiplier);
1863             }
1864             else {
1865                 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has unknown key '%s'.",
1866                      line, path, filename, rc->name, key);
1867             }
1868         }
1869         else if(rt) {
1870             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
1871                 if(rt->context) {
1872                     if(strcmp(rt->context, value))
1873                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1874                              line, path, filename, rt->name, key, rt->context, value, value);
1875
1876                     freez(rt->context);
1877                 }
1878                 rt->context = tabs2spaces(strdupz(value));
1879                 rt->hash_context = simple_hash(rt->context);
1880             }
1881             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
1882                 health_parse_db_lookup(line, path, filename, value, &rt->group, &rt->after, &rt->before,
1883                                        &rt->update_every,
1884                                        &rt->options, &rt->dimensions);
1885             }
1886             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
1887                 if(!health_parse_duration(value, &rt->update_every))
1888                     info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' cannot parse duration: '%s'.",
1889                          line, path, filename, rt->name, key, value);
1890             }
1891             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
1892                 char *e;
1893                 rt->green = strtold(value, &e);
1894                 if(e && *e) {
1895                     info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
1896                          line, path, filename, rt->name, key, e);
1897                 }
1898             }
1899             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
1900                 char *e;
1901                 rt->red = strtold(value, &e);
1902                 if(e && *e) {
1903                     info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
1904                          line, path, filename, rt->name, key, e);
1905                 }
1906             }
1907             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
1908                 const char *failed_at = NULL;
1909                 int error = 0;
1910                 rt->calculation = expression_parse(value, &failed_at, &error);
1911                 if(!rt->calculation) {
1912                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1913                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1914                 }
1915             }
1916             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
1917                 const char *failed_at = NULL;
1918                 int error = 0;
1919                 rt->warning = expression_parse(value, &failed_at, &error);
1920                 if(!rt->warning) {
1921                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1922                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1923                 }
1924             }
1925             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
1926                 const char *failed_at = NULL;
1927                 int error = 0;
1928                 rt->critical = expression_parse(value, &failed_at, &error);
1929                 if(!rt->critical) {
1930                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1931                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1932                 }
1933             }
1934             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
1935                 if(rt->exec) {
1936                     if(strcmp(rt->exec, value))
1937                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1938                              line, path, filename, rt->name, key, rt->exec, value, value);
1939
1940                     freez(rt->exec);
1941                 }
1942                 rt->exec = tabs2spaces(strdupz(value));
1943             }
1944             else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
1945                 if(rt->recipient) {
1946                     if(strcmp(rt->recipient, value))
1947                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1948                              line, path, filename, rt->name, key, rt->recipient, value, value);
1949
1950                     freez(rt->recipient);
1951                 }
1952                 rt->recipient = tabs2spaces(strdupz(value));
1953             }
1954             else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
1955                 if(rt->units) {
1956                     if(strcmp(rt->units, value))
1957                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1958                              line, path, filename, rt->name, key, rt->units, value, value);
1959
1960                     freez(rt->units);
1961                 }
1962                 rt->units = tabs2spaces(strdupz(value));
1963                 strip_quotes(rt->units);
1964             }
1965             else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
1966                 if(rt->info) {
1967                     if(strcmp(rt->info, value))
1968                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1969                              line, path, filename, rt->name, key, rt->info, value, value);
1970
1971                     freez(rt->info);
1972                 }
1973                 rt->info = tabs2spaces(strdupz(value));
1974                 strip_quotes(rt->info);
1975             }
1976             else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
1977                 health_parse_delay(line, path, filename, value, &rt->delay_up_duration, &rt->delay_down_duration, &rt->delay_max_duration, &rt->delay_multiplier);
1978             }
1979             else {
1980                 error("Health configuration at line %zu of file '%s/%s' for template '%s' has unknown key '%s'.",
1981                       line, path, filename, rt->name, key);
1982             }
1983         }
1984         else {
1985             error("Health configuration at line %zu of file '%s/%s' has unknown key '%s'. Expected either '" HEALTH_ALARM_KEY "' or '" HEALTH_TEMPLATE_KEY "'.",
1986                   line, path, filename, key);
1987         }
1988     }
1989
1990     if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
1991         rrdcalc_free(&localhost, rc);
1992
1993     if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
1994         rrdcalctemplate_free(&localhost, rt);
1995
1996     fclose(fp);
1997     return 1;
1998 }
1999
2000 void health_readdir(const char *path) {
2001     size_t pathlen = strlen(path);
2002
2003     debug(D_HEALTH, "Health configuration reading directory '%s'", path);
2004
2005     DIR *dir = opendir(path);
2006     if (!dir) {
2007         error("Health configuration cannot open directory '%s'.", path);
2008         return;
2009     }
2010
2011     struct dirent *de = NULL;
2012     while ((de = readdir(dir))) {
2013         size_t len = strlen(de->d_name);
2014
2015         if(de->d_type == DT_DIR
2016            && (
2017                    (de->d_name[0] == '.' && de->d_name[1] == '\0')
2018                    || (de->d_name[0] == '.' && de->d_name[1] == '.' && de->d_name[2] == '\0')
2019            )) {
2020             debug(D_HEALTH, "Ignoring directory '%s'", de->d_name);
2021             continue;
2022         }
2023
2024         else if(de->d_type == DT_DIR) {
2025             char *s = mallocz(pathlen + strlen(de->d_name) + 2);
2026             strcpy(s, path);
2027             strcat(s, "/");
2028             strcat(s, de->d_name);
2029             health_readdir(s);
2030             freez(s);
2031             continue;
2032         }
2033
2034         else if((de->d_type == DT_LNK || de->d_type == DT_REG || de->d_type == DT_UNKNOWN) &&
2035                 len > 5 && !strcmp(&de->d_name[len - 5], ".conf")) {
2036             health_readfile(path, de->d_name);
2037         }
2038
2039         else debug(D_HEALTH, "Ignoring file '%s'", de->d_name);
2040     }
2041
2042     closedir(dir);
2043 }
2044
2045 static inline char *health_config_dir(void) {
2046     char buffer[FILENAME_MAX + 1];
2047     snprintfz(buffer, FILENAME_MAX, "%s/health.d", config_get("global", "config directory", CONFIG_DIR));
2048     return config_get("health", "health configuration directory", buffer);
2049 }
2050
2051 void health_init(void) {
2052     debug(D_HEALTH, "Health configuration initializing");
2053
2054     if(!(health_enabled = config_get_boolean("health", "enabled", 1))) {
2055         debug(D_HEALTH, "Health is disabled.");
2056         return;
2057     }
2058
2059     char *pathname = config_get("health", "health db directory", VARLIB_DIR "/health");
2060     if(mkdir(pathname, 0770) == -1 && errno != EEXIST)
2061         fatal("Cannot create directory '%s'.", pathname);
2062
2063     char filename[FILENAME_MAX + 1];
2064     snprintfz(filename, FILENAME_MAX, "%s/health-log.db", pathname);
2065     health.log_filename = config_get("health", "health db file", filename);
2066
2067     health_alarm_log_load(&localhost);
2068     health_alarm_log_open();
2069
2070     char *path = health_config_dir();
2071
2072     {
2073         char buffer[FILENAME_MAX + 1];
2074         snprintfz(buffer, FILENAME_MAX, "%s/alarm-notify.sh", config_get("global", "plugins directory", PLUGINS_DIR));
2075         health.health_default_exec = config_get("health", "script to execute on alarm", buffer);
2076     }
2077
2078     long n = config_get_number("health", "in memory max health log entries", (long)localhost.health_log.max);
2079     if(n < 10) {
2080         error("Health configuration has invalid max log entries %ld. Using default %u", n, localhost.health_log.max);
2081         config_set_number("health", "in memory max health log entries", (long)localhost.health_log.max);
2082     }
2083     else localhost.health_log.max = (unsigned int)n;
2084
2085     rrdhost_rwlock(&localhost);
2086     health_readdir(path);
2087     rrdhost_unlock(&localhost);
2088 }
2089
2090 // ----------------------------------------------------------------------------
2091 // JSON generation
2092
2093 static inline void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) {
2094     if(value && *value)
2095         buffer_sprintf(wb, "%s\"%s\":\"%s\"%s", prefix, label, value, suffix);
2096     else
2097         buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix);
2098 }
2099
2100 static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) {
2101     buffer_sprintf(wb, "\n\t{\n"
2102                            "\t\t\"hostname\": \"%s\",\n"
2103                            "\t\t\"unique_id\": %u,\n"
2104                            "\t\t\"alarm_id\": %u,\n"
2105                            "\t\t\"alarm_event_id\": %u,\n"
2106                            "\t\t\"name\": \"%s\",\n"
2107                            "\t\t\"chart\": \"%s\",\n"
2108                            "\t\t\"family\": \"%s\",\n"
2109                            "\t\t\"processed\": %s,\n"
2110                            "\t\t\"updated\": %s,\n"
2111                            "\t\t\"exec_run\": %lu,\n"
2112                            "\t\t\"exec_failed\": %s,\n"
2113                            "\t\t\"exec\": \"%s\",\n"
2114                            "\t\t\"recipient\": \"%s\",\n"
2115                            "\t\t\"exec_code\": %d,\n"
2116                            "\t\t\"source\": \"%s\",\n"
2117                            "\t\t\"units\": \"%s\",\n"
2118                            "\t\t\"info\": \"%s\",\n"
2119                            "\t\t\"when\": %lu,\n"
2120                            "\t\t\"duration\": %lu,\n"
2121                            "\t\t\"non_clear_duration\": %lu,\n"
2122                            "\t\t\"status\": \"%s\",\n"
2123                            "\t\t\"old_status\": \"%s\",\n"
2124                            "\t\t\"delay\": %d,\n"
2125                            "\t\t\"delay_up_to_timestamp\": %lu,\n"
2126                            "\t\t\"updated_by_id\": %u,\n"
2127                            "\t\t\"updates_id\": %u,\n",
2128                    host->hostname,
2129                    ae->unique_id,
2130                    ae->alarm_id,
2131                    ae->alarm_event_id,
2132                    ae->name,
2133                    ae->chart,
2134                    ae->family,
2135                    (ae->flags & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false",
2136                    (ae->flags & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false",
2137                    (unsigned long)ae->exec_run_timestamp,
2138                    (ae->flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false",
2139                    ae->exec?ae->exec:health.health_default_exec,
2140                    ae->recipient?ae->recipient:health.health_default_recipient,
2141                    ae->exec_code,
2142                    ae->source,
2143                    ae->units?ae->units:"",
2144                    ae->info?ae->info:"",
2145                    (unsigned long)ae->when,
2146                    (unsigned long)ae->duration,
2147                    (unsigned long)ae->non_clear_duration,
2148                    rrdcalc_status2string(ae->new_status),
2149                    rrdcalc_status2string(ae->old_status),
2150                    ae->delay,
2151                    (unsigned long)ae->delay_up_to_timestamp,
2152                    ae->updated_by_id,
2153                    ae->updates_id
2154     );
2155
2156     buffer_strcat(wb, "\t\t\"value\":");
2157     buffer_rrd_value(wb, ae->new_value);
2158     buffer_strcat(wb, ",\n");
2159
2160     buffer_strcat(wb, "\t\t\"old_value\":");
2161     buffer_rrd_value(wb, ae->old_value);
2162     buffer_strcat(wb, "\n");
2163
2164     buffer_strcat(wb, "\t}");
2165 }
2166
2167 void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after) {
2168     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
2169
2170     buffer_strcat(wb, "[");
2171
2172     unsigned int max = host->health_log.max;
2173     unsigned int count = 0;
2174     ALARM_ENTRY *ae;
2175     for(ae = host->health_log.alarms; ae && count < max ; count++, ae = ae->next) {
2176         if(ae->unique_id > after) {
2177             if(likely(count)) buffer_strcat(wb, ",");
2178             health_alarm_entry2json_nolock(wb, ae, host);
2179         }
2180     }
2181
2182     buffer_strcat(wb, "\n]\n");
2183
2184     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2185 }
2186
2187 static inline void health_rrdcalc2json_nolock(BUFFER *wb, RRDCALC *rc) {
2188     buffer_sprintf(wb,
2189            "\t\t\"%s.%s\": {\n"
2190                    "\t\t\t\"id\": %lu,\n"
2191                    "\t\t\t\"name\": \"%s\",\n"
2192                    "\t\t\t\"chart\": \"%s\",\n"
2193                    "\t\t\t\"family\": \"%s\",\n"
2194                    "\t\t\t\"active\": %s,\n"
2195                    "\t\t\t\"exec\": \"%s\",\n"
2196                    "\t\t\t\"recipient\": \"%s\",\n"
2197                    "\t\t\t\"source\": \"%s\",\n"
2198                    "\t\t\t\"units\": \"%s\",\n"
2199                    "\t\t\t\"info\": \"%s\",\n"
2200                                    "\t\t\t\"status\": \"%s\",\n"
2201                    "\t\t\t\"last_status_change\": %lu,\n"
2202                    "\t\t\t\"last_updated\": %lu,\n"
2203                    "\t\t\t\"next_update\": %lu,\n"
2204                    "\t\t\t\"update_every\": %d,\n"
2205                    "\t\t\t\"delay_up_duration\": %d,\n"
2206                    "\t\t\t\"delay_down_duration\": %d,\n"
2207                    "\t\t\t\"delay_max_duration\": %d,\n"
2208                    "\t\t\t\"delay_multiplier\": %f,\n"
2209                    "\t\t\t\"delay\": %d,\n"
2210                    "\t\t\t\"delay_up_to_timestamp\": %lu,\n"
2211             , rc->chart, rc->name
2212             , (unsigned long)rc->id
2213             , rc->name
2214             , rc->chart
2215             , (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:""
2216             , (rc->rrdset)?"true":"false"
2217             , rc->exec?rc->exec:health.health_default_exec
2218             , rc->recipient?rc->recipient:health.health_default_recipient
2219             , rc->source
2220             , rc->units?rc->units:""
2221             , rc->info?rc->info:""
2222             , rrdcalc_status2string(rc->status)
2223             , (unsigned long)rc->last_status_change
2224             , (unsigned long)rc->last_updated
2225             , (unsigned long)rc->next_update
2226             , rc->update_every
2227             , rc->delay_up_duration
2228             , rc->delay_down_duration
2229             , rc->delay_max_duration
2230             , rc->delay_multiplier
2231             , rc->delay_last
2232             , (unsigned long)rc->delay_up_to_timestamp
2233     );
2234
2235     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
2236         if(rc->dimensions && *rc->dimensions)
2237             health_string2json(wb, "\t\t\t", "lookup_dimensions", rc->dimensions, ",\n");
2238
2239         buffer_sprintf(wb,
2240                        "\t\t\t\"db_after\": %lu,\n"
2241                        "\t\t\t\"db_before\": %lu,\n"
2242                        "\t\t\t\"lookup_method\": \"%s\",\n"
2243                        "\t\t\t\"lookup_after\": %d,\n"
2244                        "\t\t\t\"lookup_before\": %d,\n"
2245                        "\t\t\t\"lookup_options\": \"",
2246                        (unsigned long) rc->db_after,
2247                        (unsigned long) rc->db_before,
2248                        group_method2string(rc->group),
2249                        rc->after,
2250                        rc->before
2251         );
2252         buffer_data_options2string(wb, rc->options);
2253         buffer_strcat(wb, "\",\n");
2254     }
2255
2256     if(rc->calculation) {
2257         health_string2json(wb, "\t\t\t", "calc", rc->calculation->source, ",\n");
2258         health_string2json(wb, "\t\t\t", "calc_parsed", rc->calculation->parsed_as, ",\n");
2259     }
2260
2261     if(rc->warning) {
2262         health_string2json(wb, "\t\t\t", "warn", rc->warning->source, ",\n");
2263         health_string2json(wb, "\t\t\t", "warn_parsed", rc->warning->parsed_as, ",\n");
2264     }
2265
2266     if(rc->critical) {
2267         health_string2json(wb, "\t\t\t", "crit", rc->critical->source, ",\n");
2268         health_string2json(wb, "\t\t\t", "crit_parsed", rc->critical->parsed_as, ",\n");
2269     }
2270
2271     buffer_strcat(wb, "\t\t\t\"green\":");
2272     buffer_rrd_value(wb, rc->green);
2273     buffer_strcat(wb, ",\n");
2274
2275     buffer_strcat(wb, "\t\t\t\"red\":");
2276     buffer_rrd_value(wb, rc->red);
2277     buffer_strcat(wb, ",\n");
2278
2279     buffer_strcat(wb, "\t\t\t\"value\":");
2280     buffer_rrd_value(wb, rc->value);
2281     buffer_strcat(wb, "\n");
2282
2283     buffer_strcat(wb, "\t\t}");
2284 }
2285
2286 //void health_rrdcalctemplate2json_nolock(BUFFER *wb, RRDCALCTEMPLATE *rt) {
2287 //
2288 //}
2289
2290 void health_alarms2json(RRDHOST *host, BUFFER *wb, int all) {
2291     int i;
2292
2293     rrdhost_rdlock(&localhost);
2294     buffer_sprintf(wb, "{\n\t\"hostname\": \"%s\","
2295                         "\n\t\"latest_alarm_log_unique_id\": %u,"
2296                         "\n\t\"status\": %s,"
2297                         "\n\t\"now\": %lu,"
2298                         "\n\t\"alarms\": {\n",
2299                         host->hostname,
2300                         (host->health_log.next_log_id > 0)?(host->health_log.next_log_id - 1):0,
2301                         health_enabled?"true":"false",
2302                         (unsigned long)time(NULL));
2303
2304     RRDCALC *rc;
2305     for(i = 0, rc = host->alarms; rc ; rc = rc->next) {
2306         if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
2307             continue;
2308
2309         if(likely(!all && !(rc->status == RRDCALC_STATUS_WARNING || rc->status == RRDCALC_STATUS_CRITICAL)))
2310             continue;
2311
2312         if(likely(i)) buffer_strcat(wb, ",\n");
2313         health_rrdcalc2json_nolock(wb, rc);
2314         i++;
2315     }
2316
2317 //    buffer_strcat(wb, "\n\t},\n\t\"templates\": {");
2318 //    RRDCALCTEMPLATE *rt;
2319 //    for(rt = host->templates; rt ; rt = rt->next)
2320 //        health_rrdcalctemplate2json_nolock(wb, rt);
2321
2322     buffer_strcat(wb, "\n\t}\n}\n");
2323     rrdhost_unlock(&localhost);
2324 }
2325
2326
2327 // ----------------------------------------------------------------------------
2328 // re-load health configuration
2329
2330 static inline void health_free_all_nolock(RRDHOST *host) {
2331     while(host->templates)
2332         rrdcalctemplate_free(host, host->templates);
2333
2334     while(host->alarms)
2335         rrdcalc_free(host, host->alarms);
2336 }
2337
2338 void health_reload(void) {
2339     if(!health_enabled) {
2340         error("Health reload is requested, but health is not enabled.");
2341         return;
2342     }
2343
2344     char *path = health_config_dir();
2345
2346     // free all running alarms
2347     rrdhost_rwlock(&localhost);
2348     health_free_all_nolock(&localhost);
2349     rrdhost_unlock(&localhost);
2350
2351     // invalidate all previous entries in the alarm log
2352     ALARM_ENTRY *t;
2353     for(t = localhost.health_log.alarms ; t ; t = t->next) {
2354         if(t->new_status != RRDCALC_STATUS_REMOVED)
2355             t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
2356     }
2357
2358     // reset all thresholds to all charts
2359     RRDSET *st;
2360     for(st = localhost.rrdset_root; st ; st = st->next) {
2361         st->green = NAN;
2362         st->red = NAN;
2363     }
2364
2365     // load the new alarms
2366     rrdhost_rwlock(&localhost);
2367     health_readdir(path);
2368     rrdhost_unlock(&localhost);
2369
2370     // link the loaded alarms to their charts
2371     for(st = localhost.rrdset_root; st ; st = st->next) {
2372         rrdhost_rwlock(&localhost);
2373
2374         rrdsetcalc_link_matching(st);
2375         rrdcalctemplate_link_matching(st);
2376
2377         rrdhost_unlock(&localhost);
2378     }
2379 }
2380
2381 // ----------------------------------------------------------------------------
2382 // health main thread and friends
2383
2384 static inline int rrdcalc_value2status(calculated_number n) {
2385     if(isnan(n)) return RRDCALC_STATUS_UNDEFINED;
2386     if(n) return RRDCALC_STATUS_RAISED;
2387     return RRDCALC_STATUS_CLEAR;
2388 }
2389
2390 static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
2391     ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
2392
2393     // find the previous notification for the same alarm
2394     ALARM_ENTRY *t;
2395     for(t = ae->next; t ;t = t->next) {
2396         if(t->alarm_id == ae->alarm_id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
2397             break;
2398     }
2399
2400     if(t && t->new_status == ae->new_status) {
2401         // don't send the same notification again
2402         info("Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
2403         goto done;
2404     }
2405
2406     if((ae->old_status == RRDCALC_STATUS_UNDEFINED && ae->new_status == RRDCALC_STATUS_UNINITIALIZED)
2407         || (ae->old_status == RRDCALC_STATUS_UNINITIALIZED && ae->new_status == RRDCALC_STATUS_CLEAR)) {
2408         info("Health not sending notification for first initialization of alarm '%s.%s' status %s", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
2409         goto done;
2410     }
2411
2412     char buffer[FILENAME_MAX + 1];
2413     pid_t command_pid;
2414
2415     const char *exec = ae->exec;
2416     if(!exec) exec = health.health_default_exec;
2417
2418     const char *recipient = ae->recipient;
2419     if(!recipient) recipient = health.health_default_recipient;
2420
2421     snprintfz(buffer, FILENAME_MAX, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s'",
2422               exec,
2423               recipient,
2424               host->hostname,
2425               ae->unique_id,
2426               ae->alarm_id,
2427               ae->alarm_event_id,
2428               (unsigned long)ae->when,
2429               ae->name,
2430               ae->chart?ae->chart:"NOCAHRT",
2431               ae->family?ae->family:"NOFAMILY",
2432               rrdcalc_status2string(ae->new_status),
2433               rrdcalc_status2string(ae->old_status),
2434               ae->new_value,
2435               ae->old_value,
2436               ae->source?ae->source:"UNKNOWN",
2437               (uint32_t)ae->duration,
2438               (uint32_t)ae->non_clear_duration,
2439               ae->units?ae->units:"",
2440               ae->info?ae->info:""
2441     );
2442
2443     ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
2444     ae->exec_run_timestamp = time(NULL);
2445
2446     debug(D_HEALTH, "executing command '%s'", buffer);
2447     FILE *fp = mypopen(buffer, &command_pid);
2448     if(!fp) {
2449         error("HEALTH: Cannot popen(\"%s\", \"r\").", buffer);
2450         goto done;
2451     }
2452     debug(D_HEALTH, "HEALTH reading from command");
2453     char *s = fgets(buffer, FILENAME_MAX, fp);
2454     (void)s;
2455     ae->exec_code = mypclose(fp, command_pid);
2456     debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
2457
2458     if(ae->exec_code != 0)
2459         ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
2460
2461 done:
2462     health_alarm_log_save(host, ae);
2463     return;
2464 }
2465
2466 static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
2467     info("Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
2468          ae->chart?ae->chart:"NOCHART", ae->name,
2469          ae->new_value,
2470          rrdcalc_status2string(ae->old_status),
2471          rrdcalc_status2string(ae->new_status)
2472     );
2473
2474     health_alarm_execute(host, ae);
2475 }
2476
2477 static inline void health_alarm_log_process(RRDHOST *host) {
2478     static uint32_t stop_at_id = 0;
2479     uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
2480     time_t now = time(NULL);
2481
2482     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
2483
2484     ALARM_ENTRY *ae;
2485     for(ae = host->health_log.alarms; ae && ae->unique_id >= stop_at_id ; ae = ae->next) {
2486         if(unlikely(
2487             !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
2488             !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
2489             )) {
2490
2491             if(unlikely(ae->unique_id < first_waiting))
2492                 first_waiting = ae->unique_id;
2493
2494             if(likely(now >= ae->delay_up_to_timestamp))
2495                 health_process_notifications(host, ae);
2496         }
2497     }
2498
2499     // remember this for the next iteration
2500     stop_at_id = first_waiting;
2501
2502     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2503
2504     if(host->health_log.count <= host->health_log.max)
2505         return;
2506
2507     // cleanup excess entries in the log
2508     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
2509
2510     ALARM_ENTRY *last = NULL;
2511     unsigned int count = host->health_log.max * 2 / 3;
2512     for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
2513
2514     if(ae && last && last->next == ae)
2515         last->next = NULL;
2516     else
2517         ae = NULL;
2518
2519     while(ae) {
2520         debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
2521
2522         ALARM_ENTRY *t = ae->next;
2523
2524         freez(ae->name);
2525         freez(ae->chart);
2526         freez(ae->family);
2527         freez(ae->exec);
2528         freez(ae->recipient);
2529         freez(ae->source);
2530         freez(ae->units);
2531         freez(ae->info);
2532         freez(ae);
2533
2534         ae = t;
2535         host->health_log.count--;
2536     }
2537
2538     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2539 }
2540
2541 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
2542     if (unlikely(!rc->rrdset)) {
2543         debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
2544         return 0;
2545     }
2546
2547     if (unlikely(!rc->rrdset->last_collected_time.tv_sec)) {
2548         debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not yet collected.", rc->chart?rc->chart:"NOCHART", rc->name);
2549         return 0;
2550     }
2551
2552     if (unlikely(!rc->update_every)) {
2553         debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
2554         return 0;
2555     }
2556
2557     if (unlikely(rc->next_update > now)) {
2558         if (unlikely(*next_run > rc->next_update))
2559             *next_run = rc->next_update;
2560
2561         debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
2562         return 0;
2563     }
2564
2565     // FIXME
2566     // we should check that the DB lookup is possible
2567     // i.e.
2568     // - the duration of the chart includes the required timeframe
2569     // we SHOULD NOT check the dimensions - there might be alarms that refer non-existing dimensions (e.g. cpu steal)
2570
2571     return 1;
2572 }
2573
2574 void *health_main(void *ptr) {
2575     (void)ptr;
2576
2577     info("HEALTH thread created with task id %d", gettid());
2578
2579     if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
2580         error("Cannot set pthread cancel type to DEFERRED.");
2581
2582     if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
2583         error("Cannot set pthread cancel state to ENABLE.");
2584
2585     int min_run_every = (int)config_get_number("health", "run at least every seconds", 10);
2586     if(min_run_every < 1) min_run_every = 1;
2587
2588     BUFFER *wb = buffer_create(100);
2589
2590     unsigned int loop = 0;
2591     while(health_enabled && !netdata_exit) {
2592         loop++;
2593         debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
2594
2595         int oldstate, runnable = 0;
2596         time_t now = time(NULL);
2597         time_t next_run = now + min_run_every;
2598         RRDCALC *rc;
2599
2600         if (unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
2601             error("Cannot set pthread cancel state to DISABLE.");
2602
2603         rrdhost_rdlock(&localhost);
2604
2605         // the first loop is to lookup values from the db
2606         for (rc = localhost.alarms; rc; rc = rc->next) {
2607             if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run)))
2608                 continue;
2609
2610             runnable++;
2611             rc->old_value = rc->value;
2612
2613             // 1. if there is database lookup, do it
2614             // 2. if there is calculation expression, run it
2615
2616             if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
2617                 time_t old_db_timestamp = rc->db_before;
2618                 int value_is_null = 0;
2619
2620                 int ret = rrd2value(rc->rrdset, wb, &rc->value,
2621                                     rc->dimensions, 1, rc->after, rc->before, rc->group,
2622                                     rc->options, &rc->db_after, &rc->db_before, &value_is_null);
2623
2624                 if (unlikely(ret != 200)) {
2625                     // database lookup failed
2626                     rc->value = NAN;
2627
2628                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
2629
2630                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) {
2631                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
2632                         error("Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
2633                     }
2634                 }
2635                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))
2636                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
2637
2638                 if (unlikely(old_db_timestamp == rc->db_before)) {
2639                     // database is stale
2640
2641                     debug(D_HEALTH, "Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
2642
2643                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
2644                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
2645                         error("Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
2646                     }
2647                 }
2648                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
2649                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
2650
2651                 if (unlikely(value_is_null)) {
2652                     // collected value is null
2653
2654                     rc->value = NAN;
2655
2656                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
2657                           rc->chart?rc->chart:"NOCHART", rc->name);
2658
2659                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) {
2660                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
2661                         error("Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
2662                               rc->chart?rc->chart:"NOCHART", rc->name);
2663                     }
2664                 }
2665                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))
2666                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
2667
2668                 debug(D_HEALTH, "Health alarm '%s.%s': database lookup gave value "
2669                         CALCULATED_NUMBER_FORMAT, rc->chart?rc->chart:"NOCHART", rc->name, rc->value);
2670             }
2671
2672             if(unlikely(rc->calculation)) {
2673                 if (unlikely(!expression_evaluate(rc->calculation))) {
2674                     // calculation failed
2675
2676                     rc->value = NAN;
2677
2678                     debug(D_HEALTH, "Health alarm '%s.%s': failed to evaluate calculation with error: %s",
2679                           rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->calculation->error_msg));
2680
2681                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) {
2682                         rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
2683                         error("Health alarm '%s.%s': failed to evaluate calculation with error: %s",
2684                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->calculation->error_msg));
2685                     }
2686                 }
2687                 else {
2688                     if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))
2689                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
2690
2691                     debug(D_HEALTH, "Health alarm '%s.%s': calculation expression gave value "
2692                             CALCULATED_NUMBER_FORMAT
2693                             ": %s (source: %s)",
2694                           rc->chart?rc->chart:"NOCHART", rc->name,
2695                           rc->calculation->result,
2696                           buffer_tostring(rc->calculation->error_msg),
2697                           rc->source
2698                     );
2699
2700                     rc->value = rc->calculation->result;
2701                 }
2702             }
2703         }
2704         rrdhost_unlock(&localhost);
2705
2706         if (unlikely(runnable && !netdata_exit)) {
2707             rrdhost_rdlock(&localhost);
2708
2709             for (rc = localhost.alarms; rc; rc = rc->next) {
2710                 if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run)))
2711                     continue;
2712
2713                 int warning_status  = RRDCALC_STATUS_UNDEFINED;
2714                 int critical_status = RRDCALC_STATUS_UNDEFINED;
2715
2716                 if(likely(rc->warning)) {
2717                     if(unlikely(!expression_evaluate(rc->warning))) {
2718                         // calculation failed
2719
2720                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression failed with error: %s",
2721                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
2722
2723                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) {
2724                             rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
2725                             error("Health alarm '%s.%s': warning expression failed with error: %s",
2726                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
2727                         }
2728                     }
2729                     else {
2730                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))
2731                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
2732
2733                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression gave value "
2734                                 CALCULATED_NUMBER_FORMAT
2735                                 ": %s (source: %s)",
2736                               rc->chart?rc->chart:"NOCHART", rc->name,
2737                               rc->warning->result,
2738                               buffer_tostring(rc->warning->error_msg),
2739                               rc->source
2740                         );
2741
2742                         warning_status = rrdcalc_value2status(rc->warning->result);
2743                     }
2744                 }
2745
2746                 if(likely(rc->critical)) {
2747                     if(unlikely(!expression_evaluate(rc->critical))) {
2748                         // calculation failed
2749
2750                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression failed with error: %s",
2751                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
2752
2753                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) {
2754                             rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
2755                             error("Health alarm '%s.%s': critical expression failed with error: %s",
2756                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
2757                         }
2758                     }
2759                     else {
2760                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))
2761                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
2762
2763                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression gave value "
2764                                 CALCULATED_NUMBER_FORMAT
2765                                 ": %s (source: %s)",
2766                               rc->chart?rc->chart:"NOCHART", rc->name,
2767                               rc->critical->result,
2768                               buffer_tostring(rc->critical->error_msg),
2769                               rc->source
2770                         );
2771
2772                         critical_status = rrdcalc_value2status(rc->critical->result);
2773                     }
2774                 }
2775
2776                 int status = RRDCALC_STATUS_UNDEFINED;
2777
2778                 switch(warning_status) {
2779                     case RRDCALC_STATUS_CLEAR:
2780                         status = RRDCALC_STATUS_CLEAR;
2781                         break;
2782
2783                     case RRDCALC_STATUS_RAISED:
2784                         status = RRDCALC_STATUS_WARNING;
2785                         break;
2786
2787                     default:
2788                         break;
2789                 }
2790
2791                 switch(critical_status) {
2792                     case RRDCALC_STATUS_CLEAR:
2793                         if(status == RRDCALC_STATUS_UNDEFINED)
2794                             status = RRDCALC_STATUS_CLEAR;
2795                         break;
2796
2797                     case RRDCALC_STATUS_RAISED:
2798                         status = RRDCALC_STATUS_CRITICAL;
2799                         break;
2800
2801                     default:
2802                         break;
2803                 }
2804
2805                 if(status != rc->status) {
2806                     int delay = 0;
2807
2808                     if(now > rc->delay_up_to_timestamp) {
2809                         rc->delay_up_current = rc->delay_up_duration;
2810                         rc->delay_down_current = rc->delay_down_duration;
2811                         rc->delay_last = 0;
2812                         rc->delay_up_to_timestamp = 0;
2813                     }
2814                     else {
2815                         rc->delay_up_current = (int)(rc->delay_up_current * rc->delay_multiplier);
2816                         if(rc->delay_up_current > rc->delay_max_duration) rc->delay_up_current = rc->delay_max_duration;
2817
2818                         rc->delay_down_current = (int)(rc->delay_down_current * rc->delay_multiplier);
2819                         if(rc->delay_down_current > rc->delay_max_duration) rc->delay_down_current = rc->delay_max_duration;
2820                     }
2821
2822                     if(status > rc->status)
2823                         delay = rc->delay_up_current;
2824                     else
2825                         delay = rc->delay_down_current;
2826
2827                     // COMMENTED: because we do need to send raising alarms
2828                     // if(now + delay < rc->delay_up_to_timestamp)
2829                     //    delay = (int)(rc->delay_up_to_timestamp - now);
2830
2831                     rc->delay_last = delay;
2832                     rc->delay_up_to_timestamp = now + delay;
2833                     health_alarm_log(&localhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, rc->delay_last);
2834                     rc->last_status_change = now;
2835                     rc->status = status;
2836                 }
2837
2838                 rc->last_updated = now;
2839                 rc->next_update = now + rc->update_every;
2840
2841                 if (next_run > rc->next_update)
2842                     next_run = rc->next_update;
2843             }
2844
2845             rrdhost_unlock(&localhost);
2846         }
2847
2848         if (unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
2849             error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
2850
2851         if(unlikely(netdata_exit))
2852             break;
2853
2854         // execute notifications
2855         // and cleanup
2856         health_alarm_log_process(&localhost);
2857
2858         if(unlikely(netdata_exit))
2859             break;
2860         
2861         now = time(NULL);
2862         if(now < next_run) {
2863             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs",
2864                   loop, (int) (next_run - now));
2865             sleep_usec(1000000 * (unsigned long long) (next_run - now));
2866         }
2867         else {
2868             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
2869         }
2870     }
2871
2872     buffer_free(wb);
2873
2874     info("HEALTH thread exiting");
2875     pthread_exit(NULL);
2876     return NULL;
2877 }