]> arthur.barton.de Git - netdata.git/blob - src/health.c
simplified renaming of variables, due to dimension renames
[netdata.git] / src / health.c
1 #include "common.h"
2
3 #define RRDVAR_MAX_LENGTH 1024
4
5 struct health_options {
6     const char *health_default_exec;
7     const char *health_default_recipient;
8     const char *log_filename;
9     size_t log_entries_written;
10     FILE *log_fp;
11 };
12
13 static struct health_options health = {
14     .health_default_exec = PLUGINS_DIR "/alarm-notify.sh",
15     .health_default_recipient = "root",
16     .log_filename = VARLIB_DIR "/health/alarm_log.db",
17     .log_entries_written = 0,
18     .log_fp = NULL
19 };
20
21 int health_enabled = 1;
22
23 // ----------------------------------------------------------------------------
24 // health alarm log load/save
25 // no need for locking - only one thread is reading / writing the alarms log
26
27 static inline int health_alarm_log_open(void) {
28     if(health.log_fp)
29         fclose(health.log_fp);
30
31     health.log_fp = fopen(health.log_filename, "a");
32
33     if(health.log_fp) {
34         if (setvbuf(health.log_fp, NULL, _IOLBF, 0) != 0)
35             error("Health: cannot set line buffering on health log file.");
36         return 0;
37     }
38
39     error("Health: cannot open health log file '%s'. Health data will be lost in case of netdata or server crash.", health.log_filename);
40     return -1;
41 }
42
43 static inline void health_alarm_log_close(void) {
44     if(health.log_fp) {
45         fclose(health.log_fp);
46         health.log_fp = NULL;
47     }
48 }
49
50 static inline void health_log_rotate(void) {
51     static size_t rotate_every = 0;
52
53     if(unlikely(rotate_every == 0)) {
54         rotate_every = (size_t)config_get_number("health", "rotate log every lines", 2000);
55         if(rotate_every < 100) rotate_every = 100;
56     }
57
58     if(unlikely(health.log_entries_written > rotate_every)) {
59         health_alarm_log_close();
60
61         char old_filename[FILENAME_MAX + 1];
62         snprintfz(old_filename, FILENAME_MAX, "%s.old", health.log_filename);
63
64         if(unlink(old_filename) == -1 && errno != ENOENT)
65             error("Health: cannot remove old alarms log file '%s'", old_filename);
66
67         if(link(health.log_filename, old_filename) == -1 && errno != ENOENT)
68             error("Health: cannot move file '%s' to '%s'.", health.log_filename, old_filename);
69
70         if(unlink(health.log_filename) == -1 && errno != ENOENT)
71             error("Health: cannot remove old alarms log file '%s'", health.log_filename);
72
73         // open it with truncate
74         health.log_fp = fopen(health.log_filename, "w");
75
76         if(health.log_fp)
77             fclose(health.log_fp);
78         else
79             error("Health: cannot truncate health log '%s'", health.log_filename);
80
81         health.log_fp = NULL;
82
83         health.log_entries_written = 0;
84         health_alarm_log_open();
85     }
86 }
87
88 static inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
89     health_log_rotate();
90
91     if(likely(health.log_fp)) {
92         if(unlikely(fprintf(health.log_fp
93                 , "%c\t%s"
94                   "\t%08x\t%08x\t%08x\t%08x\t%08x"
95                   "\t%08x\t%08x\t%08x"
96                   "\t%08x\t%08x\t%08x"
97                   "\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s"
98                   "\t%d\t%d\t%d\t%d"
99                   "\t%Lf\t%Lf"
100                   "\n"
101                 , (ae->flags & HEALTH_ENTRY_FLAG_SAVED)?'U':'A'
102                 , host->hostname
103
104                 , ae->unique_id
105                 , ae->alarm_id
106                 , ae->alarm_event_id
107                 , ae->updated_by_id
108                 , ae->updates_id
109
110                 , (uint32_t)ae->when
111                 , (uint32_t)ae->duration
112                 , (uint32_t)ae->non_clear_duration
113                 , (uint32_t)ae->flags
114                 , (uint32_t)ae->exec_run_timestamp
115                 , (uint32_t)ae->delay_up_to_timestamp
116
117                 , (ae->name)?ae->name:""
118                 , (ae->chart)?ae->chart:""
119                 , (ae->family)?ae->family:""
120                 , (ae->exec)?ae->exec:""
121                 , (ae->recipient)?ae->recipient:""
122                 , (ae->source)?ae->source:""
123                 , (ae->units)?ae->units:""
124                 , (ae->info)?ae->info:""
125
126                 , ae->exec_code
127                 , ae->new_status
128                 , ae->old_status
129                 , ae->delay
130
131                 , (long double)ae->new_value
132                 , (long double)ae->old_value
133         ) < 0))
134             error("Health: failed to save alarm log entry. Health data may be lost in case of abnormal restart.");
135         else {
136             ae->flags |= HEALTH_ENTRY_FLAG_SAVED;
137             health.log_entries_written++;
138         }
139     }
140 }
141
142 static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filename) {
143     static uint32_t max_unique_id = 0, max_alarm_id = 0;
144     ssize_t loaded = -1, updated = -1, errored = -1, duplicate = -1;
145
146     errno = 0;
147
148     char *s, *buf = mallocz(65536 + 1);
149     size_t line = 0, len = 0;
150     loaded = updated = errored = duplicate = 0;
151
152     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
153
154     while((s = fgets_trim_len(buf, 65536, fp, &len))) {
155         health.log_entries_written++;
156         line++;
157
158         int max_entries = 30, entries = 0;
159         char *pointers[max_entries];
160
161         pointers[entries++] = s++;
162         while(*s) {
163             if(unlikely(*s == '\t')) {
164                 *s = '\0';
165                 pointers[entries++] = ++s;
166                 if(entries >= max_entries) {
167                     error("Health: line %zu of file '%s' has more than %d entries. Ignoring excessive entries.", line, filename, max_entries);
168                     break;
169                 }
170             }
171             else s++;
172         }
173
174         if(likely(*pointers[0] == 'U' || *pointers[0] == 'A')) {
175             ALARM_ENTRY *ae = NULL;
176
177             if(entries < 26) {
178                 error("Health: line %zu of file '%s' should have at least 26 entries, but it has %d. Ignoring it.", line, filename, entries);
179                 errored++;
180                 continue;
181             }
182
183             // check that we have valid ids
184             uint32_t unique_id = (uint32_t)strtoul(pointers[2], NULL, 16);
185             if(!unique_id) {
186                 error("Health: line %zu of file '%s' states alarm entry with invalid unique id %u (%s). Ignoring it.", line, filename, unique_id, pointers[2]);
187                 errored++;
188                 continue;
189             }
190
191             uint32_t alarm_id = (uint32_t)strtoul(pointers[3], NULL, 16);
192             if(!alarm_id) {
193                 error("Health: line %zu of file '%s' states alarm entry for invalid alarm id %u (%s). Ignoring it.", line, filename, alarm_id, pointers[3]);
194                 errored++;
195                 continue;
196             }
197
198             if(unlikely(*pointers[0] == 'A')) {
199                 // make sure it is properly numbered
200                 if(unlikely(host->health_log.alarms && unique_id < host->health_log.alarms->unique_id)) {
201                     error("Health: line %zu of file '%s' has alarm log entry with %u in wrong order. Ignoring it.", line, filename, unique_id);
202                     errored++;
203                     continue;
204                 }
205
206                 ae = callocz(1, sizeof(ALARM_ENTRY));
207             }
208             else if(unlikely(*pointers[0] == 'U')) {
209                 // find the original
210                 for(ae = host->health_log.alarms; ae; ae = ae->next) {
211                     if(unlikely(unique_id == ae->unique_id)) {
212                         if(unlikely(*pointers[0] == 'A')) {
213                             error("Health: line %zu of file '%s' adds duplicate alarm log entry with unique id %u. Using the later."
214                                   , line, filename, unique_id);
215                             *pointers[0] = 'U';
216                             duplicate++;
217                         }
218                         break;
219                     }
220                     else if(unlikely(unique_id > ae->unique_id)) {
221                         // no need to continue
222                         // the linked list is sorted
223                         ae = NULL;
224                         break;
225                     }
226                 }
227
228                 // if not found, skip this line
229                 if(!ae) {
230                     // error("Health: line %zu of file '%s' updates alarm log entry with unique id %u, but it is not found.", line, filename, unique_id);
231                     continue;
232                 }
233             }
234
235             // check for a possible host missmatch
236             //if(strcmp(pointers[1], host->hostname))
237             //    error("Health: line %zu of file '%s' provides an alarm for host '%s' but this is named '%s'.", line, filename, pointers[1], host->hostname);
238
239             ae->unique_id               = unique_id;
240             ae->alarm_id                = alarm_id;
241             ae->alarm_event_id          = (uint32_t)strtoul(pointers[4], NULL, 16);
242             ae->updated_by_id           = (uint32_t)strtoul(pointers[5], NULL, 16);
243             ae->updates_id              = (uint32_t)strtoul(pointers[6], NULL, 16);
244
245             ae->when                    = (uint32_t)strtoul(pointers[7], NULL, 16);
246             ae->duration                = (uint32_t)strtoul(pointers[8], NULL, 16);
247             ae->non_clear_duration      = (uint32_t)strtoul(pointers[9], NULL, 16);
248
249             ae->flags                   = (uint32_t)strtoul(pointers[10], NULL, 16);
250             ae->flags |= HEALTH_ENTRY_FLAG_SAVED;
251
252             ae->exec_run_timestamp      = (uint32_t)strtoul(pointers[11], NULL, 16);
253             ae->delay_up_to_timestamp   = (uint32_t)strtoul(pointers[12], NULL, 16);
254
255             if(unlikely(ae->name)) freez(ae->name);
256             ae->name = strdupz(pointers[13]);
257             ae->hash_name = simple_hash(ae->name);
258
259             if(unlikely(ae->chart)) freez(ae->chart);
260             ae->chart = strdupz(pointers[14]);
261             ae->hash_chart = simple_hash(ae->chart);
262
263             if(unlikely(ae->family)) freez(ae->family);
264             ae->family = strdupz(pointers[15]);
265
266             if(unlikely(ae->exec)) freez(ae->exec);
267             ae->exec = strdupz(pointers[16]);
268             if(!*ae->exec) { freez(ae->exec); ae->exec = NULL; }
269
270             if(unlikely(ae->recipient)) freez(ae->recipient);
271             ae->recipient = strdupz(pointers[17]);
272             if(!*ae->recipient) { freez(ae->recipient); ae->recipient = NULL; }
273
274             if(unlikely(ae->source)) freez(ae->source);
275             ae->source = strdupz(pointers[18]);
276             if(!*ae->source) { freez(ae->source); ae->source = NULL; }
277
278             if(unlikely(ae->units)) freez(ae->units);
279             ae->units = strdupz(pointers[19]);
280             if(!*ae->units) { freez(ae->units); ae->units = NULL; }
281
282             if(unlikely(ae->info)) freez(ae->info);
283             ae->info = strdupz(pointers[20]);
284             if(!*ae->info) { freez(ae->info); ae->info = NULL; }
285
286             ae->exec_code   = atoi(pointers[21]);
287             ae->new_status  = atoi(pointers[22]);
288             ae->old_status  = atoi(pointers[23]);
289             ae->delay       = atoi(pointers[24]);
290
291             ae->new_value   = strtold(pointers[25], NULL);
292             ae->old_value   = strtold(pointers[26], NULL);
293
294             // add it to host if not already there
295             if(unlikely(*pointers[0] == 'A')) {
296                 ae->next = host->health_log.alarms;
297                 host->health_log.alarms = ae;
298                 loaded++;
299             }
300             else updated++;
301
302             if(unlikely(ae->unique_id > max_unique_id))
303                 max_unique_id = ae->unique_id;
304
305             if(unlikely(ae->alarm_id >= max_alarm_id))
306                 max_alarm_id = ae->alarm_id;
307         }
308         else {
309             error("Health: line %zu of file '%s' is invalid (unrecognized entry type '%s').", line, filename, pointers[0]);
310             errored++;
311         }
312     }
313
314     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
315
316     freez(buf);
317
318     if(!max_unique_id) max_unique_id = (uint32_t)time(NULL);
319     if(!max_alarm_id)  max_alarm_id  = (uint32_t)time(NULL);
320
321     host->health_log.next_log_id = max_unique_id + 1;
322     host->health_log.next_alarm_id = max_alarm_id + 1;
323
324     debug(D_HEALTH, "Health: loaded file '%s' with %zd new alarm entries, updated %zd alarms, errors %zd entries, duplicate %zd", filename, loaded, updated, errored, duplicate);
325     return loaded;
326 }
327
328 static inline void health_alarm_log_load(RRDHOST *host) {
329     health_alarm_log_close();
330
331     char filename[FILENAME_MAX + 1];
332     snprintfz(filename, FILENAME_MAX, "%s.old", health.log_filename);
333     FILE *fp = fopen(filename, "r");
334     if(!fp)
335         error("Health: cannot open health file: %s", filename);
336     else {
337         health_alarm_log_read(host, fp, filename);
338         fclose(fp);
339     }
340
341     health.log_entries_written = 0;
342     fp = fopen(health.log_filename, "r");
343     if(!fp)
344         error("Health: cannot open health file: %s", health.log_filename);
345     else {
346         health_alarm_log_read(host, fp, health.log_filename);
347         fclose(fp);
348     }
349
350     health_alarm_log_open();
351 }
352
353
354 // ----------------------------------------------------------------------------
355 // health alarm log management
356
357 static inline void health_alarm_log(RRDHOST *host,
358                 uint32_t alarm_id, uint32_t alarm_event_id,
359                 time_t when,
360                 const char *name, const char *chart, const char *family,
361                 const char *exec, const char *recipient, time_t duration,
362                 calculated_number old_value, calculated_number new_value,
363                 int old_status, int new_status,
364                 const char *source,
365                 const char *units,
366                 const char *info,
367                 int delay
368 ) {
369     debug(D_HEALTH, "Health adding alarm log entry with id: %u", host->health_log.next_log_id);
370
371     ALARM_ENTRY *ae = callocz(1, sizeof(ALARM_ENTRY));
372     ae->name = strdupz(name);
373     ae->hash_name = simple_hash(ae->name);
374
375     if(chart) {
376         ae->chart = strdupz(chart);
377         ae->hash_chart = simple_hash(ae->chart);
378     }
379
380     if(family)
381         ae->family = strdupz(family);
382
383     if(exec) ae->exec = strdupz(exec);
384     if(recipient) ae->recipient = strdupz(recipient);
385     if(source) ae->source = strdupz(source);
386     if(units) ae->units = strdupz(units);
387     if(info) ae->info = strdupz(info);
388
389     ae->unique_id = host->health_log.next_log_id++;
390     ae->alarm_id = alarm_id;
391     ae->alarm_event_id = alarm_event_id;
392     ae->when = when;
393     ae->old_value = old_value;
394     ae->new_value = new_value;
395     ae->old_status = old_status;
396     ae->new_status = new_status;
397     ae->duration = duration;
398     ae->delay = delay;
399     ae->delay_up_to_timestamp = when + delay;
400
401     if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL)
402         ae->non_clear_duration += ae->duration;
403
404     // link it
405     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
406     ae->next = host->health_log.alarms;
407     host->health_log.alarms = ae;
408     host->health_log.count++;
409     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
410
411     // match previous alarms
412     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
413     ALARM_ENTRY *t;
414     for(t = host->health_log.alarms ; t ; t = t->next) {
415         if(t != ae && t->alarm_id == ae->alarm_id) {
416             if(!(t->flags & HEALTH_ENTRY_FLAG_UPDATED) && !t->updated_by_id) {
417                 t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
418                 t->updated_by_id = ae->unique_id;
419                 ae->updates_id = t->unique_id;
420
421                 if((t->new_status == RRDCALC_STATUS_WARNING || t->new_status == RRDCALC_STATUS_CRITICAL) &&
422                    (t->old_status == RRDCALC_STATUS_WARNING || t->old_status == RRDCALC_STATUS_CRITICAL))
423                     ae->non_clear_duration += t->non_clear_duration;
424
425                 health_alarm_log_save(host, t);
426             }
427
428             // no need to continue
429             break;
430         }
431     }
432     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
433
434     health_alarm_log_save(host, ae);
435 }
436
437 // ----------------------------------------------------------------------------
438 // RRDVAR management
439
// Sanitize a variable name in place: every character that is not
// alphanumeric, '.' or '_' is replaced with '_'.
//
//   variable - the NUL terminated, writable name to fix
//
// Returns the number of characters replaced.
static inline int rrdvar_fix_name(char *variable) {
    int fixed = 0;

    for( ; *variable ; variable++) {
        // isalnum() is only defined for EOF and unsigned char values,
        // so bytes >= 0x80 of a (signed) char must be converted first
        unsigned char c = (unsigned char)*variable;

        if(!isalnum(c) && c != '.' && c != '_') {
            *variable = '_';
            fixed++;
        }
    }

    return fixed;
}
453
454 int rrdvar_compare(void* a, void* b) {
455     if(((RRDVAR *)a)->hash < ((RRDVAR *)b)->hash) return -1;
456     else if(((RRDVAR *)a)->hash > ((RRDVAR *)b)->hash) return 1;
457     else return strcmp(((RRDVAR *)a)->name, ((RRDVAR *)b)->name);
458 }
459
460 static inline RRDVAR *rrdvar_index_add(avl_tree_lock *tree, RRDVAR *rv) {
461     RRDVAR *ret = (RRDVAR *)avl_insert_lock(tree, (avl *)(rv));
462     if(ret != rv)
463         debug(D_VARIABLES, "Request to insert RRDVAR '%s' into index failed. Already exists.", rv->name);
464
465     return ret;
466 }
467
468 static inline RRDVAR *rrdvar_index_del(avl_tree_lock *tree, RRDVAR *rv) {
469     RRDVAR *ret = (RRDVAR *)avl_remove_lock(tree, (avl *)(rv));
470     if(!ret)
471         error("Request to remove RRDVAR '%s' from index failed. Not Found.", rv->name);
472
473     return ret;
474 }
475
476 static inline RRDVAR *rrdvar_index_find(avl_tree_lock *tree, const char *name, uint32_t hash) {
477     RRDVAR tmp;
478     tmp.name = (char *)name;
479     tmp.hash = (hash)?hash:simple_hash(tmp.name);
480
481     return (RRDVAR *)avl_search_lock(tree, (avl *)&tmp);
482 }
483
484 static inline void rrdvar_free(RRDHOST *host, avl_tree_lock *tree, RRDVAR *rv) {
485     (void)host;
486
487     if(!rv) return;
488
489     if(tree)
490         rrdvar_index_del(tree, rv);
491
492     freez(rv->name);
493     freez(rv);
494 }
495
496 static inline RRDVAR *rrdvar_create_and_index(const char *scope, avl_tree_lock *tree, const char *name, int type, calculated_number *value) {
497     char *variable = strdupz(name);
498     rrdvar_fix_name(variable);
499     uint32_t hash = simple_hash(variable);
500
501     RRDVAR *rv = rrdvar_index_find(tree, variable, hash);
502     if(unlikely(!rv)) {
503         debug(D_VARIABLES, "Variable '%s' not found in scope '%s'. Creating a new one.", variable, scope);
504
505         rv = callocz(1, sizeof(RRDVAR));
506         rv->name = variable;
507         rv->hash = hash;
508         rv->type = type;
509         rv->value = value;
510
511         RRDVAR *ret = rrdvar_index_add(tree, rv);
512         if(unlikely(ret != rv)) {
513             debug(D_VARIABLES, "Variable '%s' in scope '%s' already exists", variable, scope);
514             rrdvar_free(NULL, NULL, rv);
515             rv = NULL;
516         }
517         else
518             debug(D_VARIABLES, "Variable '%s' created in scope '%s'", variable, scope);
519     }
520     else {
521         // already exists
522         freez(variable);
523
524         // this is important
525         // it must return NULL - not the existing variable - or double-free will happen
526         rv = NULL;
527     }
528
529     return rv;
530 }
531
532 // ----------------------------------------------------------------------------
533 // RRDVAR lookup
534
535 calculated_number rrdvar2number(RRDVAR *rv) {
536     switch(rv->type) {
537         case RRDVAR_TYPE_CALCULATED: {
538             calculated_number *n = (calculated_number *)rv->value;
539             return *n;
540         }
541
542         case RRDVAR_TYPE_TIME_T: {
543             time_t *n = (time_t *)rv->value;
544             return *n;
545         }
546
547         case RRDVAR_TYPE_COLLECTED: {
548             collected_number *n = (collected_number *)rv->value;
549             return *n;
550         }
551
552         case RRDVAR_TYPE_TOTAL: {
553             total_number *n = (total_number *)rv->value;
554             return *n;
555         }
556
557         case RRDVAR_TYPE_INT: {
558             int *n = (int *)rv->value;
559             return *n;
560         }
561
562         default:
563             error("I don't know how to convert RRDVAR type %d to calculated_number", rv->type);
564             return NAN;
565     }
566 }
567
568 void dump_variable(void *data) {
569     RRDVAR *rv = (RRDVAR *)data;
570     debug(D_HEALTH, "%50s : %20.5Lf", rv->name, rrdvar2number(rv));
571 }
572
573 int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, calculated_number *result) {
574     RRDSET *st = rc->rrdset;
575     RRDVAR *rv;
576
577     if(!st) return 0;
578
579     rv = rrdvar_index_find(&st->variables_root_index, variable, hash);
580     if(rv) {
581         *result = rrdvar2number(rv);
582         return 1;
583     }
584
585     rv = rrdvar_index_find(&st->rrdfamily->variables_root_index, variable, hash);
586     if(rv) {
587         *result = rrdvar2number(rv);
588         return 1;
589     }
590
591     rv = rrdvar_index_find(&st->rrdhost->variables_root_index, variable, hash);
592     if(rv) {
593         *result = rrdvar2number(rv);
594         return 1;
595     }
596
597     debug(D_HEALTH, "Available local chart '%s' variables:", st->id);
598     avl_traverse_lock(&st->variables_root_index, dump_variable);
599
600     debug(D_HEALTH, "Available family '%s' variables:", st->rrdfamily->family);
601     avl_traverse_lock(&st->rrdfamily->variables_root_index, dump_variable);
602
603     debug(D_HEALTH, "Available host '%s' variables:", st->rrdhost->hostname);
604     avl_traverse_lock(&st->rrdhost->variables_root_index, dump_variable);
605
606     return 0;
607 }
608
609 // ----------------------------------------------------------------------------
610 // RRDSETVAR management
611
612 RRDSETVAR *rrdsetvar_create(RRDSET *st, const char *variable, int type, void *value, uint32_t options) {
613     debug(D_VARIABLES, "RRDVARSET create for chart id '%s' name '%s' with variable name '%s'", st->id, st->name, variable);
614     RRDSETVAR *rs = (RRDSETVAR *)callocz(1, sizeof(RRDSETVAR));
615
616     char buffer[RRDVAR_MAX_LENGTH + 1];
617     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->id, variable);
618     rs->fullid = strdupz(buffer);
619
620     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, variable);
621     rs->fullname = strdupz(buffer);
622
623     rs->variable = strdupz(variable);
624
625     rs->type = type;
626     rs->value = value;
627     rs->options = options;
628     rs->rrdset = st;
629
630     rs->local       = rrdvar_create_and_index("local",  &st->variables_root_index, rs->variable, rs->type, rs->value);
631     rs->family      = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullid, rs->type, rs->value);
632     rs->host        = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullid, rs->type, rs->value);
633     rs->family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullname, rs->type, rs->value);
634     rs->host_name   = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullname, rs->type, rs->value);
635
636     rs->next = st->variables;
637     st->variables = rs;
638
639     return rs;
640 }
641
642 void rrdsetvar_rename_all(RRDSET *st) {
643     debug(D_VARIABLES, "RRDSETVAR rename for chart id '%s' name '%s'", st->id, st->name);
644
645     // only these 2 can change name
646     // rs->family_name
647     // rs->host_name
648
649     char buffer[RRDVAR_MAX_LENGTH + 1];
650     RRDSETVAR *rs, *next = st->variables;
651     while((rs = next)) {
652         next = rs->next;
653
654         snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rs->variable);
655
656         if (strcmp(buffer, rs->fullname)) {
657             // name changed
658             rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
659             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_name);
660
661             freez(rs->fullname);
662             rs->fullname = strdupz(st->name);
663             rs->family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullname, rs->type, rs->value);
664             rs->host_name   = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullname, rs->type, rs->value);
665         }
666     }
667
668     rrdsetcalc_link_matching(st);
669 }
670
671 void rrdsetvar_free(RRDSETVAR *rs) {
672     RRDSET *st = rs->rrdset;
673     debug(D_VARIABLES, "RRDSETVAR free for chart id '%s' name '%s', variable '%s'", st->id, st->name, rs->variable);
674
675     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local);
676     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family);
677     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host);
678     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
679     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_name);
680
681     if(st->variables == rs) {
682         st->variables = rs->next;
683     }
684     else {
685         RRDSETVAR *t;
686         for (t = st->variables; t && t->next != rs; t = t->next);
687         if(!t) error("RRDSETVAR '%s' not found in chart '%s' variables linked list", rs->fullname, st->id);
688         else t->next = rs->next;
689     }
690
691     freez(rs->fullid);
692     freez(rs->fullname);
693     freez(rs->variable);
694     freez(rs);
695 }
696
697 // ----------------------------------------------------------------------------
698 // RRDDIMVAR management
699
700 #define RRDDIMVAR_ID_MAX 1024
701
702 static inline void rrddimvar_free_instances(RRDDIMVAR *rs) {
703     RRDDIM *rd = rs->rrddim;
704     RRDSET *st = rd->rrdset;
705
706     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_id);
707     rs->local_id = NULL;
708
709     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_name);
710     rs->local_name = NULL;
711
712     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_id);
713     rs->family_id = NULL;
714
715     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
716     rs->family_name = NULL;
717
718     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_contextid);
719     rs->family_contextid = NULL;
720
721     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_contextname);
722     rs->family_contextname = NULL;
723
724     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidid);
725     rs->host_fullidid = NULL;
726
727     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidname);
728     rs->host_fullidname = NULL;
729
730     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnameid);
731     rs->host_fullnameid = NULL;
732
733     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnamename);
734     rs->host_fullnamename = NULL;
735
736     freez(rs->id);
737     rs->id = NULL;
738
739     freez(rs->name);
740     rs->name = NULL;
741
742     freez(rs->fullidid);
743     rs->fullidid = NULL;
744
745     freez(rs->fullidname);
746     rs->fullidname = NULL;
747
748     freez(rs->contextid);
749     rs->contextid = NULL;
750
751     freez(rs->contextname);
752     rs->contextname = NULL;
753
754     freez(rs->fullnameid);
755     rs->fullnameid = NULL;
756
757     freez(rs->fullnamename);
758     rs->fullnamename = NULL;
759 }
760
761 static inline void rrddimvar_create_instances(RRDDIMVAR *rs) {
762     rrddimvar_free_instances(rs);
763
764     RRDDIM *rd = rs->rrddim;
765     RRDSET *st = rd->rrdset;
766
767     char buffer[RRDDIMVAR_ID_MAX + 1];
768
769     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->id, rs->suffix);
770     rs->id = strdupz(buffer);
771
772     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
773     rs->name = strdupz(buffer);
774
775     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->id);
776     rs->fullidid = strdupz(buffer);
777
778     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->name);
779     rs->fullidname = strdupz(buffer);
780
781     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->context, rs->id);
782     rs->contextid = strdupz(buffer);
783
784     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->context, rs->name);
785     rs->contextname = strdupz(buffer);
786
787     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->id);
788     rs->fullnameid = strdupz(buffer);
789
790     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->name);
791     rs->fullnamename = strdupz(buffer);
792
793     rs->local_id           = rrdvar_create_and_index("local", &st->variables_root_index, rs->id, rs->type, rs->value);
794     rs->local_name         = rrdvar_create_and_index("local", &st->variables_root_index, rs->name, rs->type, rs->value);
795
796     rs->family_id          = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->id, rs->type, rs->value);
797     rs->family_name        = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->name, rs->type, rs->value);
798     rs->family_contextid   = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->contextid, rs->type, rs->value);
799     rs->family_contextname = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->contextname, rs->type, rs->value);
800
801     rs->host_fullidid      = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullidid, rs->type, rs->value);
802     rs->host_fullidname    = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullidname, rs->type, rs->value);
803     rs->host_fullnameid    = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullnameid, rs->type, rs->value);
804     rs->host_fullnamename  = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullnamename, rs->type, rs->value);
805 }
806
807
808 RRDDIMVAR *rrddimvar_create(RRDDIM *rd, int type, const char *prefix, const char *suffix, void *value, uint32_t options) {
809     RRDSET *st = rd->rrdset;
810
811     debug(D_VARIABLES, "RRDDIMSET create for chart id '%s' name '%s', dimension id '%s', name '%s%s%s'", st->id, st->name, rd->id, (prefix)?prefix:"", rd->name, (suffix)?suffix:"");
812
813     if(!prefix) prefix = "";
814     if(!suffix) suffix = "";
815
816     RRDDIMVAR *rs = (RRDDIMVAR *)callocz(1, sizeof(RRDDIMVAR));
817
818     rs->prefix = strdupz(prefix);
819     rs->suffix = strdupz(suffix);
820
821     rs->type = type;
822     rs->value = value;
823     rs->options = options;
824     rs->rrddim = rd;
825
826     rs->next = rd->variables;
827     rd->variables = rs;
828
829     rrddimvar_create_instances(rs);
830
831     return rs;
832 }
833
834 void rrddimvar_rename_all(RRDDIM *rd) {
835     RRDSET *st = rd->rrdset;
836     debug(D_VARIABLES, "RRDDIMSET rename for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
837
838     RRDDIMVAR *rs, *next = rd->variables;
839     while((rs = next)) {
840         next = rs->next;
841
842         if (strcmp(rd->name, rs->name))
843             rrddimvar_create_instances(rs);
844     }
845 }
846
847 void rrddimvar_free(RRDDIMVAR *rs) {
848     RRDDIM *rd = rs->rrddim;
849     RRDSET *st = rd->rrdset;
850     debug(D_VARIABLES, "RRDDIMSET free for chart id '%s' name '%s', dimension id '%s', name '%s', prefix='%s', suffix='%s'", st->id, st->name, rd->id, rd->name, rs->prefix, rs->suffix);
851
852     rrddimvar_free_instances(rs);
853
854     if(rd->variables == rs) {
855         debug(D_VARIABLES, "RRDDIMSET removing first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
856         rd->variables = rs->next;
857     }
858     else {
859         debug(D_VARIABLES, "RRDDIMSET removing non-first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
860         RRDDIMVAR *t;
861         for (t = rd->variables; t && t->next != rs; t = t->next) ;
862         if(!t) error("RRDDIMVAR '%s' not found in dimension '%s/%s' variables linked list", rs->name, st->id, rd->id);
863         else t->next = rs->next;
864     }
865
866     freez(rs->prefix);
867     freez(rs->suffix);
868     freez(rs);
869 }
870
871 // ----------------------------------------------------------------------------
872 // RRDCALC management
873
874 static inline const char *rrdcalc_status2string(int status) {
875     switch(status) {
876         case RRDCALC_STATUS_REMOVED:
877             return "REMOVED";
878
879         case RRDCALC_STATUS_UNDEFINED:
880             return "UNDEFINED";
881
882         case RRDCALC_STATUS_UNINITIALIZED:
883             return "UNINITIALIZED";
884
885         case RRDCALC_STATUS_CLEAR:
886             return "CLEAR";
887
888         case RRDCALC_STATUS_RAISED:
889             return "RAISED";
890
891         case RRDCALC_STATUS_WARNING:
892             return "WARNING";
893
894         case RRDCALC_STATUS_CRITICAL:
895             return "CRITICAL";
896
897         default:
898             error("Unknown alarm status %d", status);
899             return "UNKNOWN";
900     }
901 }
902
903 static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
904     debug(D_HEALTH, "Health linking alarm '%s.%s' to chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, st->rrdhost->hostname);
905
906     rc->last_status_change = time(NULL);
907     rc->rrdset = st;
908
909     rc->rrdset_next = st->alarms;
910     rc->rrdset_prev = NULL;
911     
912     if(rc->rrdset_next)
913         rc->rrdset_next->rrdset_prev = rc;
914
915     st->alarms = rc;
916
917     if(rc->update_every < rc->rrdset->update_every) {
918         error("Health alarm '%s.%s' has update every %d, less than chart update every %d. Setting alarm update frequency to %d.", rc->rrdset->id, rc->name, rc->update_every, rc->rrdset->update_every, rc->rrdset->update_every);
919         rc->update_every = rc->rrdset->update_every;
920     }
921
922     if(!isnan(rc->green) && isnan(st->green)) {
923         debug(D_HEALTH, "Health alarm '%s.%s' green threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->green, rc->green);
924         st->green = rc->green;
925     }
926
927     if(!isnan(rc->red) && isnan(st->red)) {
928         debug(D_HEALTH, "Health alarm '%s.%s' red threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->red, rc->red);
929         st->red = rc->red;
930     }
931
932     rc->local  = rrdvar_create_and_index("local",  &st->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
933     rc->family = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
934
935     char fullname[RRDVAR_MAX_LENGTH + 1];
936     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->id, rc->name);
937     rc->hostid   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
938
939     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rc->name);
940     rc->hostname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
941
942         if(!rc->units) rc->units = strdupz(st->units);
943
944     {
945         time_t now = time(NULL);
946         health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_UNINITIALIZED, rc->source, rc->units, rc->info, 0);
947     }
948 }
949
950 static inline int rrdcalc_is_matching_this_rrdset(RRDCALC *rc, RRDSET *st) {
951     if(     (rc->hash_chart == st->hash      && !strcmp(rc->chart, st->id)) ||
952             (rc->hash_chart == st->hash_name && !strcmp(rc->chart, st->name)))
953         return 1;
954
955     return 0;
956 }
957
958 // this has to be called while the RRDHOST is locked
959 inline void rrdsetcalc_link_matching(RRDSET *st) {
960     // debug(D_HEALTH, "find matching alarms for chart '%s'", st->id);
961
962     RRDCALC *rc;
963     for(rc = st->rrdhost->alarms; rc ; rc = rc->next) {
964         if(unlikely(rc->rrdset))
965             continue;
966
967         if(unlikely(rrdcalc_is_matching_this_rrdset(rc, st)))
968             rrdsetcalc_link(st, rc);
969     }
970 }
971
972 // this has to be called while the RRDHOST is locked
973 inline void rrdsetcalc_unlink(RRDCALC *rc) {
974     RRDSET *st = rc->rrdset;
975
976     if(!st) {
977         debug(D_HEALTH, "Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
978         error("Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
979         return;
980     }
981
982     {
983         time_t now = time(NULL);
984         health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0);
985     }
986
987     RRDHOST *host = st->rrdhost;
988
989     debug(D_HEALTH, "Health unlinking alarm '%s.%s' from chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, host->hostname);
990
991     // unlink it
992     if(rc->rrdset_prev)
993         rc->rrdset_prev->rrdset_next = rc->rrdset_next;
994
995     if(rc->rrdset_next)
996         rc->rrdset_next->rrdset_prev = rc->rrdset_prev;
997
998     if(st->alarms == rc)
999         st->alarms = rc->rrdset_next;
1000
1001     rc->rrdset_prev = rc->rrdset_next = NULL;
1002
1003     rrdvar_free(st->rrdhost, &st->variables_root_index, rc->local);
1004     rc->local = NULL;
1005
1006     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rc->family);
1007     rc->family = NULL;
1008
1009     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostid);
1010     rc->hostid = NULL;
1011
1012     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostname);
1013     rc->hostname = NULL;
1014
1015     rc->rrdset = NULL;
1016
1017     // RRDCALC will remain in RRDHOST
1018     // so that if the matching chart is found in the future
1019     // it will be applied automatically
1020 }
1021
1022 RRDCALC *rrdcalc_find(RRDSET *st, const char *name) {
1023     RRDCALC *rc;
1024     uint32_t hash = simple_hash(name);
1025
1026     for( rc = st->alarms; rc ; rc = rc->rrdset_next ) {
1027         if(unlikely(rc->hash == hash && !strcmp(rc->name, name)))
1028             return rc;
1029     }
1030
1031     return NULL;
1032 }
1033
1034 static inline int rrdcalc_exists(RRDHOST *host, const char *chart, const char *name, uint32_t hash_chart, uint32_t hash_name) {
1035     RRDCALC *rc;
1036
1037     if(unlikely(!chart)) {
1038         error("attempt to find RRDCALC '%s' without giving a chart name", name);
1039         return 1;
1040     }
1041
1042     if(unlikely(!hash_chart)) hash_chart = simple_hash(chart);
1043     if(unlikely(!hash_name))  hash_name  = simple_hash(name);
1044
1045     // make sure it does not already exist
1046     for(rc = host->alarms; rc ; rc = rc->next) {
1047         if (unlikely(rc->chart && rc->hash == hash_name && rc->hash_chart == hash_chart && !strcmp(name, rc->name) && !strcmp(chart, rc->chart))) {
1048             debug(D_HEALTH, "Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
1049             error("Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
1050             return 1;
1051         }
1052     }
1053
1054     return 0;
1055 }
1056
1057 static inline uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const char *name, uint32_t *next_event_id) {
1058     if(chart && name) {
1059         uint32_t hash_chart = simple_hash(chart);
1060         uint32_t hash_name = simple_hash(name);
1061
1062         // re-use old IDs, by looking them up in the alarm log
1063         ALARM_ENTRY *ae;
1064         for(ae = host->health_log.alarms; ae ;ae = ae->next) {
1065             if(unlikely(ae->hash_name == hash_name && ae->hash_chart == hash_chart && !strcmp(name, ae->name) && !strcmp(chart, ae->chart))) {
1066                 if(next_event_id) *next_event_id = ae->alarm_event_id + 1;
1067                 return ae->alarm_id;
1068             }
1069         }
1070     }
1071
1072     return host->health_log.next_alarm_id++;
1073 }
1074
1075 static inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) {
1076     rrdhost_check_rdlock(host);
1077
1078     if(rc->calculation) {
1079         rc->calculation->status = &rc->status;
1080         rc->calculation->this = &rc->value;
1081         rc->calculation->after = &rc->db_after;
1082         rc->calculation->before = &rc->db_before;
1083         rc->calculation->rrdcalc = rc;
1084     }
1085
1086     if(rc->warning) {
1087         rc->warning->status = &rc->status;
1088         rc->warning->this = &rc->value;
1089         rc->warning->after = &rc->db_after;
1090         rc->warning->before = &rc->db_before;
1091         rc->warning->rrdcalc = rc;
1092     }
1093
1094     if(rc->critical) {
1095         rc->critical->status = &rc->status;
1096         rc->critical->this = &rc->value;
1097         rc->critical->after = &rc->db_after;
1098         rc->critical->before = &rc->db_before;
1099         rc->critical->rrdcalc = rc;
1100     }
1101
1102     // link it to the host
1103     if(likely(host->alarms)) {
1104         // append it
1105         RRDCALC *t;
1106         for(t = host->alarms; t && t->next ; t = t->next) ;
1107         t->next = rc;
1108     }
1109     else {
1110         host->alarms = rc;
1111     }
1112
1113     // link it to its chart
1114     RRDSET *st;
1115     for(st = host->rrdset_root; st ; st = st->next) {
1116         if(rrdcalc_is_matching_this_rrdset(rc, st)) {
1117             rrdsetcalc_link(st, rc);
1118             break;
1119         }
1120     }
1121 }
1122
1123 static inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart) {
1124
1125     debug(D_HEALTH, "Health creating dynamic alarm (from template) '%s.%s'", chart, rt->name);
1126
1127     if(rrdcalc_exists(host, chart, rt->name, 0, 0))
1128         return NULL;
1129
1130     RRDCALC *rc = callocz(1, sizeof(RRDCALC));
1131     rc->next_event_id = 1;
1132     rc->id = rrdcalc_get_unique_id(host, chart, rt->name, &rc->next_event_id);
1133     rc->name = strdupz(rt->name);
1134     rc->hash = simple_hash(rc->name);
1135     rc->chart = strdupz(chart);
1136     rc->hash_chart = simple_hash(rc->chart);
1137
1138     if(rt->dimensions) rc->dimensions = strdupz(rt->dimensions);
1139
1140     rc->green = rt->green;
1141     rc->red = rt->red;
1142     rc->value = NAN;
1143     rc->old_value = NAN;
1144
1145     rc->delay_up_duration = rt->delay_up_duration;
1146     rc->delay_down_duration = rt->delay_down_duration;
1147     rc->delay_max_duration = rt->delay_max_duration;
1148     rc->delay_multiplier = rt->delay_multiplier;
1149
1150     rc->group = rt->group;
1151     rc->after = rt->after;
1152     rc->before = rt->before;
1153     rc->update_every = rt->update_every;
1154     rc->options = rt->options;
1155
1156     if(rt->exec) rc->exec = strdupz(rt->exec);
1157     if(rt->recipient) rc->recipient = strdupz(rt->recipient);
1158     if(rt->source) rc->source = strdupz(rt->source);
1159     if(rt->units) rc->units = strdupz(rt->units);
1160     if(rt->info) rc->info = strdupz(rt->info);
1161
1162     if(rt->calculation) {
1163         rc->calculation = expression_parse(rt->calculation->source, NULL, NULL);
1164         if(!rc->calculation)
1165             error("Health alarm '%s.%s': failed to parse calculation expression '%s'", chart, rt->name, rt->calculation->source);
1166     }
1167     if(rt->warning) {
1168         rc->warning = expression_parse(rt->warning->source, NULL, NULL);
1169         if(!rc->warning)
1170             error("Health alarm '%s.%s': failed to re-parse warning expression '%s'", chart, rt->name, rt->warning->source);
1171     }
1172     if(rt->critical) {
1173         rc->critical = expression_parse(rt->critical->source, NULL, NULL);
1174         if(!rc->critical)
1175             error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", chart, rt->name, rt->critical->source);
1176     }
1177
1178     debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1179           (rc->chart)?rc->chart:"NOCHART",
1180           rc->name,
1181           (rc->exec)?rc->exec:"DEFAULT",
1182           (rc->recipient)?rc->recipient:"DEFAULT",
1183           rc->green,
1184           rc->red,
1185           rc->group,
1186           rc->after,
1187           rc->before,
1188           rc->options,
1189           (rc->dimensions)?rc->dimensions:"NONE",
1190           rc->update_every,
1191           (rc->calculation)?rc->calculation->parsed_as:"NONE",
1192           (rc->warning)?rc->warning->parsed_as:"NONE",
1193           (rc->critical)?rc->critical->parsed_as:"NONE",
1194           rc->source,
1195           rc->delay_up_duration,
1196           rc->delay_down_duration,
1197           rc->delay_max_duration,
1198           rc->delay_multiplier
1199     );
1200
1201     rrdcalc_create_part2(host, rc);
1202     return rc;
1203 }
1204
1205 void rrdcalc_free(RRDHOST *host, RRDCALC *rc) {
1206     if(!rc) return;
1207
1208     debug(D_HEALTH, "Health removing alarm '%s.%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1209
1210     // unlink it from RRDSET
1211     if(rc->rrdset) rrdsetcalc_unlink(rc);
1212
1213     // unlink it from RRDHOST
1214     if(unlikely(rc == host->alarms))
1215         host->alarms = rc->next;
1216
1217     else if(likely(host->alarms)) {
1218         RRDCALC *t, *last = host->alarms;
1219         for(t = last->next; t && t != rc; last = t, t = t->next) ;
1220         if(last->next == rc)
1221             last->next = rc->next;
1222         else
1223             error("Cannot unlink alarm '%s.%s' from host '%s': not found", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1224     }
1225     else
1226         error("Cannot unlink unlink '%s.%s' from host '%s': This host does not have any calculations", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1227
1228     expression_free(rc->calculation);
1229     expression_free(rc->warning);
1230     expression_free(rc->critical);
1231
1232     freez(rc->name);
1233     freez(rc->chart);
1234     freez(rc->family);
1235     freez(rc->dimensions);
1236     freez(rc->exec);
1237     freez(rc->recipient);
1238     freez(rc->source);
1239     freez(rc->units);
1240     freez(rc->info);
1241     freez(rc);
1242 }
1243
1244 // ----------------------------------------------------------------------------
1245 // RRDCALCTEMPLATE management
1246
1247 void rrdcalctemplate_link_matching(RRDSET *st) {
1248     RRDCALCTEMPLATE *rt;
1249
1250     for(rt = st->rrdhost->templates; rt ; rt = rt->next) {
1251         if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context)) {
1252             RRDCALC *rc = rrdcalc_create(st->rrdhost, rt, st->id);
1253             if(unlikely(!rc))
1254                 error("Health tried to create alarm from template '%s', but it failed", rt->name);
1255
1256 #ifdef NETDATA_INTERNAL_CHECKS
1257             else if(rc->rrdset != st)
1258                 error("Health alarm '%s.%s' should be linked to chart '%s', but it is not", rc->chart?rc->chart:"NOCHART", rc->name, st->id);
1259 #endif
1260         }
1261     }
1262 }
1263
1264 static inline void rrdcalctemplate_free(RRDHOST *host, RRDCALCTEMPLATE *rt) {
1265     debug(D_HEALTH, "Health removing template '%s' of host '%s'", rt->name, host->hostname);
1266
1267     if(host->templates) {
1268         if(host->templates == rt) {
1269             host->templates = rt->next;
1270         }
1271         else {
1272             RRDCALCTEMPLATE *t, *last = host->templates;
1273             for (t = last->next; t && t != rt; last = t, t = t->next ) ;
1274             if(last && last->next == rt) {
1275                 last->next = rt->next;
1276                 rt->next = NULL;
1277             }
1278             else
1279                 error("Cannot find RRDCALCTEMPLATE '%s' linked in host '%s'", rt->name, host->hostname);
1280         }
1281     }
1282
1283     expression_free(rt->calculation);
1284     expression_free(rt->warning);
1285     expression_free(rt->critical);
1286
1287     freez(rt->name);
1288     freez(rt->exec);
1289     freez(rt->recipient);
1290     freez(rt->context);
1291     freez(rt->source);
1292     freez(rt->units);
1293     freez(rt->info);
1294     freez(rt->dimensions);
1295     freez(rt);
1296 }
1297
1298 // ----------------------------------------------------------------------------
1299 // load health configuration
1300
// size of the line buffer used when reading health configuration files
// (the buffer is declared with one additional byte)
#define HEALTH_CONF_MAX_LINE    4096

// the keywords of the health configuration files
#define HEALTH_ALARM_KEY        "alarm"
#define HEALTH_TEMPLATE_KEY     "template"
#define HEALTH_ON_KEY           "on"
#define HEALTH_LOOKUP_KEY       "lookup"
#define HEALTH_CALC_KEY         "calc"
#define HEALTH_EVERY_KEY        "every"
#define HEALTH_GREEN_KEY        "green"
#define HEALTH_RED_KEY          "red"
#define HEALTH_WARN_KEY         "warn"
#define HEALTH_CRIT_KEY         "crit"
#define HEALTH_EXEC_KEY         "exec"
#define HEALTH_RECIPIENT_KEY    "to"
#define HEALTH_UNITS_KEY        "units"
#define HEALTH_INFO_KEY         "info"
#define HEALTH_DELAY_KEY        "delay"
1318
1319 static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
1320     if(!rc->chart) {
1321         error("Health configuration for alarm '%s' does not have a chart", rc->name);
1322         return 0;
1323     }
1324
1325     if(!rc->update_every) {
1326         error("Health configuration for alarm '%s.%s' has no frequency (parameter 'every'). Ignoring it.", rc->chart?rc->chart:"NOCHART", rc->name);
1327         return 0;
1328     }
1329
1330     if(!RRDCALC_HAS_DB_LOOKUP(rc) && !rc->warning && !rc->critical) {
1331         error("Health configuration for alarm '%s.%s' is useless (no calculation, no warning and no critical evaluation)", rc->chart?rc->chart:"NOCHART", rc->name);
1332         return 0;
1333     }
1334
1335     if (rrdcalc_exists(host, rc->chart, rc->name, rc->hash_chart, rc->hash))
1336         return 0;
1337
1338     rc->id = rrdcalc_get_unique_id(&localhost, rc->chart, rc->name, &rc->next_event_id);
1339
1340     debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1341           rc->chart?rc->chart:"NOCHART",
1342           rc->name,
1343           rc->id,
1344           (rc->exec)?rc->exec:"DEFAULT",
1345           (rc->recipient)?rc->recipient:"DEFAULT",
1346           rc->green,
1347           rc->red,
1348           rc->group,
1349           rc->after,
1350           rc->before,
1351           rc->options,
1352           (rc->dimensions)?rc->dimensions:"NONE",
1353           rc->update_every,
1354           (rc->calculation)?rc->calculation->parsed_as:"NONE",
1355           (rc->warning)?rc->warning->parsed_as:"NONE",
1356           (rc->critical)?rc->critical->parsed_as:"NONE",
1357           rc->source,
1358           rc->delay_up_duration,
1359           rc->delay_down_duration,
1360           rc->delay_max_duration,
1361           rc->delay_multiplier
1362     );
1363
1364     rrdcalc_create_part2(host, rc);
1365     return 1;
1366 }
1367
1368 static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCALCTEMPLATE *rt) {
1369     if(unlikely(!rt->context)) {
1370         error("Health configuration for template '%s' does not have a context", rt->name);
1371         return 0;
1372     }
1373
1374     if(unlikely(!rt->update_every)) {
1375         error("Health configuration for template '%s' has no frequency (parameter 'every'). Ignoring it.", rt->name);
1376         return 0;
1377     }
1378
1379     if(unlikely(!RRDCALCTEMPLATE_HAS_CALCULATION(rt) && !rt->warning && !rt->critical)) {
1380         error("Health configuration for template '%s' is useless (no calculation, no warning and no critical evaluation)", rt->name);
1381         return 0;
1382     }
1383
1384     RRDCALCTEMPLATE *t, *last = NULL;
1385     for (t = host->templates; t ; last = t, t = t->next) {
1386         if(unlikely(t->hash_name == rt->hash_name && !strcmp(t->name, rt->name))) {
1387             error("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname);
1388             return 0;
1389         }
1390     }
1391
1392     debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1393           rt->name,
1394           (rt->context)?rt->context:"NONE",
1395           (rt->exec)?rt->exec:"DEFAULT",
1396           (rt->recipient)?rt->recipient:"DEFAULT",
1397           rt->green,
1398           rt->red,
1399           rt->group,
1400           rt->after,
1401           rt->before,
1402           rt->options,
1403           (rt->dimensions)?rt->dimensions:"NONE",
1404           rt->update_every,
1405           (rt->calculation)?rt->calculation->parsed_as:"NONE",
1406           (rt->warning)?rt->warning->parsed_as:"NONE",
1407           (rt->critical)?rt->critical->parsed_as:"NONE",
1408           rt->source,
1409           rt->delay_up_duration,
1410           rt->delay_down_duration,
1411           rt->delay_max_duration,
1412           rt->delay_multiplier
1413     );
1414
1415     if(likely(last)) {
1416         last->next = rt;
1417     }
1418     else {
1419         rt->next = host->templates;
1420         host->templates = rt;
1421     }
1422
1423     return 1;
1424 }
1425
/*
 * Parse a duration, i.e. a number optionally followed by a unit, and
 * store it in *result, expressed in seconds.
 *
 * The string has to start with a digit, '+' or '-'.
 * The character right after the number selects the unit:
 *
 *   Y = years (365 days), M = months (30 days), w = weeks, d = days,
 *   h = hours, m = minutes, s = seconds
 *
 * Without a unit, or with any other character after the number, the number
 * is taken as seconds. Fractional numbers are accepted (e.g. 1.5h); the
 * result is truncated to an integer.
 *
 * Returns 1 on success. Returns 0, with *result set to 0, when the string
 * does not start with a number, when no number could be converted at all
 * (e.g. "+" or "-x"), or when the number is not finite (e.g. "+inf").
 */
static inline int health_parse_duration(char *string, int *result) {
    *result = 0;

    // make sure it is a number
    // (the cast keeps isdigit() defined for bytes above 127)
    unsigned char first = (unsigned char)*string;
    if(!first || !(isdigit(first) || first == '+' || first == '-'))
        return 0;

    char *e = NULL;
    long double n = strtold(string, &e);

    // a lone sign, or a sign followed by garbage, is not a number
    if(e == string)
        return 0;

    // converting a non-finite number to int is undefined
    if(isnan(n) || isinf(n))
        return 0;

    long double seconds;
    switch((e) ? *e : '\0') {
        case 'Y':
            seconds = n * 86400 * 365;
            break;

        case 'M':
            seconds = n * 86400 * 30;
            break;

        case 'w':
            seconds = n * 86400 * 7;
            break;

        case 'd':
            seconds = n * 86400;
            break;

        case 'h':
            seconds = n * 3600;
            break;

        case 'm':
            seconds = n * 60;
            break;

        // no unit, or an unknown unit: seconds
        default:
        case 's':
            seconds = n;
            break;
    }

    *result = (int)seconds;
    return 1;
}
1467
/*
 * Parse the value of the 'delay' configuration line.
 *
 * 'string' is a list of whitespace separated "keyword value" pairs and it
 * is modified in place (the whitespace is replaced with string terminators).
 * The keywords (case insensitive) are:
 *
 *   up DURATION, down DURATION, max DURATION, multiplier NUMBER
 *
 * Invalid values and unknown keywords are logged ('line', 'path' and 'file'
 * are used only for these messages) and skipped.
 *
 * Settings not given (or given with an invalid value) get defaults:
 * up and down become 0, multiplier becomes 1.0, and max is raised - if it
 * is lower - to the largest of up and down, multiplied by the multiplier.
 *
 * Note that parsing stops at the first empty keyword, so a string that
 * begins with whitespace is handled like an empty one.
 *
 * Always returns 1.
 */
static inline int health_parse_delay(
        size_t line, const char *path, const char *file, char *string,
        int *delay_up_duration,
        int *delay_down_duration,
        int *delay_max_duration,
        float *delay_multiplier) {

    char given_up = 0;
    char given_down = 0;
    char given_max = 0;
    char given_multiplier = 0;

    // the casts to unsigned char keep isspace() defined for bytes above 127

    char *s = string;
    while(*s) {
        // isolate the keyword
        char *key = s;

        while(*s && !isspace((unsigned char)*s)) s++;
        while(*s && isspace((unsigned char)*s)) *s++ = '\0';

        if(!*key) break;

        // isolate its value
        char *value = s;
        while(*s && !isspace((unsigned char)*s)) s++;
        while(*s && isspace((unsigned char)*s)) *s++ = '\0';

        if(!strcasecmp(key, "up")) {
            if (!health_parse_duration(value, delay_up_duration)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_up = 1;
        }
        else if(!strcasecmp(key, "down")) {
            if (!health_parse_duration(value, delay_down_duration)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_down = 1;
        }
        else if(!strcasecmp(key, "multiplier")) {
            // the multiplier has to be a finite, positive number
            *delay_multiplier = strtof(value, NULL);
            if(isnan(*delay_multiplier) || isinf(*delay_multiplier) || islessequal(*delay_multiplier, 0)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_multiplier = 1;
        }
        else if(!strcasecmp(key, "max")) {
            if (!health_parse_duration(value, delay_max_duration)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_max = 1;
        }
        else {
            error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
                  line, path, file, key);
        }
    }

    // defaults for everything not given

    if(!given_up)
        *delay_up_duration = 0;

    if(!given_down)
        *delay_down_duration = 0;

    if(!given_multiplier)
        *delay_multiplier = 1.0;

    if(!given_max) {
        // max should not be lower than a delay multiplied once
        if((*delay_max_duration) < (*delay_up_duration) * (*delay_multiplier))
            *delay_max_duration = (*delay_up_duration) * (*delay_multiplier);

        if((*delay_max_duration) < (*delay_down_duration) * (*delay_multiplier))
            *delay_max_duration = (*delay_down_duration) * (*delay_multiplier);
    }

    return 1;
}
1547
1548 static inline int health_parse_db_lookup(
1549         size_t line, const char *path, const char *file, char *string,
1550         int *group_method, int *after, int *before, int *every,
1551         uint32_t *options, char **dimensions
1552 ) {
1553     debug(D_HEALTH, "Health configuration parsing database lookup %zu@%s/%s: %s", line, path, file, string);
1554
1555     if(*dimensions) freez(*dimensions);
1556     *dimensions = NULL;
1557     *after = 0;
1558     *before = 0;
1559     *every = 0;
1560     *options = 0;
1561
1562     char *s = string, *key;
1563
1564     // first is the group method
1565     key = s;
1566     while(*s && !isspace(*s)) s++;
1567     while(*s && isspace(*s)) *s++ = '\0';
1568     if(!*s) {
1569         error("Health configuration invalid chart calculation at line %zu of file '%s/%s': expected group method followed by the 'after' time, but got '%s'",
1570               line, path, file, key);
1571         return 0;
1572     }
1573
1574     if((*group_method = web_client_api_request_v1_data_group(key, -1)) == -1) {
1575         error("Health configuration at line %zu of file '%s/%s': invalid group method '%s'",
1576               line, path, file, key);
1577         return 0;
1578     }
1579
1580     // then is the 'after' time
1581     key = s;
1582     while(*s && !isspace(*s)) s++;
1583     while(*s && isspace(*s)) *s++ = '\0';
1584
1585     if(!health_parse_duration(key, after)) {
1586         error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' after group method",
1587               line, path, file, key);
1588         return 0;
1589     }
1590
1591     // sane defaults
1592     *every = abs(*after);
1593
1594     // now we may have optional parameters
1595     while(*s) {
1596         key = s;
1597         while(*s && !isspace(*s)) s++;
1598         while(*s && isspace(*s)) *s++ = '\0';
1599         if(!*key) break;
1600
1601         if(!strcasecmp(key, "at")) {
1602             char *value = s;
1603             while(*s && !isspace(*s)) s++;
1604             while(*s && isspace(*s)) *s++ = '\0';
1605
1606             if (!health_parse_duration(value, before)) {
1607                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1608                       line, path, file, value, key);
1609             }
1610         }
1611         else if(!strcasecmp(key, HEALTH_EVERY_KEY)) {
1612             char *value = s;
1613             while(*s && !isspace(*s)) s++;
1614             while(*s && isspace(*s)) *s++ = '\0';
1615
1616             if (!health_parse_duration(value, every)) {
1617                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1618                       line, path, file, value, key);
1619             }
1620         }
1621         else if(!strcasecmp(key, "absolute") || !strcasecmp(key, "abs") || !strcasecmp(key, "absolute_sum")) {
1622             *options |= RRDR_OPTION_ABSOLUTE;
1623         }
1624         else if(!strcasecmp(key, "min2max")) {
1625             *options |= RRDR_OPTION_MIN2MAX;
1626         }
1627         else if(!strcasecmp(key, "null2zero")) {
1628             *options |= RRDR_OPTION_NULL2ZERO;
1629         }
1630         else if(!strcasecmp(key, "percentage")) {
1631             *options |= RRDR_OPTION_PERCENTAGE;
1632         }
1633         else if(!strcasecmp(key, "unaligned")) {
1634             *options |= RRDR_OPTION_NOT_ALIGNED;
1635         }
1636         else if(!strcasecmp(key, "of")) {
1637             if(*s && strcasecmp(s, "all"))
1638                *dimensions = strdupz(s);
1639             break;
1640         }
1641         else {
1642             error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
1643                   line, path, file, key);
1644         }
1645     }
1646
1647     return 1;
1648 }
1649
// Replace every TAB character of the NUL-terminated string 's' with a single
// space, in place. Returns 's' itself, so the call can be nested around the
// expression that produced the string.
static inline char *tabs2spaces(char *s) {
    char *p;

    for(p = s; *p; p++)
        if(unlikely(*p == '\t'))
            *p = ' ';

    return s;
}
1659
// Format the "LINE@PATH/FILENAME" string that records where in the
// configuration a definition was found.
// Returns the strdupz() copy of the formatted text.
static inline char *health_source_file(size_t line, const char *path, const char *filename) {
    char source[FILENAME_MAX + 1];

    snprintfz(source, FILENAME_MAX, "%zu@%s/%s", line, path, filename);

    return strdupz(source);
}
1665
// Overwrite every single quote (') and double quote (") found in the
// NUL-terminated string 's' with a space, in place.
static inline void strip_quotes(char *s) {
    for( ; *s ; s++) {
        switch(*s) {
            case '\'':
            case '"':
                *s = ' ';
                break;

            default:
                break;
        }
    }
}
1672
1673 int health_readfile(const char *path, const char *filename) {
1674     debug(D_HEALTH, "Health configuration reading file '%s/%s'", path, filename);
1675
1676     static uint32_t hash_alarm = 0, hash_template = 0, hash_on = 0, hash_calc = 0, hash_green = 0, hash_red = 0, hash_warn = 0, hash_crit = 0, hash_exec = 0, hash_every = 0, hash_lookup = 0, hash_units = 0, hash_info = 0, hash_recipient = 0, hash_delay = 0;
1677     char buffer[HEALTH_CONF_MAX_LINE + 1];
1678
1679     if(unlikely(!hash_alarm)) {
1680         hash_alarm = simple_uhash(HEALTH_ALARM_KEY);
1681         hash_template = simple_uhash(HEALTH_TEMPLATE_KEY);
1682         hash_on = simple_uhash(HEALTH_ON_KEY);
1683         hash_calc = simple_uhash(HEALTH_CALC_KEY);
1684         hash_lookup = simple_uhash(HEALTH_LOOKUP_KEY);
1685         hash_green = simple_uhash(HEALTH_GREEN_KEY);
1686         hash_red = simple_uhash(HEALTH_RED_KEY);
1687         hash_warn = simple_uhash(HEALTH_WARN_KEY);
1688         hash_crit = simple_uhash(HEALTH_CRIT_KEY);
1689         hash_exec = simple_uhash(HEALTH_EXEC_KEY);
1690         hash_every = simple_uhash(HEALTH_EVERY_KEY);
1691         hash_units = simple_hash(HEALTH_UNITS_KEY);
1692         hash_info = simple_hash(HEALTH_INFO_KEY);
1693         hash_recipient = simple_hash(HEALTH_RECIPIENT_KEY);
1694         hash_delay = simple_uhash(HEALTH_DELAY_KEY);
1695     }
1696
1697     snprintfz(buffer, HEALTH_CONF_MAX_LINE, "%s/%s", path, filename);
1698     FILE *fp = fopen(buffer, "r");
1699     if(!fp) {
1700         error("Health configuration cannot read file '%s'.", buffer);
1701         return 0;
1702     }
1703
1704     RRDCALC *rc = NULL;
1705     RRDCALCTEMPLATE *rt = NULL;
1706
1707     size_t line = 0, append = 0;
1708     char *s;
1709     while((s = fgets(&buffer[append], (int)(HEALTH_CONF_MAX_LINE - append), fp)) || append) {
1710         int stop_appending = !s;
1711         line++;
1712         s = trim(buffer);
1713         if(!s) continue;
1714
1715         append = strlen(s);
1716         if(!stop_appending && s[append - 1] == '\\') {
1717             s[append - 1] = ' ';
1718             append = &s[append] - buffer;
1719             if(append < HEALTH_CONF_MAX_LINE)
1720                 continue;
1721             else {
1722                 error("Health configuration has too long muli-line at line %zu of file '%s/%s'.", line, path, filename);
1723             }
1724         }
1725         append = 0;
1726
1727         char *key = s;
1728         while(*s && *s != ':') s++;
1729         if(!*s) {
1730             error("Health configuration has invalid line %zu of file '%s/%s'. It does not contain a ':'. Ignoring it.", line, path, filename);
1731             continue;
1732         }
1733         *s = '\0';
1734         s++;
1735
1736         char *value = s;
1737         key = trim(key);
1738         value = trim(value);
1739
1740         if(!key) {
1741             error("Health configuration has invalid line %zu of file '%s/%s'. Keyword is empty. Ignoring it.", line, path, filename);
1742             continue;
1743         }
1744
1745         if(!value) {
1746             error("Health configuration has invalid line %zu of file '%s/%s'. value is empty. Ignoring it.", line, path, filename);
1747             continue;
1748         }
1749
1750         uint32_t hash = simple_uhash(key);
1751
1752         if(hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) {
1753             if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
1754                 rrdcalc_free(&localhost, rc);
1755
1756             if(rt) {
1757                 if (!rrdcalctemplate_add_template_from_config(&localhost, rt))
1758                     rrdcalctemplate_free(&localhost, rt);
1759                 rt = NULL;
1760             }
1761
1762             rc = callocz(1, sizeof(RRDCALC));
1763             rc->next_event_id = 1;
1764             rc->name = tabs2spaces(strdupz(value));
1765             rc->hash = simple_hash(rc->name);
1766             rc->source = health_source_file(line, path, filename);
1767             rc->green = NAN;
1768             rc->red = NAN;
1769             rc->value = NAN;
1770             rc->old_value = NAN;
1771             rc->delay_multiplier = 1.0;
1772
1773             if(rrdvar_fix_name(rc->name))
1774                 error("Health configuration renamed alarm '%s' to '%s'", value, rc->name);
1775         }
1776         else if(hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY)) {
1777             if(rc) {
1778                 if(!rrdcalc_add_alarm_from_config(&localhost, rc))
1779                     rrdcalc_free(&localhost, rc);
1780                 rc = NULL;
1781             }
1782
1783             if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
1784                 rrdcalctemplate_free(&localhost, rt);
1785
1786             rt = callocz(1, sizeof(RRDCALCTEMPLATE));
1787             rt->name = tabs2spaces(strdupz(value));
1788             rt->hash_name = simple_hash(rt->name);
1789             rt->source = health_source_file(line, path, filename);
1790             rt->green = NAN;
1791             rt->red = NAN;
1792             rt->delay_multiplier = 1.0;
1793
1794             if(rrdvar_fix_name(rt->name))
1795                 error("Health configuration renamed template '%s' to '%s'", value, rt->name);
1796         }
1797         else if(rc) {
1798             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
1799                 if(rc->chart) {
1800                     if(strcmp(rc->chart, value))
1801                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1802                              line, path, filename, rc->name, key, rc->chart, value, value);
1803
1804                     freez(rc->chart);
1805                 }
1806                 rc->chart = tabs2spaces(strdupz(value));
1807                 rc->hash_chart = simple_hash(rc->chart);
1808             }
1809             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
1810                 health_parse_db_lookup(line, path, filename, value, &rc->group, &rc->after, &rc->before,
1811                                        &rc->update_every,
1812                                        &rc->options, &rc->dimensions);
1813             }
1814             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
1815                 if(!health_parse_duration(value, &rc->update_every))
1816                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.",
1817                          line, path, filename, rc->name, key, value);
1818             }
1819             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
1820                 char *e;
1821                 rc->green = strtold(value, &e);
1822                 if(e && *e) {
1823                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1824                          line, path, filename, rc->name, key, e);
1825                 }
1826             }
1827             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
1828                 char *e;
1829                 rc->red = strtold(value, &e);
1830                 if(e && *e) {
1831                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1832                          line, path, filename, rc->name, key, e);
1833                 }
1834             }
1835             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
1836                 const char *failed_at = NULL;
1837                 int error = 0;
1838                 rc->calculation = expression_parse(value, &failed_at, &error);
1839                 if(!rc->calculation) {
1840                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1841                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1842                 }
1843             }
1844             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
1845                 const char *failed_at = NULL;
1846                 int error = 0;
1847                 rc->warning = expression_parse(value, &failed_at, &error);
1848                 if(!rc->warning) {
1849                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1850                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1851                 }
1852             }
1853             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
1854                 const char *failed_at = NULL;
1855                 int error = 0;
1856                 rc->critical = expression_parse(value, &failed_at, &error);
1857                 if(!rc->critical) {
1858                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1859                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1860                 }
1861             }
1862             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
1863                 if(rc->exec) {
1864                     if(strcmp(rc->exec, value))
1865                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1866                              line, path, filename, rc->name, key, rc->exec, value, value);
1867
1868                     freez(rc->exec);
1869                 }
1870                 rc->exec = tabs2spaces(strdupz(value));
1871             }
1872             else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
1873                 if(rc->recipient) {
1874                     if(strcmp(rc->recipient, value))
1875                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1876                              line, path, filename, rc->name, key, rc->recipient, value, value);
1877
1878                     freez(rc->recipient);
1879                 }
1880                 rc->recipient = tabs2spaces(strdupz(value));
1881             }
1882             else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
1883                 if(rc->units) {
1884                     if(strcmp(rc->units, value))
1885                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1886                              line, path, filename, rc->name, key, rc->units, value, value);
1887
1888                     freez(rc->units);
1889                 }
1890                 rc->units = tabs2spaces(strdupz(value));
1891                 strip_quotes(rc->units);
1892             }
1893             else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
1894                 if(rc->info) {
1895                     if(strcmp(rc->info, value))
1896                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1897                              line, path, filename, rc->name, key, rc->info, value, value);
1898
1899                     freez(rc->info);
1900                 }
1901                 rc->info = tabs2spaces(strdupz(value));
1902                 strip_quotes(rc->info);
1903             }
1904             else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
1905                 health_parse_delay(line, path, filename, value, &rc->delay_up_duration, &rc->delay_down_duration, &rc->delay_max_duration, &rc->delay_multiplier);
1906             }
1907             else {
1908                 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has unknown key '%s'.",
1909                      line, path, filename, rc->name, key);
1910             }
1911         }
1912         else if(rt) {
1913             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
1914                 if(rt->context) {
1915                     if(strcmp(rt->context, value))
1916                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1917                              line, path, filename, rt->name, key, rt->context, value, value);
1918
1919                     freez(rt->context);
1920                 }
1921                 rt->context = tabs2spaces(strdupz(value));
1922                 rt->hash_context = simple_hash(rt->context);
1923             }
1924             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
1925                 health_parse_db_lookup(line, path, filename, value, &rt->group, &rt->after, &rt->before,
1926                                        &rt->update_every,
1927                                        &rt->options, &rt->dimensions);
1928             }
1929             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
1930                 if(!health_parse_duration(value, &rt->update_every))
1931                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' cannot parse duration: '%s'.",
1932                          line, path, filename, rt->name, key, value);
1933             }
1934             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
1935                 char *e;
1936                 rt->green = strtold(value, &e);
1937                 if(e && *e) {
1938                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
1939                          line, path, filename, rt->name, key, e);
1940                 }
1941             }
1942             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
1943                 char *e;
1944                 rt->red = strtold(value, &e);
1945                 if(e && *e) {
1946                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
1947                          line, path, filename, rt->name, key, e);
1948                 }
1949             }
1950             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
1951                 const char *failed_at = NULL;
1952                 int error = 0;
1953                 rt->calculation = expression_parse(value, &failed_at, &error);
1954                 if(!rt->calculation) {
1955                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1956                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1957                 }
1958             }
1959             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
1960                 const char *failed_at = NULL;
1961                 int error = 0;
1962                 rt->warning = expression_parse(value, &failed_at, &error);
1963                 if(!rt->warning) {
1964                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1965                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1966                 }
1967             }
1968             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
1969                 const char *failed_at = NULL;
1970                 int error = 0;
1971                 rt->critical = expression_parse(value, &failed_at, &error);
1972                 if(!rt->critical) {
1973                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1974                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1975                 }
1976             }
1977             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
1978                 if(rt->exec) {
1979                     if(strcmp(rt->exec, value))
1980                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1981                              line, path, filename, rt->name, key, rt->exec, value, value);
1982
1983                     freez(rt->exec);
1984                 }
1985                 rt->exec = tabs2spaces(strdupz(value));
1986             }
1987             else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
1988                 if(rt->recipient) {
1989                     if(strcmp(rt->recipient, value))
1990                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1991                              line, path, filename, rt->name, key, rt->recipient, value, value);
1992
1993                     freez(rt->recipient);
1994                 }
1995                 rt->recipient = tabs2spaces(strdupz(value));
1996             }
1997             else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
1998                 if(rt->units) {
1999                     if(strcmp(rt->units, value))
2000                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2001                              line, path, filename, rt->name, key, rt->units, value, value);
2002
2003                     freez(rt->units);
2004                 }
2005                 rt->units = tabs2spaces(strdupz(value));
2006                 strip_quotes(rt->units);
2007             }
2008             else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
2009                 if(rt->info) {
2010                     if(strcmp(rt->info, value))
2011                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2012                              line, path, filename, rt->name, key, rt->info, value, value);
2013
2014                     freez(rt->info);
2015                 }
2016                 rt->info = tabs2spaces(strdupz(value));
2017                 strip_quotes(rt->info);
2018             }
2019             else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
2020                 health_parse_delay(line, path, filename, value, &rt->delay_up_duration, &rt->delay_down_duration, &rt->delay_max_duration, &rt->delay_multiplier);
2021             }
2022             else {
2023                 error("Health configuration at line %zu of file '%s/%s' for template '%s' has unknown key '%s'.",
2024                       line, path, filename, rt->name, key);
2025             }
2026         }
2027         else {
2028             error("Health configuration at line %zu of file '%s/%s' has unknown key '%s'. Expected either '" HEALTH_ALARM_KEY "' or '" HEALTH_TEMPLATE_KEY "'.",
2029                   line, path, filename, key);
2030         }
2031     }
2032
2033     if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
2034         rrdcalc_free(&localhost, rc);
2035
2036     if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
2037         rrdcalctemplate_free(&localhost, rt);
2038
2039     fclose(fp);
2040     return 1;
2041 }
2042
2043 void health_readdir(const char *path) {
2044     size_t pathlen = strlen(path);
2045
2046     debug(D_HEALTH, "Health configuration reading directory '%s'", path);
2047
2048     DIR *dir = opendir(path);
2049     if (!dir) {
2050         error("Health configuration cannot open directory '%s'.", path);
2051         return;
2052     }
2053
2054     struct dirent *de = NULL;
2055     while ((de = readdir(dir))) {
2056         size_t len = strlen(de->d_name);
2057
2058         if(de->d_type == DT_DIR
2059            && (
2060                    (de->d_name[0] == '.' && de->d_name[1] == '\0')
2061                    || (de->d_name[0] == '.' && de->d_name[1] == '.' && de->d_name[2] == '\0')
2062            )) {
2063             debug(D_HEALTH, "Ignoring directory '%s'", de->d_name);
2064             continue;
2065         }
2066
2067         else if(de->d_type == DT_DIR) {
2068             char *s = mallocz(pathlen + strlen(de->d_name) + 2);
2069             strcpy(s, path);
2070             strcat(s, "/");
2071             strcat(s, de->d_name);
2072             health_readdir(s);
2073             freez(s);
2074             continue;
2075         }
2076
2077         else if((de->d_type == DT_LNK || de->d_type == DT_REG || de->d_type == DT_UNKNOWN) &&
2078                 len > 5 && !strcmp(&de->d_name[len - 5], ".conf")) {
2079             health_readfile(path, de->d_name);
2080         }
2081
2082         else debug(D_HEALTH, "Ignoring file '%s'", de->d_name);
2083     }
2084
2085     closedir(dir);
2086 }
2087
2088 static inline char *health_config_dir(void) {
2089     char buffer[FILENAME_MAX + 1];
2090     snprintfz(buffer, FILENAME_MAX, "%s/health.d", config_get("global", "config directory", CONFIG_DIR));
2091     return config_get("health", "health configuration directory", buffer);
2092 }
2093
2094 void health_init(void) {
2095     debug(D_HEALTH, "Health configuration initializing");
2096
2097     if(!(health_enabled = config_get_boolean("health", "enabled", 1))) {
2098         debug(D_HEALTH, "Health is disabled.");
2099         return;
2100     }
2101
2102     char *pathname = config_get("health", "health db directory", VARLIB_DIR "/health");
2103     if(mkdir(pathname, 0770) == -1 && errno != EEXIST)
2104         fatal("Cannot create directory '%s'.", pathname);
2105
2106     char filename[FILENAME_MAX + 1];
2107     snprintfz(filename, FILENAME_MAX, "%s/health-log.db", pathname);
2108     health.log_filename = config_get("health", "health db file", filename);
2109
2110     health_alarm_log_load(&localhost);
2111     health_alarm_log_open();
2112
2113     char *path = health_config_dir();
2114
2115     {
2116         char buffer[FILENAME_MAX + 1];
2117         snprintfz(buffer, FILENAME_MAX, "%s/alarm-notify.sh", config_get("global", "plugins directory", PLUGINS_DIR));
2118         health.health_default_exec = config_get("health", "script to execute on alarm", buffer);
2119     }
2120
2121     long n = config_get_number("health", "in memory max health log entries", (long)localhost.health_log.max);
2122     if(n < 10) {
2123         error("Health configuration has invalid max log entries %ld. Using default %u", n, localhost.health_log.max);
2124         config_set_number("health", "in memory max health log entries", (long)localhost.health_log.max);
2125     }
2126     else localhost.health_log.max = (unsigned int)n;
2127
2128     rrdhost_rwlock(&localhost);
2129     health_readdir(path);
2130     rrdhost_unlock(&localhost);
2131 }
2132
2133 // ----------------------------------------------------------------------------
2134 // JSON generation
2135
2136 static inline void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) {
2137     if(value && *value)
2138         buffer_sprintf(wb, "%s\"%s\":\"%s\"%s", prefix, label, value, suffix);
2139     else
2140         buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix);
2141 }
2142
2143 static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) {
2144     buffer_sprintf(wb, "\n\t{\n"
2145                            "\t\t\"hostname\": \"%s\",\n"
2146                            "\t\t\"unique_id\": %u,\n"
2147                            "\t\t\"alarm_id\": %u,\n"
2148                            "\t\t\"alarm_event_id\": %u,\n"
2149                            "\t\t\"name\": \"%s\",\n"
2150                            "\t\t\"chart\": \"%s\",\n"
2151                            "\t\t\"family\": \"%s\",\n"
2152                            "\t\t\"processed\": %s,\n"
2153                            "\t\t\"updated\": %s,\n"
2154                            "\t\t\"exec_run\": %lu,\n"
2155                            "\t\t\"exec_failed\": %s,\n"
2156                            "\t\t\"exec\": \"%s\",\n"
2157                            "\t\t\"recipient\": \"%s\",\n"
2158                            "\t\t\"exec_code\": %d,\n"
2159                            "\t\t\"source\": \"%s\",\n"
2160                            "\t\t\"units\": \"%s\",\n"
2161                            "\t\t\"info\": \"%s\",\n"
2162                            "\t\t\"when\": %lu,\n"
2163                            "\t\t\"duration\": %lu,\n"
2164                            "\t\t\"non_clear_duration\": %lu,\n"
2165                            "\t\t\"status\": \"%s\",\n"
2166                            "\t\t\"old_status\": \"%s\",\n"
2167                            "\t\t\"delay\": %d,\n"
2168                            "\t\t\"delay_up_to_timestamp\": %lu,\n"
2169                            "\t\t\"updated_by_id\": %u,\n"
2170                            "\t\t\"updates_id\": %u,\n",
2171                    host->hostname,
2172                    ae->unique_id,
2173                    ae->alarm_id,
2174                    ae->alarm_event_id,
2175                    ae->name,
2176                    ae->chart,
2177                    ae->family,
2178                    (ae->flags & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false",
2179                    (ae->flags & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false",
2180                    (unsigned long)ae->exec_run_timestamp,
2181                    (ae->flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false",
2182                    ae->exec?ae->exec:health.health_default_exec,
2183                    ae->recipient?ae->recipient:health.health_default_recipient,
2184                    ae->exec_code,
2185                    ae->source,
2186                    ae->units?ae->units:"",
2187                    ae->info?ae->info:"",
2188                    (unsigned long)ae->when,
2189                    (unsigned long)ae->duration,
2190                    (unsigned long)ae->non_clear_duration,
2191                    rrdcalc_status2string(ae->new_status),
2192                    rrdcalc_status2string(ae->old_status),
2193                    ae->delay,
2194                    (unsigned long)ae->delay_up_to_timestamp,
2195                    ae->updated_by_id,
2196                    ae->updates_id
2197     );
2198
2199     buffer_strcat(wb, "\t\t\"value\":");
2200     buffer_rrd_value(wb, ae->new_value);
2201     buffer_strcat(wb, ",\n");
2202
2203     buffer_strcat(wb, "\t\t\"old_value\":");
2204     buffer_rrd_value(wb, ae->old_value);
2205     buffer_strcat(wb, "\n");
2206
2207     buffer_strcat(wb, "\t}");
2208 }
2209
2210 void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after) {
2211     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
2212
2213     buffer_strcat(wb, "[");
2214
2215     unsigned int max = host->health_log.max;
2216     unsigned int count = 0;
2217     ALARM_ENTRY *ae;
2218     for(ae = host->health_log.alarms; ae && count < max ; count++, ae = ae->next) {
2219         if(ae->unique_id > after) {
2220             if(likely(count)) buffer_strcat(wb, ",");
2221             health_alarm_entry2json_nolock(wb, ae, host);
2222         }
2223     }
2224
2225     buffer_strcat(wb, "\n]\n");
2226
2227     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2228 }
2229
2230 static inline void health_rrdcalc2json_nolock(BUFFER *wb, RRDCALC *rc) {
2231     buffer_sprintf(wb,
2232            "\t\t\"%s.%s\": {\n"
2233                    "\t\t\t\"id\": %lu,\n"
2234                    "\t\t\t\"name\": \"%s\",\n"
2235                    "\t\t\t\"chart\": \"%s\",\n"
2236                    "\t\t\t\"family\": \"%s\",\n"
2237                    "\t\t\t\"active\": %s,\n"
2238                    "\t\t\t\"exec\": \"%s\",\n"
2239                    "\t\t\t\"recipient\": \"%s\",\n"
2240                    "\t\t\t\"source\": \"%s\",\n"
2241                    "\t\t\t\"units\": \"%s\",\n"
2242                    "\t\t\t\"info\": \"%s\",\n"
2243                                    "\t\t\t\"status\": \"%s\",\n"
2244                    "\t\t\t\"last_status_change\": %lu,\n"
2245                    "\t\t\t\"last_updated\": %lu,\n"
2246                    "\t\t\t\"next_update\": %lu,\n"
2247                    "\t\t\t\"update_every\": %d,\n"
2248                    "\t\t\t\"delay_up_duration\": %d,\n"
2249                    "\t\t\t\"delay_down_duration\": %d,\n"
2250                    "\t\t\t\"delay_max_duration\": %d,\n"
2251                    "\t\t\t\"delay_multiplier\": %f,\n"
2252                    "\t\t\t\"delay\": %d,\n"
2253                    "\t\t\t\"delay_up_to_timestamp\": %lu,\n"
2254             , rc->chart, rc->name
2255             , (unsigned long)rc->id
2256             , rc->name
2257             , rc->chart
2258             , (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:""
2259             , (rc->rrdset)?"true":"false"
2260             , rc->exec?rc->exec:health.health_default_exec
2261             , rc->recipient?rc->recipient:health.health_default_recipient
2262             , rc->source
2263             , rc->units?rc->units:""
2264             , rc->info?rc->info:""
2265             , rrdcalc_status2string(rc->status)
2266             , (unsigned long)rc->last_status_change
2267             , (unsigned long)rc->last_updated
2268             , (unsigned long)rc->next_update
2269             , rc->update_every
2270             , rc->delay_up_duration
2271             , rc->delay_down_duration
2272             , rc->delay_max_duration
2273             , rc->delay_multiplier
2274             , rc->delay_last
2275             , (unsigned long)rc->delay_up_to_timestamp
2276     );
2277
2278     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
2279         if(rc->dimensions && *rc->dimensions)
2280             health_string2json(wb, "\t\t\t", "lookup_dimensions", rc->dimensions, ",\n");
2281
2282         buffer_sprintf(wb,
2283                        "\t\t\t\"db_after\": %lu,\n"
2284                        "\t\t\t\"db_before\": %lu,\n"
2285                        "\t\t\t\"lookup_method\": \"%s\",\n"
2286                        "\t\t\t\"lookup_after\": %d,\n"
2287                        "\t\t\t\"lookup_before\": %d,\n"
2288                        "\t\t\t\"lookup_options\": \"",
2289                        (unsigned long) rc->db_after,
2290                        (unsigned long) rc->db_before,
2291                        group_method2string(rc->group),
2292                        rc->after,
2293                        rc->before
2294         );
2295         buffer_data_options2string(wb, rc->options);
2296         buffer_strcat(wb, "\",\n");
2297     }
2298
2299     if(rc->calculation) {
2300         health_string2json(wb, "\t\t\t", "calc", rc->calculation->source, ",\n");
2301         health_string2json(wb, "\t\t\t", "calc_parsed", rc->calculation->parsed_as, ",\n");
2302     }
2303
2304     if(rc->warning) {
2305         health_string2json(wb, "\t\t\t", "warn", rc->warning->source, ",\n");
2306         health_string2json(wb, "\t\t\t", "warn_parsed", rc->warning->parsed_as, ",\n");
2307     }
2308
2309     if(rc->critical) {
2310         health_string2json(wb, "\t\t\t", "crit", rc->critical->source, ",\n");
2311         health_string2json(wb, "\t\t\t", "crit_parsed", rc->critical->parsed_as, ",\n");
2312     }
2313
2314     buffer_strcat(wb, "\t\t\t\"green\":");
2315     buffer_rrd_value(wb, rc->green);
2316     buffer_strcat(wb, ",\n");
2317
2318     buffer_strcat(wb, "\t\t\t\"red\":");
2319     buffer_rrd_value(wb, rc->red);
2320     buffer_strcat(wb, ",\n");
2321
2322     buffer_strcat(wb, "\t\t\t\"value\":");
2323     buffer_rrd_value(wb, rc->value);
2324     buffer_strcat(wb, "\n");
2325
2326     buffer_strcat(wb, "\t\t}");
2327 }
2328
2329 //void health_rrdcalctemplate2json_nolock(BUFFER *wb, RRDCALCTEMPLATE *rt) {
2330 //
2331 //}
2332
2333 void health_alarms2json(RRDHOST *host, BUFFER *wb, int all) {
2334     int i;
2335
2336     rrdhost_rdlock(&localhost);
2337     buffer_sprintf(wb, "{\n\t\"hostname\": \"%s\","
2338                         "\n\t\"latest_alarm_log_unique_id\": %u,"
2339                         "\n\t\"status\": %s,"
2340                         "\n\t\"now\": %lu,"
2341                         "\n\t\"alarms\": {\n",
2342                         host->hostname,
2343                         (host->health_log.next_log_id > 0)?(host->health_log.next_log_id - 1):0,
2344                         health_enabled?"true":"false",
2345                         (unsigned long)time(NULL));
2346
2347     RRDCALC *rc;
2348     for(i = 0, rc = host->alarms; rc ; rc = rc->next) {
2349         if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
2350             continue;
2351
2352         if(likely(!all && !(rc->status == RRDCALC_STATUS_WARNING || rc->status == RRDCALC_STATUS_CRITICAL)))
2353             continue;
2354
2355         if(likely(i)) buffer_strcat(wb, ",\n");
2356         health_rrdcalc2json_nolock(wb, rc);
2357         i++;
2358     }
2359
2360 //    buffer_strcat(wb, "\n\t},\n\t\"templates\": {");
2361 //    RRDCALCTEMPLATE *rt;
2362 //    for(rt = host->templates; rt ; rt = rt->next)
2363 //        health_rrdcalctemplate2json_nolock(wb, rt);
2364
2365     buffer_strcat(wb, "\n\t}\n}\n");
2366     rrdhost_unlock(&localhost);
2367 }
2368
2369
2370 // ----------------------------------------------------------------------------
2371 // re-load health configuration
2372
2373 static inline void health_free_all_nolock(RRDHOST *host) {
2374     while(host->templates)
2375         rrdcalctemplate_free(host, host->templates);
2376
2377     while(host->alarms)
2378         rrdcalc_free(host, host->alarms);
2379 }
2380
2381 void health_reload(void) {
2382     if(!health_enabled) {
2383         error("Health reload is requested, but health is not enabled.");
2384         return;
2385     }
2386
2387     char *path = health_config_dir();
2388
2389     // free all running alarms
2390     rrdhost_rwlock(&localhost);
2391     health_free_all_nolock(&localhost);
2392     rrdhost_unlock(&localhost);
2393
2394     // invalidate all previous entries in the alarm log
2395     ALARM_ENTRY *t;
2396     for(t = localhost.health_log.alarms ; t ; t = t->next) {
2397         if(t->new_status != RRDCALC_STATUS_REMOVED)
2398             t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
2399     }
2400
2401     // reset all thresholds to all charts
2402     RRDSET *st;
2403     for(st = localhost.rrdset_root; st ; st = st->next) {
2404         st->green = NAN;
2405         st->red = NAN;
2406     }
2407
2408     // load the new alarms
2409     rrdhost_rwlock(&localhost);
2410     health_readdir(path);
2411     rrdhost_unlock(&localhost);
2412
2413     // link the loaded alarms to their charts
2414     for(st = localhost.rrdset_root; st ; st = st->next) {
2415         rrdhost_rwlock(&localhost);
2416
2417         rrdsetcalc_link_matching(st);
2418         rrdcalctemplate_link_matching(st);
2419
2420         rrdhost_unlock(&localhost);
2421     }
2422 }
2423
2424 // ----------------------------------------------------------------------------
2425 // health main thread and friends
2426
2427 static inline int rrdcalc_value2status(calculated_number n) {
2428     if(isnan(n)) return RRDCALC_STATUS_UNDEFINED;
2429     if(n) return RRDCALC_STATUS_RAISED;
2430     return RRDCALC_STATUS_CLEAR;
2431 }
2432
2433 static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
2434     ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
2435
2436     if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
2437         // do not send notifications for internal statuses
2438         goto done;
2439     }
2440
2441     // find the previous notification for the same alarm
2442     // which we have run the exec script
2443     ALARM_ENTRY *t;
2444     for(t = ae->next; t ;t = t->next) {
2445         if(t->alarm_id == ae->alarm_id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
2446             break;
2447     }
2448
2449     if(likely(t)) {
2450         // we have executed this alarm notification in the past
2451         if (t && t->new_status == ae->new_status) {
2452             // don't send the same notification again
2453             debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name,
2454                  rrdcalc_status2string(ae->new_status));
2455             goto done;
2456         }
2457     }
2458     else {
2459         // we have not executed this alarm notification in the past
2460         if(unlikely(ae->old_status == RRDCALC_STATUS_UNINITIALIZED && ae->new_status == RRDCALC_STATUS_CLEAR)) {
2461             debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
2462             goto done;
2463         }
2464     }
2465
2466     char buffer[FILENAME_MAX + 1];
2467     pid_t command_pid;
2468
2469     const char *exec = ae->exec;
2470     if(!exec) exec = health.health_default_exec;
2471
2472     const char *recipient = ae->recipient;
2473     if(!recipient) recipient = health.health_default_recipient;
2474
2475     snprintfz(buffer, FILENAME_MAX, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s'",
2476               exec,
2477               recipient,
2478               host->hostname,
2479               ae->unique_id,
2480               ae->alarm_id,
2481               ae->alarm_event_id,
2482               (unsigned long)ae->when,
2483               ae->name,
2484               ae->chart?ae->chart:"NOCAHRT",
2485               ae->family?ae->family:"NOFAMILY",
2486               rrdcalc_status2string(ae->new_status),
2487               rrdcalc_status2string(ae->old_status),
2488               ae->new_value,
2489               ae->old_value,
2490               ae->source?ae->source:"UNKNOWN",
2491               (uint32_t)ae->duration,
2492               (uint32_t)ae->non_clear_duration,
2493               ae->units?ae->units:"",
2494               ae->info?ae->info:""
2495     );
2496
2497     ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
2498     ae->exec_run_timestamp = time(NULL);
2499
2500     debug(D_HEALTH, "executing command '%s'", buffer);
2501     FILE *fp = mypopen(buffer, &command_pid);
2502     if(!fp) {
2503         error("HEALTH: Cannot popen(\"%s\", \"r\").", buffer);
2504         goto done;
2505     }
2506     debug(D_HEALTH, "HEALTH reading from command");
2507     char *s = fgets(buffer, FILENAME_MAX, fp);
2508     (void)s;
2509     ae->exec_code = mypclose(fp, command_pid);
2510     debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
2511
2512     if(ae->exec_code != 0)
2513         ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
2514
2515 done:
2516     health_alarm_log_save(host, ae);
2517     return;
2518 }
2519
2520 static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
2521     debug(D_HEALTH, "Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
2522          ae->chart?ae->chart:"NOCHART", ae->name,
2523          ae->new_value,
2524          rrdcalc_status2string(ae->old_status),
2525          rrdcalc_status2string(ae->new_status)
2526     );
2527
2528     health_alarm_execute(host, ae);
2529 }
2530
2531 static inline void health_alarm_log_process(RRDHOST *host) {
2532     static uint32_t stop_at_id = 0;
2533     uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
2534     time_t now = time(NULL);
2535
2536     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
2537
2538     ALARM_ENTRY *ae;
2539     for(ae = host->health_log.alarms; ae && ae->unique_id >= stop_at_id ; ae = ae->next) {
2540         if(unlikely(
2541             !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
2542             !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
2543             )) {
2544
2545             if(unlikely(ae->unique_id < first_waiting))
2546                 first_waiting = ae->unique_id;
2547
2548             if(likely(now >= ae->delay_up_to_timestamp))
2549                 health_process_notifications(host, ae);
2550         }
2551     }
2552
2553     // remember this for the next iteration
2554     stop_at_id = first_waiting;
2555
2556     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2557
2558     if(host->health_log.count <= host->health_log.max)
2559         return;
2560
2561     // cleanup excess entries in the log
2562     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
2563
2564     ALARM_ENTRY *last = NULL;
2565     unsigned int count = host->health_log.max * 2 / 3;
2566     for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
2567
2568     if(ae && last && last->next == ae)
2569         last->next = NULL;
2570     else
2571         ae = NULL;
2572
2573     while(ae) {
2574         debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
2575
2576         ALARM_ENTRY *t = ae->next;
2577
2578         freez(ae->name);
2579         freez(ae->chart);
2580         freez(ae->family);
2581         freez(ae->exec);
2582         freez(ae->recipient);
2583         freez(ae->source);
2584         freez(ae->units);
2585         freez(ae->info);
2586         freez(ae);
2587
2588         ae = t;
2589         host->health_log.count--;
2590     }
2591
2592     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2593 }
2594
2595 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
2596     if(unlikely(!rc->rrdset)) {
2597         debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
2598         return 0;
2599     }
2600
2601     if(unlikely(rc->next_update > now)) {
2602         if (unlikely(*next_run > rc->next_update)) {
2603             // update the next_run time of the main loop
2604             // to run this alarm precisely the time required
2605             *next_run = rc->next_update;
2606         }
2607
2608         debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
2609         return 0;
2610     }
2611
2612     if(unlikely(!rc->update_every)) {
2613         debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
2614         return 0;
2615     }
2616
2617     if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
2618         debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
2619         return 0;
2620     }
2621
2622     int update_every = rc->rrdset->update_every;
2623     time_t first = rrdset_first_entry_t(rc->rrdset);
2624     time_t last = rrdset_last_entry_t(rc->rrdset);
2625
2626     if(unlikely(now + update_every < first /* || now - update_every > last */)) {
2627         debug(D_HEALTH
2628               , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
2629               , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first
2630               , (unsigned long) last);
2631         return 0;
2632     }
2633
2634     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
2635         time_t needed = now + rc->before + rc->after;
2636
2637         if(needed + update_every < first || needed - update_every > last) {
2638             debug(D_HEALTH
2639                   , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
2640                   , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first
2641                   , (unsigned long) last);
2642             return 0;
2643         }
2644     }
2645
2646     return 1;
2647 }
2648
2649 void *health_main(void *ptr) {
2650     (void)ptr;
2651
2652     info("HEALTH thread created with task id %d", gettid());
2653
2654     if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
2655         error("Cannot set pthread cancel type to DEFERRED.");
2656
2657     if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
2658         error("Cannot set pthread cancel state to ENABLE.");
2659
2660     int min_run_every = (int)config_get_number("health", "run at least every seconds", 10);
2661     if(min_run_every < 1) min_run_every = 1;
2662
2663     BUFFER *wb = buffer_create(100);
2664
2665     unsigned int loop = 0;
2666     while(health_enabled && !netdata_exit) {
2667         loop++;
2668         debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
2669
2670         int oldstate, runnable = 0;
2671         time_t now = time(NULL);
2672         time_t next_run = now + min_run_every;
2673         RRDCALC *rc;
2674
2675         if(unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
2676             error("Cannot set pthread cancel state to DISABLE.");
2677
2678         rrdhost_rdlock(&localhost);
2679
2680         // the first loop is to lookup values from the db
2681         for(rc = localhost.alarms; rc; rc = rc->next) {
2682             if(unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
2683                 if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))
2684                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE;
2685                 continue;
2686             }
2687
2688             runnable++;
2689             rc->old_value = rc->value;
2690             rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE;
2691
2692             // 1. if there is database lookup, do it
2693             // 2. if there is calculation expression, run it
2694
2695             if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
2696                 /* time_t old_db_timestamp = rc->db_before; */
2697                 int value_is_null = 0;
2698
2699                 int ret = rrd2value(rc->rrdset, wb, &rc->value,
2700                                     rc->dimensions, 1, rc->after, rc->before, rc->group,
2701                                     rc->options, &rc->db_after, &rc->db_before, &value_is_null);
2702
2703                 if (unlikely(ret != 200)) {
2704                     // database lookup failed
2705                     rc->value = NAN;
2706
2707                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
2708
2709                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) {
2710                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
2711                         error("Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
2712                     }
2713                 }
2714                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))
2715                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
2716
2717                 /* - RRDCALC_FLAG_DB_STALE not currently used
2718                 if (unlikely(old_db_timestamp == rc->db_before)) {
2719                     // database is stale
2720
2721                     debug(D_HEALTH, "Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
2722
2723                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
2724                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
2725                         error("Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
2726                     }
2727                 }
2728                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
2729                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
2730                 */
2731
2732                 if (unlikely(value_is_null)) {
2733                     // collected value is null
2734
2735                     rc->value = NAN;
2736
2737                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
2738                           rc->chart?rc->chart:"NOCHART", rc->name);
2739
2740                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) {
2741                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
2742                         error("Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
2743                               rc->chart?rc->chart:"NOCHART", rc->name);
2744                     }
2745                 }
2746                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))
2747                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
2748
2749                 debug(D_HEALTH, "Health alarm '%s.%s': database lookup gave value "
2750                         CALCULATED_NUMBER_FORMAT, rc->chart?rc->chart:"NOCHART", rc->name, rc->value);
2751             }
2752
2753             if(unlikely(rc->calculation)) {
2754                 if (unlikely(!expression_evaluate(rc->calculation))) {
2755                     // calculation failed
2756
2757                     rc->value = NAN;
2758
2759                     debug(D_HEALTH, "Health alarm '%s.%s': expression '%s' failed: %s",
2760                           rc->chart?rc->chart:"NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
2761
2762                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) {
2763                         rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
2764                         error("Health alarm '%s.%s': expression '%s' failed: %s",
2765                               rc->chart?rc->chart:"NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
2766                     }
2767                 }
2768                 else {
2769                     if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))
2770                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
2771
2772                     debug(D_HEALTH, "Health alarm '%s.%s': expression '%s' gave value "
2773                             CALCULATED_NUMBER_FORMAT
2774                             ": %s (source: %s)",
2775                           rc->chart?rc->chart:"NOCHART", rc->name,
2776                           rc->calculation->parsed_as,
2777                           rc->calculation->result,
2778                           buffer_tostring(rc->calculation->error_msg),
2779                           rc->source
2780                     );
2781
2782                     rc->value = rc->calculation->result;
2783                 }
2784             }
2785         }
2786         rrdhost_unlock(&localhost);
2787
2788         if(unlikely(runnable && !netdata_exit)) {
2789             rrdhost_rdlock(&localhost);
2790
2791             for(rc = localhost.alarms; rc; rc = rc->next) {
2792                 if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
2793                     continue;
2794
2795                 int warning_status  = RRDCALC_STATUS_UNDEFINED;
2796                 int critical_status = RRDCALC_STATUS_UNDEFINED;
2797
2798                 if(likely(rc->warning)) {
2799                     if(unlikely(!expression_evaluate(rc->warning))) {
2800                         // calculation failed
2801
2802                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression failed with error: %s",
2803                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
2804
2805                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) {
2806                             rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
2807                             error("Health alarm '%s.%s': warning expression failed with error: %s",
2808                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
2809                         }
2810                     }
2811                     else {
2812                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))
2813                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
2814
2815                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression gave value "
2816                                 CALCULATED_NUMBER_FORMAT
2817                                 ": %s (source: %s)",
2818                               rc->chart?rc->chart:"NOCHART", rc->name,
2819                               rc->warning->result,
2820                               buffer_tostring(rc->warning->error_msg),
2821                               rc->source
2822                         );
2823
2824                         warning_status = rrdcalc_value2status(rc->warning->result);
2825                     }
2826                 }
2827
2828                 if(likely(rc->critical)) {
2829                     if(unlikely(!expression_evaluate(rc->critical))) {
2830                         // calculation failed
2831
2832                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression failed with error: %s",
2833                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
2834
2835                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) {
2836                             rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
2837                             error("Health alarm '%s.%s': critical expression failed with error: %s",
2838                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
2839                         }
2840                     }
2841                     else {
2842                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))
2843                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
2844
2845                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression gave value "
2846                                 CALCULATED_NUMBER_FORMAT
2847                                 ": %s (source: %s)",
2848                               rc->chart?rc->chart:"NOCHART", rc->name,
2849                               rc->critical->result,
2850                               buffer_tostring(rc->critical->error_msg),
2851                               rc->source
2852                         );
2853
2854                         critical_status = rrdcalc_value2status(rc->critical->result);
2855                     }
2856                 }
2857
2858                 int status = RRDCALC_STATUS_UNDEFINED;
2859
2860                 switch(warning_status) {
2861                     case RRDCALC_STATUS_CLEAR:
2862                         status = RRDCALC_STATUS_CLEAR;
2863                         break;
2864
2865                     case RRDCALC_STATUS_RAISED:
2866                         status = RRDCALC_STATUS_WARNING;
2867                         break;
2868
2869                     default:
2870                         break;
2871                 }
2872
2873                 switch(critical_status) {
2874                     case RRDCALC_STATUS_CLEAR:
2875                         if(status == RRDCALC_STATUS_UNDEFINED)
2876                             status = RRDCALC_STATUS_CLEAR;
2877                         break;
2878
2879                     case RRDCALC_STATUS_RAISED:
2880                         status = RRDCALC_STATUS_CRITICAL;
2881                         break;
2882
2883                     default:
2884                         break;
2885                 }
2886
2887                 if(status != rc->status) {
2888                     int delay = 0;
2889
2890                     if(now > rc->delay_up_to_timestamp) {
2891                         rc->delay_up_current = rc->delay_up_duration;
2892                         rc->delay_down_current = rc->delay_down_duration;
2893                         rc->delay_last = 0;
2894                         rc->delay_up_to_timestamp = 0;
2895                     }
2896                     else {
2897                         rc->delay_up_current = (int)(rc->delay_up_current * rc->delay_multiplier);
2898                         if(rc->delay_up_current > rc->delay_max_duration) rc->delay_up_current = rc->delay_max_duration;
2899
2900                         rc->delay_down_current = (int)(rc->delay_down_current * rc->delay_multiplier);
2901                         if(rc->delay_down_current > rc->delay_max_duration) rc->delay_down_current = rc->delay_max_duration;
2902                     }
2903
2904                     if(status > rc->status)
2905                         delay = rc->delay_up_current;
2906                     else
2907                         delay = rc->delay_down_current;
2908
2909                     // COMMENTED: because we do need to send raising alarms
2910                     // if(now + delay < rc->delay_up_to_timestamp)
2911                     //    delay = (int)(rc->delay_up_to_timestamp - now);
2912
2913                     rc->delay_last = delay;
2914                     rc->delay_up_to_timestamp = now + delay;
2915                     health_alarm_log(&localhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, rc->delay_last);
2916                     rc->last_status_change = now;
2917                     rc->status = status;
2918                 }
2919
2920                 rc->last_updated = now;
2921                 rc->next_update = now + rc->update_every;
2922
2923                 if (next_run > rc->next_update)
2924                     next_run = rc->next_update;
2925             }
2926
2927             rrdhost_unlock(&localhost);
2928         }
2929
2930         if (unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
2931             error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
2932
2933         if(unlikely(netdata_exit))
2934             break;
2935
2936         // execute notifications
2937         // and cleanup
2938         health_alarm_log_process(&localhost);
2939
2940         if(unlikely(netdata_exit))
2941             break;
2942         
2943         now = time(NULL);
2944         if(now < next_run) {
2945             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs",
2946                   loop, (int) (next_run - now));
2947             sleep_usec(1000000 * (unsigned long long) (next_run - now));
2948         }
2949         else {
2950             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
2951         }
2952     }
2953
2954     buffer_free(wb);
2955
2956     info("HEALTH thread exiting");
2957     pthread_exit(NULL);
2958     return NULL;
2959 }