]> arthur.barton.de Git - netdata.git/blob - src/health.c
dimensions should also be referred to as context.dimension under the same family
[netdata.git] / src / health.c
1 #include "common.h"
2
3 #define RRDVAR_MAX_LENGTH 1024
4
5 struct health_options {
6     const char *health_default_exec;
7     const char *health_default_recipient;
8     const char *log_filename;
9     size_t log_entries_written;
10     FILE *log_fp;
11 };
12
13 static struct health_options health = {
14     .health_default_exec = PLUGINS_DIR "/alarm-notify.sh",
15     .health_default_recipient = "root",
16     .log_filename = VARLIB_DIR "/health/alarm_log.db",
17     .log_entries_written = 0,
18     .log_fp = NULL
19 };
20
21 int health_enabled = 1;
22
23 // ----------------------------------------------------------------------------
24 // health alarm log load/save
25 // no need for locking - only one thread is reading / writing the alarms log
26
27 static inline int health_alarm_log_open(void) {
28     if(health.log_fp)
29         fclose(health.log_fp);
30
31     health.log_fp = fopen(health.log_filename, "a");
32
33     if(health.log_fp) {
34         if (setvbuf(health.log_fp, NULL, _IOLBF, 0) != 0)
35             error("Health: cannot set line buffering on health log file.");
36         return 0;
37     }
38
39     error("Health: cannot open health log file '%s'. Health data will be lost in case of netdata or server crash.", health.log_filename);
40     return -1;
41 }
42
43 static inline void health_alarm_log_close(void) {
44     if(health.log_fp) {
45         fclose(health.log_fp);
46         health.log_fp = NULL;
47     }
48 }
49
50 static inline void health_log_rotate(void) {
51     static size_t rotate_every = 0;
52
53     if(unlikely(rotate_every == 0)) {
54         rotate_every = (size_t)config_get_number("health", "rotate log every lines", 2000);
55         if(rotate_every < 100) rotate_every = 100;
56     }
57
58     if(unlikely(health.log_entries_written > rotate_every)) {
59         health_alarm_log_close();
60
61         char old_filename[FILENAME_MAX + 1];
62         snprintfz(old_filename, FILENAME_MAX, "%s.old", health.log_filename);
63
64         if(unlink(old_filename) == -1 && errno != ENOENT)
65             error("Health: cannot remove old alarms log file '%s'", old_filename);
66
67         if(link(health.log_filename, old_filename) == -1 && errno != ENOENT)
68             error("Health: cannot move file '%s' to '%s'.", health.log_filename, old_filename);
69
70         if(unlink(health.log_filename) == -1 && errno != ENOENT)
71             error("Health: cannot remove old alarms log file '%s'", health.log_filename);
72
73         // open it with truncate
74         health.log_fp = fopen(health.log_filename, "w");
75
76         if(health.log_fp)
77             fclose(health.log_fp);
78         else
79             error("Health: cannot truncate health log '%s'", health.log_filename);
80
81         health.log_fp = NULL;
82
83         health.log_entries_written = 0;
84         health_alarm_log_open();
85     }
86 }
87
88 static inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
89     health_log_rotate();
90
91     if(likely(health.log_fp)) {
92         if(unlikely(fprintf(health.log_fp
93                 , "%c\t%s"
94                   "\t%08x\t%08x\t%08x\t%08x\t%08x"
95                   "\t%08x\t%08x\t%08x"
96                   "\t%08x\t%08x\t%08x"
97                   "\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s"
98                   "\t%d\t%d\t%d\t%d"
99                   "\t%Lf\t%Lf"
100                   "\n"
101                 , (ae->flags & HEALTH_ENTRY_FLAG_SAVED)?'U':'A'
102                 , host->hostname
103
104                 , ae->unique_id
105                 , ae->alarm_id
106                 , ae->alarm_event_id
107                 , ae->updated_by_id
108                 , ae->updates_id
109
110                 , (uint32_t)ae->when
111                 , (uint32_t)ae->duration
112                 , (uint32_t)ae->non_clear_duration
113                 , (uint32_t)ae->flags
114                 , (uint32_t)ae->exec_run_timestamp
115                 , (uint32_t)ae->delay_up_to_timestamp
116
117                 , (ae->name)?ae->name:""
118                 , (ae->chart)?ae->chart:""
119                 , (ae->family)?ae->family:""
120                 , (ae->exec)?ae->exec:""
121                 , (ae->recipient)?ae->recipient:""
122                 , (ae->source)?ae->source:""
123                 , (ae->units)?ae->units:""
124                 , (ae->info)?ae->info:""
125
126                 , ae->exec_code
127                 , ae->new_status
128                 , ae->old_status
129                 , ae->delay
130
131                 , (long double)ae->new_value
132                 , (long double)ae->old_value
133         ) < 0))
134             error("Health: failed to save alarm log entry. Health data may be lost in case of abnormal restart.");
135         else {
136             ae->flags |= HEALTH_ENTRY_FLAG_SAVED;
137             health.log_entries_written++;
138         }
139     }
140 }
141
142 static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filename) {
143     static uint32_t max_unique_id = 0, max_alarm_id = 0;
144     ssize_t loaded = -1, updated = -1, errored = -1, duplicate = -1;
145
146     errno = 0;
147
148     char *s, *buf = mallocz(65536 + 1);
149     size_t line = 0, len = 0;
150     loaded = updated = errored = duplicate = 0;
151
152     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
153
154     while((s = fgets_trim_len(buf, 65536, fp, &len))) {
155         health.log_entries_written++;
156         line++;
157
158         int max_entries = 30, entries = 0;
159         char *pointers[max_entries];
160
161         pointers[entries++] = s++;
162         while(*s) {
163             if(unlikely(*s == '\t')) {
164                 *s = '\0';
165                 pointers[entries++] = ++s;
166                 if(entries >= max_entries) {
167                     error("Health: line %zu of file '%s' has more than %d entries. Ignoring excessive entries.", line, filename, max_entries);
168                     break;
169                 }
170             }
171             else s++;
172         }
173
174         if(likely(*pointers[0] == 'U' || *pointers[0] == 'A')) {
175             ALARM_ENTRY *ae = NULL;
176
177             if(entries < 26) {
178                 error("Health: line %zu of file '%s' should have at least 26 entries, but it has %d. Ignoring it.", line, filename, entries);
179                 errored++;
180                 continue;
181             }
182
183             // check that we have valid ids
184             uint32_t unique_id = (uint32_t)strtoul(pointers[2], NULL, 16);
185             if(!unique_id) {
186                 error("Health: line %zu of file '%s' states alarm entry with invalid unique id %u (%s). Ignoring it.", line, filename, unique_id, pointers[2]);
187                 errored++;
188                 continue;
189             }
190
191             uint32_t alarm_id = (uint32_t)strtoul(pointers[3], NULL, 16);
192             if(!alarm_id) {
193                 error("Health: line %zu of file '%s' states alarm entry for invalid alarm id %u (%s). Ignoring it.", line, filename, alarm_id, pointers[3]);
194                 errored++;
195                 continue;
196             }
197
198             if(unlikely(*pointers[0] == 'A')) {
199                 // make sure it is properly numbered
200                 if(unlikely(host->health_log.alarms && unique_id < host->health_log.alarms->unique_id)) {
201                     error("Health: line %zu of file '%s' has alarm log entry with %u in wrong order. Ignoring it.", line, filename, unique_id);
202                     errored++;
203                     continue;
204                 }
205
206                 ae = callocz(1, sizeof(ALARM_ENTRY));
207             }
208             else if(unlikely(*pointers[0] == 'U')) {
209                 // find the original
210                 for(ae = host->health_log.alarms; ae; ae = ae->next) {
211                     if(unlikely(unique_id == ae->unique_id)) {
212                         if(unlikely(*pointers[0] == 'A')) {
213                             error("Health: line %zu of file '%s' adds duplicate alarm log entry with unique id %u. Using the later."
214                                   , line, filename, unique_id);
215                             *pointers[0] = 'U';
216                             duplicate++;
217                         }
218                         break;
219                     }
220                     else if(unlikely(unique_id > ae->unique_id)) {
221                         // no need to continue
222                         // the linked list is sorted
223                         ae = NULL;
224                         break;
225                     }
226                 }
227
228                 // if not found, skip this line
229                 if(!ae) {
230                     // error("Health: line %zu of file '%s' updates alarm log entry with unique id %u, but it is not found.", line, filename, unique_id);
231                     continue;
232                 }
233             }
234
235             // check for a possible host missmatch
236             //if(strcmp(pointers[1], host->hostname))
237             //    error("Health: line %zu of file '%s' provides an alarm for host '%s' but this is named '%s'.", line, filename, pointers[1], host->hostname);
238
239             ae->unique_id               = unique_id;
240             ae->alarm_id                = alarm_id;
241             ae->alarm_event_id          = (uint32_t)strtoul(pointers[4], NULL, 16);
242             ae->updated_by_id           = (uint32_t)strtoul(pointers[5], NULL, 16);
243             ae->updates_id              = (uint32_t)strtoul(pointers[6], NULL, 16);
244
245             ae->when                    = (uint32_t)strtoul(pointers[7], NULL, 16);
246             ae->duration                = (uint32_t)strtoul(pointers[8], NULL, 16);
247             ae->non_clear_duration      = (uint32_t)strtoul(pointers[9], NULL, 16);
248
249             ae->flags                   = (uint32_t)strtoul(pointers[10], NULL, 16);
250             ae->flags |= HEALTH_ENTRY_FLAG_SAVED;
251
252             ae->exec_run_timestamp      = (uint32_t)strtoul(pointers[11], NULL, 16);
253             ae->delay_up_to_timestamp   = (uint32_t)strtoul(pointers[12], NULL, 16);
254
255             if(unlikely(ae->name)) freez(ae->name);
256             ae->name = strdupz(pointers[13]);
257             ae->hash_name = simple_hash(ae->name);
258
259             if(unlikely(ae->chart)) freez(ae->chart);
260             ae->chart = strdupz(pointers[14]);
261             ae->hash_chart = simple_hash(ae->chart);
262
263             if(unlikely(ae->family)) freez(ae->family);
264             ae->family = strdupz(pointers[15]);
265
266             if(unlikely(ae->exec)) freez(ae->exec);
267             ae->exec = strdupz(pointers[16]);
268             if(!*ae->exec) { freez(ae->exec); ae->exec = NULL; }
269
270             if(unlikely(ae->recipient)) freez(ae->recipient);
271             ae->recipient = strdupz(pointers[17]);
272             if(!*ae->recipient) { freez(ae->recipient); ae->recipient = NULL; }
273
274             if(unlikely(ae->source)) freez(ae->source);
275             ae->source = strdupz(pointers[18]);
276             if(!*ae->source) { freez(ae->source); ae->source = NULL; }
277
278             if(unlikely(ae->units)) freez(ae->units);
279             ae->units = strdupz(pointers[19]);
280             if(!*ae->units) { freez(ae->units); ae->units = NULL; }
281
282             if(unlikely(ae->info)) freez(ae->info);
283             ae->info = strdupz(pointers[20]);
284             if(!*ae->info) { freez(ae->info); ae->info = NULL; }
285
286             ae->exec_code   = atoi(pointers[21]);
287             ae->new_status  = atoi(pointers[22]);
288             ae->old_status  = atoi(pointers[23]);
289             ae->delay       = atoi(pointers[24]);
290
291             ae->new_value   = strtold(pointers[25], NULL);
292             ae->old_value   = strtold(pointers[26], NULL);
293
294             // add it to host if not already there
295             if(unlikely(*pointers[0] == 'A')) {
296                 ae->next = host->health_log.alarms;
297                 host->health_log.alarms = ae;
298                 loaded++;
299             }
300             else updated++;
301
302             if(unlikely(ae->unique_id > max_unique_id))
303                 max_unique_id = ae->unique_id;
304
305             if(unlikely(ae->alarm_id >= max_alarm_id))
306                 max_alarm_id = ae->alarm_id;
307         }
308         else {
309             error("Health: line %zu of file '%s' is invalid (unrecognized entry type '%s').", line, filename, pointers[0]);
310             errored++;
311         }
312     }
313
314     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
315
316     freez(buf);
317
318     if(!max_unique_id) max_unique_id = (uint32_t)time(NULL);
319     if(!max_alarm_id)  max_alarm_id  = (uint32_t)time(NULL);
320
321     host->health_log.next_log_id = max_unique_id + 1;
322     host->health_log.next_alarm_id = max_alarm_id + 1;
323
324     debug(D_HEALTH, "Health: loaded file '%s' with %zd new alarm entries, updated %zd alarms, errors %zd entries, duplicate %zd", filename, loaded, updated, errored, duplicate);
325     return loaded;
326 }
327
328 static inline void health_alarm_log_load(RRDHOST *host) {
329     health_alarm_log_close();
330
331     char filename[FILENAME_MAX + 1];
332     snprintfz(filename, FILENAME_MAX, "%s.old", health.log_filename);
333     FILE *fp = fopen(filename, "r");
334     if(!fp)
335         error("Health: cannot open health file: %s", filename);
336     else {
337         health_alarm_log_read(host, fp, filename);
338         fclose(fp);
339     }
340
341     health.log_entries_written = 0;
342     fp = fopen(health.log_filename, "r");
343     if(!fp)
344         error("Health: cannot open health file: %s", health.log_filename);
345     else {
346         health_alarm_log_read(host, fp, health.log_filename);
347         fclose(fp);
348     }
349
350     health_alarm_log_open();
351 }
352
353
354 // ----------------------------------------------------------------------------
355 // health alarm log management
356
357 static inline void health_alarm_log(RRDHOST *host,
358                 uint32_t alarm_id, uint32_t alarm_event_id,
359                 time_t when,
360                 const char *name, const char *chart, const char *family,
361                 const char *exec, const char *recipient, time_t duration,
362                 calculated_number old_value, calculated_number new_value,
363                 int old_status, int new_status,
364                 const char *source,
365                 const char *units,
366                 const char *info,
367                 int delay
368 ) {
369     debug(D_HEALTH, "Health adding alarm log entry with id: %u", host->health_log.next_log_id);
370
371     ALARM_ENTRY *ae = callocz(1, sizeof(ALARM_ENTRY));
372     ae->name = strdupz(name);
373     ae->hash_name = simple_hash(ae->name);
374
375     if(chart) {
376         ae->chart = strdupz(chart);
377         ae->hash_chart = simple_hash(ae->chart);
378     }
379
380     if(family)
381         ae->family = strdupz(family);
382
383     if(exec) ae->exec = strdupz(exec);
384     if(recipient) ae->recipient = strdupz(recipient);
385     if(source) ae->source = strdupz(source);
386     if(units) ae->units = strdupz(units);
387     if(info) ae->info = strdupz(info);
388
389     ae->unique_id = host->health_log.next_log_id++;
390     ae->alarm_id = alarm_id;
391     ae->alarm_event_id = alarm_event_id;
392     ae->when = when;
393     ae->old_value = old_value;
394     ae->new_value = new_value;
395     ae->old_status = old_status;
396     ae->new_status = new_status;
397     ae->duration = duration;
398     ae->delay = delay;
399     ae->delay_up_to_timestamp = when + delay;
400
401     if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL)
402         ae->non_clear_duration += ae->duration;
403
404     // link it
405     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
406     ae->next = host->health_log.alarms;
407     host->health_log.alarms = ae;
408     host->health_log.count++;
409     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
410
411     // match previous alarms
412     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
413     ALARM_ENTRY *t;
414     for(t = host->health_log.alarms ; t ; t = t->next) {
415         if(t != ae && t->alarm_id == ae->alarm_id) {
416             if(!(t->flags & HEALTH_ENTRY_FLAG_UPDATED) && !t->updated_by_id) {
417                 t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
418                 t->updated_by_id = ae->unique_id;
419                 ae->updates_id = t->unique_id;
420
421                 if((t->new_status == RRDCALC_STATUS_WARNING || t->new_status == RRDCALC_STATUS_CRITICAL) &&
422                    (t->old_status == RRDCALC_STATUS_WARNING || t->old_status == RRDCALC_STATUS_CRITICAL))
423                     ae->non_clear_duration += t->non_clear_duration;
424
425                 health_alarm_log_save(host, t);
426             }
427
428             // no need to continue
429             break;
430         }
431     }
432     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
433
434     health_alarm_log_save(host, ae);
435 }
436
437 // ----------------------------------------------------------------------------
438 // RRDVAR management
439
// Sanitize a variable name in place: every character that is not
// alphanumeric, '.' or '_' is replaced with '_'.
//
// variable - the NUL terminated name to fix; it is modified
//
// Returns the number of characters replaced.
static inline int rrdvar_fix_name(char *variable) {
    int fixed = 0;

    for( ; *variable ; variable++) {
        // isalnum() is only defined for unsigned char values (and EOF),
        // so bytes above 0x7f must not reach it as negative numbers
        unsigned char c = (unsigned char)*variable;

        if(!isalnum(c) && c != '.' && c != '_') {
            *variable = '_';
            fixed++;
        }
    }

    return fixed;
}
453
454 int rrdvar_compare(void* a, void* b) {
455     if(((RRDVAR *)a)->hash < ((RRDVAR *)b)->hash) return -1;
456     else if(((RRDVAR *)a)->hash > ((RRDVAR *)b)->hash) return 1;
457     else return strcmp(((RRDVAR *)a)->name, ((RRDVAR *)b)->name);
458 }
459
460 static inline RRDVAR *rrdvar_index_add(avl_tree_lock *tree, RRDVAR *rv) {
461     RRDVAR *ret = (RRDVAR *)avl_insert_lock(tree, (avl *)(rv));
462     if(ret != rv)
463         debug(D_VARIABLES, "Request to insert RRDVAR '%s' into index failed. Already exists.", rv->name);
464
465     return ret;
466 }
467
468 static inline RRDVAR *rrdvar_index_del(avl_tree_lock *tree, RRDVAR *rv) {
469     RRDVAR *ret = (RRDVAR *)avl_remove_lock(tree, (avl *)(rv));
470     if(!ret)
471         error("Request to remove RRDVAR '%s' from index failed. Not Found.", rv->name);
472
473     return ret;
474 }
475
476 static inline RRDVAR *rrdvar_index_find(avl_tree_lock *tree, const char *name, uint32_t hash) {
477     RRDVAR tmp;
478     tmp.name = (char *)name;
479     tmp.hash = (hash)?hash:simple_hash(tmp.name);
480
481     return (RRDVAR *)avl_search_lock(tree, (avl *)&tmp);
482 }
483
484 static inline void rrdvar_free(RRDHOST *host, avl_tree_lock *tree, RRDVAR *rv) {
485     (void)host;
486
487     if(!rv) return;
488
489     if(tree)
490         rrdvar_index_del(tree, rv);
491
492     freez(rv->name);
493     freez(rv);
494 }
495
496 static inline RRDVAR *rrdvar_create_and_index(const char *scope, avl_tree_lock *tree, const char *name, int type, calculated_number *value) {
497     char *variable = strdupz(name);
498     rrdvar_fix_name(variable);
499     uint32_t hash = simple_hash(variable);
500
501     RRDVAR *rv = rrdvar_index_find(tree, variable, hash);
502     if(unlikely(!rv)) {
503         debug(D_VARIABLES, "Variable '%s' not found in scope '%s'. Creating a new one.", variable, scope);
504
505         rv = callocz(1, sizeof(RRDVAR));
506         rv->name = variable;
507         rv->hash = hash;
508         rv->type = type;
509         rv->value = value;
510
511         RRDVAR *ret = rrdvar_index_add(tree, rv);
512         if(unlikely(ret != rv)) {
513             debug(D_VARIABLES, "Variable '%s' in scope '%s' already exists", variable, scope);
514             rrdvar_free(NULL, NULL, rv);
515             rv = NULL;
516         }
517         else
518             debug(D_VARIABLES, "Variable '%s' created in scope '%s'", variable, scope);
519     }
520     else {
521         // already exists
522         freez(variable);
523
524         // this is important
525         // it must return NULL - not the existing variable - or double-free will happen
526         rv = NULL;
527     }
528
529     return rv;
530 }
531
532 // ----------------------------------------------------------------------------
533 // RRDVAR lookup
534
535 calculated_number rrdvar2number(RRDVAR *rv) {
536     switch(rv->type) {
537         case RRDVAR_TYPE_CALCULATED: {
538             calculated_number *n = (calculated_number *)rv->value;
539             return *n;
540         }
541
542         case RRDVAR_TYPE_TIME_T: {
543             time_t *n = (time_t *)rv->value;
544             return *n;
545         }
546
547         case RRDVAR_TYPE_COLLECTED: {
548             collected_number *n = (collected_number *)rv->value;
549             return *n;
550         }
551
552         case RRDVAR_TYPE_TOTAL: {
553             total_number *n = (total_number *)rv->value;
554             return *n;
555         }
556
557         case RRDVAR_TYPE_INT: {
558             int *n = (int *)rv->value;
559             return *n;
560         }
561
562         default:
563             error("I don't know how to convert RRDVAR type %d to calculated_number", rv->type);
564             return NAN;
565     }
566 }
567
568 void dump_variable(void *data) {
569     RRDVAR *rv = (RRDVAR *)data;
570     debug(D_HEALTH, "%50s : %20.5Lf", rv->name, rrdvar2number(rv));
571 }
572
573 int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, calculated_number *result) {
574     RRDSET *st = rc->rrdset;
575     RRDVAR *rv;
576
577     if(!st) return 0;
578
579     rv = rrdvar_index_find(&st->variables_root_index, variable, hash);
580     if(rv) {
581         *result = rrdvar2number(rv);
582         return 1;
583     }
584
585     rv = rrdvar_index_find(&st->rrdfamily->variables_root_index, variable, hash);
586     if(rv) {
587         *result = rrdvar2number(rv);
588         return 1;
589     }
590
591     rv = rrdvar_index_find(&st->rrdhost->variables_root_index, variable, hash);
592     if(rv) {
593         *result = rrdvar2number(rv);
594         return 1;
595     }
596
597     debug(D_HEALTH, "Available local chart '%s' variables:", st->id);
598     avl_traverse_lock(&st->variables_root_index, dump_variable);
599
600     debug(D_HEALTH, "Available family '%s' variables:", st->rrdfamily->family);
601     avl_traverse_lock(&st->rrdfamily->variables_root_index, dump_variable);
602
603     debug(D_HEALTH, "Available host '%s' variables:", st->rrdhost->hostname);
604     avl_traverse_lock(&st->rrdhost->variables_root_index, dump_variable);
605
606     return 0;
607 }
608
609 // ----------------------------------------------------------------------------
610 // RRDSETVAR management
611
612 RRDSETVAR *rrdsetvar_create(RRDSET *st, const char *variable, int type, void *value, uint32_t options) {
613     debug(D_VARIABLES, "RRDVARSET create for chart id '%s' name '%s' with variable name '%s'", st->id, st->name, variable);
614     RRDSETVAR *rs = (RRDSETVAR *)callocz(1, sizeof(RRDSETVAR));
615
616     char buffer[RRDVAR_MAX_LENGTH + 1];
617     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->id, variable);
618     rs->fullid = strdupz(buffer);
619
620     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, variable);
621     rs->fullname = strdupz(buffer);
622
623     rs->variable = strdupz(variable);
624
625     rs->type = type;
626     rs->value = value;
627     rs->options = options;
628     rs->rrdset = st;
629
630     rs->local       = rrdvar_create_and_index("local",  &st->variables_root_index, rs->variable, rs->type, rs->value);
631     rs->family      = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullid, rs->type, rs->value);
632     rs->host        = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullid, rs->type, rs->value);
633     rs->family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullname, rs->type, rs->value);
634     rs->host_name   = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullname, rs->type, rs->value);
635
636     rs->next = st->variables;
637     st->variables = rs;
638
639     return rs;
640 }
641
642 void rrdsetvar_rename_all(RRDSET *st) {
643     debug(D_VARIABLES, "RRDSETVAR rename for chart id '%s' name '%s'", st->id, st->name);
644
645     // only these 2 can change name
646     // rs->family_name
647     // rs->host_name
648
649     char buffer[RRDVAR_MAX_LENGTH + 1];
650     RRDSETVAR *rs, *next = st->variables;
651     while((rs = next)) {
652         next = rs->next;
653
654         snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rs->variable);
655
656         if (strcmp(buffer, rs->fullname)) {
657             // name changed
658             rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
659             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_name);
660
661             freez(rs->fullname);
662             rs->fullname = strdupz(st->name);
663             rs->family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullname, rs->type, rs->value);
664             rs->host_name   = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullname, rs->type, rs->value);
665         }
666     }
667
668     rrdsetcalc_link_matching(st);
669 }
670
671 void rrdsetvar_free(RRDSETVAR *rs) {
672     RRDSET *st = rs->rrdset;
673     debug(D_VARIABLES, "RRDSETVAR free for chart id '%s' name '%s', variable '%s'", st->id, st->name, rs->variable);
674
675     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local);
676     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family);
677     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host);
678     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
679     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_name);
680
681     if(st->variables == rs) {
682         st->variables = rs->next;
683     }
684     else {
685         RRDSETVAR *t;
686         for (t = st->variables; t && t->next != rs; t = t->next);
687         if(!t) error("RRDSETVAR '%s' not found in chart '%s' variables linked list", rs->fullname, st->id);
688         else t->next = rs->next;
689     }
690
691     freez(rs->fullid);
692     freez(rs->fullname);
693     freez(rs->variable);
694     freez(rs);
695 }
696
697 // ----------------------------------------------------------------------------
698 // RRDDIMVAR management
699
700 #define RRDDIMVAR_ID_MAX 1024
701
702 RRDDIMVAR *rrddimvar_create(RRDDIM *rd, int type, const char *prefix, const char *suffix, void *value, uint32_t options) {
703     RRDSET *st = rd->rrdset;
704
705     debug(D_VARIABLES, "RRDDIMSET create for chart id '%s' name '%s', dimension id '%s', name '%s%s%s'", st->id, st->name, rd->id, (prefix)?prefix:"", rd->name, (suffix)?suffix:"");
706
707     if(!prefix) prefix = "";
708     if(!suffix) suffix = "";
709
710     char buffer[RRDDIMVAR_ID_MAX + 1];
711     RRDDIMVAR *rs = (RRDDIMVAR *)callocz(1, sizeof(RRDDIMVAR));
712
713     rs->prefix = strdupz(prefix);
714     rs->suffix = strdupz(suffix);
715
716     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->id, rs->suffix);
717     rs->id = strdupz(buffer);
718
719     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
720     rs->name = strdupz(buffer);
721
722     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->id, rs->id);
723     rs->fullidid = strdupz(buffer);
724
725     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->id, rs->name);
726     rs->fullidname = strdupz(buffer);
727
728     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->context, rs->id);
729     rs->contextid = strdupz(buffer);
730
731     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->context, rs->name);
732     rs->contextname = strdupz(buffer);
733
734     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->name, rs->id);
735     rs->fullnameid = strdupz(buffer);
736
737     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->name, rs->name);
738     rs->fullnamename = strdupz(buffer);
739
740     rs->type = type;
741     rs->value = value;
742     rs->options = options;
743     rs->rrddim = rd;
744
745     rs->local_id           = rrdvar_create_and_index("local", &st->variables_root_index, rs->id, rs->type, rs->value);
746     rs->local_name         = rrdvar_create_and_index("local", &st->variables_root_index, rs->name, rs->type, rs->value);
747
748     rs->family_id          = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->id, rs->type, rs->value);
749     rs->family_name        = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->name, rs->type, rs->value);
750     rs->family_contextid   = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->contextid, rs->type, rs->value);
751     rs->family_contextname = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->contextname, rs->type, rs->value);
752
753     rs->host_fullidid      = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullidid, rs->type, rs->value);
754     rs->host_fullidname    = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullidname, rs->type, rs->value);
755     rs->host_fullnameid    = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullnameid, rs->type, rs->value);
756     rs->host_fullnamename  = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullnamename, rs->type, rs->value);
757
758     rs->next = rd->variables;
759     rd->variables = rs;
760
761     return rs;
762 }
763
764 void rrddimvar_rename_all(RRDDIM *rd) {
765     RRDSET *st = rd->rrdset;
766     debug(D_VARIABLES, "RRDDIMSET rename for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
767
768     RRDDIMVAR *rs, *next = rd->variables;
769     while((rs = next)) {
770         next = rs->next;
771
772         if (strcmp(rd->name, rs->name)) {
773             char buffer[RRDDIMVAR_ID_MAX + 1];
774             // name changed
775
776             // name and family name
777             rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_name);
778             rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
779             freez(rs->name);
780             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
781             rs->name = strdupz(buffer);
782             rs->local_name  = rrdvar_create_and_index("local", &st->variables_root_index, rs->name, rs->type, rs->value);
783             rs->family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->name, rs->type, rs->value);
784
785             // family_contextname
786             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->family_contextname);
787             freez(rs->contextname);
788             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->context, rs->name);
789             rs->contextname = strdupz(buffer);
790             rs->family_contextname = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->contextname, rs->type, rs->value);
791
792             // fullidname
793             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidname);
794             freez(rs->fullidname);
795             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->name);
796             rs->fullidname = strdupz(buffer);
797             rs->host_fullidname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullidname, rs->type, rs->value);
798
799             // fullnameid
800             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnameid);
801             freez(rs->fullnameid);
802             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->id);
803             rs->fullnameid = strdupz(buffer);
804             rs->host_fullnameid = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullnameid, rs->type, rs->value);
805
806             // fullnamename
807             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnamename);
808             freez(rs->fullnamename);
809             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->name);
810             rs->fullnamename = strdupz(buffer);
811             rs->host_fullnamename = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullnamename, rs->type, rs->value);
812         }
813     }
814 }
815
816 void rrddimvar_free(RRDDIMVAR *rs) {
817     RRDDIM *rd = rs->rrddim;
818     RRDSET *st = rd->rrdset;
819     debug(D_VARIABLES, "RRDDIMSET free for chart id '%s' name '%s', dimension id '%s', name '%s', prefix='%s', suffix='%s'", st->id, st->name, rd->id, rd->name, rs->prefix, rs->suffix);
820
821     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_id);
822     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_name);
823
824     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_id);
825     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
826     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_contextid);
827     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_contextname);
828
829     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidid);
830     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidname);
831     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnameid);
832     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnamename);
833
834     if(rd->variables == rs) {
835         debug(D_VARIABLES, "RRDDIMSET removing first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
836         rd->variables = rs->next;
837     }
838     else {
839         debug(D_VARIABLES, "RRDDIMSET removing non-first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
840         RRDDIMVAR *t;
841         for (t = rd->variables; t && t->next != rs; t = t->next) ;
842         if(!t) error("RRDDIMVAR '%s' not found in dimension '%s/%s' variables linked list", rs->name, st->id, rd->id);
843         else t->next = rs->next;
844     }
845
846     freez(rs->prefix);
847     freez(rs->suffix);
848     freez(rs->id);
849     freez(rs->name);
850     freez(rs->contextid);
851     freez(rs->contextname);
852     freez(rs->fullidid);
853     freez(rs->fullidname);
854     freez(rs->fullnameid);
855     freez(rs->fullnamename);
856     freez(rs);
857 }
858
859 // ----------------------------------------------------------------------------
860 // RRDCALC management
861
862 static inline const char *rrdcalc_status2string(int status) {
863     switch(status) {
864         case RRDCALC_STATUS_REMOVED:
865             return "REMOVED";
866
867         case RRDCALC_STATUS_UNDEFINED:
868             return "UNDEFINED";
869
870         case RRDCALC_STATUS_UNINITIALIZED:
871             return "UNINITIALIZED";
872
873         case RRDCALC_STATUS_CLEAR:
874             return "CLEAR";
875
876         case RRDCALC_STATUS_RAISED:
877             return "RAISED";
878
879         case RRDCALC_STATUS_WARNING:
880             return "WARNING";
881
882         case RRDCALC_STATUS_CRITICAL:
883             return "CRITICAL";
884
885         default:
886             error("Unknown alarm status %d", status);
887             return "UNKNOWN";
888     }
889 }
890
891 static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
892     debug(D_HEALTH, "Health linking alarm '%s.%s' to chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, st->rrdhost->hostname);
893
894     rc->last_status_change = time(NULL);
895     rc->rrdset = st;
896
897     rc->rrdset_next = st->alarms;
898     rc->rrdset_prev = NULL;
899     
900     if(rc->rrdset_next)
901         rc->rrdset_next->rrdset_prev = rc;
902
903     st->alarms = rc;
904
905     if(rc->update_every < rc->rrdset->update_every) {
906         error("Health alarm '%s.%s' has update every %d, less than chart update every %d. Setting alarm update frequency to %d.", rc->rrdset->id, rc->name, rc->update_every, rc->rrdset->update_every, rc->rrdset->update_every);
907         rc->update_every = rc->rrdset->update_every;
908     }
909
910     if(!isnan(rc->green) && isnan(st->green)) {
911         debug(D_HEALTH, "Health alarm '%s.%s' green threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->green, rc->green);
912         st->green = rc->green;
913     }
914
915     if(!isnan(rc->red) && isnan(st->red)) {
916         debug(D_HEALTH, "Health alarm '%s.%s' red threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->red, rc->red);
917         st->red = rc->red;
918     }
919
920     rc->local  = rrdvar_create_and_index("local",  &st->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
921     rc->family = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
922
923     char fullname[RRDVAR_MAX_LENGTH + 1];
924     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->id, rc->name);
925     rc->hostid   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
926
927     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rc->name);
928     rc->hostname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
929
930         if(!rc->units) rc->units = strdupz(st->units);
931
932     {
933         time_t now = time(NULL);
934         health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_UNINITIALIZED, rc->source, rc->units, rc->info, 0);
935     }
936 }
937
938 static inline int rrdcalc_is_matching_this_rrdset(RRDCALC *rc, RRDSET *st) {
939     if(     (rc->hash_chart == st->hash      && !strcmp(rc->chart, st->id)) ||
940             (rc->hash_chart == st->hash_name && !strcmp(rc->chart, st->name)))
941         return 1;
942
943     return 0;
944 }
945
946 // this has to be called while the RRDHOST is locked
947 inline void rrdsetcalc_link_matching(RRDSET *st) {
948     // debug(D_HEALTH, "find matching alarms for chart '%s'", st->id);
949
950     RRDCALC *rc;
951     for(rc = st->rrdhost->alarms; rc ; rc = rc->next) {
952         if(unlikely(rc->rrdset))
953             continue;
954
955         if(unlikely(rrdcalc_is_matching_this_rrdset(rc, st)))
956             rrdsetcalc_link(st, rc);
957     }
958 }
959
960 // this has to be called while the RRDHOST is locked
961 inline void rrdsetcalc_unlink(RRDCALC *rc) {
962     RRDSET *st = rc->rrdset;
963
964     if(!st) {
965         debug(D_HEALTH, "Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
966         error("Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
967         return;
968     }
969
970     {
971         time_t now = time(NULL);
972         health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0);
973     }
974
975     RRDHOST *host = st->rrdhost;
976
977     debug(D_HEALTH, "Health unlinking alarm '%s.%s' from chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, host->hostname);
978
979     // unlink it
980     if(rc->rrdset_prev)
981         rc->rrdset_prev->rrdset_next = rc->rrdset_next;
982
983     if(rc->rrdset_next)
984         rc->rrdset_next->rrdset_prev = rc->rrdset_prev;
985
986     if(st->alarms == rc)
987         st->alarms = rc->rrdset_next;
988
989     rc->rrdset_prev = rc->rrdset_next = NULL;
990
991     rrdvar_free(st->rrdhost, &st->variables_root_index, rc->local);
992     rc->local = NULL;
993
994     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rc->family);
995     rc->family = NULL;
996
997     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostid);
998     rc->hostid = NULL;
999
1000     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostname);
1001     rc->hostname = NULL;
1002
1003     rc->rrdset = NULL;
1004
1005     // RRDCALC will remain in RRDHOST
1006     // so that if the matching chart is found in the future
1007     // it will be applied automatically
1008 }
1009
1010 RRDCALC *rrdcalc_find(RRDSET *st, const char *name) {
1011     RRDCALC *rc;
1012     uint32_t hash = simple_hash(name);
1013
1014     for( rc = st->alarms; rc ; rc = rc->rrdset_next ) {
1015         if(unlikely(rc->hash == hash && !strcmp(rc->name, name)))
1016             return rc;
1017     }
1018
1019     return NULL;
1020 }
1021
1022 static inline int rrdcalc_exists(RRDHOST *host, const char *chart, const char *name, uint32_t hash_chart, uint32_t hash_name) {
1023     RRDCALC *rc;
1024
1025     if(unlikely(!chart)) {
1026         error("attempt to find RRDCALC '%s' without giving a chart name", name);
1027         return 1;
1028     }
1029
1030     if(unlikely(!hash_chart)) hash_chart = simple_hash(chart);
1031     if(unlikely(!hash_name))  hash_name  = simple_hash(name);
1032
1033     // make sure it does not already exist
1034     for(rc = host->alarms; rc ; rc = rc->next) {
1035         if (unlikely(rc->chart && rc->hash == hash_name && rc->hash_chart == hash_chart && !strcmp(name, rc->name) && !strcmp(chart, rc->chart))) {
1036             debug(D_HEALTH, "Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
1037             error("Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
1038             return 1;
1039         }
1040     }
1041
1042     return 0;
1043 }
1044
1045 static inline uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const char *name, uint32_t *next_event_id) {
1046     if(chart && name) {
1047         uint32_t hash_chart = simple_hash(chart);
1048         uint32_t hash_name = simple_hash(name);
1049
1050         // re-use old IDs, by looking them up in the alarm log
1051         ALARM_ENTRY *ae;
1052         for(ae = host->health_log.alarms; ae ;ae = ae->next) {
1053             if(unlikely(ae->hash_name == hash_name && ae->hash_chart == hash_chart && !strcmp(name, ae->name) && !strcmp(chart, ae->chart))) {
1054                 if(next_event_id) *next_event_id = ae->alarm_event_id + 1;
1055                 return ae->alarm_id;
1056             }
1057         }
1058     }
1059
1060     return host->health_log.next_alarm_id++;
1061 }
1062
1063 static inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) {
1064     rrdhost_check_rdlock(host);
1065
1066     if(rc->calculation) {
1067         rc->calculation->status = &rc->status;
1068         rc->calculation->this = &rc->value;
1069         rc->calculation->after = &rc->db_after;
1070         rc->calculation->before = &rc->db_before;
1071         rc->calculation->rrdcalc = rc;
1072     }
1073
1074     if(rc->warning) {
1075         rc->warning->status = &rc->status;
1076         rc->warning->this = &rc->value;
1077         rc->warning->after = &rc->db_after;
1078         rc->warning->before = &rc->db_before;
1079         rc->warning->rrdcalc = rc;
1080     }
1081
1082     if(rc->critical) {
1083         rc->critical->status = &rc->status;
1084         rc->critical->this = &rc->value;
1085         rc->critical->after = &rc->db_after;
1086         rc->critical->before = &rc->db_before;
1087         rc->critical->rrdcalc = rc;
1088     }
1089
1090     // link it to the host
1091     if(likely(host->alarms)) {
1092         // append it
1093         RRDCALC *t;
1094         for(t = host->alarms; t && t->next ; t = t->next) ;
1095         t->next = rc;
1096     }
1097     else {
1098         host->alarms = rc;
1099     }
1100
1101     // link it to its chart
1102     RRDSET *st;
1103     for(st = host->rrdset_root; st ; st = st->next) {
1104         if(rrdcalc_is_matching_this_rrdset(rc, st)) {
1105             rrdsetcalc_link(st, rc);
1106             break;
1107         }
1108     }
1109 }
1110
1111 static inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart) {
1112
1113     debug(D_HEALTH, "Health creating dynamic alarm (from template) '%s.%s'", chart, rt->name);
1114
1115     if(rrdcalc_exists(host, chart, rt->name, 0, 0))
1116         return NULL;
1117
1118     RRDCALC *rc = callocz(1, sizeof(RRDCALC));
1119     rc->next_event_id = 1;
1120     rc->id = rrdcalc_get_unique_id(host, chart, rt->name, &rc->next_event_id);
1121     rc->name = strdupz(rt->name);
1122     rc->hash = simple_hash(rc->name);
1123     rc->chart = strdupz(chart);
1124     rc->hash_chart = simple_hash(rc->chart);
1125
1126     if(rt->dimensions) rc->dimensions = strdupz(rt->dimensions);
1127
1128     rc->green = rt->green;
1129     rc->red = rt->red;
1130     rc->value = NAN;
1131     rc->old_value = NAN;
1132
1133     rc->delay_up_duration = rt->delay_up_duration;
1134     rc->delay_down_duration = rt->delay_down_duration;
1135     rc->delay_max_duration = rt->delay_max_duration;
1136     rc->delay_multiplier = rt->delay_multiplier;
1137
1138     rc->group = rt->group;
1139     rc->after = rt->after;
1140     rc->before = rt->before;
1141     rc->update_every = rt->update_every;
1142     rc->options = rt->options;
1143
1144     if(rt->exec) rc->exec = strdupz(rt->exec);
1145     if(rt->recipient) rc->recipient = strdupz(rt->recipient);
1146     if(rt->source) rc->source = strdupz(rt->source);
1147     if(rt->units) rc->units = strdupz(rt->units);
1148     if(rt->info) rc->info = strdupz(rt->info);
1149
1150     if(rt->calculation) {
1151         rc->calculation = expression_parse(rt->calculation->source, NULL, NULL);
1152         if(!rc->calculation)
1153             error("Health alarm '%s.%s': failed to parse calculation expression '%s'", chart, rt->name, rt->calculation->source);
1154     }
1155     if(rt->warning) {
1156         rc->warning = expression_parse(rt->warning->source, NULL, NULL);
1157         if(!rc->warning)
1158             error("Health alarm '%s.%s': failed to re-parse warning expression '%s'", chart, rt->name, rt->warning->source);
1159     }
1160     if(rt->critical) {
1161         rc->critical = expression_parse(rt->critical->source, NULL, NULL);
1162         if(!rc->critical)
1163             error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", chart, rt->name, rt->critical->source);
1164     }
1165
1166     debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1167           (rc->chart)?rc->chart:"NOCHART",
1168           rc->name,
1169           (rc->exec)?rc->exec:"DEFAULT",
1170           (rc->recipient)?rc->recipient:"DEFAULT",
1171           rc->green,
1172           rc->red,
1173           rc->group,
1174           rc->after,
1175           rc->before,
1176           rc->options,
1177           (rc->dimensions)?rc->dimensions:"NONE",
1178           rc->update_every,
1179           (rc->calculation)?rc->calculation->parsed_as:"NONE",
1180           (rc->warning)?rc->warning->parsed_as:"NONE",
1181           (rc->critical)?rc->critical->parsed_as:"NONE",
1182           rc->source,
1183           rc->delay_up_duration,
1184           rc->delay_down_duration,
1185           rc->delay_max_duration,
1186           rc->delay_multiplier
1187     );
1188
1189     rrdcalc_create_part2(host, rc);
1190     return rc;
1191 }
1192
1193 void rrdcalc_free(RRDHOST *host, RRDCALC *rc) {
1194     if(!rc) return;
1195
1196     debug(D_HEALTH, "Health removing alarm '%s.%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1197
1198     // unlink it from RRDSET
1199     if(rc->rrdset) rrdsetcalc_unlink(rc);
1200
1201     // unlink it from RRDHOST
1202     if(unlikely(rc == host->alarms))
1203         host->alarms = rc->next;
1204
1205     else if(likely(host->alarms)) {
1206         RRDCALC *t, *last = host->alarms;
1207         for(t = last->next; t && t != rc; last = t, t = t->next) ;
1208         if(last->next == rc)
1209             last->next = rc->next;
1210         else
1211             error("Cannot unlink alarm '%s.%s' from host '%s': not found", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1212     }
1213     else
1214         error("Cannot unlink unlink '%s.%s' from host '%s': This host does not have any calculations", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1215
1216     expression_free(rc->calculation);
1217     expression_free(rc->warning);
1218     expression_free(rc->critical);
1219
1220     freez(rc->name);
1221     freez(rc->chart);
1222     freez(rc->family);
1223     freez(rc->dimensions);
1224     freez(rc->exec);
1225     freez(rc->recipient);
1226     freez(rc->source);
1227     freez(rc->units);
1228     freez(rc->info);
1229     freez(rc);
1230 }
1231
1232 // ----------------------------------------------------------------------------
1233 // RRDCALCTEMPLATE management
1234
1235 void rrdcalctemplate_link_matching(RRDSET *st) {
1236     RRDCALCTEMPLATE *rt;
1237
1238     for(rt = st->rrdhost->templates; rt ; rt = rt->next) {
1239         if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context)) {
1240             RRDCALC *rc = rrdcalc_create(st->rrdhost, rt, st->id);
1241             if(unlikely(!rc))
1242                 error("Health tried to create alarm from template '%s', but it failed", rt->name);
1243
1244 #ifdef NETDATA_INTERNAL_CHECKS
1245             else if(rc->rrdset != st)
1246                 error("Health alarm '%s.%s' should be linked to chart '%s', but it is not", rc->chart?rc->chart:"NOCHART", rc->name, st->id);
1247 #endif
1248         }
1249     }
1250 }
1251
1252 static inline void rrdcalctemplate_free(RRDHOST *host, RRDCALCTEMPLATE *rt) {
1253     debug(D_HEALTH, "Health removing template '%s' of host '%s'", rt->name, host->hostname);
1254
1255     if(host->templates) {
1256         if(host->templates == rt) {
1257             host->templates = rt->next;
1258         }
1259         else {
1260             RRDCALCTEMPLATE *t, *last = host->templates;
1261             for (t = last->next; t && t != rt; last = t, t = t->next ) ;
1262             if(last && last->next == rt) {
1263                 last->next = rt->next;
1264                 rt->next = NULL;
1265             }
1266             else
1267                 error("Cannot find RRDCALCTEMPLATE '%s' linked in host '%s'", rt->name, host->hostname);
1268         }
1269     }
1270
1271     expression_free(rt->calculation);
1272     expression_free(rt->warning);
1273     expression_free(rt->critical);
1274
1275     freez(rt->name);
1276     freez(rt->exec);
1277     freez(rt->recipient);
1278     freez(rt->context);
1279     freez(rt->source);
1280     freez(rt->units);
1281     freez(rt->info);
1282     freez(rt->dimensions);
1283     freez(rt);
1284 }
1285
1286 // ----------------------------------------------------------------------------
1287 // load health configuration
1288
// NOTE(review): presumably the size of the buffer one configuration line is
// read into; its use is outside this part of the file - confirm
#define HEALTH_CONF_MAX_LINE    4096

// keywords of the health configuration
// (HEALTH_EVERY_KEY is also matched, case-insensitively, by the lookup parser)
#define HEALTH_ALARM_KEY        "alarm"
#define HEALTH_TEMPLATE_KEY     "template"
#define HEALTH_ON_KEY           "on"
#define HEALTH_LOOKUP_KEY       "lookup"
#define HEALTH_CALC_KEY         "calc"
#define HEALTH_EVERY_KEY        "every"
#define HEALTH_GREEN_KEY        "green"
#define HEALTH_RED_KEY          "red"
#define HEALTH_WARN_KEY         "warn"
#define HEALTH_CRIT_KEY         "crit"
#define HEALTH_EXEC_KEY         "exec"
#define HEALTH_RECIPIENT_KEY    "to"
#define HEALTH_UNITS_KEY        "units"
#define HEALTH_INFO_KEY         "info"
#define HEALTH_DELAY_KEY        "delay"
1306
1307 static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
1308     if(!rc->chart) {
1309         error("Health configuration for alarm '%s' does not have a chart", rc->name);
1310         return 0;
1311     }
1312
1313     if(!rc->update_every) {
1314         error("Health configuration for alarm '%s.%s' has no frequency (parameter 'every'). Ignoring it.", rc->chart?rc->chart:"NOCHART", rc->name);
1315         return 0;
1316     }
1317
1318     if(!RRDCALC_HAS_DB_LOOKUP(rc) && !rc->warning && !rc->critical) {
1319         error("Health configuration for alarm '%s.%s' is useless (no calculation, no warning and no critical evaluation)", rc->chart?rc->chart:"NOCHART", rc->name);
1320         return 0;
1321     }
1322
1323     if (rrdcalc_exists(host, rc->chart, rc->name, rc->hash_chart, rc->hash))
1324         return 0;
1325
1326     rc->id = rrdcalc_get_unique_id(&localhost, rc->chart, rc->name, &rc->next_event_id);
1327
1328     debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1329           rc->chart?rc->chart:"NOCHART",
1330           rc->name,
1331           rc->id,
1332           (rc->exec)?rc->exec:"DEFAULT",
1333           (rc->recipient)?rc->recipient:"DEFAULT",
1334           rc->green,
1335           rc->red,
1336           rc->group,
1337           rc->after,
1338           rc->before,
1339           rc->options,
1340           (rc->dimensions)?rc->dimensions:"NONE",
1341           rc->update_every,
1342           (rc->calculation)?rc->calculation->parsed_as:"NONE",
1343           (rc->warning)?rc->warning->parsed_as:"NONE",
1344           (rc->critical)?rc->critical->parsed_as:"NONE",
1345           rc->source,
1346           rc->delay_up_duration,
1347           rc->delay_down_duration,
1348           rc->delay_max_duration,
1349           rc->delay_multiplier
1350     );
1351
1352     rrdcalc_create_part2(host, rc);
1353     return 1;
1354 }
1355
1356 static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCALCTEMPLATE *rt) {
1357     if(unlikely(!rt->context)) {
1358         error("Health configuration for template '%s' does not have a context", rt->name);
1359         return 0;
1360     }
1361
1362     if(unlikely(!rt->update_every)) {
1363         error("Health configuration for template '%s' has no frequency (parameter 'every'). Ignoring it.", rt->name);
1364         return 0;
1365     }
1366
1367     if(unlikely(!RRDCALCTEMPLATE_HAS_CALCULATION(rt) && !rt->warning && !rt->critical)) {
1368         error("Health configuration for template '%s' is useless (no calculation, no warning and no critical evaluation)", rt->name);
1369         return 0;
1370     }
1371
1372     RRDCALCTEMPLATE *t, *last = NULL;
1373     for (t = host->templates; t ; last = t, t = t->next) {
1374         if(unlikely(t->hash_name == rt->hash_name && !strcmp(t->name, rt->name))) {
1375             error("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname);
1376             return 0;
1377         }
1378     }
1379
1380     debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1381           rt->name,
1382           (rt->context)?rt->context:"NONE",
1383           (rt->exec)?rt->exec:"DEFAULT",
1384           (rt->recipient)?rt->recipient:"DEFAULT",
1385           rt->green,
1386           rt->red,
1387           rt->group,
1388           rt->after,
1389           rt->before,
1390           rt->options,
1391           (rt->dimensions)?rt->dimensions:"NONE",
1392           rt->update_every,
1393           (rt->calculation)?rt->calculation->parsed_as:"NONE",
1394           (rt->warning)?rt->warning->parsed_as:"NONE",
1395           (rt->critical)?rt->critical->parsed_as:"NONE",
1396           rt->source,
1397           rt->delay_up_duration,
1398           rt->delay_down_duration,
1399           rt->delay_max_duration,
1400           rt->delay_multiplier
1401     );
1402
1403     if(likely(last)) {
1404         last->next = rt;
1405     }
1406     else {
1407         rt->next = host->templates;
1408         host->templates = rt;
1409     }
1410
1411     return 1;
1412 }
1413
/*
 * Parse a duration such as "10", "-5m" or "1.5h" into seconds.
 *
 * The text has to start with a digit or a sign and has to contain a number.
 * The character right after the number selects the unit:
 *   Y (365 days), M (30 days), w, d, h, m, s
 * Any other character, or no character at all, means seconds.
 *
 * string  the text to parse (it is not modified)
 * result  receives the duration in seconds, or 0 when the text is invalid
 *
 * Returns 1 when a duration has been parsed, 0 when the text is not a number.
 */
static inline int health_parse_duration(char *string, int *result) {
    // make sure it is a number
    // (the cast keeps isdigit() well defined for bytes that are negative as plain char)
    if(!*string || !(isdigit((unsigned char)*string) || *string == '+' || *string == '-')) {
        *result = 0;
        return 0;
    }

    char *e = NULL;
    long double n = strtold(string, &e);

    // nothing converted: a lone sign, or a sign followed by something
    // that is not a number
    if(!e || e == string) {
        *result = 0;
        return 0;
    }

    // *e is the unit; at the end of the string it is '\0' and the
    // default (seconds) applies
    switch (*e) {
        case 'Y':
            *result = (int) (n * 86400 * 365);
            break;
        case 'M':
            *result = (int) (n * 86400 * 30);
            break;
        case 'w':
            *result = (int) (n * 86400 * 7);
            break;
        case 'd':
            *result = (int) (n * 86400);
            break;
        case 'h':
            *result = (int) (n * 3600);
            break;
        case 'm':
            *result = (int) (n * 60);
            break;

        default:
        case 's':
            *result = (int) (n);
            break;
    }

    return 1;
}
1455
/*
 * Parse the value of a delay line: a whitespace separated sequence of
 * "keyword value" pairs, with the keywords (case-insensitive)
 * 'up', 'down', 'max' and 'multiplier'.
 *
 * The input string is modified: the whitespace after every token is
 * overwritten with '\0'. An unknown keyword or an invalid value is reported
 * with error() (using line/path/file to locate it) and parsing goes on.
 *
 * After parsing, 'up' and 'down' default to 0 and 'multiplier' to 1.0 when
 * they were not (validly) given. When 'max' was not given, the value already
 * in *delay_max_duration is raised to up * multiplier and to
 * down * multiplier if it is smaller than them.
 *
 * NOTE: a string that begins with whitespace produces an empty first keyword,
 * which ends the parsing immediately.
 *
 * Always returns 1.
 */
static inline int health_parse_delay(
        size_t line, const char *path, const char *file, char *string,
        int *delay_up_duration,
        int *delay_down_duration,
        int *delay_max_duration,
        float *delay_multiplier) {

    int have_up = 0;
    int have_down = 0;
    int have_max = 0;
    int have_multiplier = 0;

    char *cursor = string;
    while(*cursor) {
        // isolate the keyword and terminate it
        char *key = cursor;
        for( ; *cursor && !isspace(*cursor) ; cursor++) ;
        for( ; *cursor && isspace(*cursor) ; cursor++) *cursor = '\0';

        if(!*key) break;

        // isolate its value and terminate it
        char *value = cursor;
        for( ; *cursor && !isspace(*cursor) ; cursor++) ;
        for( ; *cursor && isspace(*cursor) ; cursor++) *cursor = '\0';

        if(strcasecmp(key, "up") == 0) {
            if(health_parse_duration(value, delay_up_duration))
                have_up = 1;
            else
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
        }
        else if(strcasecmp(key, "down") == 0) {
            if(health_parse_duration(value, delay_down_duration))
                have_down = 1;
            else
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
        }
        else if(strcasecmp(key, "multiplier") == 0) {
            // only finite, strictly positive multipliers are accepted
            *delay_multiplier = strtof(value, NULL);
            if(isnan(*delay_multiplier) || isinf(*delay_multiplier) || islessequal(*delay_multiplier, 0))
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            else
                have_multiplier = 1;
        }
        else if(strcasecmp(key, "max") == 0) {
            if(health_parse_duration(value, delay_max_duration))
                have_max = 1;
            else
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
        }
        else {
            error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
                  line, path, file, key);
        }
    }

    // defaults for whatever was not given
    if(!have_up)
        *delay_up_duration = 0;

    if(!have_down)
        *delay_down_duration = 0;

    if(!have_multiplier)
        *delay_multiplier = 1.0;

    // without an explicit max, it is at least one multiplied up / down delay
    if(!have_max) {
        if((*delay_max_duration) < (*delay_up_duration) * (*delay_multiplier))
            *delay_max_duration = (*delay_up_duration) * (*delay_multiplier);

        if((*delay_max_duration) < (*delay_down_duration) * (*delay_multiplier))
            *delay_max_duration = (*delay_down_duration) * (*delay_multiplier);
    }

    return 1;
}
1535
1536 static inline int health_parse_db_lookup(
1537         size_t line, const char *path, const char *file, char *string,
1538         int *group_method, int *after, int *before, int *every,
1539         uint32_t *options, char **dimensions
1540 ) {
1541     debug(D_HEALTH, "Health configuration parsing database lookup %zu@%s/%s: %s", line, path, file, string);
1542
1543     if(*dimensions) freez(*dimensions);
1544     *dimensions = NULL;
1545     *after = 0;
1546     *before = 0;
1547     *every = 0;
1548     *options = 0;
1549
1550     char *s = string, *key;
1551
1552     // first is the group method
1553     key = s;
1554     while(*s && !isspace(*s)) s++;
1555     while(*s && isspace(*s)) *s++ = '\0';
1556     if(!*s) {
1557         error("Health configuration invalid chart calculation at line %zu of file '%s/%s': expected group method followed by the 'after' time, but got '%s'",
1558               line, path, file, key);
1559         return 0;
1560     }
1561
1562     if((*group_method = web_client_api_request_v1_data_group(key, -1)) == -1) {
1563         error("Health configuration at line %zu of file '%s/%s': invalid group method '%s'",
1564               line, path, file, key);
1565         return 0;
1566     }
1567
1568     // then is the 'after' time
1569     key = s;
1570     while(*s && !isspace(*s)) s++;
1571     while(*s && isspace(*s)) *s++ = '\0';
1572
1573     if(!health_parse_duration(key, after)) {
1574         error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' after group method",
1575               line, path, file, key);
1576         return 0;
1577     }
1578
1579     // sane defaults
1580     *every = abs(*after);
1581
1582     // now we may have optional parameters
1583     while(*s) {
1584         key = s;
1585         while(*s && !isspace(*s)) s++;
1586         while(*s && isspace(*s)) *s++ = '\0';
1587         if(!*key) break;
1588
1589         if(!strcasecmp(key, "at")) {
1590             char *value = s;
1591             while(*s && !isspace(*s)) s++;
1592             while(*s && isspace(*s)) *s++ = '\0';
1593
1594             if (!health_parse_duration(value, before)) {
1595                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1596                       line, path, file, value, key);
1597             }
1598         }
1599         else if(!strcasecmp(key, HEALTH_EVERY_KEY)) {
1600             char *value = s;
1601             while(*s && !isspace(*s)) s++;
1602             while(*s && isspace(*s)) *s++ = '\0';
1603
1604             if (!health_parse_duration(value, every)) {
1605                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1606                       line, path, file, value, key);
1607             }
1608         }
1609         else if(!strcasecmp(key, "absolute") || !strcasecmp(key, "abs") || !strcasecmp(key, "absolute_sum")) {
1610             *options |= RRDR_OPTION_ABSOLUTE;
1611         }
1612         else if(!strcasecmp(key, "min2max")) {
1613             *options |= RRDR_OPTION_MIN2MAX;
1614         }
1615         else if(!strcasecmp(key, "null2zero")) {
1616             *options |= RRDR_OPTION_NULL2ZERO;
1617         }
1618         else if(!strcasecmp(key, "percentage")) {
1619             *options |= RRDR_OPTION_PERCENTAGE;
1620         }
1621         else if(!strcasecmp(key, "unaligned")) {
1622             *options |= RRDR_OPTION_NOT_ALIGNED;
1623         }
1624         else if(!strcasecmp(key, "of")) {
1625             if(*s && strcasecmp(s, "all"))
1626                *dimensions = strdupz(s);
1627             break;
1628         }
1629         else {
1630             error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
1631                   line, path, file, key);
1632         }
1633     }
1634
1635     return 1;
1636 }
1637
// Replaces every tab in the NUL-terminated string s with a single space.
// The string is modified in place; the same pointer is returned so the call
// can be nested inside another expression.
static inline char *tabs2spaces(char *s) {
    char *cursor;

    for(cursor = s; *cursor != '\0'; cursor++) {
        if(*cursor == '\t')
            *cursor = ' ';
    }

    return s;
}
1647
// Builds the "<line>@<path>/<filename>" string that identifies where a health
// definition was read from, and returns the strdupz() copy of it.
// The text is formatted into a FILENAME_MAX-limited local buffer first.
static inline char *health_source_file(size_t line, const char *path, const char *filename) {
    char source[FILENAME_MAX + 1];

    snprintfz(source, FILENAME_MAX, "%zu@%s/%s", line, path, filename);

    return strdupz(source);
}
1653
// Overwrites every single quote and double quote in the NUL-terminated
// string s with a space. The string is modified in place and keeps its length.
static inline void strip_quotes(char *s) {
    for(; *s != '\0'; s++) {
        switch(*s) {
            case '\'':
            case '"':
                *s = ' ';
                break;

            default:
                break;
        }
    }
}
1660
1661 int health_readfile(const char *path, const char *filename) {
1662     debug(D_HEALTH, "Health configuration reading file '%s/%s'", path, filename);
1663
1664     static uint32_t hash_alarm = 0, hash_template = 0, hash_on = 0, hash_calc = 0, hash_green = 0, hash_red = 0, hash_warn = 0, hash_crit = 0, hash_exec = 0, hash_every = 0, hash_lookup = 0, hash_units = 0, hash_info = 0, hash_recipient = 0, hash_delay = 0;
1665     char buffer[HEALTH_CONF_MAX_LINE + 1];
1666
1667     if(unlikely(!hash_alarm)) {
1668         hash_alarm = simple_uhash(HEALTH_ALARM_KEY);
1669         hash_template = simple_uhash(HEALTH_TEMPLATE_KEY);
1670         hash_on = simple_uhash(HEALTH_ON_KEY);
1671         hash_calc = simple_uhash(HEALTH_CALC_KEY);
1672         hash_lookup = simple_uhash(HEALTH_LOOKUP_KEY);
1673         hash_green = simple_uhash(HEALTH_GREEN_KEY);
1674         hash_red = simple_uhash(HEALTH_RED_KEY);
1675         hash_warn = simple_uhash(HEALTH_WARN_KEY);
1676         hash_crit = simple_uhash(HEALTH_CRIT_KEY);
1677         hash_exec = simple_uhash(HEALTH_EXEC_KEY);
1678         hash_every = simple_uhash(HEALTH_EVERY_KEY);
1679         hash_units = simple_hash(HEALTH_UNITS_KEY);
1680         hash_info = simple_hash(HEALTH_INFO_KEY);
1681         hash_recipient = simple_hash(HEALTH_RECIPIENT_KEY);
1682         hash_delay = simple_uhash(HEALTH_DELAY_KEY);
1683     }
1684
1685     snprintfz(buffer, HEALTH_CONF_MAX_LINE, "%s/%s", path, filename);
1686     FILE *fp = fopen(buffer, "r");
1687     if(!fp) {
1688         error("Health configuration cannot read file '%s'.", buffer);
1689         return 0;
1690     }
1691
1692     RRDCALC *rc = NULL;
1693     RRDCALCTEMPLATE *rt = NULL;
1694
1695     size_t line = 0, append = 0;
1696     char *s;
1697     while((s = fgets(&buffer[append], (int)(HEALTH_CONF_MAX_LINE - append), fp)) || append) {
1698         int stop_appending = !s;
1699         line++;
1700         s = trim(buffer);
1701         if(!s) continue;
1702
1703         append = strlen(s);
1704         if(!stop_appending && s[append - 1] == '\\') {
1705             s[append - 1] = ' ';
1706             append = &s[append] - buffer;
1707             if(append < HEALTH_CONF_MAX_LINE)
1708                 continue;
1709             else {
1710                 error("Health configuration has too long muli-line at line %zu of file '%s/%s'.", line, path, filename);
1711             }
1712         }
1713         append = 0;
1714
1715         char *key = s;
1716         while(*s && *s != ':') s++;
1717         if(!*s) {
1718             error("Health configuration has invalid line %zu of file '%s/%s'. It does not contain a ':'. Ignoring it.", line, path, filename);
1719             continue;
1720         }
1721         *s = '\0';
1722         s++;
1723
1724         char *value = s;
1725         key = trim(key);
1726         value = trim(value);
1727
1728         if(!key) {
1729             error("Health configuration has invalid line %zu of file '%s/%s'. Keyword is empty. Ignoring it.", line, path, filename);
1730             continue;
1731         }
1732
1733         if(!value) {
1734             error("Health configuration has invalid line %zu of file '%s/%s'. value is empty. Ignoring it.", line, path, filename);
1735             continue;
1736         }
1737
1738         uint32_t hash = simple_uhash(key);
1739
1740         if(hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) {
1741             if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
1742                 rrdcalc_free(&localhost, rc);
1743
1744             if(rt) {
1745                 if (!rrdcalctemplate_add_template_from_config(&localhost, rt))
1746                     rrdcalctemplate_free(&localhost, rt);
1747                 rt = NULL;
1748             }
1749
1750             rc = callocz(1, sizeof(RRDCALC));
1751             rc->next_event_id = 1;
1752             rc->name = tabs2spaces(strdupz(value));
1753             rc->hash = simple_hash(rc->name);
1754             rc->source = health_source_file(line, path, filename);
1755             rc->green = NAN;
1756             rc->red = NAN;
1757             rc->value = NAN;
1758             rc->old_value = NAN;
1759             rc->delay_multiplier = 1.0;
1760
1761             if(rrdvar_fix_name(rc->name))
1762                 error("Health configuration renamed alarm '%s' to '%s'", value, rc->name);
1763         }
1764         else if(hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY)) {
1765             if(rc) {
1766                 if(!rrdcalc_add_alarm_from_config(&localhost, rc))
1767                     rrdcalc_free(&localhost, rc);
1768                 rc = NULL;
1769             }
1770
1771             if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
1772                 rrdcalctemplate_free(&localhost, rt);
1773
1774             rt = callocz(1, sizeof(RRDCALCTEMPLATE));
1775             rt->name = tabs2spaces(strdupz(value));
1776             rt->hash_name = simple_hash(rt->name);
1777             rt->source = health_source_file(line, path, filename);
1778             rt->green = NAN;
1779             rt->red = NAN;
1780             rt->delay_multiplier = 1.0;
1781
1782             if(rrdvar_fix_name(rt->name))
1783                 error("Health configuration renamed template '%s' to '%s'", value, rt->name);
1784         }
1785         else if(rc) {
1786             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
1787                 if(rc->chart) {
1788                     if(strcmp(rc->chart, value))
1789                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1790                              line, path, filename, rc->name, key, rc->chart, value, value);
1791
1792                     freez(rc->chart);
1793                 }
1794                 rc->chart = tabs2spaces(strdupz(value));
1795                 rc->hash_chart = simple_hash(rc->chart);
1796             }
1797             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
1798                 health_parse_db_lookup(line, path, filename, value, &rc->group, &rc->after, &rc->before,
1799                                        &rc->update_every,
1800                                        &rc->options, &rc->dimensions);
1801             }
1802             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
1803                 if(!health_parse_duration(value, &rc->update_every))
1804                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.",
1805                          line, path, filename, rc->name, key, value);
1806             }
1807             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
1808                 char *e;
1809                 rc->green = strtold(value, &e);
1810                 if(e && *e) {
1811                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1812                          line, path, filename, rc->name, key, e);
1813                 }
1814             }
1815             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
1816                 char *e;
1817                 rc->red = strtold(value, &e);
1818                 if(e && *e) {
1819                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1820                          line, path, filename, rc->name, key, e);
1821                 }
1822             }
1823             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
1824                 const char *failed_at = NULL;
1825                 int error = 0;
1826                 rc->calculation = expression_parse(value, &failed_at, &error);
1827                 if(!rc->calculation) {
1828                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1829                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1830                 }
1831             }
1832             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
1833                 const char *failed_at = NULL;
1834                 int error = 0;
1835                 rc->warning = expression_parse(value, &failed_at, &error);
1836                 if(!rc->warning) {
1837                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1838                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1839                 }
1840             }
1841             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
1842                 const char *failed_at = NULL;
1843                 int error = 0;
1844                 rc->critical = expression_parse(value, &failed_at, &error);
1845                 if(!rc->critical) {
1846                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1847                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1848                 }
1849             }
1850             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
1851                 if(rc->exec) {
1852                     if(strcmp(rc->exec, value))
1853                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1854                              line, path, filename, rc->name, key, rc->exec, value, value);
1855
1856                     freez(rc->exec);
1857                 }
1858                 rc->exec = tabs2spaces(strdupz(value));
1859             }
1860             else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
1861                 if(rc->recipient) {
1862                     if(strcmp(rc->recipient, value))
1863                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1864                              line, path, filename, rc->name, key, rc->recipient, value, value);
1865
1866                     freez(rc->recipient);
1867                 }
1868                 rc->recipient = tabs2spaces(strdupz(value));
1869             }
1870             else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
1871                 if(rc->units) {
1872                     if(strcmp(rc->units, value))
1873                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1874                              line, path, filename, rc->name, key, rc->units, value, value);
1875
1876                     freez(rc->units);
1877                 }
1878                 rc->units = tabs2spaces(strdupz(value));
1879                 strip_quotes(rc->units);
1880             }
1881             else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
1882                 if(rc->info) {
1883                     if(strcmp(rc->info, value))
1884                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1885                              line, path, filename, rc->name, key, rc->info, value, value);
1886
1887                     freez(rc->info);
1888                 }
1889                 rc->info = tabs2spaces(strdupz(value));
1890                 strip_quotes(rc->info);
1891             }
1892             else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
1893                 health_parse_delay(line, path, filename, value, &rc->delay_up_duration, &rc->delay_down_duration, &rc->delay_max_duration, &rc->delay_multiplier);
1894             }
1895             else {
1896                 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has unknown key '%s'.",
1897                      line, path, filename, rc->name, key);
1898             }
1899         }
1900         else if(rt) {
1901             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
1902                 if(rt->context) {
1903                     if(strcmp(rt->context, value))
1904                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1905                              line, path, filename, rt->name, key, rt->context, value, value);
1906
1907                     freez(rt->context);
1908                 }
1909                 rt->context = tabs2spaces(strdupz(value));
1910                 rt->hash_context = simple_hash(rt->context);
1911             }
1912             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
1913                 health_parse_db_lookup(line, path, filename, value, &rt->group, &rt->after, &rt->before,
1914                                        &rt->update_every,
1915                                        &rt->options, &rt->dimensions);
1916             }
1917             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
1918                 if(!health_parse_duration(value, &rt->update_every))
1919                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' cannot parse duration: '%s'.",
1920                          line, path, filename, rt->name, key, value);
1921             }
1922             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
1923                 char *e;
1924                 rt->green = strtold(value, &e);
1925                 if(e && *e) {
1926                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
1927                          line, path, filename, rt->name, key, e);
1928                 }
1929             }
1930             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
1931                 char *e;
1932                 rt->red = strtold(value, &e);
1933                 if(e && *e) {
1934                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
1935                          line, path, filename, rt->name, key, e);
1936                 }
1937             }
1938             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
1939                 const char *failed_at = NULL;
1940                 int error = 0;
1941                 rt->calculation = expression_parse(value, &failed_at, &error);
1942                 if(!rt->calculation) {
1943                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1944                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1945                 }
1946             }
1947             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
1948                 const char *failed_at = NULL;
1949                 int error = 0;
1950                 rt->warning = expression_parse(value, &failed_at, &error);
1951                 if(!rt->warning) {
1952                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1953                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1954                 }
1955             }
1956             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
1957                 const char *failed_at = NULL;
1958                 int error = 0;
1959                 rt->critical = expression_parse(value, &failed_at, &error);
1960                 if(!rt->critical) {
1961                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1962                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1963                 }
1964             }
1965             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
1966                 if(rt->exec) {
1967                     if(strcmp(rt->exec, value))
1968                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1969                              line, path, filename, rt->name, key, rt->exec, value, value);
1970
1971                     freez(rt->exec);
1972                 }
1973                 rt->exec = tabs2spaces(strdupz(value));
1974             }
1975             else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
1976                 if(rt->recipient) {
1977                     if(strcmp(rt->recipient, value))
1978                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1979                              line, path, filename, rt->name, key, rt->recipient, value, value);
1980
1981                     freez(rt->recipient);
1982                 }
1983                 rt->recipient = tabs2spaces(strdupz(value));
1984             }
1985             else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
1986                 if(rt->units) {
1987                     if(strcmp(rt->units, value))
1988                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1989                              line, path, filename, rt->name, key, rt->units, value, value);
1990
1991                     freez(rt->units);
1992                 }
1993                 rt->units = tabs2spaces(strdupz(value));
1994                 strip_quotes(rt->units);
1995             }
1996             else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
1997                 if(rt->info) {
1998                     if(strcmp(rt->info, value))
1999                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2000                              line, path, filename, rt->name, key, rt->info, value, value);
2001
2002                     freez(rt->info);
2003                 }
2004                 rt->info = tabs2spaces(strdupz(value));
2005                 strip_quotes(rt->info);
2006             }
2007             else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
2008                 health_parse_delay(line, path, filename, value, &rt->delay_up_duration, &rt->delay_down_duration, &rt->delay_max_duration, &rt->delay_multiplier);
2009             }
2010             else {
2011                 error("Health configuration at line %zu of file '%s/%s' for template '%s' has unknown key '%s'.",
2012                       line, path, filename, rt->name, key);
2013             }
2014         }
2015         else {
2016             error("Health configuration at line %zu of file '%s/%s' has unknown key '%s'. Expected either '" HEALTH_ALARM_KEY "' or '" HEALTH_TEMPLATE_KEY "'.",
2017                   line, path, filename, key);
2018         }
2019     }
2020
2021     if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
2022         rrdcalc_free(&localhost, rc);
2023
2024     if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
2025         rrdcalctemplate_free(&localhost, rt);
2026
2027     fclose(fp);
2028     return 1;
2029 }
2030
2031 void health_readdir(const char *path) {
2032     size_t pathlen = strlen(path);
2033
2034     debug(D_HEALTH, "Health configuration reading directory '%s'", path);
2035
2036     DIR *dir = opendir(path);
2037     if (!dir) {
2038         error("Health configuration cannot open directory '%s'.", path);
2039         return;
2040     }
2041
2042     struct dirent *de = NULL;
2043     while ((de = readdir(dir))) {
2044         size_t len = strlen(de->d_name);
2045
2046         if(de->d_type == DT_DIR
2047            && (
2048                    (de->d_name[0] == '.' && de->d_name[1] == '\0')
2049                    || (de->d_name[0] == '.' && de->d_name[1] == '.' && de->d_name[2] == '\0')
2050            )) {
2051             debug(D_HEALTH, "Ignoring directory '%s'", de->d_name);
2052             continue;
2053         }
2054
2055         else if(de->d_type == DT_DIR) {
2056             char *s = mallocz(pathlen + strlen(de->d_name) + 2);
2057             strcpy(s, path);
2058             strcat(s, "/");
2059             strcat(s, de->d_name);
2060             health_readdir(s);
2061             freez(s);
2062             continue;
2063         }
2064
2065         else if((de->d_type == DT_LNK || de->d_type == DT_REG || de->d_type == DT_UNKNOWN) &&
2066                 len > 5 && !strcmp(&de->d_name[len - 5], ".conf")) {
2067             health_readfile(path, de->d_name);
2068         }
2069
2070         else debug(D_HEALTH, "Ignoring file '%s'", de->d_name);
2071     }
2072
2073     closedir(dir);
2074 }
2075
2076 static inline char *health_config_dir(void) {
2077     char buffer[FILENAME_MAX + 1];
2078     snprintfz(buffer, FILENAME_MAX, "%s/health.d", config_get("global", "config directory", CONFIG_DIR));
2079     return config_get("health", "health configuration directory", buffer);
2080 }
2081
2082 void health_init(void) {
2083     debug(D_HEALTH, "Health configuration initializing");
2084
2085     if(!(health_enabled = config_get_boolean("health", "enabled", 1))) {
2086         debug(D_HEALTH, "Health is disabled.");
2087         return;
2088     }
2089
2090     char *pathname = config_get("health", "health db directory", VARLIB_DIR "/health");
2091     if(mkdir(pathname, 0770) == -1 && errno != EEXIST)
2092         fatal("Cannot create directory '%s'.", pathname);
2093
2094     char filename[FILENAME_MAX + 1];
2095     snprintfz(filename, FILENAME_MAX, "%s/health-log.db", pathname);
2096     health.log_filename = config_get("health", "health db file", filename);
2097
2098     health_alarm_log_load(&localhost);
2099     health_alarm_log_open();
2100
2101     char *path = health_config_dir();
2102
2103     {
2104         char buffer[FILENAME_MAX + 1];
2105         snprintfz(buffer, FILENAME_MAX, "%s/alarm-notify.sh", config_get("global", "plugins directory", PLUGINS_DIR));
2106         health.health_default_exec = config_get("health", "script to execute on alarm", buffer);
2107     }
2108
2109     long n = config_get_number("health", "in memory max health log entries", (long)localhost.health_log.max);
2110     if(n < 10) {
2111         error("Health configuration has invalid max log entries %ld. Using default %u", n, localhost.health_log.max);
2112         config_set_number("health", "in memory max health log entries", (long)localhost.health_log.max);
2113     }
2114     else localhost.health_log.max = (unsigned int)n;
2115
2116     rrdhost_rwlock(&localhost);
2117     health_readdir(path);
2118     rrdhost_unlock(&localhost);
2119 }
2120
2121 // ----------------------------------------------------------------------------
2122 // JSON generation
2123
2124 static inline void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) {
2125     if(value && *value)
2126         buffer_sprintf(wb, "%s\"%s\":\"%s\"%s", prefix, label, value, suffix);
2127     else
2128         buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix);
2129 }
2130
2131 static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) {
2132     buffer_sprintf(wb, "\n\t{\n"
2133                            "\t\t\"hostname\": \"%s\",\n"
2134                            "\t\t\"unique_id\": %u,\n"
2135                            "\t\t\"alarm_id\": %u,\n"
2136                            "\t\t\"alarm_event_id\": %u,\n"
2137                            "\t\t\"name\": \"%s\",\n"
2138                            "\t\t\"chart\": \"%s\",\n"
2139                            "\t\t\"family\": \"%s\",\n"
2140                            "\t\t\"processed\": %s,\n"
2141                            "\t\t\"updated\": %s,\n"
2142                            "\t\t\"exec_run\": %lu,\n"
2143                            "\t\t\"exec_failed\": %s,\n"
2144                            "\t\t\"exec\": \"%s\",\n"
2145                            "\t\t\"recipient\": \"%s\",\n"
2146                            "\t\t\"exec_code\": %d,\n"
2147                            "\t\t\"source\": \"%s\",\n"
2148                            "\t\t\"units\": \"%s\",\n"
2149                            "\t\t\"info\": \"%s\",\n"
2150                            "\t\t\"when\": %lu,\n"
2151                            "\t\t\"duration\": %lu,\n"
2152                            "\t\t\"non_clear_duration\": %lu,\n"
2153                            "\t\t\"status\": \"%s\",\n"
2154                            "\t\t\"old_status\": \"%s\",\n"
2155                            "\t\t\"delay\": %d,\n"
2156                            "\t\t\"delay_up_to_timestamp\": %lu,\n"
2157                            "\t\t\"updated_by_id\": %u,\n"
2158                            "\t\t\"updates_id\": %u,\n",
2159                    host->hostname,
2160                    ae->unique_id,
2161                    ae->alarm_id,
2162                    ae->alarm_event_id,
2163                    ae->name,
2164                    ae->chart,
2165                    ae->family,
2166                    (ae->flags & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false",
2167                    (ae->flags & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false",
2168                    (unsigned long)ae->exec_run_timestamp,
2169                    (ae->flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false",
2170                    ae->exec?ae->exec:health.health_default_exec,
2171                    ae->recipient?ae->recipient:health.health_default_recipient,
2172                    ae->exec_code,
2173                    ae->source,
2174                    ae->units?ae->units:"",
2175                    ae->info?ae->info:"",
2176                    (unsigned long)ae->when,
2177                    (unsigned long)ae->duration,
2178                    (unsigned long)ae->non_clear_duration,
2179                    rrdcalc_status2string(ae->new_status),
2180                    rrdcalc_status2string(ae->old_status),
2181                    ae->delay,
2182                    (unsigned long)ae->delay_up_to_timestamp,
2183                    ae->updated_by_id,
2184                    ae->updates_id
2185     );
2186
2187     buffer_strcat(wb, "\t\t\"value\":");
2188     buffer_rrd_value(wb, ae->new_value);
2189     buffer_strcat(wb, ",\n");
2190
2191     buffer_strcat(wb, "\t\t\"old_value\":");
2192     buffer_rrd_value(wb, ae->old_value);
2193     buffer_strcat(wb, "\n");
2194
2195     buffer_strcat(wb, "\t}");
2196 }
2197
2198 void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after) {
2199     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
2200
2201     buffer_strcat(wb, "[");
2202
2203     unsigned int max = host->health_log.max;
2204     unsigned int count = 0;
2205     ALARM_ENTRY *ae;
2206     for(ae = host->health_log.alarms; ae && count < max ; count++, ae = ae->next) {
2207         if(ae->unique_id > after) {
2208             if(likely(count)) buffer_strcat(wb, ",");
2209             health_alarm_entry2json_nolock(wb, ae, host);
2210         }
2211     }
2212
2213     buffer_strcat(wb, "\n]\n");
2214
2215     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2216 }
2217
2218 static inline void health_rrdcalc2json_nolock(BUFFER *wb, RRDCALC *rc) {
2219     buffer_sprintf(wb,
2220            "\t\t\"%s.%s\": {\n"
2221                    "\t\t\t\"id\": %lu,\n"
2222                    "\t\t\t\"name\": \"%s\",\n"
2223                    "\t\t\t\"chart\": \"%s\",\n"
2224                    "\t\t\t\"family\": \"%s\",\n"
2225                    "\t\t\t\"active\": %s,\n"
2226                    "\t\t\t\"exec\": \"%s\",\n"
2227                    "\t\t\t\"recipient\": \"%s\",\n"
2228                    "\t\t\t\"source\": \"%s\",\n"
2229                    "\t\t\t\"units\": \"%s\",\n"
2230                    "\t\t\t\"info\": \"%s\",\n"
2231                                    "\t\t\t\"status\": \"%s\",\n"
2232                    "\t\t\t\"last_status_change\": %lu,\n"
2233                    "\t\t\t\"last_updated\": %lu,\n"
2234                    "\t\t\t\"next_update\": %lu,\n"
2235                    "\t\t\t\"update_every\": %d,\n"
2236                    "\t\t\t\"delay_up_duration\": %d,\n"
2237                    "\t\t\t\"delay_down_duration\": %d,\n"
2238                    "\t\t\t\"delay_max_duration\": %d,\n"
2239                    "\t\t\t\"delay_multiplier\": %f,\n"
2240                    "\t\t\t\"delay\": %d,\n"
2241                    "\t\t\t\"delay_up_to_timestamp\": %lu,\n"
2242             , rc->chart, rc->name
2243             , (unsigned long)rc->id
2244             , rc->name
2245             , rc->chart
2246             , (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:""
2247             , (rc->rrdset)?"true":"false"
2248             , rc->exec?rc->exec:health.health_default_exec
2249             , rc->recipient?rc->recipient:health.health_default_recipient
2250             , rc->source
2251             , rc->units?rc->units:""
2252             , rc->info?rc->info:""
2253             , rrdcalc_status2string(rc->status)
2254             , (unsigned long)rc->last_status_change
2255             , (unsigned long)rc->last_updated
2256             , (unsigned long)rc->next_update
2257             , rc->update_every
2258             , rc->delay_up_duration
2259             , rc->delay_down_duration
2260             , rc->delay_max_duration
2261             , rc->delay_multiplier
2262             , rc->delay_last
2263             , (unsigned long)rc->delay_up_to_timestamp
2264     );
2265
2266     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
2267         if(rc->dimensions && *rc->dimensions)
2268             health_string2json(wb, "\t\t\t", "lookup_dimensions", rc->dimensions, ",\n");
2269
2270         buffer_sprintf(wb,
2271                        "\t\t\t\"db_after\": %lu,\n"
2272                        "\t\t\t\"db_before\": %lu,\n"
2273                        "\t\t\t\"lookup_method\": \"%s\",\n"
2274                        "\t\t\t\"lookup_after\": %d,\n"
2275                        "\t\t\t\"lookup_before\": %d,\n"
2276                        "\t\t\t\"lookup_options\": \"",
2277                        (unsigned long) rc->db_after,
2278                        (unsigned long) rc->db_before,
2279                        group_method2string(rc->group),
2280                        rc->after,
2281                        rc->before
2282         );
2283         buffer_data_options2string(wb, rc->options);
2284         buffer_strcat(wb, "\",\n");
2285     }
2286
2287     if(rc->calculation) {
2288         health_string2json(wb, "\t\t\t", "calc", rc->calculation->source, ",\n");
2289         health_string2json(wb, "\t\t\t", "calc_parsed", rc->calculation->parsed_as, ",\n");
2290     }
2291
2292     if(rc->warning) {
2293         health_string2json(wb, "\t\t\t", "warn", rc->warning->source, ",\n");
2294         health_string2json(wb, "\t\t\t", "warn_parsed", rc->warning->parsed_as, ",\n");
2295     }
2296
2297     if(rc->critical) {
2298         health_string2json(wb, "\t\t\t", "crit", rc->critical->source, ",\n");
2299         health_string2json(wb, "\t\t\t", "crit_parsed", rc->critical->parsed_as, ",\n");
2300     }
2301
2302     buffer_strcat(wb, "\t\t\t\"green\":");
2303     buffer_rrd_value(wb, rc->green);
2304     buffer_strcat(wb, ",\n");
2305
2306     buffer_strcat(wb, "\t\t\t\"red\":");
2307     buffer_rrd_value(wb, rc->red);
2308     buffer_strcat(wb, ",\n");
2309
2310     buffer_strcat(wb, "\t\t\t\"value\":");
2311     buffer_rrd_value(wb, rc->value);
2312     buffer_strcat(wb, "\n");
2313
2314     buffer_strcat(wb, "\t\t}");
2315 }
2316
2317 //void health_rrdcalctemplate2json_nolock(BUFFER *wb, RRDCALCTEMPLATE *rt) {
2318 //
2319 //}
2320
2321 void health_alarms2json(RRDHOST *host, BUFFER *wb, int all) {
2322     int i;
2323
2324     rrdhost_rdlock(&localhost);
2325     buffer_sprintf(wb, "{\n\t\"hostname\": \"%s\","
2326                         "\n\t\"latest_alarm_log_unique_id\": %u,"
2327                         "\n\t\"status\": %s,"
2328                         "\n\t\"now\": %lu,"
2329                         "\n\t\"alarms\": {\n",
2330                         host->hostname,
2331                         (host->health_log.next_log_id > 0)?(host->health_log.next_log_id - 1):0,
2332                         health_enabled?"true":"false",
2333                         (unsigned long)time(NULL));
2334
2335     RRDCALC *rc;
2336     for(i = 0, rc = host->alarms; rc ; rc = rc->next) {
2337         if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
2338             continue;
2339
2340         if(likely(!all && !(rc->status == RRDCALC_STATUS_WARNING || rc->status == RRDCALC_STATUS_CRITICAL)))
2341             continue;
2342
2343         if(likely(i)) buffer_strcat(wb, ",\n");
2344         health_rrdcalc2json_nolock(wb, rc);
2345         i++;
2346     }
2347
2348 //    buffer_strcat(wb, "\n\t},\n\t\"templates\": {");
2349 //    RRDCALCTEMPLATE *rt;
2350 //    for(rt = host->templates; rt ; rt = rt->next)
2351 //        health_rrdcalctemplate2json_nolock(wb, rt);
2352
2353     buffer_strcat(wb, "\n\t}\n}\n");
2354     rrdhost_unlock(&localhost);
2355 }
2356
2357
2358 // ----------------------------------------------------------------------------
2359 // re-load health configuration
2360
2361 static inline void health_free_all_nolock(RRDHOST *host) {
2362     while(host->templates)
2363         rrdcalctemplate_free(host, host->templates);
2364
2365     while(host->alarms)
2366         rrdcalc_free(host, host->alarms);
2367 }
2368
2369 void health_reload(void) {
2370     if(!health_enabled) {
2371         error("Health reload is requested, but health is not enabled.");
2372         return;
2373     }
2374
2375     char *path = health_config_dir();
2376
2377     // free all running alarms
2378     rrdhost_rwlock(&localhost);
2379     health_free_all_nolock(&localhost);
2380     rrdhost_unlock(&localhost);
2381
2382     // invalidate all previous entries in the alarm log
2383     ALARM_ENTRY *t;
2384     for(t = localhost.health_log.alarms ; t ; t = t->next) {
2385         if(t->new_status != RRDCALC_STATUS_REMOVED)
2386             t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
2387     }
2388
2389     // reset all thresholds to all charts
2390     RRDSET *st;
2391     for(st = localhost.rrdset_root; st ; st = st->next) {
2392         st->green = NAN;
2393         st->red = NAN;
2394     }
2395
2396     // load the new alarms
2397     rrdhost_rwlock(&localhost);
2398     health_readdir(path);
2399     rrdhost_unlock(&localhost);
2400
2401     // link the loaded alarms to their charts
2402     for(st = localhost.rrdset_root; st ; st = st->next) {
2403         rrdhost_rwlock(&localhost);
2404
2405         rrdsetcalc_link_matching(st);
2406         rrdcalctemplate_link_matching(st);
2407
2408         rrdhost_unlock(&localhost);
2409     }
2410 }
2411
2412 // ----------------------------------------------------------------------------
2413 // health main thread and friends
2414
2415 static inline int rrdcalc_value2status(calculated_number n) {
2416     if(isnan(n)) return RRDCALC_STATUS_UNDEFINED;
2417     if(n) return RRDCALC_STATUS_RAISED;
2418     return RRDCALC_STATUS_CLEAR;
2419 }
2420
2421 static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
2422     ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
2423
2424     if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
2425         // do not send notifications for internal statuses
2426         goto done;
2427     }
2428
2429     // find the previous notification for the same alarm
2430     // which we have run the exec script
2431     ALARM_ENTRY *t;
2432     for(t = ae->next; t ;t = t->next) {
2433         if(t->alarm_id == ae->alarm_id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
2434             break;
2435     }
2436
2437     if(likely(t)) {
2438         // we have executed this alarm notification in the past
2439         if (t && t->new_status == ae->new_status) {
2440             // don't send the same notification again
2441             debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name,
2442                  rrdcalc_status2string(ae->new_status));
2443             goto done;
2444         }
2445     }
2446     else {
2447         // we have not executed this alarm notification in the past
2448         if(unlikely(ae->old_status == RRDCALC_STATUS_UNINITIALIZED && ae->new_status == RRDCALC_STATUS_CLEAR)) {
2449             debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
2450             goto done;
2451         }
2452     }
2453
2454     char buffer[FILENAME_MAX + 1];
2455     pid_t command_pid;
2456
2457     const char *exec = ae->exec;
2458     if(!exec) exec = health.health_default_exec;
2459
2460     const char *recipient = ae->recipient;
2461     if(!recipient) recipient = health.health_default_recipient;
2462
2463     snprintfz(buffer, FILENAME_MAX, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s'",
2464               exec,
2465               recipient,
2466               host->hostname,
2467               ae->unique_id,
2468               ae->alarm_id,
2469               ae->alarm_event_id,
2470               (unsigned long)ae->when,
2471               ae->name,
2472               ae->chart?ae->chart:"NOCAHRT",
2473               ae->family?ae->family:"NOFAMILY",
2474               rrdcalc_status2string(ae->new_status),
2475               rrdcalc_status2string(ae->old_status),
2476               ae->new_value,
2477               ae->old_value,
2478               ae->source?ae->source:"UNKNOWN",
2479               (uint32_t)ae->duration,
2480               (uint32_t)ae->non_clear_duration,
2481               ae->units?ae->units:"",
2482               ae->info?ae->info:""
2483     );
2484
2485     ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
2486     ae->exec_run_timestamp = time(NULL);
2487
2488     debug(D_HEALTH, "executing command '%s'", buffer);
2489     FILE *fp = mypopen(buffer, &command_pid);
2490     if(!fp) {
2491         error("HEALTH: Cannot popen(\"%s\", \"r\").", buffer);
2492         goto done;
2493     }
2494     debug(D_HEALTH, "HEALTH reading from command");
2495     char *s = fgets(buffer, FILENAME_MAX, fp);
2496     (void)s;
2497     ae->exec_code = mypclose(fp, command_pid);
2498     debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
2499
2500     if(ae->exec_code != 0)
2501         ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
2502
2503 done:
2504     health_alarm_log_save(host, ae);
2505     return;
2506 }
2507
2508 static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
2509     debug(D_HEALTH, "Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
2510          ae->chart?ae->chart:"NOCHART", ae->name,
2511          ae->new_value,
2512          rrdcalc_status2string(ae->old_status),
2513          rrdcalc_status2string(ae->new_status)
2514     );
2515
2516     health_alarm_execute(host, ae);
2517 }
2518
2519 static inline void health_alarm_log_process(RRDHOST *host) {
2520     static uint32_t stop_at_id = 0;
2521     uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
2522     time_t now = time(NULL);
2523
2524     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
2525
2526     ALARM_ENTRY *ae;
2527     for(ae = host->health_log.alarms; ae && ae->unique_id >= stop_at_id ; ae = ae->next) {
2528         if(unlikely(
2529             !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
2530             !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
2531             )) {
2532
2533             if(unlikely(ae->unique_id < first_waiting))
2534                 first_waiting = ae->unique_id;
2535
2536             if(likely(now >= ae->delay_up_to_timestamp))
2537                 health_process_notifications(host, ae);
2538         }
2539     }
2540
2541     // remember this for the next iteration
2542     stop_at_id = first_waiting;
2543
2544     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2545
2546     if(host->health_log.count <= host->health_log.max)
2547         return;
2548
2549     // cleanup excess entries in the log
2550     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
2551
2552     ALARM_ENTRY *last = NULL;
2553     unsigned int count = host->health_log.max * 2 / 3;
2554     for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
2555
2556     if(ae && last && last->next == ae)
2557         last->next = NULL;
2558     else
2559         ae = NULL;
2560
2561     while(ae) {
2562         debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
2563
2564         ALARM_ENTRY *t = ae->next;
2565
2566         freez(ae->name);
2567         freez(ae->chart);
2568         freez(ae->family);
2569         freez(ae->exec);
2570         freez(ae->recipient);
2571         freez(ae->source);
2572         freez(ae->units);
2573         freez(ae->info);
2574         freez(ae);
2575
2576         ae = t;
2577         host->health_log.count--;
2578     }
2579
2580     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2581 }
2582
2583 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
2584     if(unlikely(!rc->rrdset)) {
2585         debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
2586         return 0;
2587     }
2588
2589     if(unlikely(rc->next_update > now)) {
2590         if (unlikely(*next_run > rc->next_update)) {
2591             // update the next_run time of the main loop
2592             // to run this alarm precisely the time required
2593             *next_run = rc->next_update;
2594         }
2595
2596         debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
2597         return 0;
2598     }
2599
2600     if(unlikely(!rc->update_every)) {
2601         debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
2602         return 0;
2603     }
2604
2605     if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
2606         debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
2607         return 0;
2608     }
2609
2610     int update_every = rc->rrdset->update_every;
2611     time_t first = rrdset_first_entry_t(rc->rrdset);
2612     time_t last = rrdset_last_entry_t(rc->rrdset);
2613
2614     if(unlikely(now + update_every < first /* || now - update_every > last */)) {
2615         debug(D_HEALTH
2616               , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
2617               , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first
2618               , (unsigned long) last);
2619         return 0;
2620     }
2621
2622     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
2623         time_t needed = now + rc->before + rc->after;
2624
2625         if(needed + update_every < first || needed - update_every > last) {
2626             debug(D_HEALTH
2627                   , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
2628                   , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first
2629                   , (unsigned long) last);
2630             return 0;
2631         }
2632     }
2633
2634     return 1;
2635 }
2636
2637 void *health_main(void *ptr) {
2638     (void)ptr;
2639
2640     info("HEALTH thread created with task id %d", gettid());
2641
2642     if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
2643         error("Cannot set pthread cancel type to DEFERRED.");
2644
2645     if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
2646         error("Cannot set pthread cancel state to ENABLE.");
2647
2648     int min_run_every = (int)config_get_number("health", "run at least every seconds", 10);
2649     if(min_run_every < 1) min_run_every = 1;
2650
2651     BUFFER *wb = buffer_create(100);
2652
2653     unsigned int loop = 0;
2654     while(health_enabled && !netdata_exit) {
2655         loop++;
2656         debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
2657
2658         int oldstate, runnable = 0;
2659         time_t now = time(NULL);
2660         time_t next_run = now + min_run_every;
2661         RRDCALC *rc;
2662
2663         if(unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
2664             error("Cannot set pthread cancel state to DISABLE.");
2665
2666         rrdhost_rdlock(&localhost);
2667
2668         // the first loop is to lookup values from the db
2669         for(rc = localhost.alarms; rc; rc = rc->next) {
2670             if(unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
2671                 if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))
2672                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE;
2673                 continue;
2674             }
2675
2676             runnable++;
2677             rc->old_value = rc->value;
2678             rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE;
2679
2680             // 1. if there is database lookup, do it
2681             // 2. if there is calculation expression, run it
2682
2683             if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
2684                 /* time_t old_db_timestamp = rc->db_before; */
2685                 int value_is_null = 0;
2686
2687                 int ret = rrd2value(rc->rrdset, wb, &rc->value,
2688                                     rc->dimensions, 1, rc->after, rc->before, rc->group,
2689                                     rc->options, &rc->db_after, &rc->db_before, &value_is_null);
2690
2691                 if (unlikely(ret != 200)) {
2692                     // database lookup failed
2693                     rc->value = NAN;
2694
2695                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
2696
2697                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) {
2698                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
2699                         error("Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
2700                     }
2701                 }
2702                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))
2703                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
2704
2705                 /* - RRDCALC_FLAG_DB_STALE not currently used
2706                 if (unlikely(old_db_timestamp == rc->db_before)) {
2707                     // database is stale
2708
2709                     debug(D_HEALTH, "Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
2710
2711                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
2712                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
2713                         error("Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
2714                     }
2715                 }
2716                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
2717                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
2718                 */
2719
2720                 if (unlikely(value_is_null)) {
2721                     // collected value is null
2722
2723                     rc->value = NAN;
2724
2725                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
2726                           rc->chart?rc->chart:"NOCHART", rc->name);
2727
2728                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) {
2729                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
2730                         error("Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
2731                               rc->chart?rc->chart:"NOCHART", rc->name);
2732                     }
2733                 }
2734                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))
2735                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
2736
2737                 debug(D_HEALTH, "Health alarm '%s.%s': database lookup gave value "
2738                         CALCULATED_NUMBER_FORMAT, rc->chart?rc->chart:"NOCHART", rc->name, rc->value);
2739             }
2740
2741             if(unlikely(rc->calculation)) {
2742                 if (unlikely(!expression_evaluate(rc->calculation))) {
2743                     // calculation failed
2744
2745                     rc->value = NAN;
2746
2747                     debug(D_HEALTH, "Health alarm '%s.%s': expression '%s' failed: %s",
2748                           rc->chart?rc->chart:"NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
2749
2750                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) {
2751                         rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
2752                         error("Health alarm '%s.%s': expression '%s' failed: %s",
2753                               rc->chart?rc->chart:"NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
2754                     }
2755                 }
2756                 else {
2757                     if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))
2758                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
2759
2760                     debug(D_HEALTH, "Health alarm '%s.%s': expression '%s' gave value "
2761                             CALCULATED_NUMBER_FORMAT
2762                             ": %s (source: %s)",
2763                           rc->chart?rc->chart:"NOCHART", rc->name,
2764                           rc->calculation->parsed_as,
2765                           rc->calculation->result,
2766                           buffer_tostring(rc->calculation->error_msg),
2767                           rc->source
2768                     );
2769
2770                     rc->value = rc->calculation->result;
2771                 }
2772             }
2773         }
2774         rrdhost_unlock(&localhost);
2775
2776         if(unlikely(runnable && !netdata_exit)) {
2777             rrdhost_rdlock(&localhost);
2778
2779             for(rc = localhost.alarms; rc; rc = rc->next) {
2780                 if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
2781                     continue;
2782
2783                 int warning_status  = RRDCALC_STATUS_UNDEFINED;
2784                 int critical_status = RRDCALC_STATUS_UNDEFINED;
2785
2786                 if(likely(rc->warning)) {
2787                     if(unlikely(!expression_evaluate(rc->warning))) {
2788                         // calculation failed
2789
2790                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression failed with error: %s",
2791                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
2792
2793                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) {
2794                             rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
2795                             error("Health alarm '%s.%s': warning expression failed with error: %s",
2796                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
2797                         }
2798                     }
2799                     else {
2800                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))
2801                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
2802
2803                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression gave value "
2804                                 CALCULATED_NUMBER_FORMAT
2805                                 ": %s (source: %s)",
2806                               rc->chart?rc->chart:"NOCHART", rc->name,
2807                               rc->warning->result,
2808                               buffer_tostring(rc->warning->error_msg),
2809                               rc->source
2810                         );
2811
2812                         warning_status = rrdcalc_value2status(rc->warning->result);
2813                     }
2814                 }
2815
2816                 if(likely(rc->critical)) {
2817                     if(unlikely(!expression_evaluate(rc->critical))) {
2818                         // calculation failed
2819
2820                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression failed with error: %s",
2821                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
2822
2823                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) {
2824                             rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
2825                             error("Health alarm '%s.%s': critical expression failed with error: %s",
2826                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
2827                         }
2828                     }
2829                     else {
2830                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))
2831                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
2832
2833                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression gave value "
2834                                 CALCULATED_NUMBER_FORMAT
2835                                 ": %s (source: %s)",
2836                               rc->chart?rc->chart:"NOCHART", rc->name,
2837                               rc->critical->result,
2838                               buffer_tostring(rc->critical->error_msg),
2839                               rc->source
2840                         );
2841
2842                         critical_status = rrdcalc_value2status(rc->critical->result);
2843                     }
2844                 }
2845
2846                 int status = RRDCALC_STATUS_UNDEFINED;
2847
2848                 switch(warning_status) {
2849                     case RRDCALC_STATUS_CLEAR:
2850                         status = RRDCALC_STATUS_CLEAR;
2851                         break;
2852
2853                     case RRDCALC_STATUS_RAISED:
2854                         status = RRDCALC_STATUS_WARNING;
2855                         break;
2856
2857                     default:
2858                         break;
2859                 }
2860
2861                 switch(critical_status) {
2862                     case RRDCALC_STATUS_CLEAR:
2863                         if(status == RRDCALC_STATUS_UNDEFINED)
2864                             status = RRDCALC_STATUS_CLEAR;
2865                         break;
2866
2867                     case RRDCALC_STATUS_RAISED:
2868                         status = RRDCALC_STATUS_CRITICAL;
2869                         break;
2870
2871                     default:
2872                         break;
2873                 }
2874
2875                 if(status != rc->status) {
2876                     int delay = 0;
2877
2878                     if(now > rc->delay_up_to_timestamp) {
2879                         rc->delay_up_current = rc->delay_up_duration;
2880                         rc->delay_down_current = rc->delay_down_duration;
2881                         rc->delay_last = 0;
2882                         rc->delay_up_to_timestamp = 0;
2883                     }
2884                     else {
2885                         rc->delay_up_current = (int)(rc->delay_up_current * rc->delay_multiplier);
2886                         if(rc->delay_up_current > rc->delay_max_duration) rc->delay_up_current = rc->delay_max_duration;
2887
2888                         rc->delay_down_current = (int)(rc->delay_down_current * rc->delay_multiplier);
2889                         if(rc->delay_down_current > rc->delay_max_duration) rc->delay_down_current = rc->delay_max_duration;
2890                     }
2891
2892                     if(status > rc->status)
2893                         delay = rc->delay_up_current;
2894                     else
2895                         delay = rc->delay_down_current;
2896
2897                     // COMMENTED: because we do need to send raising alarms
2898                     // if(now + delay < rc->delay_up_to_timestamp)
2899                     //    delay = (int)(rc->delay_up_to_timestamp - now);
2900
2901                     rc->delay_last = delay;
2902                     rc->delay_up_to_timestamp = now + delay;
2903                     health_alarm_log(&localhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, rc->delay_last);
2904                     rc->last_status_change = now;
2905                     rc->status = status;
2906                 }
2907
2908                 rc->last_updated = now;
2909                 rc->next_update = now + rc->update_every;
2910
2911                 if (next_run > rc->next_update)
2912                     next_run = rc->next_update;
2913             }
2914
2915             rrdhost_unlock(&localhost);
2916         }
2917
2918         if (unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
2919             error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
2920
2921         if(unlikely(netdata_exit))
2922             break;
2923
2924         // execute notifications
2925         // and cleanup
2926         health_alarm_log_process(&localhost);
2927
2928         if(unlikely(netdata_exit))
2929             break;
2930         
2931         now = time(NULL);
2932         if(now < next_run) {
2933             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs",
2934                   loop, (int) (next_run - now));
2935             sleep_usec(1000000 * (unsigned long long) (next_run - now));
2936         }
2937         else {
2938             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
2939         }
2940     }
2941
2942     buffer_free(wb);
2943
2944     info("HEALTH thread exiting");
2945     pthread_exit(NULL);
2946     return NULL;
2947 }