]> arthur.barton.de Git - netdata.git/blob - src/health.c
minor optimization on alarms parsing
[netdata.git] / src / health.c
1 #include "common.h"
2
3 #define RRDVAR_MAX_LENGTH 1024
4
5 struct health_options {
6     const char *health_default_exec;
7     const char *health_default_recipient;
8     const char *log_filename;
9     size_t log_entries_written;
10     FILE *log_fp;
11 };
12
13 static struct health_options health = {
14     .health_default_exec = PLUGINS_DIR "/alarm-notify.sh",
15     .health_default_recipient = "root",
16     .log_filename = VARLIB_DIR "/health/alarm_log.db",
17     .log_entries_written = 0,
18     .log_fp = NULL
19 };
20
21 int health_enabled = 1;
22
23 // ----------------------------------------------------------------------------
24 // health alarm log load/save
25 // no need for locking - only one thread is reading / writing the alarms log
26
27 static inline int health_alarm_log_open(void) {
28     if(health.log_fp)
29         fclose(health.log_fp);
30
31     health.log_fp = fopen(health.log_filename, "a");
32
33     if(health.log_fp) {
34         if (setvbuf(health.log_fp, NULL, _IOLBF, 0) != 0)
35             error("Health: cannot set line buffering on health log file.");
36         return 0;
37     }
38
39     error("Health: cannot open health log file '%s'. Health data will be lost in case of netdata or server crash.", health.log_filename);
40     return -1;
41 }
42
43 static inline void health_alarm_log_close(void) {
44     if(health.log_fp) {
45         fclose(health.log_fp);
46         health.log_fp = NULL;
47     }
48 }
49
50 static inline void health_log_rotate(void) {
51     static size_t rotate_every = 0;
52
53     if(unlikely(rotate_every == 0)) {
54         rotate_every = (size_t)config_get_number("health", "rotate log every lines", 2000);
55         if(rotate_every < 100) rotate_every = 100;
56     }
57
58     if(unlikely(health.log_entries_written > rotate_every)) {
59         health_alarm_log_close();
60
61         char old_filename[FILENAME_MAX + 1];
62         snprintfz(old_filename, FILENAME_MAX, "%s.old", health.log_filename);
63
64         if(unlink(old_filename) == -1 && errno != ENOENT)
65             error("Health: cannot remove old alarms log file '%s'", old_filename);
66
67         if(link(health.log_filename, old_filename) == -1 && errno != ENOENT)
68             error("Health: cannot move file '%s' to '%s'.", health.log_filename, old_filename);
69
70         if(unlink(health.log_filename) == -1 && errno != ENOENT)
71             error("Health: cannot remove old alarms log file '%s'", health.log_filename);
72
73         // open it with truncate
74         health.log_fp = fopen(health.log_filename, "w");
75
76         if(health.log_fp)
77             fclose(health.log_fp);
78         else
79             error("Health: cannot truncate health log '%s'", health.log_filename);
80
81         health.log_fp = NULL;
82
83         health.log_entries_written = 0;
84         health_alarm_log_open();
85     }
86 }
87
88 static inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
89     health_log_rotate();
90
91     if(likely(health.log_fp)) {
92         if(unlikely(fprintf(health.log_fp
93                 , "%c\t%s"
94                   "\t%08x\t%08x\t%08x\t%08x\t%08x"
95                   "\t%08x\t%08x\t%08x"
96                   "\t%08x\t%08x\t%08x"
97                   "\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s"
98                   "\t%d\t%d\t%d\t%d"
99                   "\t%Lf\t%Lf"
100                   "\n"
101                 , (ae->flags & HEALTH_ENTRY_FLAG_SAVED)?'U':'A'
102                 , host->hostname
103
104                 , ae->unique_id
105                 , ae->alarm_id
106                 , ae->alarm_event_id
107                 , ae->updated_by_id
108                 , ae->updates_id
109
110                 , (uint32_t)ae->when
111                 , (uint32_t)ae->duration
112                 , (uint32_t)ae->non_clear_duration
113                 , (uint32_t)ae->flags
114                 , (uint32_t)ae->exec_run_timestamp
115                 , (uint32_t)ae->delay_up_to_timestamp
116
117                 , (ae->name)?ae->name:""
118                 , (ae->chart)?ae->chart:""
119                 , (ae->family)?ae->family:""
120                 , (ae->exec)?ae->exec:""
121                 , (ae->recipient)?ae->recipient:""
122                 , (ae->source)?ae->source:""
123                 , (ae->units)?ae->units:""
124                 , (ae->info)?ae->info:""
125
126                 , ae->exec_code
127                 , ae->new_status
128                 , ae->old_status
129                 , ae->delay
130
131                 , (long double)ae->new_value
132                 , (long double)ae->old_value
133         ) < 0))
134             error("Health: failed to save alarm log entry. Health data may be lost in case of abnormal restart.");
135         else {
136             ae->flags |= HEALTH_ENTRY_FLAG_SAVED;
137             health.log_entries_written++;
138         }
139     }
140 }
141
142 static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filename) {
143     static uint32_t max_unique_id = 0, max_alarm_id = 0;
144
145     errno = 0;
146
147     char *s, *buf = mallocz(65536 + 1);
148     size_t line = 0, len = 0;
149     ssize_t loaded = 0, updated = 0, errored = 0, duplicate = 0;
150
151     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
152
153     while((s = fgets_trim_len(buf, 65536, fp, &len))) {
154         health.log_entries_written++;
155         line++;
156
157         int max_entries = 30, entries = 0;
158         char *pointers[max_entries];
159
160         pointers[entries++] = s++;
161         while(*s) {
162             if(unlikely(*s == '\t')) {
163                 *s = '\0';
164                 pointers[entries++] = ++s;
165                 if(entries >= max_entries) {
166                     error("Health: line %zu of file '%s' has more than %d entries. Ignoring excessive entries.", line, filename, max_entries);
167                     break;
168                 }
169             }
170             else s++;
171         }
172
173         if(likely(*pointers[0] == 'U' || *pointers[0] == 'A')) {
174             ALARM_ENTRY *ae = NULL;
175
176             if(entries < 26) {
177                 error("Health: line %zu of file '%s' should have at least 26 entries, but it has %d. Ignoring it.", line, filename, entries);
178                 errored++;
179                 continue;
180             }
181
182             // check that we have valid ids
183             uint32_t unique_id = (uint32_t)strtoul(pointers[2], NULL, 16);
184             if(!unique_id) {
185                 error("Health: line %zu of file '%s' states alarm entry with invalid unique id %u (%s). Ignoring it.", line, filename, unique_id, pointers[2]);
186                 errored++;
187                 continue;
188             }
189
190             uint32_t alarm_id = (uint32_t)strtoul(pointers[3], NULL, 16);
191             if(!alarm_id) {
192                 error("Health: line %zu of file '%s' states alarm entry for invalid alarm id %u (%s). Ignoring it.", line, filename, alarm_id, pointers[3]);
193                 errored++;
194                 continue;
195             }
196
197             if(unlikely(*pointers[0] == 'A')) {
198                 // make sure it is properly numbered
199                 if(unlikely(host->health_log.alarms && unique_id < host->health_log.alarms->unique_id)) {
200                     error("Health: line %zu of file '%s' has alarm log entry with %u in wrong order. Ignoring it.", line, filename, unique_id);
201                     errored++;
202                     continue;
203                 }
204
205                 ae = callocz(1, sizeof(ALARM_ENTRY));
206             }
207             else if(unlikely(*pointers[0] == 'U')) {
208                 // find the original
209                 for(ae = host->health_log.alarms; ae; ae = ae->next) {
210                     if(unlikely(unique_id == ae->unique_id)) {
211                         if(unlikely(*pointers[0] == 'A')) {
212                             error("Health: line %zu of file '%s' adds duplicate alarm log entry with unique id %u. Using the later."
213                                   , line, filename, unique_id);
214                             *pointers[0] = 'U';
215                             duplicate++;
216                         }
217                         break;
218                     }
219                     else if(unlikely(unique_id > ae->unique_id)) {
220                         // no need to continue
221                         // the linked list is sorted
222                         ae = NULL;
223                         break;
224                     }
225                 }
226
227                 // if not found, skip this line
228                 if(!ae) {
229                     // error("Health: line %zu of file '%s' updates alarm log entry with unique id %u, but it is not found.", line, filename, unique_id);
230                     continue;
231                 }
232             }
233
234             // check for a possible host missmatch
235             //if(strcmp(pointers[1], host->hostname))
236             //    error("Health: line %zu of file '%s' provides an alarm for host '%s' but this is named '%s'.", line, filename, pointers[1], host->hostname);
237
238             ae->unique_id               = unique_id;
239             ae->alarm_id                = alarm_id;
240             ae->alarm_event_id          = (uint32_t)strtoul(pointers[4], NULL, 16);
241             ae->updated_by_id           = (uint32_t)strtoul(pointers[5], NULL, 16);
242             ae->updates_id              = (uint32_t)strtoul(pointers[6], NULL, 16);
243
244             ae->when                    = (uint32_t)strtoul(pointers[7], NULL, 16);
245             ae->duration                = (uint32_t)strtoul(pointers[8], NULL, 16);
246             ae->non_clear_duration      = (uint32_t)strtoul(pointers[9], NULL, 16);
247
248             ae->flags                   = (uint32_t)strtoul(pointers[10], NULL, 16);
249             ae->flags |= HEALTH_ENTRY_FLAG_SAVED;
250
251             ae->exec_run_timestamp      = (uint32_t)strtoul(pointers[11], NULL, 16);
252             ae->delay_up_to_timestamp   = (uint32_t)strtoul(pointers[12], NULL, 16);
253
254             freez(ae->name);
255             ae->name = strdupz(pointers[13]);
256             ae->hash_name = simple_hash(ae->name);
257
258             freez(ae->chart);
259             ae->chart = strdupz(pointers[14]);
260             ae->hash_chart = simple_hash(ae->chart);
261
262             freez(ae->family);
263             ae->family = strdupz(pointers[15]);
264
265             freez(ae->exec);
266             ae->exec = strdupz(pointers[16]);
267             if(!*ae->exec) { freez(ae->exec); ae->exec = NULL; }
268
269             freez(ae->recipient);
270             ae->recipient = strdupz(pointers[17]);
271             if(!*ae->recipient) { freez(ae->recipient); ae->recipient = NULL; }
272
273             freez(ae->source);
274             ae->source = strdupz(pointers[18]);
275             if(!*ae->source) { freez(ae->source); ae->source = NULL; }
276
277             freez(ae->units);
278             ae->units = strdupz(pointers[19]);
279             if(!*ae->units) { freez(ae->units); ae->units = NULL; }
280
281             freez(ae->info);
282             ae->info = strdupz(pointers[20]);
283             if(!*ae->info) { freez(ae->info); ae->info = NULL; }
284
285             ae->exec_code   = str2i(pointers[21]);
286             ae->new_status  = str2i(pointers[22]);
287             ae->old_status  = str2i(pointers[23]);
288             ae->delay       = str2i(pointers[24]);
289
290             ae->new_value   = str2l(pointers[25]);
291             ae->old_value   = str2l(pointers[26]);
292
293             static char value_string[100 + 1];
294             freez(ae->old_value_string);
295             freez(ae->new_value_string);
296             ae->old_value_string = strdupz(format_value_and_unit(value_string, 100, ae->old_value, ae->units, -1));
297             ae->new_value_string = strdupz(format_value_and_unit(value_string, 100, ae->new_value, ae->units, -1));
298
299             // add it to host if not already there
300             if(unlikely(*pointers[0] == 'A')) {
301                 ae->next = host->health_log.alarms;
302                 host->health_log.alarms = ae;
303                 loaded++;
304             }
305             else updated++;
306
307             if(unlikely(ae->unique_id > max_unique_id))
308                 max_unique_id = ae->unique_id;
309
310             if(unlikely(ae->alarm_id >= max_alarm_id))
311                 max_alarm_id = ae->alarm_id;
312         }
313         else {
314             error("Health: line %zu of file '%s' is invalid (unrecognized entry type '%s').", line, filename, pointers[0]);
315             errored++;
316         }
317     }
318
319     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
320
321     freez(buf);
322
323     if(!max_unique_id) max_unique_id = (uint32_t)now_realtime_sec();
324     if(!max_alarm_id)  max_alarm_id  = (uint32_t)now_realtime_sec();
325
326     host->health_log.next_log_id = max_unique_id + 1;
327     host->health_log.next_alarm_id = max_alarm_id + 1;
328
329     debug(D_HEALTH, "Health: loaded file '%s' with %zd new alarm entries, updated %zd alarms, errors %zd entries, duplicate %zd", filename, loaded, updated, errored, duplicate);
330     return loaded;
331 }
332
333 static inline void health_alarm_log_load(RRDHOST *host) {
334     health_alarm_log_close();
335
336     char filename[FILENAME_MAX + 1];
337     snprintfz(filename, FILENAME_MAX, "%s.old", health.log_filename);
338     FILE *fp = fopen(filename, "r");
339     if(!fp)
340         error("Health: cannot open health file: %s", filename);
341     else {
342         health_alarm_log_read(host, fp, filename);
343         fclose(fp);
344     }
345
346     health.log_entries_written = 0;
347     fp = fopen(health.log_filename, "r");
348     if(!fp)
349         error("Health: cannot open health file: %s", health.log_filename);
350     else {
351         health_alarm_log_read(host, fp, health.log_filename);
352         fclose(fp);
353     }
354
355     health_alarm_log_open();
356 }
357
358
359 // ----------------------------------------------------------------------------
360 // health alarm log management
361
362 static inline void health_alarm_log(
363         RRDHOST *host,
364         uint32_t alarm_id,
365         uint32_t alarm_event_id,
366         time_t when,
367         const char *name,
368         const char *chart,
369         const char *family,
370         const char *exec,
371         const char *recipient,
372         time_t duration,
373         calculated_number old_value,
374         calculated_number new_value,
375         int old_status,
376         int new_status,
377         const char *source,
378         const char *units,
379         const char *info,
380         int delay,
381         uint32_t flags
382 ) {
383     debug(D_HEALTH, "Health adding alarm log entry with id: %u", host->health_log.next_log_id);
384
385     ALARM_ENTRY *ae = callocz(1, sizeof(ALARM_ENTRY));
386     ae->name = strdupz(name);
387     ae->hash_name = simple_hash(ae->name);
388
389     if(chart) {
390         ae->chart = strdupz(chart);
391         ae->hash_chart = simple_hash(ae->chart);
392     }
393
394     if(family)
395         ae->family = strdupz(family);
396
397     if(exec) ae->exec = strdupz(exec);
398     if(recipient) ae->recipient = strdupz(recipient);
399     if(source) ae->source = strdupz(source);
400     if(units) ae->units = strdupz(units);
401     if(info) ae->info = strdupz(info);
402
403     ae->unique_id = host->health_log.next_log_id++;
404     ae->alarm_id = alarm_id;
405     ae->alarm_event_id = alarm_event_id;
406     ae->when = when;
407     ae->old_value = old_value;
408     ae->new_value = new_value;
409
410     static char value_string[100 + 1];
411     ae->old_value_string = strdupz(format_value_and_unit(value_string, 100, ae->old_value, ae->units, -1));
412     ae->new_value_string = strdupz(format_value_and_unit(value_string, 100, ae->new_value, ae->units, -1));
413
414     ae->old_status = old_status;
415     ae->new_status = new_status;
416     ae->duration = duration;
417     ae->delay = delay;
418     ae->delay_up_to_timestamp = when + delay;
419
420     ae->flags |= flags;
421
422     if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL)
423         ae->non_clear_duration += ae->duration;
424
425     // link it
426     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
427     ae->next = host->health_log.alarms;
428     host->health_log.alarms = ae;
429     host->health_log.count++;
430     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
431
432     // match previous alarms
433     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
434     ALARM_ENTRY *t;
435     for(t = host->health_log.alarms ; t ; t = t->next) {
436         if(t != ae && t->alarm_id == ae->alarm_id) {
437             if(!(t->flags & HEALTH_ENTRY_FLAG_UPDATED) && !t->updated_by_id) {
438                 t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
439                 t->updated_by_id = ae->unique_id;
440                 ae->updates_id = t->unique_id;
441
442                 if((t->new_status == RRDCALC_STATUS_WARNING || t->new_status == RRDCALC_STATUS_CRITICAL) &&
443                    (t->old_status == RRDCALC_STATUS_WARNING || t->old_status == RRDCALC_STATUS_CRITICAL))
444                     ae->non_clear_duration += t->non_clear_duration;
445
446                 health_alarm_log_save(host, t);
447             }
448
449             // no need to continue
450             break;
451         }
452     }
453     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
454
455     health_alarm_log_save(host, ae);
456 }
457
458 // ----------------------------------------------------------------------------
459 // RRDVAR management
460
// Sanitize a variable name in place: every character that is not
// alphanumeric, '.' or '_' is replaced with '_'.
// Returns the number of characters replaced.
static inline int rrdvar_fix_name(char *variable) {
    int fixed = 0;

    for(; *variable; variable++) {
        // isalnum() is only defined for values representable as unsigned char
        // (or EOF), so a plain char must not be passed to it directly
        unsigned char c = (unsigned char)*variable;

        if(!isalnum(c) && c != '.' && c != '_') {
            *variable = '_';
            fixed++;
        }
    }

    return fixed;
}
474
475 int rrdvar_compare(void* a, void* b) {
476     if(((RRDVAR *)a)->hash < ((RRDVAR *)b)->hash) return -1;
477     else if(((RRDVAR *)a)->hash > ((RRDVAR *)b)->hash) return 1;
478     else return strcmp(((RRDVAR *)a)->name, ((RRDVAR *)b)->name);
479 }
480
481 static inline RRDVAR *rrdvar_index_add(avl_tree_lock *tree, RRDVAR *rv) {
482     RRDVAR *ret = (RRDVAR *)avl_insert_lock(tree, (avl *)(rv));
483     if(ret != rv)
484         debug(D_VARIABLES, "Request to insert RRDVAR '%s' into index failed. Already exists.", rv->name);
485
486     return ret;
487 }
488
489 static inline RRDVAR *rrdvar_index_del(avl_tree_lock *tree, RRDVAR *rv) {
490     RRDVAR *ret = (RRDVAR *)avl_remove_lock(tree, (avl *)(rv));
491     if(!ret)
492         error("Request to remove RRDVAR '%s' from index failed. Not Found.", rv->name);
493
494     return ret;
495 }
496
497 static inline RRDVAR *rrdvar_index_find(avl_tree_lock *tree, const char *name, uint32_t hash) {
498     RRDVAR tmp;
499     tmp.name = (char *)name;
500     tmp.hash = (hash)?hash:simple_hash(tmp.name);
501
502     return (RRDVAR *)avl_search_lock(tree, (avl *)&tmp);
503 }
504
505 static inline void rrdvar_free(RRDHOST *host, avl_tree_lock *tree, RRDVAR *rv) {
506     (void)host;
507
508     if(!rv) return;
509
510     if(tree) {
511         debug(D_VARIABLES, "Deleting variable '%s'", rv->name);
512         if(unlikely(!rrdvar_index_del(tree, rv)))
513             error("Attempted to delete variable '%s' from host '%s', but it is not found.", rv->name, host->hostname);
514     }
515
516     freez(rv->name);
517     freez(rv);
518 }
519
520 static inline RRDVAR *rrdvar_create_and_index(const char *scope, avl_tree_lock *tree, const char *name, int type, void *value) {
521     char *variable = strdupz(name);
522     rrdvar_fix_name(variable);
523     uint32_t hash = simple_hash(variable);
524
525     RRDVAR *rv = rrdvar_index_find(tree, variable, hash);
526     if(unlikely(!rv)) {
527         debug(D_VARIABLES, "Variable '%s' not found in scope '%s'. Creating a new one.", variable, scope);
528
529         rv = callocz(1, sizeof(RRDVAR));
530         rv->name = variable;
531         rv->hash = hash;
532         rv->type = type;
533         rv->value = value;
534
535         RRDVAR *ret = rrdvar_index_add(tree, rv);
536         if(unlikely(ret != rv)) {
537             debug(D_VARIABLES, "Variable '%s' in scope '%s' already exists", variable, scope);
538             rrdvar_free(NULL, NULL, rv);
539             rv = NULL;
540         }
541         else
542             debug(D_VARIABLES, "Variable '%s' created in scope '%s'", variable, scope);
543     }
544     else {
545         debug(D_VARIABLES, "Variable '%s' is already found in scope '%s'.", variable, scope);
546
547         // already exists
548         freez(variable);
549
550         // this is important
551         // it must return NULL - not the existing variable - or double-free will happen
552         rv = NULL;
553     }
554
555     return rv;
556 }
557
558 // ----------------------------------------------------------------------------
559 // CUSTOM VARIABLES
560
561 RRDVAR *rrdvar_custom_host_variable_create(RRDHOST *host, const char *name) {
562     calculated_number *v = callocz(1, sizeof(calculated_number));
563     *v = NAN;
564     RRDVAR *rv = rrdvar_create_and_index("host", &host->variables_root_index, name, RRDVAR_TYPE_CALCULATED_ALLOCATED, v);
565     if(unlikely(!rv)) {
566         free(v);
567         error("Requested variable '%s' already exists - possibly 2 plugins will be updating it at the same time", name);
568
569         char *variable = strdupz(name);
570         rrdvar_fix_name(variable);
571         uint32_t hash = simple_hash(variable);
572
573         rv = rrdvar_index_find(&host->variables_root_index, variable, hash);
574     }
575
576     return rv;
577 }
578
579 void rrdvar_custom_host_variable_destroy(RRDHOST *host, const char *name) {
580     char *variable = strdupz(name);
581     rrdvar_fix_name(variable);
582     uint32_t hash = simple_hash(variable);
583
584     RRDVAR *rv = rrdvar_index_find(&host->variables_root_index, variable, hash);
585     freez(variable);
586
587     if(!rv) {
588         error("Attempted to remove variable '%s' from host '%s', but it does not exist.", name, host->hostname);
589         return;
590     }
591
592     if(rv->type != RRDVAR_TYPE_CALCULATED_ALLOCATED) {
593         error("Attempted to remove variable '%s' from host '%s', but it does not a custom allocated variable.", name, host->hostname);
594         return;
595     }
596
597     if(!rrdvar_index_del(&host->variables_root_index, rv)) {
598         error("Attempted to remove variable '%s' from host '%s', but it cannot be found.", name, host->hostname);
599         return;
600     }
601
602     freez(rv->name);
603     freez(rv->value);
604     freez(rv);
605 }
606
607 void rrdvar_custom_host_variable_set(RRDVAR *rv, calculated_number value) {
608     if(rv->type != RRDVAR_TYPE_CALCULATED_ALLOCATED)
609         error("requested to set variable '%s' to value " CALCULATED_NUMBER_FORMAT " but the variable is not a custom one.", rv->name, value);
610     else {
611         calculated_number *v = rv->value;
612         *v = value;
613     }
614 }
615
616 // ----------------------------------------------------------------------------
617 // RRDVAR lookup
618
619 static calculated_number rrdvar2number(RRDVAR *rv) {
620     switch(rv->type) {
621         case RRDVAR_TYPE_CALCULATED_ALLOCATED:
622         case RRDVAR_TYPE_CALCULATED: {
623             calculated_number *n = (calculated_number *)rv->value;
624             return *n;
625         }
626
627         case RRDVAR_TYPE_TIME_T: {
628             time_t *n = (time_t *)rv->value;
629             return *n;
630         }
631
632         case RRDVAR_TYPE_COLLECTED: {
633             collected_number *n = (collected_number *)rv->value;
634             return *n;
635         }
636
637         case RRDVAR_TYPE_TOTAL: {
638             total_number *n = (total_number *)rv->value;
639             return *n;
640         }
641
642         case RRDVAR_TYPE_INT: {
643             int *n = (int *)rv->value;
644             return *n;
645         }
646
647         default:
648             error("I don't know how to convert RRDVAR type %d to calculated_number", rv->type);
649             return NAN;
650     }
651 }
652
653 int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, calculated_number *result) {
654     RRDSET *st = rc->rrdset;
655     RRDVAR *rv;
656
657     if(!st) return 0;
658
659     rv = rrdvar_index_find(&st->variables_root_index, variable, hash);
660     if(rv) {
661         *result = rrdvar2number(rv);
662         return 1;
663     }
664
665     rv = rrdvar_index_find(&st->rrdfamily->variables_root_index, variable, hash);
666     if(rv) {
667         *result = rrdvar2number(rv);
668         return 1;
669     }
670
671     rv = rrdvar_index_find(&st->rrdhost->variables_root_index, variable, hash);
672     if(rv) {
673         *result = rrdvar2number(rv);
674         return 1;
675     }
676
677     return 0;
678 }
679
680 // ----------------------------------------------------------------------------
681 // RRDVAR to JSON
682
683 struct variable2json_helper {
684     BUFFER *buf;
685     size_t counter;
686 };
687
688 static int single_variable2json(void *entry, void *data) {
689     struct variable2json_helper *helper = (struct variable2json_helper *)data;
690     RRDVAR *rv = (RRDVAR *)entry;
691     calculated_number value = rrdvar2number(rv);
692
693     if(unlikely(isnan(value) || isinf(value)))
694         buffer_sprintf(helper->buf, "%s\n\t\t\"%s\": null", helper->counter?",":"", rv->name);
695     else
696         buffer_sprintf(helper->buf, "%s\n\t\t\"%s\": %0.5Lf", helper->counter?",":"", rv->name, (long double)value);
697
698     helper->counter++;
699
700     return 0;
701 }
702
703 void health_api_v1_chart_variables2json(RRDSET *st, BUFFER *buf) {
704     struct variable2json_helper helper = {
705             .buf = buf,
706             .counter = 0
707     };
708
709     buffer_sprintf(buf, "{\n\t\"chart\": \"%s\",\n\t\"chart_name\": \"%s\",\n\t\"chart_context\": \"%s\",\n\t\"chart_variables\": {", st->id, st->name, st->context);
710     avl_traverse_lock(&st->variables_root_index, single_variable2json, (void *)&helper);
711     buffer_sprintf(buf, "\n\t},\n\t\"family\": \"%s\",\n\t\"family_variables\": {", st->family);
712     helper.counter = 0;
713     avl_traverse_lock(&st->rrdfamily->variables_root_index, single_variable2json, (void *)&helper);
714     buffer_sprintf(buf, "\n\t},\n\t\"host\": \"%s\",\n\t\"host_variables\": {", st->rrdhost->hostname);
715     helper.counter = 0;
716     avl_traverse_lock(&st->rrdhost->variables_root_index, single_variable2json, (void *)&helper);
717     buffer_strcat(buf, "\n\t}\n}\n");
718 }
719
720
721 // ----------------------------------------------------------------------------
722 // RRDDIMVAR management
723 // DIMENSION VARIABLES
724
725 #define RRDDIMVAR_ID_MAX 1024
726
727 static inline void rrddimvar_free_variables(RRDDIMVAR *rs) {
728     RRDDIM *rd = rs->rrddim;
729     RRDSET *st = rd->rrdset;
730
731     // CHART VARIABLES FOR THIS DIMENSION
732
733     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->var_local_id);
734     rs->var_local_id = NULL;
735
736     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->var_local_name);
737     rs->var_local_name = NULL;
738
739     // FAMILY VARIABLES FOR THIS DIMENSION
740
741     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_id);
742     rs->var_family_id = NULL;
743
744     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_name);
745     rs->var_family_name = NULL;
746
747     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_contextid);
748     rs->var_family_contextid = NULL;
749
750     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_contextname);
751     rs->var_family_contextname = NULL;
752
753     // HOST VARIABLES FOR THIS DIMENSION
754
755     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartidid);
756     rs->var_host_chartidid = NULL;
757
758     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartidname);
759     rs->var_host_chartidname = NULL;
760
761     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartnameid);
762     rs->var_host_chartnameid = NULL;
763
764     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartnamename);
765     rs->var_host_chartnamename = NULL;
766
767     // KEYS
768
769     freez(rs->key_id);
770     rs->key_id = NULL;
771
772     freez(rs->key_name);
773     rs->key_name = NULL;
774
775     freez(rs->key_fullidid);
776     rs->key_fullidid = NULL;
777
778     freez(rs->key_fullidname);
779     rs->key_fullidname = NULL;
780
781     freez(rs->key_contextid);
782     rs->key_contextid = NULL;
783
784     freez(rs->key_contextname);
785     rs->key_contextname = NULL;
786
787     freez(rs->key_fullnameid);
788     rs->key_fullnameid = NULL;
789
790     freez(rs->key_fullnamename);
791     rs->key_fullnamename = NULL;
792 }
793
794 static inline void rrddimvar_create_variables(RRDDIMVAR *rs) {
795     rrddimvar_free_variables(rs);
796
797     RRDDIM *rd = rs->rrddim;
798     RRDSET *st = rd->rrdset;
799
800     char buffer[RRDDIMVAR_ID_MAX + 1];
801
802     // KEYS
803
804     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->id, rs->suffix);
805     rs->key_id = strdupz(buffer);
806
807     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
808     rs->key_name = strdupz(buffer);
809
810     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->key_id);
811     rs->key_fullidid = strdupz(buffer);
812
813     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->key_name);
814     rs->key_fullidname = strdupz(buffer);
815
816     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->context, rs->key_id);
817     rs->key_contextid = strdupz(buffer);
818
819     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->context, rs->key_name);
820     rs->key_contextname = strdupz(buffer);
821
822     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->key_id);
823     rs->key_fullnameid = strdupz(buffer);
824
825     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->key_name);
826     rs->key_fullnamename = strdupz(buffer);
827
828     // CHART VARIABLES FOR THIS DIMENSION
829     // -----------------------------------
830     //
831     // dimensions are available as:
832     // - $id
833     // - $name
834
835     rs->var_local_id           = rrdvar_create_and_index("local", &st->variables_root_index, rs->key_id, rs->type, rs->value);
836     rs->var_local_name         = rrdvar_create_and_index("local", &st->variables_root_index, rs->key_name, rs->type, rs->value);
837
838     // FAMILY VARIABLES FOR THIS DIMENSION
839     // -----------------------------------
840     //
841     // dimensions are available as:
842     // - $id                 (only the first, when multiple overlap)
843     // - $name               (only the first, when multiple overlap)
844     // - $chart-context.id
845     // - $chart-context.name
846
847     rs->var_family_id          = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_id, rs->type, rs->value);
848     rs->var_family_name        = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_name, rs->type, rs->value);
849     rs->var_family_contextid   = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_contextid, rs->type, rs->value);
850     rs->var_family_contextname = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_contextname, rs->type, rs->value);
851
852     // HOST VARIABLES FOR THIS DIMENSION
853     // -----------------------------------
854     //
855     // dimensions are available as:
856     // - $chart-id.id
857     // - $chart-id.name
858     // - $chart-name.id
859     // - $chart-name.name
860
861     rs->var_host_chartidid      = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullidid, rs->type, rs->value);
862     rs->var_host_chartidname    = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullidname, rs->type, rs->value);
863     rs->var_host_chartnameid    = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullnameid, rs->type, rs->value);
864     rs->var_host_chartnamename  = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullnamename, rs->type, rs->value);
865 }
866
867 RRDDIMVAR *rrddimvar_create(RRDDIM *rd, int type, const char *prefix, const char *suffix, void *value, uint32_t options) {
868     RRDSET *st = rd->rrdset;
869
870     debug(D_VARIABLES, "RRDDIMSET create for chart id '%s' name '%s', dimension id '%s', name '%s%s%s'", st->id, st->name, rd->id, (prefix)?prefix:"", rd->name, (suffix)?suffix:"");
871
872     if(!prefix) prefix = "";
873     if(!suffix) suffix = "";
874
875     RRDDIMVAR *rs = (RRDDIMVAR *)callocz(1, sizeof(RRDDIMVAR));
876
877     rs->prefix = strdupz(prefix);
878     rs->suffix = strdupz(suffix);
879
880     rs->type = type;
881     rs->value = value;
882     rs->options = options;
883     rs->rrddim = rd;
884
885     rs->next = rd->variables;
886     rd->variables = rs;
887
888     rrddimvar_create_variables(rs);
889
890     return rs;
891 }
892
893 void rrddimvar_rename_all(RRDDIM *rd) {
894     RRDSET *st = rd->rrdset;
895     debug(D_VARIABLES, "RRDDIMSET rename for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
896
897     RRDDIMVAR *rs, *next = rd->variables;
898     while((rs = next)) {
899         next = rs->next;
900         rrddimvar_create_variables(rs);
901     }
902 }
903
904 void rrddimvar_free(RRDDIMVAR *rs) {
905     RRDDIM *rd = rs->rrddim;
906     RRDSET *st = rd->rrdset;
907     debug(D_VARIABLES, "RRDDIMSET free for chart id '%s' name '%s', dimension id '%s', name '%s', prefix='%s', suffix='%s'", st->id, st->name, rd->id, rd->name, rs->prefix, rs->suffix);
908
909     rrddimvar_free_variables(rs);
910
911     if(rd->variables == rs) {
912         debug(D_VARIABLES, "RRDDIMSET removing first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
913         rd->variables = rs->next;
914     }
915     else {
916         debug(D_VARIABLES, "RRDDIMSET removing non-first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
917         RRDDIMVAR *t;
918         for (t = rd->variables; t && t->next != rs; t = t->next) ;
919         if(!t) error("RRDDIMVAR '%s' not found in dimension '%s/%s' variables linked list", rs->key_name, st->id, rd->id);
920         else t->next = rs->next;
921     }
922
923     freez(rs->prefix);
924     freez(rs->suffix);
925     freez(rs);
926 }
927
928 // ----------------------------------------------------------------------------
929 // RRDSETVAR management
930 // CHART VARIABLES
931
932 static inline void rrdsetvar_free_variables(RRDSETVAR *rs) {
933     RRDSET *st = rs->rrdset;
934
935     // CHART
936
937     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->var_local);
938     rs->var_local = NULL;
939
940     // FAMILY
941
942     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family);
943     rs->var_family = NULL;
944
945     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host);
946     rs->var_host = NULL;
947
948     // HOST
949
950     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_name);
951     rs->var_family_name = NULL;
952
953     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_name);
954     rs->var_host_name = NULL;
955
956     // KEYS
957
958     freez(rs->key_fullid);
959     rs->key_fullid = NULL;
960
961     freez(rs->key_fullname);
962     rs->key_fullname = NULL;
963 }
964
965 static inline void rrdsetvar_create_variables(RRDSETVAR *rs) {
966     rrdsetvar_free_variables(rs);
967
968     RRDSET *st = rs->rrdset;
969
970     // KEYS
971
972     char buffer[RRDVAR_MAX_LENGTH + 1];
973     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->id, rs->variable);
974     rs->key_fullid = strdupz(buffer);
975
976     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rs->variable);
977     rs->key_fullname = strdupz(buffer);
978
979     // CHART
980
981     rs->var_local       = rrdvar_create_and_index("local",  &st->variables_root_index,               rs->variable, rs->type, rs->value);
982
983     // FAMILY
984
985     rs->var_family      = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index,    rs->key_fullid,   rs->type, rs->value);
986     rs->var_family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index,    rs->key_fullname, rs->type, rs->value);
987
988     // HOST
989
990     rs->var_host        = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index,      rs->key_fullid,   rs->type, rs->value);
991     rs->var_host_name   = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index,      rs->key_fullname, rs->type, rs->value);
992
993 }
994
995 RRDSETVAR *rrdsetvar_create(RRDSET *st, const char *variable, int type, void *value, uint32_t options) {
996     debug(D_VARIABLES, "RRDVARSET create for chart id '%s' name '%s' with variable name '%s'", st->id, st->name, variable);
997     RRDSETVAR *rs = (RRDSETVAR *)callocz(1, sizeof(RRDSETVAR));
998
999     rs->variable = strdupz(variable);
1000     rs->type = type;
1001     rs->value = value;
1002     rs->options = options;
1003     rs->rrdset = st;
1004
1005     rs->next = st->variables;
1006     st->variables = rs;
1007
1008     rrdsetvar_create_variables(rs);
1009
1010     return rs;
1011 }
1012
1013 void rrdsetvar_rename_all(RRDSET *st) {
1014     debug(D_VARIABLES, "RRDSETVAR rename for chart id '%s' name '%s'", st->id, st->name);
1015
1016     RRDSETVAR *rs, *next = st->variables;
1017     while((rs = next)) {
1018         next = rs->next;
1019         rrdsetvar_create_variables(rs);
1020     }
1021
1022     rrdsetcalc_link_matching(st);
1023 }
1024
1025 void rrdsetvar_free(RRDSETVAR *rs) {
1026     RRDSET *st = rs->rrdset;
1027     debug(D_VARIABLES, "RRDSETVAR free for chart id '%s' name '%s', variable '%s'", st->id, st->name, rs->variable);
1028
1029     if(st->variables == rs) {
1030         st->variables = rs->next;
1031     }
1032     else {
1033         RRDSETVAR *t;
1034         for (t = st->variables; t && t->next != rs; t = t->next);
1035         if(!t) error("RRDSETVAR '%s' not found in chart '%s' variables linked list", rs->key_fullname, st->id);
1036         else t->next = rs->next;
1037     }
1038
1039     rrdsetvar_free_variables(rs);
1040
1041     freez(rs->variable);
1042     freez(rs);
1043 }
1044
1045 // ----------------------------------------------------------------------------
1046 // RRDCALC management
1047
1048 inline const char *rrdcalc_status2string(int status) {
1049     switch(status) {
1050         case RRDCALC_STATUS_REMOVED:
1051             return "REMOVED";
1052
1053         case RRDCALC_STATUS_UNDEFINED:
1054             return "UNDEFINED";
1055
1056         case RRDCALC_STATUS_UNINITIALIZED:
1057             return "UNINITIALIZED";
1058
1059         case RRDCALC_STATUS_CLEAR:
1060             return "CLEAR";
1061
1062         case RRDCALC_STATUS_RAISED:
1063             return "RAISED";
1064
1065         case RRDCALC_STATUS_WARNING:
1066             return "WARNING";
1067
1068         case RRDCALC_STATUS_CRITICAL:
1069             return "CRITICAL";
1070
1071         default:
1072             error("Unknown alarm status %d", status);
1073             return "UNKNOWN";
1074     }
1075 }
1076
1077 static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
1078     debug(D_HEALTH, "Health linking alarm '%s.%s' to chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, st->rrdhost->hostname);
1079
1080     rc->last_status_change = now_realtime_sec();
1081     rc->rrdset = st;
1082
1083     rc->rrdset_next = st->alarms;
1084     rc->rrdset_prev = NULL;
1085     
1086     if(rc->rrdset_next)
1087         rc->rrdset_next->rrdset_prev = rc;
1088
1089     st->alarms = rc;
1090
1091     if(rc->update_every < rc->rrdset->update_every) {
1092         error("Health alarm '%s.%s' has update every %d, less than chart update every %d. Setting alarm update frequency to %d.", rc->rrdset->id, rc->name, rc->update_every, rc->rrdset->update_every, rc->rrdset->update_every);
1093         rc->update_every = rc->rrdset->update_every;
1094     }
1095
1096     if(!isnan(rc->green) && isnan(st->green)) {
1097         debug(D_HEALTH, "Health alarm '%s.%s' green threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->green, rc->green);
1098         st->green = rc->green;
1099     }
1100
1101     if(!isnan(rc->red) && isnan(st->red)) {
1102         debug(D_HEALTH, "Health alarm '%s.%s' red threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->red, rc->red);
1103         st->red = rc->red;
1104     }
1105
1106     rc->local  = rrdvar_create_and_index("local",  &st->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
1107     rc->family = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
1108
1109     char fullname[RRDVAR_MAX_LENGTH + 1];
1110     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->id, rc->name);
1111     rc->hostid   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
1112
1113     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rc->name);
1114     rc->hostname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
1115
1116         if(!rc->units) rc->units = strdupz(st->units);
1117
1118     {
1119         time_t now = now_realtime_sec();
1120         health_alarm_log(
1121                 st->rrdhost,
1122                 rc->id,
1123                 rc->next_event_id++,
1124                 now,
1125                 rc->name,
1126                 rc->rrdset->id,
1127                 rc->rrdset->family,
1128                 rc->exec,
1129                 rc->recipient,
1130                 now - rc->last_status_change,
1131                 rc->old_value,
1132                 rc->value,
1133                 rc->status,
1134                 RRDCALC_STATUS_UNINITIALIZED,
1135                 rc->source,
1136                 rc->units,
1137                 rc->info,
1138                 0,
1139                 0
1140         );
1141     }
1142 }
1143
1144 static inline int rrdcalc_is_matching_this_rrdset(RRDCALC *rc, RRDSET *st) {
1145     if(     (rc->hash_chart == st->hash      && !strcmp(rc->chart, st->id)) ||
1146             (rc->hash_chart == st->hash_name && !strcmp(rc->chart, st->name)))
1147         return 1;
1148
1149     return 0;
1150 }
1151
1152 // this has to be called while the RRDHOST is locked
1153 inline void rrdsetcalc_link_matching(RRDSET *st) {
1154     // debug(D_HEALTH, "find matching alarms for chart '%s'", st->id);
1155
1156     RRDCALC *rc;
1157     for(rc = st->rrdhost->alarms; rc ; rc = rc->next) {
1158         if(unlikely(rc->rrdset))
1159             continue;
1160
1161         if(unlikely(rrdcalc_is_matching_this_rrdset(rc, st)))
1162             rrdsetcalc_link(st, rc);
1163     }
1164 }
1165
1166 // this has to be called while the RRDHOST is locked
1167 inline void rrdsetcalc_unlink(RRDCALC *rc) {
1168     RRDSET *st = rc->rrdset;
1169
1170     if(!st) {
1171         debug(D_HEALTH, "Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
1172         error("Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
1173         return;
1174     }
1175
1176     {
1177         time_t now = now_realtime_sec();
1178         health_alarm_log(
1179                 st->rrdhost,
1180                 rc->id,
1181                 rc->next_event_id++,
1182                 now,
1183                 rc->name,
1184                 rc->rrdset->id,
1185                 rc->rrdset->family,
1186                 rc->exec,
1187                 rc->recipient,
1188                 now - rc->last_status_change,
1189                 rc->old_value,
1190                 rc->value,
1191                 rc->status,
1192                 RRDCALC_STATUS_REMOVED,
1193                 rc->source,
1194                 rc->units,
1195                 rc->info,
1196                 0,
1197                 0
1198         );
1199     }
1200
1201     RRDHOST *host = st->rrdhost;
1202
1203     debug(D_HEALTH, "Health unlinking alarm '%s.%s' from chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, host->hostname);
1204
1205     // unlink it
1206     if(rc->rrdset_prev)
1207         rc->rrdset_prev->rrdset_next = rc->rrdset_next;
1208
1209     if(rc->rrdset_next)
1210         rc->rrdset_next->rrdset_prev = rc->rrdset_prev;
1211
1212     if(st->alarms == rc)
1213         st->alarms = rc->rrdset_next;
1214
1215     rc->rrdset_prev = rc->rrdset_next = NULL;
1216
1217     rrdvar_free(st->rrdhost, &st->variables_root_index, rc->local);
1218     rc->local = NULL;
1219
1220     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rc->family);
1221     rc->family = NULL;
1222
1223     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostid);
1224     rc->hostid = NULL;
1225
1226     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostname);
1227     rc->hostname = NULL;
1228
1229     rc->rrdset = NULL;
1230
1231     // RRDCALC will remain in RRDHOST
1232     // so that if the matching chart is found in the future
1233     // it will be applied automatically
1234 }
1235
1236 RRDCALC *rrdcalc_find(RRDSET *st, const char *name) {
1237     RRDCALC *rc;
1238     uint32_t hash = simple_hash(name);
1239
1240     for( rc = st->alarms; rc ; rc = rc->rrdset_next ) {
1241         if(unlikely(rc->hash == hash && !strcmp(rc->name, name)))
1242             return rc;
1243     }
1244
1245     return NULL;
1246 }
1247
1248 static inline int rrdcalc_exists(RRDHOST *host, const char *chart, const char *name, uint32_t hash_chart, uint32_t hash_name) {
1249     RRDCALC *rc;
1250
1251     if(unlikely(!chart)) {
1252         error("attempt to find RRDCALC '%s' without giving a chart name", name);
1253         return 1;
1254     }
1255
1256     if(unlikely(!hash_chart)) hash_chart = simple_hash(chart);
1257     if(unlikely(!hash_name))  hash_name  = simple_hash(name);
1258
1259     // make sure it does not already exist
1260     for(rc = host->alarms; rc ; rc = rc->next) {
1261         if (unlikely(rc->chart && rc->hash == hash_name && rc->hash_chart == hash_chart && !strcmp(name, rc->name) && !strcmp(chart, rc->chart))) {
1262             debug(D_HEALTH, "Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
1263             error("Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
1264             return 1;
1265         }
1266     }
1267
1268     return 0;
1269 }
1270
1271 static inline uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const char *name, uint32_t *next_event_id) {
1272     if(chart && name) {
1273         uint32_t hash_chart = simple_hash(chart);
1274         uint32_t hash_name = simple_hash(name);
1275
1276         // re-use old IDs, by looking them up in the alarm log
1277         ALARM_ENTRY *ae;
1278         for(ae = host->health_log.alarms; ae ;ae = ae->next) {
1279             if(unlikely(ae->hash_name == hash_name && ae->hash_chart == hash_chart && !strcmp(name, ae->name) && !strcmp(chart, ae->chart))) {
1280                 if(next_event_id) *next_event_id = ae->alarm_event_id + 1;
1281                 return ae->alarm_id;
1282             }
1283         }
1284     }
1285
1286     return host->health_log.next_alarm_id++;
1287 }
1288
1289 static inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) {
1290     rrdhost_check_rdlock(host);
1291
1292     if(rc->calculation) {
1293         rc->calculation->status = &rc->status;
1294         rc->calculation->this = &rc->value;
1295         rc->calculation->after = &rc->db_after;
1296         rc->calculation->before = &rc->db_before;
1297         rc->calculation->rrdcalc = rc;
1298     }
1299
1300     if(rc->warning) {
1301         rc->warning->status = &rc->status;
1302         rc->warning->this = &rc->value;
1303         rc->warning->after = &rc->db_after;
1304         rc->warning->before = &rc->db_before;
1305         rc->warning->rrdcalc = rc;
1306     }
1307
1308     if(rc->critical) {
1309         rc->critical->status = &rc->status;
1310         rc->critical->this = &rc->value;
1311         rc->critical->after = &rc->db_after;
1312         rc->critical->before = &rc->db_before;
1313         rc->critical->rrdcalc = rc;
1314     }
1315
1316     // link it to the host
1317     if(likely(host->alarms)) {
1318         // append it
1319         RRDCALC *t;
1320         for(t = host->alarms; t && t->next ; t = t->next) ;
1321         t->next = rc;
1322     }
1323     else {
1324         host->alarms = rc;
1325     }
1326
1327     // link it to its chart
1328     RRDSET *st;
1329     for(st = host->rrdset_root; st ; st = st->next) {
1330         if(rrdcalc_is_matching_this_rrdset(rc, st)) {
1331             rrdsetcalc_link(st, rc);
1332             break;
1333         }
1334     }
1335 }
1336
1337 static inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart) {
1338
1339     debug(D_HEALTH, "Health creating dynamic alarm (from template) '%s.%s'", chart, rt->name);
1340
1341     if(rrdcalc_exists(host, chart, rt->name, 0, 0))
1342         return NULL;
1343
1344     RRDCALC *rc = callocz(1, sizeof(RRDCALC));
1345     rc->next_event_id = 1;
1346     rc->id = rrdcalc_get_unique_id(host, chart, rt->name, &rc->next_event_id);
1347     rc->name = strdupz(rt->name);
1348     rc->hash = simple_hash(rc->name);
1349     rc->chart = strdupz(chart);
1350     rc->hash_chart = simple_hash(rc->chart);
1351
1352     if(rt->dimensions) rc->dimensions = strdupz(rt->dimensions);
1353
1354     rc->green = rt->green;
1355     rc->red = rt->red;
1356     rc->value = NAN;
1357     rc->old_value = NAN;
1358
1359     rc->delay_up_duration = rt->delay_up_duration;
1360     rc->delay_down_duration = rt->delay_down_duration;
1361     rc->delay_max_duration = rt->delay_max_duration;
1362     rc->delay_multiplier = rt->delay_multiplier;
1363
1364     rc->group = rt->group;
1365     rc->after = rt->after;
1366     rc->before = rt->before;
1367     rc->update_every = rt->update_every;
1368     rc->options = rt->options;
1369
1370     if(rt->exec) rc->exec = strdupz(rt->exec);
1371     if(rt->recipient) rc->recipient = strdupz(rt->recipient);
1372     if(rt->source) rc->source = strdupz(rt->source);
1373     if(rt->units) rc->units = strdupz(rt->units);
1374     if(rt->info) rc->info = strdupz(rt->info);
1375
1376     if(rt->calculation) {
1377         rc->calculation = expression_parse(rt->calculation->source, NULL, NULL);
1378         if(!rc->calculation)
1379             error("Health alarm '%s.%s': failed to parse calculation expression '%s'", chart, rt->name, rt->calculation->source);
1380     }
1381     if(rt->warning) {
1382         rc->warning = expression_parse(rt->warning->source, NULL, NULL);
1383         if(!rc->warning)
1384             error("Health alarm '%s.%s': failed to re-parse warning expression '%s'", chart, rt->name, rt->warning->source);
1385     }
1386     if(rt->critical) {
1387         rc->critical = expression_parse(rt->critical->source, NULL, NULL);
1388         if(!rc->critical)
1389             error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", chart, rt->name, rt->critical->source);
1390     }
1391
1392     debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1393           (rc->chart)?rc->chart:"NOCHART",
1394           rc->name,
1395           (rc->exec)?rc->exec:"DEFAULT",
1396           (rc->recipient)?rc->recipient:"DEFAULT",
1397           rc->green,
1398           rc->red,
1399           rc->group,
1400           rc->after,
1401           rc->before,
1402           rc->options,
1403           (rc->dimensions)?rc->dimensions:"NONE",
1404           rc->update_every,
1405           (rc->calculation)?rc->calculation->parsed_as:"NONE",
1406           (rc->warning)?rc->warning->parsed_as:"NONE",
1407           (rc->critical)?rc->critical->parsed_as:"NONE",
1408           rc->source,
1409           rc->delay_up_duration,
1410           rc->delay_down_duration,
1411           rc->delay_max_duration,
1412           rc->delay_multiplier
1413     );
1414
1415     rrdcalc_create_part2(host, rc);
1416     return rc;
1417 }
1418
1419 void rrdcalc_free(RRDHOST *host, RRDCALC *rc) {
1420     if(!rc) return;
1421
1422     debug(D_HEALTH, "Health removing alarm '%s.%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1423
1424     // unlink it from RRDSET
1425     if(rc->rrdset) rrdsetcalc_unlink(rc);
1426
1427     // unlink it from RRDHOST
1428     if(unlikely(rc == host->alarms))
1429         host->alarms = rc->next;
1430
1431     else if(likely(host->alarms)) {
1432         RRDCALC *t, *last = host->alarms;
1433         for(t = last->next; t && t != rc; last = t, t = t->next) ;
1434         if(last->next == rc)
1435             last->next = rc->next;
1436         else
1437             error("Cannot unlink alarm '%s.%s' from host '%s': not found", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1438     }
1439     else
1440         error("Cannot unlink unlink '%s.%s' from host '%s': This host does not have any calculations", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1441
1442     expression_free(rc->calculation);
1443     expression_free(rc->warning);
1444     expression_free(rc->critical);
1445
1446     freez(rc->name);
1447     freez(rc->chart);
1448     freez(rc->family);
1449     freez(rc->dimensions);
1450     freez(rc->exec);
1451     freez(rc->recipient);
1452     freez(rc->source);
1453     freez(rc->units);
1454     freez(rc->info);
1455     freez(rc);
1456 }
1457
1458 // ----------------------------------------------------------------------------
1459 // RRDCALCTEMPLATE management
1460
1461 void rrdcalctemplate_link_matching(RRDSET *st) {
1462     RRDCALCTEMPLATE *rt;
1463
1464     for(rt = st->rrdhost->templates; rt ; rt = rt->next) {
1465         if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context)
1466                 && (!rt->family_pattern || simple_pattern_matches(rt->family_pattern, st->family))) {
1467             RRDCALC *rc = rrdcalc_create(st->rrdhost, rt, st->id);
1468             if(unlikely(!rc))
1469                 error("Health tried to create alarm from template '%s', but it failed", rt->name);
1470
1471 #ifdef NETDATA_INTERNAL_CHECKS
1472             else if(rc->rrdset != st)
1473                 error("Health alarm '%s.%s' should be linked to chart '%s', but it is not", rc->chart?rc->chart:"NOCHART", rc->name, st->id);
1474 #endif
1475         }
1476     }
1477 }
1478
1479 static inline void rrdcalctemplate_free(RRDHOST *host, RRDCALCTEMPLATE *rt) {
1480     debug(D_HEALTH, "Health removing template '%s' of host '%s'", rt->name, host->hostname);
1481
1482     if(host->templates) {
1483         if(host->templates == rt) {
1484             host->templates = rt->next;
1485         }
1486         else {
1487             RRDCALCTEMPLATE *t, *last = host->templates;
1488             for (t = last->next; t && t != rt; last = t, t = t->next ) ;
1489             if(last && last->next == rt) {
1490                 last->next = rt->next;
1491                 rt->next = NULL;
1492             }
1493             else
1494                 error("Cannot find RRDCALCTEMPLATE '%s' linked in host '%s'", rt->name, host->hostname);
1495         }
1496     }
1497
1498     expression_free(rt->calculation);
1499     expression_free(rt->warning);
1500     expression_free(rt->critical);
1501
1502     freez(rt->family_match);
1503     simple_pattern_free(rt->family_pattern);
1504
1505     freez(rt->name);
1506     freez(rt->exec);
1507     freez(rt->recipient);
1508     freez(rt->context);
1509     freez(rt->source);
1510     freez(rt->units);
1511     freez(rt->info);
1512     freez(rt->dimensions);
1513     freez(rt);
1514 }
1515
1516 // ----------------------------------------------------------------------------
1517 // load health configuration
1518
// NOTE(review): presumably the maximum length of a single line of a health
// configuration file - the code using it is outside this view, confirm.
#define HEALTH_CONF_MAX_LINE 4096

// Keyword names of the health configuration.
// NOTE(review): the parser consuming them is outside this view; the notes
// next to some of the keys are inferred from the macro names and from the
// messages of the functions that follow - confirm against the parser.
#define HEALTH_ALARM_KEY "alarm"
#define HEALTH_TEMPLATE_KEY "template"
#define HEALTH_ON_KEY "on"
#define HEALTH_FAMILIES_KEY "families"
#define HEALTH_LOOKUP_KEY "lookup"
#define HEALTH_CALC_KEY "calc"
#define HEALTH_EVERY_KEY "every"            // the "frequency" the add-from-config checks require
#define HEALTH_GREEN_KEY "green"
#define HEALTH_RED_KEY "red"
#define HEALTH_WARN_KEY "warn"
#define HEALTH_CRIT_KEY "crit"
#define HEALTH_EXEC_KEY "exec"
#define HEALTH_RECIPIENT_KEY "to"           // presumably the notification recipient
#define HEALTH_UNITS_KEY "units"
#define HEALTH_INFO_KEY "info"
#define HEALTH_DELAY_KEY "delay"
#define HEALTH_OPTIONS_KEY "options"
1538
1539 static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
1540     if(!rc->chart) {
1541         error("Health configuration for alarm '%s' does not have a chart", rc->name);
1542         return 0;
1543     }
1544
1545     if(!rc->update_every) {
1546         error("Health configuration for alarm '%s.%s' has no frequency (parameter 'every'). Ignoring it.", rc->chart?rc->chart:"NOCHART", rc->name);
1547         return 0;
1548     }
1549
1550     if(!RRDCALC_HAS_DB_LOOKUP(rc) && !rc->warning && !rc->critical) {
1551         error("Health configuration for alarm '%s.%s' is useless (no calculation, no warning and no critical evaluation)", rc->chart?rc->chart:"NOCHART", rc->name);
1552         return 0;
1553     }
1554
1555     if (rrdcalc_exists(host, rc->chart, rc->name, rc->hash_chart, rc->hash))
1556         return 0;
1557
1558     rc->id = rrdcalc_get_unique_id(&localhost, rc->chart, rc->name, &rc->next_event_id);
1559
1560     debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1561           rc->chart?rc->chart:"NOCHART",
1562           rc->name,
1563           rc->id,
1564           (rc->exec)?rc->exec:"DEFAULT",
1565           (rc->recipient)?rc->recipient:"DEFAULT",
1566           rc->green,
1567           rc->red,
1568           rc->group,
1569           rc->after,
1570           rc->before,
1571           rc->options,
1572           (rc->dimensions)?rc->dimensions:"NONE",
1573           rc->update_every,
1574           (rc->calculation)?rc->calculation->parsed_as:"NONE",
1575           (rc->warning)?rc->warning->parsed_as:"NONE",
1576           (rc->critical)?rc->critical->parsed_as:"NONE",
1577           rc->source,
1578           rc->delay_up_duration,
1579           rc->delay_down_duration,
1580           rc->delay_max_duration,
1581           rc->delay_multiplier
1582     );
1583
1584     rrdcalc_create_part2(host, rc);
1585     return 1;
1586 }
1587
1588 static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCALCTEMPLATE *rt) {
1589     if(unlikely(!rt->context)) {
1590         error("Health configuration for template '%s' does not have a context", rt->name);
1591         return 0;
1592     }
1593
1594     if(unlikely(!rt->update_every)) {
1595         error("Health configuration for template '%s' has no frequency (parameter 'every'). Ignoring it.", rt->name);
1596         return 0;
1597     }
1598
1599     if(unlikely(!RRDCALCTEMPLATE_HAS_CALCULATION(rt) && !rt->warning && !rt->critical)) {
1600         error("Health configuration for template '%s' is useless (no calculation, no warning and no critical evaluation)", rt->name);
1601         return 0;
1602     }
1603
1604     RRDCALCTEMPLATE *t, *last = NULL;
1605     for (t = host->templates; t ; last = t, t = t->next) {
1606         if(unlikely(t->hash_name == rt->hash_name && !strcmp(t->name, rt->name))) {
1607             error("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname);
1608             return 0;
1609         }
1610     }
1611
1612     debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1613           rt->name,
1614           (rt->context)?rt->context:"NONE",
1615           (rt->exec)?rt->exec:"DEFAULT",
1616           (rt->recipient)?rt->recipient:"DEFAULT",
1617           rt->green,
1618           rt->red,
1619           rt->group,
1620           rt->after,
1621           rt->before,
1622           rt->options,
1623           (rt->dimensions)?rt->dimensions:"NONE",
1624           rt->update_every,
1625           (rt->calculation)?rt->calculation->parsed_as:"NONE",
1626           (rt->warning)?rt->warning->parsed_as:"NONE",
1627           (rt->critical)?rt->critical->parsed_as:"NONE",
1628           rt->source,
1629           rt->delay_up_duration,
1630           rt->delay_down_duration,
1631           rt->delay_max_duration,
1632           rt->delay_multiplier
1633     );
1634
1635     if(likely(last)) {
1636         last->next = rt;
1637     }
1638     else {
1639         rt->next = host->templates;
1640         host->templates = rt;
1641     }
1642
1643     return 1;
1644 }
1645
/*
 * Parse a duration such as "10", "30s", "5m", "1.5h", "-2d" into seconds.
 *
 * The text must start with a digit or a sign. The number may be followed
 * by a unit: Y (365 days), M (30 days), w, d, h, m or s. No unit, or any
 * other character after the number, means seconds.
 *
 * string - the text to parse (it is not modified)
 * result - receives the duration in seconds, truncated towards zero;
 *          it is set to 0 when the text is rejected
 *
 * Returns 1 on success, 0 when the text does not hold a number.
 */
static inline int health_parse_duration(char *string, int *result) {
    // <ctype.h> functions are only defined for values representable as
    // unsigned char (and EOF), so a plain (possibly negative) char must
    // not be passed to isdigit() as is
    unsigned char first = (unsigned char)*string;

    // make sure it is a number
    if(!first || !(isdigit(first) || first == '+' || first == '-')) {
        *result = 0;
        return 0;
    }

    // strtold() returns long double, so the value is kept in that type
    char *e = NULL;
    long double n = strtold(string, &e);

    // a lone sign passes the check above, but holds no number:
    // strtold() then converts nothing and leaves e at the start
    if(e == string) {
        *result = 0;
        return 0;
    }

    if(e && *e) {
        switch (*e) {
            case 'Y':
                *result = (int) (n * 86400 * 365);
                break;
            case 'M':
                *result = (int) (n * 86400 * 30);
                break;
            case 'w':
                *result = (int) (n * 86400 * 7);
                break;
            case 'd':
                *result = (int) (n * 86400);
                break;
            case 'h':
                *result = (int) (n * 3600);
                break;
            case 'm':
                *result = (int) (n * 60);
                break;

            // anything else is taken as seconds
            default:
            case 's':
                *result = (int) (n);
                break;
        }
    }
    else
       *result = (int)(n);

    return 1;
}
1687
/*
 * Parse the value of a 'delay' line: a list of "keyword value" pairs
 * separated by whitespace. The keywords are matched case-insensitively:
 *
 *   up D          - duration, parsed with health_parse_duration()
 *   down D        - duration, parsed with health_parse_duration()
 *   max D         - duration, parsed with health_parse_duration()
 *   multiplier F  - float, it has to be finite and above zero
 *
 * Invalid values and unknown keywords are logged and skipped.
 * Keywords that were not given (or were invalid) get these defaults:
 * up = 0, down = 0, multiplier = 1.0. When max is not given, it is raised
 * (never lowered) to cover up * multiplier and down * multiplier.
 *
 * string is modified in place: the whitespace after each word is
 * overwritten with NUL bytes.
 *
 * line, path, file are only used in the log messages.
 *
 * Always returns 1.
 */
static inline int health_parse_delay(
        size_t line, const char *path, const char *file, char *string,
        int *delay_up_duration,
        int *delay_down_duration,
        int *delay_max_duration,
        float *delay_multiplier) {

    char given_up = 0;
    char given_down = 0;
    char given_max = 0;
    char given_multiplier = 0;

    char *s = string;
    while(*s) {
        // the keyword
        // (the casts keep isspace() well defined for bytes above 0x7f)
        char *key = s;
        while(*s && !isspace((unsigned char)*s)) s++;
        while(*s && isspace((unsigned char)*s)) *s++ = '\0';

        if(!*key) break;

        // the value of the keyword
        char *value = s;
        while(*s && !isspace((unsigned char)*s)) s++;
        while(*s && isspace((unsigned char)*s)) *s++ = '\0';

        // the three duration keywords differ only in where the result goes
        int *duration = NULL;
        char *given = NULL;

        if(!strcasecmp(key, "up")) {
            duration = delay_up_duration;
            given = &given_up;
        }
        else if(!strcasecmp(key, "down")) {
            duration = delay_down_duration;
            given = &given_down;
        }
        else if(!strcasecmp(key, "max")) {
            duration = delay_max_duration;
            given = &given_max;
        }

        if(duration) {
            if(!health_parse_duration(value, duration)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else *given = 1;
        }
        else if(!strcasecmp(key, "multiplier")) {
            *delay_multiplier = strtof(value, NULL);
            if(isnan(*delay_multiplier) || isinf(*delay_multiplier) || islessequal(*delay_multiplier, 0)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_multiplier = 1;
        }
        else {
            error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
                  line, path, file, key);
        }
    }

    // defaults for everything that was not (successfully) given
    if(!given_up)
        *delay_up_duration = 0;

    if(!given_down)
        *delay_down_duration = 0;

    if(!given_multiplier)
        *delay_multiplier = 1.0;

    // without an explicit max, make sure it covers at least one multiplied delay
    if(!given_max) {
        if((*delay_max_duration) < (*delay_up_duration) * (*delay_multiplier))
            *delay_max_duration = (*delay_up_duration) * (*delay_multiplier);

        if((*delay_max_duration) < (*delay_down_duration) * (*delay_multiplier))
            *delay_max_duration = (*delay_down_duration) * (*delay_multiplier);
    }

    return 1;
}
1767
1768 static inline uint32_t health_parse_options(const char *s) {
1769     uint32_t options = 0;
1770     char buf[100+1] = "";
1771
1772     while(*s) {
1773         buf[0] = '\0';
1774
1775         // skip spaces
1776         while(*s && isspace(*s))
1777             s++;
1778
1779         // find the next space
1780         size_t count = 0;
1781         while(*s && count < 100 && !isspace(*s))
1782             buf[count++] = *s++;
1783
1784         if(buf[0]) {
1785             buf[count] = '\0';
1786
1787             if(!strcasecmp(buf, "no-clear-notification") || !strcasecmp(buf, "no-clear"))
1788                 options |= RRDCALC_FLAG_NO_CLEAR_NOTIFICATION;
1789             else
1790                 error("Ignoring unknown alarm option '%s'", buf);
1791         }
1792     }
1793
1794     return options;
1795 }
1796
1797 static inline int health_parse_db_lookup(
1798         size_t line, const char *path, const char *file, char *string,
1799         int *group_method, int *after, int *before, int *every,
1800         uint32_t *options, char **dimensions
1801 ) {
1802     debug(D_HEALTH, "Health configuration parsing database lookup %zu@%s/%s: %s", line, path, file, string);
1803
1804     if(*dimensions) freez(*dimensions);
1805     *dimensions = NULL;
1806     *after = 0;
1807     *before = 0;
1808     *every = 0;
1809     *options = 0;
1810
1811     char *s = string, *key;
1812
1813     // first is the group method
1814     key = s;
1815     while(*s && !isspace(*s)) s++;
1816     while(*s && isspace(*s)) *s++ = '\0';
1817     if(!*s) {
1818         error("Health configuration invalid chart calculation at line %zu of file '%s/%s': expected group method followed by the 'after' time, but got '%s'",
1819               line, path, file, key);
1820         return 0;
1821     }
1822
1823     if((*group_method = web_client_api_request_v1_data_group(key, -1)) == -1) {
1824         error("Health configuration at line %zu of file '%s/%s': invalid group method '%s'",
1825               line, path, file, key);
1826         return 0;
1827     }
1828
1829     // then is the 'after' time
1830     key = s;
1831     while(*s && !isspace(*s)) s++;
1832     while(*s && isspace(*s)) *s++ = '\0';
1833
1834     if(!health_parse_duration(key, after)) {
1835         error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' after group method",
1836               line, path, file, key);
1837         return 0;
1838     }
1839
1840     // sane defaults
1841     *every = abs(*after);
1842
1843     // now we may have optional parameters
1844     while(*s) {
1845         key = s;
1846         while(*s && !isspace(*s)) s++;
1847         while(*s && isspace(*s)) *s++ = '\0';
1848         if(!*key) break;
1849
1850         if(!strcasecmp(key, "at")) {
1851             char *value = s;
1852             while(*s && !isspace(*s)) s++;
1853             while(*s && isspace(*s)) *s++ = '\0';
1854
1855             if (!health_parse_duration(value, before)) {
1856                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1857                       line, path, file, value, key);
1858             }
1859         }
1860         else if(!strcasecmp(key, HEALTH_EVERY_KEY)) {
1861             char *value = s;
1862             while(*s && !isspace(*s)) s++;
1863             while(*s && isspace(*s)) *s++ = '\0';
1864
1865             if (!health_parse_duration(value, every)) {
1866                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1867                       line, path, file, value, key);
1868             }
1869         }
1870         else if(!strcasecmp(key, "absolute") || !strcasecmp(key, "abs") || !strcasecmp(key, "absolute_sum")) {
1871             *options |= RRDR_OPTION_ABSOLUTE;
1872         }
1873         else if(!strcasecmp(key, "min2max")) {
1874             *options |= RRDR_OPTION_MIN2MAX;
1875         }
1876         else if(!strcasecmp(key, "null2zero")) {
1877             *options |= RRDR_OPTION_NULL2ZERO;
1878         }
1879         else if(!strcasecmp(key, "percentage")) {
1880             *options |= RRDR_OPTION_PERCENTAGE;
1881         }
1882         else if(!strcasecmp(key, "unaligned")) {
1883             *options |= RRDR_OPTION_NOT_ALIGNED;
1884         }
1885         else if(!strcasecmp(key, "of")) {
1886             if(*s && strcasecmp(s, "all"))
1887                *dimensions = strdupz(s);
1888             break;
1889         }
1890         else {
1891             error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
1892                   line, path, file, key);
1893         }
1894     }
1895
1896     return 1;
1897 }
1898
/*
 * Normalise the whitespace of a string, in place:
 *  - leading and trailing whitespace is removed
 *  - every run of whitespace between words becomes a single ' '
 *
 * buffer - the NUL terminated string to rewrite
 *
 * Returns buffer, or NULL when nothing is left after trimming.
 */
static inline char *trim_all_spaces(char *buffer) {
    char *d = buffer, *s = buffer;

    // skip leading spaces
    // (the casts keep isspace() well defined for bytes above 0x7f)
    while(isspace((unsigned char)*s)) s++;

    while(*s) {
        // copy the non-space part
        while(*s && !isspace((unsigned char)*s)) *d++ = *s++;

        // a single space stands for the whole run of spaces that follows
        if(*s && isspace((unsigned char)*s)) {
            *d++ = ' ';
            s++;
        }

        // skip the rest of the run
        while(isspace((unsigned char)*s)) s++;
    }

    *d = '\0';

    // trailing whitespace in the input left one space at the end
    if(d > buffer) {
        d--;
        if(isspace((unsigned char)*d)) *d = '\0';
    }

    if(!buffer[0]) return NULL;
    return buffer;
}
1929
/*
 * Build the "LINE@PATH/FILENAME" string that identifies where an alarm or
 * template was defined.
 *
 * The text is formatted into a FILENAME_MAX sized stack buffer and the
 * result of strdupz() on it is returned.
 */
static inline char *health_source_file(size_t line, const char *path, const char *filename) {
    char source[FILENAME_MAX + 1];

    snprintfz(source, FILENAME_MAX, "%zu@%s/%s", line, path, filename);

    char *copy = strdupz(source);
    return copy;
}
1935
/*
 * Replace every single and double quote of the string with a space,
 * in place. The length of the string does not change.
 */
static inline void strip_quotes(char *s) {
    for( ; *s ; s++) {
        switch(*s) {
            case '\'':
            case '"':
                *s = ' ';
                break;

            default:
                break;
        }
    }
}
1942
1943 int health_readfile(const char *path, const char *filename) {
1944     debug(D_HEALTH, "Health configuration reading file '%s/%s'", path, filename);
1945
1946     static uint32_t
1947             hash_alarm = 0,
1948             hash_template = 0,
1949             hash_on = 0,
1950             hash_families = 0,
1951             hash_calc = 0,
1952             hash_green = 0,
1953             hash_red = 0,
1954             hash_warn = 0,
1955             hash_crit = 0,
1956             hash_exec = 0,
1957             hash_every = 0,
1958             hash_lookup = 0,
1959             hash_units = 0,
1960             hash_info = 0,
1961             hash_recipient = 0,
1962             hash_delay = 0,
1963             hash_options = 0;
1964
1965     char buffer[HEALTH_CONF_MAX_LINE + 1];
1966
1967     if(unlikely(!hash_alarm)) {
1968         hash_alarm = simple_uhash(HEALTH_ALARM_KEY);
1969         hash_template = simple_uhash(HEALTH_TEMPLATE_KEY);
1970         hash_on = simple_uhash(HEALTH_ON_KEY);
1971         hash_families = simple_uhash(HEALTH_FAMILIES_KEY);
1972         hash_calc = simple_uhash(HEALTH_CALC_KEY);
1973         hash_lookup = simple_uhash(HEALTH_LOOKUP_KEY);
1974         hash_green = simple_uhash(HEALTH_GREEN_KEY);
1975         hash_red = simple_uhash(HEALTH_RED_KEY);
1976         hash_warn = simple_uhash(HEALTH_WARN_KEY);
1977         hash_crit = simple_uhash(HEALTH_CRIT_KEY);
1978         hash_exec = simple_uhash(HEALTH_EXEC_KEY);
1979         hash_every = simple_uhash(HEALTH_EVERY_KEY);
1980         hash_units = simple_hash(HEALTH_UNITS_KEY);
1981         hash_info = simple_hash(HEALTH_INFO_KEY);
1982         hash_recipient = simple_hash(HEALTH_RECIPIENT_KEY);
1983         hash_delay = simple_uhash(HEALTH_DELAY_KEY);
1984         hash_options = simple_uhash(HEALTH_OPTIONS_KEY);
1985     }
1986
1987     snprintfz(buffer, HEALTH_CONF_MAX_LINE, "%s/%s", path, filename);
1988     FILE *fp = fopen(buffer, "r");
1989     if(!fp) {
1990         error("Health configuration cannot read file '%s'.", buffer);
1991         return 0;
1992     }
1993
1994     RRDCALC *rc = NULL;
1995     RRDCALCTEMPLATE *rt = NULL;
1996
1997     size_t line = 0, append = 0;
1998     char *s;
1999     while((s = fgets(&buffer[append], (int)(HEALTH_CONF_MAX_LINE - append), fp)) || append) {
2000         int stop_appending = !s;
2001         line++;
2002         s = trim(buffer);
2003         if(!s) continue;
2004
2005         append = strlen(s);
2006         if(!stop_appending && s[append - 1] == '\\') {
2007             s[append - 1] = ' ';
2008             append = &s[append] - buffer;
2009             if(append < HEALTH_CONF_MAX_LINE)
2010                 continue;
2011             else {
2012                 error("Health configuration has too long muli-line at line %zu of file '%s/%s'.", line, path, filename);
2013             }
2014         }
2015         append = 0;
2016
2017         char *key = s;
2018         while(*s && *s != ':') s++;
2019         if(!*s) {
2020             error("Health configuration has invalid line %zu of file '%s/%s'. It does not contain a ':'. Ignoring it.", line, path, filename);
2021             continue;
2022         }
2023         *s = '\0';
2024         s++;
2025
2026         char *value = s;
2027         key = trim_all_spaces(key);
2028         value = trim_all_spaces(value);
2029
2030         if(!key) {
2031             error("Health configuration has invalid line %zu of file '%s/%s'. Keyword is empty. Ignoring it.", line, path, filename);
2032             continue;
2033         }
2034
2035         if(!value) {
2036             error("Health configuration has invalid line %zu of file '%s/%s'. value is empty. Ignoring it.", line, path, filename);
2037             continue;
2038         }
2039
2040         uint32_t hash = simple_uhash(key);
2041
2042         if(hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) {
2043             if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
2044                 rrdcalc_free(&localhost, rc);
2045
2046             if(rt) {
2047                 if (!rrdcalctemplate_add_template_from_config(&localhost, rt))
2048                     rrdcalctemplate_free(&localhost, rt);
2049                 rt = NULL;
2050             }
2051
2052             rc = callocz(1, sizeof(RRDCALC));
2053             rc->next_event_id = 1;
2054             rc->name = strdupz(value);
2055             rc->hash = simple_hash(rc->name);
2056             rc->source = health_source_file(line, path, filename);
2057             rc->green = NAN;
2058             rc->red = NAN;
2059             rc->value = NAN;
2060             rc->old_value = NAN;
2061             rc->delay_multiplier = 1.0;
2062
2063             if(rrdvar_fix_name(rc->name))
2064                 error("Health configuration renamed alarm '%s' to '%s'", value, rc->name);
2065         }
2066         else if(hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY)) {
2067             if(rc) {
2068                 if(!rrdcalc_add_alarm_from_config(&localhost, rc))
2069                     rrdcalc_free(&localhost, rc);
2070                 rc = NULL;
2071             }
2072
2073             if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
2074                 rrdcalctemplate_free(&localhost, rt);
2075
2076             rt = callocz(1, sizeof(RRDCALCTEMPLATE));
2077             rt->name = strdupz(value);
2078             rt->hash_name = simple_hash(rt->name);
2079             rt->source = health_source_file(line, path, filename);
2080             rt->green = NAN;
2081             rt->red = NAN;
2082             rt->delay_multiplier = 1.0;
2083
2084             if(rrdvar_fix_name(rt->name))
2085                 error("Health configuration renamed template '%s' to '%s'", value, rt->name);
2086         }
2087         else if(rc) {
2088             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
2089                 if(rc->chart) {
2090                     if(strcmp(rc->chart, value))
2091                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2092                                 line, path, filename, rc->name, key, rc->chart, value, value);
2093
2094                     freez(rc->chart);
2095                 }
2096                 rc->chart = strdupz(value);
2097                 rc->hash_chart = simple_hash(rc->chart);
2098             }
2099             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
2100                 health_parse_db_lookup(line, path, filename, value, &rc->group, &rc->after, &rc->before,
2101                                        &rc->update_every,
2102                                        &rc->options, &rc->dimensions);
2103             }
2104             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
2105                 if(!health_parse_duration(value, &rc->update_every))
2106                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.",
2107                          line, path, filename, rc->name, key, value);
2108             }
2109             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
2110                 char *e;
2111                 rc->green = strtold(value, &e);
2112                 if(e && *e) {
2113                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
2114                          line, path, filename, rc->name, key, e);
2115                 }
2116             }
2117             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
2118                 char *e;
2119                 rc->red = strtold(value, &e);
2120                 if(e && *e) {
2121                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
2122                          line, path, filename, rc->name, key, e);
2123                 }
2124             }
2125             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
2126                 const char *failed_at = NULL;
2127                 int error = 0;
2128                 rc->calculation = expression_parse(value, &failed_at, &error);
2129                 if(!rc->calculation) {
2130                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
2131                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
2132                 }
2133             }
2134             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
2135                 const char *failed_at = NULL;
2136                 int error = 0;
2137                 rc->warning = expression_parse(value, &failed_at, &error);
2138                 if(!rc->warning) {
2139                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
2140                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
2141                 }
2142             }
2143             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
2144                 const char *failed_at = NULL;
2145                 int error = 0;
2146                 rc->critical = expression_parse(value, &failed_at, &error);
2147                 if(!rc->critical) {
2148                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
2149                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
2150                 }
2151             }
2152             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
2153                 if(rc->exec) {
2154                     if(strcmp(rc->exec, value))
2155                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2156                              line, path, filename, rc->name, key, rc->exec, value, value);
2157
2158                     freez(rc->exec);
2159                 }
2160                 rc->exec = strdupz(value);
2161             }
2162             else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
2163                 if(rc->recipient) {
2164                     if(strcmp(rc->recipient, value))
2165                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2166                              line, path, filename, rc->name, key, rc->recipient, value, value);
2167
2168                     freez(rc->recipient);
2169                 }
2170                 rc->recipient = strdupz(value);
2171             }
2172             else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
2173                 if(rc->units) {
2174                     if(strcmp(rc->units, value))
2175                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2176                              line, path, filename, rc->name, key, rc->units, value, value);
2177
2178                     freez(rc->units);
2179                 }
2180                 rc->units = strdupz(value);
2181                 strip_quotes(rc->units);
2182             }
2183             else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
2184                 if(rc->info) {
2185                     if(strcmp(rc->info, value))
2186                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2187                              line, path, filename, rc->name, key, rc->info, value, value);
2188
2189                     freez(rc->info);
2190                 }
2191                 rc->info = strdupz(value);
2192                 strip_quotes(rc->info);
2193             }
2194             else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
2195                 health_parse_delay(line, path, filename, value, &rc->delay_up_duration, &rc->delay_down_duration, &rc->delay_max_duration, &rc->delay_multiplier);
2196             }
2197             else if(hash == hash_options && !strcasecmp(key, HEALTH_OPTIONS_KEY)) {
2198                 rc->options |= health_parse_options(value);
2199             }
2200             else {
2201                 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has unknown key '%s'.",
2202                      line, path, filename, rc->name, key);
2203             }
2204         }
2205         else if(rt) {
2206             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
2207                 if(rt->context) {
2208                     if(strcmp(rt->context, value))
2209                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2210                                 line, path, filename, rt->name, key, rt->context, value, value);
2211
2212                     freez(rt->context);
2213                 }
2214                 rt->context = strdupz(value);
2215                 rt->hash_context = simple_hash(rt->context);
2216             }
2217             else if(hash == hash_families && !strcasecmp(key, HEALTH_FAMILIES_KEY)) {
2218                 freez(rt->family_match);
2219                 simple_pattern_free(rt->family_pattern);
2220
2221                 rt->family_match = strdupz(value);
2222                 rt->family_pattern = simple_pattern_create(rt->family_match, SIMPLE_PATTERN_EXACT);
2223             }
2224             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
2225                 health_parse_db_lookup(line, path, filename, value, &rt->group, &rt->after, &rt->before,
2226                                        &rt->update_every, &rt->options, &rt->dimensions);
2227             }
2228             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
2229                 if(!health_parse_duration(value, &rt->update_every))
2230                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' cannot parse duration: '%s'.",
2231                          line, path, filename, rt->name, key, value);
2232             }
2233             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
2234                 char *e;
2235                 rt->green = strtold(value, &e);
2236                 if(e && *e) {
2237                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
2238                          line, path, filename, rt->name, key, e);
2239                 }
2240             }
2241             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
2242                 char *e;
2243                 rt->red = strtold(value, &e);
2244                 if(e && *e) {
2245                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
2246                          line, path, filename, rt->name, key, e);
2247                 }
2248             }
2249             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
2250                 const char *failed_at = NULL;
2251                 int error = 0;
2252                 rt->calculation = expression_parse(value, &failed_at, &error);
2253                 if(!rt->calculation) {
2254                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
2255                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
2256                 }
2257             }
2258             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
2259                 const char *failed_at = NULL;
2260                 int error = 0;
2261                 rt->warning = expression_parse(value, &failed_at, &error);
2262                 if(!rt->warning) {
2263                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
2264                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
2265                 }
2266             }
2267             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
2268                 const char *failed_at = NULL;
2269                 int error = 0;
2270                 rt->critical = expression_parse(value, &failed_at, &error);
2271                 if(!rt->critical) {
2272                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
2273                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
2274                 }
2275             }
2276             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
2277                 if(rt->exec) {
2278                     if(strcmp(rt->exec, value))
2279                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2280                              line, path, filename, rt->name, key, rt->exec, value, value);
2281
2282                     freez(rt->exec);
2283                 }
2284                 rt->exec = strdupz(value);
2285             }
2286             else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
2287                 if(rt->recipient) {
2288                     if(strcmp(rt->recipient, value))
2289                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2290                              line, path, filename, rt->name, key, rt->recipient, value, value);
2291
2292                     freez(rt->recipient);
2293                 }
2294                 rt->recipient = strdupz(value);
2295             }
2296             else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
2297                 if(rt->units) {
2298                     if(strcmp(rt->units, value))
2299                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2300                              line, path, filename, rt->name, key, rt->units, value, value);
2301
2302                     freez(rt->units);
2303                 }
2304                 rt->units = strdupz(value);
2305                 strip_quotes(rt->units);
2306             }
2307             else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
2308                 if(rt->info) {
2309                     if(strcmp(rt->info, value))
2310                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2311                              line, path, filename, rt->name, key, rt->info, value, value);
2312
2313                     freez(rt->info);
2314                 }
2315                 rt->info = strdupz(value);
2316                 strip_quotes(rt->info);
2317             }
2318             else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
2319                 health_parse_delay(line, path, filename, value, &rt->delay_up_duration, &rt->delay_down_duration, &rt->delay_max_duration, &rt->delay_multiplier);
2320             }
2321             else if(hash == hash_options && !strcasecmp(key, HEALTH_OPTIONS_KEY)) {
2322                 rt->options |= health_parse_options(value);
2323             }
2324             else {
2325                 error("Health configuration at line %zu of file '%s/%s' for template '%s' has unknown key '%s'.",
2326                       line, path, filename, rt->name, key);
2327             }
2328         }
2329         else {
2330             error("Health configuration at line %zu of file '%s/%s' has unknown key '%s'. Expected either '" HEALTH_ALARM_KEY "' or '" HEALTH_TEMPLATE_KEY "'.",
2331                   line, path, filename, key);
2332         }
2333     }
2334
2335     if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
2336         rrdcalc_free(&localhost, rc);
2337
2338     if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
2339         rrdcalctemplate_free(&localhost, rt);
2340
2341     fclose(fp);
2342     return 1;
2343 }
2344
2345 void health_readdir(const char *path) {
2346     size_t pathlen = strlen(path);
2347
2348     debug(D_HEALTH, "Health configuration reading directory '%s'", path);
2349
2350     DIR *dir = opendir(path);
2351     if (!dir) {
2352         error("Health configuration cannot open directory '%s'.", path);
2353         return;
2354     }
2355
2356     struct dirent *de = NULL;
2357     while ((de = readdir(dir))) {
2358         size_t len = strlen(de->d_name);
2359
2360         if(de->d_type == DT_DIR
2361            && (
2362                    (de->d_name[0] == '.' && de->d_name[1] == '\0')
2363                    || (de->d_name[0] == '.' && de->d_name[1] == '.' && de->d_name[2] == '\0')
2364            )) {
2365             debug(D_HEALTH, "Ignoring directory '%s'", de->d_name);
2366             continue;
2367         }
2368
2369         else if(de->d_type == DT_DIR) {
2370             char *s = mallocz(pathlen + strlen(de->d_name) + 2);
2371             strcpy(s, path);
2372             strcat(s, "/");
2373             strcat(s, de->d_name);
2374             health_readdir(s);
2375             freez(s);
2376             continue;
2377         }
2378
2379         else if((de->d_type == DT_LNK || de->d_type == DT_REG || de->d_type == DT_UNKNOWN) &&
2380                 len > 5 && !strcmp(&de->d_name[len - 5], ".conf")) {
2381             health_readfile(path, de->d_name);
2382         }
2383
2384         else debug(D_HEALTH, "Ignoring file '%s'", de->d_name);
2385     }
2386
2387     closedir(dir);
2388 }
2389
2390 static inline char *health_config_dir(void) {
2391     char buffer[FILENAME_MAX + 1];
2392     snprintfz(buffer, FILENAME_MAX, "%s/health.d", config_get("global", "config directory", CONFIG_DIR));
2393     return config_get("health", "health configuration directory", buffer);
2394 }
2395
2396 void health_init(void) {
2397     debug(D_HEALTH, "Health configuration initializing");
2398
2399     if(!(health_enabled = config_get_boolean("health", "enabled", 1))) {
2400         debug(D_HEALTH, "Health is disabled.");
2401         return;
2402     }
2403
2404     char *pathname = config_get("health", "health db directory", VARLIB_DIR "/health");
2405     if(mkdir(pathname, 0770) == -1 && errno != EEXIST)
2406         fatal("Cannot create directory '%s'.", pathname);
2407
2408     char filename[FILENAME_MAX + 1];
2409     snprintfz(filename, FILENAME_MAX, "%s/health-log.db", pathname);
2410     health.log_filename = config_get("health", "health db file", filename);
2411
2412     health_alarm_log_load(&localhost);
2413     health_alarm_log_open();
2414
2415     char *path = health_config_dir();
2416
2417     {
2418         char buffer[FILENAME_MAX + 1];
2419         snprintfz(buffer, FILENAME_MAX, "%s/alarm-notify.sh", config_get("global", "plugins directory", PLUGINS_DIR));
2420         health.health_default_exec = config_get("health", "script to execute on alarm", buffer);
2421     }
2422
2423     long n = config_get_number("health", "in memory max health log entries", (long)localhost.health_log.max);
2424     if(n < 10) {
2425         error("Health configuration has invalid max log entries %ld. Using default %u", n, localhost.health_log.max);
2426         config_set_number("health", "in memory max health log entries", (long)localhost.health_log.max);
2427     }
2428     else localhost.health_log.max = (unsigned int)n;
2429
2430     rrdhost_rwlock(&localhost);
2431     health_readdir(path);
2432     rrdhost_unlock(&localhost);
2433 }
2434
2435 // ----------------------------------------------------------------------------
2436 // JSON generation
2437
2438 static inline void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) {
2439     if(value && *value)
2440         buffer_sprintf(wb, "%s\"%s\":\"%s\"%s", prefix, label, value, suffix);
2441     else
2442         buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix);
2443 }
2444
2445 static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) {
2446     buffer_sprintf(wb,
2447             "\n\t{\n"
2448                     "\t\t\"hostname\": \"%s\",\n"
2449                     "\t\t\"unique_id\": %u,\n"
2450                     "\t\t\"alarm_id\": %u,\n"
2451                     "\t\t\"alarm_event_id\": %u,\n"
2452                     "\t\t\"name\": \"%s\",\n"
2453                     "\t\t\"chart\": \"%s\",\n"
2454                     "\t\t\"family\": \"%s\",\n"
2455                     "\t\t\"processed\": %s,\n"
2456                     "\t\t\"updated\": %s,\n"
2457                     "\t\t\"exec_run\": %lu,\n"
2458                     "\t\t\"exec_failed\": %s,\n"
2459                     "\t\t\"exec\": \"%s\",\n"
2460                     "\t\t\"recipient\": \"%s\",\n"
2461                     "\t\t\"exec_code\": %d,\n"
2462                     "\t\t\"source\": \"%s\",\n"
2463                     "\t\t\"units\": \"%s\",\n"
2464                     "\t\t\"info\": \"%s\",\n"
2465                     "\t\t\"when\": %lu,\n"
2466                     "\t\t\"duration\": %lu,\n"
2467                     "\t\t\"non_clear_duration\": %lu,\n"
2468                     "\t\t\"status\": \"%s\",\n"
2469                     "\t\t\"old_status\": \"%s\",\n"
2470                     "\t\t\"delay\": %d,\n"
2471                     "\t\t\"delay_up_to_timestamp\": %lu,\n"
2472                     "\t\t\"updated_by_id\": %u,\n"
2473                     "\t\t\"updates_id\": %u,\n"
2474                     "\t\t\"value_string\": \"%s\",\n"
2475                     "\t\t\"old_value_string\": \"%s\",\n"
2476             , host->hostname
2477             , ae->unique_id
2478             , ae->alarm_id
2479             , ae->alarm_event_id
2480             , ae->name
2481             , ae->chart
2482             , ae->family
2483             , (ae->flags & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false"
2484             , (ae->flags & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false"
2485             , (unsigned long)ae->exec_run_timestamp
2486             , (ae->flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false"
2487             , ae->exec?ae->exec:health.health_default_exec
2488             , ae->recipient?ae->recipient:health.health_default_recipient
2489             , ae->exec_code
2490             , ae->source
2491             , ae->units?ae->units:""
2492             , ae->info?ae->info:""
2493             , (unsigned long)ae->when
2494             , (unsigned long)ae->duration
2495             , (unsigned long)ae->non_clear_duration
2496             , rrdcalc_status2string(ae->new_status)
2497             , rrdcalc_status2string(ae->old_status)
2498             , ae->delay
2499             , (unsigned long)ae->delay_up_to_timestamp
2500             , ae->updated_by_id
2501             , ae->updates_id
2502             , ae->new_value_string
2503             , ae->old_value_string
2504     );
2505
2506     if(unlikely(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION)) {
2507         buffer_strcat(wb, "\t\t\"no_clear_notification\": true,\n");
2508     }
2509
2510     buffer_strcat(wb, "\t\t\"value\":");
2511     buffer_rrd_value(wb, ae->new_value);
2512     buffer_strcat(wb, ",\n");
2513
2514     buffer_strcat(wb, "\t\t\"old_value\":");
2515     buffer_rrd_value(wb, ae->old_value);
2516     buffer_strcat(wb, "\n");
2517
2518     buffer_strcat(wb, "\t}");
2519 }
2520
2521 void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after) {
2522     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
2523
2524     buffer_strcat(wb, "[");
2525
2526     unsigned int max = host->health_log.max;
2527     unsigned int count = 0;
2528     ALARM_ENTRY *ae;
2529     for(ae = host->health_log.alarms; ae && count < max ; count++, ae = ae->next) {
2530         if(ae->unique_id > after) {
2531             if(likely(count)) buffer_strcat(wb, ",");
2532             health_alarm_entry2json_nolock(wb, ae, host);
2533         }
2534     }
2535
2536     buffer_strcat(wb, "\n]\n");
2537
2538     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2539 }
2540
2541 static inline void health_rrdcalc2json_nolock(BUFFER *wb, RRDCALC *rc) {
2542     char value_string[100 + 1];
2543     format_value_and_unit(value_string, 100, rc->value, rc->units, -1);
2544
2545     buffer_sprintf(wb,
2546            "\t\t\"%s.%s\": {\n"
2547                    "\t\t\t\"id\": %lu,\n"
2548                    "\t\t\t\"name\": \"%s\",\n"
2549                    "\t\t\t\"chart\": \"%s\",\n"
2550                    "\t\t\t\"family\": \"%s\",\n"
2551                    "\t\t\t\"active\": %s,\n"
2552                    "\t\t\t\"exec\": \"%s\",\n"
2553                    "\t\t\t\"recipient\": \"%s\",\n"
2554                    "\t\t\t\"source\": \"%s\",\n"
2555                    "\t\t\t\"units\": \"%s\",\n"
2556                    "\t\t\t\"info\": \"%s\",\n"
2557                                    "\t\t\t\"status\": \"%s\",\n"
2558                    "\t\t\t\"last_status_change\": %lu,\n"
2559                    "\t\t\t\"last_updated\": %lu,\n"
2560                    "\t\t\t\"next_update\": %lu,\n"
2561                    "\t\t\t\"update_every\": %d,\n"
2562                    "\t\t\t\"delay_up_duration\": %d,\n"
2563                    "\t\t\t\"delay_down_duration\": %d,\n"
2564                    "\t\t\t\"delay_max_duration\": %d,\n"
2565                    "\t\t\t\"delay_multiplier\": %f,\n"
2566                    "\t\t\t\"delay\": %d,\n"
2567                    "\t\t\t\"delay_up_to_timestamp\": %lu,\n"
2568                    "\t\t\t\"value_string\": \"%s\",\n"
2569            , rc->chart, rc->name
2570            , (unsigned long)rc->id
2571            , rc->name
2572            , rc->chart
2573            , (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:""
2574            , (rc->rrdset)?"true":"false"
2575            , rc->exec?rc->exec:health.health_default_exec
2576            , rc->recipient?rc->recipient:health.health_default_recipient
2577            , rc->source
2578            , rc->units?rc->units:""
2579            , rc->info?rc->info:""
2580            , rrdcalc_status2string(rc->status)
2581            , (unsigned long)rc->last_status_change
2582            , (unsigned long)rc->last_updated
2583            , (unsigned long)rc->next_update
2584            , rc->update_every
2585            , rc->delay_up_duration
2586            , rc->delay_down_duration
2587            , rc->delay_max_duration
2588            , rc->delay_multiplier
2589            , rc->delay_last
2590            , (unsigned long)rc->delay_up_to_timestamp
2591            , value_string
2592     );
2593
2594     if(unlikely(rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)) {
2595         buffer_strcat(wb, "\t\t\t\"no_clear_notification\": true,\n");
2596     }
2597
2598     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
2599         if(rc->dimensions && *rc->dimensions)
2600             health_string2json(wb, "\t\t\t", "lookup_dimensions", rc->dimensions, ",\n");
2601
2602         buffer_sprintf(wb,
2603                        "\t\t\t\"db_after\": %lu,\n"
2604                        "\t\t\t\"db_before\": %lu,\n"
2605                        "\t\t\t\"lookup_method\": \"%s\",\n"
2606                        "\t\t\t\"lookup_after\": %d,\n"
2607                        "\t\t\t\"lookup_before\": %d,\n"
2608                        "\t\t\t\"lookup_options\": \"",
2609                        (unsigned long) rc->db_after,
2610                        (unsigned long) rc->db_before,
2611                        group_method2string(rc->group),
2612                        rc->after,
2613                        rc->before
2614         );
2615         buffer_data_options2string(wb, rc->options);
2616         buffer_strcat(wb, "\",\n");
2617     }
2618
2619     if(rc->calculation) {
2620         health_string2json(wb, "\t\t\t", "calc", rc->calculation->source, ",\n");
2621         health_string2json(wb, "\t\t\t", "calc_parsed", rc->calculation->parsed_as, ",\n");
2622     }
2623
2624     if(rc->warning) {
2625         health_string2json(wb, "\t\t\t", "warn", rc->warning->source, ",\n");
2626         health_string2json(wb, "\t\t\t", "warn_parsed", rc->warning->parsed_as, ",\n");
2627     }
2628
2629     if(rc->critical) {
2630         health_string2json(wb, "\t\t\t", "crit", rc->critical->source, ",\n");
2631         health_string2json(wb, "\t\t\t", "crit_parsed", rc->critical->parsed_as, ",\n");
2632     }
2633
2634     buffer_strcat(wb, "\t\t\t\"green\":");
2635     buffer_rrd_value(wb, rc->green);
2636     buffer_strcat(wb, ",\n");
2637
2638     buffer_strcat(wb, "\t\t\t\"red\":");
2639     buffer_rrd_value(wb, rc->red);
2640     buffer_strcat(wb, ",\n");
2641
2642     buffer_strcat(wb, "\t\t\t\"value\":");
2643     buffer_rrd_value(wb, rc->value);
2644     buffer_strcat(wb, "\n");
2645
2646     buffer_strcat(wb, "\t\t}");
2647 }
2648
2649 //void health_rrdcalctemplate2json_nolock(BUFFER *wb, RRDCALCTEMPLATE *rt) {
2650 //
2651 //}
2652
2653 void health_alarms2json(RRDHOST *host, BUFFER *wb, int all) {
2654     int i;
2655
2656     rrdhost_rdlock(&localhost);
2657     buffer_sprintf(wb, "{\n\t\"hostname\": \"%s\","
2658                         "\n\t\"latest_alarm_log_unique_id\": %u,"
2659                         "\n\t\"status\": %s,"
2660                         "\n\t\"now\": %lu,"
2661                         "\n\t\"alarms\": {\n",
2662                         host->hostname,
2663                         (host->health_log.next_log_id > 0)?(host->health_log.next_log_id - 1):0,
2664                         health_enabled?"true":"false",
2665                         (unsigned long)now_realtime_sec());
2666
2667     RRDCALC *rc;
2668     for(i = 0, rc = host->alarms; rc ; rc = rc->next) {
2669         if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
2670             continue;
2671
2672         if(likely(!all && !(rc->status == RRDCALC_STATUS_WARNING || rc->status == RRDCALC_STATUS_CRITICAL)))
2673             continue;
2674
2675         if(likely(i)) buffer_strcat(wb, ",\n");
2676         health_rrdcalc2json_nolock(wb, rc);
2677         i++;
2678     }
2679
2680 //    buffer_strcat(wb, "\n\t},\n\t\"templates\": {");
2681 //    RRDCALCTEMPLATE *rt;
2682 //    for(rt = host->templates; rt ; rt = rt->next)
2683 //        health_rrdcalctemplate2json_nolock(wb, rt);
2684
2685     buffer_strcat(wb, "\n\t}\n}\n");
2686     rrdhost_unlock(&localhost);
2687 }
2688
2689
2690 // ----------------------------------------------------------------------------
2691 // re-load health configuration
2692
2693 static inline void health_free_all_nolock(RRDHOST *host) {
2694     while(host->templates)
2695         rrdcalctemplate_free(host, host->templates);
2696
2697     while(host->alarms)
2698         rrdcalc_free(host, host->alarms);
2699 }
2700
2701 void health_reload(void) {
2702     if(!health_enabled) {
2703         error("Health reload is requested, but health is not enabled.");
2704         return;
2705     }
2706
2707     char *path = health_config_dir();
2708
2709     // free all running alarms
2710     rrdhost_rwlock(&localhost);
2711     health_free_all_nolock(&localhost);
2712     rrdhost_unlock(&localhost);
2713
2714     // invalidate all previous entries in the alarm log
2715     ALARM_ENTRY *t;
2716     for(t = localhost.health_log.alarms ; t ; t = t->next) {
2717         if(t->new_status != RRDCALC_STATUS_REMOVED)
2718             t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
2719     }
2720
2721     // reset all thresholds to all charts
2722     RRDSET *st;
2723     for(st = localhost.rrdset_root; st ; st = st->next) {
2724         st->green = NAN;
2725         st->red = NAN;
2726     }
2727
2728     // load the new alarms
2729     rrdhost_rwlock(&localhost);
2730     health_readdir(path);
2731     rrdhost_unlock(&localhost);
2732
2733     // link the loaded alarms to their charts
2734     for(st = localhost.rrdset_root; st ; st = st->next) {
2735         rrdhost_rwlock(&localhost);
2736
2737         rrdsetcalc_link_matching(st);
2738         rrdcalctemplate_link_matching(st);
2739
2740         rrdhost_unlock(&localhost);
2741     }
2742 }
2743
2744 // ----------------------------------------------------------------------------
2745 // health main thread and friends
2746
2747 static inline int rrdcalc_value2status(calculated_number n) {
2748     if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED;
2749     if(n) return RRDCALC_STATUS_RAISED;
2750     return RRDCALC_STATUS_CLEAR;
2751 }
2752
2753 #define ALARM_EXEC_COMMAND_LENGTH 8192
2754
2755 static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
2756     ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
2757
2758     if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
2759         // do not send notifications for internal statuses
2760         debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
2761         goto done;
2762     }
2763
2764     if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
2765         // do not send notifications for disabled statuses
2766         debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
2767         // mark it as run, so that we will send the same alarm if it happens again
2768         goto done;
2769     }
2770
2771     // find the previous notification for the same alarm
2772     // which we have run the exec script
2773     // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set
2774     if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
2775         uint32_t id = ae->alarm_id;
2776         ALARM_ENTRY *t;
2777         for(t = ae->next; t ; t = t->next) {
2778             if(t->alarm_id == id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
2779                 break;
2780         }
2781
2782         if(likely(t)) {
2783             // we have executed this alarm notification in the past
2784             if(t && t->new_status == ae->new_status) {
2785                 // don't send the notification for the same status again
2786                 debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name
2787                       , rrdcalc_status2string(ae->new_status));
2788                 goto done;
2789             }
2790         }
2791         else {
2792             // we have not executed this alarm notification in the past
2793             // so, don't send CLEAR notifications
2794             if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
2795                 debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
2796                       , ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
2797                 goto done;
2798             }
2799         }
2800     }
2801
2802     static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1];
2803     pid_t command_pid;
2804
2805     const char *exec = ae->exec;
2806     if(!exec) exec = health.health_default_exec;
2807
2808     const char *recipient = ae->recipient;
2809     if(!recipient) recipient = health.health_default_recipient;
2810
2811     snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s' '%s' '%s'",
2812               exec,
2813               recipient,
2814               host->hostname,
2815               ae->unique_id,
2816               ae->alarm_id,
2817               ae->alarm_event_id,
2818               (unsigned long)ae->when,
2819               ae->name,
2820               ae->chart?ae->chart:"NOCAHRT",
2821               ae->family?ae->family:"NOFAMILY",
2822               rrdcalc_status2string(ae->new_status),
2823               rrdcalc_status2string(ae->old_status),
2824               ae->new_value,
2825               ae->old_value,
2826               ae->source?ae->source:"UNKNOWN",
2827               (uint32_t)ae->duration,
2828               (uint32_t)ae->non_clear_duration,
2829               ae->units?ae->units:"",
2830               ae->info?ae->info:"",
2831               ae->new_value_string,
2832               ae->old_value_string
2833     );
2834
2835     ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
2836     ae->exec_run_timestamp = now_realtime_sec();
2837
2838     debug(D_HEALTH, "executing command '%s'", command_to_run);
2839     FILE *fp = mypopen(command_to_run, &command_pid);
2840     if(!fp) {
2841         error("HEALTH: Cannot popen(\"%s\", \"r\").", command_to_run);
2842         goto done;
2843     }
2844     debug(D_HEALTH, "HEALTH reading from command");
2845     char *s = fgets(command_to_run, FILENAME_MAX, fp);
2846     (void)s;
2847     ae->exec_code = mypclose(fp, command_pid);
2848     debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
2849
2850     if(ae->exec_code != 0)
2851         ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
2852
2853 done:
2854     health_alarm_log_save(host, ae);
2855     return;
2856 }
2857
2858 static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
2859     debug(D_HEALTH, "Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
2860          ae->chart?ae->chart:"NOCHART", ae->name,
2861          ae->new_value,
2862          rrdcalc_status2string(ae->old_status),
2863          rrdcalc_status2string(ae->new_status)
2864     );
2865
2866     health_alarm_execute(host, ae);
2867 }
2868
2869 static inline void health_alarm_log_process(RRDHOST *host) {
2870     static uint32_t stop_at_id = 0;
2871     uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
2872     time_t now = now_realtime_sec();
2873
2874     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
2875
2876     ALARM_ENTRY *ae;
2877     for(ae = host->health_log.alarms; ae && ae->unique_id >= stop_at_id ; ae = ae->next) {
2878         if(unlikely(
2879             !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
2880             !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
2881             )) {
2882
2883             if(unlikely(ae->unique_id < first_waiting))
2884                 first_waiting = ae->unique_id;
2885
2886             if(likely(now >= ae->delay_up_to_timestamp))
2887                 health_process_notifications(host, ae);
2888         }
2889     }
2890
2891     // remember this for the next iteration
2892     stop_at_id = first_waiting;
2893
2894     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2895
2896     if(host->health_log.count <= host->health_log.max)
2897         return;
2898
2899     // cleanup excess entries in the log
2900     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
2901
2902     ALARM_ENTRY *last = NULL;
2903     unsigned int count = host->health_log.max * 2 / 3;
2904     for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
2905
2906     if(ae && last && last->next == ae)
2907         last->next = NULL;
2908     else
2909         ae = NULL;
2910
2911     while(ae) {
2912         debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
2913
2914         ALARM_ENTRY *t = ae->next;
2915
2916         freez(ae->name);
2917         freez(ae->chart);
2918         freez(ae->family);
2919         freez(ae->exec);
2920         freez(ae->recipient);
2921         freez(ae->source);
2922         freez(ae->units);
2923         freez(ae->info);
2924         freez(ae->old_value_string);
2925         freez(ae->new_value_string);
2926         freez(ae);
2927
2928         ae = t;
2929         host->health_log.count--;
2930     }
2931
2932     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2933 }
2934
2935 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
2936     if(unlikely(!rc->rrdset)) {
2937         debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
2938         return 0;
2939     }
2940
2941     if(unlikely(rc->next_update > now)) {
2942         if (unlikely(*next_run > rc->next_update)) {
2943             // update the next_run time of the main loop
2944             // to run this alarm precisely the time required
2945             *next_run = rc->next_update;
2946         }
2947
2948         debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
2949         return 0;
2950     }
2951
2952     if(unlikely(!rc->update_every)) {
2953         debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
2954         return 0;
2955     }
2956
2957     if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
2958         debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
2959         return 0;
2960     }
2961
2962     int update_every = rc->rrdset->update_every;
2963     time_t first = rrdset_first_entry_t(rc->rrdset);
2964     time_t last = rrdset_last_entry_t(rc->rrdset);
2965
2966     if(unlikely(now + update_every < first /* || now - update_every > last */)) {
2967         debug(D_HEALTH
2968               , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
2969               , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first
2970               , (unsigned long) last);
2971         return 0;
2972     }
2973
2974     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
2975         time_t needed = now + rc->before + rc->after;
2976
2977         if(needed + update_every < first || needed - update_every > last) {
2978             debug(D_HEALTH
2979                   , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
2980                   , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first
2981                   , (unsigned long) last);
2982             return 0;
2983         }
2984     }
2985
2986     return 1;
2987 }
2988
2989 void *health_main(void *ptr) {
2990     struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
2991
2992     info("HEALTH thread created with task id %d", gettid());
2993
2994     if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
2995         error("Cannot set pthread cancel type to DEFERRED.");
2996
2997     if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
2998         error("Cannot set pthread cancel state to ENABLE.");
2999
3000     int min_run_every = (int)config_get_number("health", "run at least every seconds", 10);
3001     if(min_run_every < 1) min_run_every = 1;
3002
3003     BUFFER *wb = buffer_create(100);
3004
3005     unsigned int loop = 0;
3006     while(health_enabled && !netdata_exit) {
3007         loop++;
3008         debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
3009
3010         int oldstate, runnable = 0;
3011         time_t now = now_realtime_sec();
3012         time_t next_run = now + min_run_every;
3013         RRDCALC *rc;
3014
3015         if(unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
3016             error("Cannot set pthread cancel state to DISABLE.");
3017
3018         rrdhost_rdlock(&localhost);
3019
3020         // the first loop is to lookup values from the db
3021         for(rc = localhost.alarms; rc; rc = rc->next) {
3022             if(unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
3023                 if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))
3024                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE;
3025                 continue;
3026             }
3027
3028             runnable++;
3029             rc->old_value = rc->value;
3030             rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE;
3031
3032             // 1. if there is database lookup, do it
3033             // 2. if there is calculation expression, run it
3034
3035             if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
3036                 /* time_t old_db_timestamp = rc->db_before; */
3037                 int value_is_null = 0;
3038
3039                 int ret = rrd2value(rc->rrdset, wb, &rc->value,
3040                                     rc->dimensions, 1, rc->after, rc->before, rc->group,
3041                                     rc->options, &rc->db_after, &rc->db_before, &value_is_null);
3042
3043                 if (unlikely(ret != 200)) {
3044                     // database lookup failed
3045                     rc->value = NAN;
3046
3047                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
3048
3049                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) {
3050                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
3051                         error("Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
3052                     }
3053                 }
3054                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))
3055                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
3056
3057                 /* - RRDCALC_FLAG_DB_STALE not currently used
3058                 if (unlikely(old_db_timestamp == rc->db_before)) {
3059                     // database is stale
3060
3061                     debug(D_HEALTH, "Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
3062
3063                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
3064                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
3065                         error("Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
3066                     }
3067                 }
3068                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
3069                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
3070                 */
3071
3072                 if (unlikely(value_is_null)) {
3073                     // collected value is null
3074
3075                     rc->value = NAN;
3076
3077                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
3078                           rc->chart?rc->chart:"NOCHART", rc->name);
3079
3080                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) {
3081                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
3082                         error("Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
3083                               rc->chart?rc->chart:"NOCHART", rc->name);
3084                     }
3085                 }
3086                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))
3087                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
3088
3089                 debug(D_HEALTH, "Health alarm '%s.%s': database lookup gave value "
3090                         CALCULATED_NUMBER_FORMAT, rc->chart?rc->chart:"NOCHART", rc->name, rc->value);
3091             }
3092
3093             if(unlikely(rc->calculation)) {
3094                 if (unlikely(!expression_evaluate(rc->calculation))) {
3095                     // calculation failed
3096
3097                     rc->value = NAN;
3098
3099                     debug(D_HEALTH, "Health alarm '%s.%s': expression '%s' failed: %s",
3100                           rc->chart?rc->chart:"NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
3101
3102                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) {
3103                         rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
3104                         error("Health alarm '%s.%s': expression '%s' failed: %s",
3105                               rc->chart?rc->chart:"NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
3106                     }
3107                 }
3108                 else {
3109                     if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))
3110                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
3111
3112                     debug(D_HEALTH, "Health alarm '%s.%s': expression '%s' gave value "
3113                             CALCULATED_NUMBER_FORMAT
3114                             ": %s (source: %s)",
3115                           rc->chart?rc->chart:"NOCHART", rc->name,
3116                           rc->calculation->parsed_as,
3117                           rc->calculation->result,
3118                           buffer_tostring(rc->calculation->error_msg),
3119                           rc->source
3120                     );
3121
3122                     rc->value = rc->calculation->result;
3123                 }
3124             }
3125         }
3126         rrdhost_unlock(&localhost);
3127
3128         if(unlikely(runnable && !netdata_exit)) {
3129             rrdhost_rdlock(&localhost);
3130
3131             for(rc = localhost.alarms; rc; rc = rc->next) {
3132                 if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
3133                     continue;
3134
3135                 int warning_status  = RRDCALC_STATUS_UNDEFINED;
3136                 int critical_status = RRDCALC_STATUS_UNDEFINED;
3137
3138                 if(likely(rc->warning)) {
3139                     if(unlikely(!expression_evaluate(rc->warning))) {
3140                         // calculation failed
3141
3142                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression failed with error: %s",
3143                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
3144
3145                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) {
3146                             rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
3147                             error("Health alarm '%s.%s': warning expression failed with error: %s",
3148                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
3149                         }
3150                     }
3151                     else {
3152                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))
3153                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
3154
3155                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression gave value "
3156                                 CALCULATED_NUMBER_FORMAT
3157                                 ": %s (source: %s)",
3158                               rc->chart?rc->chart:"NOCHART", rc->name,
3159                               rc->warning->result,
3160                               buffer_tostring(rc->warning->error_msg),
3161                               rc->source
3162                         );
3163
3164                         warning_status = rrdcalc_value2status(rc->warning->result);
3165                     }
3166                 }
3167
3168                 if(likely(rc->critical)) {
3169                     if(unlikely(!expression_evaluate(rc->critical))) {
3170                         // calculation failed
3171
3172                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression failed with error: %s",
3173                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
3174
3175                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) {
3176                             rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
3177                             error("Health alarm '%s.%s': critical expression failed with error: %s",
3178                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
3179                         }
3180                     }
3181                     else {
3182                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))
3183                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
3184
3185                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression gave value "
3186                                 CALCULATED_NUMBER_FORMAT
3187                                 ": %s (source: %s)",
3188                               rc->chart?rc->chart:"NOCHART", rc->name,
3189                               rc->critical->result,
3190                               buffer_tostring(rc->critical->error_msg),
3191                               rc->source
3192                         );
3193
3194                         critical_status = rrdcalc_value2status(rc->critical->result);
3195                     }
3196                 }
3197
3198                 int status = RRDCALC_STATUS_UNDEFINED;
3199
3200                 switch(warning_status) {
3201                     case RRDCALC_STATUS_CLEAR:
3202                         status = RRDCALC_STATUS_CLEAR;
3203                         break;
3204
3205                     case RRDCALC_STATUS_RAISED:
3206                         status = RRDCALC_STATUS_WARNING;
3207                         break;
3208
3209                     default:
3210                         break;
3211                 }
3212
3213                 switch(critical_status) {
3214                     case RRDCALC_STATUS_CLEAR:
3215                         if(status == RRDCALC_STATUS_UNDEFINED)
3216                             status = RRDCALC_STATUS_CLEAR;
3217                         break;
3218
3219                     case RRDCALC_STATUS_RAISED:
3220                         status = RRDCALC_STATUS_CRITICAL;
3221                         break;
3222
3223                     default:
3224                         break;
3225                 }
3226
3227                 if(status != rc->status) {
3228                     int delay = 0;
3229
3230                     if(now > rc->delay_up_to_timestamp) {
3231                         rc->delay_up_current = rc->delay_up_duration;
3232                         rc->delay_down_current = rc->delay_down_duration;
3233                         rc->delay_last = 0;
3234                         rc->delay_up_to_timestamp = 0;
3235                     }
3236                     else {
3237                         rc->delay_up_current = (int)(rc->delay_up_current * rc->delay_multiplier);
3238                         if(rc->delay_up_current > rc->delay_max_duration) rc->delay_up_current = rc->delay_max_duration;
3239
3240                         rc->delay_down_current = (int)(rc->delay_down_current * rc->delay_multiplier);
3241                         if(rc->delay_down_current > rc->delay_max_duration) rc->delay_down_current = rc->delay_max_duration;
3242                     }
3243
3244                     if(status > rc->status)
3245                         delay = rc->delay_up_current;
3246                     else
3247                         delay = rc->delay_down_current;
3248
3249                     // COMMENTED: because we do need to send raising alarms
3250                     // if(now + delay < rc->delay_up_to_timestamp)
3251                     //    delay = (int)(rc->delay_up_to_timestamp - now);
3252
3253                     rc->delay_last = delay;
3254                     rc->delay_up_to_timestamp = now + delay;
3255                     health_alarm_log(
3256                             &localhost,
3257                             rc->id,
3258                             rc->next_event_id++,
3259                             now,
3260                             rc->name,
3261                             rc->rrdset->id,
3262                             rc->rrdset->family,
3263                             rc->exec,
3264                             rc->recipient,
3265                             now - rc->last_status_change,
3266                             rc->old_value,
3267                             rc->value,
3268                             rc->status,
3269                             status,
3270                             rc->source,
3271                             rc->units,
3272                             rc->info,
3273                             rc->delay_last,
3274                             (rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)?HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION:0
3275                     );
3276                     rc->last_status_change = now;
3277                     rc->status = status;
3278                 }
3279
3280                 rc->last_updated = now;
3281                 rc->next_update = now + rc->update_every;
3282
3283                 if (next_run > rc->next_update)
3284                     next_run = rc->next_update;
3285             }
3286
3287             rrdhost_unlock(&localhost);
3288         }
3289
3290         if (unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
3291             error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
3292
3293         if(unlikely(netdata_exit))
3294             break;
3295
3296         // execute notifications
3297         // and cleanup
3298         health_alarm_log_process(&localhost);
3299
3300         if(unlikely(netdata_exit))
3301             break;
3302         
3303         now = now_realtime_sec();
3304         if(now < next_run) {
3305             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs",
3306                   loop, (int) (next_run - now));
3307             sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now));
3308         }
3309         else {
3310             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
3311         }
3312     }
3313
3314     buffer_free(wb);
3315
3316     info("HEALTH thread exiting");
3317
3318     static_thread->enabled = 0;
3319     pthread_exit(NULL);
3320     return NULL;
3321 }