]> arthur.barton.de Git - netdata.git/blob - src/health.c
pretty value formatting on all alarm notifications
[netdata.git] / src / health.c
1 #include "common.h"
2
3 #define RRDVAR_MAX_LENGTH 1024
4
// Configuration defaults and runtime state of the health alarm log.
struct health_options {
    // default command for alarms (initialised to the alarm-notify.sh plugin)
    const char *health_default_exec;

    // default recipient for alarms (initialised to "root")
    const char *health_default_recipient;

    // path of the alarm log file; the rotated copy is this path plus ".old"
    const char *log_filename;

    // lines read from / written to the current log file, compared against the rotation threshold
    size_t log_entries_written;

    // append handle on the alarm log, NULL while the log is closed
    FILE *log_fp;
};
12
13 static struct health_options health = {
14     .health_default_exec = PLUGINS_DIR "/alarm-notify.sh",
15     .health_default_recipient = "root",
16     .log_filename = VARLIB_DIR "/health/alarm_log.db",
17     .log_entries_written = 0,
18     .log_fp = NULL
19 };
20
21 int health_enabled = 1;
22
23 // ----------------------------------------------------------------------------
24 // health alarm log load/save
25 // no need for locking - only one thread is reading / writing the alarms log
26
27 static inline int health_alarm_log_open(void) {
28     if(health.log_fp)
29         fclose(health.log_fp);
30
31     health.log_fp = fopen(health.log_filename, "a");
32
33     if(health.log_fp) {
34         if (setvbuf(health.log_fp, NULL, _IOLBF, 0) != 0)
35             error("Health: cannot set line buffering on health log file.");
36         return 0;
37     }
38
39     error("Health: cannot open health log file '%s'. Health data will be lost in case of netdata or server crash.", health.log_filename);
40     return -1;
41 }
42
43 static inline void health_alarm_log_close(void) {
44     if(health.log_fp) {
45         fclose(health.log_fp);
46         health.log_fp = NULL;
47     }
48 }
49
50 static inline void health_log_rotate(void) {
51     static size_t rotate_every = 0;
52
53     if(unlikely(rotate_every == 0)) {
54         rotate_every = (size_t)config_get_number("health", "rotate log every lines", 2000);
55         if(rotate_every < 100) rotate_every = 100;
56     }
57
58     if(unlikely(health.log_entries_written > rotate_every)) {
59         health_alarm_log_close();
60
61         char old_filename[FILENAME_MAX + 1];
62         snprintfz(old_filename, FILENAME_MAX, "%s.old", health.log_filename);
63
64         if(unlink(old_filename) == -1 && errno != ENOENT)
65             error("Health: cannot remove old alarms log file '%s'", old_filename);
66
67         if(link(health.log_filename, old_filename) == -1 && errno != ENOENT)
68             error("Health: cannot move file '%s' to '%s'.", health.log_filename, old_filename);
69
70         if(unlink(health.log_filename) == -1 && errno != ENOENT)
71             error("Health: cannot remove old alarms log file '%s'", health.log_filename);
72
73         // open it with truncate
74         health.log_fp = fopen(health.log_filename, "w");
75
76         if(health.log_fp)
77             fclose(health.log_fp);
78         else
79             error("Health: cannot truncate health log '%s'", health.log_filename);
80
81         health.log_fp = NULL;
82
83         health.log_entries_written = 0;
84         health_alarm_log_open();
85     }
86 }
87
88 static inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
89     health_log_rotate();
90
91     if(likely(health.log_fp)) {
92         if(unlikely(fprintf(health.log_fp
93                 , "%c\t%s"
94                   "\t%08x\t%08x\t%08x\t%08x\t%08x"
95                   "\t%08x\t%08x\t%08x"
96                   "\t%08x\t%08x\t%08x"
97                   "\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s"
98                   "\t%d\t%d\t%d\t%d"
99                   "\t%Lf\t%Lf"
100                   "\n"
101                 , (ae->flags & HEALTH_ENTRY_FLAG_SAVED)?'U':'A'
102                 , host->hostname
103
104                 , ae->unique_id
105                 , ae->alarm_id
106                 , ae->alarm_event_id
107                 , ae->updated_by_id
108                 , ae->updates_id
109
110                 , (uint32_t)ae->when
111                 , (uint32_t)ae->duration
112                 , (uint32_t)ae->non_clear_duration
113                 , (uint32_t)ae->flags
114                 , (uint32_t)ae->exec_run_timestamp
115                 , (uint32_t)ae->delay_up_to_timestamp
116
117                 , (ae->name)?ae->name:""
118                 , (ae->chart)?ae->chart:""
119                 , (ae->family)?ae->family:""
120                 , (ae->exec)?ae->exec:""
121                 , (ae->recipient)?ae->recipient:""
122                 , (ae->source)?ae->source:""
123                 , (ae->units)?ae->units:""
124                 , (ae->info)?ae->info:""
125
126                 , ae->exec_code
127                 , ae->new_status
128                 , ae->old_status
129                 , ae->delay
130
131                 , (long double)ae->new_value
132                 , (long double)ae->old_value
133         ) < 0))
134             error("Health: failed to save alarm log entry. Health data may be lost in case of abnormal restart.");
135         else {
136             ae->flags |= HEALTH_ENTRY_FLAG_SAVED;
137             health.log_entries_written++;
138         }
139     }
140 }
141
142 static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filename) {
143     static uint32_t max_unique_id = 0, max_alarm_id = 0;
144
145     errno = 0;
146
147     char *s, *buf = mallocz(65536 + 1);
148     size_t line = 0, len = 0;
149     ssize_t loaded = 0, updated = 0, errored = 0, duplicate = 0;
150
151     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
152
153     while((s = fgets_trim_len(buf, 65536, fp, &len))) {
154         health.log_entries_written++;
155         line++;
156
157         int max_entries = 30, entries = 0;
158         char *pointers[max_entries];
159
160         pointers[entries++] = s++;
161         while(*s) {
162             if(unlikely(*s == '\t')) {
163                 *s = '\0';
164                 pointers[entries++] = ++s;
165                 if(entries >= max_entries) {
166                     error("Health: line %zu of file '%s' has more than %d entries. Ignoring excessive entries.", line, filename, max_entries);
167                     break;
168                 }
169             }
170             else s++;
171         }
172
173         if(likely(*pointers[0] == 'U' || *pointers[0] == 'A')) {
174             ALARM_ENTRY *ae = NULL;
175
176             if(entries < 26) {
177                 error("Health: line %zu of file '%s' should have at least 26 entries, but it has %d. Ignoring it.", line, filename, entries);
178                 errored++;
179                 continue;
180             }
181
182             // check that we have valid ids
183             uint32_t unique_id = (uint32_t)strtoul(pointers[2], NULL, 16);
184             if(!unique_id) {
185                 error("Health: line %zu of file '%s' states alarm entry with invalid unique id %u (%s). Ignoring it.", line, filename, unique_id, pointers[2]);
186                 errored++;
187                 continue;
188             }
189
190             uint32_t alarm_id = (uint32_t)strtoul(pointers[3], NULL, 16);
191             if(!alarm_id) {
192                 error("Health: line %zu of file '%s' states alarm entry for invalid alarm id %u (%s). Ignoring it.", line, filename, alarm_id, pointers[3]);
193                 errored++;
194                 continue;
195             }
196
197             if(unlikely(*pointers[0] == 'A')) {
198                 // make sure it is properly numbered
199                 if(unlikely(host->health_log.alarms && unique_id < host->health_log.alarms->unique_id)) {
200                     error("Health: line %zu of file '%s' has alarm log entry with %u in wrong order. Ignoring it.", line, filename, unique_id);
201                     errored++;
202                     continue;
203                 }
204
205                 ae = callocz(1, sizeof(ALARM_ENTRY));
206             }
207             else if(unlikely(*pointers[0] == 'U')) {
208                 // find the original
209                 for(ae = host->health_log.alarms; ae; ae = ae->next) {
210                     if(unlikely(unique_id == ae->unique_id)) {
211                         if(unlikely(*pointers[0] == 'A')) {
212                             error("Health: line %zu of file '%s' adds duplicate alarm log entry with unique id %u. Using the later."
213                                   , line, filename, unique_id);
214                             *pointers[0] = 'U';
215                             duplicate++;
216                         }
217                         break;
218                     }
219                     else if(unlikely(unique_id > ae->unique_id)) {
220                         // no need to continue
221                         // the linked list is sorted
222                         ae = NULL;
223                         break;
224                     }
225                 }
226
227                 // if not found, skip this line
228                 if(!ae) {
229                     // error("Health: line %zu of file '%s' updates alarm log entry with unique id %u, but it is not found.", line, filename, unique_id);
230                     continue;
231                 }
232             }
233
234             // check for a possible host missmatch
235             //if(strcmp(pointers[1], host->hostname))
236             //    error("Health: line %zu of file '%s' provides an alarm for host '%s' but this is named '%s'.", line, filename, pointers[1], host->hostname);
237
238             ae->unique_id               = unique_id;
239             ae->alarm_id                = alarm_id;
240             ae->alarm_event_id          = (uint32_t)strtoul(pointers[4], NULL, 16);
241             ae->updated_by_id           = (uint32_t)strtoul(pointers[5], NULL, 16);
242             ae->updates_id              = (uint32_t)strtoul(pointers[6], NULL, 16);
243
244             ae->when                    = (uint32_t)strtoul(pointers[7], NULL, 16);
245             ae->duration                = (uint32_t)strtoul(pointers[8], NULL, 16);
246             ae->non_clear_duration      = (uint32_t)strtoul(pointers[9], NULL, 16);
247
248             ae->flags                   = (uint32_t)strtoul(pointers[10], NULL, 16);
249             ae->flags |= HEALTH_ENTRY_FLAG_SAVED;
250
251             ae->exec_run_timestamp      = (uint32_t)strtoul(pointers[11], NULL, 16);
252             ae->delay_up_to_timestamp   = (uint32_t)strtoul(pointers[12], NULL, 16);
253
254             if(unlikely(ae->name)) freez(ae->name);
255             ae->name = strdupz(pointers[13]);
256             ae->hash_name = simple_hash(ae->name);
257
258             if(unlikely(ae->chart)) freez(ae->chart);
259             ae->chart = strdupz(pointers[14]);
260             ae->hash_chart = simple_hash(ae->chart);
261
262             if(unlikely(ae->family)) freez(ae->family);
263             ae->family = strdupz(pointers[15]);
264
265             if(unlikely(ae->exec)) freez(ae->exec);
266             ae->exec = strdupz(pointers[16]);
267             if(!*ae->exec) { freez(ae->exec); ae->exec = NULL; }
268
269             if(unlikely(ae->recipient)) freez(ae->recipient);
270             ae->recipient = strdupz(pointers[17]);
271             if(!*ae->recipient) { freez(ae->recipient); ae->recipient = NULL; }
272
273             if(unlikely(ae->source)) freez(ae->source);
274             ae->source = strdupz(pointers[18]);
275             if(!*ae->source) { freez(ae->source); ae->source = NULL; }
276
277             if(unlikely(ae->units)) freez(ae->units);
278             ae->units = strdupz(pointers[19]);
279             if(!*ae->units) { freez(ae->units); ae->units = NULL; }
280
281             if(unlikely(ae->info)) freez(ae->info);
282             ae->info = strdupz(pointers[20]);
283             if(!*ae->info) { freez(ae->info); ae->info = NULL; }
284
285             ae->exec_code   = str2i(pointers[21]);
286             ae->new_status  = str2i(pointers[22]);
287             ae->old_status  = str2i(pointers[23]);
288             ae->delay       = str2i(pointers[24]);
289
290             ae->new_value   = str2l(pointers[25]);
291             ae->old_value   = str2l(pointers[26]);
292
293             static char value_string[100 + 1];
294             ae->old_value_string = strdupz(format_value_and_unit(value_string, 100, ae->old_value, ae->units, -1));
295             ae->new_value_string = strdupz(format_value_and_unit(value_string, 100, ae->new_value, ae->units, -1));
296
297             // add it to host if not already there
298             if(unlikely(*pointers[0] == 'A')) {
299                 ae->next = host->health_log.alarms;
300                 host->health_log.alarms = ae;
301                 loaded++;
302             }
303             else updated++;
304
305             if(unlikely(ae->unique_id > max_unique_id))
306                 max_unique_id = ae->unique_id;
307
308             if(unlikely(ae->alarm_id >= max_alarm_id))
309                 max_alarm_id = ae->alarm_id;
310         }
311         else {
312             error("Health: line %zu of file '%s' is invalid (unrecognized entry type '%s').", line, filename, pointers[0]);
313             errored++;
314         }
315     }
316
317     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
318
319     freez(buf);
320
321     if(!max_unique_id) max_unique_id = (uint32_t)now_realtime_sec();
322     if(!max_alarm_id)  max_alarm_id  = (uint32_t)now_realtime_sec();
323
324     host->health_log.next_log_id = max_unique_id + 1;
325     host->health_log.next_alarm_id = max_alarm_id + 1;
326
327     debug(D_HEALTH, "Health: loaded file '%s' with %zd new alarm entries, updated %zd alarms, errors %zd entries, duplicate %zd", filename, loaded, updated, errored, duplicate);
328     return loaded;
329 }
330
331 static inline void health_alarm_log_load(RRDHOST *host) {
332     health_alarm_log_close();
333
334     char filename[FILENAME_MAX + 1];
335     snprintfz(filename, FILENAME_MAX, "%s.old", health.log_filename);
336     FILE *fp = fopen(filename, "r");
337     if(!fp)
338         error("Health: cannot open health file: %s", filename);
339     else {
340         health_alarm_log_read(host, fp, filename);
341         fclose(fp);
342     }
343
344     health.log_entries_written = 0;
345     fp = fopen(health.log_filename, "r");
346     if(!fp)
347         error("Health: cannot open health file: %s", health.log_filename);
348     else {
349         health_alarm_log_read(host, fp, health.log_filename);
350         fclose(fp);
351     }
352
353     health_alarm_log_open();
354 }
355
356
357 // ----------------------------------------------------------------------------
358 // health alarm log management
359
360 static inline void health_alarm_log(RRDHOST *host,
361                 uint32_t alarm_id, uint32_t alarm_event_id,
362                 time_t when,
363                 const char *name, const char *chart, const char *family,
364                 const char *exec, const char *recipient, time_t duration,
365                 calculated_number old_value, calculated_number new_value,
366                 int old_status, int new_status,
367                 const char *source,
368                 const char *units,
369                 const char *info,
370                 int delay
371 ) {
372     debug(D_HEALTH, "Health adding alarm log entry with id: %u", host->health_log.next_log_id);
373
374     ALARM_ENTRY *ae = callocz(1, sizeof(ALARM_ENTRY));
375     ae->name = strdupz(name);
376     ae->hash_name = simple_hash(ae->name);
377
378     if(chart) {
379         ae->chart = strdupz(chart);
380         ae->hash_chart = simple_hash(ae->chart);
381     }
382
383     if(family)
384         ae->family = strdupz(family);
385
386     if(exec) ae->exec = strdupz(exec);
387     if(recipient) ae->recipient = strdupz(recipient);
388     if(source) ae->source = strdupz(source);
389     if(units) ae->units = strdupz(units);
390     if(info) ae->info = strdupz(info);
391
392     ae->unique_id = host->health_log.next_log_id++;
393     ae->alarm_id = alarm_id;
394     ae->alarm_event_id = alarm_event_id;
395     ae->when = when;
396     ae->old_value = old_value;
397     ae->new_value = new_value;
398
399     static char value_string[100 + 1];
400     ae->old_value_string = strdupz(format_value_and_unit(value_string, 100, ae->old_value, ae->units, -1));
401     ae->new_value_string = strdupz(format_value_and_unit(value_string, 100, ae->new_value, ae->units, -1));
402
403     ae->old_status = old_status;
404     ae->new_status = new_status;
405     ae->duration = duration;
406     ae->delay = delay;
407     ae->delay_up_to_timestamp = when + delay;
408
409     if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL)
410         ae->non_clear_duration += ae->duration;
411
412     // link it
413     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
414     ae->next = host->health_log.alarms;
415     host->health_log.alarms = ae;
416     host->health_log.count++;
417     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
418
419     // match previous alarms
420     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
421     ALARM_ENTRY *t;
422     for(t = host->health_log.alarms ; t ; t = t->next) {
423         if(t != ae && t->alarm_id == ae->alarm_id) {
424             if(!(t->flags & HEALTH_ENTRY_FLAG_UPDATED) && !t->updated_by_id) {
425                 t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
426                 t->updated_by_id = ae->unique_id;
427                 ae->updates_id = t->unique_id;
428
429                 if((t->new_status == RRDCALC_STATUS_WARNING || t->new_status == RRDCALC_STATUS_CRITICAL) &&
430                    (t->old_status == RRDCALC_STATUS_WARNING || t->old_status == RRDCALC_STATUS_CRITICAL))
431                     ae->non_clear_duration += t->non_clear_duration;
432
433                 health_alarm_log_save(host, t);
434             }
435
436             // no need to continue
437             break;
438         }
439     }
440     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
441
442     health_alarm_log_save(host, ae);
443 }
444
445 // ----------------------------------------------------------------------------
446 // RRDVAR management
447
// Sanitize a variable name in place.
// Every character that is not alphanumeric, '.' or '_' is replaced with '_'.
// Returns the number of characters replaced.
static inline int rrdvar_fix_name(char *variable) {
    int fixed = 0;

    for(; *variable; variable++) {
        // <ctype.h> classifiers are only defined for values representable as
        // unsigned char (or EOF); a negative plain char must not be passed as is
        unsigned char c = (unsigned char)*variable;

        if(!isalnum(c) && c != '.' && c != '_') {
            *variable = '_';
            fixed++;
        }
    }

    return fixed;
}
461
462 int rrdvar_compare(void* a, void* b) {
463     if(((RRDVAR *)a)->hash < ((RRDVAR *)b)->hash) return -1;
464     else if(((RRDVAR *)a)->hash > ((RRDVAR *)b)->hash) return 1;
465     else return strcmp(((RRDVAR *)a)->name, ((RRDVAR *)b)->name);
466 }
467
468 static inline RRDVAR *rrdvar_index_add(avl_tree_lock *tree, RRDVAR *rv) {
469     RRDVAR *ret = (RRDVAR *)avl_insert_lock(tree, (avl *)(rv));
470     if(ret != rv)
471         debug(D_VARIABLES, "Request to insert RRDVAR '%s' into index failed. Already exists.", rv->name);
472
473     return ret;
474 }
475
476 static inline RRDVAR *rrdvar_index_del(avl_tree_lock *tree, RRDVAR *rv) {
477     RRDVAR *ret = (RRDVAR *)avl_remove_lock(tree, (avl *)(rv));
478     if(!ret)
479         error("Request to remove RRDVAR '%s' from index failed. Not Found.", rv->name);
480
481     return ret;
482 }
483
484 static inline RRDVAR *rrdvar_index_find(avl_tree_lock *tree, const char *name, uint32_t hash) {
485     RRDVAR tmp;
486     tmp.name = (char *)name;
487     tmp.hash = (hash)?hash:simple_hash(tmp.name);
488
489     return (RRDVAR *)avl_search_lock(tree, (avl *)&tmp);
490 }
491
492 static inline void rrdvar_free(RRDHOST *host, avl_tree_lock *tree, RRDVAR *rv) {
493     (void)host;
494
495     if(!rv) return;
496
497     if(tree) {
498         debug(D_VARIABLES, "Deleting variable '%s'", rv->name);
499         if(unlikely(!rrdvar_index_del(tree, rv)))
500             error("Attempted to delete variable '%s' from host '%s', but it is not found.", rv->name, host->hostname);
501     }
502
503     freez(rv->name);
504     freez(rv);
505 }
506
507 static inline RRDVAR *rrdvar_create_and_index(const char *scope, avl_tree_lock *tree, const char *name, int type, void *value) {
508     char *variable = strdupz(name);
509     rrdvar_fix_name(variable);
510     uint32_t hash = simple_hash(variable);
511
512     RRDVAR *rv = rrdvar_index_find(tree, variable, hash);
513     if(unlikely(!rv)) {
514         debug(D_VARIABLES, "Variable '%s' not found in scope '%s'. Creating a new one.", variable, scope);
515
516         rv = callocz(1, sizeof(RRDVAR));
517         rv->name = variable;
518         rv->hash = hash;
519         rv->type = type;
520         rv->value = value;
521
522         RRDVAR *ret = rrdvar_index_add(tree, rv);
523         if(unlikely(ret != rv)) {
524             debug(D_VARIABLES, "Variable '%s' in scope '%s' already exists", variable, scope);
525             rrdvar_free(NULL, NULL, rv);
526             rv = NULL;
527         }
528         else
529             debug(D_VARIABLES, "Variable '%s' created in scope '%s'", variable, scope);
530     }
531     else {
532         debug(D_VARIABLES, "Variable '%s' is already found in scope '%s'.", variable, scope);
533
534         // already exists
535         freez(variable);
536
537         // this is important
538         // it must return NULL - not the existing variable - or double-free will happen
539         rv = NULL;
540     }
541
542     return rv;
543 }
544
545 // ----------------------------------------------------------------------------
546 // CUSTOM VARIABLES
547
548 RRDVAR *rrdvar_custom_host_variable_create(RRDHOST *host, const char *name) {
549     calculated_number *v = callocz(1, sizeof(calculated_number));
550     *v = NAN;
551     RRDVAR *rv = rrdvar_create_and_index("host", &host->variables_root_index, name, RRDVAR_TYPE_CALCULATED_ALLOCATED, v);
552     if(unlikely(!rv)) {
553         free(v);
554         error("Requested variable '%s' already exists - possibly 2 plugins will be updating it at the same time", name);
555
556         char *variable = strdupz(name);
557         rrdvar_fix_name(variable);
558         uint32_t hash = simple_hash(variable);
559
560         rv = rrdvar_index_find(&host->variables_root_index, variable, hash);
561     }
562
563     return rv;
564 }
565
566 void rrdvar_custom_host_variable_destroy(RRDHOST *host, const char *name) {
567     char *variable = strdupz(name);
568     rrdvar_fix_name(variable);
569     uint32_t hash = simple_hash(variable);
570
571     RRDVAR *rv = rrdvar_index_find(&host->variables_root_index, variable, hash);
572     freez(variable);
573
574     if(!rv) {
575         error("Attempted to remove variable '%s' from host '%s', but it does not exist.", name, host->hostname);
576         return;
577     }
578
579     if(rv->type != RRDVAR_TYPE_CALCULATED_ALLOCATED) {
580         error("Attempted to remove variable '%s' from host '%s', but it does not a custom allocated variable.", name, host->hostname);
581         return;
582     }
583
584     if(!rrdvar_index_del(&host->variables_root_index, rv)) {
585         error("Attempted to remove variable '%s' from host '%s', but it cannot be found.", name, host->hostname);
586         return;
587     }
588
589     freez(rv->name);
590     freez(rv->value);
591     freez(rv);
592 }
593
594 void rrdvar_custom_host_variable_set(RRDVAR *rv, calculated_number value) {
595     if(rv->type != RRDVAR_TYPE_CALCULATED_ALLOCATED)
596         error("requested to set variable '%s' to value " CALCULATED_NUMBER_FORMAT " but the variable is not a custom one.", rv->name, value);
597     else {
598         calculated_number *v = rv->value;
599         *v = value;
600     }
601 }
602
603 // ----------------------------------------------------------------------------
604 // RRDVAR lookup
605
606 static calculated_number rrdvar2number(RRDVAR *rv) {
607     switch(rv->type) {
608         case RRDVAR_TYPE_CALCULATED_ALLOCATED:
609         case RRDVAR_TYPE_CALCULATED: {
610             calculated_number *n = (calculated_number *)rv->value;
611             return *n;
612         }
613
614         case RRDVAR_TYPE_TIME_T: {
615             time_t *n = (time_t *)rv->value;
616             return *n;
617         }
618
619         case RRDVAR_TYPE_COLLECTED: {
620             collected_number *n = (collected_number *)rv->value;
621             return *n;
622         }
623
624         case RRDVAR_TYPE_TOTAL: {
625             total_number *n = (total_number *)rv->value;
626             return *n;
627         }
628
629         case RRDVAR_TYPE_INT: {
630             int *n = (int *)rv->value;
631             return *n;
632         }
633
634         default:
635             error("I don't know how to convert RRDVAR type %d to calculated_number", rv->type);
636             return NAN;
637     }
638 }
639
640 int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, calculated_number *result) {
641     RRDSET *st = rc->rrdset;
642     RRDVAR *rv;
643
644     if(!st) return 0;
645
646     rv = rrdvar_index_find(&st->variables_root_index, variable, hash);
647     if(rv) {
648         *result = rrdvar2number(rv);
649         return 1;
650     }
651
652     rv = rrdvar_index_find(&st->rrdfamily->variables_root_index, variable, hash);
653     if(rv) {
654         *result = rrdvar2number(rv);
655         return 1;
656     }
657
658     rv = rrdvar_index_find(&st->rrdhost->variables_root_index, variable, hash);
659     if(rv) {
660         *result = rrdvar2number(rv);
661         return 1;
662     }
663
664     return 0;
665 }
666
667 // ----------------------------------------------------------------------------
668 // RRDVAR to JSON
669
670 struct variable2json_helper {
671     BUFFER *buf;
672     size_t counter;
673 };
674
675 static int single_variable2json(void *entry, void *data) {
676     struct variable2json_helper *helper = (struct variable2json_helper *)data;
677     RRDVAR *rv = (RRDVAR *)entry;
678     calculated_number value = rrdvar2number(rv);
679
680     if(unlikely(isnan(value) || isinf(value)))
681         buffer_sprintf(helper->buf, "%s\n\t\t\"%s\": null", helper->counter?",":"", rv->name);
682     else
683         buffer_sprintf(helper->buf, "%s\n\t\t\"%s\": %0.5Lf", helper->counter?",":"", rv->name, (long double)value);
684
685     helper->counter++;
686
687     return 0;
688 }
689
690 void health_api_v1_chart_variables2json(RRDSET *st, BUFFER *buf) {
691     struct variable2json_helper helper = {
692             .buf = buf,
693             .counter = 0
694     };
695
696     buffer_sprintf(buf, "{\n\t\"chart\": \"%s\",\n\t\"chart_name\": \"%s\",\n\t\"chart_context\": \"%s\",\n\t\"chart_variables\": {", st->id, st->name, st->context);
697     avl_traverse_lock(&st->variables_root_index, single_variable2json, (void *)&helper);
698     buffer_sprintf(buf, "\n\t},\n\t\"family\": \"%s\",\n\t\"family_variables\": {", st->family);
699     helper.counter = 0;
700     avl_traverse_lock(&st->rrdfamily->variables_root_index, single_variable2json, (void *)&helper);
701     buffer_sprintf(buf, "\n\t},\n\t\"host\": \"%s\",\n\t\"host_variables\": {", st->rrdhost->hostname);
702     helper.counter = 0;
703     avl_traverse_lock(&st->rrdhost->variables_root_index, single_variable2json, (void *)&helper);
704     buffer_strcat(buf, "\n\t}\n}\n");
705 }
706
707
708 // ----------------------------------------------------------------------------
709 // RRDDIMVAR management
710 // DIMENSION VARIABLES
711
712 #define RRDDIMVAR_ID_MAX 1024
713
714 static inline void rrddimvar_free_variables(RRDDIMVAR *rs) {
715     RRDDIM *rd = rs->rrddim;
716     RRDSET *st = rd->rrdset;
717
718     // CHART VARIABLES FOR THIS DIMENSION
719
720     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->var_local_id);
721     rs->var_local_id = NULL;
722
723     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->var_local_name);
724     rs->var_local_name = NULL;
725
726     // FAMILY VARIABLES FOR THIS DIMENSION
727
728     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_id);
729     rs->var_family_id = NULL;
730
731     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_name);
732     rs->var_family_name = NULL;
733
734     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_contextid);
735     rs->var_family_contextid = NULL;
736
737     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_contextname);
738     rs->var_family_contextname = NULL;
739
740     // HOST VARIABLES FOR THIS DIMENSION
741
742     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartidid);
743     rs->var_host_chartidid = NULL;
744
745     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartidname);
746     rs->var_host_chartidname = NULL;
747
748     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartnameid);
749     rs->var_host_chartnameid = NULL;
750
751     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartnamename);
752     rs->var_host_chartnamename = NULL;
753
754     // KEYS
755
756     freez(rs->key_id);
757     rs->key_id = NULL;
758
759     freez(rs->key_name);
760     rs->key_name = NULL;
761
762     freez(rs->key_fullidid);
763     rs->key_fullidid = NULL;
764
765     freez(rs->key_fullidname);
766     rs->key_fullidname = NULL;
767
768     freez(rs->key_contextid);
769     rs->key_contextid = NULL;
770
771     freez(rs->key_contextname);
772     rs->key_contextname = NULL;
773
774     freez(rs->key_fullnameid);
775     rs->key_fullnameid = NULL;
776
777     freez(rs->key_fullnamename);
778     rs->key_fullnamename = NULL;
779 }
780
781 static inline void rrddimvar_create_variables(RRDDIMVAR *rs) {
782     rrddimvar_free_variables(rs);
783
784     RRDDIM *rd = rs->rrddim;
785     RRDSET *st = rd->rrdset;
786
787     char buffer[RRDDIMVAR_ID_MAX + 1];
788
789     // KEYS
790
791     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->id, rs->suffix);
792     rs->key_id = strdupz(buffer);
793
794     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
795     rs->key_name = strdupz(buffer);
796
797     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->key_id);
798     rs->key_fullidid = strdupz(buffer);
799
800     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->key_name);
801     rs->key_fullidname = strdupz(buffer);
802
803     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->context, rs->key_id);
804     rs->key_contextid = strdupz(buffer);
805
806     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->context, rs->key_name);
807     rs->key_contextname = strdupz(buffer);
808
809     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->key_id);
810     rs->key_fullnameid = strdupz(buffer);
811
812     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->key_name);
813     rs->key_fullnamename = strdupz(buffer);
814
815     // CHART VARIABLES FOR THIS DIMENSION
816     // -----------------------------------
817     //
818     // dimensions are available as:
819     // - $id
820     // - $name
821
822     rs->var_local_id           = rrdvar_create_and_index("local", &st->variables_root_index, rs->key_id, rs->type, rs->value);
823     rs->var_local_name         = rrdvar_create_and_index("local", &st->variables_root_index, rs->key_name, rs->type, rs->value);
824
825     // FAMILY VARIABLES FOR THIS DIMENSION
826     // -----------------------------------
827     //
828     // dimensions are available as:
829     // - $id                 (only the first, when multiple overlap)
830     // - $name               (only the first, when multiple overlap)
831     // - $chart-context.id
832     // - $chart-context.name
833
834     rs->var_family_id          = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_id, rs->type, rs->value);
835     rs->var_family_name        = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_name, rs->type, rs->value);
836     rs->var_family_contextid   = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_contextid, rs->type, rs->value);
837     rs->var_family_contextname = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_contextname, rs->type, rs->value);
838
839     // HOST VARIABLES FOR THIS DIMENSION
840     // -----------------------------------
841     //
842     // dimensions are available as:
843     // - $chart-id.id
844     // - $chart-id.name
845     // - $chart-name.id
846     // - $chart-name.name
847
848     rs->var_host_chartidid      = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullidid, rs->type, rs->value);
849     rs->var_host_chartidname    = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullidname, rs->type, rs->value);
850     rs->var_host_chartnameid    = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullnameid, rs->type, rs->value);
851     rs->var_host_chartnamename  = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullnamename, rs->type, rs->value);
852 }
853
854 RRDDIMVAR *rrddimvar_create(RRDDIM *rd, int type, const char *prefix, const char *suffix, void *value, uint32_t options) {
855     RRDSET *st = rd->rrdset;
856
857     debug(D_VARIABLES, "RRDDIMSET create for chart id '%s' name '%s', dimension id '%s', name '%s%s%s'", st->id, st->name, rd->id, (prefix)?prefix:"", rd->name, (suffix)?suffix:"");
858
859     if(!prefix) prefix = "";
860     if(!suffix) suffix = "";
861
862     RRDDIMVAR *rs = (RRDDIMVAR *)callocz(1, sizeof(RRDDIMVAR));
863
864     rs->prefix = strdupz(prefix);
865     rs->suffix = strdupz(suffix);
866
867     rs->type = type;
868     rs->value = value;
869     rs->options = options;
870     rs->rrddim = rd;
871
872     rs->next = rd->variables;
873     rd->variables = rs;
874
875     rrddimvar_create_variables(rs);
876
877     return rs;
878 }
879
880 void rrddimvar_rename_all(RRDDIM *rd) {
881     RRDSET *st = rd->rrdset;
882     debug(D_VARIABLES, "RRDDIMSET rename for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
883
884     RRDDIMVAR *rs, *next = rd->variables;
885     while((rs = next)) {
886         next = rs->next;
887         rrddimvar_create_variables(rs);
888     }
889 }
890
891 void rrddimvar_free(RRDDIMVAR *rs) {
892     RRDDIM *rd = rs->rrddim;
893     RRDSET *st = rd->rrdset;
894     debug(D_VARIABLES, "RRDDIMSET free for chart id '%s' name '%s', dimension id '%s', name '%s', prefix='%s', suffix='%s'", st->id, st->name, rd->id, rd->name, rs->prefix, rs->suffix);
895
896     rrddimvar_free_variables(rs);
897
898     if(rd->variables == rs) {
899         debug(D_VARIABLES, "RRDDIMSET removing first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
900         rd->variables = rs->next;
901     }
902     else {
903         debug(D_VARIABLES, "RRDDIMSET removing non-first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
904         RRDDIMVAR *t;
905         for (t = rd->variables; t && t->next != rs; t = t->next) ;
906         if(!t) error("RRDDIMVAR '%s' not found in dimension '%s/%s' variables linked list", rs->key_name, st->id, rd->id);
907         else t->next = rs->next;
908     }
909
910     freez(rs->prefix);
911     freez(rs->suffix);
912     freez(rs);
913 }
914
915 // ----------------------------------------------------------------------------
916 // RRDSETVAR management
917 // CHART VARIABLES
918
919 static inline void rrdsetvar_free_variables(RRDSETVAR *rs) {
920     RRDSET *st = rs->rrdset;
921
922     // CHART
923
924     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->var_local);
925     rs->var_local = NULL;
926
927     // FAMILY
928
929     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family);
930     rs->var_family = NULL;
931
932     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host);
933     rs->var_host = NULL;
934
935     // HOST
936
937     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_name);
938     rs->var_family_name = NULL;
939
940     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_name);
941     rs->var_host_name = NULL;
942
943     // KEYS
944
945     freez(rs->key_fullid);
946     rs->key_fullid = NULL;
947
948     freez(rs->key_fullname);
949     rs->key_fullname = NULL;
950 }
951
952 static inline void rrdsetvar_create_variables(RRDSETVAR *rs) {
953     rrdsetvar_free_variables(rs);
954
955     RRDSET *st = rs->rrdset;
956
957     // KEYS
958
959     char buffer[RRDVAR_MAX_LENGTH + 1];
960     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->id, rs->variable);
961     rs->key_fullid = strdupz(buffer);
962
963     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rs->variable);
964     rs->key_fullname = strdupz(buffer);
965
966     // CHART
967
968     rs->var_local       = rrdvar_create_and_index("local",  &st->variables_root_index,               rs->variable, rs->type, rs->value);
969
970     // FAMILY
971
972     rs->var_family      = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index,    rs->key_fullid,   rs->type, rs->value);
973     rs->var_family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index,    rs->key_fullname, rs->type, rs->value);
974
975     // HOST
976
977     rs->var_host        = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index,      rs->key_fullid,   rs->type, rs->value);
978     rs->var_host_name   = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index,      rs->key_fullname, rs->type, rs->value);
979
980 }
981
982 RRDSETVAR *rrdsetvar_create(RRDSET *st, const char *variable, int type, void *value, uint32_t options) {
983     debug(D_VARIABLES, "RRDVARSET create for chart id '%s' name '%s' with variable name '%s'", st->id, st->name, variable);
984     RRDSETVAR *rs = (RRDSETVAR *)callocz(1, sizeof(RRDSETVAR));
985
986     rs->variable = strdupz(variable);
987     rs->type = type;
988     rs->value = value;
989     rs->options = options;
990     rs->rrdset = st;
991
992     rs->next = st->variables;
993     st->variables = rs;
994
995     rrdsetvar_create_variables(rs);
996
997     return rs;
998 }
999
1000 void rrdsetvar_rename_all(RRDSET *st) {
1001     debug(D_VARIABLES, "RRDSETVAR rename for chart id '%s' name '%s'", st->id, st->name);
1002
1003     RRDSETVAR *rs, *next = st->variables;
1004     while((rs = next)) {
1005         next = rs->next;
1006         rrdsetvar_create_variables(rs);
1007     }
1008
1009     rrdsetcalc_link_matching(st);
1010 }
1011
1012 void rrdsetvar_free(RRDSETVAR *rs) {
1013     RRDSET *st = rs->rrdset;
1014     debug(D_VARIABLES, "RRDSETVAR free for chart id '%s' name '%s', variable '%s'", st->id, st->name, rs->variable);
1015
1016     if(st->variables == rs) {
1017         st->variables = rs->next;
1018     }
1019     else {
1020         RRDSETVAR *t;
1021         for (t = st->variables; t && t->next != rs; t = t->next);
1022         if(!t) error("RRDSETVAR '%s' not found in chart '%s' variables linked list", rs->key_fullname, st->id);
1023         else t->next = rs->next;
1024     }
1025
1026     rrdsetvar_free_variables(rs);
1027
1028     freez(rs->variable);
1029     freez(rs);
1030 }
1031
1032 // ----------------------------------------------------------------------------
1033 // RRDCALC management
1034
1035 inline const char *rrdcalc_status2string(int status) {
1036     switch(status) {
1037         case RRDCALC_STATUS_REMOVED:
1038             return "REMOVED";
1039
1040         case RRDCALC_STATUS_UNDEFINED:
1041             return "UNDEFINED";
1042
1043         case RRDCALC_STATUS_UNINITIALIZED:
1044             return "UNINITIALIZED";
1045
1046         case RRDCALC_STATUS_CLEAR:
1047             return "CLEAR";
1048
1049         case RRDCALC_STATUS_RAISED:
1050             return "RAISED";
1051
1052         case RRDCALC_STATUS_WARNING:
1053             return "WARNING";
1054
1055         case RRDCALC_STATUS_CRITICAL:
1056             return "CRITICAL";
1057
1058         default:
1059             error("Unknown alarm status %d", status);
1060             return "UNKNOWN";
1061     }
1062 }
1063
1064 static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
1065     debug(D_HEALTH, "Health linking alarm '%s.%s' to chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, st->rrdhost->hostname);
1066
1067     rc->last_status_change = now_realtime_sec();
1068     rc->rrdset = st;
1069
1070     rc->rrdset_next = st->alarms;
1071     rc->rrdset_prev = NULL;
1072     
1073     if(rc->rrdset_next)
1074         rc->rrdset_next->rrdset_prev = rc;
1075
1076     st->alarms = rc;
1077
1078     if(rc->update_every < rc->rrdset->update_every) {
1079         error("Health alarm '%s.%s' has update every %d, less than chart update every %d. Setting alarm update frequency to %d.", rc->rrdset->id, rc->name, rc->update_every, rc->rrdset->update_every, rc->rrdset->update_every);
1080         rc->update_every = rc->rrdset->update_every;
1081     }
1082
1083     if(!isnan(rc->green) && isnan(st->green)) {
1084         debug(D_HEALTH, "Health alarm '%s.%s' green threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->green, rc->green);
1085         st->green = rc->green;
1086     }
1087
1088     if(!isnan(rc->red) && isnan(st->red)) {
1089         debug(D_HEALTH, "Health alarm '%s.%s' red threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->red, rc->red);
1090         st->red = rc->red;
1091     }
1092
1093     rc->local  = rrdvar_create_and_index("local",  &st->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
1094     rc->family = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
1095
1096     char fullname[RRDVAR_MAX_LENGTH + 1];
1097     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->id, rc->name);
1098     rc->hostid   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
1099
1100     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rc->name);
1101     rc->hostname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
1102
1103         if(!rc->units) rc->units = strdupz(st->units);
1104
1105     {
1106         time_t now = now_realtime_sec();
1107         health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_UNINITIALIZED, rc->source, rc->units, rc->info, 0);
1108     }
1109 }
1110
1111 static inline int rrdcalc_is_matching_this_rrdset(RRDCALC *rc, RRDSET *st) {
1112     if(     (rc->hash_chart == st->hash      && !strcmp(rc->chart, st->id)) ||
1113             (rc->hash_chart == st->hash_name && !strcmp(rc->chart, st->name)))
1114         return 1;
1115
1116     return 0;
1117 }
1118
1119 // this has to be called while the RRDHOST is locked
1120 inline void rrdsetcalc_link_matching(RRDSET *st) {
1121     // debug(D_HEALTH, "find matching alarms for chart '%s'", st->id);
1122
1123     RRDCALC *rc;
1124     for(rc = st->rrdhost->alarms; rc ; rc = rc->next) {
1125         if(unlikely(rc->rrdset))
1126             continue;
1127
1128         if(unlikely(rrdcalc_is_matching_this_rrdset(rc, st)))
1129             rrdsetcalc_link(st, rc);
1130     }
1131 }
1132
1133 // this has to be called while the RRDHOST is locked
1134 inline void rrdsetcalc_unlink(RRDCALC *rc) {
1135     RRDSET *st = rc->rrdset;
1136
1137     if(!st) {
1138         debug(D_HEALTH, "Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
1139         error("Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
1140         return;
1141     }
1142
1143     {
1144         time_t now = now_realtime_sec();
1145         health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0);
1146     }
1147
1148     RRDHOST *host = st->rrdhost;
1149
1150     debug(D_HEALTH, "Health unlinking alarm '%s.%s' from chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, host->hostname);
1151
1152     // unlink it
1153     if(rc->rrdset_prev)
1154         rc->rrdset_prev->rrdset_next = rc->rrdset_next;
1155
1156     if(rc->rrdset_next)
1157         rc->rrdset_next->rrdset_prev = rc->rrdset_prev;
1158
1159     if(st->alarms == rc)
1160         st->alarms = rc->rrdset_next;
1161
1162     rc->rrdset_prev = rc->rrdset_next = NULL;
1163
1164     rrdvar_free(st->rrdhost, &st->variables_root_index, rc->local);
1165     rc->local = NULL;
1166
1167     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rc->family);
1168     rc->family = NULL;
1169
1170     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostid);
1171     rc->hostid = NULL;
1172
1173     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostname);
1174     rc->hostname = NULL;
1175
1176     rc->rrdset = NULL;
1177
1178     // RRDCALC will remain in RRDHOST
1179     // so that if the matching chart is found in the future
1180     // it will be applied automatically
1181 }
1182
1183 RRDCALC *rrdcalc_find(RRDSET *st, const char *name) {
1184     RRDCALC *rc;
1185     uint32_t hash = simple_hash(name);
1186
1187     for( rc = st->alarms; rc ; rc = rc->rrdset_next ) {
1188         if(unlikely(rc->hash == hash && !strcmp(rc->name, name)))
1189             return rc;
1190     }
1191
1192     return NULL;
1193 }
1194
1195 static inline int rrdcalc_exists(RRDHOST *host, const char *chart, const char *name, uint32_t hash_chart, uint32_t hash_name) {
1196     RRDCALC *rc;
1197
1198     if(unlikely(!chart)) {
1199         error("attempt to find RRDCALC '%s' without giving a chart name", name);
1200         return 1;
1201     }
1202
1203     if(unlikely(!hash_chart)) hash_chart = simple_hash(chart);
1204     if(unlikely(!hash_name))  hash_name  = simple_hash(name);
1205
1206     // make sure it does not already exist
1207     for(rc = host->alarms; rc ; rc = rc->next) {
1208         if (unlikely(rc->chart && rc->hash == hash_name && rc->hash_chart == hash_chart && !strcmp(name, rc->name) && !strcmp(chart, rc->chart))) {
1209             debug(D_HEALTH, "Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
1210             error("Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
1211             return 1;
1212         }
1213     }
1214
1215     return 0;
1216 }
1217
1218 static inline uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const char *name, uint32_t *next_event_id) {
1219     if(chart && name) {
1220         uint32_t hash_chart = simple_hash(chart);
1221         uint32_t hash_name = simple_hash(name);
1222
1223         // re-use old IDs, by looking them up in the alarm log
1224         ALARM_ENTRY *ae;
1225         for(ae = host->health_log.alarms; ae ;ae = ae->next) {
1226             if(unlikely(ae->hash_name == hash_name && ae->hash_chart == hash_chart && !strcmp(name, ae->name) && !strcmp(chart, ae->chart))) {
1227                 if(next_event_id) *next_event_id = ae->alarm_event_id + 1;
1228                 return ae->alarm_id;
1229             }
1230         }
1231     }
1232
1233     return host->health_log.next_alarm_id++;
1234 }
1235
1236 static inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) {
1237     rrdhost_check_rdlock(host);
1238
1239     if(rc->calculation) {
1240         rc->calculation->status = &rc->status;
1241         rc->calculation->this = &rc->value;
1242         rc->calculation->after = &rc->db_after;
1243         rc->calculation->before = &rc->db_before;
1244         rc->calculation->rrdcalc = rc;
1245     }
1246
1247     if(rc->warning) {
1248         rc->warning->status = &rc->status;
1249         rc->warning->this = &rc->value;
1250         rc->warning->after = &rc->db_after;
1251         rc->warning->before = &rc->db_before;
1252         rc->warning->rrdcalc = rc;
1253     }
1254
1255     if(rc->critical) {
1256         rc->critical->status = &rc->status;
1257         rc->critical->this = &rc->value;
1258         rc->critical->after = &rc->db_after;
1259         rc->critical->before = &rc->db_before;
1260         rc->critical->rrdcalc = rc;
1261     }
1262
1263     // link it to the host
1264     if(likely(host->alarms)) {
1265         // append it
1266         RRDCALC *t;
1267         for(t = host->alarms; t && t->next ; t = t->next) ;
1268         t->next = rc;
1269     }
1270     else {
1271         host->alarms = rc;
1272     }
1273
1274     // link it to its chart
1275     RRDSET *st;
1276     for(st = host->rrdset_root; st ; st = st->next) {
1277         if(rrdcalc_is_matching_this_rrdset(rc, st)) {
1278             rrdsetcalc_link(st, rc);
1279             break;
1280         }
1281     }
1282 }
1283
1284 static inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart) {
1285
1286     debug(D_HEALTH, "Health creating dynamic alarm (from template) '%s.%s'", chart, rt->name);
1287
1288     if(rrdcalc_exists(host, chart, rt->name, 0, 0))
1289         return NULL;
1290
1291     RRDCALC *rc = callocz(1, sizeof(RRDCALC));
1292     rc->next_event_id = 1;
1293     rc->id = rrdcalc_get_unique_id(host, chart, rt->name, &rc->next_event_id);
1294     rc->name = strdupz(rt->name);
1295     rc->hash = simple_hash(rc->name);
1296     rc->chart = strdupz(chart);
1297     rc->hash_chart = simple_hash(rc->chart);
1298
1299     if(rt->dimensions) rc->dimensions = strdupz(rt->dimensions);
1300
1301     rc->green = rt->green;
1302     rc->red = rt->red;
1303     rc->value = NAN;
1304     rc->old_value = NAN;
1305
1306     rc->delay_up_duration = rt->delay_up_duration;
1307     rc->delay_down_duration = rt->delay_down_duration;
1308     rc->delay_max_duration = rt->delay_max_duration;
1309     rc->delay_multiplier = rt->delay_multiplier;
1310
1311     rc->group = rt->group;
1312     rc->after = rt->after;
1313     rc->before = rt->before;
1314     rc->update_every = rt->update_every;
1315     rc->options = rt->options;
1316
1317     if(rt->exec) rc->exec = strdupz(rt->exec);
1318     if(rt->recipient) rc->recipient = strdupz(rt->recipient);
1319     if(rt->source) rc->source = strdupz(rt->source);
1320     if(rt->units) rc->units = strdupz(rt->units);
1321     if(rt->info) rc->info = strdupz(rt->info);
1322
1323     if(rt->calculation) {
1324         rc->calculation = expression_parse(rt->calculation->source, NULL, NULL);
1325         if(!rc->calculation)
1326             error("Health alarm '%s.%s': failed to parse calculation expression '%s'", chart, rt->name, rt->calculation->source);
1327     }
1328     if(rt->warning) {
1329         rc->warning = expression_parse(rt->warning->source, NULL, NULL);
1330         if(!rc->warning)
1331             error("Health alarm '%s.%s': failed to re-parse warning expression '%s'", chart, rt->name, rt->warning->source);
1332     }
1333     if(rt->critical) {
1334         rc->critical = expression_parse(rt->critical->source, NULL, NULL);
1335         if(!rc->critical)
1336             error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", chart, rt->name, rt->critical->source);
1337     }
1338
1339     debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1340           (rc->chart)?rc->chart:"NOCHART",
1341           rc->name,
1342           (rc->exec)?rc->exec:"DEFAULT",
1343           (rc->recipient)?rc->recipient:"DEFAULT",
1344           rc->green,
1345           rc->red,
1346           rc->group,
1347           rc->after,
1348           rc->before,
1349           rc->options,
1350           (rc->dimensions)?rc->dimensions:"NONE",
1351           rc->update_every,
1352           (rc->calculation)?rc->calculation->parsed_as:"NONE",
1353           (rc->warning)?rc->warning->parsed_as:"NONE",
1354           (rc->critical)?rc->critical->parsed_as:"NONE",
1355           rc->source,
1356           rc->delay_up_duration,
1357           rc->delay_down_duration,
1358           rc->delay_max_duration,
1359           rc->delay_multiplier
1360     );
1361
1362     rrdcalc_create_part2(host, rc);
1363     return rc;
1364 }
1365
1366 void rrdcalc_free(RRDHOST *host, RRDCALC *rc) {
1367     if(!rc) return;
1368
1369     debug(D_HEALTH, "Health removing alarm '%s.%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1370
1371     // unlink it from RRDSET
1372     if(rc->rrdset) rrdsetcalc_unlink(rc);
1373
1374     // unlink it from RRDHOST
1375     if(unlikely(rc == host->alarms))
1376         host->alarms = rc->next;
1377
1378     else if(likely(host->alarms)) {
1379         RRDCALC *t, *last = host->alarms;
1380         for(t = last->next; t && t != rc; last = t, t = t->next) ;
1381         if(last->next == rc)
1382             last->next = rc->next;
1383         else
1384             error("Cannot unlink alarm '%s.%s' from host '%s': not found", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1385     }
1386     else
1387         error("Cannot unlink unlink '%s.%s' from host '%s': This host does not have any calculations", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1388
1389     expression_free(rc->calculation);
1390     expression_free(rc->warning);
1391     expression_free(rc->critical);
1392
1393     freez(rc->name);
1394     freez(rc->chart);
1395     freez(rc->family);
1396     freez(rc->dimensions);
1397     freez(rc->exec);
1398     freez(rc->recipient);
1399     freez(rc->source);
1400     freez(rc->units);
1401     freez(rc->info);
1402     freez(rc);
1403 }
1404
1405 // ----------------------------------------------------------------------------
1406 // RRDCALCTEMPLATE management
1407
1408 void rrdcalctemplate_link_matching(RRDSET *st) {
1409     RRDCALCTEMPLATE *rt;
1410
1411     for(rt = st->rrdhost->templates; rt ; rt = rt->next) {
1412         if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context)
1413                 && (!rt->family_pattern || simple_pattern_matches(rt->family_pattern, st->family))) {
1414             RRDCALC *rc = rrdcalc_create(st->rrdhost, rt, st->id);
1415             if(unlikely(!rc))
1416                 error("Health tried to create alarm from template '%s', but it failed", rt->name);
1417
1418 #ifdef NETDATA_INTERNAL_CHECKS
1419             else if(rc->rrdset != st)
1420                 error("Health alarm '%s.%s' should be linked to chart '%s', but it is not", rc->chart?rc->chart:"NOCHART", rc->name, st->id);
1421 #endif
1422         }
1423     }
1424 }
1425
1426 static inline void rrdcalctemplate_free(RRDHOST *host, RRDCALCTEMPLATE *rt) {
1427     debug(D_HEALTH, "Health removing template '%s' of host '%s'", rt->name, host->hostname);
1428
1429     if(host->templates) {
1430         if(host->templates == rt) {
1431             host->templates = rt->next;
1432         }
1433         else {
1434             RRDCALCTEMPLATE *t, *last = host->templates;
1435             for (t = last->next; t && t != rt; last = t, t = t->next ) ;
1436             if(last && last->next == rt) {
1437                 last->next = rt->next;
1438                 rt->next = NULL;
1439             }
1440             else
1441                 error("Cannot find RRDCALCTEMPLATE '%s' linked in host '%s'", rt->name, host->hostname);
1442         }
1443     }
1444
1445     expression_free(rt->calculation);
1446     expression_free(rt->warning);
1447     expression_free(rt->critical);
1448
1449     freez(rt->family_match);
1450     simple_pattern_free(rt->family_pattern);
1451
1452     freez(rt->name);
1453     freez(rt->exec);
1454     freez(rt->recipient);
1455     freez(rt->context);
1456     freez(rt->source);
1457     freez(rt->units);
1458     freez(rt->info);
1459     freez(rt->dimensions);
1460     freez(rt);
1461 }
1462
1463 // ----------------------------------------------------------------------------
1464 // load health configuration
1465
// NOTE(review): presumably the size limit of a single line of a health
// configuration file - confirm against the configuration reader
#define HEALTH_CONF_MAX_LINE 4096

// NOTE(review): the strings below are presumably the keywords accepted in
// health configuration files - confirm against the configuration reader

// the two kinds of definitions
#define HEALTH_ALARM_KEY "alarm"
#define HEALTH_TEMPLATE_KEY "template"

// the attributes of a definition
#define HEALTH_ON_KEY "on"
#define HEALTH_FAMILIES_KEY "families"
#define HEALTH_LOOKUP_KEY "lookup"
#define HEALTH_CALC_KEY "calc"
#define HEALTH_EVERY_KEY "every"
#define HEALTH_GREEN_KEY "green"
#define HEALTH_RED_KEY "red"
#define HEALTH_WARN_KEY "warn"
#define HEALTH_CRIT_KEY "crit"
#define HEALTH_EXEC_KEY "exec"
#define HEALTH_RECIPIENT_KEY "to"
#define HEALTH_UNITS_KEY "units"
#define HEALTH_INFO_KEY "info"
#define HEALTH_DELAY_KEY "delay"
1484
1485 static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
1486     if(!rc->chart) {
1487         error("Health configuration for alarm '%s' does not have a chart", rc->name);
1488         return 0;
1489     }
1490
1491     if(!rc->update_every) {
1492         error("Health configuration for alarm '%s.%s' has no frequency (parameter 'every'). Ignoring it.", rc->chart?rc->chart:"NOCHART", rc->name);
1493         return 0;
1494     }
1495
1496     if(!RRDCALC_HAS_DB_LOOKUP(rc) && !rc->warning && !rc->critical) {
1497         error("Health configuration for alarm '%s.%s' is useless (no calculation, no warning and no critical evaluation)", rc->chart?rc->chart:"NOCHART", rc->name);
1498         return 0;
1499     }
1500
1501     if (rrdcalc_exists(host, rc->chart, rc->name, rc->hash_chart, rc->hash))
1502         return 0;
1503
1504     rc->id = rrdcalc_get_unique_id(&localhost, rc->chart, rc->name, &rc->next_event_id);
1505
1506     debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1507           rc->chart?rc->chart:"NOCHART",
1508           rc->name,
1509           rc->id,
1510           (rc->exec)?rc->exec:"DEFAULT",
1511           (rc->recipient)?rc->recipient:"DEFAULT",
1512           rc->green,
1513           rc->red,
1514           rc->group,
1515           rc->after,
1516           rc->before,
1517           rc->options,
1518           (rc->dimensions)?rc->dimensions:"NONE",
1519           rc->update_every,
1520           (rc->calculation)?rc->calculation->parsed_as:"NONE",
1521           (rc->warning)?rc->warning->parsed_as:"NONE",
1522           (rc->critical)?rc->critical->parsed_as:"NONE",
1523           rc->source,
1524           rc->delay_up_duration,
1525           rc->delay_down_duration,
1526           rc->delay_max_duration,
1527           rc->delay_multiplier
1528     );
1529
1530     rrdcalc_create_part2(host, rc);
1531     return 1;
1532 }
1533
1534 static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCALCTEMPLATE *rt) {
1535     if(unlikely(!rt->context)) {
1536         error("Health configuration for template '%s' does not have a context", rt->name);
1537         return 0;
1538     }
1539
1540     if(unlikely(!rt->update_every)) {
1541         error("Health configuration for template '%s' has no frequency (parameter 'every'). Ignoring it.", rt->name);
1542         return 0;
1543     }
1544
1545     if(unlikely(!RRDCALCTEMPLATE_HAS_CALCULATION(rt) && !rt->warning && !rt->critical)) {
1546         error("Health configuration for template '%s' is useless (no calculation, no warning and no critical evaluation)", rt->name);
1547         return 0;
1548     }
1549
1550     RRDCALCTEMPLATE *t, *last = NULL;
1551     for (t = host->templates; t ; last = t, t = t->next) {
1552         if(unlikely(t->hash_name == rt->hash_name && !strcmp(t->name, rt->name))) {
1553             error("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname);
1554             return 0;
1555         }
1556     }
1557
1558     debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1559           rt->name,
1560           (rt->context)?rt->context:"NONE",
1561           (rt->exec)?rt->exec:"DEFAULT",
1562           (rt->recipient)?rt->recipient:"DEFAULT",
1563           rt->green,
1564           rt->red,
1565           rt->group,
1566           rt->after,
1567           rt->before,
1568           rt->options,
1569           (rt->dimensions)?rt->dimensions:"NONE",
1570           rt->update_every,
1571           (rt->calculation)?rt->calculation->parsed_as:"NONE",
1572           (rt->warning)?rt->warning->parsed_as:"NONE",
1573           (rt->critical)?rt->critical->parsed_as:"NONE",
1574           rt->source,
1575           rt->delay_up_duration,
1576           rt->delay_down_duration,
1577           rt->delay_max_duration,
1578           rt->delay_multiplier
1579     );
1580
1581     if(likely(last)) {
1582         last->next = rt;
1583     }
1584     else {
1585         rt->next = host->templates;
1586         host->templates = rt;
1587     }
1588
1589     return 1;
1590 }
1591
/*
 * Parse a duration: a number, optionally followed by a unit.
 *
 * Units (case sensitive, only the first character after the number is used):
 *   Y = 365 days, M = 30 days, w = 7 days, d = days, h = hours, m = minutes,
 *   s = seconds. No unit, or any other character, means seconds.
 *
 * string - the text to parse; it has to start with a digit, '+' or '-'
 * result - receives the duration in seconds, truncated to int;
 *          it is set to 0 when the text is rejected
 *
 * Returns 1 on success, 0 when the text is not a duration: it is empty,
 * does not start with a digit or a sign, has no number after the sign
 * (e.g. "+"), or the number is not finite (e.g. "+inf").
 */
static inline int health_parse_duration(char *string, int *result) {
    // <ctype.h> functions must not be given negative values,
    // which a plain char may hold
    unsigned char first = (unsigned char)*string;

    // make sure it is a number
    if(!first || !(isdigit(first) || first == '+' || first == '-')) {
        *result = 0;
        return 0;
    }

    char *e = NULL;
    long double n = strtold(string, &e); // long double is what strtold() returns

    // nothing has been converted (a sign without a number),
    // or the number cannot be converted to an integer
    if(e == string || isnan(n) || isinf(n)) {
        *result = 0;
        return 0;
    }

    if(e && *e) {
        switch (*e) {
            case 'Y':
                *result = (int) (n * 86400 * 365);
                break;
            case 'M':
                *result = (int) (n * 86400 * 30);
                break;
            case 'w':
                *result = (int) (n * 86400 * 7);
                break;
            case 'd':
                *result = (int) (n * 86400);
                break;
            case 'h':
                *result = (int) (n * 3600);
                break;
            case 'm':
                *result = (int) (n * 60);
                break;

            // an unknown unit is treated as seconds
            default:
            case 's':
                *result = (int) (n);
                break;
        }
    }
    else
       *result = (int)(n);

    return 1;
}
1633
/*
 * Parse the value of a 'delay' configuration line.
 *
 * The value is a whitespace separated list of "keyword value" pairs. The
 * keywords (matched case-insensitively) are:
 *
 *   up <duration>        stored in *delay_up_duration
 *   down <duration>      stored in *delay_down_duration
 *   max <duration>       stored in *delay_max_duration
 *   multiplier <number>  stored in *delay_multiplier, must be finite and > 0
 *
 * 'string' is modified in place: the whitespace between words is overwritten
 * with NUL bytes. line, path and file are only used in log messages.
 *
 * Invalid values and unknown keywords are logged and skipped. Anything not
 * given validly falls back to a default: 0 for up and down, 1.0 for the
 * multiplier. When max is not given validly, *delay_max_duration is raised
 * to at least up * multiplier and down * multiplier.
 *
 * Always returns 1.
 */
static inline int health_parse_delay(
        size_t line, const char *path, const char *file, char *string,
        int *delay_up_duration,
        int *delay_down_duration,
        int *delay_max_duration,
        float *delay_multiplier) {

    char given_up = 0;
    char given_down = 0;
    char given_max = 0;
    char given_multiplier = 0;

    char *s = string;
    while(*s) {
        // <ctype.h> classifiers take an unsigned char value (or EOF), so
        // every character is cast before it is classified

        // cut the keyword off the front of the string
        char *key = s;
        while(*s && !isspace((unsigned char)*s)) s++;
        while(*s && isspace((unsigned char)*s)) *s++ = '\0';

        if(!*key) break;

        // and the value that belongs to it
        char *value = s;
        while(*s && !isspace((unsigned char)*s)) s++;
        while(*s && isspace((unsigned char)*s)) *s++ = '\0';

        // the three duration keywords differ only in where the result goes
        int *duration = NULL;
        char *given = NULL;

        if(!strcasecmp(key, "up")) {
            duration = delay_up_duration;
            given = &given_up;
        }
        else if(!strcasecmp(key, "down")) {
            duration = delay_down_duration;
            given = &given_down;
        }
        else if(!strcasecmp(key, "max")) {
            duration = delay_max_duration;
            given = &given_max;
        }

        if(duration) {
            if (!health_parse_duration(value, duration)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else *given = 1;
        }
        else if(!strcasecmp(key, "multiplier")) {
            *delay_multiplier = strtof(value, NULL);
            if(isnan(*delay_multiplier) || isinf(*delay_multiplier) || islessequal(*delay_multiplier, 0)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_multiplier = 1;
        }
        else {
            error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
                  line, path, file, key);
        }
    }

    // whatever was missing or invalid gets its default
    if(!given_up)
        *delay_up_duration = 0;

    if(!given_down)
        *delay_down_duration = 0;

    if(!given_multiplier)
        *delay_multiplier = 1.0;

    // without an explicit max, make sure it covers one multiplied step
    if(!given_max) {
        if((*delay_max_duration) < (*delay_up_duration) * (*delay_multiplier))
            *delay_max_duration = (int)((*delay_up_duration) * (*delay_multiplier));

        if((*delay_max_duration) < (*delay_down_duration) * (*delay_multiplier))
            *delay_max_duration = (int)((*delay_down_duration) * (*delay_multiplier));
    }

    return 1;
}
1713
1714 static inline int health_parse_db_lookup(
1715         size_t line, const char *path, const char *file, char *string,
1716         int *group_method, int *after, int *before, int *every,
1717         uint32_t *options, char **dimensions
1718 ) {
1719     debug(D_HEALTH, "Health configuration parsing database lookup %zu@%s/%s: %s", line, path, file, string);
1720
1721     if(*dimensions) freez(*dimensions);
1722     *dimensions = NULL;
1723     *after = 0;
1724     *before = 0;
1725     *every = 0;
1726     *options = 0;
1727
1728     char *s = string, *key;
1729
1730     // first is the group method
1731     key = s;
1732     while(*s && !isspace(*s)) s++;
1733     while(*s && isspace(*s)) *s++ = '\0';
1734     if(!*s) {
1735         error("Health configuration invalid chart calculation at line %zu of file '%s/%s': expected group method followed by the 'after' time, but got '%s'",
1736               line, path, file, key);
1737         return 0;
1738     }
1739
1740     if((*group_method = web_client_api_request_v1_data_group(key, -1)) == -1) {
1741         error("Health configuration at line %zu of file '%s/%s': invalid group method '%s'",
1742               line, path, file, key);
1743         return 0;
1744     }
1745
1746     // then is the 'after' time
1747     key = s;
1748     while(*s && !isspace(*s)) s++;
1749     while(*s && isspace(*s)) *s++ = '\0';
1750
1751     if(!health_parse_duration(key, after)) {
1752         error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' after group method",
1753               line, path, file, key);
1754         return 0;
1755     }
1756
1757     // sane defaults
1758     *every = abs(*after);
1759
1760     // now we may have optional parameters
1761     while(*s) {
1762         key = s;
1763         while(*s && !isspace(*s)) s++;
1764         while(*s && isspace(*s)) *s++ = '\0';
1765         if(!*key) break;
1766
1767         if(!strcasecmp(key, "at")) {
1768             char *value = s;
1769             while(*s && !isspace(*s)) s++;
1770             while(*s && isspace(*s)) *s++ = '\0';
1771
1772             if (!health_parse_duration(value, before)) {
1773                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1774                       line, path, file, value, key);
1775             }
1776         }
1777         else if(!strcasecmp(key, HEALTH_EVERY_KEY)) {
1778             char *value = s;
1779             while(*s && !isspace(*s)) s++;
1780             while(*s && isspace(*s)) *s++ = '\0';
1781
1782             if (!health_parse_duration(value, every)) {
1783                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1784                       line, path, file, value, key);
1785             }
1786         }
1787         else if(!strcasecmp(key, "absolute") || !strcasecmp(key, "abs") || !strcasecmp(key, "absolute_sum")) {
1788             *options |= RRDR_OPTION_ABSOLUTE;
1789         }
1790         else if(!strcasecmp(key, "min2max")) {
1791             *options |= RRDR_OPTION_MIN2MAX;
1792         }
1793         else if(!strcasecmp(key, "null2zero")) {
1794             *options |= RRDR_OPTION_NULL2ZERO;
1795         }
1796         else if(!strcasecmp(key, "percentage")) {
1797             *options |= RRDR_OPTION_PERCENTAGE;
1798         }
1799         else if(!strcasecmp(key, "unaligned")) {
1800             *options |= RRDR_OPTION_NOT_ALIGNED;
1801         }
1802         else if(!strcasecmp(key, "of")) {
1803             if(*s && strcasecmp(s, "all"))
1804                *dimensions = strdupz(s);
1805             break;
1806         }
1807         else {
1808             error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
1809                   line, path, file, key);
1810         }
1811     }
1812
1813     return 1;
1814 }
1815
// Replace every TAB in s with a single space, in place.
// Returns s itself, so the call can be nested in an expression.
static inline char *tabs2spaces(char *s) {
    for(char *tab = strchr(s, '\t'); tab; tab = strchr(tab + 1, '\t'))
        *tab = ' ';

    return s;
}
1825
// Build the "<line>@<path>/<filename>" tag that records where a definition
// was read from. The text is formatted into a local buffer (limited to
// FILENAME_MAX) and the strdupz() copy of it is returned.
// NOTE(review): presumably the caller owns the returned string - confirm.
static inline char *health_source_file(size_t line, const char *path, const char *filename) {
    char source[FILENAME_MAX + 1];

    snprintfz(source, FILENAME_MAX, "%zu@%s/%s", line, path, filename);

    return strdupz(source);
}
1831
// Overwrite every single quote and double quote in s with a space, in place.
static inline void strip_quotes(char *s) {
    static const char quotes[] = "'\"";

    for(char *q = strpbrk(s, quotes); q; q = strpbrk(q + 1, quotes))
        *q = ' ';
}
1838
1839 int health_readfile(const char *path, const char *filename) {
1840     debug(D_HEALTH, "Health configuration reading file '%s/%s'", path, filename);
1841
1842     static uint32_t hash_alarm = 0, hash_template = 0, hash_on = 0, hash_families = 0, hash_calc = 0, hash_green = 0, hash_red = 0, hash_warn = 0, hash_crit = 0, hash_exec = 0, hash_every = 0, hash_lookup = 0, hash_units = 0, hash_info = 0, hash_recipient = 0, hash_delay = 0;
1843     char buffer[HEALTH_CONF_MAX_LINE + 1];
1844
1845     if(unlikely(!hash_alarm)) {
1846         hash_alarm = simple_uhash(HEALTH_ALARM_KEY);
1847         hash_template = simple_uhash(HEALTH_TEMPLATE_KEY);
1848         hash_on = simple_uhash(HEALTH_ON_KEY);
1849         hash_families = simple_uhash(HEALTH_FAMILIES_KEY);
1850         hash_calc = simple_uhash(HEALTH_CALC_KEY);
1851         hash_lookup = simple_uhash(HEALTH_LOOKUP_KEY);
1852         hash_green = simple_uhash(HEALTH_GREEN_KEY);
1853         hash_red = simple_uhash(HEALTH_RED_KEY);
1854         hash_warn = simple_uhash(HEALTH_WARN_KEY);
1855         hash_crit = simple_uhash(HEALTH_CRIT_KEY);
1856         hash_exec = simple_uhash(HEALTH_EXEC_KEY);
1857         hash_every = simple_uhash(HEALTH_EVERY_KEY);
1858         hash_units = simple_hash(HEALTH_UNITS_KEY);
1859         hash_info = simple_hash(HEALTH_INFO_KEY);
1860         hash_recipient = simple_hash(HEALTH_RECIPIENT_KEY);
1861         hash_delay = simple_uhash(HEALTH_DELAY_KEY);
1862     }
1863
1864     snprintfz(buffer, HEALTH_CONF_MAX_LINE, "%s/%s", path, filename);
1865     FILE *fp = fopen(buffer, "r");
1866     if(!fp) {
1867         error("Health configuration cannot read file '%s'.", buffer);
1868         return 0;
1869     }
1870
1871     RRDCALC *rc = NULL;
1872     RRDCALCTEMPLATE *rt = NULL;
1873
1874     size_t line = 0, append = 0;
1875     char *s;
1876     while((s = fgets(&buffer[append], (int)(HEALTH_CONF_MAX_LINE - append), fp)) || append) {
1877         int stop_appending = !s;
1878         line++;
1879         s = trim(buffer);
1880         if(!s) continue;
1881
1882         append = strlen(s);
1883         if(!stop_appending && s[append - 1] == '\\') {
1884             s[append - 1] = ' ';
1885             append = &s[append] - buffer;
1886             if(append < HEALTH_CONF_MAX_LINE)
1887                 continue;
1888             else {
1889                 error("Health configuration has too long muli-line at line %zu of file '%s/%s'.", line, path, filename);
1890             }
1891         }
1892         append = 0;
1893
1894         char *key = s;
1895         while(*s && *s != ':') s++;
1896         if(!*s) {
1897             error("Health configuration has invalid line %zu of file '%s/%s'. It does not contain a ':'. Ignoring it.", line, path, filename);
1898             continue;
1899         }
1900         *s = '\0';
1901         s++;
1902
1903         char *value = s;
1904         key = trim(key);
1905         value = trim(value);
1906
1907         if(!key) {
1908             error("Health configuration has invalid line %zu of file '%s/%s'. Keyword is empty. Ignoring it.", line, path, filename);
1909             continue;
1910         }
1911
1912         if(!value) {
1913             error("Health configuration has invalid line %zu of file '%s/%s'. value is empty. Ignoring it.", line, path, filename);
1914             continue;
1915         }
1916
1917         uint32_t hash = simple_uhash(key);
1918
1919         if(hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) {
1920             if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
1921                 rrdcalc_free(&localhost, rc);
1922
1923             if(rt) {
1924                 if (!rrdcalctemplate_add_template_from_config(&localhost, rt))
1925                     rrdcalctemplate_free(&localhost, rt);
1926                 rt = NULL;
1927             }
1928
1929             rc = callocz(1, sizeof(RRDCALC));
1930             rc->next_event_id = 1;
1931             rc->name = tabs2spaces(strdupz(value));
1932             rc->hash = simple_hash(rc->name);
1933             rc->source = health_source_file(line, path, filename);
1934             rc->green = NAN;
1935             rc->red = NAN;
1936             rc->value = NAN;
1937             rc->old_value = NAN;
1938             rc->delay_multiplier = 1.0;
1939
1940             if(rrdvar_fix_name(rc->name))
1941                 error("Health configuration renamed alarm '%s' to '%s'", value, rc->name);
1942         }
1943         else if(hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY)) {
1944             if(rc) {
1945                 if(!rrdcalc_add_alarm_from_config(&localhost, rc))
1946                     rrdcalc_free(&localhost, rc);
1947                 rc = NULL;
1948             }
1949
1950             if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
1951                 rrdcalctemplate_free(&localhost, rt);
1952
1953             rt = callocz(1, sizeof(RRDCALCTEMPLATE));
1954             rt->name = tabs2spaces(strdupz(value));
1955             rt->hash_name = simple_hash(rt->name);
1956             rt->source = health_source_file(line, path, filename);
1957             rt->green = NAN;
1958             rt->red = NAN;
1959             rt->delay_multiplier = 1.0;
1960
1961             if(rrdvar_fix_name(rt->name))
1962                 error("Health configuration renamed template '%s' to '%s'", value, rt->name);
1963         }
1964         else if(rc) {
1965             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
1966                 if(rc->chart) {
1967                     if(strcmp(rc->chart, value))
1968                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1969                                 line, path, filename, rc->name, key, rc->chart, value, value);
1970
1971                     freez(rc->chart);
1972                 }
1973                 rc->chart = tabs2spaces(strdupz(value));
1974                 rc->hash_chart = simple_hash(rc->chart);
1975             }
1976             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
1977                 health_parse_db_lookup(line, path, filename, value, &rc->group, &rc->after, &rc->before,
1978                                        &rc->update_every,
1979                                        &rc->options, &rc->dimensions);
1980             }
1981             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
1982                 if(!health_parse_duration(value, &rc->update_every))
1983                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.",
1984                          line, path, filename, rc->name, key, value);
1985             }
1986             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
1987                 char *e;
1988                 rc->green = strtold(value, &e);
1989                 if(e && *e) {
1990                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1991                          line, path, filename, rc->name, key, e);
1992                 }
1993             }
1994             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
1995                 char *e;
1996                 rc->red = strtold(value, &e);
1997                 if(e && *e) {
1998                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1999                          line, path, filename, rc->name, key, e);
2000                 }
2001             }
2002             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
2003                 const char *failed_at = NULL;
2004                 int error = 0;
2005                 rc->calculation = expression_parse(value, &failed_at, &error);
2006                 if(!rc->calculation) {
2007                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
2008                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
2009                 }
2010             }
2011             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
2012                 const char *failed_at = NULL;
2013                 int error = 0;
2014                 rc->warning = expression_parse(value, &failed_at, &error);
2015                 if(!rc->warning) {
2016                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
2017                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
2018                 }
2019             }
2020             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
2021                 const char *failed_at = NULL;
2022                 int error = 0;
2023                 rc->critical = expression_parse(value, &failed_at, &error);
2024                 if(!rc->critical) {
2025                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
2026                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
2027                 }
2028             }
2029             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
2030                 if(rc->exec) {
2031                     if(strcmp(rc->exec, value))
2032                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2033                              line, path, filename, rc->name, key, rc->exec, value, value);
2034
2035                     freez(rc->exec);
2036                 }
2037                 rc->exec = tabs2spaces(strdupz(value));
2038             }
2039             else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
2040                 if(rc->recipient) {
2041                     if(strcmp(rc->recipient, value))
2042                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2043                              line, path, filename, rc->name, key, rc->recipient, value, value);
2044
2045                     freez(rc->recipient);
2046                 }
2047                 rc->recipient = tabs2spaces(strdupz(value));
2048             }
2049             else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
2050                 if(rc->units) {
2051                     if(strcmp(rc->units, value))
2052                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2053                              line, path, filename, rc->name, key, rc->units, value, value);
2054
2055                     freez(rc->units);
2056                 }
2057                 rc->units = tabs2spaces(strdupz(value));
2058                 strip_quotes(rc->units);
2059             }
2060             else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
2061                 if(rc->info) {
2062                     if(strcmp(rc->info, value))
2063                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2064                              line, path, filename, rc->name, key, rc->info, value, value);
2065
2066                     freez(rc->info);
2067                 }
2068                 rc->info = tabs2spaces(strdupz(value));
2069                 strip_quotes(rc->info);
2070             }
2071             else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
2072                 health_parse_delay(line, path, filename, value, &rc->delay_up_duration, &rc->delay_down_duration, &rc->delay_max_duration, &rc->delay_multiplier);
2073             }
2074             else {
2075                 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has unknown key '%s'.",
2076                      line, path, filename, rc->name, key);
2077             }
2078         }
2079         else if(rt) {
2080             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
2081                 if(rt->context) {
2082                     if(strcmp(rt->context, value))
2083                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2084                                 line, path, filename, rt->name, key, rt->context, value, value);
2085
2086                     freez(rt->context);
2087                 }
2088                 rt->context = tabs2spaces(strdupz(value));
2089                 rt->hash_context = simple_hash(rt->context);
2090             }
2091             else if(hash == hash_families && !strcasecmp(key, HEALTH_FAMILIES_KEY)) {
2092                 freez(rt->family_match);
2093                 simple_pattern_free(rt->family_pattern);
2094
2095                 rt->family_match = tabs2spaces(strdupz(value));
2096                 rt->family_pattern = simple_pattern_create(rt->family_match, SIMPLE_PATTERN_EXACT);
2097             }
2098             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
2099                 health_parse_db_lookup(line, path, filename, value, &rt->group, &rt->after, &rt->before,
2100                                        &rt->update_every, &rt->options, &rt->dimensions);
2101             }
2102             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
2103                 if(!health_parse_duration(value, &rt->update_every))
2104                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' cannot parse duration: '%s'.",
2105                          line, path, filename, rt->name, key, value);
2106             }
2107             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
2108                 char *e;
2109                 rt->green = strtold(value, &e);
2110                 if(e && *e) {
2111                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
2112                          line, path, filename, rt->name, key, e);
2113                 }
2114             }
2115             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
2116                 char *e;
2117                 rt->red = strtold(value, &e);
2118                 if(e && *e) {
2119                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
2120                          line, path, filename, rt->name, key, e);
2121                 }
2122             }
2123             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
2124                 const char *failed_at = NULL;
2125                 int error = 0;
2126                 rt->calculation = expression_parse(value, &failed_at, &error);
2127                 if(!rt->calculation) {
2128                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
2129                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
2130                 }
2131             }
2132             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
2133                 const char *failed_at = NULL;
2134                 int error = 0;
2135                 rt->warning = expression_parse(value, &failed_at, &error);
2136                 if(!rt->warning) {
2137                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
2138                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
2139                 }
2140             }
2141             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
2142                 const char *failed_at = NULL;
2143                 int error = 0;
2144                 rt->critical = expression_parse(value, &failed_at, &error);
2145                 if(!rt->critical) {
2146                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
2147                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
2148                 }
2149             }
2150             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
2151                 if(rt->exec) {
2152                     if(strcmp(rt->exec, value))
2153                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2154                              line, path, filename, rt->name, key, rt->exec, value, value);
2155
2156                     freez(rt->exec);
2157                 }
2158                 rt->exec = tabs2spaces(strdupz(value));
2159             }
2160             else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
2161                 if(rt->recipient) {
2162                     if(strcmp(rt->recipient, value))
2163                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2164                              line, path, filename, rt->name, key, rt->recipient, value, value);
2165
2166                     freez(rt->recipient);
2167                 }
2168                 rt->recipient = tabs2spaces(strdupz(value));
2169             }
2170             else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
2171                 if(rt->units) {
2172                     if(strcmp(rt->units, value))
2173                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2174                              line, path, filename, rt->name, key, rt->units, value, value);
2175
2176                     freez(rt->units);
2177                 }
2178                 rt->units = tabs2spaces(strdupz(value));
2179                 strip_quotes(rt->units);
2180             }
2181             else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
2182                 if(rt->info) {
2183                     if(strcmp(rt->info, value))
2184                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2185                              line, path, filename, rt->name, key, rt->info, value, value);
2186
2187                     freez(rt->info);
2188                 }
2189                 rt->info = tabs2spaces(strdupz(value));
2190                 strip_quotes(rt->info);
2191             }
2192             else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
2193                 health_parse_delay(line, path, filename, value, &rt->delay_up_duration, &rt->delay_down_duration, &rt->delay_max_duration, &rt->delay_multiplier);
2194             }
2195             else {
2196                 error("Health configuration at line %zu of file '%s/%s' for template '%s' has unknown key '%s'.",
2197                       line, path, filename, rt->name, key);
2198             }
2199         }
2200         else {
2201             error("Health configuration at line %zu of file '%s/%s' has unknown key '%s'. Expected either '" HEALTH_ALARM_KEY "' or '" HEALTH_TEMPLATE_KEY "'.",
2202                   line, path, filename, key);
2203         }
2204     }
2205
2206     if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
2207         rrdcalc_free(&localhost, rc);
2208
2209     if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
2210         rrdcalctemplate_free(&localhost, rt);
2211
2212     fclose(fp);
2213     return 1;
2214 }
2215
2216 void health_readdir(const char *path) {
2217     size_t pathlen = strlen(path);
2218
2219     debug(D_HEALTH, "Health configuration reading directory '%s'", path);
2220
2221     DIR *dir = opendir(path);
2222     if (!dir) {
2223         error("Health configuration cannot open directory '%s'.", path);
2224         return;
2225     }
2226
2227     struct dirent *de = NULL;
2228     while ((de = readdir(dir))) {
2229         size_t len = strlen(de->d_name);
2230
2231         if(de->d_type == DT_DIR
2232            && (
2233                    (de->d_name[0] == '.' && de->d_name[1] == '\0')
2234                    || (de->d_name[0] == '.' && de->d_name[1] == '.' && de->d_name[2] == '\0')
2235            )) {
2236             debug(D_HEALTH, "Ignoring directory '%s'", de->d_name);
2237             continue;
2238         }
2239
2240         else if(de->d_type == DT_DIR) {
2241             char *s = mallocz(pathlen + strlen(de->d_name) + 2);
2242             strcpy(s, path);
2243             strcat(s, "/");
2244             strcat(s, de->d_name);
2245             health_readdir(s);
2246             freez(s);
2247             continue;
2248         }
2249
2250         else if((de->d_type == DT_LNK || de->d_type == DT_REG || de->d_type == DT_UNKNOWN) &&
2251                 len > 5 && !strcmp(&de->d_name[len - 5], ".conf")) {
2252             health_readfile(path, de->d_name);
2253         }
2254
2255         else debug(D_HEALTH, "Ignoring file '%s'", de->d_name);
2256     }
2257
2258     closedir(dir);
2259 }
2260
2261 static inline char *health_config_dir(void) {
2262     char buffer[FILENAME_MAX + 1];
2263     snprintfz(buffer, FILENAME_MAX, "%s/health.d", config_get("global", "config directory", CONFIG_DIR));
2264     return config_get("health", "health configuration directory", buffer);
2265 }
2266
2267 void health_init(void) {
2268     debug(D_HEALTH, "Health configuration initializing");
2269
2270     if(!(health_enabled = config_get_boolean("health", "enabled", 1))) {
2271         debug(D_HEALTH, "Health is disabled.");
2272         return;
2273     }
2274
2275     char *pathname = config_get("health", "health db directory", VARLIB_DIR "/health");
2276     if(mkdir(pathname, 0770) == -1 && errno != EEXIST)
2277         fatal("Cannot create directory '%s'.", pathname);
2278
2279     char filename[FILENAME_MAX + 1];
2280     snprintfz(filename, FILENAME_MAX, "%s/health-log.db", pathname);
2281     health.log_filename = config_get("health", "health db file", filename);
2282
2283     health_alarm_log_load(&localhost);
2284     health_alarm_log_open();
2285
2286     char *path = health_config_dir();
2287
2288     {
2289         char buffer[FILENAME_MAX + 1];
2290         snprintfz(buffer, FILENAME_MAX, "%s/alarm-notify.sh", config_get("global", "plugins directory", PLUGINS_DIR));
2291         health.health_default_exec = config_get("health", "script to execute on alarm", buffer);
2292     }
2293
2294     long n = config_get_number("health", "in memory max health log entries", (long)localhost.health_log.max);
2295     if(n < 10) {
2296         error("Health configuration has invalid max log entries %ld. Using default %u", n, localhost.health_log.max);
2297         config_set_number("health", "in memory max health log entries", (long)localhost.health_log.max);
2298     }
2299     else localhost.health_log.max = (unsigned int)n;
2300
2301     rrdhost_rwlock(&localhost);
2302     health_readdir(path);
2303     rrdhost_unlock(&localhost);
2304 }
2305
2306 // ----------------------------------------------------------------------------
2307 // JSON generation
2308
2309 static inline void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) {
2310     if(value && *value)
2311         buffer_sprintf(wb, "%s\"%s\":\"%s\"%s", prefix, label, value, suffix);
2312     else
2313         buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix);
2314 }
2315
2316 static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) {
2317     buffer_sprintf(wb, "\n\t{\n"
2318                            "\t\t\"hostname\": \"%s\",\n"
2319                            "\t\t\"unique_id\": %u,\n"
2320                            "\t\t\"alarm_id\": %u,\n"
2321                            "\t\t\"alarm_event_id\": %u,\n"
2322                            "\t\t\"name\": \"%s\",\n"
2323                            "\t\t\"chart\": \"%s\",\n"
2324                            "\t\t\"family\": \"%s\",\n"
2325                            "\t\t\"processed\": %s,\n"
2326                            "\t\t\"updated\": %s,\n"
2327                            "\t\t\"exec_run\": %lu,\n"
2328                            "\t\t\"exec_failed\": %s,\n"
2329                            "\t\t\"exec\": \"%s\",\n"
2330                            "\t\t\"recipient\": \"%s\",\n"
2331                            "\t\t\"exec_code\": %d,\n"
2332                            "\t\t\"source\": \"%s\",\n"
2333                            "\t\t\"units\": \"%s\",\n"
2334                            "\t\t\"info\": \"%s\",\n"
2335                            "\t\t\"when\": %lu,\n"
2336                            "\t\t\"duration\": %lu,\n"
2337                            "\t\t\"non_clear_duration\": %lu,\n"
2338                            "\t\t\"status\": \"%s\",\n"
2339                            "\t\t\"old_status\": \"%s\",\n"
2340                            "\t\t\"delay\": %d,\n"
2341                            "\t\t\"delay_up_to_timestamp\": %lu,\n"
2342                            "\t\t\"updated_by_id\": %u,\n"
2343                            "\t\t\"updates_id\": %u,\n"
2344                            "\t\t\"value_string\": \"%s\",\n"
2345                            "\t\t\"old_value_string\": \"%s\",\n",
2346                    host->hostname,
2347                    ae->unique_id,
2348                    ae->alarm_id,
2349                    ae->alarm_event_id,
2350                    ae->name,
2351                    ae->chart,
2352                    ae->family,
2353                    (ae->flags & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false",
2354                    (ae->flags & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false",
2355                    (unsigned long)ae->exec_run_timestamp,
2356                    (ae->flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false",
2357                    ae->exec?ae->exec:health.health_default_exec,
2358                    ae->recipient?ae->recipient:health.health_default_recipient,
2359                    ae->exec_code,
2360                    ae->source,
2361                    ae->units?ae->units:"",
2362                    ae->info?ae->info:"",
2363                    (unsigned long)ae->when,
2364                    (unsigned long)ae->duration,
2365                    (unsigned long)ae->non_clear_duration,
2366                    rrdcalc_status2string(ae->new_status),
2367                    rrdcalc_status2string(ae->old_status),
2368                    ae->delay,
2369                    (unsigned long)ae->delay_up_to_timestamp,
2370                    ae->updated_by_id,
2371                    ae->updates_id,
2372                    ae->new_value_string,
2373                    ae->old_value_string
2374     );
2375
2376     buffer_strcat(wb, "\t\t\"value\":");
2377     buffer_rrd_value(wb, ae->new_value);
2378     buffer_strcat(wb, ",\n");
2379
2380     buffer_strcat(wb, "\t\t\"old_value\":");
2381     buffer_rrd_value(wb, ae->old_value);
2382     buffer_strcat(wb, "\n");
2383
2384     buffer_strcat(wb, "\t}");
2385 }
2386
2387 void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after) {
2388     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
2389
2390     buffer_strcat(wb, "[");
2391
2392     unsigned int max = host->health_log.max;
2393     unsigned int count = 0;
2394     ALARM_ENTRY *ae;
2395     for(ae = host->health_log.alarms; ae && count < max ; count++, ae = ae->next) {
2396         if(ae->unique_id > after) {
2397             if(likely(count)) buffer_strcat(wb, ",");
2398             health_alarm_entry2json_nolock(wb, ae, host);
2399         }
2400     }
2401
2402     buffer_strcat(wb, "\n]\n");
2403
2404     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2405 }
2406
2407 static inline void health_rrdcalc2json_nolock(BUFFER *wb, RRDCALC *rc) {
2408     buffer_sprintf(wb,
2409            "\t\t\"%s.%s\": {\n"
2410                    "\t\t\t\"id\": %lu,\n"
2411                    "\t\t\t\"name\": \"%s\",\n"
2412                    "\t\t\t\"chart\": \"%s\",\n"
2413                    "\t\t\t\"family\": \"%s\",\n"
2414                    "\t\t\t\"active\": %s,\n"
2415                    "\t\t\t\"exec\": \"%s\",\n"
2416                    "\t\t\t\"recipient\": \"%s\",\n"
2417                    "\t\t\t\"source\": \"%s\",\n"
2418                    "\t\t\t\"units\": \"%s\",\n"
2419                    "\t\t\t\"info\": \"%s\",\n"
2420                                    "\t\t\t\"status\": \"%s\",\n"
2421                    "\t\t\t\"last_status_change\": %lu,\n"
2422                    "\t\t\t\"last_updated\": %lu,\n"
2423                    "\t\t\t\"next_update\": %lu,\n"
2424                    "\t\t\t\"update_every\": %d,\n"
2425                    "\t\t\t\"delay_up_duration\": %d,\n"
2426                    "\t\t\t\"delay_down_duration\": %d,\n"
2427                    "\t\t\t\"delay_max_duration\": %d,\n"
2428                    "\t\t\t\"delay_multiplier\": %f,\n"
2429                    "\t\t\t\"delay\": %d,\n"
2430                    "\t\t\t\"delay_up_to_timestamp\": %lu,\n"
2431             , rc->chart, rc->name
2432             , (unsigned long)rc->id
2433             , rc->name
2434             , rc->chart
2435             , (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:""
2436             , (rc->rrdset)?"true":"false"
2437             , rc->exec?rc->exec:health.health_default_exec
2438             , rc->recipient?rc->recipient:health.health_default_recipient
2439             , rc->source
2440             , rc->units?rc->units:""
2441             , rc->info?rc->info:""
2442             , rrdcalc_status2string(rc->status)
2443             , (unsigned long)rc->last_status_change
2444             , (unsigned long)rc->last_updated
2445             , (unsigned long)rc->next_update
2446             , rc->update_every
2447             , rc->delay_up_duration
2448             , rc->delay_down_duration
2449             , rc->delay_max_duration
2450             , rc->delay_multiplier
2451             , rc->delay_last
2452             , (unsigned long)rc->delay_up_to_timestamp
2453     );
2454
2455     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
2456         if(rc->dimensions && *rc->dimensions)
2457             health_string2json(wb, "\t\t\t", "lookup_dimensions", rc->dimensions, ",\n");
2458
2459         buffer_sprintf(wb,
2460                        "\t\t\t\"db_after\": %lu,\n"
2461                        "\t\t\t\"db_before\": %lu,\n"
2462                        "\t\t\t\"lookup_method\": \"%s\",\n"
2463                        "\t\t\t\"lookup_after\": %d,\n"
2464                        "\t\t\t\"lookup_before\": %d,\n"
2465                        "\t\t\t\"lookup_options\": \"",
2466                        (unsigned long) rc->db_after,
2467                        (unsigned long) rc->db_before,
2468                        group_method2string(rc->group),
2469                        rc->after,
2470                        rc->before
2471         );
2472         buffer_data_options2string(wb, rc->options);
2473         buffer_strcat(wb, "\",\n");
2474     }
2475
2476     if(rc->calculation) {
2477         health_string2json(wb, "\t\t\t", "calc", rc->calculation->source, ",\n");
2478         health_string2json(wb, "\t\t\t", "calc_parsed", rc->calculation->parsed_as, ",\n");
2479     }
2480
2481     if(rc->warning) {
2482         health_string2json(wb, "\t\t\t", "warn", rc->warning->source, ",\n");
2483         health_string2json(wb, "\t\t\t", "warn_parsed", rc->warning->parsed_as, ",\n");
2484     }
2485
2486     if(rc->critical) {
2487         health_string2json(wb, "\t\t\t", "crit", rc->critical->source, ",\n");
2488         health_string2json(wb, "\t\t\t", "crit_parsed", rc->critical->parsed_as, ",\n");
2489     }
2490
2491     buffer_strcat(wb, "\t\t\t\"green\":");
2492     buffer_rrd_value(wb, rc->green);
2493     buffer_strcat(wb, ",\n");
2494
2495     buffer_strcat(wb, "\t\t\t\"red\":");
2496     buffer_rrd_value(wb, rc->red);
2497     buffer_strcat(wb, ",\n");
2498
2499     buffer_strcat(wb, "\t\t\t\"value\":");
2500     buffer_rrd_value(wb, rc->value);
2501     buffer_strcat(wb, "\n");
2502
2503     buffer_strcat(wb, "\t\t}");
2504 }
2505
2506 //void health_rrdcalctemplate2json_nolock(BUFFER *wb, RRDCALCTEMPLATE *rt) {
2507 //
2508 //}
2509
2510 void health_alarms2json(RRDHOST *host, BUFFER *wb, int all) {
2511     int i;
2512
2513     rrdhost_rdlock(&localhost);
2514     buffer_sprintf(wb, "{\n\t\"hostname\": \"%s\","
2515                         "\n\t\"latest_alarm_log_unique_id\": %u,"
2516                         "\n\t\"status\": %s,"
2517                         "\n\t\"now\": %lu,"
2518                         "\n\t\"alarms\": {\n",
2519                         host->hostname,
2520                         (host->health_log.next_log_id > 0)?(host->health_log.next_log_id - 1):0,
2521                         health_enabled?"true":"false",
2522                         (unsigned long)now_realtime_sec());
2523
2524     RRDCALC *rc;
2525     for(i = 0, rc = host->alarms; rc ; rc = rc->next) {
2526         if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
2527             continue;
2528
2529         if(likely(!all && !(rc->status == RRDCALC_STATUS_WARNING || rc->status == RRDCALC_STATUS_CRITICAL)))
2530             continue;
2531
2532         if(likely(i)) buffer_strcat(wb, ",\n");
2533         health_rrdcalc2json_nolock(wb, rc);
2534         i++;
2535     }
2536
2537 //    buffer_strcat(wb, "\n\t},\n\t\"templates\": {");
2538 //    RRDCALCTEMPLATE *rt;
2539 //    for(rt = host->templates; rt ; rt = rt->next)
2540 //        health_rrdcalctemplate2json_nolock(wb, rt);
2541
2542     buffer_strcat(wb, "\n\t}\n}\n");
2543     rrdhost_unlock(&localhost);
2544 }
2545
2546
2547 // ----------------------------------------------------------------------------
2548 // re-load health configuration
2549
2550 static inline void health_free_all_nolock(RRDHOST *host) {
2551     while(host->templates)
2552         rrdcalctemplate_free(host, host->templates);
2553
2554     while(host->alarms)
2555         rrdcalc_free(host, host->alarms);
2556 }
2557
2558 void health_reload(void) {
2559     if(!health_enabled) {
2560         error("Health reload is requested, but health is not enabled.");
2561         return;
2562     }
2563
2564     char *path = health_config_dir();
2565
2566     // free all running alarms
2567     rrdhost_rwlock(&localhost);
2568     health_free_all_nolock(&localhost);
2569     rrdhost_unlock(&localhost);
2570
2571     // invalidate all previous entries in the alarm log
2572     ALARM_ENTRY *t;
2573     for(t = localhost.health_log.alarms ; t ; t = t->next) {
2574         if(t->new_status != RRDCALC_STATUS_REMOVED)
2575             t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
2576     }
2577
2578     // reset all thresholds to all charts
2579     RRDSET *st;
2580     for(st = localhost.rrdset_root; st ; st = st->next) {
2581         st->green = NAN;
2582         st->red = NAN;
2583     }
2584
2585     // load the new alarms
2586     rrdhost_rwlock(&localhost);
2587     health_readdir(path);
2588     rrdhost_unlock(&localhost);
2589
2590     // link the loaded alarms to their charts
2591     for(st = localhost.rrdset_root; st ; st = st->next) {
2592         rrdhost_rwlock(&localhost);
2593
2594         rrdsetcalc_link_matching(st);
2595         rrdcalctemplate_link_matching(st);
2596
2597         rrdhost_unlock(&localhost);
2598     }
2599 }
2600
2601 // ----------------------------------------------------------------------------
2602 // health main thread and friends
2603
2604 static inline int rrdcalc_value2status(calculated_number n) {
2605     if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED;
2606     if(n) return RRDCALC_STATUS_RAISED;
2607     return RRDCALC_STATUS_CLEAR;
2608 }
2609
2610 #define ALARM_EXEC_COMMAND_LENGTH 8192
2611
2612 static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
2613     ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
2614
2615     if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
2616         // do not send notifications for internal statuses
2617         goto done;
2618     }
2619
2620     // find the previous notification for the same alarm
2621     // which we have run the exec script
2622     {
2623         uint32_t id = ae->alarm_id;
2624         ALARM_ENTRY *t;
2625         for(t = ae->next; t ; t = t->next) {
2626             if(t->alarm_id == id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
2627                 break;
2628         }
2629
2630         if(likely(t)) {
2631             // we have executed this alarm notification in the past
2632             if(t && t->new_status == ae->new_status) {
2633                 // don't send the notification for the same status again
2634                 debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name
2635                       , rrdcalc_status2string(ae->new_status));
2636                 goto done;
2637             }
2638         }
2639         else {
2640             // we have not executed this alarm notification in the past
2641             // so, don't send CLEAR notifications
2642             if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
2643                 debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
2644                       , ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
2645                 goto done;
2646             }
2647         }
2648     }
2649
2650     static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1];
2651     pid_t command_pid;
2652
2653     const char *exec = ae->exec;
2654     if(!exec) exec = health.health_default_exec;
2655
2656     const char *recipient = ae->recipient;
2657     if(!recipient) recipient = health.health_default_recipient;
2658
2659     snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '%s' '%s' '%s' '%u' '%u' '%s' '%s'",
2660               exec,
2661               recipient,
2662               host->hostname,
2663               ae->unique_id,
2664               ae->alarm_id,
2665               ae->alarm_event_id,
2666               (unsigned long)ae->when,
2667               ae->name,
2668               ae->chart?ae->chart:"NOCAHRT",
2669               ae->family?ae->family:"NOFAMILY",
2670               rrdcalc_status2string(ae->new_status),
2671               rrdcalc_status2string(ae->old_status),
2672               ae->new_value_string,
2673               ae->old_value_string,
2674               ae->source?ae->source:"UNKNOWN",
2675               (uint32_t)ae->duration,
2676               (uint32_t)ae->non_clear_duration,
2677               ae->units?ae->units:"",
2678               ae->info?ae->info:""
2679     );
2680
2681     ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
2682     ae->exec_run_timestamp = now_realtime_sec();
2683
2684     debug(D_HEALTH, "executing command '%s'", command_to_run);
2685     FILE *fp = mypopen(command_to_run, &command_pid);
2686     if(!fp) {
2687         error("HEALTH: Cannot popen(\"%s\", \"r\").", command_to_run);
2688         goto done;
2689     }
2690     debug(D_HEALTH, "HEALTH reading from command");
2691     char *s = fgets(command_to_run, FILENAME_MAX, fp);
2692     (void)s;
2693     ae->exec_code = mypclose(fp, command_pid);
2694     debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
2695
2696     if(ae->exec_code != 0)
2697         ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
2698
2699 done:
2700     health_alarm_log_save(host, ae);
2701     return;
2702 }
2703
2704 static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
2705     debug(D_HEALTH, "Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
2706          ae->chart?ae->chart:"NOCHART", ae->name,
2707          ae->new_value,
2708          rrdcalc_status2string(ae->old_status),
2709          rrdcalc_status2string(ae->new_status)
2710     );
2711
2712     health_alarm_execute(host, ae);
2713 }
2714
2715 static inline void health_alarm_log_process(RRDHOST *host) {
2716     static uint32_t stop_at_id = 0;
2717     uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
2718     time_t now = now_realtime_sec();
2719
2720     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
2721
2722     ALARM_ENTRY *ae;
2723     for(ae = host->health_log.alarms; ae && ae->unique_id >= stop_at_id ; ae = ae->next) {
2724         if(unlikely(
2725             !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
2726             !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
2727             )) {
2728
2729             if(unlikely(ae->unique_id < first_waiting))
2730                 first_waiting = ae->unique_id;
2731
2732             if(likely(now >= ae->delay_up_to_timestamp))
2733                 health_process_notifications(host, ae);
2734         }
2735     }
2736
2737     // remember this for the next iteration
2738     stop_at_id = first_waiting;
2739
2740     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2741
2742     if(host->health_log.count <= host->health_log.max)
2743         return;
2744
2745     // cleanup excess entries in the log
2746     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
2747
2748     ALARM_ENTRY *last = NULL;
2749     unsigned int count = host->health_log.max * 2 / 3;
2750     for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
2751
2752     if(ae && last && last->next == ae)
2753         last->next = NULL;
2754     else
2755         ae = NULL;
2756
2757     while(ae) {
2758         debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
2759
2760         ALARM_ENTRY *t = ae->next;
2761
2762         freez(ae->name);
2763         freez(ae->chart);
2764         freez(ae->family);
2765         freez(ae->exec);
2766         freez(ae->recipient);
2767         freez(ae->source);
2768         freez(ae->units);
2769         freez(ae->info);
2770         freez(ae->old_value_string);
2771         freez(ae->new_value_string);
2772         freez(ae);
2773
2774         ae = t;
2775         host->health_log.count--;
2776     }
2777
2778     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2779 }
2780
2781 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
2782     if(unlikely(!rc->rrdset)) {
2783         debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
2784         return 0;
2785     }
2786
2787     if(unlikely(rc->next_update > now)) {
2788         if (unlikely(*next_run > rc->next_update)) {
2789             // update the next_run time of the main loop
2790             // to run this alarm precisely the time required
2791             *next_run = rc->next_update;
2792         }
2793
2794         debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
2795         return 0;
2796     }
2797
2798     if(unlikely(!rc->update_every)) {
2799         debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
2800         return 0;
2801     }
2802
2803     if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
2804         debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
2805         return 0;
2806     }
2807
2808     int update_every = rc->rrdset->update_every;
2809     time_t first = rrdset_first_entry_t(rc->rrdset);
2810     time_t last = rrdset_last_entry_t(rc->rrdset);
2811
2812     if(unlikely(now + update_every < first /* || now - update_every > last */)) {
2813         debug(D_HEALTH
2814               , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
2815               , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first
2816               , (unsigned long) last);
2817         return 0;
2818     }
2819
2820     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
2821         time_t needed = now + rc->before + rc->after;
2822
2823         if(needed + update_every < first || needed - update_every > last) {
2824             debug(D_HEALTH
2825                   , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
2826                   , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first
2827                   , (unsigned long) last);
2828             return 0;
2829         }
2830     }
2831
2832     return 1;
2833 }
2834
2835 void *health_main(void *ptr) {
2836     struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
2837
2838     info("HEALTH thread created with task id %d", gettid());
2839
2840     if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
2841         error("Cannot set pthread cancel type to DEFERRED.");
2842
2843     if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
2844         error("Cannot set pthread cancel state to ENABLE.");
2845
2846     int min_run_every = (int)config_get_number("health", "run at least every seconds", 10);
2847     if(min_run_every < 1) min_run_every = 1;
2848
2849     BUFFER *wb = buffer_create(100);
2850
2851     unsigned int loop = 0;
2852     while(health_enabled && !netdata_exit) {
2853         loop++;
2854         debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
2855
2856         int oldstate, runnable = 0;
2857         time_t now = now_realtime_sec();
2858         time_t next_run = now + min_run_every;
2859         RRDCALC *rc;
2860
2861         if(unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
2862             error("Cannot set pthread cancel state to DISABLE.");
2863
2864         rrdhost_rdlock(&localhost);
2865
2866         // the first loop is to lookup values from the db
2867         for(rc = localhost.alarms; rc; rc = rc->next) {
2868             if(unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
2869                 if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))
2870                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE;
2871                 continue;
2872             }
2873
2874             runnable++;
2875             rc->old_value = rc->value;
2876             rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE;
2877
2878             // 1. if there is database lookup, do it
2879             // 2. if there is calculation expression, run it
2880
2881             if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
2882                 /* time_t old_db_timestamp = rc->db_before; */
2883                 int value_is_null = 0;
2884
2885                 int ret = rrd2value(rc->rrdset, wb, &rc->value,
2886                                     rc->dimensions, 1, rc->after, rc->before, rc->group,
2887                                     rc->options, &rc->db_after, &rc->db_before, &value_is_null);
2888
2889                 if (unlikely(ret != 200)) {
2890                     // database lookup failed
2891                     rc->value = NAN;
2892
2893                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
2894
2895                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) {
2896                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
2897                         error("Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
2898                     }
2899                 }
2900                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))
2901                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
2902
2903                 /* - RRDCALC_FLAG_DB_STALE not currently used
2904                 if (unlikely(old_db_timestamp == rc->db_before)) {
2905                     // database is stale
2906
2907                     debug(D_HEALTH, "Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
2908
2909                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
2910                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
2911                         error("Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
2912                     }
2913                 }
2914                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
2915                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
2916                 */
2917
2918                 if (unlikely(value_is_null)) {
2919                     // collected value is null
2920
2921                     rc->value = NAN;
2922
2923                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
2924                           rc->chart?rc->chart:"NOCHART", rc->name);
2925
2926                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) {
2927                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
2928                         error("Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
2929                               rc->chart?rc->chart:"NOCHART", rc->name);
2930                     }
2931                 }
2932                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))
2933                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
2934
2935                 debug(D_HEALTH, "Health alarm '%s.%s': database lookup gave value "
2936                         CALCULATED_NUMBER_FORMAT, rc->chart?rc->chart:"NOCHART", rc->name, rc->value);
2937             }
2938
2939             if(unlikely(rc->calculation)) {
2940                 if (unlikely(!expression_evaluate(rc->calculation))) {
2941                     // calculation failed
2942
2943                     rc->value = NAN;
2944
2945                     debug(D_HEALTH, "Health alarm '%s.%s': expression '%s' failed: %s",
2946                           rc->chart?rc->chart:"NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
2947
2948                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) {
2949                         rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
2950                         error("Health alarm '%s.%s': expression '%s' failed: %s",
2951                               rc->chart?rc->chart:"NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
2952                     }
2953                 }
2954                 else {
2955                     if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))
2956                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
2957
2958                     debug(D_HEALTH, "Health alarm '%s.%s': expression '%s' gave value "
2959                             CALCULATED_NUMBER_FORMAT
2960                             ": %s (source: %s)",
2961                           rc->chart?rc->chart:"NOCHART", rc->name,
2962                           rc->calculation->parsed_as,
2963                           rc->calculation->result,
2964                           buffer_tostring(rc->calculation->error_msg),
2965                           rc->source
2966                     );
2967
2968                     rc->value = rc->calculation->result;
2969                 }
2970             }
2971         }
2972         rrdhost_unlock(&localhost);
2973
2974         if(unlikely(runnable && !netdata_exit)) {
2975             rrdhost_rdlock(&localhost);
2976
2977             for(rc = localhost.alarms; rc; rc = rc->next) {
2978                 if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
2979                     continue;
2980
2981                 int warning_status  = RRDCALC_STATUS_UNDEFINED;
2982                 int critical_status = RRDCALC_STATUS_UNDEFINED;
2983
2984                 if(likely(rc->warning)) {
2985                     if(unlikely(!expression_evaluate(rc->warning))) {
2986                         // calculation failed
2987
2988                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression failed with error: %s",
2989                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
2990
2991                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) {
2992                             rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
2993                             error("Health alarm '%s.%s': warning expression failed with error: %s",
2994                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
2995                         }
2996                     }
2997                     else {
2998                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))
2999                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
3000
3001                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression gave value "
3002                                 CALCULATED_NUMBER_FORMAT
3003                                 ": %s (source: %s)",
3004                               rc->chart?rc->chart:"NOCHART", rc->name,
3005                               rc->warning->result,
3006                               buffer_tostring(rc->warning->error_msg),
3007                               rc->source
3008                         );
3009
3010                         warning_status = rrdcalc_value2status(rc->warning->result);
3011                     }
3012                 }
3013
3014                 if(likely(rc->critical)) {
3015                     if(unlikely(!expression_evaluate(rc->critical))) {
3016                         // calculation failed
3017
3018                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression failed with error: %s",
3019                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
3020
3021                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) {
3022                             rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
3023                             error("Health alarm '%s.%s': critical expression failed with error: %s",
3024                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
3025                         }
3026                     }
3027                     else {
3028                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))
3029                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
3030
3031                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression gave value "
3032                                 CALCULATED_NUMBER_FORMAT
3033                                 ": %s (source: %s)",
3034                               rc->chart?rc->chart:"NOCHART", rc->name,
3035                               rc->critical->result,
3036                               buffer_tostring(rc->critical->error_msg),
3037                               rc->source
3038                         );
3039
3040                         critical_status = rrdcalc_value2status(rc->critical->result);
3041                     }
3042                 }
3043
3044                 int status = RRDCALC_STATUS_UNDEFINED;
3045
3046                 switch(warning_status) {
3047                     case RRDCALC_STATUS_CLEAR:
3048                         status = RRDCALC_STATUS_CLEAR;
3049                         break;
3050
3051                     case RRDCALC_STATUS_RAISED:
3052                         status = RRDCALC_STATUS_WARNING;
3053                         break;
3054
3055                     default:
3056                         break;
3057                 }
3058
3059                 switch(critical_status) {
3060                     case RRDCALC_STATUS_CLEAR:
3061                         if(status == RRDCALC_STATUS_UNDEFINED)
3062                             status = RRDCALC_STATUS_CLEAR;
3063                         break;
3064
3065                     case RRDCALC_STATUS_RAISED:
3066                         status = RRDCALC_STATUS_CRITICAL;
3067                         break;
3068
3069                     default:
3070                         break;
3071                 }
3072
3073                 if(status != rc->status) {
3074                     int delay = 0;
3075
3076                     if(now > rc->delay_up_to_timestamp) {
3077                         rc->delay_up_current = rc->delay_up_duration;
3078                         rc->delay_down_current = rc->delay_down_duration;
3079                         rc->delay_last = 0;
3080                         rc->delay_up_to_timestamp = 0;
3081                     }
3082                     else {
3083                         rc->delay_up_current = (int)(rc->delay_up_current * rc->delay_multiplier);
3084                         if(rc->delay_up_current > rc->delay_max_duration) rc->delay_up_current = rc->delay_max_duration;
3085
3086                         rc->delay_down_current = (int)(rc->delay_down_current * rc->delay_multiplier);
3087                         if(rc->delay_down_current > rc->delay_max_duration) rc->delay_down_current = rc->delay_max_duration;
3088                     }
3089
3090                     if(status > rc->status)
3091                         delay = rc->delay_up_current;
3092                     else
3093                         delay = rc->delay_down_current;
3094
3095                     // COMMENTED: because we do need to send raising alarms
3096                     // if(now + delay < rc->delay_up_to_timestamp)
3097                     //    delay = (int)(rc->delay_up_to_timestamp - now);
3098
3099                     rc->delay_last = delay;
3100                     rc->delay_up_to_timestamp = now + delay;
3101                     health_alarm_log(&localhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, rc->delay_last);
3102                     rc->last_status_change = now;
3103                     rc->status = status;
3104                 }
3105
3106                 rc->last_updated = now;
3107                 rc->next_update = now + rc->update_every;
3108
3109                 if (next_run > rc->next_update)
3110                     next_run = rc->next_update;
3111             }
3112
3113             rrdhost_unlock(&localhost);
3114         }
3115
3116         if (unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
3117             error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
3118
3119         if(unlikely(netdata_exit))
3120             break;
3121
3122         // execute notifications
3123         // and cleanup
3124         health_alarm_log_process(&localhost);
3125
3126         if(unlikely(netdata_exit))
3127             break;
3128         
3129         now = now_realtime_sec();
3130         if(now < next_run) {
3131             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs",
3132                   loop, (int) (next_run - now));
3133             sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now));
3134         }
3135         else {
3136             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
3137         }
3138     }
3139
3140     buffer_free(wb);
3141
3142     info("HEALTH thread exiting");
3143
3144     static_thread->enabled = 0;
3145     pthread_exit(NULL);
3146     return NULL;
3147 }