]> arthur.barton.de Git - netdata.git/blob - src/health.c
added alarm_variables API call that returns all the available variables for a chart
[netdata.git] / src / health.c
1 #include "common.h"
2
3 #define RRDVAR_MAX_LENGTH 1024
4
// Settings and runtime state of the health subsystem and its on-disk alarm log.
struct health_options {
    const char *health_default_exec;        // NOTE(review): presumably the default notification script for alarms - confirm against users
    const char *health_default_recipient;   // NOTE(review): presumably the default notification recipient - confirm against users
    const char *log_filename;               // path of the alarm log file; the rotated copy is "<log_filename>.old"
    size_t log_entries_written;             // lines counted in the current log file; compared against the rotation threshold
    FILE *log_fp;                           // stream opened for appending to the alarm log, NULL while closed
};
12
// The single, file-local instance of the health options, initialized with compile-time defaults.
static struct health_options health = {
    .health_default_exec = PLUGINS_DIR "/alarm-notify.sh",
    .health_default_recipient = "root",
    .log_filename = VARLIB_DIR "/health/alarm_log.db",
    .log_entries_written = 0,
    .log_fp = NULL
};

// NOTE(review): presumably the global on/off switch of health monitoring (enabled by default);
// it is only defined in this part of the file - confirm against its users.
int health_enabled = 1;
22
23 // ----------------------------------------------------------------------------
24 // health alarm log load/save
25 // no need for locking - only one thread is reading / writing the alarms log
26
27 static inline int health_alarm_log_open(void) {
28     if(health.log_fp)
29         fclose(health.log_fp);
30
31     health.log_fp = fopen(health.log_filename, "a");
32
33     if(health.log_fp) {
34         if (setvbuf(health.log_fp, NULL, _IOLBF, 0) != 0)
35             error("Health: cannot set line buffering on health log file.");
36         return 0;
37     }
38
39     error("Health: cannot open health log file '%s'. Health data will be lost in case of netdata or server crash.", health.log_filename);
40     return -1;
41 }
42
43 static inline void health_alarm_log_close(void) {
44     if(health.log_fp) {
45         fclose(health.log_fp);
46         health.log_fp = NULL;
47     }
48 }
49
50 static inline void health_log_rotate(void) {
51     static size_t rotate_every = 0;
52
53     if(unlikely(rotate_every == 0)) {
54         rotate_every = (size_t)config_get_number("health", "rotate log every lines", 2000);
55         if(rotate_every < 100) rotate_every = 100;
56     }
57
58     if(unlikely(health.log_entries_written > rotate_every)) {
59         health_alarm_log_close();
60
61         char old_filename[FILENAME_MAX + 1];
62         snprintfz(old_filename, FILENAME_MAX, "%s.old", health.log_filename);
63
64         if(unlink(old_filename) == -1 && errno != ENOENT)
65             error("Health: cannot remove old alarms log file '%s'", old_filename);
66
67         if(link(health.log_filename, old_filename) == -1 && errno != ENOENT)
68             error("Health: cannot move file '%s' to '%s'.", health.log_filename, old_filename);
69
70         if(unlink(health.log_filename) == -1 && errno != ENOENT)
71             error("Health: cannot remove old alarms log file '%s'", health.log_filename);
72
73         // open it with truncate
74         health.log_fp = fopen(health.log_filename, "w");
75
76         if(health.log_fp)
77             fclose(health.log_fp);
78         else
79             error("Health: cannot truncate health log '%s'", health.log_filename);
80
81         health.log_fp = NULL;
82
83         health.log_entries_written = 0;
84         health_alarm_log_open();
85     }
86 }
87
88 static inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
89     health_log_rotate();
90
91     if(likely(health.log_fp)) {
92         if(unlikely(fprintf(health.log_fp
93                 , "%c\t%s"
94                   "\t%08x\t%08x\t%08x\t%08x\t%08x"
95                   "\t%08x\t%08x\t%08x"
96                   "\t%08x\t%08x\t%08x"
97                   "\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s"
98                   "\t%d\t%d\t%d\t%d"
99                   "\t%Lf\t%Lf"
100                   "\n"
101                 , (ae->flags & HEALTH_ENTRY_FLAG_SAVED)?'U':'A'
102                 , host->hostname
103
104                 , ae->unique_id
105                 , ae->alarm_id
106                 , ae->alarm_event_id
107                 , ae->updated_by_id
108                 , ae->updates_id
109
110                 , (uint32_t)ae->when
111                 , (uint32_t)ae->duration
112                 , (uint32_t)ae->non_clear_duration
113                 , (uint32_t)ae->flags
114                 , (uint32_t)ae->exec_run_timestamp
115                 , (uint32_t)ae->delay_up_to_timestamp
116
117                 , (ae->name)?ae->name:""
118                 , (ae->chart)?ae->chart:""
119                 , (ae->family)?ae->family:""
120                 , (ae->exec)?ae->exec:""
121                 , (ae->recipient)?ae->recipient:""
122                 , (ae->source)?ae->source:""
123                 , (ae->units)?ae->units:""
124                 , (ae->info)?ae->info:""
125
126                 , ae->exec_code
127                 , ae->new_status
128                 , ae->old_status
129                 , ae->delay
130
131                 , (long double)ae->new_value
132                 , (long double)ae->old_value
133         ) < 0))
134             error("Health: failed to save alarm log entry. Health data may be lost in case of abnormal restart.");
135         else {
136             ae->flags |= HEALTH_ENTRY_FLAG_SAVED;
137             health.log_entries_written++;
138         }
139     }
140 }
141
142 static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filename) {
143     static uint32_t max_unique_id = 0, max_alarm_id = 0;
144     ssize_t loaded = -1, updated = -1, errored = -1, duplicate = -1;
145
146     errno = 0;
147
148     char *s, *buf = mallocz(65536 + 1);
149     size_t line = 0, len = 0;
150     loaded = updated = errored = duplicate = 0;
151
152     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
153
154     while((s = fgets_trim_len(buf, 65536, fp, &len))) {
155         health.log_entries_written++;
156         line++;
157
158         int max_entries = 30, entries = 0;
159         char *pointers[max_entries];
160
161         pointers[entries++] = s++;
162         while(*s) {
163             if(unlikely(*s == '\t')) {
164                 *s = '\0';
165                 pointers[entries++] = ++s;
166                 if(entries >= max_entries) {
167                     error("Health: line %zu of file '%s' has more than %d entries. Ignoring excessive entries.", line, filename, max_entries);
168                     break;
169                 }
170             }
171             else s++;
172         }
173
174         if(likely(*pointers[0] == 'U' || *pointers[0] == 'A')) {
175             ALARM_ENTRY *ae = NULL;
176
177             if(entries < 26) {
178                 error("Health: line %zu of file '%s' should have at least 26 entries, but it has %d. Ignoring it.", line, filename, entries);
179                 errored++;
180                 continue;
181             }
182
183             // check that we have valid ids
184             uint32_t unique_id = (uint32_t)strtoul(pointers[2], NULL, 16);
185             if(!unique_id) {
186                 error("Health: line %zu of file '%s' states alarm entry with invalid unique id %u (%s). Ignoring it.", line, filename, unique_id, pointers[2]);
187                 errored++;
188                 continue;
189             }
190
191             uint32_t alarm_id = (uint32_t)strtoul(pointers[3], NULL, 16);
192             if(!alarm_id) {
193                 error("Health: line %zu of file '%s' states alarm entry for invalid alarm id %u (%s). Ignoring it.", line, filename, alarm_id, pointers[3]);
194                 errored++;
195                 continue;
196             }
197
198             if(unlikely(*pointers[0] == 'A')) {
199                 // make sure it is properly numbered
200                 if(unlikely(host->health_log.alarms && unique_id < host->health_log.alarms->unique_id)) {
201                     error("Health: line %zu of file '%s' has alarm log entry with %u in wrong order. Ignoring it.", line, filename, unique_id);
202                     errored++;
203                     continue;
204                 }
205
206                 ae = callocz(1, sizeof(ALARM_ENTRY));
207             }
208             else if(unlikely(*pointers[0] == 'U')) {
209                 // find the original
210                 for(ae = host->health_log.alarms; ae; ae = ae->next) {
211                     if(unlikely(unique_id == ae->unique_id)) {
212                         if(unlikely(*pointers[0] == 'A')) {
213                             error("Health: line %zu of file '%s' adds duplicate alarm log entry with unique id %u. Using the later."
214                                   , line, filename, unique_id);
215                             *pointers[0] = 'U';
216                             duplicate++;
217                         }
218                         break;
219                     }
220                     else if(unlikely(unique_id > ae->unique_id)) {
221                         // no need to continue
222                         // the linked list is sorted
223                         ae = NULL;
224                         break;
225                     }
226                 }
227
228                 // if not found, skip this line
229                 if(!ae) {
230                     // error("Health: line %zu of file '%s' updates alarm log entry with unique id %u, but it is not found.", line, filename, unique_id);
231                     continue;
232                 }
233             }
234
235             // check for a possible host missmatch
236             //if(strcmp(pointers[1], host->hostname))
237             //    error("Health: line %zu of file '%s' provides an alarm for host '%s' but this is named '%s'.", line, filename, pointers[1], host->hostname);
238
239             ae->unique_id               = unique_id;
240             ae->alarm_id                = alarm_id;
241             ae->alarm_event_id          = (uint32_t)strtoul(pointers[4], NULL, 16);
242             ae->updated_by_id           = (uint32_t)strtoul(pointers[5], NULL, 16);
243             ae->updates_id              = (uint32_t)strtoul(pointers[6], NULL, 16);
244
245             ae->when                    = (uint32_t)strtoul(pointers[7], NULL, 16);
246             ae->duration                = (uint32_t)strtoul(pointers[8], NULL, 16);
247             ae->non_clear_duration      = (uint32_t)strtoul(pointers[9], NULL, 16);
248
249             ae->flags                   = (uint32_t)strtoul(pointers[10], NULL, 16);
250             ae->flags |= HEALTH_ENTRY_FLAG_SAVED;
251
252             ae->exec_run_timestamp      = (uint32_t)strtoul(pointers[11], NULL, 16);
253             ae->delay_up_to_timestamp   = (uint32_t)strtoul(pointers[12], NULL, 16);
254
255             if(unlikely(ae->name)) freez(ae->name);
256             ae->name = strdupz(pointers[13]);
257             ae->hash_name = simple_hash(ae->name);
258
259             if(unlikely(ae->chart)) freez(ae->chart);
260             ae->chart = strdupz(pointers[14]);
261             ae->hash_chart = simple_hash(ae->chart);
262
263             if(unlikely(ae->family)) freez(ae->family);
264             ae->family = strdupz(pointers[15]);
265
266             if(unlikely(ae->exec)) freez(ae->exec);
267             ae->exec = strdupz(pointers[16]);
268             if(!*ae->exec) { freez(ae->exec); ae->exec = NULL; }
269
270             if(unlikely(ae->recipient)) freez(ae->recipient);
271             ae->recipient = strdupz(pointers[17]);
272             if(!*ae->recipient) { freez(ae->recipient); ae->recipient = NULL; }
273
274             if(unlikely(ae->source)) freez(ae->source);
275             ae->source = strdupz(pointers[18]);
276             if(!*ae->source) { freez(ae->source); ae->source = NULL; }
277
278             if(unlikely(ae->units)) freez(ae->units);
279             ae->units = strdupz(pointers[19]);
280             if(!*ae->units) { freez(ae->units); ae->units = NULL; }
281
282             if(unlikely(ae->info)) freez(ae->info);
283             ae->info = strdupz(pointers[20]);
284             if(!*ae->info) { freez(ae->info); ae->info = NULL; }
285
286             ae->exec_code   = atoi(pointers[21]);
287             ae->new_status  = atoi(pointers[22]);
288             ae->old_status  = atoi(pointers[23]);
289             ae->delay       = atoi(pointers[24]);
290
291             ae->new_value   = strtold(pointers[25], NULL);
292             ae->old_value   = strtold(pointers[26], NULL);
293
294             // add it to host if not already there
295             if(unlikely(*pointers[0] == 'A')) {
296                 ae->next = host->health_log.alarms;
297                 host->health_log.alarms = ae;
298                 loaded++;
299             }
300             else updated++;
301
302             if(unlikely(ae->unique_id > max_unique_id))
303                 max_unique_id = ae->unique_id;
304
305             if(unlikely(ae->alarm_id >= max_alarm_id))
306                 max_alarm_id = ae->alarm_id;
307         }
308         else {
309             error("Health: line %zu of file '%s' is invalid (unrecognized entry type '%s').", line, filename, pointers[0]);
310             errored++;
311         }
312     }
313
314     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
315
316     freez(buf);
317
318     if(!max_unique_id) max_unique_id = (uint32_t)time(NULL);
319     if(!max_alarm_id)  max_alarm_id  = (uint32_t)time(NULL);
320
321     host->health_log.next_log_id = max_unique_id + 1;
322     host->health_log.next_alarm_id = max_alarm_id + 1;
323
324     debug(D_HEALTH, "Health: loaded file '%s' with %zd new alarm entries, updated %zd alarms, errors %zd entries, duplicate %zd", filename, loaded, updated, errored, duplicate);
325     return loaded;
326 }
327
328 static inline void health_alarm_log_load(RRDHOST *host) {
329     health_alarm_log_close();
330
331     char filename[FILENAME_MAX + 1];
332     snprintfz(filename, FILENAME_MAX, "%s.old", health.log_filename);
333     FILE *fp = fopen(filename, "r");
334     if(!fp)
335         error("Health: cannot open health file: %s", filename);
336     else {
337         health_alarm_log_read(host, fp, filename);
338         fclose(fp);
339     }
340
341     health.log_entries_written = 0;
342     fp = fopen(health.log_filename, "r");
343     if(!fp)
344         error("Health: cannot open health file: %s", health.log_filename);
345     else {
346         health_alarm_log_read(host, fp, health.log_filename);
347         fclose(fp);
348     }
349
350     health_alarm_log_open();
351 }
352
353
354 // ----------------------------------------------------------------------------
355 // health alarm log management
356
357 static inline void health_alarm_log(RRDHOST *host,
358                 uint32_t alarm_id, uint32_t alarm_event_id,
359                 time_t when,
360                 const char *name, const char *chart, const char *family,
361                 const char *exec, const char *recipient, time_t duration,
362                 calculated_number old_value, calculated_number new_value,
363                 int old_status, int new_status,
364                 const char *source,
365                 const char *units,
366                 const char *info,
367                 int delay
368 ) {
369     debug(D_HEALTH, "Health adding alarm log entry with id: %u", host->health_log.next_log_id);
370
371     ALARM_ENTRY *ae = callocz(1, sizeof(ALARM_ENTRY));
372     ae->name = strdupz(name);
373     ae->hash_name = simple_hash(ae->name);
374
375     if(chart) {
376         ae->chart = strdupz(chart);
377         ae->hash_chart = simple_hash(ae->chart);
378     }
379
380     if(family)
381         ae->family = strdupz(family);
382
383     if(exec) ae->exec = strdupz(exec);
384     if(recipient) ae->recipient = strdupz(recipient);
385     if(source) ae->source = strdupz(source);
386     if(units) ae->units = strdupz(units);
387     if(info) ae->info = strdupz(info);
388
389     ae->unique_id = host->health_log.next_log_id++;
390     ae->alarm_id = alarm_id;
391     ae->alarm_event_id = alarm_event_id;
392     ae->when = when;
393     ae->old_value = old_value;
394     ae->new_value = new_value;
395     ae->old_status = old_status;
396     ae->new_status = new_status;
397     ae->duration = duration;
398     ae->delay = delay;
399     ae->delay_up_to_timestamp = when + delay;
400
401     if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL)
402         ae->non_clear_duration += ae->duration;
403
404     // link it
405     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
406     ae->next = host->health_log.alarms;
407     host->health_log.alarms = ae;
408     host->health_log.count++;
409     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
410
411     // match previous alarms
412     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
413     ALARM_ENTRY *t;
414     for(t = host->health_log.alarms ; t ; t = t->next) {
415         if(t != ae && t->alarm_id == ae->alarm_id) {
416             if(!(t->flags & HEALTH_ENTRY_FLAG_UPDATED) && !t->updated_by_id) {
417                 t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
418                 t->updated_by_id = ae->unique_id;
419                 ae->updates_id = t->unique_id;
420
421                 if((t->new_status == RRDCALC_STATUS_WARNING || t->new_status == RRDCALC_STATUS_CRITICAL) &&
422                    (t->old_status == RRDCALC_STATUS_WARNING || t->old_status == RRDCALC_STATUS_CRITICAL))
423                     ae->non_clear_duration += t->non_clear_duration;
424
425                 health_alarm_log_save(host, t);
426             }
427
428             // no need to continue
429             break;
430         }
431     }
432     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
433
434     health_alarm_log_save(host, ae);
435 }
436
437 // ----------------------------------------------------------------------------
438 // RRDVAR management
439
// Sanitizes a variable name in place: every character that is not
// alphanumeric, '.' or '_' is replaced with '_'.
// Returns the number of characters replaced.
static inline int rrdvar_fix_name(char *variable) {
    int fixed = 0;

    for( ; *variable ; variable++) {
        // <ctype.h> functions are only defined for unsigned char values (and EOF),
        // so bytes >= 0x80 must not be passed as a possibly negative plain char
        unsigned char c = (unsigned char)*variable;

        if(!isalnum(c) && c != '.' && c != '_') {
            *variable = '_';
            fixed++;
        }
    }

    return fixed;
}
453
454 int rrdvar_compare(void* a, void* b) {
455     if(((RRDVAR *)a)->hash < ((RRDVAR *)b)->hash) return -1;
456     else if(((RRDVAR *)a)->hash > ((RRDVAR *)b)->hash) return 1;
457     else return strcmp(((RRDVAR *)a)->name, ((RRDVAR *)b)->name);
458 }
459
460 static inline RRDVAR *rrdvar_index_add(avl_tree_lock *tree, RRDVAR *rv) {
461     RRDVAR *ret = (RRDVAR *)avl_insert_lock(tree, (avl *)(rv));
462     if(ret != rv)
463         debug(D_VARIABLES, "Request to insert RRDVAR '%s' into index failed. Already exists.", rv->name);
464
465     return ret;
466 }
467
468 static inline RRDVAR *rrdvar_index_del(avl_tree_lock *tree, RRDVAR *rv) {
469     RRDVAR *ret = (RRDVAR *)avl_remove_lock(tree, (avl *)(rv));
470     if(!ret)
471         error("Request to remove RRDVAR '%s' from index failed. Not Found.", rv->name);
472
473     return ret;
474 }
475
476 static inline RRDVAR *rrdvar_index_find(avl_tree_lock *tree, const char *name, uint32_t hash) {
477     RRDVAR tmp;
478     tmp.name = (char *)name;
479     tmp.hash = (hash)?hash:simple_hash(tmp.name);
480
481     return (RRDVAR *)avl_search_lock(tree, (avl *)&tmp);
482 }
483
484 static inline void rrdvar_free(RRDHOST *host, avl_tree_lock *tree, RRDVAR *rv) {
485     (void)host;
486
487     if(!rv) return;
488
489     if(tree) {
490         debug(D_VARIABLES, "Deleting variable '%s'", rv->name);
491         rrdvar_index_del(tree, rv);
492     }
493
494     freez(rv->name);
495     freez(rv);
496 }
497
498 static inline RRDVAR *rrdvar_create_and_index(const char *scope, avl_tree_lock *tree, const char *name, int type, calculated_number *value) {
499     char *variable = strdupz(name);
500     rrdvar_fix_name(variable);
501     uint32_t hash = simple_hash(variable);
502
503     RRDVAR *rv = rrdvar_index_find(tree, variable, hash);
504     if(unlikely(!rv)) {
505         debug(D_VARIABLES, "Variable '%s' not found in scope '%s'. Creating a new one.", variable, scope);
506
507         rv = callocz(1, sizeof(RRDVAR));
508         rv->name = variable;
509         rv->hash = hash;
510         rv->type = type;
511         rv->value = value;
512
513         RRDVAR *ret = rrdvar_index_add(tree, rv);
514         if(unlikely(ret != rv)) {
515             debug(D_VARIABLES, "Variable '%s' in scope '%s' already exists", variable, scope);
516             rrdvar_free(NULL, NULL, rv);
517             rv = NULL;
518         }
519         else
520             debug(D_VARIABLES, "Variable '%s' created in scope '%s'", variable, scope);
521     }
522     else {
523         debug(D_VARIABLES, "Variable '%s' is already found in scope '%s'.", variable, scope);
524
525         // already exists
526         freez(variable);
527
528         // this is important
529         // it must return NULL - not the existing variable - or double-free will happen
530         rv = NULL;
531     }
532
533     return rv;
534 }
535
536 // ----------------------------------------------------------------------------
537 // RRDVAR lookup
538
539 static calculated_number rrdvar2number(RRDVAR *rv) {
540     switch(rv->type) {
541         case RRDVAR_TYPE_CALCULATED: {
542             calculated_number *n = (calculated_number *)rv->value;
543             return *n;
544         }
545
546         case RRDVAR_TYPE_TIME_T: {
547             time_t *n = (time_t *)rv->value;
548             return *n;
549         }
550
551         case RRDVAR_TYPE_COLLECTED: {
552             collected_number *n = (collected_number *)rv->value;
553             return *n;
554         }
555
556         case RRDVAR_TYPE_TOTAL: {
557             total_number *n = (total_number *)rv->value;
558             return *n;
559         }
560
561         case RRDVAR_TYPE_INT: {
562             int *n = (int *)rv->value;
563             return *n;
564         }
565
566         default:
567             error("I don't know how to convert RRDVAR type %d to calculated_number", rv->type);
568             return NAN;
569     }
570 }
571
572 int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, calculated_number *result) {
573     RRDSET *st = rc->rrdset;
574     RRDVAR *rv;
575
576     if(!st) return 0;
577
578     rv = rrdvar_index_find(&st->variables_root_index, variable, hash);
579     if(rv) {
580         *result = rrdvar2number(rv);
581         return 1;
582     }
583
584     rv = rrdvar_index_find(&st->rrdfamily->variables_root_index, variable, hash);
585     if(rv) {
586         *result = rrdvar2number(rv);
587         return 1;
588     }
589
590     rv = rrdvar_index_find(&st->rrdhost->variables_root_index, variable, hash);
591     if(rv) {
592         *result = rrdvar2number(rv);
593         return 1;
594     }
595
596     return 0;
597 }
598
599 // ----------------------------------------------------------------------------
600 // RRDVAR to JSON
601
// State passed to single_variable2json() while an index of variables is traversed.
struct variable2json_helper {
    BUFFER *buf;        // the buffer the JSON members are appended to
    size_t counter;     // members emitted so far in the current object; non-zero means a leading comma is needed
};
606
607 static void single_variable2json(void *entry, void *data) {
608     struct variable2json_helper *helper = (struct variable2json_helper *)data;
609     RRDVAR *rv = (RRDVAR *)entry;
610     calculated_number value = rrdvar2number(rv);
611
612     if(unlikely(isnan(value) || isinf(value)))
613         buffer_sprintf(helper->buf, "%s\n\t\t\"%s\": null", helper->counter?",":"", rv->name);
614     else
615         buffer_sprintf(helper->buf, "%s\n\t\t\"%s\": %0.5Lf", helper->counter?",":"", rv->name, (long double)value);
616
617     helper->counter++;
618 }
619
620 void health_api_v1_chart_variables2json(RRDSET *st, BUFFER *buf) {
621     struct variable2json_helper helper = {
622             .buf = buf,
623             .counter = 0
624     };
625
626     buffer_sprintf(buf, "{\n\t\"chart\": \"%s.%s\",\n\t\"chart_name\": \"%s.%s\",\n\t\"chart_variables\": {", st->type, st->id, st->type, st->name);
627     avl_traverse_lock(&st->variables_root_index, single_variable2json, (void *)&helper);
628     buffer_sprintf(buf, "\n\t},\n\t\"family\": \"%s\",\n\t\"family_variables\": {", st->family);
629     helper.counter = 0;
630     avl_traverse_lock(&st->rrdfamily->variables_root_index, single_variable2json, (void *)&helper);
631     buffer_sprintf(buf, "\n\t},\n\t\"host\": \"%s\",\n\t\"host_variables\": {", st->rrdhost->hostname);
632     helper.counter = 0;
633     avl_traverse_lock(&st->rrdhost->variables_root_index, single_variable2json, (void *)&helper);
634     buffer_strcat(buf, "\n\t}\n}\n");
635 }
636
637
638 // ----------------------------------------------------------------------------
639 // RRDDIMVAR management
640 // DIMENSION VARIABLES
641
642 #define RRDDIMVAR_ID_MAX 1024
643
644 static inline void rrddimvar_free_variables(RRDDIMVAR *rs) {
645     RRDDIM *rd = rs->rrddim;
646     RRDSET *st = rd->rrdset;
647
648     // CHART VARIABLES FOR THIS DIMENSION
649
650     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->var_local_id);
651     rs->var_local_id = NULL;
652
653     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->var_local_name);
654     rs->var_local_name = NULL;
655
656     // FAMILY VARIABLES FOR THIS DIMENSION
657
658     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_id);
659     rs->var_family_id = NULL;
660
661     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_name);
662     rs->var_family_name = NULL;
663
664     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_contextid);
665     rs->var_family_contextid = NULL;
666
667     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_contextname);
668     rs->var_family_contextname = NULL;
669
670     // HOST VARIABLES FOR THIS DIMENSION
671
672     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartidid);
673     rs->var_host_chartidid = NULL;
674
675     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartidname);
676     rs->var_host_chartidname = NULL;
677
678     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartnameid);
679     rs->var_host_chartnameid = NULL;
680
681     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartnamename);
682     rs->var_host_chartnamename = NULL;
683
684     // KEYS
685
686     freez(rs->key_id);
687     rs->key_id = NULL;
688
689     freez(rs->key_name);
690     rs->key_name = NULL;
691
692     freez(rs->key_fullidid);
693     rs->key_fullidid = NULL;
694
695     freez(rs->key_fullidname);
696     rs->key_fullidname = NULL;
697
698     freez(rs->key_contextid);
699     rs->key_contextid = NULL;
700
701     freez(rs->key_contextname);
702     rs->key_contextname = NULL;
703
704     freez(rs->key_fullnameid);
705     rs->key_fullnameid = NULL;
706
707     freez(rs->key_fullnamename);
708     rs->key_fullnamename = NULL;
709 }
710
711 static inline void rrddimvar_create_variables(RRDDIMVAR *rs) {
712     rrddimvar_free_variables(rs);
713
714     RRDDIM *rd = rs->rrddim;
715     RRDSET *st = rd->rrdset;
716
717     char buffer[RRDDIMVAR_ID_MAX + 1];
718
719     // KEYS
720
721     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->id, rs->suffix);
722     rs->key_id = strdupz(buffer);
723
724     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
725     rs->key_name = strdupz(buffer);
726
727     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->key_id);
728     rs->key_fullidid = strdupz(buffer);
729
730     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->key_name);
731     rs->key_fullidname = strdupz(buffer);
732
733     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->context, rs->key_id);
734     rs->key_contextid = strdupz(buffer);
735
736     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->context, rs->key_name);
737     rs->key_contextname = strdupz(buffer);
738
739     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->key_id);
740     rs->key_fullnameid = strdupz(buffer);
741
742     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->key_name);
743     rs->key_fullnamename = strdupz(buffer);
744
745     // CHART VARIABLES FOR THIS DIMENSION
746     // -----------------------------------
747     //
748     // dimensions are available as:
749     // - $id
750     // - $name
751
752     rs->var_local_id           = rrdvar_create_and_index("local", &st->variables_root_index, rs->key_id, rs->type, rs->value);
753     rs->var_local_name         = rrdvar_create_and_index("local", &st->variables_root_index, rs->key_name, rs->type, rs->value);
754
755     // FAMILY VARIABLES FOR THIS DIMENSION
756     // -----------------------------------
757     //
758     // dimensions are available as:
759     // - $id                 (only the first, when multiple overlap)
760     // - $name               (only the first, when multiple overlap)
761     // - $chart-context.id
762     // - $chart-context.name
763
764     rs->var_family_id          = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_id, rs->type, rs->value);
765     rs->var_family_name        = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_name, rs->type, rs->value);
766     rs->var_family_contextid   = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_contextid, rs->type, rs->value);
767     rs->var_family_contextname = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_contextname, rs->type, rs->value);
768
769     // HOST VARIABLES FOR THIS DIMENSION
770     // -----------------------------------
771     //
772     // dimensions are available as:
773     // - $chart-id.id
774     // - $chart-id.name
775     // - $chart-name.id
776     // - $chart-name.name
777
778     rs->var_host_chartidid      = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullidid, rs->type, rs->value);
779     rs->var_host_chartidname    = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullidname, rs->type, rs->value);
780     rs->var_host_chartnameid    = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullnameid, rs->type, rs->value);
781     rs->var_host_chartnamename  = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullnamename, rs->type, rs->value);
782 }
783
784 RRDDIMVAR *rrddimvar_create(RRDDIM *rd, int type, const char *prefix, const char *suffix, void *value, uint32_t options) {
785     RRDSET *st = rd->rrdset;
786
787     debug(D_VARIABLES, "RRDDIMSET create for chart id '%s' name '%s', dimension id '%s', name '%s%s%s'", st->id, st->name, rd->id, (prefix)?prefix:"", rd->name, (suffix)?suffix:"");
788
789     if(!prefix) prefix = "";
790     if(!suffix) suffix = "";
791
792     RRDDIMVAR *rs = (RRDDIMVAR *)callocz(1, sizeof(RRDDIMVAR));
793
794     rs->prefix = strdupz(prefix);
795     rs->suffix = strdupz(suffix);
796
797     rs->type = type;
798     rs->value = value;
799     rs->options = options;
800     rs->rrddim = rd;
801
802     rs->next = rd->variables;
803     rd->variables = rs;
804
805     rrddimvar_create_variables(rs);
806
807     return rs;
808 }
809
810 void rrddimvar_rename_all(RRDDIM *rd) {
811     RRDSET *st = rd->rrdset;
812     debug(D_VARIABLES, "RRDDIMSET rename for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
813
814     RRDDIMVAR *rs, *next = rd->variables;
815     while((rs = next)) {
816         next = rs->next;
817         rrddimvar_create_variables(rs);
818     }
819 }
820
821 void rrddimvar_free(RRDDIMVAR *rs) {
822     RRDDIM *rd = rs->rrddim;
823     RRDSET *st = rd->rrdset;
824     debug(D_VARIABLES, "RRDDIMSET free for chart id '%s' name '%s', dimension id '%s', name '%s', prefix='%s', suffix='%s'", st->id, st->name, rd->id, rd->name, rs->prefix, rs->suffix);
825
826     rrddimvar_free_variables(rs);
827
828     if(rd->variables == rs) {
829         debug(D_VARIABLES, "RRDDIMSET removing first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
830         rd->variables = rs->next;
831     }
832     else {
833         debug(D_VARIABLES, "RRDDIMSET removing non-first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
834         RRDDIMVAR *t;
835         for (t = rd->variables; t && t->next != rs; t = t->next) ;
836         if(!t) error("RRDDIMVAR '%s' not found in dimension '%s/%s' variables linked list", rs->key_name, st->id, rd->id);
837         else t->next = rs->next;
838     }
839
840     freez(rs->prefix);
841     freez(rs->suffix);
842     freez(rs);
843 }
844
845 // ----------------------------------------------------------------------------
846 // RRDSETVAR management
847 // CHART VARIABLES
848
849 static inline void rrdsetvar_free_variables(RRDSETVAR *rs) {
850     RRDSET *st = rs->rrdset;
851
852     // CHART
853
854     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->var_local);
855     rs->var_local = NULL;
856
857     // FAMILY
858
859     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family);
860     rs->var_family = NULL;
861
862     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host);
863     rs->var_host = NULL;
864
865     // HOST
866
867     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_name);
868     rs->var_family_name = NULL;
869
870     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_name);
871     rs->var_host_name = NULL;
872
873     // KEYS
874
875     freez(rs->key_fullid);
876     rs->key_fullid = NULL;
877
878     freez(rs->key_fullname);
879     rs->key_fullname = NULL;
880 }
881
882 static inline void rrdsetvar_create_variables(RRDSETVAR *rs) {
883     rrdsetvar_free_variables(rs);
884
885     RRDSET *st = rs->rrdset;
886
887     // KEYS
888
889     char buffer[RRDVAR_MAX_LENGTH + 1];
890     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->id, rs->variable);
891     rs->key_fullid = strdupz(buffer);
892
893     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rs->variable);
894     rs->key_fullname = strdupz(buffer);
895
896     // CHART
897
898     rs->var_local       = rrdvar_create_and_index("local",  &st->variables_root_index,               rs->variable, rs->type, rs->value);
899
900     // FAMILY
901
902     rs->var_family      = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index,    rs->key_fullid,   rs->type, rs->value);
903     rs->var_family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index,    rs->key_fullname, rs->type, rs->value);
904
905     // HOST
906
907     rs->var_host        = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index,      rs->key_fullid,   rs->type, rs->value);
908     rs->var_host_name   = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index,      rs->key_fullname, rs->type, rs->value);
909
910 }
911
912 RRDSETVAR *rrdsetvar_create(RRDSET *st, const char *variable, int type, void *value, uint32_t options) {
913     debug(D_VARIABLES, "RRDVARSET create for chart id '%s' name '%s' with variable name '%s'", st->id, st->name, variable);
914     RRDSETVAR *rs = (RRDSETVAR *)callocz(1, sizeof(RRDSETVAR));
915
916     rs->variable = strdupz(variable);
917     rs->type = type;
918     rs->value = value;
919     rs->options = options;
920     rs->rrdset = st;
921
922     rs->next = st->variables;
923     st->variables = rs;
924
925     rrdsetvar_create_variables(rs);
926
927     return rs;
928 }
929
930 void rrdsetvar_rename_all(RRDSET *st) {
931     debug(D_VARIABLES, "RRDSETVAR rename for chart id '%s' name '%s'", st->id, st->name);
932
933     RRDSETVAR *rs, *next = st->variables;
934     while((rs = next)) {
935         next = rs->next;
936         rrdsetvar_create_variables(rs);
937     }
938
939     rrdsetcalc_link_matching(st);
940 }
941
942 void rrdsetvar_free(RRDSETVAR *rs) {
943     RRDSET *st = rs->rrdset;
944     debug(D_VARIABLES, "RRDSETVAR free for chart id '%s' name '%s', variable '%s'", st->id, st->name, rs->variable);
945
946     if(st->variables == rs) {
947         st->variables = rs->next;
948     }
949     else {
950         RRDSETVAR *t;
951         for (t = st->variables; t && t->next != rs; t = t->next);
952         if(!t) error("RRDSETVAR '%s' not found in chart '%s' variables linked list", rs->key_fullname, st->id);
953         else t->next = rs->next;
954     }
955
956     rrdsetvar_free_variables(rs);
957
958     freez(rs->variable);
959     freez(rs);
960 }
961
962 // ----------------------------------------------------------------------------
963 // RRDCALC management
964
965 static inline const char *rrdcalc_status2string(int status) {
966     switch(status) {
967         case RRDCALC_STATUS_REMOVED:
968             return "REMOVED";
969
970         case RRDCALC_STATUS_UNDEFINED:
971             return "UNDEFINED";
972
973         case RRDCALC_STATUS_UNINITIALIZED:
974             return "UNINITIALIZED";
975
976         case RRDCALC_STATUS_CLEAR:
977             return "CLEAR";
978
979         case RRDCALC_STATUS_RAISED:
980             return "RAISED";
981
982         case RRDCALC_STATUS_WARNING:
983             return "WARNING";
984
985         case RRDCALC_STATUS_CRITICAL:
986             return "CRITICAL";
987
988         default:
989             error("Unknown alarm status %d", status);
990             return "UNKNOWN";
991     }
992 }
993
994 static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
995     debug(D_HEALTH, "Health linking alarm '%s.%s' to chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, st->rrdhost->hostname);
996
997     rc->last_status_change = time(NULL);
998     rc->rrdset = st;
999
1000     rc->rrdset_next = st->alarms;
1001     rc->rrdset_prev = NULL;
1002     
1003     if(rc->rrdset_next)
1004         rc->rrdset_next->rrdset_prev = rc;
1005
1006     st->alarms = rc;
1007
1008     if(rc->update_every < rc->rrdset->update_every) {
1009         error("Health alarm '%s.%s' has update every %d, less than chart update every %d. Setting alarm update frequency to %d.", rc->rrdset->id, rc->name, rc->update_every, rc->rrdset->update_every, rc->rrdset->update_every);
1010         rc->update_every = rc->rrdset->update_every;
1011     }
1012
1013     if(!isnan(rc->green) && isnan(st->green)) {
1014         debug(D_HEALTH, "Health alarm '%s.%s' green threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->green, rc->green);
1015         st->green = rc->green;
1016     }
1017
1018     if(!isnan(rc->red) && isnan(st->red)) {
1019         debug(D_HEALTH, "Health alarm '%s.%s' red threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->red, rc->red);
1020         st->red = rc->red;
1021     }
1022
1023     rc->local  = rrdvar_create_and_index("local",  &st->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
1024     rc->family = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
1025
1026     char fullname[RRDVAR_MAX_LENGTH + 1];
1027     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->id, rc->name);
1028     rc->hostid   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
1029
1030     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rc->name);
1031     rc->hostname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
1032
1033         if(!rc->units) rc->units = strdupz(st->units);
1034
1035     {
1036         time_t now = time(NULL);
1037         health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_UNINITIALIZED, rc->source, rc->units, rc->info, 0);
1038     }
1039 }
1040
1041 static inline int rrdcalc_is_matching_this_rrdset(RRDCALC *rc, RRDSET *st) {
1042     if(     (rc->hash_chart == st->hash      && !strcmp(rc->chart, st->id)) ||
1043             (rc->hash_chart == st->hash_name && !strcmp(rc->chart, st->name)))
1044         return 1;
1045
1046     return 0;
1047 }
1048
1049 // this has to be called while the RRDHOST is locked
1050 inline void rrdsetcalc_link_matching(RRDSET *st) {
1051     // debug(D_HEALTH, "find matching alarms for chart '%s'", st->id);
1052
1053     RRDCALC *rc;
1054     for(rc = st->rrdhost->alarms; rc ; rc = rc->next) {
1055         if(unlikely(rc->rrdset))
1056             continue;
1057
1058         if(unlikely(rrdcalc_is_matching_this_rrdset(rc, st)))
1059             rrdsetcalc_link(st, rc);
1060     }
1061 }
1062
1063 // this has to be called while the RRDHOST is locked
1064 inline void rrdsetcalc_unlink(RRDCALC *rc) {
1065     RRDSET *st = rc->rrdset;
1066
1067     if(!st) {
1068         debug(D_HEALTH, "Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
1069         error("Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
1070         return;
1071     }
1072
1073     {
1074         time_t now = time(NULL);
1075         health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0);
1076     }
1077
1078     RRDHOST *host = st->rrdhost;
1079
1080     debug(D_HEALTH, "Health unlinking alarm '%s.%s' from chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, host->hostname);
1081
1082     // unlink it
1083     if(rc->rrdset_prev)
1084         rc->rrdset_prev->rrdset_next = rc->rrdset_next;
1085
1086     if(rc->rrdset_next)
1087         rc->rrdset_next->rrdset_prev = rc->rrdset_prev;
1088
1089     if(st->alarms == rc)
1090         st->alarms = rc->rrdset_next;
1091
1092     rc->rrdset_prev = rc->rrdset_next = NULL;
1093
1094     rrdvar_free(st->rrdhost, &st->variables_root_index, rc->local);
1095     rc->local = NULL;
1096
1097     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rc->family);
1098     rc->family = NULL;
1099
1100     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostid);
1101     rc->hostid = NULL;
1102
1103     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostname);
1104     rc->hostname = NULL;
1105
1106     rc->rrdset = NULL;
1107
1108     // RRDCALC will remain in RRDHOST
1109     // so that if the matching chart is found in the future
1110     // it will be applied automatically
1111 }
1112
1113 RRDCALC *rrdcalc_find(RRDSET *st, const char *name) {
1114     RRDCALC *rc;
1115     uint32_t hash = simple_hash(name);
1116
1117     for( rc = st->alarms; rc ; rc = rc->rrdset_next ) {
1118         if(unlikely(rc->hash == hash && !strcmp(rc->name, name)))
1119             return rc;
1120     }
1121
1122     return NULL;
1123 }
1124
1125 static inline int rrdcalc_exists(RRDHOST *host, const char *chart, const char *name, uint32_t hash_chart, uint32_t hash_name) {
1126     RRDCALC *rc;
1127
1128     if(unlikely(!chart)) {
1129         error("attempt to find RRDCALC '%s' without giving a chart name", name);
1130         return 1;
1131     }
1132
1133     if(unlikely(!hash_chart)) hash_chart = simple_hash(chart);
1134     if(unlikely(!hash_name))  hash_name  = simple_hash(name);
1135
1136     // make sure it does not already exist
1137     for(rc = host->alarms; rc ; rc = rc->next) {
1138         if (unlikely(rc->chart && rc->hash == hash_name && rc->hash_chart == hash_chart && !strcmp(name, rc->name) && !strcmp(chart, rc->chart))) {
1139             debug(D_HEALTH, "Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
1140             error("Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
1141             return 1;
1142         }
1143     }
1144
1145     return 0;
1146 }
1147
1148 static inline uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const char *name, uint32_t *next_event_id) {
1149     if(chart && name) {
1150         uint32_t hash_chart = simple_hash(chart);
1151         uint32_t hash_name = simple_hash(name);
1152
1153         // re-use old IDs, by looking them up in the alarm log
1154         ALARM_ENTRY *ae;
1155         for(ae = host->health_log.alarms; ae ;ae = ae->next) {
1156             if(unlikely(ae->hash_name == hash_name && ae->hash_chart == hash_chart && !strcmp(name, ae->name) && !strcmp(chart, ae->chart))) {
1157                 if(next_event_id) *next_event_id = ae->alarm_event_id + 1;
1158                 return ae->alarm_id;
1159             }
1160         }
1161     }
1162
1163     return host->health_log.next_alarm_id++;
1164 }
1165
1166 static inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) {
1167     rrdhost_check_rdlock(host);
1168
1169     if(rc->calculation) {
1170         rc->calculation->status = &rc->status;
1171         rc->calculation->this = &rc->value;
1172         rc->calculation->after = &rc->db_after;
1173         rc->calculation->before = &rc->db_before;
1174         rc->calculation->rrdcalc = rc;
1175     }
1176
1177     if(rc->warning) {
1178         rc->warning->status = &rc->status;
1179         rc->warning->this = &rc->value;
1180         rc->warning->after = &rc->db_after;
1181         rc->warning->before = &rc->db_before;
1182         rc->warning->rrdcalc = rc;
1183     }
1184
1185     if(rc->critical) {
1186         rc->critical->status = &rc->status;
1187         rc->critical->this = &rc->value;
1188         rc->critical->after = &rc->db_after;
1189         rc->critical->before = &rc->db_before;
1190         rc->critical->rrdcalc = rc;
1191     }
1192
1193     // link it to the host
1194     if(likely(host->alarms)) {
1195         // append it
1196         RRDCALC *t;
1197         for(t = host->alarms; t && t->next ; t = t->next) ;
1198         t->next = rc;
1199     }
1200     else {
1201         host->alarms = rc;
1202     }
1203
1204     // link it to its chart
1205     RRDSET *st;
1206     for(st = host->rrdset_root; st ; st = st->next) {
1207         if(rrdcalc_is_matching_this_rrdset(rc, st)) {
1208             rrdsetcalc_link(st, rc);
1209             break;
1210         }
1211     }
1212 }
1213
1214 static inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart) {
1215
1216     debug(D_HEALTH, "Health creating dynamic alarm (from template) '%s.%s'", chart, rt->name);
1217
1218     if(rrdcalc_exists(host, chart, rt->name, 0, 0))
1219         return NULL;
1220
1221     RRDCALC *rc = callocz(1, sizeof(RRDCALC));
1222     rc->next_event_id = 1;
1223     rc->id = rrdcalc_get_unique_id(host, chart, rt->name, &rc->next_event_id);
1224     rc->name = strdupz(rt->name);
1225     rc->hash = simple_hash(rc->name);
1226     rc->chart = strdupz(chart);
1227     rc->hash_chart = simple_hash(rc->chart);
1228
1229     if(rt->dimensions) rc->dimensions = strdupz(rt->dimensions);
1230
1231     rc->green = rt->green;
1232     rc->red = rt->red;
1233     rc->value = NAN;
1234     rc->old_value = NAN;
1235
1236     rc->delay_up_duration = rt->delay_up_duration;
1237     rc->delay_down_duration = rt->delay_down_duration;
1238     rc->delay_max_duration = rt->delay_max_duration;
1239     rc->delay_multiplier = rt->delay_multiplier;
1240
1241     rc->group = rt->group;
1242     rc->after = rt->after;
1243     rc->before = rt->before;
1244     rc->update_every = rt->update_every;
1245     rc->options = rt->options;
1246
1247     if(rt->exec) rc->exec = strdupz(rt->exec);
1248     if(rt->recipient) rc->recipient = strdupz(rt->recipient);
1249     if(rt->source) rc->source = strdupz(rt->source);
1250     if(rt->units) rc->units = strdupz(rt->units);
1251     if(rt->info) rc->info = strdupz(rt->info);
1252
1253     if(rt->calculation) {
1254         rc->calculation = expression_parse(rt->calculation->source, NULL, NULL);
1255         if(!rc->calculation)
1256             error("Health alarm '%s.%s': failed to parse calculation expression '%s'", chart, rt->name, rt->calculation->source);
1257     }
1258     if(rt->warning) {
1259         rc->warning = expression_parse(rt->warning->source, NULL, NULL);
1260         if(!rc->warning)
1261             error("Health alarm '%s.%s': failed to re-parse warning expression '%s'", chart, rt->name, rt->warning->source);
1262     }
1263     if(rt->critical) {
1264         rc->critical = expression_parse(rt->critical->source, NULL, NULL);
1265         if(!rc->critical)
1266             error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", chart, rt->name, rt->critical->source);
1267     }
1268
1269     debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1270           (rc->chart)?rc->chart:"NOCHART",
1271           rc->name,
1272           (rc->exec)?rc->exec:"DEFAULT",
1273           (rc->recipient)?rc->recipient:"DEFAULT",
1274           rc->green,
1275           rc->red,
1276           rc->group,
1277           rc->after,
1278           rc->before,
1279           rc->options,
1280           (rc->dimensions)?rc->dimensions:"NONE",
1281           rc->update_every,
1282           (rc->calculation)?rc->calculation->parsed_as:"NONE",
1283           (rc->warning)?rc->warning->parsed_as:"NONE",
1284           (rc->critical)?rc->critical->parsed_as:"NONE",
1285           rc->source,
1286           rc->delay_up_duration,
1287           rc->delay_down_duration,
1288           rc->delay_max_duration,
1289           rc->delay_multiplier
1290     );
1291
1292     rrdcalc_create_part2(host, rc);
1293     return rc;
1294 }
1295
1296 void rrdcalc_free(RRDHOST *host, RRDCALC *rc) {
1297     if(!rc) return;
1298
1299     debug(D_HEALTH, "Health removing alarm '%s.%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1300
1301     // unlink it from RRDSET
1302     if(rc->rrdset) rrdsetcalc_unlink(rc);
1303
1304     // unlink it from RRDHOST
1305     if(unlikely(rc == host->alarms))
1306         host->alarms = rc->next;
1307
1308     else if(likely(host->alarms)) {
1309         RRDCALC *t, *last = host->alarms;
1310         for(t = last->next; t && t != rc; last = t, t = t->next) ;
1311         if(last->next == rc)
1312             last->next = rc->next;
1313         else
1314             error("Cannot unlink alarm '%s.%s' from host '%s': not found", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1315     }
1316     else
1317         error("Cannot unlink unlink '%s.%s' from host '%s': This host does not have any calculations", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1318
1319     expression_free(rc->calculation);
1320     expression_free(rc->warning);
1321     expression_free(rc->critical);
1322
1323     freez(rc->name);
1324     freez(rc->chart);
1325     freez(rc->family);
1326     freez(rc->dimensions);
1327     freez(rc->exec);
1328     freez(rc->recipient);
1329     freez(rc->source);
1330     freez(rc->units);
1331     freez(rc->info);
1332     freez(rc);
1333 }
1334
1335 // ----------------------------------------------------------------------------
1336 // RRDCALCTEMPLATE management
1337
1338 void rrdcalctemplate_link_matching(RRDSET *st) {
1339     RRDCALCTEMPLATE *rt;
1340
1341     for(rt = st->rrdhost->templates; rt ; rt = rt->next) {
1342         if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context)) {
1343             RRDCALC *rc = rrdcalc_create(st->rrdhost, rt, st->id);
1344             if(unlikely(!rc))
1345                 error("Health tried to create alarm from template '%s', but it failed", rt->name);
1346
1347 #ifdef NETDATA_INTERNAL_CHECKS
1348             else if(rc->rrdset != st)
1349                 error("Health alarm '%s.%s' should be linked to chart '%s', but it is not", rc->chart?rc->chart:"NOCHART", rc->name, st->id);
1350 #endif
1351         }
1352     }
1353 }
1354
1355 static inline void rrdcalctemplate_free(RRDHOST *host, RRDCALCTEMPLATE *rt) {
1356     debug(D_HEALTH, "Health removing template '%s' of host '%s'", rt->name, host->hostname);
1357
1358     if(host->templates) {
1359         if(host->templates == rt) {
1360             host->templates = rt->next;
1361         }
1362         else {
1363             RRDCALCTEMPLATE *t, *last = host->templates;
1364             for (t = last->next; t && t != rt; last = t, t = t->next ) ;
1365             if(last && last->next == rt) {
1366                 last->next = rt->next;
1367                 rt->next = NULL;
1368             }
1369             else
1370                 error("Cannot find RRDCALCTEMPLATE '%s' linked in host '%s'", rt->name, host->hostname);
1371         }
1372     }
1373
1374     expression_free(rt->calculation);
1375     expression_free(rt->warning);
1376     expression_free(rt->critical);
1377
1378     freez(rt->name);
1379     freez(rt->exec);
1380     freez(rt->recipient);
1381     freez(rt->context);
1382     freez(rt->source);
1383     freez(rt->units);
1384     freez(rt->info);
1385     freez(rt->dimensions);
1386     freez(rt);
1387 }
1388
1389 // ----------------------------------------------------------------------------
1390 // load health configuration
1391
// NOTE(review): presumably the size limit for one line of a health
// configuration file - the reader is outside this view, confirm there.
#define HEALTH_CONF_MAX_LINE 4096

// Keywords of the health configuration.
// HEALTH_EVERY_KEY is also compared (case insensitively) against the
// optional parameters of a database lookup, in health_parse_db_lookup().
#define HEALTH_ALARM_KEY "alarm"
#define HEALTH_TEMPLATE_KEY "template"
#define HEALTH_ON_KEY "on"
#define HEALTH_LOOKUP_KEY "lookup"
#define HEALTH_CALC_KEY "calc"
#define HEALTH_EVERY_KEY "every"
#define HEALTH_GREEN_KEY "green"
#define HEALTH_RED_KEY "red"
#define HEALTH_WARN_KEY "warn"
#define HEALTH_CRIT_KEY "crit"
#define HEALTH_EXEC_KEY "exec"
#define HEALTH_RECIPIENT_KEY "to"
#define HEALTH_UNITS_KEY "units"
#define HEALTH_INFO_KEY "info"
#define HEALTH_DELAY_KEY "delay"
1409
1410 static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
1411     if(!rc->chart) {
1412         error("Health configuration for alarm '%s' does not have a chart", rc->name);
1413         return 0;
1414     }
1415
1416     if(!rc->update_every) {
1417         error("Health configuration for alarm '%s.%s' has no frequency (parameter 'every'). Ignoring it.", rc->chart?rc->chart:"NOCHART", rc->name);
1418         return 0;
1419     }
1420
1421     if(!RRDCALC_HAS_DB_LOOKUP(rc) && !rc->warning && !rc->critical) {
1422         error("Health configuration for alarm '%s.%s' is useless (no calculation, no warning and no critical evaluation)", rc->chart?rc->chart:"NOCHART", rc->name);
1423         return 0;
1424     }
1425
1426     if (rrdcalc_exists(host, rc->chart, rc->name, rc->hash_chart, rc->hash))
1427         return 0;
1428
1429     rc->id = rrdcalc_get_unique_id(&localhost, rc->chart, rc->name, &rc->next_event_id);
1430
1431     debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1432           rc->chart?rc->chart:"NOCHART",
1433           rc->name,
1434           rc->id,
1435           (rc->exec)?rc->exec:"DEFAULT",
1436           (rc->recipient)?rc->recipient:"DEFAULT",
1437           rc->green,
1438           rc->red,
1439           rc->group,
1440           rc->after,
1441           rc->before,
1442           rc->options,
1443           (rc->dimensions)?rc->dimensions:"NONE",
1444           rc->update_every,
1445           (rc->calculation)?rc->calculation->parsed_as:"NONE",
1446           (rc->warning)?rc->warning->parsed_as:"NONE",
1447           (rc->critical)?rc->critical->parsed_as:"NONE",
1448           rc->source,
1449           rc->delay_up_duration,
1450           rc->delay_down_duration,
1451           rc->delay_max_duration,
1452           rc->delay_multiplier
1453     );
1454
1455     rrdcalc_create_part2(host, rc);
1456     return 1;
1457 }
1458
1459 static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCALCTEMPLATE *rt) {
1460     if(unlikely(!rt->context)) {
1461         error("Health configuration for template '%s' does not have a context", rt->name);
1462         return 0;
1463     }
1464
1465     if(unlikely(!rt->update_every)) {
1466         error("Health configuration for template '%s' has no frequency (parameter 'every'). Ignoring it.", rt->name);
1467         return 0;
1468     }
1469
1470     if(unlikely(!RRDCALCTEMPLATE_HAS_CALCULATION(rt) && !rt->warning && !rt->critical)) {
1471         error("Health configuration for template '%s' is useless (no calculation, no warning and no critical evaluation)", rt->name);
1472         return 0;
1473     }
1474
1475     RRDCALCTEMPLATE *t, *last = NULL;
1476     for (t = host->templates; t ; last = t, t = t->next) {
1477         if(unlikely(t->hash_name == rt->hash_name && !strcmp(t->name, rt->name))) {
1478             error("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname);
1479             return 0;
1480         }
1481     }
1482
1483     debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1484           rt->name,
1485           (rt->context)?rt->context:"NONE",
1486           (rt->exec)?rt->exec:"DEFAULT",
1487           (rt->recipient)?rt->recipient:"DEFAULT",
1488           rt->green,
1489           rt->red,
1490           rt->group,
1491           rt->after,
1492           rt->before,
1493           rt->options,
1494           (rt->dimensions)?rt->dimensions:"NONE",
1495           rt->update_every,
1496           (rt->calculation)?rt->calculation->parsed_as:"NONE",
1497           (rt->warning)?rt->warning->parsed_as:"NONE",
1498           (rt->critical)?rt->critical->parsed_as:"NONE",
1499           rt->source,
1500           rt->delay_up_duration,
1501           rt->delay_down_duration,
1502           rt->delay_max_duration,
1503           rt->delay_multiplier
1504     );
1505
1506     if(likely(last)) {
1507         last->next = rt;
1508     }
1509     else {
1510         rt->next = host->templates;
1511         host->templates = rt;
1512     }
1513
1514     return 1;
1515 }
1516
// Parse a duration to seconds.
//
// 'string' has to start with a digit or a sign. The number is parsed with
// strtold() and the first character after it selects the unit:
//
//   Y = 365 days, M = 30 days, w = weeks, d = days,
//   h = hours,    m = minutes, s = seconds
//
// Without a unit, or with any other character, the number is seconds.
// The result is truncated to an integer.
//
// Returns 1 and sets '*result' on success.
// Returns 0 and sets '*result' to 0 when 'string' is not a finite number.
static inline int health_parse_duration(char *string, int *result) {
    // make sure it is a number
    // (isdigit() is only defined for values representable as unsigned char)
    if(!*string || !(isdigit((unsigned char)*string) || *string == '+' || *string == '-')) {
        *result = 0;
        return 0;
    }

    char *e = NULL;
    long double n = strtold(string, &e);

    // a sign alone converts nothing, and "+inf" or "-nan" pass the check
    // above but cannot be converted to an integer
    if(e == string || isnan(n) || isinf(n)) {
        *result = 0;
        return 0;
    }

    // *e is the unit; at the end of the string it is '\0' and the default
    // (seconds) applies
    switch (*e) {
        case 'Y':
            *result = (int) (n * 86400 * 365);
            break;
        case 'M':
            *result = (int) (n * 86400 * 30);
            break;
        case 'w':
            *result = (int) (n * 86400 * 7);
            break;
        case 'd':
            *result = (int) (n * 86400);
            break;
        case 'h':
            *result = (int) (n * 3600);
            break;
        case 'm':
            *result = (int) (n * 60);
            break;

        default:
        case 's':
            *result = (int) (n);
            break;
    }

    return 1;
}
1558
// Parse the value of a 'delay' configuration line.
//
// 'string' is a whitespace separated list of "keyword value" pairs and is
// modified in place (the whitespace is overwritten with '\0'). The
// keywords are matched case insensitively:
//
//   up DURATION, down DURATION, max DURATION, multiplier NUMBER
//
// Invalid values and unknown keywords are logged (using 'line', 'path' and
// 'file' to point to the configuration) and ignored.
//
// Anything not given is set to its default: 0 for up and down, 1.0 for
// the multiplier. When max is not given, it is raised (never lowered) to
// the bigger of up * multiplier and down * multiplier.
//
// Always returns 1.
static inline int health_parse_delay(
        size_t line, const char *path, const char *file, char *string,
        int *delay_up_duration,
        int *delay_down_duration,
        int *delay_max_duration,
        float *delay_multiplier) {

    char given_up = 0;
    char given_down = 0;
    char given_max = 0;
    char given_multiplier = 0;

    // NOTE: isspace() is only defined for values representable as unsigned
    // char, so every character is cast before it is tested

    char *s = string;
    while(*s) {
        char *key = s;

        // terminate the keyword
        while(*s && !isspace((unsigned char)*s)) s++;
        while(*s && isspace((unsigned char)*s)) *s++ = '\0';

        if(!*key) break;

        // terminate the value
        char *value = s;
        while(*s && !isspace((unsigned char)*s)) s++;
        while(*s && isspace((unsigned char)*s)) *s++ = '\0';

        if(!strcasecmp(key, "up")) {
            if (!health_parse_duration(value, delay_up_duration)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_up = 1;
        }
        else if(!strcasecmp(key, "down")) {
            if (!health_parse_duration(value, delay_down_duration)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_down = 1;
        }
        else if(!strcasecmp(key, "multiplier")) {
            *delay_multiplier = strtof(value, NULL);
            if(isnan(*delay_multiplier) || isinf(*delay_multiplier) || islessequal(*delay_multiplier, 0)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_multiplier = 1;
        }
        else if(!strcasecmp(key, "max")) {
            if (!health_parse_duration(value, delay_max_duration)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_max = 1;
        }
        else {
            error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
                  line, path, file, key);
        }
    }

    // defaults - these also overwrite anything left behind by a value that
    // failed to parse
    if(!given_up)
        *delay_up_duration = 0;

    if(!given_down)
        *delay_down_duration = 0;

    if(!given_multiplier)
        *delay_multiplier = 1.0;

    if(!given_max) {
        // the products are floats, the casts only make the truncation visible
        if((*delay_max_duration) < (*delay_up_duration) * (*delay_multiplier))
            *delay_max_duration = (int)((*delay_up_duration) * (*delay_multiplier));

        if((*delay_max_duration) < (*delay_down_duration) * (*delay_multiplier))
            *delay_max_duration = (int)((*delay_down_duration) * (*delay_multiplier));
    }

    return 1;
}
1638
1639 static inline int health_parse_db_lookup(
1640         size_t line, const char *path, const char *file, char *string,
1641         int *group_method, int *after, int *before, int *every,
1642         uint32_t *options, char **dimensions
1643 ) {
1644     debug(D_HEALTH, "Health configuration parsing database lookup %zu@%s/%s: %s", line, path, file, string);
1645
1646     if(*dimensions) freez(*dimensions);
1647     *dimensions = NULL;
1648     *after = 0;
1649     *before = 0;
1650     *every = 0;
1651     *options = 0;
1652
1653     char *s = string, *key;
1654
1655     // first is the group method
1656     key = s;
1657     while(*s && !isspace(*s)) s++;
1658     while(*s && isspace(*s)) *s++ = '\0';
1659     if(!*s) {
1660         error("Health configuration invalid chart calculation at line %zu of file '%s/%s': expected group method followed by the 'after' time, but got '%s'",
1661               line, path, file, key);
1662         return 0;
1663     }
1664
1665     if((*group_method = web_client_api_request_v1_data_group(key, -1)) == -1) {
1666         error("Health configuration at line %zu of file '%s/%s': invalid group method '%s'",
1667               line, path, file, key);
1668         return 0;
1669     }
1670
1671     // then is the 'after' time
1672     key = s;
1673     while(*s && !isspace(*s)) s++;
1674     while(*s && isspace(*s)) *s++ = '\0';
1675
1676     if(!health_parse_duration(key, after)) {
1677         error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' after group method",
1678               line, path, file, key);
1679         return 0;
1680     }
1681
1682     // sane defaults
1683     *every = abs(*after);
1684
1685     // now we may have optional parameters
1686     while(*s) {
1687         key = s;
1688         while(*s && !isspace(*s)) s++;
1689         while(*s && isspace(*s)) *s++ = '\0';
1690         if(!*key) break;
1691
1692         if(!strcasecmp(key, "at")) {
1693             char *value = s;
1694             while(*s && !isspace(*s)) s++;
1695             while(*s && isspace(*s)) *s++ = '\0';
1696
1697             if (!health_parse_duration(value, before)) {
1698                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1699                       line, path, file, value, key);
1700             }
1701         }
1702         else if(!strcasecmp(key, HEALTH_EVERY_KEY)) {
1703             char *value = s;
1704             while(*s && !isspace(*s)) s++;
1705             while(*s && isspace(*s)) *s++ = '\0';
1706
1707             if (!health_parse_duration(value, every)) {
1708                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1709                       line, path, file, value, key);
1710             }
1711         }
1712         else if(!strcasecmp(key, "absolute") || !strcasecmp(key, "abs") || !strcasecmp(key, "absolute_sum")) {
1713             *options |= RRDR_OPTION_ABSOLUTE;
1714         }
1715         else if(!strcasecmp(key, "min2max")) {
1716             *options |= RRDR_OPTION_MIN2MAX;
1717         }
1718         else if(!strcasecmp(key, "null2zero")) {
1719             *options |= RRDR_OPTION_NULL2ZERO;
1720         }
1721         else if(!strcasecmp(key, "percentage")) {
1722             *options |= RRDR_OPTION_PERCENTAGE;
1723         }
1724         else if(!strcasecmp(key, "unaligned")) {
1725             *options |= RRDR_OPTION_NOT_ALIGNED;
1726         }
1727         else if(!strcasecmp(key, "of")) {
1728             if(*s && strcasecmp(s, "all"))
1729                *dimensions = strdupz(s);
1730             break;
1731         }
1732         else {
1733             error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
1734                   line, path, file, key);
1735         }
1736     }
1737
1738     return 1;
1739 }
1740
// Overwrite every TAB character of 's' with a space, in place.
// Returns 's' itself, so the call can be nested in an expression.
static inline char *tabs2spaces(char *s) {
    for(char *tab = strchr(s, '\t'); tab; tab = strchr(tab + 1, '\t'))
        *tab = ' ';

    return s;
}
1750
// Format the origin of a configuration entry as "<line>@<path>/<filename>"
// and return it as a copy made with strdupz().
static inline char *health_source_file(size_t line, const char *path, const char *filename) {
    char source[FILENAME_MAX + 1];

    snprintfz(source, FILENAME_MAX, "%zu@%s/%s", line, path, filename);

    return strdupz(source);
}
1756
// Blank out, in place, every single quote and double quote found in 's'
// (each one is replaced by a space; the length of the string is unchanged).
static inline void strip_quotes(char *s) {
    for( ; *s ; s++) {
        switch(*s) {
            case '\'':
            case '"':
                *s = ' ';
                break;

            default:
                break;
        }
    }
}
1763
1764 int health_readfile(const char *path, const char *filename) {
1765     debug(D_HEALTH, "Health configuration reading file '%s/%s'", path, filename);
1766
1767     static uint32_t hash_alarm = 0, hash_template = 0, hash_on = 0, hash_calc = 0, hash_green = 0, hash_red = 0, hash_warn = 0, hash_crit = 0, hash_exec = 0, hash_every = 0, hash_lookup = 0, hash_units = 0, hash_info = 0, hash_recipient = 0, hash_delay = 0;
1768     char buffer[HEALTH_CONF_MAX_LINE + 1];
1769
1770     if(unlikely(!hash_alarm)) {
1771         hash_alarm = simple_uhash(HEALTH_ALARM_KEY);
1772         hash_template = simple_uhash(HEALTH_TEMPLATE_KEY);
1773         hash_on = simple_uhash(HEALTH_ON_KEY);
1774         hash_calc = simple_uhash(HEALTH_CALC_KEY);
1775         hash_lookup = simple_uhash(HEALTH_LOOKUP_KEY);
1776         hash_green = simple_uhash(HEALTH_GREEN_KEY);
1777         hash_red = simple_uhash(HEALTH_RED_KEY);
1778         hash_warn = simple_uhash(HEALTH_WARN_KEY);
1779         hash_crit = simple_uhash(HEALTH_CRIT_KEY);
1780         hash_exec = simple_uhash(HEALTH_EXEC_KEY);
1781         hash_every = simple_uhash(HEALTH_EVERY_KEY);
1782         hash_units = simple_hash(HEALTH_UNITS_KEY);
1783         hash_info = simple_hash(HEALTH_INFO_KEY);
1784         hash_recipient = simple_hash(HEALTH_RECIPIENT_KEY);
1785         hash_delay = simple_uhash(HEALTH_DELAY_KEY);
1786     }
1787
1788     snprintfz(buffer, HEALTH_CONF_MAX_LINE, "%s/%s", path, filename);
1789     FILE *fp = fopen(buffer, "r");
1790     if(!fp) {
1791         error("Health configuration cannot read file '%s'.", buffer);
1792         return 0;
1793     }
1794
1795     RRDCALC *rc = NULL;
1796     RRDCALCTEMPLATE *rt = NULL;
1797
1798     size_t line = 0, append = 0;
1799     char *s;
1800     while((s = fgets(&buffer[append], (int)(HEALTH_CONF_MAX_LINE - append), fp)) || append) {
1801         int stop_appending = !s;
1802         line++;
1803         s = trim(buffer);
1804         if(!s) continue;
1805
1806         append = strlen(s);
1807         if(!stop_appending && s[append - 1] == '\\') {
1808             s[append - 1] = ' ';
1809             append = &s[append] - buffer;
1810             if(append < HEALTH_CONF_MAX_LINE)
1811                 continue;
1812             else {
1813                 error("Health configuration has too long muli-line at line %zu of file '%s/%s'.", line, path, filename);
1814             }
1815         }
1816         append = 0;
1817
1818         char *key = s;
1819         while(*s && *s != ':') s++;
1820         if(!*s) {
1821             error("Health configuration has invalid line %zu of file '%s/%s'. It does not contain a ':'. Ignoring it.", line, path, filename);
1822             continue;
1823         }
1824         *s = '\0';
1825         s++;
1826
1827         char *value = s;
1828         key = trim(key);
1829         value = trim(value);
1830
1831         if(!key) {
1832             error("Health configuration has invalid line %zu of file '%s/%s'. Keyword is empty. Ignoring it.", line, path, filename);
1833             continue;
1834         }
1835
1836         if(!value) {
1837             error("Health configuration has invalid line %zu of file '%s/%s'. value is empty. Ignoring it.", line, path, filename);
1838             continue;
1839         }
1840
1841         uint32_t hash = simple_uhash(key);
1842
1843         if(hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) {
1844             if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
1845                 rrdcalc_free(&localhost, rc);
1846
1847             if(rt) {
1848                 if (!rrdcalctemplate_add_template_from_config(&localhost, rt))
1849                     rrdcalctemplate_free(&localhost, rt);
1850                 rt = NULL;
1851             }
1852
1853             rc = callocz(1, sizeof(RRDCALC));
1854             rc->next_event_id = 1;
1855             rc->name = tabs2spaces(strdupz(value));
1856             rc->hash = simple_hash(rc->name);
1857             rc->source = health_source_file(line, path, filename);
1858             rc->green = NAN;
1859             rc->red = NAN;
1860             rc->value = NAN;
1861             rc->old_value = NAN;
1862             rc->delay_multiplier = 1.0;
1863
1864             if(rrdvar_fix_name(rc->name))
1865                 error("Health configuration renamed alarm '%s' to '%s'", value, rc->name);
1866         }
1867         else if(hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY)) {
1868             if(rc) {
1869                 if(!rrdcalc_add_alarm_from_config(&localhost, rc))
1870                     rrdcalc_free(&localhost, rc);
1871                 rc = NULL;
1872             }
1873
1874             if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
1875                 rrdcalctemplate_free(&localhost, rt);
1876
1877             rt = callocz(1, sizeof(RRDCALCTEMPLATE));
1878             rt->name = tabs2spaces(strdupz(value));
1879             rt->hash_name = simple_hash(rt->name);
1880             rt->source = health_source_file(line, path, filename);
1881             rt->green = NAN;
1882             rt->red = NAN;
1883             rt->delay_multiplier = 1.0;
1884
1885             if(rrdvar_fix_name(rt->name))
1886                 error("Health configuration renamed template '%s' to '%s'", value, rt->name);
1887         }
1888         else if(rc) {
1889             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
1890                 if(rc->chart) {
1891                     if(strcmp(rc->chart, value))
1892                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1893                              line, path, filename, rc->name, key, rc->chart, value, value);
1894
1895                     freez(rc->chart);
1896                 }
1897                 rc->chart = tabs2spaces(strdupz(value));
1898                 rc->hash_chart = simple_hash(rc->chart);
1899             }
1900             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
1901                 health_parse_db_lookup(line, path, filename, value, &rc->group, &rc->after, &rc->before,
1902                                        &rc->update_every,
1903                                        &rc->options, &rc->dimensions);
1904             }
1905             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
1906                 if(!health_parse_duration(value, &rc->update_every))
1907                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.",
1908                          line, path, filename, rc->name, key, value);
1909             }
1910             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
1911                 char *e;
1912                 rc->green = strtold(value, &e);
1913                 if(e && *e) {
1914                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1915                          line, path, filename, rc->name, key, e);
1916                 }
1917             }
1918             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
1919                 char *e;
1920                 rc->red = strtold(value, &e);
1921                 if(e && *e) {
1922                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1923                          line, path, filename, rc->name, key, e);
1924                 }
1925             }
1926             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
1927                 const char *failed_at = NULL;
1928                 int error = 0;
1929                 rc->calculation = expression_parse(value, &failed_at, &error);
1930                 if(!rc->calculation) {
1931                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1932                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1933                 }
1934             }
1935             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
1936                 const char *failed_at = NULL;
1937                 int error = 0;
1938                 rc->warning = expression_parse(value, &failed_at, &error);
1939                 if(!rc->warning) {
1940                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1941                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1942                 }
1943             }
1944             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
1945                 const char *failed_at = NULL;
1946                 int error = 0;
1947                 rc->critical = expression_parse(value, &failed_at, &error);
1948                 if(!rc->critical) {
1949                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1950                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1951                 }
1952             }
1953             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
1954                 if(rc->exec) {
1955                     if(strcmp(rc->exec, value))
1956                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1957                              line, path, filename, rc->name, key, rc->exec, value, value);
1958
1959                     freez(rc->exec);
1960                 }
1961                 rc->exec = tabs2spaces(strdupz(value));
1962             }
1963             else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
1964                 if(rc->recipient) {
1965                     if(strcmp(rc->recipient, value))
1966                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1967                              line, path, filename, rc->name, key, rc->recipient, value, value);
1968
1969                     freez(rc->recipient);
1970                 }
1971                 rc->recipient = tabs2spaces(strdupz(value));
1972             }
1973             else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
1974                 if(rc->units) {
1975                     if(strcmp(rc->units, value))
1976                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1977                              line, path, filename, rc->name, key, rc->units, value, value);
1978
1979                     freez(rc->units);
1980                 }
1981                 rc->units = tabs2spaces(strdupz(value));
1982                 strip_quotes(rc->units);
1983             }
1984             else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
1985                 if(rc->info) {
1986                     if(strcmp(rc->info, value))
1987                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1988                              line, path, filename, rc->name, key, rc->info, value, value);
1989
1990                     freez(rc->info);
1991                 }
1992                 rc->info = tabs2spaces(strdupz(value));
1993                 strip_quotes(rc->info);
1994             }
1995             else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
1996                 health_parse_delay(line, path, filename, value, &rc->delay_up_duration, &rc->delay_down_duration, &rc->delay_max_duration, &rc->delay_multiplier);
1997             }
1998             else {
1999                 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has unknown key '%s'.",
2000                      line, path, filename, rc->name, key);
2001             }
2002         }
2003         else if(rt) {
2004             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
2005                 if(rt->context) {
2006                     if(strcmp(rt->context, value))
2007                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2008                              line, path, filename, rt->name, key, rt->context, value, value);
2009
2010                     freez(rt->context);
2011                 }
2012                 rt->context = tabs2spaces(strdupz(value));
2013                 rt->hash_context = simple_hash(rt->context);
2014             }
2015             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
2016                 health_parse_db_lookup(line, path, filename, value, &rt->group, &rt->after, &rt->before,
2017                                        &rt->update_every,
2018                                        &rt->options, &rt->dimensions);
2019             }
2020             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
2021                 if(!health_parse_duration(value, &rt->update_every))
2022                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' cannot parse duration: '%s'.",
2023                          line, path, filename, rt->name, key, value);
2024             }
2025             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
2026                 char *e;
2027                 rt->green = strtold(value, &e);
2028                 if(e && *e) {
2029                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
2030                          line, path, filename, rt->name, key, e);
2031                 }
2032             }
2033             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
2034                 char *e;
2035                 rt->red = strtold(value, &e);
2036                 if(e && *e) {
2037                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
2038                          line, path, filename, rt->name, key, e);
2039                 }
2040             }
2041             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
2042                 const char *failed_at = NULL;
2043                 int error = 0;
2044                 rt->calculation = expression_parse(value, &failed_at, &error);
2045                 if(!rt->calculation) {
2046                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
2047                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
2048                 }
2049             }
2050             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
2051                 const char *failed_at = NULL;
2052                 int error = 0;
2053                 rt->warning = expression_parse(value, &failed_at, &error);
2054                 if(!rt->warning) {
2055                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
2056                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
2057                 }
2058             }
2059             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
2060                 const char *failed_at = NULL;
2061                 int error = 0;
2062                 rt->critical = expression_parse(value, &failed_at, &error);
2063                 if(!rt->critical) {
2064                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
2065                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
2066                 }
2067             }
2068             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
2069                 if(rt->exec) {
2070                     if(strcmp(rt->exec, value))
2071                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2072                              line, path, filename, rt->name, key, rt->exec, value, value);
2073
2074                     freez(rt->exec);
2075                 }
2076                 rt->exec = tabs2spaces(strdupz(value));
2077             }
2078             else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
2079                 if(rt->recipient) {
2080                     if(strcmp(rt->recipient, value))
2081                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2082                              line, path, filename, rt->name, key, rt->recipient, value, value);
2083
2084                     freez(rt->recipient);
2085                 }
2086                 rt->recipient = tabs2spaces(strdupz(value));
2087             }
2088             else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
2089                 if(rt->units) {
2090                     if(strcmp(rt->units, value))
2091                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2092                              line, path, filename, rt->name, key, rt->units, value, value);
2093
2094                     freez(rt->units);
2095                 }
2096                 rt->units = tabs2spaces(strdupz(value));
2097                 strip_quotes(rt->units);
2098             }
2099             else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
2100                 if(rt->info) {
2101                     if(strcmp(rt->info, value))
2102                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2103                              line, path, filename, rt->name, key, rt->info, value, value);
2104
2105                     freez(rt->info);
2106                 }
2107                 rt->info = tabs2spaces(strdupz(value));
2108                 strip_quotes(rt->info);
2109             }
2110             else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
2111                 health_parse_delay(line, path, filename, value, &rt->delay_up_duration, &rt->delay_down_duration, &rt->delay_max_duration, &rt->delay_multiplier);
2112             }
2113             else {
2114                 error("Health configuration at line %zu of file '%s/%s' for template '%s' has unknown key '%s'.",
2115                       line, path, filename, rt->name, key);
2116             }
2117         }
2118         else {
2119             error("Health configuration at line %zu of file '%s/%s' has unknown key '%s'. Expected either '" HEALTH_ALARM_KEY "' or '" HEALTH_TEMPLATE_KEY "'.",
2120                   line, path, filename, key);
2121         }
2122     }
2123
2124     if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
2125         rrdcalc_free(&localhost, rc);
2126
2127     if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
2128         rrdcalctemplate_free(&localhost, rt);
2129
2130     fclose(fp);
2131     return 1;
2132 }
2133
2134 void health_readdir(const char *path) {
2135     size_t pathlen = strlen(path);
2136
2137     debug(D_HEALTH, "Health configuration reading directory '%s'", path);
2138
2139     DIR *dir = opendir(path);
2140     if (!dir) {
2141         error("Health configuration cannot open directory '%s'.", path);
2142         return;
2143     }
2144
2145     struct dirent *de = NULL;
2146     while ((de = readdir(dir))) {
2147         size_t len = strlen(de->d_name);
2148
2149         if(de->d_type == DT_DIR
2150            && (
2151                    (de->d_name[0] == '.' && de->d_name[1] == '\0')
2152                    || (de->d_name[0] == '.' && de->d_name[1] == '.' && de->d_name[2] == '\0')
2153            )) {
2154             debug(D_HEALTH, "Ignoring directory '%s'", de->d_name);
2155             continue;
2156         }
2157
2158         else if(de->d_type == DT_DIR) {
2159             char *s = mallocz(pathlen + strlen(de->d_name) + 2);
2160             strcpy(s, path);
2161             strcat(s, "/");
2162             strcat(s, de->d_name);
2163             health_readdir(s);
2164             freez(s);
2165             continue;
2166         }
2167
2168         else if((de->d_type == DT_LNK || de->d_type == DT_REG || de->d_type == DT_UNKNOWN) &&
2169                 len > 5 && !strcmp(&de->d_name[len - 5], ".conf")) {
2170             health_readfile(path, de->d_name);
2171         }
2172
2173         else debug(D_HEALTH, "Ignoring file '%s'", de->d_name);
2174     }
2175
2176     closedir(dir);
2177 }
2178
2179 static inline char *health_config_dir(void) {
2180     char buffer[FILENAME_MAX + 1];
2181     snprintfz(buffer, FILENAME_MAX, "%s/health.d", config_get("global", "config directory", CONFIG_DIR));
2182     return config_get("health", "health configuration directory", buffer);
2183 }
2184
2185 void health_init(void) {
2186     debug(D_HEALTH, "Health configuration initializing");
2187
2188     if(!(health_enabled = config_get_boolean("health", "enabled", 1))) {
2189         debug(D_HEALTH, "Health is disabled.");
2190         return;
2191     }
2192
2193     char *pathname = config_get("health", "health db directory", VARLIB_DIR "/health");
2194     if(mkdir(pathname, 0770) == -1 && errno != EEXIST)
2195         fatal("Cannot create directory '%s'.", pathname);
2196
2197     char filename[FILENAME_MAX + 1];
2198     snprintfz(filename, FILENAME_MAX, "%s/health-log.db", pathname);
2199     health.log_filename = config_get("health", "health db file", filename);
2200
2201     health_alarm_log_load(&localhost);
2202     health_alarm_log_open();
2203
2204     char *path = health_config_dir();
2205
2206     {
2207         char buffer[FILENAME_MAX + 1];
2208         snprintfz(buffer, FILENAME_MAX, "%s/alarm-notify.sh", config_get("global", "plugins directory", PLUGINS_DIR));
2209         health.health_default_exec = config_get("health", "script to execute on alarm", buffer);
2210     }
2211
2212     long n = config_get_number("health", "in memory max health log entries", (long)localhost.health_log.max);
2213     if(n < 10) {
2214         error("Health configuration has invalid max log entries %ld. Using default %u", n, localhost.health_log.max);
2215         config_set_number("health", "in memory max health log entries", (long)localhost.health_log.max);
2216     }
2217     else localhost.health_log.max = (unsigned int)n;
2218
2219     rrdhost_rwlock(&localhost);
2220     health_readdir(path);
2221     rrdhost_unlock(&localhost);
2222 }
2223
2224 // ----------------------------------------------------------------------------
2225 // JSON generation
2226
2227 static inline void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) {
2228     if(value && *value)
2229         buffer_sprintf(wb, "%s\"%s\":\"%s\"%s", prefix, label, value, suffix);
2230     else
2231         buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix);
2232 }
2233
2234 static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) {
2235     buffer_sprintf(wb, "\n\t{\n"
2236                            "\t\t\"hostname\": \"%s\",\n"
2237                            "\t\t\"unique_id\": %u,\n"
2238                            "\t\t\"alarm_id\": %u,\n"
2239                            "\t\t\"alarm_event_id\": %u,\n"
2240                            "\t\t\"name\": \"%s\",\n"
2241                            "\t\t\"chart\": \"%s\",\n"
2242                            "\t\t\"family\": \"%s\",\n"
2243                            "\t\t\"processed\": %s,\n"
2244                            "\t\t\"updated\": %s,\n"
2245                            "\t\t\"exec_run\": %lu,\n"
2246                            "\t\t\"exec_failed\": %s,\n"
2247                            "\t\t\"exec\": \"%s\",\n"
2248                            "\t\t\"recipient\": \"%s\",\n"
2249                            "\t\t\"exec_code\": %d,\n"
2250                            "\t\t\"source\": \"%s\",\n"
2251                            "\t\t\"units\": \"%s\",\n"
2252                            "\t\t\"info\": \"%s\",\n"
2253                            "\t\t\"when\": %lu,\n"
2254                            "\t\t\"duration\": %lu,\n"
2255                            "\t\t\"non_clear_duration\": %lu,\n"
2256                            "\t\t\"status\": \"%s\",\n"
2257                            "\t\t\"old_status\": \"%s\",\n"
2258                            "\t\t\"delay\": %d,\n"
2259                            "\t\t\"delay_up_to_timestamp\": %lu,\n"
2260                            "\t\t\"updated_by_id\": %u,\n"
2261                            "\t\t\"updates_id\": %u,\n",
2262                    host->hostname,
2263                    ae->unique_id,
2264                    ae->alarm_id,
2265                    ae->alarm_event_id,
2266                    ae->name,
2267                    ae->chart,
2268                    ae->family,
2269                    (ae->flags & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false",
2270                    (ae->flags & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false",
2271                    (unsigned long)ae->exec_run_timestamp,
2272                    (ae->flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false",
2273                    ae->exec?ae->exec:health.health_default_exec,
2274                    ae->recipient?ae->recipient:health.health_default_recipient,
2275                    ae->exec_code,
2276                    ae->source,
2277                    ae->units?ae->units:"",
2278                    ae->info?ae->info:"",
2279                    (unsigned long)ae->when,
2280                    (unsigned long)ae->duration,
2281                    (unsigned long)ae->non_clear_duration,
2282                    rrdcalc_status2string(ae->new_status),
2283                    rrdcalc_status2string(ae->old_status),
2284                    ae->delay,
2285                    (unsigned long)ae->delay_up_to_timestamp,
2286                    ae->updated_by_id,
2287                    ae->updates_id
2288     );
2289
2290     buffer_strcat(wb, "\t\t\"value\":");
2291     buffer_rrd_value(wb, ae->new_value);
2292     buffer_strcat(wb, ",\n");
2293
2294     buffer_strcat(wb, "\t\t\"old_value\":");
2295     buffer_rrd_value(wb, ae->old_value);
2296     buffer_strcat(wb, "\n");
2297
2298     buffer_strcat(wb, "\t}");
2299 }
2300
2301 void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after) {
2302     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
2303
2304     buffer_strcat(wb, "[");
2305
2306     unsigned int max = host->health_log.max;
2307     unsigned int count = 0;
2308     ALARM_ENTRY *ae;
2309     for(ae = host->health_log.alarms; ae && count < max ; count++, ae = ae->next) {
2310         if(ae->unique_id > after) {
2311             if(likely(count)) buffer_strcat(wb, ",");
2312             health_alarm_entry2json_nolock(wb, ae, host);
2313         }
2314     }
2315
2316     buffer_strcat(wb, "\n]\n");
2317
2318     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2319 }
2320
2321 static inline void health_rrdcalc2json_nolock(BUFFER *wb, RRDCALC *rc) {
2322     buffer_sprintf(wb,
2323            "\t\t\"%s.%s\": {\n"
2324                    "\t\t\t\"id\": %lu,\n"
2325                    "\t\t\t\"name\": \"%s\",\n"
2326                    "\t\t\t\"chart\": \"%s\",\n"
2327                    "\t\t\t\"family\": \"%s\",\n"
2328                    "\t\t\t\"active\": %s,\n"
2329                    "\t\t\t\"exec\": \"%s\",\n"
2330                    "\t\t\t\"recipient\": \"%s\",\n"
2331                    "\t\t\t\"source\": \"%s\",\n"
2332                    "\t\t\t\"units\": \"%s\",\n"
2333                    "\t\t\t\"info\": \"%s\",\n"
2334                                    "\t\t\t\"status\": \"%s\",\n"
2335                    "\t\t\t\"last_status_change\": %lu,\n"
2336                    "\t\t\t\"last_updated\": %lu,\n"
2337                    "\t\t\t\"next_update\": %lu,\n"
2338                    "\t\t\t\"update_every\": %d,\n"
2339                    "\t\t\t\"delay_up_duration\": %d,\n"
2340                    "\t\t\t\"delay_down_duration\": %d,\n"
2341                    "\t\t\t\"delay_max_duration\": %d,\n"
2342                    "\t\t\t\"delay_multiplier\": %f,\n"
2343                    "\t\t\t\"delay\": %d,\n"
2344                    "\t\t\t\"delay_up_to_timestamp\": %lu,\n"
2345             , rc->chart, rc->name
2346             , (unsigned long)rc->id
2347             , rc->name
2348             , rc->chart
2349             , (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:""
2350             , (rc->rrdset)?"true":"false"
2351             , rc->exec?rc->exec:health.health_default_exec
2352             , rc->recipient?rc->recipient:health.health_default_recipient
2353             , rc->source
2354             , rc->units?rc->units:""
2355             , rc->info?rc->info:""
2356             , rrdcalc_status2string(rc->status)
2357             , (unsigned long)rc->last_status_change
2358             , (unsigned long)rc->last_updated
2359             , (unsigned long)rc->next_update
2360             , rc->update_every
2361             , rc->delay_up_duration
2362             , rc->delay_down_duration
2363             , rc->delay_max_duration
2364             , rc->delay_multiplier
2365             , rc->delay_last
2366             , (unsigned long)rc->delay_up_to_timestamp
2367     );
2368
2369     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
2370         if(rc->dimensions && *rc->dimensions)
2371             health_string2json(wb, "\t\t\t", "lookup_dimensions", rc->dimensions, ",\n");
2372
2373         buffer_sprintf(wb,
2374                        "\t\t\t\"db_after\": %lu,\n"
2375                        "\t\t\t\"db_before\": %lu,\n"
2376                        "\t\t\t\"lookup_method\": \"%s\",\n"
2377                        "\t\t\t\"lookup_after\": %d,\n"
2378                        "\t\t\t\"lookup_before\": %d,\n"
2379                        "\t\t\t\"lookup_options\": \"",
2380                        (unsigned long) rc->db_after,
2381                        (unsigned long) rc->db_before,
2382                        group_method2string(rc->group),
2383                        rc->after,
2384                        rc->before
2385         );
2386         buffer_data_options2string(wb, rc->options);
2387         buffer_strcat(wb, "\",\n");
2388     }
2389
2390     if(rc->calculation) {
2391         health_string2json(wb, "\t\t\t", "calc", rc->calculation->source, ",\n");
2392         health_string2json(wb, "\t\t\t", "calc_parsed", rc->calculation->parsed_as, ",\n");
2393     }
2394
2395     if(rc->warning) {
2396         health_string2json(wb, "\t\t\t", "warn", rc->warning->source, ",\n");
2397         health_string2json(wb, "\t\t\t", "warn_parsed", rc->warning->parsed_as, ",\n");
2398     }
2399
2400     if(rc->critical) {
2401         health_string2json(wb, "\t\t\t", "crit", rc->critical->source, ",\n");
2402         health_string2json(wb, "\t\t\t", "crit_parsed", rc->critical->parsed_as, ",\n");
2403     }
2404
2405     buffer_strcat(wb, "\t\t\t\"green\":");
2406     buffer_rrd_value(wb, rc->green);
2407     buffer_strcat(wb, ",\n");
2408
2409     buffer_strcat(wb, "\t\t\t\"red\":");
2410     buffer_rrd_value(wb, rc->red);
2411     buffer_strcat(wb, ",\n");
2412
2413     buffer_strcat(wb, "\t\t\t\"value\":");
2414     buffer_rrd_value(wb, rc->value);
2415     buffer_strcat(wb, "\n");
2416
2417     buffer_strcat(wb, "\t\t}");
2418 }
2419
2420 //void health_rrdcalctemplate2json_nolock(BUFFER *wb, RRDCALCTEMPLATE *rt) {
2421 //
2422 //}
2423
2424 void health_alarms2json(RRDHOST *host, BUFFER *wb, int all) {
2425     int i;
2426
2427     rrdhost_rdlock(&localhost);
2428     buffer_sprintf(wb, "{\n\t\"hostname\": \"%s\","
2429                         "\n\t\"latest_alarm_log_unique_id\": %u,"
2430                         "\n\t\"status\": %s,"
2431                         "\n\t\"now\": %lu,"
2432                         "\n\t\"alarms\": {\n",
2433                         host->hostname,
2434                         (host->health_log.next_log_id > 0)?(host->health_log.next_log_id - 1):0,
2435                         health_enabled?"true":"false",
2436                         (unsigned long)time(NULL));
2437
2438     RRDCALC *rc;
2439     for(i = 0, rc = host->alarms; rc ; rc = rc->next) {
2440         if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
2441             continue;
2442
2443         if(likely(!all && !(rc->status == RRDCALC_STATUS_WARNING || rc->status == RRDCALC_STATUS_CRITICAL)))
2444             continue;
2445
2446         if(likely(i)) buffer_strcat(wb, ",\n");
2447         health_rrdcalc2json_nolock(wb, rc);
2448         i++;
2449     }
2450
2451 //    buffer_strcat(wb, "\n\t},\n\t\"templates\": {");
2452 //    RRDCALCTEMPLATE *rt;
2453 //    for(rt = host->templates; rt ; rt = rt->next)
2454 //        health_rrdcalctemplate2json_nolock(wb, rt);
2455
2456     buffer_strcat(wb, "\n\t}\n}\n");
2457     rrdhost_unlock(&localhost);
2458 }
2459
2460
2461 // ----------------------------------------------------------------------------
2462 // re-load health configuration
2463
2464 static inline void health_free_all_nolock(RRDHOST *host) {
2465     while(host->templates)
2466         rrdcalctemplate_free(host, host->templates);
2467
2468     while(host->alarms)
2469         rrdcalc_free(host, host->alarms);
2470 }
2471
2472 void health_reload(void) {
2473     if(!health_enabled) {
2474         error("Health reload is requested, but health is not enabled.");
2475         return;
2476     }
2477
2478     char *path = health_config_dir();
2479
2480     // free all running alarms
2481     rrdhost_rwlock(&localhost);
2482     health_free_all_nolock(&localhost);
2483     rrdhost_unlock(&localhost);
2484
2485     // invalidate all previous entries in the alarm log
2486     ALARM_ENTRY *t;
2487     for(t = localhost.health_log.alarms ; t ; t = t->next) {
2488         if(t->new_status != RRDCALC_STATUS_REMOVED)
2489             t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
2490     }
2491
2492     // reset all thresholds to all charts
2493     RRDSET *st;
2494     for(st = localhost.rrdset_root; st ; st = st->next) {
2495         st->green = NAN;
2496         st->red = NAN;
2497     }
2498
2499     // load the new alarms
2500     rrdhost_rwlock(&localhost);
2501     health_readdir(path);
2502     rrdhost_unlock(&localhost);
2503
2504     // link the loaded alarms to their charts
2505     for(st = localhost.rrdset_root; st ; st = st->next) {
2506         rrdhost_rwlock(&localhost);
2507
2508         rrdsetcalc_link_matching(st);
2509         rrdcalctemplate_link_matching(st);
2510
2511         rrdhost_unlock(&localhost);
2512     }
2513 }
2514
2515 // ----------------------------------------------------------------------------
2516 // health main thread and friends
2517
2518 static inline int rrdcalc_value2status(calculated_number n) {
2519     if(isnan(n)) return RRDCALC_STATUS_UNDEFINED;
2520     if(n) return RRDCALC_STATUS_RAISED;
2521     return RRDCALC_STATUS_CLEAR;
2522 }
2523
2524 static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
2525     ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
2526
2527     if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
2528         // do not send notifications for internal statuses
2529         goto done;
2530     }
2531
2532     // find the previous notification for the same alarm
2533     // which we have run the exec script
2534     ALARM_ENTRY *t;
2535     for(t = ae->next; t ;t = t->next) {
2536         if(t->alarm_id == ae->alarm_id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
2537             break;
2538     }
2539
2540     if(likely(t)) {
2541         // we have executed this alarm notification in the past
2542         if (t && t->new_status == ae->new_status) {
2543             // don't send the same notification again
2544             debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name,
2545                  rrdcalc_status2string(ae->new_status));
2546             goto done;
2547         }
2548     }
2549     else {
2550         // we have not executed this alarm notification in the past
2551         if(unlikely(ae->old_status == RRDCALC_STATUS_UNINITIALIZED && ae->new_status == RRDCALC_STATUS_CLEAR)) {
2552             debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
2553             goto done;
2554         }
2555     }
2556
2557     char buffer[FILENAME_MAX + 1];
2558     pid_t command_pid;
2559
2560     const char *exec = ae->exec;
2561     if(!exec) exec = health.health_default_exec;
2562
2563     const char *recipient = ae->recipient;
2564     if(!recipient) recipient = health.health_default_recipient;
2565
2566     snprintfz(buffer, FILENAME_MAX, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s'",
2567               exec,
2568               recipient,
2569               host->hostname,
2570               ae->unique_id,
2571               ae->alarm_id,
2572               ae->alarm_event_id,
2573               (unsigned long)ae->when,
2574               ae->name,
2575               ae->chart?ae->chart:"NOCAHRT",
2576               ae->family?ae->family:"NOFAMILY",
2577               rrdcalc_status2string(ae->new_status),
2578               rrdcalc_status2string(ae->old_status),
2579               ae->new_value,
2580               ae->old_value,
2581               ae->source?ae->source:"UNKNOWN",
2582               (uint32_t)ae->duration,
2583               (uint32_t)ae->non_clear_duration,
2584               ae->units?ae->units:"",
2585               ae->info?ae->info:""
2586     );
2587
2588     ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
2589     ae->exec_run_timestamp = time(NULL);
2590
2591     debug(D_HEALTH, "executing command '%s'", buffer);
2592     FILE *fp = mypopen(buffer, &command_pid);
2593     if(!fp) {
2594         error("HEALTH: Cannot popen(\"%s\", \"r\").", buffer);
2595         goto done;
2596     }
2597     debug(D_HEALTH, "HEALTH reading from command");
2598     char *s = fgets(buffer, FILENAME_MAX, fp);
2599     (void)s;
2600     ae->exec_code = mypclose(fp, command_pid);
2601     debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
2602
2603     if(ae->exec_code != 0)
2604         ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
2605
2606 done:
2607     health_alarm_log_save(host, ae);
2608     return;
2609 }
2610
2611 static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
2612     debug(D_HEALTH, "Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
2613          ae->chart?ae->chart:"NOCHART", ae->name,
2614          ae->new_value,
2615          rrdcalc_status2string(ae->old_status),
2616          rrdcalc_status2string(ae->new_status)
2617     );
2618
2619     health_alarm_execute(host, ae);
2620 }
2621
2622 static inline void health_alarm_log_process(RRDHOST *host) {
2623     static uint32_t stop_at_id = 0;
2624     uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
2625     time_t now = time(NULL);
2626
2627     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
2628
2629     ALARM_ENTRY *ae;
2630     for(ae = host->health_log.alarms; ae && ae->unique_id >= stop_at_id ; ae = ae->next) {
2631         if(unlikely(
2632             !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
2633             !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
2634             )) {
2635
2636             if(unlikely(ae->unique_id < first_waiting))
2637                 first_waiting = ae->unique_id;
2638
2639             if(likely(now >= ae->delay_up_to_timestamp))
2640                 health_process_notifications(host, ae);
2641         }
2642     }
2643
2644     // remember this for the next iteration
2645     stop_at_id = first_waiting;
2646
2647     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2648
2649     if(host->health_log.count <= host->health_log.max)
2650         return;
2651
2652     // cleanup excess entries in the log
2653     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
2654
2655     ALARM_ENTRY *last = NULL;
2656     unsigned int count = host->health_log.max * 2 / 3;
2657     for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
2658
2659     if(ae && last && last->next == ae)
2660         last->next = NULL;
2661     else
2662         ae = NULL;
2663
2664     while(ae) {
2665         debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
2666
2667         ALARM_ENTRY *t = ae->next;
2668
2669         freez(ae->name);
2670         freez(ae->chart);
2671         freez(ae->family);
2672         freez(ae->exec);
2673         freez(ae->recipient);
2674         freez(ae->source);
2675         freez(ae->units);
2676         freez(ae->info);
2677         freez(ae);
2678
2679         ae = t;
2680         host->health_log.count--;
2681     }
2682
2683     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2684 }
2685
2686 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
2687     if(unlikely(!rc->rrdset)) {
2688         debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
2689         return 0;
2690     }
2691
2692     if(unlikely(rc->next_update > now)) {
2693         if (unlikely(*next_run > rc->next_update)) {
2694             // update the next_run time of the main loop
2695             // to run this alarm precisely the time required
2696             *next_run = rc->next_update;
2697         }
2698
2699         debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
2700         return 0;
2701     }
2702
2703     if(unlikely(!rc->update_every)) {
2704         debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
2705         return 0;
2706     }
2707
2708     if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
2709         debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
2710         return 0;
2711     }
2712
2713     int update_every = rc->rrdset->update_every;
2714     time_t first = rrdset_first_entry_t(rc->rrdset);
2715     time_t last = rrdset_last_entry_t(rc->rrdset);
2716
2717     if(unlikely(now + update_every < first /* || now - update_every > last */)) {
2718         debug(D_HEALTH
2719               , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
2720               , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first
2721               , (unsigned long) last);
2722         return 0;
2723     }
2724
2725     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
2726         time_t needed = now + rc->before + rc->after;
2727
2728         if(needed + update_every < first || needed - update_every > last) {
2729             debug(D_HEALTH
2730                   , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
2731                   , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first
2732                   , (unsigned long) last);
2733             return 0;
2734         }
2735     }
2736
2737     return 1;
2738 }
2739
2740 void *health_main(void *ptr) {
2741     (void)ptr;
2742
2743     info("HEALTH thread created with task id %d", gettid());
2744
2745     if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
2746         error("Cannot set pthread cancel type to DEFERRED.");
2747
2748     if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
2749         error("Cannot set pthread cancel state to ENABLE.");
2750
2751     int min_run_every = (int)config_get_number("health", "run at least every seconds", 10);
2752     if(min_run_every < 1) min_run_every = 1;
2753
2754     BUFFER *wb = buffer_create(100);
2755
2756     unsigned int loop = 0;
2757     while(health_enabled && !netdata_exit) {
2758         loop++;
2759         debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
2760
2761         int oldstate, runnable = 0;
2762         time_t now = time(NULL);
2763         time_t next_run = now + min_run_every;
2764         RRDCALC *rc;
2765
2766         if(unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
2767             error("Cannot set pthread cancel state to DISABLE.");
2768
2769         rrdhost_rdlock(&localhost);
2770
2771         // the first loop is to lookup values from the db
2772         for(rc = localhost.alarms; rc; rc = rc->next) {
2773             if(unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
2774                 if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))
2775                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE;
2776                 continue;
2777             }
2778
2779             runnable++;
2780             rc->old_value = rc->value;
2781             rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE;
2782
2783             // 1. if there is database lookup, do it
2784             // 2. if there is calculation expression, run it
2785
2786             if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
2787                 /* time_t old_db_timestamp = rc->db_before; */
2788                 int value_is_null = 0;
2789
2790                 int ret = rrd2value(rc->rrdset, wb, &rc->value,
2791                                     rc->dimensions, 1, rc->after, rc->before, rc->group,
2792                                     rc->options, &rc->db_after, &rc->db_before, &value_is_null);
2793
2794                 if (unlikely(ret != 200)) {
2795                     // database lookup failed
2796                     rc->value = NAN;
2797
2798                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
2799
2800                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) {
2801                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
2802                         error("Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
2803                     }
2804                 }
2805                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))
2806                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
2807
2808                 /* - RRDCALC_FLAG_DB_STALE not currently used
2809                 if (unlikely(old_db_timestamp == rc->db_before)) {
2810                     // database is stale
2811
2812                     debug(D_HEALTH, "Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
2813
2814                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
2815                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
2816                         error("Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
2817                     }
2818                 }
2819                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
2820                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
2821                 */
2822
2823                 if (unlikely(value_is_null)) {
2824                     // collected value is null
2825
2826                     rc->value = NAN;
2827
2828                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
2829                           rc->chart?rc->chart:"NOCHART", rc->name);
2830
2831                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) {
2832                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
2833                         error("Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
2834                               rc->chart?rc->chart:"NOCHART", rc->name);
2835                     }
2836                 }
2837                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))
2838                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
2839
2840                 debug(D_HEALTH, "Health alarm '%s.%s': database lookup gave value "
2841                         CALCULATED_NUMBER_FORMAT, rc->chart?rc->chart:"NOCHART", rc->name, rc->value);
2842             }
2843
2844             if(unlikely(rc->calculation)) {
2845                 if (unlikely(!expression_evaluate(rc->calculation))) {
2846                     // calculation failed
2847
2848                     rc->value = NAN;
2849
2850                     debug(D_HEALTH, "Health alarm '%s.%s': expression '%s' failed: %s",
2851                           rc->chart?rc->chart:"NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
2852
2853                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) {
2854                         rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
2855                         error("Health alarm '%s.%s': expression '%s' failed: %s",
2856                               rc->chart?rc->chart:"NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
2857                     }
2858                 }
2859                 else {
2860                     if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))
2861                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
2862
2863                     debug(D_HEALTH, "Health alarm '%s.%s': expression '%s' gave value "
2864                             CALCULATED_NUMBER_FORMAT
2865                             ": %s (source: %s)",
2866                           rc->chart?rc->chart:"NOCHART", rc->name,
2867                           rc->calculation->parsed_as,
2868                           rc->calculation->result,
2869                           buffer_tostring(rc->calculation->error_msg),
2870                           rc->source
2871                     );
2872
2873                     rc->value = rc->calculation->result;
2874                 }
2875             }
2876         }
2877         rrdhost_unlock(&localhost);
2878
2879         if(unlikely(runnable && !netdata_exit)) {
2880             rrdhost_rdlock(&localhost);
2881
2882             for(rc = localhost.alarms; rc; rc = rc->next) {
2883                 if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
2884                     continue;
2885
2886                 int warning_status  = RRDCALC_STATUS_UNDEFINED;
2887                 int critical_status = RRDCALC_STATUS_UNDEFINED;
2888
2889                 if(likely(rc->warning)) {
2890                     if(unlikely(!expression_evaluate(rc->warning))) {
2891                         // calculation failed
2892
2893                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression failed with error: %s",
2894                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
2895
2896                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) {
2897                             rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
2898                             error("Health alarm '%s.%s': warning expression failed with error: %s",
2899                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
2900                         }
2901                     }
2902                     else {
2903                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))
2904                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
2905
2906                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression gave value "
2907                                 CALCULATED_NUMBER_FORMAT
2908                                 ": %s (source: %s)",
2909                               rc->chart?rc->chart:"NOCHART", rc->name,
2910                               rc->warning->result,
2911                               buffer_tostring(rc->warning->error_msg),
2912                               rc->source
2913                         );
2914
2915                         warning_status = rrdcalc_value2status(rc->warning->result);
2916                     }
2917                 }
2918
2919                 if(likely(rc->critical)) {
2920                     if(unlikely(!expression_evaluate(rc->critical))) {
2921                         // calculation failed
2922
2923                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression failed with error: %s",
2924                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
2925
2926                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) {
2927                             rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
2928                             error("Health alarm '%s.%s': critical expression failed with error: %s",
2929                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
2930                         }
2931                     }
2932                     else {
2933                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))
2934                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
2935
2936                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression gave value "
2937                                 CALCULATED_NUMBER_FORMAT
2938                                 ": %s (source: %s)",
2939                               rc->chart?rc->chart:"NOCHART", rc->name,
2940                               rc->critical->result,
2941                               buffer_tostring(rc->critical->error_msg),
2942                               rc->source
2943                         );
2944
2945                         critical_status = rrdcalc_value2status(rc->critical->result);
2946                     }
2947                 }
2948
2949                 int status = RRDCALC_STATUS_UNDEFINED;
2950
2951                 switch(warning_status) {
2952                     case RRDCALC_STATUS_CLEAR:
2953                         status = RRDCALC_STATUS_CLEAR;
2954                         break;
2955
2956                     case RRDCALC_STATUS_RAISED:
2957                         status = RRDCALC_STATUS_WARNING;
2958                         break;
2959
2960                     default:
2961                         break;
2962                 }
2963
2964                 switch(critical_status) {
2965                     case RRDCALC_STATUS_CLEAR:
2966                         if(status == RRDCALC_STATUS_UNDEFINED)
2967                             status = RRDCALC_STATUS_CLEAR;
2968                         break;
2969
2970                     case RRDCALC_STATUS_RAISED:
2971                         status = RRDCALC_STATUS_CRITICAL;
2972                         break;
2973
2974                     default:
2975                         break;
2976                 }
2977
2978                 if(status != rc->status) {
2979                     int delay = 0;
2980
2981                     if(now > rc->delay_up_to_timestamp) {
2982                         rc->delay_up_current = rc->delay_up_duration;
2983                         rc->delay_down_current = rc->delay_down_duration;
2984                         rc->delay_last = 0;
2985                         rc->delay_up_to_timestamp = 0;
2986                     }
2987                     else {
2988                         rc->delay_up_current = (int)(rc->delay_up_current * rc->delay_multiplier);
2989                         if(rc->delay_up_current > rc->delay_max_duration) rc->delay_up_current = rc->delay_max_duration;
2990
2991                         rc->delay_down_current = (int)(rc->delay_down_current * rc->delay_multiplier);
2992                         if(rc->delay_down_current > rc->delay_max_duration) rc->delay_down_current = rc->delay_max_duration;
2993                     }
2994
2995                     if(status > rc->status)
2996                         delay = rc->delay_up_current;
2997                     else
2998                         delay = rc->delay_down_current;
2999
3000                     // COMMENTED: because we do need to send raising alarms
3001                     // if(now + delay < rc->delay_up_to_timestamp)
3002                     //    delay = (int)(rc->delay_up_to_timestamp - now);
3003
3004                     rc->delay_last = delay;
3005                     rc->delay_up_to_timestamp = now + delay;
3006                     health_alarm_log(&localhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, rc->delay_last);
3007                     rc->last_status_change = now;
3008                     rc->status = status;
3009                 }
3010
3011                 rc->last_updated = now;
3012                 rc->next_update = now + rc->update_every;
3013
3014                 if (next_run > rc->next_update)
3015                     next_run = rc->next_update;
3016             }
3017
3018             rrdhost_unlock(&localhost);
3019         }
3020
3021         if (unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
3022             error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
3023
3024         if(unlikely(netdata_exit))
3025             break;
3026
3027         // execute notifications
3028         // and cleanup
3029         health_alarm_log_process(&localhost);
3030
3031         if(unlikely(netdata_exit))
3032             break;
3033         
3034         now = time(NULL);
3035         if(now < next_run) {
3036             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs",
3037                   loop, (int) (next_run - now));
3038             sleep_usec(1000000 * (unsigned long long) (next_run - now));
3039         }
3040         else {
3041             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
3042         }
3043     }
3044
3045     buffer_free(wb);
3046
3047     info("HEALTH thread exiting");
3048     pthread_exit(NULL);
3049     return NULL;
3050 }