]> arthur.barton.de Git - netdata.git/blob - src/health.c
added support for custom variables that can be exposed to alarm expressions without...
[netdata.git] / src / health.c
1 #include "common.h"
2
3 #define RRDVAR_MAX_LENGTH 1024
4
5 struct health_options {
6     const char *health_default_exec;
7     const char *health_default_recipient;
8     const char *log_filename;
9     size_t log_entries_written;
10     FILE *log_fp;
11 };
12
13 static struct health_options health = {
14     .health_default_exec = PLUGINS_DIR "/alarm-notify.sh",
15     .health_default_recipient = "root",
16     .log_filename = VARLIB_DIR "/health/alarm_log.db",
17     .log_entries_written = 0,
18     .log_fp = NULL
19 };
20
21 int health_enabled = 1;
22
23 // ----------------------------------------------------------------------------
24 // health alarm log load/save
25 // no need for locking - only one thread is reading / writing the alarms log
26
27 static inline int health_alarm_log_open(void) {
28     if(health.log_fp)
29         fclose(health.log_fp);
30
31     health.log_fp = fopen(health.log_filename, "a");
32
33     if(health.log_fp) {
34         if (setvbuf(health.log_fp, NULL, _IOLBF, 0) != 0)
35             error("Health: cannot set line buffering on health log file.");
36         return 0;
37     }
38
39     error("Health: cannot open health log file '%s'. Health data will be lost in case of netdata or server crash.", health.log_filename);
40     return -1;
41 }
42
43 static inline void health_alarm_log_close(void) {
44     if(health.log_fp) {
45         fclose(health.log_fp);
46         health.log_fp = NULL;
47     }
48 }
49
50 static inline void health_log_rotate(void) {
51     static size_t rotate_every = 0;
52
53     if(unlikely(rotate_every == 0)) {
54         rotate_every = (size_t)config_get_number("health", "rotate log every lines", 2000);
55         if(rotate_every < 100) rotate_every = 100;
56     }
57
58     if(unlikely(health.log_entries_written > rotate_every)) {
59         health_alarm_log_close();
60
61         char old_filename[FILENAME_MAX + 1];
62         snprintfz(old_filename, FILENAME_MAX, "%s.old", health.log_filename);
63
64         if(unlink(old_filename) == -1 && errno != ENOENT)
65             error("Health: cannot remove old alarms log file '%s'", old_filename);
66
67         if(link(health.log_filename, old_filename) == -1 && errno != ENOENT)
68             error("Health: cannot move file '%s' to '%s'.", health.log_filename, old_filename);
69
70         if(unlink(health.log_filename) == -1 && errno != ENOENT)
71             error("Health: cannot remove old alarms log file '%s'", health.log_filename);
72
73         // open it with truncate
74         health.log_fp = fopen(health.log_filename, "w");
75
76         if(health.log_fp)
77             fclose(health.log_fp);
78         else
79             error("Health: cannot truncate health log '%s'", health.log_filename);
80
81         health.log_fp = NULL;
82
83         health.log_entries_written = 0;
84         health_alarm_log_open();
85     }
86 }
87
88 static inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
89     health_log_rotate();
90
91     if(likely(health.log_fp)) {
92         if(unlikely(fprintf(health.log_fp
93                 , "%c\t%s"
94                   "\t%08x\t%08x\t%08x\t%08x\t%08x"
95                   "\t%08x\t%08x\t%08x"
96                   "\t%08x\t%08x\t%08x"
97                   "\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s"
98                   "\t%d\t%d\t%d\t%d"
99                   "\t%Lf\t%Lf"
100                   "\n"
101                 , (ae->flags & HEALTH_ENTRY_FLAG_SAVED)?'U':'A'
102                 , host->hostname
103
104                 , ae->unique_id
105                 , ae->alarm_id
106                 , ae->alarm_event_id
107                 , ae->updated_by_id
108                 , ae->updates_id
109
110                 , (uint32_t)ae->when
111                 , (uint32_t)ae->duration
112                 , (uint32_t)ae->non_clear_duration
113                 , (uint32_t)ae->flags
114                 , (uint32_t)ae->exec_run_timestamp
115                 , (uint32_t)ae->delay_up_to_timestamp
116
117                 , (ae->name)?ae->name:""
118                 , (ae->chart)?ae->chart:""
119                 , (ae->family)?ae->family:""
120                 , (ae->exec)?ae->exec:""
121                 , (ae->recipient)?ae->recipient:""
122                 , (ae->source)?ae->source:""
123                 , (ae->units)?ae->units:""
124                 , (ae->info)?ae->info:""
125
126                 , ae->exec_code
127                 , ae->new_status
128                 , ae->old_status
129                 , ae->delay
130
131                 , (long double)ae->new_value
132                 , (long double)ae->old_value
133         ) < 0))
134             error("Health: failed to save alarm log entry. Health data may be lost in case of abnormal restart.");
135         else {
136             ae->flags |= HEALTH_ENTRY_FLAG_SAVED;
137             health.log_entries_written++;
138         }
139     }
140 }
141
142 static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filename) {
143     static uint32_t max_unique_id = 0, max_alarm_id = 0;
144     ssize_t loaded = -1, updated = -1, errored = -1, duplicate = -1;
145
146     errno = 0;
147
148     char *s, *buf = mallocz(65536 + 1);
149     size_t line = 0, len = 0;
150     loaded = updated = errored = duplicate = 0;
151
152     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
153
154     while((s = fgets_trim_len(buf, 65536, fp, &len))) {
155         health.log_entries_written++;
156         line++;
157
158         int max_entries = 30, entries = 0;
159         char *pointers[max_entries];
160
161         pointers[entries++] = s++;
162         while(*s) {
163             if(unlikely(*s == '\t')) {
164                 *s = '\0';
165                 pointers[entries++] = ++s;
166                 if(entries >= max_entries) {
167                     error("Health: line %zu of file '%s' has more than %d entries. Ignoring excessive entries.", line, filename, max_entries);
168                     break;
169                 }
170             }
171             else s++;
172         }
173
174         if(likely(*pointers[0] == 'U' || *pointers[0] == 'A')) {
175             ALARM_ENTRY *ae = NULL;
176
177             if(entries < 26) {
178                 error("Health: line %zu of file '%s' should have at least 26 entries, but it has %d. Ignoring it.", line, filename, entries);
179                 errored++;
180                 continue;
181             }
182
183             // check that we have valid ids
184             uint32_t unique_id = (uint32_t)strtoul(pointers[2], NULL, 16);
185             if(!unique_id) {
186                 error("Health: line %zu of file '%s' states alarm entry with invalid unique id %u (%s). Ignoring it.", line, filename, unique_id, pointers[2]);
187                 errored++;
188                 continue;
189             }
190
191             uint32_t alarm_id = (uint32_t)strtoul(pointers[3], NULL, 16);
192             if(!alarm_id) {
193                 error("Health: line %zu of file '%s' states alarm entry for invalid alarm id %u (%s). Ignoring it.", line, filename, alarm_id, pointers[3]);
194                 errored++;
195                 continue;
196             }
197
198             if(unlikely(*pointers[0] == 'A')) {
199                 // make sure it is properly numbered
200                 if(unlikely(host->health_log.alarms && unique_id < host->health_log.alarms->unique_id)) {
201                     error("Health: line %zu of file '%s' has alarm log entry with %u in wrong order. Ignoring it.", line, filename, unique_id);
202                     errored++;
203                     continue;
204                 }
205
206                 ae = callocz(1, sizeof(ALARM_ENTRY));
207             }
208             else if(unlikely(*pointers[0] == 'U')) {
209                 // find the original
210                 for(ae = host->health_log.alarms; ae; ae = ae->next) {
211                     if(unlikely(unique_id == ae->unique_id)) {
212                         if(unlikely(*pointers[0] == 'A')) {
213                             error("Health: line %zu of file '%s' adds duplicate alarm log entry with unique id %u. Using the later."
214                                   , line, filename, unique_id);
215                             *pointers[0] = 'U';
216                             duplicate++;
217                         }
218                         break;
219                     }
220                     else if(unlikely(unique_id > ae->unique_id)) {
221                         // no need to continue
222                         // the linked list is sorted
223                         ae = NULL;
224                         break;
225                     }
226                 }
227
228                 // if not found, skip this line
229                 if(!ae) {
230                     // error("Health: line %zu of file '%s' updates alarm log entry with unique id %u, but it is not found.", line, filename, unique_id);
231                     continue;
232                 }
233             }
234
235             // check for a possible host missmatch
236             //if(strcmp(pointers[1], host->hostname))
237             //    error("Health: line %zu of file '%s' provides an alarm for host '%s' but this is named '%s'.", line, filename, pointers[1], host->hostname);
238
239             ae->unique_id               = unique_id;
240             ae->alarm_id                = alarm_id;
241             ae->alarm_event_id          = (uint32_t)strtoul(pointers[4], NULL, 16);
242             ae->updated_by_id           = (uint32_t)strtoul(pointers[5], NULL, 16);
243             ae->updates_id              = (uint32_t)strtoul(pointers[6], NULL, 16);
244
245             ae->when                    = (uint32_t)strtoul(pointers[7], NULL, 16);
246             ae->duration                = (uint32_t)strtoul(pointers[8], NULL, 16);
247             ae->non_clear_duration      = (uint32_t)strtoul(pointers[9], NULL, 16);
248
249             ae->flags                   = (uint32_t)strtoul(pointers[10], NULL, 16);
250             ae->flags |= HEALTH_ENTRY_FLAG_SAVED;
251
252             ae->exec_run_timestamp      = (uint32_t)strtoul(pointers[11], NULL, 16);
253             ae->delay_up_to_timestamp   = (uint32_t)strtoul(pointers[12], NULL, 16);
254
255             if(unlikely(ae->name)) freez(ae->name);
256             ae->name = strdupz(pointers[13]);
257             ae->hash_name = simple_hash(ae->name);
258
259             if(unlikely(ae->chart)) freez(ae->chart);
260             ae->chart = strdupz(pointers[14]);
261             ae->hash_chart = simple_hash(ae->chart);
262
263             if(unlikely(ae->family)) freez(ae->family);
264             ae->family = strdupz(pointers[15]);
265
266             if(unlikely(ae->exec)) freez(ae->exec);
267             ae->exec = strdupz(pointers[16]);
268             if(!*ae->exec) { freez(ae->exec); ae->exec = NULL; }
269
270             if(unlikely(ae->recipient)) freez(ae->recipient);
271             ae->recipient = strdupz(pointers[17]);
272             if(!*ae->recipient) { freez(ae->recipient); ae->recipient = NULL; }
273
274             if(unlikely(ae->source)) freez(ae->source);
275             ae->source = strdupz(pointers[18]);
276             if(!*ae->source) { freez(ae->source); ae->source = NULL; }
277
278             if(unlikely(ae->units)) freez(ae->units);
279             ae->units = strdupz(pointers[19]);
280             if(!*ae->units) { freez(ae->units); ae->units = NULL; }
281
282             if(unlikely(ae->info)) freez(ae->info);
283             ae->info = strdupz(pointers[20]);
284             if(!*ae->info) { freez(ae->info); ae->info = NULL; }
285
286             ae->exec_code   = atoi(pointers[21]);
287             ae->new_status  = atoi(pointers[22]);
288             ae->old_status  = atoi(pointers[23]);
289             ae->delay       = atoi(pointers[24]);
290
291             ae->new_value   = strtold(pointers[25], NULL);
292             ae->old_value   = strtold(pointers[26], NULL);
293
294             // add it to host if not already there
295             if(unlikely(*pointers[0] == 'A')) {
296                 ae->next = host->health_log.alarms;
297                 host->health_log.alarms = ae;
298                 loaded++;
299             }
300             else updated++;
301
302             if(unlikely(ae->unique_id > max_unique_id))
303                 max_unique_id = ae->unique_id;
304
305             if(unlikely(ae->alarm_id >= max_alarm_id))
306                 max_alarm_id = ae->alarm_id;
307         }
308         else {
309             error("Health: line %zu of file '%s' is invalid (unrecognized entry type '%s').", line, filename, pointers[0]);
310             errored++;
311         }
312     }
313
314     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
315
316     freez(buf);
317
318     if(!max_unique_id) max_unique_id = (uint32_t)time(NULL);
319     if(!max_alarm_id)  max_alarm_id  = (uint32_t)time(NULL);
320
321     host->health_log.next_log_id = max_unique_id + 1;
322     host->health_log.next_alarm_id = max_alarm_id + 1;
323
324     debug(D_HEALTH, "Health: loaded file '%s' with %zd new alarm entries, updated %zd alarms, errors %zd entries, duplicate %zd", filename, loaded, updated, errored, duplicate);
325     return loaded;
326 }
327
328 static inline void health_alarm_log_load(RRDHOST *host) {
329     health_alarm_log_close();
330
331     char filename[FILENAME_MAX + 1];
332     snprintfz(filename, FILENAME_MAX, "%s.old", health.log_filename);
333     FILE *fp = fopen(filename, "r");
334     if(!fp)
335         error("Health: cannot open health file: %s", filename);
336     else {
337         health_alarm_log_read(host, fp, filename);
338         fclose(fp);
339     }
340
341     health.log_entries_written = 0;
342     fp = fopen(health.log_filename, "r");
343     if(!fp)
344         error("Health: cannot open health file: %s", health.log_filename);
345     else {
346         health_alarm_log_read(host, fp, health.log_filename);
347         fclose(fp);
348     }
349
350     health_alarm_log_open();
351 }
352
353
354 // ----------------------------------------------------------------------------
355 // health alarm log management
356
357 static inline void health_alarm_log(RRDHOST *host,
358                 uint32_t alarm_id, uint32_t alarm_event_id,
359                 time_t when,
360                 const char *name, const char *chart, const char *family,
361                 const char *exec, const char *recipient, time_t duration,
362                 calculated_number old_value, calculated_number new_value,
363                 int old_status, int new_status,
364                 const char *source,
365                 const char *units,
366                 const char *info,
367                 int delay
368 ) {
369     debug(D_HEALTH, "Health adding alarm log entry with id: %u", host->health_log.next_log_id);
370
371     ALARM_ENTRY *ae = callocz(1, sizeof(ALARM_ENTRY));
372     ae->name = strdupz(name);
373     ae->hash_name = simple_hash(ae->name);
374
375     if(chart) {
376         ae->chart = strdupz(chart);
377         ae->hash_chart = simple_hash(ae->chart);
378     }
379
380     if(family)
381         ae->family = strdupz(family);
382
383     if(exec) ae->exec = strdupz(exec);
384     if(recipient) ae->recipient = strdupz(recipient);
385     if(source) ae->source = strdupz(source);
386     if(units) ae->units = strdupz(units);
387     if(info) ae->info = strdupz(info);
388
389     ae->unique_id = host->health_log.next_log_id++;
390     ae->alarm_id = alarm_id;
391     ae->alarm_event_id = alarm_event_id;
392     ae->when = when;
393     ae->old_value = old_value;
394     ae->new_value = new_value;
395     ae->old_status = old_status;
396     ae->new_status = new_status;
397     ae->duration = duration;
398     ae->delay = delay;
399     ae->delay_up_to_timestamp = when + delay;
400
401     if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL)
402         ae->non_clear_duration += ae->duration;
403
404     // link it
405     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
406     ae->next = host->health_log.alarms;
407     host->health_log.alarms = ae;
408     host->health_log.count++;
409     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
410
411     // match previous alarms
412     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
413     ALARM_ENTRY *t;
414     for(t = host->health_log.alarms ; t ; t = t->next) {
415         if(t != ae && t->alarm_id == ae->alarm_id) {
416             if(!(t->flags & HEALTH_ENTRY_FLAG_UPDATED) && !t->updated_by_id) {
417                 t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
418                 t->updated_by_id = ae->unique_id;
419                 ae->updates_id = t->unique_id;
420
421                 if((t->new_status == RRDCALC_STATUS_WARNING || t->new_status == RRDCALC_STATUS_CRITICAL) &&
422                    (t->old_status == RRDCALC_STATUS_WARNING || t->old_status == RRDCALC_STATUS_CRITICAL))
423                     ae->non_clear_duration += t->non_clear_duration;
424
425                 health_alarm_log_save(host, t);
426             }
427
428             // no need to continue
429             break;
430         }
431     }
432     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
433
434     health_alarm_log_save(host, ae);
435 }
436
437 // ----------------------------------------------------------------------------
438 // RRDVAR management
439
// Sanitize a variable name in place: every character that is not
// alphanumeric, '.' or '_' is replaced with '_'.
//
// variable - NUL terminated, writable string
//
// Returns the number of characters that were replaced.
static inline int rrdvar_fix_name(char *variable) {
    int fixed = 0;

    for( ; *variable ; variable++) {
        // <ctype.h> functions require a value representable as unsigned char;
        // passing a plain (possibly negative) char is undefined behaviour
        unsigned char c = (unsigned char)*variable;

        if(!isalnum(c) && c != '.' && c != '_') {
            *variable = '_';
            fixed++;
        }
    }

    return fixed;
}
453
454 int rrdvar_compare(void* a, void* b) {
455     if(((RRDVAR *)a)->hash < ((RRDVAR *)b)->hash) return -1;
456     else if(((RRDVAR *)a)->hash > ((RRDVAR *)b)->hash) return 1;
457     else return strcmp(((RRDVAR *)a)->name, ((RRDVAR *)b)->name);
458 }
459
460 static inline RRDVAR *rrdvar_index_add(avl_tree_lock *tree, RRDVAR *rv) {
461     RRDVAR *ret = (RRDVAR *)avl_insert_lock(tree, (avl *)(rv));
462     if(ret != rv)
463         debug(D_VARIABLES, "Request to insert RRDVAR '%s' into index failed. Already exists.", rv->name);
464
465     return ret;
466 }
467
468 static inline RRDVAR *rrdvar_index_del(avl_tree_lock *tree, RRDVAR *rv) {
469     RRDVAR *ret = (RRDVAR *)avl_remove_lock(tree, (avl *)(rv));
470     if(!ret)
471         error("Request to remove RRDVAR '%s' from index failed. Not Found.", rv->name);
472
473     return ret;
474 }
475
476 static inline RRDVAR *rrdvar_index_find(avl_tree_lock *tree, const char *name, uint32_t hash) {
477     RRDVAR tmp;
478     tmp.name = (char *)name;
479     tmp.hash = (hash)?hash:simple_hash(tmp.name);
480
481     return (RRDVAR *)avl_search_lock(tree, (avl *)&tmp);
482 }
483
484 static inline void rrdvar_free(RRDHOST *host, avl_tree_lock *tree, RRDVAR *rv) {
485     (void)host;
486
487     if(!rv) return;
488
489     if(tree) {
490         debug(D_VARIABLES, "Deleting variable '%s'", rv->name);
491         if(unlikely(!rrdvar_index_del(tree, rv)))
492             error("Attempted to delete variable '%s' from host '%s', but it is not found.", rv->name, host->hostname);
493     }
494
495     freez(rv->name);
496     freez(rv);
497 }
498
499 static inline RRDVAR *rrdvar_create_and_index(const char *scope, avl_tree_lock *tree, const char *name, int type, void *value) {
500     char *variable = strdupz(name);
501     rrdvar_fix_name(variable);
502     uint32_t hash = simple_hash(variable);
503
504     RRDVAR *rv = rrdvar_index_find(tree, variable, hash);
505     if(unlikely(!rv)) {
506         debug(D_VARIABLES, "Variable '%s' not found in scope '%s'. Creating a new one.", variable, scope);
507
508         rv = callocz(1, sizeof(RRDVAR));
509         rv->name = variable;
510         rv->hash = hash;
511         rv->type = type;
512         rv->value = value;
513
514         RRDVAR *ret = rrdvar_index_add(tree, rv);
515         if(unlikely(ret != rv)) {
516             debug(D_VARIABLES, "Variable '%s' in scope '%s' already exists", variable, scope);
517             rrdvar_free(NULL, NULL, rv);
518             rv = NULL;
519         }
520         else
521             debug(D_VARIABLES, "Variable '%s' created in scope '%s'", variable, scope);
522     }
523     else {
524         debug(D_VARIABLES, "Variable '%s' is already found in scope '%s'.", variable, scope);
525
526         // already exists
527         freez(variable);
528
529         // this is important
530         // it must return NULL - not the existing variable - or double-free will happen
531         rv = NULL;
532     }
533
534     return rv;
535 }
536
537 // ----------------------------------------------------------------------------
538 // CUSTOM VARIABLES
539
540 RRDVAR *rrdvar_custom_host_variable_create(RRDHOST *host, const char *name) {
541     calculated_number *v = callocz(1, sizeof(calculated_number));
542     *v = NAN;
543     RRDVAR *rv = rrdvar_create_and_index("host", &host->variables_root_index, name, RRDVAR_TYPE_CALCULATED_ALLOCATED, v);
544     if(unlikely(!rv)) {
545         free(v);
546         error("Requested variable '%s' already exists - possibly 2 plugins will be updating it at the same time", name);
547
548         char *variable = strdupz(name);
549         rrdvar_fix_name(variable);
550         uint32_t hash = simple_hash(variable);
551
552         rv = rrdvar_index_find(&host->variables_root_index, variable, hash);
553     }
554
555     return rv;
556 }
557
558 void rrdvar_custom_host_variable_destroy(RRDHOST *host, const char *name) {
559     char *variable = strdupz(name);
560     rrdvar_fix_name(variable);
561     uint32_t hash = simple_hash(variable);
562
563     RRDVAR *rv = rrdvar_index_find(&host->variables_root_index, variable, hash);
564     freez(variable);
565
566     if(!rv) {
567         error("Attempted to remove variable '%s' from host '%s', but it does not exist.", name, host->hostname);
568         return;
569     }
570
571     if(rv->type != RRDVAR_TYPE_CALCULATED_ALLOCATED) {
572         error("Attempted to remove variable '%s' from host '%s', but it does not a custom allocated variable.", name, host->hostname);
573         return;
574     }
575
576     if(!rrdvar_index_del(&host->variables_root_index, rv)) {
577         error("Attempted to remove variable '%s' from host '%s', but it cannot be found.", name, host->hostname);
578         return;
579     }
580
581     freez(rv->name);
582     freez(rv->value);
583     freez(rv);
584 }
585
586 void rrdvar_custom_host_variable_set(RRDVAR *rv, calculated_number value) {
587     if(rv->type != RRDVAR_TYPE_CALCULATED_ALLOCATED)
588         error("requested to set variable '%s' to value " CALCULATED_NUMBER_FORMAT " but the variable is not a custom one.", rv->name, value);
589     else {
590         calculated_number *v = rv->value;
591         *v = value;
592     }
593 }
594
595 // ----------------------------------------------------------------------------
596 // RRDVAR lookup
597
598 static calculated_number rrdvar2number(RRDVAR *rv) {
599     switch(rv->type) {
600         case RRDVAR_TYPE_CALCULATED_ALLOCATED:
601         case RRDVAR_TYPE_CALCULATED: {
602             calculated_number *n = (calculated_number *)rv->value;
603             return *n;
604         }
605
606         case RRDVAR_TYPE_TIME_T: {
607             time_t *n = (time_t *)rv->value;
608             return *n;
609         }
610
611         case RRDVAR_TYPE_COLLECTED: {
612             collected_number *n = (collected_number *)rv->value;
613             return *n;
614         }
615
616         case RRDVAR_TYPE_TOTAL: {
617             total_number *n = (total_number *)rv->value;
618             return *n;
619         }
620
621         case RRDVAR_TYPE_INT: {
622             int *n = (int *)rv->value;
623             return *n;
624         }
625
626         default:
627             error("I don't know how to convert RRDVAR type %d to calculated_number", rv->type);
628             return NAN;
629     }
630 }
631
632 int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, calculated_number *result) {
633     RRDSET *st = rc->rrdset;
634     RRDVAR *rv;
635
636     if(!st) return 0;
637
638     rv = rrdvar_index_find(&st->variables_root_index, variable, hash);
639     if(rv) {
640         *result = rrdvar2number(rv);
641         return 1;
642     }
643
644     rv = rrdvar_index_find(&st->rrdfamily->variables_root_index, variable, hash);
645     if(rv) {
646         *result = rrdvar2number(rv);
647         return 1;
648     }
649
650     rv = rrdvar_index_find(&st->rrdhost->variables_root_index, variable, hash);
651     if(rv) {
652         *result = rrdvar2number(rv);
653         return 1;
654     }
655
656     return 0;
657 }
658
659 // ----------------------------------------------------------------------------
660 // RRDVAR to JSON
661
662 struct variable2json_helper {
663     BUFFER *buf;
664     size_t counter;
665 };
666
667 static void single_variable2json(void *entry, void *data) {
668     struct variable2json_helper *helper = (struct variable2json_helper *)data;
669     RRDVAR *rv = (RRDVAR *)entry;
670     calculated_number value = rrdvar2number(rv);
671
672     if(unlikely(isnan(value) || isinf(value)))
673         buffer_sprintf(helper->buf, "%s\n\t\t\"%s\": null", helper->counter?",":"", rv->name);
674     else
675         buffer_sprintf(helper->buf, "%s\n\t\t\"%s\": %0.5Lf", helper->counter?",":"", rv->name, (long double)value);
676
677     helper->counter++;
678 }
679
680 void health_api_v1_chart_variables2json(RRDSET *st, BUFFER *buf) {
681     struct variable2json_helper helper = {
682             .buf = buf,
683             .counter = 0
684     };
685
686     buffer_sprintf(buf, "{\n\t\"chart\": \"%s.%s\",\n\t\"chart_name\": \"%s.%s\",\n\t\"chart_variables\": {", st->type, st->id, st->type, st->name);
687     avl_traverse_lock(&st->variables_root_index, single_variable2json, (void *)&helper);
688     buffer_sprintf(buf, "\n\t},\n\t\"family\": \"%s\",\n\t\"family_variables\": {", st->family);
689     helper.counter = 0;
690     avl_traverse_lock(&st->rrdfamily->variables_root_index, single_variable2json, (void *)&helper);
691     buffer_sprintf(buf, "\n\t},\n\t\"host\": \"%s\",\n\t\"host_variables\": {", st->rrdhost->hostname);
692     helper.counter = 0;
693     avl_traverse_lock(&st->rrdhost->variables_root_index, single_variable2json, (void *)&helper);
694     buffer_strcat(buf, "\n\t}\n}\n");
695 }
696
697
698 // ----------------------------------------------------------------------------
699 // RRDDIMVAR management
700 // DIMENSION VARIABLES
701
702 #define RRDDIMVAR_ID_MAX 1024
703
704 static inline void rrddimvar_free_variables(RRDDIMVAR *rs) {
705     RRDDIM *rd = rs->rrddim;
706     RRDSET *st = rd->rrdset;
707
708     // CHART VARIABLES FOR THIS DIMENSION
709
710     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->var_local_id);
711     rs->var_local_id = NULL;
712
713     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->var_local_name);
714     rs->var_local_name = NULL;
715
716     // FAMILY VARIABLES FOR THIS DIMENSION
717
718     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_id);
719     rs->var_family_id = NULL;
720
721     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_name);
722     rs->var_family_name = NULL;
723
724     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_contextid);
725     rs->var_family_contextid = NULL;
726
727     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_contextname);
728     rs->var_family_contextname = NULL;
729
730     // HOST VARIABLES FOR THIS DIMENSION
731
732     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartidid);
733     rs->var_host_chartidid = NULL;
734
735     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartidname);
736     rs->var_host_chartidname = NULL;
737
738     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartnameid);
739     rs->var_host_chartnameid = NULL;
740
741     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartnamename);
742     rs->var_host_chartnamename = NULL;
743
744     // KEYS
745
746     freez(rs->key_id);
747     rs->key_id = NULL;
748
749     freez(rs->key_name);
750     rs->key_name = NULL;
751
752     freez(rs->key_fullidid);
753     rs->key_fullidid = NULL;
754
755     freez(rs->key_fullidname);
756     rs->key_fullidname = NULL;
757
758     freez(rs->key_contextid);
759     rs->key_contextid = NULL;
760
761     freez(rs->key_contextname);
762     rs->key_contextname = NULL;
763
764     freez(rs->key_fullnameid);
765     rs->key_fullnameid = NULL;
766
767     freez(rs->key_fullnamename);
768     rs->key_fullnamename = NULL;
769 }
770
771 static inline void rrddimvar_create_variables(RRDDIMVAR *rs) {
772     rrddimvar_free_variables(rs);
773
774     RRDDIM *rd = rs->rrddim;
775     RRDSET *st = rd->rrdset;
776
777     char buffer[RRDDIMVAR_ID_MAX + 1];
778
779     // KEYS
780
781     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->id, rs->suffix);
782     rs->key_id = strdupz(buffer);
783
784     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
785     rs->key_name = strdupz(buffer);
786
787     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->key_id);
788     rs->key_fullidid = strdupz(buffer);
789
790     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->key_name);
791     rs->key_fullidname = strdupz(buffer);
792
793     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->context, rs->key_id);
794     rs->key_contextid = strdupz(buffer);
795
796     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->context, rs->key_name);
797     rs->key_contextname = strdupz(buffer);
798
799     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->key_id);
800     rs->key_fullnameid = strdupz(buffer);
801
802     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->key_name);
803     rs->key_fullnamename = strdupz(buffer);
804
805     // CHART VARIABLES FOR THIS DIMENSION
806     // -----------------------------------
807     //
808     // dimensions are available as:
809     // - $id
810     // - $name
811
812     rs->var_local_id           = rrdvar_create_and_index("local", &st->variables_root_index, rs->key_id, rs->type, rs->value);
813     rs->var_local_name         = rrdvar_create_and_index("local", &st->variables_root_index, rs->key_name, rs->type, rs->value);
814
815     // FAMILY VARIABLES FOR THIS DIMENSION
816     // -----------------------------------
817     //
818     // dimensions are available as:
819     // - $id                 (only the first, when multiple overlap)
820     // - $name               (only the first, when multiple overlap)
821     // - $chart-context.id
822     // - $chart-context.name
823
824     rs->var_family_id          = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_id, rs->type, rs->value);
825     rs->var_family_name        = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_name, rs->type, rs->value);
826     rs->var_family_contextid   = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_contextid, rs->type, rs->value);
827     rs->var_family_contextname = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_contextname, rs->type, rs->value);
828
829     // HOST VARIABLES FOR THIS DIMENSION
830     // -----------------------------------
831     //
832     // dimensions are available as:
833     // - $chart-id.id
834     // - $chart-id.name
835     // - $chart-name.id
836     // - $chart-name.name
837
838     rs->var_host_chartidid      = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullidid, rs->type, rs->value);
839     rs->var_host_chartidname    = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullidname, rs->type, rs->value);
840     rs->var_host_chartnameid    = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullnameid, rs->type, rs->value);
841     rs->var_host_chartnamename  = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullnamename, rs->type, rs->value);
842 }
843
844 RRDDIMVAR *rrddimvar_create(RRDDIM *rd, int type, const char *prefix, const char *suffix, void *value, uint32_t options) {
845     RRDSET *st = rd->rrdset;
846
847     debug(D_VARIABLES, "RRDDIMSET create for chart id '%s' name '%s', dimension id '%s', name '%s%s%s'", st->id, st->name, rd->id, (prefix)?prefix:"", rd->name, (suffix)?suffix:"");
848
849     if(!prefix) prefix = "";
850     if(!suffix) suffix = "";
851
852     RRDDIMVAR *rs = (RRDDIMVAR *)callocz(1, sizeof(RRDDIMVAR));
853
854     rs->prefix = strdupz(prefix);
855     rs->suffix = strdupz(suffix);
856
857     rs->type = type;
858     rs->value = value;
859     rs->options = options;
860     rs->rrddim = rd;
861
862     rs->next = rd->variables;
863     rd->variables = rs;
864
865     rrddimvar_create_variables(rs);
866
867     return rs;
868 }
869
870 void rrddimvar_rename_all(RRDDIM *rd) {
871     RRDSET *st = rd->rrdset;
872     debug(D_VARIABLES, "RRDDIMSET rename for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
873
874     RRDDIMVAR *rs, *next = rd->variables;
875     while((rs = next)) {
876         next = rs->next;
877         rrddimvar_create_variables(rs);
878     }
879 }
880
881 void rrddimvar_free(RRDDIMVAR *rs) {
882     RRDDIM *rd = rs->rrddim;
883     RRDSET *st = rd->rrdset;
884     debug(D_VARIABLES, "RRDDIMSET free for chart id '%s' name '%s', dimension id '%s', name '%s', prefix='%s', suffix='%s'", st->id, st->name, rd->id, rd->name, rs->prefix, rs->suffix);
885
886     rrddimvar_free_variables(rs);
887
888     if(rd->variables == rs) {
889         debug(D_VARIABLES, "RRDDIMSET removing first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
890         rd->variables = rs->next;
891     }
892     else {
893         debug(D_VARIABLES, "RRDDIMSET removing non-first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
894         RRDDIMVAR *t;
895         for (t = rd->variables; t && t->next != rs; t = t->next) ;
896         if(!t) error("RRDDIMVAR '%s' not found in dimension '%s/%s' variables linked list", rs->key_name, st->id, rd->id);
897         else t->next = rs->next;
898     }
899
900     freez(rs->prefix);
901     freez(rs->suffix);
902     freez(rs);
903 }
904
905 // ----------------------------------------------------------------------------
906 // RRDSETVAR management
907 // CHART VARIABLES
908
909 static inline void rrdsetvar_free_variables(RRDSETVAR *rs) {
910     RRDSET *st = rs->rrdset;
911
912     // CHART
913
914     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->var_local);
915     rs->var_local = NULL;
916
917     // FAMILY
918
919     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family);
920     rs->var_family = NULL;
921
922     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host);
923     rs->var_host = NULL;
924
925     // HOST
926
927     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_name);
928     rs->var_family_name = NULL;
929
930     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_name);
931     rs->var_host_name = NULL;
932
933     // KEYS
934
935     freez(rs->key_fullid);
936     rs->key_fullid = NULL;
937
938     freez(rs->key_fullname);
939     rs->key_fullname = NULL;
940 }
941
942 static inline void rrdsetvar_create_variables(RRDSETVAR *rs) {
943     rrdsetvar_free_variables(rs);
944
945     RRDSET *st = rs->rrdset;
946
947     // KEYS
948
949     char buffer[RRDVAR_MAX_LENGTH + 1];
950     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->id, rs->variable);
951     rs->key_fullid = strdupz(buffer);
952
953     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rs->variable);
954     rs->key_fullname = strdupz(buffer);
955
956     // CHART
957
958     rs->var_local       = rrdvar_create_and_index("local",  &st->variables_root_index,               rs->variable, rs->type, rs->value);
959
960     // FAMILY
961
962     rs->var_family      = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index,    rs->key_fullid,   rs->type, rs->value);
963     rs->var_family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index,    rs->key_fullname, rs->type, rs->value);
964
965     // HOST
966
967     rs->var_host        = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index,      rs->key_fullid,   rs->type, rs->value);
968     rs->var_host_name   = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index,      rs->key_fullname, rs->type, rs->value);
969
970 }
971
972 RRDSETVAR *rrdsetvar_create(RRDSET *st, const char *variable, int type, void *value, uint32_t options) {
973     debug(D_VARIABLES, "RRDVARSET create for chart id '%s' name '%s' with variable name '%s'", st->id, st->name, variable);
974     RRDSETVAR *rs = (RRDSETVAR *)callocz(1, sizeof(RRDSETVAR));
975
976     rs->variable = strdupz(variable);
977     rs->type = type;
978     rs->value = value;
979     rs->options = options;
980     rs->rrdset = st;
981
982     rs->next = st->variables;
983     st->variables = rs;
984
985     rrdsetvar_create_variables(rs);
986
987     return rs;
988 }
989
990 void rrdsetvar_rename_all(RRDSET *st) {
991     debug(D_VARIABLES, "RRDSETVAR rename for chart id '%s' name '%s'", st->id, st->name);
992
993     RRDSETVAR *rs, *next = st->variables;
994     while((rs = next)) {
995         next = rs->next;
996         rrdsetvar_create_variables(rs);
997     }
998
999     rrdsetcalc_link_matching(st);
1000 }
1001
1002 void rrdsetvar_free(RRDSETVAR *rs) {
1003     RRDSET *st = rs->rrdset;
1004     debug(D_VARIABLES, "RRDSETVAR free for chart id '%s' name '%s', variable '%s'", st->id, st->name, rs->variable);
1005
1006     if(st->variables == rs) {
1007         st->variables = rs->next;
1008     }
1009     else {
1010         RRDSETVAR *t;
1011         for (t = st->variables; t && t->next != rs; t = t->next);
1012         if(!t) error("RRDSETVAR '%s' not found in chart '%s' variables linked list", rs->key_fullname, st->id);
1013         else t->next = rs->next;
1014     }
1015
1016     rrdsetvar_free_variables(rs);
1017
1018     freez(rs->variable);
1019     freez(rs);
1020 }
1021
1022 // ----------------------------------------------------------------------------
1023 // RRDCALC management
1024
1025 static inline const char *rrdcalc_status2string(int status) {
1026     switch(status) {
1027         case RRDCALC_STATUS_REMOVED:
1028             return "REMOVED";
1029
1030         case RRDCALC_STATUS_UNDEFINED:
1031             return "UNDEFINED";
1032
1033         case RRDCALC_STATUS_UNINITIALIZED:
1034             return "UNINITIALIZED";
1035
1036         case RRDCALC_STATUS_CLEAR:
1037             return "CLEAR";
1038
1039         case RRDCALC_STATUS_RAISED:
1040             return "RAISED";
1041
1042         case RRDCALC_STATUS_WARNING:
1043             return "WARNING";
1044
1045         case RRDCALC_STATUS_CRITICAL:
1046             return "CRITICAL";
1047
1048         default:
1049             error("Unknown alarm status %d", status);
1050             return "UNKNOWN";
1051     }
1052 }
1053
1054 static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
1055     debug(D_HEALTH, "Health linking alarm '%s.%s' to chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, st->rrdhost->hostname);
1056
1057     rc->last_status_change = time(NULL);
1058     rc->rrdset = st;
1059
1060     rc->rrdset_next = st->alarms;
1061     rc->rrdset_prev = NULL;
1062     
1063     if(rc->rrdset_next)
1064         rc->rrdset_next->rrdset_prev = rc;
1065
1066     st->alarms = rc;
1067
1068     if(rc->update_every < rc->rrdset->update_every) {
1069         error("Health alarm '%s.%s' has update every %d, less than chart update every %d. Setting alarm update frequency to %d.", rc->rrdset->id, rc->name, rc->update_every, rc->rrdset->update_every, rc->rrdset->update_every);
1070         rc->update_every = rc->rrdset->update_every;
1071     }
1072
1073     if(!isnan(rc->green) && isnan(st->green)) {
1074         debug(D_HEALTH, "Health alarm '%s.%s' green threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->green, rc->green);
1075         st->green = rc->green;
1076     }
1077
1078     if(!isnan(rc->red) && isnan(st->red)) {
1079         debug(D_HEALTH, "Health alarm '%s.%s' red threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->red, rc->red);
1080         st->red = rc->red;
1081     }
1082
1083     rc->local  = rrdvar_create_and_index("local",  &st->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
1084     rc->family = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
1085
1086     char fullname[RRDVAR_MAX_LENGTH + 1];
1087     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->id, rc->name);
1088     rc->hostid   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
1089
1090     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rc->name);
1091     rc->hostname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
1092
1093         if(!rc->units) rc->units = strdupz(st->units);
1094
1095     {
1096         time_t now = time(NULL);
1097         health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_UNINITIALIZED, rc->source, rc->units, rc->info, 0);
1098     }
1099 }
1100
1101 static inline int rrdcalc_is_matching_this_rrdset(RRDCALC *rc, RRDSET *st) {
1102     if(     (rc->hash_chart == st->hash      && !strcmp(rc->chart, st->id)) ||
1103             (rc->hash_chart == st->hash_name && !strcmp(rc->chart, st->name)))
1104         return 1;
1105
1106     return 0;
1107 }
1108
1109 // this has to be called while the RRDHOST is locked
1110 inline void rrdsetcalc_link_matching(RRDSET *st) {
1111     // debug(D_HEALTH, "find matching alarms for chart '%s'", st->id);
1112
1113     RRDCALC *rc;
1114     for(rc = st->rrdhost->alarms; rc ; rc = rc->next) {
1115         if(unlikely(rc->rrdset))
1116             continue;
1117
1118         if(unlikely(rrdcalc_is_matching_this_rrdset(rc, st)))
1119             rrdsetcalc_link(st, rc);
1120     }
1121 }
1122
1123 // this has to be called while the RRDHOST is locked
1124 inline void rrdsetcalc_unlink(RRDCALC *rc) {
1125     RRDSET *st = rc->rrdset;
1126
1127     if(!st) {
1128         debug(D_HEALTH, "Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
1129         error("Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
1130         return;
1131     }
1132
1133     {
1134         time_t now = time(NULL);
1135         health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0);
1136     }
1137
1138     RRDHOST *host = st->rrdhost;
1139
1140     debug(D_HEALTH, "Health unlinking alarm '%s.%s' from chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, host->hostname);
1141
1142     // unlink it
1143     if(rc->rrdset_prev)
1144         rc->rrdset_prev->rrdset_next = rc->rrdset_next;
1145
1146     if(rc->rrdset_next)
1147         rc->rrdset_next->rrdset_prev = rc->rrdset_prev;
1148
1149     if(st->alarms == rc)
1150         st->alarms = rc->rrdset_next;
1151
1152     rc->rrdset_prev = rc->rrdset_next = NULL;
1153
1154     rrdvar_free(st->rrdhost, &st->variables_root_index, rc->local);
1155     rc->local = NULL;
1156
1157     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rc->family);
1158     rc->family = NULL;
1159
1160     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostid);
1161     rc->hostid = NULL;
1162
1163     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostname);
1164     rc->hostname = NULL;
1165
1166     rc->rrdset = NULL;
1167
1168     // RRDCALC will remain in RRDHOST
1169     // so that if the matching chart is found in the future
1170     // it will be applied automatically
1171 }
1172
1173 RRDCALC *rrdcalc_find(RRDSET *st, const char *name) {
1174     RRDCALC *rc;
1175     uint32_t hash = simple_hash(name);
1176
1177     for( rc = st->alarms; rc ; rc = rc->rrdset_next ) {
1178         if(unlikely(rc->hash == hash && !strcmp(rc->name, name)))
1179             return rc;
1180     }
1181
1182     return NULL;
1183 }
1184
1185 static inline int rrdcalc_exists(RRDHOST *host, const char *chart, const char *name, uint32_t hash_chart, uint32_t hash_name) {
1186     RRDCALC *rc;
1187
1188     if(unlikely(!chart)) {
1189         error("attempt to find RRDCALC '%s' without giving a chart name", name);
1190         return 1;
1191     }
1192
1193     if(unlikely(!hash_chart)) hash_chart = simple_hash(chart);
1194     if(unlikely(!hash_name))  hash_name  = simple_hash(name);
1195
1196     // make sure it does not already exist
1197     for(rc = host->alarms; rc ; rc = rc->next) {
1198         if (unlikely(rc->chart && rc->hash == hash_name && rc->hash_chart == hash_chart && !strcmp(name, rc->name) && !strcmp(chart, rc->chart))) {
1199             debug(D_HEALTH, "Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
1200             error("Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
1201             return 1;
1202         }
1203     }
1204
1205     return 0;
1206 }
1207
1208 static inline uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const char *name, uint32_t *next_event_id) {
1209     if(chart && name) {
1210         uint32_t hash_chart = simple_hash(chart);
1211         uint32_t hash_name = simple_hash(name);
1212
1213         // re-use old IDs, by looking them up in the alarm log
1214         ALARM_ENTRY *ae;
1215         for(ae = host->health_log.alarms; ae ;ae = ae->next) {
1216             if(unlikely(ae->hash_name == hash_name && ae->hash_chart == hash_chart && !strcmp(name, ae->name) && !strcmp(chart, ae->chart))) {
1217                 if(next_event_id) *next_event_id = ae->alarm_event_id + 1;
1218                 return ae->alarm_id;
1219             }
1220         }
1221     }
1222
1223     return host->health_log.next_alarm_id++;
1224 }
1225
1226 static inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) {
1227     rrdhost_check_rdlock(host);
1228
1229     if(rc->calculation) {
1230         rc->calculation->status = &rc->status;
1231         rc->calculation->this = &rc->value;
1232         rc->calculation->after = &rc->db_after;
1233         rc->calculation->before = &rc->db_before;
1234         rc->calculation->rrdcalc = rc;
1235     }
1236
1237     if(rc->warning) {
1238         rc->warning->status = &rc->status;
1239         rc->warning->this = &rc->value;
1240         rc->warning->after = &rc->db_after;
1241         rc->warning->before = &rc->db_before;
1242         rc->warning->rrdcalc = rc;
1243     }
1244
1245     if(rc->critical) {
1246         rc->critical->status = &rc->status;
1247         rc->critical->this = &rc->value;
1248         rc->critical->after = &rc->db_after;
1249         rc->critical->before = &rc->db_before;
1250         rc->critical->rrdcalc = rc;
1251     }
1252
1253     // link it to the host
1254     if(likely(host->alarms)) {
1255         // append it
1256         RRDCALC *t;
1257         for(t = host->alarms; t && t->next ; t = t->next) ;
1258         t->next = rc;
1259     }
1260     else {
1261         host->alarms = rc;
1262     }
1263
1264     // link it to its chart
1265     RRDSET *st;
1266     for(st = host->rrdset_root; st ; st = st->next) {
1267         if(rrdcalc_is_matching_this_rrdset(rc, st)) {
1268             rrdsetcalc_link(st, rc);
1269             break;
1270         }
1271     }
1272 }
1273
1274 static inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart) {
1275
1276     debug(D_HEALTH, "Health creating dynamic alarm (from template) '%s.%s'", chart, rt->name);
1277
1278     if(rrdcalc_exists(host, chart, rt->name, 0, 0))
1279         return NULL;
1280
1281     RRDCALC *rc = callocz(1, sizeof(RRDCALC));
1282     rc->next_event_id = 1;
1283     rc->id = rrdcalc_get_unique_id(host, chart, rt->name, &rc->next_event_id);
1284     rc->name = strdupz(rt->name);
1285     rc->hash = simple_hash(rc->name);
1286     rc->chart = strdupz(chart);
1287     rc->hash_chart = simple_hash(rc->chart);
1288
1289     if(rt->dimensions) rc->dimensions = strdupz(rt->dimensions);
1290
1291     rc->green = rt->green;
1292     rc->red = rt->red;
1293     rc->value = NAN;
1294     rc->old_value = NAN;
1295
1296     rc->delay_up_duration = rt->delay_up_duration;
1297     rc->delay_down_duration = rt->delay_down_duration;
1298     rc->delay_max_duration = rt->delay_max_duration;
1299     rc->delay_multiplier = rt->delay_multiplier;
1300
1301     rc->group = rt->group;
1302     rc->after = rt->after;
1303     rc->before = rt->before;
1304     rc->update_every = rt->update_every;
1305     rc->options = rt->options;
1306
1307     if(rt->exec) rc->exec = strdupz(rt->exec);
1308     if(rt->recipient) rc->recipient = strdupz(rt->recipient);
1309     if(rt->source) rc->source = strdupz(rt->source);
1310     if(rt->units) rc->units = strdupz(rt->units);
1311     if(rt->info) rc->info = strdupz(rt->info);
1312
1313     if(rt->calculation) {
1314         rc->calculation = expression_parse(rt->calculation->source, NULL, NULL);
1315         if(!rc->calculation)
1316             error("Health alarm '%s.%s': failed to parse calculation expression '%s'", chart, rt->name, rt->calculation->source);
1317     }
1318     if(rt->warning) {
1319         rc->warning = expression_parse(rt->warning->source, NULL, NULL);
1320         if(!rc->warning)
1321             error("Health alarm '%s.%s': failed to re-parse warning expression '%s'", chart, rt->name, rt->warning->source);
1322     }
1323     if(rt->critical) {
1324         rc->critical = expression_parse(rt->critical->source, NULL, NULL);
1325         if(!rc->critical)
1326             error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", chart, rt->name, rt->critical->source);
1327     }
1328
1329     debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1330           (rc->chart)?rc->chart:"NOCHART",
1331           rc->name,
1332           (rc->exec)?rc->exec:"DEFAULT",
1333           (rc->recipient)?rc->recipient:"DEFAULT",
1334           rc->green,
1335           rc->red,
1336           rc->group,
1337           rc->after,
1338           rc->before,
1339           rc->options,
1340           (rc->dimensions)?rc->dimensions:"NONE",
1341           rc->update_every,
1342           (rc->calculation)?rc->calculation->parsed_as:"NONE",
1343           (rc->warning)?rc->warning->parsed_as:"NONE",
1344           (rc->critical)?rc->critical->parsed_as:"NONE",
1345           rc->source,
1346           rc->delay_up_duration,
1347           rc->delay_down_duration,
1348           rc->delay_max_duration,
1349           rc->delay_multiplier
1350     );
1351
1352     rrdcalc_create_part2(host, rc);
1353     return rc;
1354 }
1355
1356 void rrdcalc_free(RRDHOST *host, RRDCALC *rc) {
1357     if(!rc) return;
1358
1359     debug(D_HEALTH, "Health removing alarm '%s.%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1360
1361     // unlink it from RRDSET
1362     if(rc->rrdset) rrdsetcalc_unlink(rc);
1363
1364     // unlink it from RRDHOST
1365     if(unlikely(rc == host->alarms))
1366         host->alarms = rc->next;
1367
1368     else if(likely(host->alarms)) {
1369         RRDCALC *t, *last = host->alarms;
1370         for(t = last->next; t && t != rc; last = t, t = t->next) ;
1371         if(last->next == rc)
1372             last->next = rc->next;
1373         else
1374             error("Cannot unlink alarm '%s.%s' from host '%s': not found", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1375     }
1376     else
1377         error("Cannot unlink unlink '%s.%s' from host '%s': This host does not have any calculations", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1378
1379     expression_free(rc->calculation);
1380     expression_free(rc->warning);
1381     expression_free(rc->critical);
1382
1383     freez(rc->name);
1384     freez(rc->chart);
1385     freez(rc->family);
1386     freez(rc->dimensions);
1387     freez(rc->exec);
1388     freez(rc->recipient);
1389     freez(rc->source);
1390     freez(rc->units);
1391     freez(rc->info);
1392     freez(rc);
1393 }
1394
1395 // ----------------------------------------------------------------------------
1396 // RRDCALCTEMPLATE management
1397
1398 void rrdcalctemplate_link_matching(RRDSET *st) {
1399     RRDCALCTEMPLATE *rt;
1400
1401     for(rt = st->rrdhost->templates; rt ; rt = rt->next) {
1402         if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context)) {
1403             RRDCALC *rc = rrdcalc_create(st->rrdhost, rt, st->id);
1404             if(unlikely(!rc))
1405                 error("Health tried to create alarm from template '%s', but it failed", rt->name);
1406
1407 #ifdef NETDATA_INTERNAL_CHECKS
1408             else if(rc->rrdset != st)
1409                 error("Health alarm '%s.%s' should be linked to chart '%s', but it is not", rc->chart?rc->chart:"NOCHART", rc->name, st->id);
1410 #endif
1411         }
1412     }
1413 }
1414
1415 static inline void rrdcalctemplate_free(RRDHOST *host, RRDCALCTEMPLATE *rt) {
1416     debug(D_HEALTH, "Health removing template '%s' of host '%s'", rt->name, host->hostname);
1417
1418     if(host->templates) {
1419         if(host->templates == rt) {
1420             host->templates = rt->next;
1421         }
1422         else {
1423             RRDCALCTEMPLATE *t, *last = host->templates;
1424             for (t = last->next; t && t != rt; last = t, t = t->next ) ;
1425             if(last && last->next == rt) {
1426                 last->next = rt->next;
1427                 rt->next = NULL;
1428             }
1429             else
1430                 error("Cannot find RRDCALCTEMPLATE '%s' linked in host '%s'", rt->name, host->hostname);
1431         }
1432     }
1433
1434     expression_free(rt->calculation);
1435     expression_free(rt->warning);
1436     expression_free(rt->critical);
1437
1438     freez(rt->name);
1439     freez(rt->exec);
1440     freez(rt->recipient);
1441     freez(rt->context);
1442     freez(rt->source);
1443     freez(rt->units);
1444     freez(rt->info);
1445     freez(rt->dimensions);
1446     freez(rt);
1447 }
1448
1449 // ----------------------------------------------------------------------------
1450 // load health configuration
1451
// presumably the longest configuration line that is read - its use is not
// visible in this part of the file
#define HEALTH_CONF_MAX_LINE    4096

// keywords of the health configuration
#define HEALTH_ALARM_KEY        "alarm"
#define HEALTH_TEMPLATE_KEY     "template"
#define HEALTH_ON_KEY           "on"
#define HEALTH_LOOKUP_KEY       "lookup"
#define HEALTH_CALC_KEY         "calc"
#define HEALTH_EVERY_KEY        "every"
#define HEALTH_GREEN_KEY        "green"
#define HEALTH_RED_KEY          "red"
#define HEALTH_WARN_KEY         "warn"
#define HEALTH_CRIT_KEY         "crit"
#define HEALTH_EXEC_KEY         "exec"
#define HEALTH_RECIPIENT_KEY    "to"
#define HEALTH_UNITS_KEY        "units"
#define HEALTH_INFO_KEY         "info"
#define HEALTH_DELAY_KEY        "delay"
1469
1470 static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
1471     if(!rc->chart) {
1472         error("Health configuration for alarm '%s' does not have a chart", rc->name);
1473         return 0;
1474     }
1475
1476     if(!rc->update_every) {
1477         error("Health configuration for alarm '%s.%s' has no frequency (parameter 'every'). Ignoring it.", rc->chart?rc->chart:"NOCHART", rc->name);
1478         return 0;
1479     }
1480
1481     if(!RRDCALC_HAS_DB_LOOKUP(rc) && !rc->warning && !rc->critical) {
1482         error("Health configuration for alarm '%s.%s' is useless (no calculation, no warning and no critical evaluation)", rc->chart?rc->chart:"NOCHART", rc->name);
1483         return 0;
1484     }
1485
1486     if (rrdcalc_exists(host, rc->chart, rc->name, rc->hash_chart, rc->hash))
1487         return 0;
1488
1489     rc->id = rrdcalc_get_unique_id(&localhost, rc->chart, rc->name, &rc->next_event_id);
1490
1491     debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1492           rc->chart?rc->chart:"NOCHART",
1493           rc->name,
1494           rc->id,
1495           (rc->exec)?rc->exec:"DEFAULT",
1496           (rc->recipient)?rc->recipient:"DEFAULT",
1497           rc->green,
1498           rc->red,
1499           rc->group,
1500           rc->after,
1501           rc->before,
1502           rc->options,
1503           (rc->dimensions)?rc->dimensions:"NONE",
1504           rc->update_every,
1505           (rc->calculation)?rc->calculation->parsed_as:"NONE",
1506           (rc->warning)?rc->warning->parsed_as:"NONE",
1507           (rc->critical)?rc->critical->parsed_as:"NONE",
1508           rc->source,
1509           rc->delay_up_duration,
1510           rc->delay_down_duration,
1511           rc->delay_max_duration,
1512           rc->delay_multiplier
1513     );
1514
1515     rrdcalc_create_part2(host, rc);
1516     return 1;
1517 }
1518
1519 static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCALCTEMPLATE *rt) {
1520     if(unlikely(!rt->context)) {
1521         error("Health configuration for template '%s' does not have a context", rt->name);
1522         return 0;
1523     }
1524
1525     if(unlikely(!rt->update_every)) {
1526         error("Health configuration for template '%s' has no frequency (parameter 'every'). Ignoring it.", rt->name);
1527         return 0;
1528     }
1529
1530     if(unlikely(!RRDCALCTEMPLATE_HAS_CALCULATION(rt) && !rt->warning && !rt->critical)) {
1531         error("Health configuration for template '%s' is useless (no calculation, no warning and no critical evaluation)", rt->name);
1532         return 0;
1533     }
1534
1535     RRDCALCTEMPLATE *t, *last = NULL;
1536     for (t = host->templates; t ; last = t, t = t->next) {
1537         if(unlikely(t->hash_name == rt->hash_name && !strcmp(t->name, rt->name))) {
1538             error("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname);
1539             return 0;
1540         }
1541     }
1542
1543     debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1544           rt->name,
1545           (rt->context)?rt->context:"NONE",
1546           (rt->exec)?rt->exec:"DEFAULT",
1547           (rt->recipient)?rt->recipient:"DEFAULT",
1548           rt->green,
1549           rt->red,
1550           rt->group,
1551           rt->after,
1552           rt->before,
1553           rt->options,
1554           (rt->dimensions)?rt->dimensions:"NONE",
1555           rt->update_every,
1556           (rt->calculation)?rt->calculation->parsed_as:"NONE",
1557           (rt->warning)?rt->warning->parsed_as:"NONE",
1558           (rt->critical)?rt->critical->parsed_as:"NONE",
1559           rt->source,
1560           rt->delay_up_duration,
1561           rt->delay_down_duration,
1562           rt->delay_max_duration,
1563           rt->delay_multiplier
1564     );
1565
1566     if(likely(last)) {
1567         last->next = rt;
1568     }
1569     else {
1570         rt->next = host->templates;
1571         host->templates = rt;
1572     }
1573
1574     return 1;
1575 }
1576
// Parse a duration such as "30", "-5s", "2m", "1.5h", "3d", "1w", "1M" or
// "1Y" into seconds.
//
// string - the text to parse; it has to start with a digit, '+' or '-'
// result - receives the duration in seconds, truncated to int; it is set to 0
//          when parsing fails
//
// The character that follows the number selects the unit:
// Y (365 days), M (30 days), w, d, h, m, s.
// Any other character, or no character at all, means seconds.
//
// Returns 1 on success and 0 when the string does not hold a finite number.
static inline int health_parse_duration(char *string, int *result) {
    // make sure it is a number
    // (the <ctype.h> functions need a value representable as unsigned char)
    if(!*string || !(isdigit((unsigned char)*string) || *string == '+' || *string == '-')) {
        *result = 0;
        return 0;
    }

    // strtold() returns long double
    char *e = NULL;
    long double n = strtold(string, &e);

    // a sign that is not followed by a number converts nothing (e == string),
    // while "+inf" and the like convert to a value that cannot be cast to int
    if(e == string || isnan(n) || isinf(n)) {
        *result = 0;
        return 0;
    }

    if(e && *e) {
        switch (*e) {
            case 'Y':
                *result = (int) (n * 86400 * 365);
                break;
            case 'M':
                *result = (int) (n * 86400 * 30);
                break;
            case 'w':
                *result = (int) (n * 86400 * 7);
                break;
            case 'd':
                *result = (int) (n * 86400);
                break;
            case 'h':
                *result = (int) (n * 3600);
                break;
            case 'm':
                *result = (int) (n * 60);
                break;

            default:
            case 's':
                *result = (int) (n);
                break;
        }
    }
    else
       *result = (int)(n);

    return 1;
}
1618
// Parse the value of a 'delay' configuration line.
//
// 'string' is a whitespace separated list of "keyword value" pairs and is
// modified in place (the whitespace between tokens is overwritten with NULs).
// Recognised keywords (case-insensitive): up, down, multiplier, max.
//
// line, path, file are only used to build the error messages.
//
// On return:
//  - *delay_up_duration / *delay_down_duration are 0 when not (validly) given
//  - *delay_multiplier is 1.0 when not (validly) given
//  - *delay_max_duration, when not (validly) given, is raised (never lowered)
//    to cover both up * multiplier and down * multiplier
//
// Invalid values and unknown keywords are logged and skipped.
// Always returns 1.
static inline int health_parse_delay(
        size_t line, const char *path, const char *file, char *string,
        int *delay_up_duration,
        int *delay_down_duration,
        int *delay_max_duration,
        float *delay_multiplier) {

    char given_up = 0;
    char given_down = 0;
    char given_max = 0;
    char given_multiplier = 0;

    char *s = string;
    while(*s) {
        char *key = s;

        // the <ctype.h> functions need an unsigned char value:
        // passing a negative plain char is undefined behavior
        while(*s && !isspace((unsigned char)*s)) s++;
        while(*s && isspace((unsigned char)*s)) *s++ = '\0';

        if(!*key) break;

        char *value = s;
        while(*s && !isspace((unsigned char)*s)) s++;
        while(*s && isspace((unsigned char)*s)) *s++ = '\0';

        if(!strcasecmp(key, "up")) {
            if (!health_parse_duration(value, delay_up_duration)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_up = 1;
        }
        else if(!strcasecmp(key, "down")) {
            if (!health_parse_duration(value, delay_down_duration)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_down = 1;
        }
        else if(!strcasecmp(key, "multiplier")) {
            // validate before storing, so that an invalid value cannot
            // overwrite a valid multiplier given earlier on the same line
            float multiplier = strtof(value, NULL);
            if(isnan(multiplier) || isinf(multiplier) || islessequal(multiplier, 0)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else {
                *delay_multiplier = multiplier;
                given_multiplier = 1;
            }
        }
        else if(!strcasecmp(key, "max")) {
            if (!health_parse_duration(value, delay_max_duration)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_max = 1;
        }
        else {
            error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
                  line, path, file, key);
        }
    }

    if(!given_up)
        *delay_up_duration = 0;

    if(!given_down)
        *delay_down_duration = 0;

    if(!given_multiplier)
        *delay_multiplier = 1.0;

    if(!given_max) {
        // no explicit max: make sure it is not below the multiplied delays
        if((*delay_max_duration) < (*delay_up_duration) * (*delay_multiplier))
            *delay_max_duration = (int)((*delay_up_duration) * (*delay_multiplier));

        if((*delay_max_duration) < (*delay_down_duration) * (*delay_multiplier))
            *delay_max_duration = (int)((*delay_down_duration) * (*delay_multiplier));
    }

    return 1;
}
1698
1699 static inline int health_parse_db_lookup(
1700         size_t line, const char *path, const char *file, char *string,
1701         int *group_method, int *after, int *before, int *every,
1702         uint32_t *options, char **dimensions
1703 ) {
1704     debug(D_HEALTH, "Health configuration parsing database lookup %zu@%s/%s: %s", line, path, file, string);
1705
1706     if(*dimensions) freez(*dimensions);
1707     *dimensions = NULL;
1708     *after = 0;
1709     *before = 0;
1710     *every = 0;
1711     *options = 0;
1712
1713     char *s = string, *key;
1714
1715     // first is the group method
1716     key = s;
1717     while(*s && !isspace(*s)) s++;
1718     while(*s && isspace(*s)) *s++ = '\0';
1719     if(!*s) {
1720         error("Health configuration invalid chart calculation at line %zu of file '%s/%s': expected group method followed by the 'after' time, but got '%s'",
1721               line, path, file, key);
1722         return 0;
1723     }
1724
1725     if((*group_method = web_client_api_request_v1_data_group(key, -1)) == -1) {
1726         error("Health configuration at line %zu of file '%s/%s': invalid group method '%s'",
1727               line, path, file, key);
1728         return 0;
1729     }
1730
1731     // then is the 'after' time
1732     key = s;
1733     while(*s && !isspace(*s)) s++;
1734     while(*s && isspace(*s)) *s++ = '\0';
1735
1736     if(!health_parse_duration(key, after)) {
1737         error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' after group method",
1738               line, path, file, key);
1739         return 0;
1740     }
1741
1742     // sane defaults
1743     *every = abs(*after);
1744
1745     // now we may have optional parameters
1746     while(*s) {
1747         key = s;
1748         while(*s && !isspace(*s)) s++;
1749         while(*s && isspace(*s)) *s++ = '\0';
1750         if(!*key) break;
1751
1752         if(!strcasecmp(key, "at")) {
1753             char *value = s;
1754             while(*s && !isspace(*s)) s++;
1755             while(*s && isspace(*s)) *s++ = '\0';
1756
1757             if (!health_parse_duration(value, before)) {
1758                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1759                       line, path, file, value, key);
1760             }
1761         }
1762         else if(!strcasecmp(key, HEALTH_EVERY_KEY)) {
1763             char *value = s;
1764             while(*s && !isspace(*s)) s++;
1765             while(*s && isspace(*s)) *s++ = '\0';
1766
1767             if (!health_parse_duration(value, every)) {
1768                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1769                       line, path, file, value, key);
1770             }
1771         }
1772         else if(!strcasecmp(key, "absolute") || !strcasecmp(key, "abs") || !strcasecmp(key, "absolute_sum")) {
1773             *options |= RRDR_OPTION_ABSOLUTE;
1774         }
1775         else if(!strcasecmp(key, "min2max")) {
1776             *options |= RRDR_OPTION_MIN2MAX;
1777         }
1778         else if(!strcasecmp(key, "null2zero")) {
1779             *options |= RRDR_OPTION_NULL2ZERO;
1780         }
1781         else if(!strcasecmp(key, "percentage")) {
1782             *options |= RRDR_OPTION_PERCENTAGE;
1783         }
1784         else if(!strcasecmp(key, "unaligned")) {
1785             *options |= RRDR_OPTION_NOT_ALIGNED;
1786         }
1787         else if(!strcasecmp(key, "of")) {
1788             if(*s && strcasecmp(s, "all"))
1789                *dimensions = strdupz(s);
1790             break;
1791         }
1792         else {
1793             error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
1794                   line, path, file, key);
1795         }
1796     }
1797
1798     return 1;
1799 }
1800
// Overwrite every TAB character of s with a space, in place.
// Returns s, so the call can be chained.
static inline char *tabs2spaces(char *s) {
    for(char *tab = strchr(s, '\t'); tab; tab = strchr(tab + 1, '\t'))
        *tab = ' ';

    return s;
}
1810
// Build the "LINE@PATH/FILENAME" string that identifies where a
// configuration entry was read from, and return the copy made by strdupz().
static inline char *health_source_file(size_t line, const char *path, const char *filename) {
    char source[FILENAME_MAX + 1];

    snprintfz(source, FILENAME_MAX, "%zu@%s/%s", line, path, filename);

    char *copy = strdupz(source);
    return copy;
}
1816
// Overwrite every single and double quote character of s with a space,
// in place.
static inline void strip_quotes(char *s) {
    for(; *s; s++) {
        switch(*s) {
            case '\'':
            case '"':
                *s = ' ';
                break;

            default:
                break;
        }
    }
}
1823
1824 int health_readfile(const char *path, const char *filename) {
1825     debug(D_HEALTH, "Health configuration reading file '%s/%s'", path, filename);
1826
1827     static uint32_t hash_alarm = 0, hash_template = 0, hash_on = 0, hash_calc = 0, hash_green = 0, hash_red = 0, hash_warn = 0, hash_crit = 0, hash_exec = 0, hash_every = 0, hash_lookup = 0, hash_units = 0, hash_info = 0, hash_recipient = 0, hash_delay = 0;
1828     char buffer[HEALTH_CONF_MAX_LINE + 1];
1829
1830     if(unlikely(!hash_alarm)) {
1831         hash_alarm = simple_uhash(HEALTH_ALARM_KEY);
1832         hash_template = simple_uhash(HEALTH_TEMPLATE_KEY);
1833         hash_on = simple_uhash(HEALTH_ON_KEY);
1834         hash_calc = simple_uhash(HEALTH_CALC_KEY);
1835         hash_lookup = simple_uhash(HEALTH_LOOKUP_KEY);
1836         hash_green = simple_uhash(HEALTH_GREEN_KEY);
1837         hash_red = simple_uhash(HEALTH_RED_KEY);
1838         hash_warn = simple_uhash(HEALTH_WARN_KEY);
1839         hash_crit = simple_uhash(HEALTH_CRIT_KEY);
1840         hash_exec = simple_uhash(HEALTH_EXEC_KEY);
1841         hash_every = simple_uhash(HEALTH_EVERY_KEY);
1842         hash_units = simple_hash(HEALTH_UNITS_KEY);
1843         hash_info = simple_hash(HEALTH_INFO_KEY);
1844         hash_recipient = simple_hash(HEALTH_RECIPIENT_KEY);
1845         hash_delay = simple_uhash(HEALTH_DELAY_KEY);
1846     }
1847
1848     snprintfz(buffer, HEALTH_CONF_MAX_LINE, "%s/%s", path, filename);
1849     FILE *fp = fopen(buffer, "r");
1850     if(!fp) {
1851         error("Health configuration cannot read file '%s'.", buffer);
1852         return 0;
1853     }
1854
1855     RRDCALC *rc = NULL;
1856     RRDCALCTEMPLATE *rt = NULL;
1857
1858     size_t line = 0, append = 0;
1859     char *s;
1860     while((s = fgets(&buffer[append], (int)(HEALTH_CONF_MAX_LINE - append), fp)) || append) {
1861         int stop_appending = !s;
1862         line++;
1863         s = trim(buffer);
1864         if(!s) continue;
1865
1866         append = strlen(s);
1867         if(!stop_appending && s[append - 1] == '\\') {
1868             s[append - 1] = ' ';
1869             append = &s[append] - buffer;
1870             if(append < HEALTH_CONF_MAX_LINE)
1871                 continue;
1872             else {
1873                 error("Health configuration has too long muli-line at line %zu of file '%s/%s'.", line, path, filename);
1874             }
1875         }
1876         append = 0;
1877
1878         char *key = s;
1879         while(*s && *s != ':') s++;
1880         if(!*s) {
1881             error("Health configuration has invalid line %zu of file '%s/%s'. It does not contain a ':'. Ignoring it.", line, path, filename);
1882             continue;
1883         }
1884         *s = '\0';
1885         s++;
1886
1887         char *value = s;
1888         key = trim(key);
1889         value = trim(value);
1890
1891         if(!key) {
1892             error("Health configuration has invalid line %zu of file '%s/%s'. Keyword is empty. Ignoring it.", line, path, filename);
1893             continue;
1894         }
1895
1896         if(!value) {
1897             error("Health configuration has invalid line %zu of file '%s/%s'. value is empty. Ignoring it.", line, path, filename);
1898             continue;
1899         }
1900
1901         uint32_t hash = simple_uhash(key);
1902
1903         if(hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) {
1904             if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
1905                 rrdcalc_free(&localhost, rc);
1906
1907             if(rt) {
1908                 if (!rrdcalctemplate_add_template_from_config(&localhost, rt))
1909                     rrdcalctemplate_free(&localhost, rt);
1910                 rt = NULL;
1911             }
1912
1913             rc = callocz(1, sizeof(RRDCALC));
1914             rc->next_event_id = 1;
1915             rc->name = tabs2spaces(strdupz(value));
1916             rc->hash = simple_hash(rc->name);
1917             rc->source = health_source_file(line, path, filename);
1918             rc->green = NAN;
1919             rc->red = NAN;
1920             rc->value = NAN;
1921             rc->old_value = NAN;
1922             rc->delay_multiplier = 1.0;
1923
1924             if(rrdvar_fix_name(rc->name))
1925                 error("Health configuration renamed alarm '%s' to '%s'", value, rc->name);
1926         }
1927         else if(hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY)) {
1928             if(rc) {
1929                 if(!rrdcalc_add_alarm_from_config(&localhost, rc))
1930                     rrdcalc_free(&localhost, rc);
1931                 rc = NULL;
1932             }
1933
1934             if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
1935                 rrdcalctemplate_free(&localhost, rt);
1936
1937             rt = callocz(1, sizeof(RRDCALCTEMPLATE));
1938             rt->name = tabs2spaces(strdupz(value));
1939             rt->hash_name = simple_hash(rt->name);
1940             rt->source = health_source_file(line, path, filename);
1941             rt->green = NAN;
1942             rt->red = NAN;
1943             rt->delay_multiplier = 1.0;
1944
1945             if(rrdvar_fix_name(rt->name))
1946                 error("Health configuration renamed template '%s' to '%s'", value, rt->name);
1947         }
1948         else if(rc) {
1949             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
1950                 if(rc->chart) {
1951                     if(strcmp(rc->chart, value))
1952                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1953                              line, path, filename, rc->name, key, rc->chart, value, value);
1954
1955                     freez(rc->chart);
1956                 }
1957                 rc->chart = tabs2spaces(strdupz(value));
1958                 rc->hash_chart = simple_hash(rc->chart);
1959             }
1960             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
1961                 health_parse_db_lookup(line, path, filename, value, &rc->group, &rc->after, &rc->before,
1962                                        &rc->update_every,
1963                                        &rc->options, &rc->dimensions);
1964             }
1965             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
1966                 if(!health_parse_duration(value, &rc->update_every))
1967                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.",
1968                          line, path, filename, rc->name, key, value);
1969             }
1970             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
1971                 char *e;
1972                 rc->green = strtold(value, &e);
1973                 if(e && *e) {
1974                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1975                          line, path, filename, rc->name, key, e);
1976                 }
1977             }
1978             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
1979                 char *e;
1980                 rc->red = strtold(value, &e);
1981                 if(e && *e) {
1982                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1983                          line, path, filename, rc->name, key, e);
1984                 }
1985             }
1986             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
1987                 const char *failed_at = NULL;
1988                 int error = 0;
1989                 rc->calculation = expression_parse(value, &failed_at, &error);
1990                 if(!rc->calculation) {
1991                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1992                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1993                 }
1994             }
1995             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
1996                 const char *failed_at = NULL;
1997                 int error = 0;
1998                 rc->warning = expression_parse(value, &failed_at, &error);
1999                 if(!rc->warning) {
2000                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
2001                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
2002                 }
2003             }
2004             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
2005                 const char *failed_at = NULL;
2006                 int error = 0;
2007                 rc->critical = expression_parse(value, &failed_at, &error);
2008                 if(!rc->critical) {
2009                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
2010                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
2011                 }
2012             }
2013             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
2014                 if(rc->exec) {
2015                     if(strcmp(rc->exec, value))
2016                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2017                              line, path, filename, rc->name, key, rc->exec, value, value);
2018
2019                     freez(rc->exec);
2020                 }
2021                 rc->exec = tabs2spaces(strdupz(value));
2022             }
2023             else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
2024                 if(rc->recipient) {
2025                     if(strcmp(rc->recipient, value))
2026                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2027                              line, path, filename, rc->name, key, rc->recipient, value, value);
2028
2029                     freez(rc->recipient);
2030                 }
2031                 rc->recipient = tabs2spaces(strdupz(value));
2032             }
2033             else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
2034                 if(rc->units) {
2035                     if(strcmp(rc->units, value))
2036                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2037                              line, path, filename, rc->name, key, rc->units, value, value);
2038
2039                     freez(rc->units);
2040                 }
2041                 rc->units = tabs2spaces(strdupz(value));
2042                 strip_quotes(rc->units);
2043             }
2044             else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
2045                 if(rc->info) {
2046                     if(strcmp(rc->info, value))
2047                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2048                              line, path, filename, rc->name, key, rc->info, value, value);
2049
2050                     freez(rc->info);
2051                 }
2052                 rc->info = tabs2spaces(strdupz(value));
2053                 strip_quotes(rc->info);
2054             }
2055             else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
2056                 health_parse_delay(line, path, filename, value, &rc->delay_up_duration, &rc->delay_down_duration, &rc->delay_max_duration, &rc->delay_multiplier);
2057             }
2058             else {
2059                 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has unknown key '%s'.",
2060                      line, path, filename, rc->name, key);
2061             }
2062         }
2063         else if(rt) {
2064             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
2065                 if(rt->context) {
2066                     if(strcmp(rt->context, value))
2067                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2068                              line, path, filename, rt->name, key, rt->context, value, value);
2069
2070                     freez(rt->context);
2071                 }
2072                 rt->context = tabs2spaces(strdupz(value));
2073                 rt->hash_context = simple_hash(rt->context);
2074             }
2075             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
2076                 health_parse_db_lookup(line, path, filename, value, &rt->group, &rt->after, &rt->before,
2077                                        &rt->update_every,
2078                                        &rt->options, &rt->dimensions);
2079             }
2080             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
2081                 if(!health_parse_duration(value, &rt->update_every))
2082                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' cannot parse duration: '%s'.",
2083                          line, path, filename, rt->name, key, value);
2084             }
2085             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
2086                 char *e;
2087                 rt->green = strtold(value, &e);
2088                 if(e && *e) {
2089                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
2090                          line, path, filename, rt->name, key, e);
2091                 }
2092             }
2093             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
2094                 char *e;
2095                 rt->red = strtold(value, &e);
2096                 if(e && *e) {
2097                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
2098                          line, path, filename, rt->name, key, e);
2099                 }
2100             }
2101             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
2102                 const char *failed_at = NULL;
2103                 int error = 0;
2104                 rt->calculation = expression_parse(value, &failed_at, &error);
2105                 if(!rt->calculation) {
2106                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
2107                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
2108                 }
2109             }
2110             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
2111                 const char *failed_at = NULL;
2112                 int error = 0;
2113                 rt->warning = expression_parse(value, &failed_at, &error);
2114                 if(!rt->warning) {
2115                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
2116                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
2117                 }
2118             }
2119             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
2120                 const char *failed_at = NULL;
2121                 int error = 0;
2122                 rt->critical = expression_parse(value, &failed_at, &error);
2123                 if(!rt->critical) {
2124                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
2125                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
2126                 }
2127             }
2128             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
2129                 if(rt->exec) {
2130                     if(strcmp(rt->exec, value))
2131                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2132                              line, path, filename, rt->name, key, rt->exec, value, value);
2133
2134                     freez(rt->exec);
2135                 }
2136                 rt->exec = tabs2spaces(strdupz(value));
2137             }
2138             else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
2139                 if(rt->recipient) {
2140                     if(strcmp(rt->recipient, value))
2141                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2142                              line, path, filename, rt->name, key, rt->recipient, value, value);
2143
2144                     freez(rt->recipient);
2145                 }
2146                 rt->recipient = tabs2spaces(strdupz(value));
2147             }
2148             else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
2149                 if(rt->units) {
2150                     if(strcmp(rt->units, value))
2151                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2152                              line, path, filename, rt->name, key, rt->units, value, value);
2153
2154                     freez(rt->units);
2155                 }
2156                 rt->units = tabs2spaces(strdupz(value));
2157                 strip_quotes(rt->units);
2158             }
2159             else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
2160                 if(rt->info) {
2161                     if(strcmp(rt->info, value))
2162                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2163                              line, path, filename, rt->name, key, rt->info, value, value);
2164
2165                     freez(rt->info);
2166                 }
2167                 rt->info = tabs2spaces(strdupz(value));
2168                 strip_quotes(rt->info);
2169             }
2170             else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
2171                 health_parse_delay(line, path, filename, value, &rt->delay_up_duration, &rt->delay_down_duration, &rt->delay_max_duration, &rt->delay_multiplier);
2172             }
2173             else {
2174                 error("Health configuration at line %zu of file '%s/%s' for template '%s' has unknown key '%s'.",
2175                       line, path, filename, rt->name, key);
2176             }
2177         }
2178         else {
2179             error("Health configuration at line %zu of file '%s/%s' has unknown key '%s'. Expected either '" HEALTH_ALARM_KEY "' or '" HEALTH_TEMPLATE_KEY "'.",
2180                   line, path, filename, key);
2181         }
2182     }
2183
2184     if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
2185         rrdcalc_free(&localhost, rc);
2186
2187     if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
2188         rrdcalctemplate_free(&localhost, rt);
2189
2190     fclose(fp);
2191     return 1;
2192 }
2193
2194 void health_readdir(const char *path) {
2195     size_t pathlen = strlen(path);
2196
2197     debug(D_HEALTH, "Health configuration reading directory '%s'", path);
2198
2199     DIR *dir = opendir(path);
2200     if (!dir) {
2201         error("Health configuration cannot open directory '%s'.", path);
2202         return;
2203     }
2204
2205     struct dirent *de = NULL;
2206     while ((de = readdir(dir))) {
2207         size_t len = strlen(de->d_name);
2208
2209         if(de->d_type == DT_DIR
2210            && (
2211                    (de->d_name[0] == '.' && de->d_name[1] == '\0')
2212                    || (de->d_name[0] == '.' && de->d_name[1] == '.' && de->d_name[2] == '\0')
2213            )) {
2214             debug(D_HEALTH, "Ignoring directory '%s'", de->d_name);
2215             continue;
2216         }
2217
2218         else if(de->d_type == DT_DIR) {
2219             char *s = mallocz(pathlen + strlen(de->d_name) + 2);
2220             strcpy(s, path);
2221             strcat(s, "/");
2222             strcat(s, de->d_name);
2223             health_readdir(s);
2224             freez(s);
2225             continue;
2226         }
2227
2228         else if((de->d_type == DT_LNK || de->d_type == DT_REG || de->d_type == DT_UNKNOWN) &&
2229                 len > 5 && !strcmp(&de->d_name[len - 5], ".conf")) {
2230             health_readfile(path, de->d_name);
2231         }
2232
2233         else debug(D_HEALTH, "Ignoring file '%s'", de->d_name);
2234     }
2235
2236     closedir(dir);
2237 }
2238
2239 static inline char *health_config_dir(void) {
2240     char buffer[FILENAME_MAX + 1];
2241     snprintfz(buffer, FILENAME_MAX, "%s/health.d", config_get("global", "config directory", CONFIG_DIR));
2242     return config_get("health", "health configuration directory", buffer);
2243 }
2244
2245 void health_init(void) {
2246     debug(D_HEALTH, "Health configuration initializing");
2247
2248     if(!(health_enabled = config_get_boolean("health", "enabled", 1))) {
2249         debug(D_HEALTH, "Health is disabled.");
2250         return;
2251     }
2252
2253     char *pathname = config_get("health", "health db directory", VARLIB_DIR "/health");
2254     if(mkdir(pathname, 0770) == -1 && errno != EEXIST)
2255         fatal("Cannot create directory '%s'.", pathname);
2256
2257     char filename[FILENAME_MAX + 1];
2258     snprintfz(filename, FILENAME_MAX, "%s/health-log.db", pathname);
2259     health.log_filename = config_get("health", "health db file", filename);
2260
2261     health_alarm_log_load(&localhost);
2262     health_alarm_log_open();
2263
2264     char *path = health_config_dir();
2265
2266     {
2267         char buffer[FILENAME_MAX + 1];
2268         snprintfz(buffer, FILENAME_MAX, "%s/alarm-notify.sh", config_get("global", "plugins directory", PLUGINS_DIR));
2269         health.health_default_exec = config_get("health", "script to execute on alarm", buffer);
2270     }
2271
2272     long n = config_get_number("health", "in memory max health log entries", (long)localhost.health_log.max);
2273     if(n < 10) {
2274         error("Health configuration has invalid max log entries %ld. Using default %u", n, localhost.health_log.max);
2275         config_set_number("health", "in memory max health log entries", (long)localhost.health_log.max);
2276     }
2277     else localhost.health_log.max = (unsigned int)n;
2278
2279     rrdhost_rwlock(&localhost);
2280     health_readdir(path);
2281     rrdhost_unlock(&localhost);
2282 }
2283
2284 // ----------------------------------------------------------------------------
2285 // JSON generation
2286
2287 static inline void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) {
2288     if(value && *value)
2289         buffer_sprintf(wb, "%s\"%s\":\"%s\"%s", prefix, label, value, suffix);
2290     else
2291         buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix);
2292 }
2293
2294 static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) {
2295     buffer_sprintf(wb, "\n\t{\n"
2296                            "\t\t\"hostname\": \"%s\",\n"
2297                            "\t\t\"unique_id\": %u,\n"
2298                            "\t\t\"alarm_id\": %u,\n"
2299                            "\t\t\"alarm_event_id\": %u,\n"
2300                            "\t\t\"name\": \"%s\",\n"
2301                            "\t\t\"chart\": \"%s\",\n"
2302                            "\t\t\"family\": \"%s\",\n"
2303                            "\t\t\"processed\": %s,\n"
2304                            "\t\t\"updated\": %s,\n"
2305                            "\t\t\"exec_run\": %lu,\n"
2306                            "\t\t\"exec_failed\": %s,\n"
2307                            "\t\t\"exec\": \"%s\",\n"
2308                            "\t\t\"recipient\": \"%s\",\n"
2309                            "\t\t\"exec_code\": %d,\n"
2310                            "\t\t\"source\": \"%s\",\n"
2311                            "\t\t\"units\": \"%s\",\n"
2312                            "\t\t\"info\": \"%s\",\n"
2313                            "\t\t\"when\": %lu,\n"
2314                            "\t\t\"duration\": %lu,\n"
2315                            "\t\t\"non_clear_duration\": %lu,\n"
2316                            "\t\t\"status\": \"%s\",\n"
2317                            "\t\t\"old_status\": \"%s\",\n"
2318                            "\t\t\"delay\": %d,\n"
2319                            "\t\t\"delay_up_to_timestamp\": %lu,\n"
2320                            "\t\t\"updated_by_id\": %u,\n"
2321                            "\t\t\"updates_id\": %u,\n",
2322                    host->hostname,
2323                    ae->unique_id,
2324                    ae->alarm_id,
2325                    ae->alarm_event_id,
2326                    ae->name,
2327                    ae->chart,
2328                    ae->family,
2329                    (ae->flags & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false",
2330                    (ae->flags & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false",
2331                    (unsigned long)ae->exec_run_timestamp,
2332                    (ae->flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false",
2333                    ae->exec?ae->exec:health.health_default_exec,
2334                    ae->recipient?ae->recipient:health.health_default_recipient,
2335                    ae->exec_code,
2336                    ae->source,
2337                    ae->units?ae->units:"",
2338                    ae->info?ae->info:"",
2339                    (unsigned long)ae->when,
2340                    (unsigned long)ae->duration,
2341                    (unsigned long)ae->non_clear_duration,
2342                    rrdcalc_status2string(ae->new_status),
2343                    rrdcalc_status2string(ae->old_status),
2344                    ae->delay,
2345                    (unsigned long)ae->delay_up_to_timestamp,
2346                    ae->updated_by_id,
2347                    ae->updates_id
2348     );
2349
2350     buffer_strcat(wb, "\t\t\"value\":");
2351     buffer_rrd_value(wb, ae->new_value);
2352     buffer_strcat(wb, ",\n");
2353
2354     buffer_strcat(wb, "\t\t\"old_value\":");
2355     buffer_rrd_value(wb, ae->old_value);
2356     buffer_strcat(wb, "\n");
2357
2358     buffer_strcat(wb, "\t}");
2359 }
2360
2361 void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after) {
2362     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
2363
2364     buffer_strcat(wb, "[");
2365
2366     unsigned int max = host->health_log.max;
2367     unsigned int count = 0;
2368     ALARM_ENTRY *ae;
2369     for(ae = host->health_log.alarms; ae && count < max ; count++, ae = ae->next) {
2370         if(ae->unique_id > after) {
2371             if(likely(count)) buffer_strcat(wb, ",");
2372             health_alarm_entry2json_nolock(wb, ae, host);
2373         }
2374     }
2375
2376     buffer_strcat(wb, "\n]\n");
2377
2378     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2379 }
2380
2381 static inline void health_rrdcalc2json_nolock(BUFFER *wb, RRDCALC *rc) {
2382     buffer_sprintf(wb,
2383            "\t\t\"%s.%s\": {\n"
2384                    "\t\t\t\"id\": %lu,\n"
2385                    "\t\t\t\"name\": \"%s\",\n"
2386                    "\t\t\t\"chart\": \"%s\",\n"
2387                    "\t\t\t\"family\": \"%s\",\n"
2388                    "\t\t\t\"active\": %s,\n"
2389                    "\t\t\t\"exec\": \"%s\",\n"
2390                    "\t\t\t\"recipient\": \"%s\",\n"
2391                    "\t\t\t\"source\": \"%s\",\n"
2392                    "\t\t\t\"units\": \"%s\",\n"
2393                    "\t\t\t\"info\": \"%s\",\n"
2394                                    "\t\t\t\"status\": \"%s\",\n"
2395                    "\t\t\t\"last_status_change\": %lu,\n"
2396                    "\t\t\t\"last_updated\": %lu,\n"
2397                    "\t\t\t\"next_update\": %lu,\n"
2398                    "\t\t\t\"update_every\": %d,\n"
2399                    "\t\t\t\"delay_up_duration\": %d,\n"
2400                    "\t\t\t\"delay_down_duration\": %d,\n"
2401                    "\t\t\t\"delay_max_duration\": %d,\n"
2402                    "\t\t\t\"delay_multiplier\": %f,\n"
2403                    "\t\t\t\"delay\": %d,\n"
2404                    "\t\t\t\"delay_up_to_timestamp\": %lu,\n"
2405             , rc->chart, rc->name
2406             , (unsigned long)rc->id
2407             , rc->name
2408             , rc->chart
2409             , (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:""
2410             , (rc->rrdset)?"true":"false"
2411             , rc->exec?rc->exec:health.health_default_exec
2412             , rc->recipient?rc->recipient:health.health_default_recipient
2413             , rc->source
2414             , rc->units?rc->units:""
2415             , rc->info?rc->info:""
2416             , rrdcalc_status2string(rc->status)
2417             , (unsigned long)rc->last_status_change
2418             , (unsigned long)rc->last_updated
2419             , (unsigned long)rc->next_update
2420             , rc->update_every
2421             , rc->delay_up_duration
2422             , rc->delay_down_duration
2423             , rc->delay_max_duration
2424             , rc->delay_multiplier
2425             , rc->delay_last
2426             , (unsigned long)rc->delay_up_to_timestamp
2427     );
2428
2429     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
2430         if(rc->dimensions && *rc->dimensions)
2431             health_string2json(wb, "\t\t\t", "lookup_dimensions", rc->dimensions, ",\n");
2432
2433         buffer_sprintf(wb,
2434                        "\t\t\t\"db_after\": %lu,\n"
2435                        "\t\t\t\"db_before\": %lu,\n"
2436                        "\t\t\t\"lookup_method\": \"%s\",\n"
2437                        "\t\t\t\"lookup_after\": %d,\n"
2438                        "\t\t\t\"lookup_before\": %d,\n"
2439                        "\t\t\t\"lookup_options\": \"",
2440                        (unsigned long) rc->db_after,
2441                        (unsigned long) rc->db_before,
2442                        group_method2string(rc->group),
2443                        rc->after,
2444                        rc->before
2445         );
2446         buffer_data_options2string(wb, rc->options);
2447         buffer_strcat(wb, "\",\n");
2448     }
2449
2450     if(rc->calculation) {
2451         health_string2json(wb, "\t\t\t", "calc", rc->calculation->source, ",\n");
2452         health_string2json(wb, "\t\t\t", "calc_parsed", rc->calculation->parsed_as, ",\n");
2453     }
2454
2455     if(rc->warning) {
2456         health_string2json(wb, "\t\t\t", "warn", rc->warning->source, ",\n");
2457         health_string2json(wb, "\t\t\t", "warn_parsed", rc->warning->parsed_as, ",\n");
2458     }
2459
2460     if(rc->critical) {
2461         health_string2json(wb, "\t\t\t", "crit", rc->critical->source, ",\n");
2462         health_string2json(wb, "\t\t\t", "crit_parsed", rc->critical->parsed_as, ",\n");
2463     }
2464
2465     buffer_strcat(wb, "\t\t\t\"green\":");
2466     buffer_rrd_value(wb, rc->green);
2467     buffer_strcat(wb, ",\n");
2468
2469     buffer_strcat(wb, "\t\t\t\"red\":");
2470     buffer_rrd_value(wb, rc->red);
2471     buffer_strcat(wb, ",\n");
2472
2473     buffer_strcat(wb, "\t\t\t\"value\":");
2474     buffer_rrd_value(wb, rc->value);
2475     buffer_strcat(wb, "\n");
2476
2477     buffer_strcat(wb, "\t\t}");
2478 }
2479
2480 //void health_rrdcalctemplate2json_nolock(BUFFER *wb, RRDCALCTEMPLATE *rt) {
2481 //
2482 //}
2483
2484 void health_alarms2json(RRDHOST *host, BUFFER *wb, int all) {
2485     int i;
2486
2487     rrdhost_rdlock(&localhost);
2488     buffer_sprintf(wb, "{\n\t\"hostname\": \"%s\","
2489                         "\n\t\"latest_alarm_log_unique_id\": %u,"
2490                         "\n\t\"status\": %s,"
2491                         "\n\t\"now\": %lu,"
2492                         "\n\t\"alarms\": {\n",
2493                         host->hostname,
2494                         (host->health_log.next_log_id > 0)?(host->health_log.next_log_id - 1):0,
2495                         health_enabled?"true":"false",
2496                         (unsigned long)time(NULL));
2497
2498     RRDCALC *rc;
2499     for(i = 0, rc = host->alarms; rc ; rc = rc->next) {
2500         if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
2501             continue;
2502
2503         if(likely(!all && !(rc->status == RRDCALC_STATUS_WARNING || rc->status == RRDCALC_STATUS_CRITICAL)))
2504             continue;
2505
2506         if(likely(i)) buffer_strcat(wb, ",\n");
2507         health_rrdcalc2json_nolock(wb, rc);
2508         i++;
2509     }
2510
2511 //    buffer_strcat(wb, "\n\t},\n\t\"templates\": {");
2512 //    RRDCALCTEMPLATE *rt;
2513 //    for(rt = host->templates; rt ; rt = rt->next)
2514 //        health_rrdcalctemplate2json_nolock(wb, rt);
2515
2516     buffer_strcat(wb, "\n\t}\n}\n");
2517     rrdhost_unlock(&localhost);
2518 }
2519
2520
2521 // ----------------------------------------------------------------------------
2522 // re-load health configuration
2523
2524 static inline void health_free_all_nolock(RRDHOST *host) {
2525     while(host->templates)
2526         rrdcalctemplate_free(host, host->templates);
2527
2528     while(host->alarms)
2529         rrdcalc_free(host, host->alarms);
2530 }
2531
2532 void health_reload(void) {
2533     if(!health_enabled) {
2534         error("Health reload is requested, but health is not enabled.");
2535         return;
2536     }
2537
2538     char *path = health_config_dir();
2539
2540     // free all running alarms
2541     rrdhost_rwlock(&localhost);
2542     health_free_all_nolock(&localhost);
2543     rrdhost_unlock(&localhost);
2544
2545     // invalidate all previous entries in the alarm log
2546     ALARM_ENTRY *t;
2547     for(t = localhost.health_log.alarms ; t ; t = t->next) {
2548         if(t->new_status != RRDCALC_STATUS_REMOVED)
2549             t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
2550     }
2551
2552     // reset all thresholds to all charts
2553     RRDSET *st;
2554     for(st = localhost.rrdset_root; st ; st = st->next) {
2555         st->green = NAN;
2556         st->red = NAN;
2557     }
2558
2559     // load the new alarms
2560     rrdhost_rwlock(&localhost);
2561     health_readdir(path);
2562     rrdhost_unlock(&localhost);
2563
2564     // link the loaded alarms to their charts
2565     for(st = localhost.rrdset_root; st ; st = st->next) {
2566         rrdhost_rwlock(&localhost);
2567
2568         rrdsetcalc_link_matching(st);
2569         rrdcalctemplate_link_matching(st);
2570
2571         rrdhost_unlock(&localhost);
2572     }
2573 }
2574
2575 // ----------------------------------------------------------------------------
2576 // health main thread and friends
2577
2578 static inline int rrdcalc_value2status(calculated_number n) {
2579     if(isnan(n)) return RRDCALC_STATUS_UNDEFINED;
2580     if(n) return RRDCALC_STATUS_RAISED;
2581     return RRDCALC_STATUS_CLEAR;
2582 }
2583
2584 static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
2585     ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
2586
2587     if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
2588         // do not send notifications for internal statuses
2589         goto done;
2590     }
2591
2592     // find the previous notification for the same alarm
2593     // which we have run the exec script
2594     ALARM_ENTRY *t;
2595     for(t = ae->next; t ;t = t->next) {
2596         if(t->alarm_id == ae->alarm_id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
2597             break;
2598     }
2599
2600     if(likely(t)) {
2601         // we have executed this alarm notification in the past
2602         if (t && t->new_status == ae->new_status) {
2603             // don't send the same notification again
2604             debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name,
2605                  rrdcalc_status2string(ae->new_status));
2606             goto done;
2607         }
2608     }
2609     else {
2610         // we have not executed this alarm notification in the past
2611         if(unlikely(ae->old_status == RRDCALC_STATUS_UNINITIALIZED && ae->new_status == RRDCALC_STATUS_CLEAR)) {
2612             debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
2613             goto done;
2614         }
2615     }
2616
2617     char buffer[FILENAME_MAX + 1];
2618     pid_t command_pid;
2619
2620     const char *exec = ae->exec;
2621     if(!exec) exec = health.health_default_exec;
2622
2623     const char *recipient = ae->recipient;
2624     if(!recipient) recipient = health.health_default_recipient;
2625
2626     snprintfz(buffer, FILENAME_MAX, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s'",
2627               exec,
2628               recipient,
2629               host->hostname,
2630               ae->unique_id,
2631               ae->alarm_id,
2632               ae->alarm_event_id,
2633               (unsigned long)ae->when,
2634               ae->name,
2635               ae->chart?ae->chart:"NOCAHRT",
2636               ae->family?ae->family:"NOFAMILY",
2637               rrdcalc_status2string(ae->new_status),
2638               rrdcalc_status2string(ae->old_status),
2639               ae->new_value,
2640               ae->old_value,
2641               ae->source?ae->source:"UNKNOWN",
2642               (uint32_t)ae->duration,
2643               (uint32_t)ae->non_clear_duration,
2644               ae->units?ae->units:"",
2645               ae->info?ae->info:""
2646     );
2647
2648     ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
2649     ae->exec_run_timestamp = time(NULL);
2650
2651     debug(D_HEALTH, "executing command '%s'", buffer);
2652     FILE *fp = mypopen(buffer, &command_pid);
2653     if(!fp) {
2654         error("HEALTH: Cannot popen(\"%s\", \"r\").", buffer);
2655         goto done;
2656     }
2657     debug(D_HEALTH, "HEALTH reading from command");
2658     char *s = fgets(buffer, FILENAME_MAX, fp);
2659     (void)s;
2660     ae->exec_code = mypclose(fp, command_pid);
2661     debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
2662
2663     if(ae->exec_code != 0)
2664         ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
2665
2666 done:
2667     health_alarm_log_save(host, ae);
2668     return;
2669 }
2670
2671 static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
2672     debug(D_HEALTH, "Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
2673          ae->chart?ae->chart:"NOCHART", ae->name,
2674          ae->new_value,
2675          rrdcalc_status2string(ae->old_status),
2676          rrdcalc_status2string(ae->new_status)
2677     );
2678
2679     health_alarm_execute(host, ae);
2680 }
2681
2682 static inline void health_alarm_log_process(RRDHOST *host) {
2683     static uint32_t stop_at_id = 0;
2684     uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
2685     time_t now = time(NULL);
2686
2687     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
2688
2689     ALARM_ENTRY *ae;
2690     for(ae = host->health_log.alarms; ae && ae->unique_id >= stop_at_id ; ae = ae->next) {
2691         if(unlikely(
2692             !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
2693             !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
2694             )) {
2695
2696             if(unlikely(ae->unique_id < first_waiting))
2697                 first_waiting = ae->unique_id;
2698
2699             if(likely(now >= ae->delay_up_to_timestamp))
2700                 health_process_notifications(host, ae);
2701         }
2702     }
2703
2704     // remember this for the next iteration
2705     stop_at_id = first_waiting;
2706
2707     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2708
2709     if(host->health_log.count <= host->health_log.max)
2710         return;
2711
2712     // cleanup excess entries in the log
2713     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
2714
2715     ALARM_ENTRY *last = NULL;
2716     unsigned int count = host->health_log.max * 2 / 3;
2717     for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
2718
2719     if(ae && last && last->next == ae)
2720         last->next = NULL;
2721     else
2722         ae = NULL;
2723
2724     while(ae) {
2725         debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
2726
2727         ALARM_ENTRY *t = ae->next;
2728
2729         freez(ae->name);
2730         freez(ae->chart);
2731         freez(ae->family);
2732         freez(ae->exec);
2733         freez(ae->recipient);
2734         freez(ae->source);
2735         freez(ae->units);
2736         freez(ae->info);
2737         freez(ae);
2738
2739         ae = t;
2740         host->health_log.count--;
2741     }
2742
2743     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2744 }
2745
2746 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
2747     if(unlikely(!rc->rrdset)) {
2748         debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
2749         return 0;
2750     }
2751
2752     if(unlikely(rc->next_update > now)) {
2753         if (unlikely(*next_run > rc->next_update)) {
2754             // update the next_run time of the main loop
2755             // to run this alarm precisely the time required
2756             *next_run = rc->next_update;
2757         }
2758
2759         debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
2760         return 0;
2761     }
2762
2763     if(unlikely(!rc->update_every)) {
2764         debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
2765         return 0;
2766     }
2767
2768     if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
2769         debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
2770         return 0;
2771     }
2772
2773     int update_every = rc->rrdset->update_every;
2774     time_t first = rrdset_first_entry_t(rc->rrdset);
2775     time_t last = rrdset_last_entry_t(rc->rrdset);
2776
2777     if(unlikely(now + update_every < first /* || now - update_every > last */)) {
2778         debug(D_HEALTH
2779               , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
2780               , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first
2781               , (unsigned long) last);
2782         return 0;
2783     }
2784
2785     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
2786         time_t needed = now + rc->before + rc->after;
2787
2788         if(needed + update_every < first || needed - update_every > last) {
2789             debug(D_HEALTH
2790                   , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
2791                   , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first
2792                   , (unsigned long) last);
2793             return 0;
2794         }
2795     }
2796
2797     return 1;
2798 }
2799
2800 void *health_main(void *ptr) {
2801     (void)ptr;
2802
2803     info("HEALTH thread created with task id %d", gettid());
2804
2805     if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
2806         error("Cannot set pthread cancel type to DEFERRED.");
2807
2808     if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
2809         error("Cannot set pthread cancel state to ENABLE.");
2810
2811     int min_run_every = (int)config_get_number("health", "run at least every seconds", 10);
2812     if(min_run_every < 1) min_run_every = 1;
2813
2814     BUFFER *wb = buffer_create(100);
2815
2816     unsigned int loop = 0;
2817     while(health_enabled && !netdata_exit) {
2818         loop++;
2819         debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
2820
2821         int oldstate, runnable = 0;
2822         time_t now = time(NULL);
2823         time_t next_run = now + min_run_every;
2824         RRDCALC *rc;
2825
2826         if(unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
2827             error("Cannot set pthread cancel state to DISABLE.");
2828
2829         rrdhost_rdlock(&localhost);
2830
2831         // the first loop is to lookup values from the db
2832         for(rc = localhost.alarms; rc; rc = rc->next) {
2833             if(unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
2834                 if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))
2835                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE;
2836                 continue;
2837             }
2838
2839             runnable++;
2840             rc->old_value = rc->value;
2841             rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE;
2842
2843             // 1. if there is database lookup, do it
2844             // 2. if there is calculation expression, run it
2845
2846             if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
2847                 /* time_t old_db_timestamp = rc->db_before; */
2848                 int value_is_null = 0;
2849
2850                 int ret = rrd2value(rc->rrdset, wb, &rc->value,
2851                                     rc->dimensions, 1, rc->after, rc->before, rc->group,
2852                                     rc->options, &rc->db_after, &rc->db_before, &value_is_null);
2853
2854                 if (unlikely(ret != 200)) {
2855                     // database lookup failed
2856                     rc->value = NAN;
2857
2858                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
2859
2860                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) {
2861                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
2862                         error("Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
2863                     }
2864                 }
2865                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))
2866                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
2867
2868                 /* - RRDCALC_FLAG_DB_STALE not currently used
2869                 if (unlikely(old_db_timestamp == rc->db_before)) {
2870                     // database is stale
2871
2872                     debug(D_HEALTH, "Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
2873
2874                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
2875                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
2876                         error("Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
2877                     }
2878                 }
2879                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
2880                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
2881                 */
2882
2883                 if (unlikely(value_is_null)) {
2884                     // collected value is null
2885
2886                     rc->value = NAN;
2887
2888                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
2889                           rc->chart?rc->chart:"NOCHART", rc->name);
2890
2891                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) {
2892                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
2893                         error("Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
2894                               rc->chart?rc->chart:"NOCHART", rc->name);
2895                     }
2896                 }
2897                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))
2898                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
2899
2900                 debug(D_HEALTH, "Health alarm '%s.%s': database lookup gave value "
2901                         CALCULATED_NUMBER_FORMAT, rc->chart?rc->chart:"NOCHART", rc->name, rc->value);
2902             }
2903
2904             if(unlikely(rc->calculation)) {
2905                 if (unlikely(!expression_evaluate(rc->calculation))) {
2906                     // calculation failed
2907
2908                     rc->value = NAN;
2909
2910                     debug(D_HEALTH, "Health alarm '%s.%s': expression '%s' failed: %s",
2911                           rc->chart?rc->chart:"NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
2912
2913                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) {
2914                         rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
2915                         error("Health alarm '%s.%s': expression '%s' failed: %s",
2916                               rc->chart?rc->chart:"NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
2917                     }
2918                 }
2919                 else {
2920                     if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))
2921                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
2922
2923                     debug(D_HEALTH, "Health alarm '%s.%s': expression '%s' gave value "
2924                             CALCULATED_NUMBER_FORMAT
2925                             ": %s (source: %s)",
2926                           rc->chart?rc->chart:"NOCHART", rc->name,
2927                           rc->calculation->parsed_as,
2928                           rc->calculation->result,
2929                           buffer_tostring(rc->calculation->error_msg),
2930                           rc->source
2931                     );
2932
2933                     rc->value = rc->calculation->result;
2934                 }
2935             }
2936         }
2937         rrdhost_unlock(&localhost);
2938
2939         if(unlikely(runnable && !netdata_exit)) {
2940             rrdhost_rdlock(&localhost);
2941
2942             for(rc = localhost.alarms; rc; rc = rc->next) {
2943                 if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
2944                     continue;
2945
2946                 int warning_status  = RRDCALC_STATUS_UNDEFINED;
2947                 int critical_status = RRDCALC_STATUS_UNDEFINED;
2948
2949                 if(likely(rc->warning)) {
2950                     if(unlikely(!expression_evaluate(rc->warning))) {
2951                         // calculation failed
2952
2953                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression failed with error: %s",
2954                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
2955
2956                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) {
2957                             rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
2958                             error("Health alarm '%s.%s': warning expression failed with error: %s",
2959                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
2960                         }
2961                     }
2962                     else {
2963                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))
2964                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
2965
2966                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression gave value "
2967                                 CALCULATED_NUMBER_FORMAT
2968                                 ": %s (source: %s)",
2969                               rc->chart?rc->chart:"NOCHART", rc->name,
2970                               rc->warning->result,
2971                               buffer_tostring(rc->warning->error_msg),
2972                               rc->source
2973                         );
2974
2975                         warning_status = rrdcalc_value2status(rc->warning->result);
2976                     }
2977                 }
2978
2979                 if(likely(rc->critical)) {
2980                     if(unlikely(!expression_evaluate(rc->critical))) {
2981                         // calculation failed
2982
2983                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression failed with error: %s",
2984                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
2985
2986                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) {
2987                             rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
2988                             error("Health alarm '%s.%s': critical expression failed with error: %s",
2989                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
2990                         }
2991                     }
2992                     else {
2993                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))
2994                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
2995
2996                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression gave value "
2997                                 CALCULATED_NUMBER_FORMAT
2998                                 ": %s (source: %s)",
2999                               rc->chart?rc->chart:"NOCHART", rc->name,
3000                               rc->critical->result,
3001                               buffer_tostring(rc->critical->error_msg),
3002                               rc->source
3003                         );
3004
3005                         critical_status = rrdcalc_value2status(rc->critical->result);
3006                     }
3007                 }
3008
3009                 int status = RRDCALC_STATUS_UNDEFINED;
3010
3011                 switch(warning_status) {
3012                     case RRDCALC_STATUS_CLEAR:
3013                         status = RRDCALC_STATUS_CLEAR;
3014                         break;
3015
3016                     case RRDCALC_STATUS_RAISED:
3017                         status = RRDCALC_STATUS_WARNING;
3018                         break;
3019
3020                     default:
3021                         break;
3022                 }
3023
3024                 switch(critical_status) {
3025                     case RRDCALC_STATUS_CLEAR:
3026                         if(status == RRDCALC_STATUS_UNDEFINED)
3027                             status = RRDCALC_STATUS_CLEAR;
3028                         break;
3029
3030                     case RRDCALC_STATUS_RAISED:
3031                         status = RRDCALC_STATUS_CRITICAL;
3032                         break;
3033
3034                     default:
3035                         break;
3036                 }
3037
3038                 if(status != rc->status) {
3039                     int delay = 0;
3040
3041                     if(now > rc->delay_up_to_timestamp) {
3042                         rc->delay_up_current = rc->delay_up_duration;
3043                         rc->delay_down_current = rc->delay_down_duration;
3044                         rc->delay_last = 0;
3045                         rc->delay_up_to_timestamp = 0;
3046                     }
3047                     else {
3048                         rc->delay_up_current = (int)(rc->delay_up_current * rc->delay_multiplier);
3049                         if(rc->delay_up_current > rc->delay_max_duration) rc->delay_up_current = rc->delay_max_duration;
3050
3051                         rc->delay_down_current = (int)(rc->delay_down_current * rc->delay_multiplier);
3052                         if(rc->delay_down_current > rc->delay_max_duration) rc->delay_down_current = rc->delay_max_duration;
3053                     }
3054
3055                     if(status > rc->status)
3056                         delay = rc->delay_up_current;
3057                     else
3058                         delay = rc->delay_down_current;
3059
3060                     // COMMENTED: because we do need to send raising alarms
3061                     // if(now + delay < rc->delay_up_to_timestamp)
3062                     //    delay = (int)(rc->delay_up_to_timestamp - now);
3063
3064                     rc->delay_last = delay;
3065                     rc->delay_up_to_timestamp = now + delay;
3066                     health_alarm_log(&localhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, rc->delay_last);
3067                     rc->last_status_change = now;
3068                     rc->status = status;
3069                 }
3070
3071                 rc->last_updated = now;
3072                 rc->next_update = now + rc->update_every;
3073
3074                 if (next_run > rc->next_update)
3075                     next_run = rc->next_update;
3076             }
3077
3078             rrdhost_unlock(&localhost);
3079         }
3080
3081         if (unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
3082             error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
3083
3084         if(unlikely(netdata_exit))
3085             break;
3086
3087         // execute notifications
3088         // and cleanup
3089         health_alarm_log_process(&localhost);
3090
3091         if(unlikely(netdata_exit))
3092             break;
3093         
3094         now = time(NULL);
3095         if(now < next_run) {
3096             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs",
3097                   loop, (int) (next_run - now));
3098             sleep_usec(1000000 * (unsigned long long) (next_run - now));
3099         }
3100         else {
3101             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
3102         }
3103     }
3104
3105     buffer_free(wb);
3106
3107     info("HEALTH thread exiting");
3108     pthread_exit(NULL);
3109     return NULL;
3110 }