]> arthur.barton.de Git - netdata.git/blob - src/health.c
Merge pull request #1368 from rlefevre/time-improvements
[netdata.git] / src / health.c
1 #include "common.h"
2
// NOTE(review): not referenced in the visible part of this file;
// presumably an upper bound for the length of an RRDVAR name - confirm against its users
#define RRDVAR_MAX_LENGTH 1024
4
// settings and runtime state of the health alarm log
struct health_options {
    const char *health_default_exec;        // default alarm executable (initialized below to alarm-notify.sh)
    const char *health_default_recipient;   // default recipient (initialized below to "root")
    const char *log_filename;               // path of the alarm log file
    size_t log_entries_written;             // lines written to / read from the current log; drives the log rotation
    FILE *log_fp;                           // the alarm log opened for appending, NULL while it is closed
};
12
// the single, file-local instance of the health options, with its built-in defaults
static struct health_options health = {
    .health_default_exec = PLUGINS_DIR "/alarm-notify.sh",
    .health_default_recipient = "root",
    .log_filename = VARLIB_DIR "/health/alarm_log.db",
    .log_entries_written = 0,
    .log_fp = NULL
};
20
// NOTE(review): not read in the visible part of this file;
// presumably the global on/off switch of health monitoring (1 = enabled) - confirm
int health_enabled = 1;
22
23 // ----------------------------------------------------------------------------
24 // health alarm log load/save
25 // no need for locking - only one thread is reading / writing the alarms log
26
27 static inline int health_alarm_log_open(void) {
28     if(health.log_fp)
29         fclose(health.log_fp);
30
31     health.log_fp = fopen(health.log_filename, "a");
32
33     if(health.log_fp) {
34         if (setvbuf(health.log_fp, NULL, _IOLBF, 0) != 0)
35             error("Health: cannot set line buffering on health log file.");
36         return 0;
37     }
38
39     error("Health: cannot open health log file '%s'. Health data will be lost in case of netdata or server crash.", health.log_filename);
40     return -1;
41 }
42
43 static inline void health_alarm_log_close(void) {
44     if(health.log_fp) {
45         fclose(health.log_fp);
46         health.log_fp = NULL;
47     }
48 }
49
50 static inline void health_log_rotate(void) {
51     static size_t rotate_every = 0;
52
53     if(unlikely(rotate_every == 0)) {
54         rotate_every = (size_t)config_get_number("health", "rotate log every lines", 2000);
55         if(rotate_every < 100) rotate_every = 100;
56     }
57
58     if(unlikely(health.log_entries_written > rotate_every)) {
59         health_alarm_log_close();
60
61         char old_filename[FILENAME_MAX + 1];
62         snprintfz(old_filename, FILENAME_MAX, "%s.old", health.log_filename);
63
64         if(unlink(old_filename) == -1 && errno != ENOENT)
65             error("Health: cannot remove old alarms log file '%s'", old_filename);
66
67         if(link(health.log_filename, old_filename) == -1 && errno != ENOENT)
68             error("Health: cannot move file '%s' to '%s'.", health.log_filename, old_filename);
69
70         if(unlink(health.log_filename) == -1 && errno != ENOENT)
71             error("Health: cannot remove old alarms log file '%s'", health.log_filename);
72
73         // open it with truncate
74         health.log_fp = fopen(health.log_filename, "w");
75
76         if(health.log_fp)
77             fclose(health.log_fp);
78         else
79             error("Health: cannot truncate health log '%s'", health.log_filename);
80
81         health.log_fp = NULL;
82
83         health.log_entries_written = 0;
84         health_alarm_log_open();
85     }
86 }
87
88 static inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
89     health_log_rotate();
90
91     if(likely(health.log_fp)) {
92         if(unlikely(fprintf(health.log_fp
93                 , "%c\t%s"
94                   "\t%08x\t%08x\t%08x\t%08x\t%08x"
95                   "\t%08x\t%08x\t%08x"
96                   "\t%08x\t%08x\t%08x"
97                   "\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s"
98                   "\t%d\t%d\t%d\t%d"
99                   "\t%Lf\t%Lf"
100                   "\n"
101                 , (ae->flags & HEALTH_ENTRY_FLAG_SAVED)?'U':'A'
102                 , host->hostname
103
104                 , ae->unique_id
105                 , ae->alarm_id
106                 , ae->alarm_event_id
107                 , ae->updated_by_id
108                 , ae->updates_id
109
110                 , (uint32_t)ae->when
111                 , (uint32_t)ae->duration
112                 , (uint32_t)ae->non_clear_duration
113                 , (uint32_t)ae->flags
114                 , (uint32_t)ae->exec_run_timestamp
115                 , (uint32_t)ae->delay_up_to_timestamp
116
117                 , (ae->name)?ae->name:""
118                 , (ae->chart)?ae->chart:""
119                 , (ae->family)?ae->family:""
120                 , (ae->exec)?ae->exec:""
121                 , (ae->recipient)?ae->recipient:""
122                 , (ae->source)?ae->source:""
123                 , (ae->units)?ae->units:""
124                 , (ae->info)?ae->info:""
125
126                 , ae->exec_code
127                 , ae->new_status
128                 , ae->old_status
129                 , ae->delay
130
131                 , (long double)ae->new_value
132                 , (long double)ae->old_value
133         ) < 0))
134             error("Health: failed to save alarm log entry. Health data may be lost in case of abnormal restart.");
135         else {
136             ae->flags |= HEALTH_ENTRY_FLAG_SAVED;
137             health.log_entries_written++;
138         }
139     }
140 }
141
142 static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filename) {
143     static uint32_t max_unique_id = 0, max_alarm_id = 0;
144
145     errno = 0;
146
147     char *s, *buf = mallocz(65536 + 1);
148     size_t line = 0, len = 0;
149     ssize_t loaded = 0, updated = 0, errored = 0, duplicate = 0;
150
151     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
152
153     while((s = fgets_trim_len(buf, 65536, fp, &len))) {
154         health.log_entries_written++;
155         line++;
156
157         int max_entries = 30, entries = 0;
158         char *pointers[max_entries];
159
160         pointers[entries++] = s++;
161         while(*s) {
162             if(unlikely(*s == '\t')) {
163                 *s = '\0';
164                 pointers[entries++] = ++s;
165                 if(entries >= max_entries) {
166                     error("Health: line %zu of file '%s' has more than %d entries. Ignoring excessive entries.", line, filename, max_entries);
167                     break;
168                 }
169             }
170             else s++;
171         }
172
173         if(likely(*pointers[0] == 'U' || *pointers[0] == 'A')) {
174             ALARM_ENTRY *ae = NULL;
175
176             if(entries < 26) {
177                 error("Health: line %zu of file '%s' should have at least 26 entries, but it has %d. Ignoring it.", line, filename, entries);
178                 errored++;
179                 continue;
180             }
181
182             // check that we have valid ids
183             uint32_t unique_id = (uint32_t)strtoul(pointers[2], NULL, 16);
184             if(!unique_id) {
185                 error("Health: line %zu of file '%s' states alarm entry with invalid unique id %u (%s). Ignoring it.", line, filename, unique_id, pointers[2]);
186                 errored++;
187                 continue;
188             }
189
190             uint32_t alarm_id = (uint32_t)strtoul(pointers[3], NULL, 16);
191             if(!alarm_id) {
192                 error("Health: line %zu of file '%s' states alarm entry for invalid alarm id %u (%s). Ignoring it.", line, filename, alarm_id, pointers[3]);
193                 errored++;
194                 continue;
195             }
196
197             if(unlikely(*pointers[0] == 'A')) {
198                 // make sure it is properly numbered
199                 if(unlikely(host->health_log.alarms && unique_id < host->health_log.alarms->unique_id)) {
200                     error("Health: line %zu of file '%s' has alarm log entry with %u in wrong order. Ignoring it.", line, filename, unique_id);
201                     errored++;
202                     continue;
203                 }
204
205                 ae = callocz(1, sizeof(ALARM_ENTRY));
206             }
207             else if(unlikely(*pointers[0] == 'U')) {
208                 // find the original
209                 for(ae = host->health_log.alarms; ae; ae = ae->next) {
210                     if(unlikely(unique_id == ae->unique_id)) {
211                         if(unlikely(*pointers[0] == 'A')) {
212                             error("Health: line %zu of file '%s' adds duplicate alarm log entry with unique id %u. Using the later."
213                                   , line, filename, unique_id);
214                             *pointers[0] = 'U';
215                             duplicate++;
216                         }
217                         break;
218                     }
219                     else if(unlikely(unique_id > ae->unique_id)) {
220                         // no need to continue
221                         // the linked list is sorted
222                         ae = NULL;
223                         break;
224                     }
225                 }
226
227                 // if not found, skip this line
228                 if(!ae) {
229                     // error("Health: line %zu of file '%s' updates alarm log entry with unique id %u, but it is not found.", line, filename, unique_id);
230                     continue;
231                 }
232             }
233
234             // check for a possible host missmatch
235             //if(strcmp(pointers[1], host->hostname))
236             //    error("Health: line %zu of file '%s' provides an alarm for host '%s' but this is named '%s'.", line, filename, pointers[1], host->hostname);
237
238             ae->unique_id               = unique_id;
239             ae->alarm_id                = alarm_id;
240             ae->alarm_event_id          = (uint32_t)strtoul(pointers[4], NULL, 16);
241             ae->updated_by_id           = (uint32_t)strtoul(pointers[5], NULL, 16);
242             ae->updates_id              = (uint32_t)strtoul(pointers[6], NULL, 16);
243
244             ae->when                    = (uint32_t)strtoul(pointers[7], NULL, 16);
245             ae->duration                = (uint32_t)strtoul(pointers[8], NULL, 16);
246             ae->non_clear_duration      = (uint32_t)strtoul(pointers[9], NULL, 16);
247
248             ae->flags                   = (uint32_t)strtoul(pointers[10], NULL, 16);
249             ae->flags |= HEALTH_ENTRY_FLAG_SAVED;
250
251             ae->exec_run_timestamp      = (uint32_t)strtoul(pointers[11], NULL, 16);
252             ae->delay_up_to_timestamp   = (uint32_t)strtoul(pointers[12], NULL, 16);
253
254             freez(ae->name);
255             ae->name = strdupz(pointers[13]);
256             ae->hash_name = simple_hash(ae->name);
257
258             freez(ae->chart);
259             ae->chart = strdupz(pointers[14]);
260             ae->hash_chart = simple_hash(ae->chart);
261
262             freez(ae->family);
263             ae->family = strdupz(pointers[15]);
264
265             freez(ae->exec);
266             ae->exec = strdupz(pointers[16]);
267             if(!*ae->exec) { freez(ae->exec); ae->exec = NULL; }
268
269             freez(ae->recipient);
270             ae->recipient = strdupz(pointers[17]);
271             if(!*ae->recipient) { freez(ae->recipient); ae->recipient = NULL; }
272
273             freez(ae->source);
274             ae->source = strdupz(pointers[18]);
275             if(!*ae->source) { freez(ae->source); ae->source = NULL; }
276
277             freez(ae->units);
278             ae->units = strdupz(pointers[19]);
279             if(!*ae->units) { freez(ae->units); ae->units = NULL; }
280
281             freez(ae->info);
282             ae->info = strdupz(pointers[20]);
283             if(!*ae->info) { freez(ae->info); ae->info = NULL; }
284
285             ae->exec_code   = str2i(pointers[21]);
286             ae->new_status  = str2i(pointers[22]);
287             ae->old_status  = str2i(pointers[23]);
288             ae->delay       = str2i(pointers[24]);
289
290             ae->new_value   = str2l(pointers[25]);
291             ae->old_value   = str2l(pointers[26]);
292
293             static char value_string[100 + 1];
294             freez(ae->old_value_string);
295             freez(ae->new_value_string);
296             ae->old_value_string = strdupz(format_value_and_unit(value_string, 100, ae->old_value, ae->units, -1));
297             ae->new_value_string = strdupz(format_value_and_unit(value_string, 100, ae->new_value, ae->units, -1));
298
299             // add it to host if not already there
300             if(unlikely(*pointers[0] == 'A')) {
301                 ae->next = host->health_log.alarms;
302                 host->health_log.alarms = ae;
303                 loaded++;
304             }
305             else updated++;
306
307             if(unlikely(ae->unique_id > max_unique_id))
308                 max_unique_id = ae->unique_id;
309
310             if(unlikely(ae->alarm_id >= max_alarm_id))
311                 max_alarm_id = ae->alarm_id;
312         }
313         else {
314             error("Health: line %zu of file '%s' is invalid (unrecognized entry type '%s').", line, filename, pointers[0]);
315             errored++;
316         }
317     }
318
319     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
320
321     freez(buf);
322
323     if(!max_unique_id) max_unique_id = (uint32_t)now_realtime_sec();
324     if(!max_alarm_id)  max_alarm_id  = (uint32_t)now_realtime_sec();
325
326     host->health_log.next_log_id = max_unique_id + 1;
327     host->health_log.next_alarm_id = max_alarm_id + 1;
328
329     debug(D_HEALTH, "Health: loaded file '%s' with %zd new alarm entries, updated %zd alarms, errors %zd entries, duplicate %zd", filename, loaded, updated, errored, duplicate);
330     return loaded;
331 }
332
333 static inline void health_alarm_log_load(RRDHOST *host) {
334     health_alarm_log_close();
335
336     char filename[FILENAME_MAX + 1];
337     snprintfz(filename, FILENAME_MAX, "%s.old", health.log_filename);
338     FILE *fp = fopen(filename, "r");
339     if(!fp)
340         error("Health: cannot open health file: %s", filename);
341     else {
342         health_alarm_log_read(host, fp, filename);
343         fclose(fp);
344     }
345
346     health.log_entries_written = 0;
347     fp = fopen(health.log_filename, "r");
348     if(!fp)
349         error("Health: cannot open health file: %s", health.log_filename);
350     else {
351         health_alarm_log_read(host, fp, health.log_filename);
352         fclose(fp);
353     }
354
355     health_alarm_log_open();
356 }
357
358
359 // ----------------------------------------------------------------------------
360 // health alarm log management
361
362 static inline void health_alarm_log(
363         RRDHOST *host,
364         uint32_t alarm_id,
365         uint32_t alarm_event_id,
366         time_t when,
367         const char *name,
368         const char *chart,
369         const char *family,
370         const char *exec,
371         const char *recipient,
372         time_t duration,
373         calculated_number old_value,
374         calculated_number new_value,
375         int old_status,
376         int new_status,
377         const char *source,
378         const char *units,
379         const char *info,
380         int delay,
381         uint32_t flags
382 ) {
383     debug(D_HEALTH, "Health adding alarm log entry with id: %u", host->health_log.next_log_id);
384
385     ALARM_ENTRY *ae = callocz(1, sizeof(ALARM_ENTRY));
386     ae->name = strdupz(name);
387     ae->hash_name = simple_hash(ae->name);
388
389     if(chart) {
390         ae->chart = strdupz(chart);
391         ae->hash_chart = simple_hash(ae->chart);
392     }
393
394     if(family)
395         ae->family = strdupz(family);
396
397     if(exec) ae->exec = strdupz(exec);
398     if(recipient) ae->recipient = strdupz(recipient);
399     if(source) ae->source = strdupz(source);
400     if(units) ae->units = strdupz(units);
401     if(info) ae->info = strdupz(info);
402
403     ae->unique_id = host->health_log.next_log_id++;
404     ae->alarm_id = alarm_id;
405     ae->alarm_event_id = alarm_event_id;
406     ae->when = when;
407     ae->old_value = old_value;
408     ae->new_value = new_value;
409
410     static char value_string[100 + 1];
411     ae->old_value_string = strdupz(format_value_and_unit(value_string, 100, ae->old_value, ae->units, -1));
412     ae->new_value_string = strdupz(format_value_and_unit(value_string, 100, ae->new_value, ae->units, -1));
413
414     ae->old_status = old_status;
415     ae->new_status = new_status;
416     ae->duration = duration;
417     ae->delay = delay;
418     ae->delay_up_to_timestamp = when + delay;
419
420     ae->flags |= flags;
421
422     if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL)
423         ae->non_clear_duration += ae->duration;
424
425     // link it
426     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
427     ae->next = host->health_log.alarms;
428     host->health_log.alarms = ae;
429     host->health_log.count++;
430     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
431
432     // match previous alarms
433     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
434     ALARM_ENTRY *t;
435     for(t = host->health_log.alarms ; t ; t = t->next) {
436         if(t != ae && t->alarm_id == ae->alarm_id) {
437             if(!(t->flags & HEALTH_ENTRY_FLAG_UPDATED) && !t->updated_by_id) {
438                 t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
439                 t->updated_by_id = ae->unique_id;
440                 ae->updates_id = t->unique_id;
441
442                 if((t->new_status == RRDCALC_STATUS_WARNING || t->new_status == RRDCALC_STATUS_CRITICAL) &&
443                    (t->old_status == RRDCALC_STATUS_WARNING || t->old_status == RRDCALC_STATUS_CRITICAL))
444                     ae->non_clear_duration += t->non_clear_duration;
445
446                 health_alarm_log_save(host, t);
447             }
448
449             // no need to continue
450             break;
451         }
452     }
453     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
454
455     health_alarm_log_save(host, ae);
456 }
457
458 // ----------------------------------------------------------------------------
459 // RRDVAR management
460
// Sanitizes a variable name in place: every character that is not
// alphanumeric, '.' or '_' is replaced with '_'.
// Returns the number of characters replaced.
static inline int rrdvar_fix_name(char *variable) {
    int fixed = 0;

    for( ; *variable ; variable++) {
        // <ctype.h> functions are undefined for negative values other than EOF,
        // so bytes above 0x7f must be passed as unsigned char
        unsigned char c = (unsigned char)*variable;

        if(!isalnum(c) && c != '.' && c != '_') {
            *variable = '_';
            fixed++;
        }
    }

    return fixed;
}
474
475 int rrdvar_compare(void* a, void* b) {
476     if(((RRDVAR *)a)->hash < ((RRDVAR *)b)->hash) return -1;
477     else if(((RRDVAR *)a)->hash > ((RRDVAR *)b)->hash) return 1;
478     else return strcmp(((RRDVAR *)a)->name, ((RRDVAR *)b)->name);
479 }
480
481 static inline RRDVAR *rrdvar_index_add(avl_tree_lock *tree, RRDVAR *rv) {
482     RRDVAR *ret = (RRDVAR *)avl_insert_lock(tree, (avl *)(rv));
483     if(ret != rv)
484         debug(D_VARIABLES, "Request to insert RRDVAR '%s' into index failed. Already exists.", rv->name);
485
486     return ret;
487 }
488
489 static inline RRDVAR *rrdvar_index_del(avl_tree_lock *tree, RRDVAR *rv) {
490     RRDVAR *ret = (RRDVAR *)avl_remove_lock(tree, (avl *)(rv));
491     if(!ret)
492         error("Request to remove RRDVAR '%s' from index failed. Not Found.", rv->name);
493
494     return ret;
495 }
496
497 static inline RRDVAR *rrdvar_index_find(avl_tree_lock *tree, const char *name, uint32_t hash) {
498     RRDVAR tmp;
499     tmp.name = (char *)name;
500     tmp.hash = (hash)?hash:simple_hash(tmp.name);
501
502     return (RRDVAR *)avl_search_lock(tree, (avl *)&tmp);
503 }
504
505 static inline void rrdvar_free(RRDHOST *host, avl_tree_lock *tree, RRDVAR *rv) {
506     (void)host;
507
508     if(!rv) return;
509
510     if(tree) {
511         debug(D_VARIABLES, "Deleting variable '%s'", rv->name);
512         if(unlikely(!rrdvar_index_del(tree, rv)))
513             error("Attempted to delete variable '%s' from host '%s', but it is not found.", rv->name, host->hostname);
514     }
515
516     freez(rv->name);
517     freez(rv);
518 }
519
520 static inline RRDVAR *rrdvar_create_and_index(const char *scope, avl_tree_lock *tree, const char *name, int type, void *value) {
521     char *variable = strdupz(name);
522     rrdvar_fix_name(variable);
523     uint32_t hash = simple_hash(variable);
524
525     RRDVAR *rv = rrdvar_index_find(tree, variable, hash);
526     if(unlikely(!rv)) {
527         debug(D_VARIABLES, "Variable '%s' not found in scope '%s'. Creating a new one.", variable, scope);
528
529         rv = callocz(1, sizeof(RRDVAR));
530         rv->name = variable;
531         rv->hash = hash;
532         rv->type = type;
533         rv->value = value;
534
535         RRDVAR *ret = rrdvar_index_add(tree, rv);
536         if(unlikely(ret != rv)) {
537             debug(D_VARIABLES, "Variable '%s' in scope '%s' already exists", variable, scope);
538             rrdvar_free(NULL, NULL, rv);
539             rv = NULL;
540         }
541         else
542             debug(D_VARIABLES, "Variable '%s' created in scope '%s'", variable, scope);
543     }
544     else {
545         debug(D_VARIABLES, "Variable '%s' is already found in scope '%s'.", variable, scope);
546
547         // already exists
548         freez(variable);
549
550         // this is important
551         // it must return NULL - not the existing variable - or double-free will happen
552         rv = NULL;
553     }
554
555     return rv;
556 }
557
558 // ----------------------------------------------------------------------------
559 // CUSTOM VARIABLES
560
561 RRDVAR *rrdvar_custom_host_variable_create(RRDHOST *host, const char *name) {
562     calculated_number *v = callocz(1, sizeof(calculated_number));
563     *v = NAN;
564     RRDVAR *rv = rrdvar_create_and_index("host", &host->variables_root_index, name, RRDVAR_TYPE_CALCULATED_ALLOCATED, v);
565     if(unlikely(!rv)) {
566         free(v);
567         error("Requested variable '%s' already exists - possibly 2 plugins will be updating it at the same time", name);
568
569         char *variable = strdupz(name);
570         rrdvar_fix_name(variable);
571         uint32_t hash = simple_hash(variable);
572
573         rv = rrdvar_index_find(&host->variables_root_index, variable, hash);
574     }
575
576     return rv;
577 }
578
579 void rrdvar_custom_host_variable_destroy(RRDHOST *host, const char *name) {
580     char *variable = strdupz(name);
581     rrdvar_fix_name(variable);
582     uint32_t hash = simple_hash(variable);
583
584     RRDVAR *rv = rrdvar_index_find(&host->variables_root_index, variable, hash);
585     freez(variable);
586
587     if(!rv) {
588         error("Attempted to remove variable '%s' from host '%s', but it does not exist.", name, host->hostname);
589         return;
590     }
591
592     if(rv->type != RRDVAR_TYPE_CALCULATED_ALLOCATED) {
593         error("Attempted to remove variable '%s' from host '%s', but it does not a custom allocated variable.", name, host->hostname);
594         return;
595     }
596
597     if(!rrdvar_index_del(&host->variables_root_index, rv)) {
598         error("Attempted to remove variable '%s' from host '%s', but it cannot be found.", name, host->hostname);
599         return;
600     }
601
602     freez(rv->name);
603     freez(rv->value);
604     freez(rv);
605 }
606
607 void rrdvar_custom_host_variable_set(RRDVAR *rv, calculated_number value) {
608     if(rv->type != RRDVAR_TYPE_CALCULATED_ALLOCATED)
609         error("requested to set variable '%s' to value " CALCULATED_NUMBER_FORMAT " but the variable is not a custom one.", rv->name, value);
610     else {
611         calculated_number *v = rv->value;
612         *v = value;
613     }
614 }
615
616 // ----------------------------------------------------------------------------
617 // RRDVAR lookup
618
619 static calculated_number rrdvar2number(RRDVAR *rv) {
620     switch(rv->type) {
621         case RRDVAR_TYPE_CALCULATED_ALLOCATED:
622         case RRDVAR_TYPE_CALCULATED: {
623             calculated_number *n = (calculated_number *)rv->value;
624             return *n;
625         }
626
627         case RRDVAR_TYPE_TIME_T: {
628             time_t *n = (time_t *)rv->value;
629             return *n;
630         }
631
632         case RRDVAR_TYPE_COLLECTED: {
633             collected_number *n = (collected_number *)rv->value;
634             return *n;
635         }
636
637         case RRDVAR_TYPE_TOTAL: {
638             total_number *n = (total_number *)rv->value;
639             return *n;
640         }
641
642         case RRDVAR_TYPE_INT: {
643             int *n = (int *)rv->value;
644             return *n;
645         }
646
647         default:
648             error("I don't know how to convert RRDVAR type %d to calculated_number", rv->type);
649             return NAN;
650     }
651 }
652
653 int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, calculated_number *result) {
654     RRDSET *st = rc->rrdset;
655     RRDVAR *rv;
656
657     if(!st) return 0;
658
659     rv = rrdvar_index_find(&st->variables_root_index, variable, hash);
660     if(rv) {
661         *result = rrdvar2number(rv);
662         return 1;
663     }
664
665     rv = rrdvar_index_find(&st->rrdfamily->variables_root_index, variable, hash);
666     if(rv) {
667         *result = rrdvar2number(rv);
668         return 1;
669     }
670
671     rv = rrdvar_index_find(&st->rrdhost->variables_root_index, variable, hash);
672     if(rv) {
673         *result = rrdvar2number(rv);
674         return 1;
675     }
676
677     return 0;
678 }
679
680 // ----------------------------------------------------------------------------
681 // RRDVAR to JSON
682
// state passed to single_variable2json() while traversing an index of variables
struct variable2json_helper {
    BUFFER *buf;        // the buffer the JSON output is appended to
    size_t counter;     // variables printed so far in the current object; non-zero means a comma is needed first
};
687
688 static int single_variable2json(void *entry, void *data) {
689     struct variable2json_helper *helper = (struct variable2json_helper *)data;
690     RRDVAR *rv = (RRDVAR *)entry;
691     calculated_number value = rrdvar2number(rv);
692
693     if(unlikely(isnan(value) || isinf(value)))
694         buffer_sprintf(helper->buf, "%s\n\t\t\"%s\": null", helper->counter?",":"", rv->name);
695     else
696         buffer_sprintf(helper->buf, "%s\n\t\t\"%s\": %0.5Lf", helper->counter?",":"", rv->name, (long double)value);
697
698     helper->counter++;
699
700     return 0;
701 }
702
703 void health_api_v1_chart_variables2json(RRDSET *st, BUFFER *buf) {
704     struct variable2json_helper helper = {
705             .buf = buf,
706             .counter = 0
707     };
708
709     buffer_sprintf(buf, "{\n\t\"chart\": \"%s\",\n\t\"chart_name\": \"%s\",\n\t\"chart_context\": \"%s\",\n\t\"chart_variables\": {", st->id, st->name, st->context);
710     avl_traverse_lock(&st->variables_root_index, single_variable2json, (void *)&helper);
711     buffer_sprintf(buf, "\n\t},\n\t\"family\": \"%s\",\n\t\"family_variables\": {", st->family);
712     helper.counter = 0;
713     avl_traverse_lock(&st->rrdfamily->variables_root_index, single_variable2json, (void *)&helper);
714     buffer_sprintf(buf, "\n\t},\n\t\"host\": \"%s\",\n\t\"host_variables\": {", st->rrdhost->hostname);
715     helper.counter = 0;
716     avl_traverse_lock(&st->rrdhost->variables_root_index, single_variable2json, (void *)&helper);
717     buffer_strcat(buf, "\n\t}\n}\n");
718 }
719
720
721 // ----------------------------------------------------------------------------
722 // RRDDIMVAR management
723 // DIMENSION VARIABLES
724
// maximum length of the keys generated for dimension variables
// (size limit of the buffer they are composed in, at rrddimvar_create_variables())
#define RRDDIMVAR_ID_MAX 1024
726
727 static inline void rrddimvar_free_variables(RRDDIMVAR *rs) {
728     RRDDIM *rd = rs->rrddim;
729     RRDSET *st = rd->rrdset;
730
731     // CHART VARIABLES FOR THIS DIMENSION
732
733     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->var_local_id);
734     rs->var_local_id = NULL;
735
736     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->var_local_name);
737     rs->var_local_name = NULL;
738
739     // FAMILY VARIABLES FOR THIS DIMENSION
740
741     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_id);
742     rs->var_family_id = NULL;
743
744     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_name);
745     rs->var_family_name = NULL;
746
747     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_contextid);
748     rs->var_family_contextid = NULL;
749
750     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_contextname);
751     rs->var_family_contextname = NULL;
752
753     // HOST VARIABLES FOR THIS DIMENSION
754
755     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartidid);
756     rs->var_host_chartidid = NULL;
757
758     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartidname);
759     rs->var_host_chartidname = NULL;
760
761     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartnameid);
762     rs->var_host_chartnameid = NULL;
763
764     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartnamename);
765     rs->var_host_chartnamename = NULL;
766
767     // KEYS
768
769     freez(rs->key_id);
770     rs->key_id = NULL;
771
772     freez(rs->key_name);
773     rs->key_name = NULL;
774
775     freez(rs->key_fullidid);
776     rs->key_fullidid = NULL;
777
778     freez(rs->key_fullidname);
779     rs->key_fullidname = NULL;
780
781     freez(rs->key_contextid);
782     rs->key_contextid = NULL;
783
784     freez(rs->key_contextname);
785     rs->key_contextname = NULL;
786
787     freez(rs->key_fullnameid);
788     rs->key_fullnameid = NULL;
789
790     freez(rs->key_fullnamename);
791     rs->key_fullnamename = NULL;
792 }
793
794 static inline void rrddimvar_create_variables(RRDDIMVAR *rs) {
795     rrddimvar_free_variables(rs);
796
797     RRDDIM *rd = rs->rrddim;
798     RRDSET *st = rd->rrdset;
799
800     char buffer[RRDDIMVAR_ID_MAX + 1];
801
802     // KEYS
803
804     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->id, rs->suffix);
805     rs->key_id = strdupz(buffer);
806
807     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
808     rs->key_name = strdupz(buffer);
809
810     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->key_id);
811     rs->key_fullidid = strdupz(buffer);
812
813     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->key_name);
814     rs->key_fullidname = strdupz(buffer);
815
816     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->context, rs->key_id);
817     rs->key_contextid = strdupz(buffer);
818
819     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->context, rs->key_name);
820     rs->key_contextname = strdupz(buffer);
821
822     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->key_id);
823     rs->key_fullnameid = strdupz(buffer);
824
825     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->key_name);
826     rs->key_fullnamename = strdupz(buffer);
827
828     // CHART VARIABLES FOR THIS DIMENSION
829     // -----------------------------------
830     //
831     // dimensions are available as:
832     // - $id
833     // - $name
834
835     rs->var_local_id           = rrdvar_create_and_index("local", &st->variables_root_index, rs->key_id, rs->type, rs->value);
836     rs->var_local_name         = rrdvar_create_and_index("local", &st->variables_root_index, rs->key_name, rs->type, rs->value);
837
838     // FAMILY VARIABLES FOR THIS DIMENSION
839     // -----------------------------------
840     //
841     // dimensions are available as:
842     // - $id                 (only the first, when multiple overlap)
843     // - $name               (only the first, when multiple overlap)
844     // - $chart-context.id
845     // - $chart-context.name
846
847     rs->var_family_id          = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_id, rs->type, rs->value);
848     rs->var_family_name        = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_name, rs->type, rs->value);
849     rs->var_family_contextid   = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_contextid, rs->type, rs->value);
850     rs->var_family_contextname = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_contextname, rs->type, rs->value);
851
852     // HOST VARIABLES FOR THIS DIMENSION
853     // -----------------------------------
854     //
855     // dimensions are available as:
856     // - $chart-id.id
857     // - $chart-id.name
858     // - $chart-name.id
859     // - $chart-name.name
860
861     rs->var_host_chartidid      = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullidid, rs->type, rs->value);
862     rs->var_host_chartidname    = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullidname, rs->type, rs->value);
863     rs->var_host_chartnameid    = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullnameid, rs->type, rs->value);
864     rs->var_host_chartnamename  = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullnamename, rs->type, rs->value);
865 }
866
867 RRDDIMVAR *rrddimvar_create(RRDDIM *rd, int type, const char *prefix, const char *suffix, void *value, uint32_t options) {
868     RRDSET *st = rd->rrdset;
869
870     debug(D_VARIABLES, "RRDDIMSET create for chart id '%s' name '%s', dimension id '%s', name '%s%s%s'", st->id, st->name, rd->id, (prefix)?prefix:"", rd->name, (suffix)?suffix:"");
871
872     if(!prefix) prefix = "";
873     if(!suffix) suffix = "";
874
875     RRDDIMVAR *rs = (RRDDIMVAR *)callocz(1, sizeof(RRDDIMVAR));
876
877     rs->prefix = strdupz(prefix);
878     rs->suffix = strdupz(suffix);
879
880     rs->type = type;
881     rs->value = value;
882     rs->options = options;
883     rs->rrddim = rd;
884
885     rs->next = rd->variables;
886     rd->variables = rs;
887
888     rrddimvar_create_variables(rs);
889
890     return rs;
891 }
892
893 void rrddimvar_rename_all(RRDDIM *rd) {
894     RRDSET *st = rd->rrdset;
895     debug(D_VARIABLES, "RRDDIMSET rename for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
896
897     RRDDIMVAR *rs, *next = rd->variables;
898     while((rs = next)) {
899         next = rs->next;
900         rrddimvar_create_variables(rs);
901     }
902 }
903
904 void rrddimvar_free(RRDDIMVAR *rs) {
905     RRDDIM *rd = rs->rrddim;
906     RRDSET *st = rd->rrdset;
907     debug(D_VARIABLES, "RRDDIMSET free for chart id '%s' name '%s', dimension id '%s', name '%s', prefix='%s', suffix='%s'", st->id, st->name, rd->id, rd->name, rs->prefix, rs->suffix);
908
909     rrddimvar_free_variables(rs);
910
911     if(rd->variables == rs) {
912         debug(D_VARIABLES, "RRDDIMSET removing first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
913         rd->variables = rs->next;
914     }
915     else {
916         debug(D_VARIABLES, "RRDDIMSET removing non-first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
917         RRDDIMVAR *t;
918         for (t = rd->variables; t && t->next != rs; t = t->next) ;
919         if(!t) error("RRDDIMVAR '%s' not found in dimension '%s/%s' variables linked list", rs->key_name, st->id, rd->id);
920         else t->next = rs->next;
921     }
922
923     freez(rs->prefix);
924     freez(rs->suffix);
925     freez(rs);
926 }
927
928 // ----------------------------------------------------------------------------
929 // RRDSETVAR management
930 // CHART VARIABLES
931
932 static inline void rrdsetvar_free_variables(RRDSETVAR *rs) {
933     RRDSET *st = rs->rrdset;
934
935     // CHART
936
937     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->var_local);
938     rs->var_local = NULL;
939
940     // FAMILY
941
942     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family);
943     rs->var_family = NULL;
944
945     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host);
946     rs->var_host = NULL;
947
948     // HOST
949
950     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_name);
951     rs->var_family_name = NULL;
952
953     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_name);
954     rs->var_host_name = NULL;
955
956     // KEYS
957
958     freez(rs->key_fullid);
959     rs->key_fullid = NULL;
960
961     freez(rs->key_fullname);
962     rs->key_fullname = NULL;
963 }
964
965 static inline void rrdsetvar_create_variables(RRDSETVAR *rs) {
966     rrdsetvar_free_variables(rs);
967
968     RRDSET *st = rs->rrdset;
969
970     // KEYS
971
972     char buffer[RRDVAR_MAX_LENGTH + 1];
973     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->id, rs->variable);
974     rs->key_fullid = strdupz(buffer);
975
976     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rs->variable);
977     rs->key_fullname = strdupz(buffer);
978
979     // CHART
980
981     rs->var_local       = rrdvar_create_and_index("local",  &st->variables_root_index,               rs->variable, rs->type, rs->value);
982
983     // FAMILY
984
985     rs->var_family      = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index,    rs->key_fullid,   rs->type, rs->value);
986     rs->var_family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index,    rs->key_fullname, rs->type, rs->value);
987
988     // HOST
989
990     rs->var_host        = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index,      rs->key_fullid,   rs->type, rs->value);
991     rs->var_host_name   = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index,      rs->key_fullname, rs->type, rs->value);
992
993 }
994
995 RRDSETVAR *rrdsetvar_create(RRDSET *st, const char *variable, int type, void *value, uint32_t options) {
996     debug(D_VARIABLES, "RRDVARSET create for chart id '%s' name '%s' with variable name '%s'", st->id, st->name, variable);
997     RRDSETVAR *rs = (RRDSETVAR *)callocz(1, sizeof(RRDSETVAR));
998
999     rs->variable = strdupz(variable);
1000     rs->type = type;
1001     rs->value = value;
1002     rs->options = options;
1003     rs->rrdset = st;
1004
1005     rs->next = st->variables;
1006     st->variables = rs;
1007
1008     rrdsetvar_create_variables(rs);
1009
1010     return rs;
1011 }
1012
1013 void rrdsetvar_rename_all(RRDSET *st) {
1014     debug(D_VARIABLES, "RRDSETVAR rename for chart id '%s' name '%s'", st->id, st->name);
1015
1016     RRDSETVAR *rs, *next = st->variables;
1017     while((rs = next)) {
1018         next = rs->next;
1019         rrdsetvar_create_variables(rs);
1020     }
1021
1022     rrdsetcalc_link_matching(st);
1023 }
1024
1025 void rrdsetvar_free(RRDSETVAR *rs) {
1026     RRDSET *st = rs->rrdset;
1027     debug(D_VARIABLES, "RRDSETVAR free for chart id '%s' name '%s', variable '%s'", st->id, st->name, rs->variable);
1028
1029     if(st->variables == rs) {
1030         st->variables = rs->next;
1031     }
1032     else {
1033         RRDSETVAR *t;
1034         for (t = st->variables; t && t->next != rs; t = t->next);
1035         if(!t) error("RRDSETVAR '%s' not found in chart '%s' variables linked list", rs->key_fullname, st->id);
1036         else t->next = rs->next;
1037     }
1038
1039     rrdsetvar_free_variables(rs);
1040
1041     freez(rs->variable);
1042     freez(rs);
1043 }
1044
1045 // ----------------------------------------------------------------------------
1046 // RRDCALC management
1047
1048 inline const char *rrdcalc_status2string(int status) {
1049     switch(status) {
1050         case RRDCALC_STATUS_REMOVED:
1051             return "REMOVED";
1052
1053         case RRDCALC_STATUS_UNDEFINED:
1054             return "UNDEFINED";
1055
1056         case RRDCALC_STATUS_UNINITIALIZED:
1057             return "UNINITIALIZED";
1058
1059         case RRDCALC_STATUS_CLEAR:
1060             return "CLEAR";
1061
1062         case RRDCALC_STATUS_RAISED:
1063             return "RAISED";
1064
1065         case RRDCALC_STATUS_WARNING:
1066             return "WARNING";
1067
1068         case RRDCALC_STATUS_CRITICAL:
1069             return "CRITICAL";
1070
1071         default:
1072             error("Unknown alarm status %d", status);
1073             return "UNKNOWN";
1074     }
1075 }
1076
1077 static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
1078     debug(D_HEALTH, "Health linking alarm '%s.%s' to chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, st->rrdhost->hostname);
1079
1080     rc->last_status_change = now_realtime_sec();
1081     rc->rrdset = st;
1082
1083     rc->rrdset_next = st->alarms;
1084     rc->rrdset_prev = NULL;
1085     
1086     if(rc->rrdset_next)
1087         rc->rrdset_next->rrdset_prev = rc;
1088
1089     st->alarms = rc;
1090
1091     if(rc->update_every < rc->rrdset->update_every) {
1092         error("Health alarm '%s.%s' has update every %d, less than chart update every %d. Setting alarm update frequency to %d.", rc->rrdset->id, rc->name, rc->update_every, rc->rrdset->update_every, rc->rrdset->update_every);
1093         rc->update_every = rc->rrdset->update_every;
1094     }
1095
1096     if(!isnan(rc->green) && isnan(st->green)) {
1097         debug(D_HEALTH, "Health alarm '%s.%s' green threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->green, rc->green);
1098         st->green = rc->green;
1099     }
1100
1101     if(!isnan(rc->red) && isnan(st->red)) {
1102         debug(D_HEALTH, "Health alarm '%s.%s' red threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->red, rc->red);
1103         st->red = rc->red;
1104     }
1105
1106     rc->local  = rrdvar_create_and_index("local",  &st->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
1107     rc->family = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
1108
1109     char fullname[RRDVAR_MAX_LENGTH + 1];
1110     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->id, rc->name);
1111     rc->hostid   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
1112
1113     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rc->name);
1114     rc->hostname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
1115
1116         if(!rc->units) rc->units = strdupz(st->units);
1117
1118     {
1119         time_t now = now_realtime_sec();
1120         health_alarm_log(
1121                 st->rrdhost,
1122                 rc->id,
1123                 rc->next_event_id++,
1124                 now,
1125                 rc->name,
1126                 rc->rrdset->id,
1127                 rc->rrdset->family,
1128                 rc->exec,
1129                 rc->recipient,
1130                 now - rc->last_status_change,
1131                 rc->old_value,
1132                 rc->value,
1133                 rc->status,
1134                 RRDCALC_STATUS_UNINITIALIZED,
1135                 rc->source,
1136                 rc->units,
1137                 rc->info,
1138                 0,
1139                 0
1140         );
1141     }
1142 }
1143
1144 static inline int rrdcalc_is_matching_this_rrdset(RRDCALC *rc, RRDSET *st) {
1145     if(     (rc->hash_chart == st->hash      && !strcmp(rc->chart, st->id)) ||
1146             (rc->hash_chart == st->hash_name && !strcmp(rc->chart, st->name)))
1147         return 1;
1148
1149     return 0;
1150 }
1151
1152 // this has to be called while the RRDHOST is locked
1153 inline void rrdsetcalc_link_matching(RRDSET *st) {
1154     // debug(D_HEALTH, "find matching alarms for chart '%s'", st->id);
1155
1156     RRDCALC *rc;
1157     for(rc = st->rrdhost->alarms; rc ; rc = rc->next) {
1158         if(unlikely(rc->rrdset))
1159             continue;
1160
1161         if(unlikely(rrdcalc_is_matching_this_rrdset(rc, st)))
1162             rrdsetcalc_link(st, rc);
1163     }
1164 }
1165
1166 // this has to be called while the RRDHOST is locked
1167 inline void rrdsetcalc_unlink(RRDCALC *rc) {
1168     RRDSET *st = rc->rrdset;
1169
1170     if(!st) {
1171         debug(D_HEALTH, "Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
1172         error("Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
1173         return;
1174     }
1175
1176     {
1177         time_t now = now_realtime_sec();
1178         health_alarm_log(
1179                 st->rrdhost,
1180                 rc->id,
1181                 rc->next_event_id++,
1182                 now,
1183                 rc->name,
1184                 rc->rrdset->id,
1185                 rc->rrdset->family,
1186                 rc->exec,
1187                 rc->recipient,
1188                 now - rc->last_status_change,
1189                 rc->old_value,
1190                 rc->value,
1191                 rc->status,
1192                 RRDCALC_STATUS_REMOVED,
1193                 rc->source,
1194                 rc->units,
1195                 rc->info,
1196                 0,
1197                 0
1198         );
1199     }
1200
1201     RRDHOST *host = st->rrdhost;
1202
1203     debug(D_HEALTH, "Health unlinking alarm '%s.%s' from chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, host->hostname);
1204
1205     // unlink it
1206     if(rc->rrdset_prev)
1207         rc->rrdset_prev->rrdset_next = rc->rrdset_next;
1208
1209     if(rc->rrdset_next)
1210         rc->rrdset_next->rrdset_prev = rc->rrdset_prev;
1211
1212     if(st->alarms == rc)
1213         st->alarms = rc->rrdset_next;
1214
1215     rc->rrdset_prev = rc->rrdset_next = NULL;
1216
1217     rrdvar_free(st->rrdhost, &st->variables_root_index, rc->local);
1218     rc->local = NULL;
1219
1220     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rc->family);
1221     rc->family = NULL;
1222
1223     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostid);
1224     rc->hostid = NULL;
1225
1226     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostname);
1227     rc->hostname = NULL;
1228
1229     rc->rrdset = NULL;
1230
1231     // RRDCALC will remain in RRDHOST
1232     // so that if the matching chart is found in the future
1233     // it will be applied automatically
1234 }
1235
1236 RRDCALC *rrdcalc_find(RRDSET *st, const char *name) {
1237     RRDCALC *rc;
1238     uint32_t hash = simple_hash(name);
1239
1240     for( rc = st->alarms; rc ; rc = rc->rrdset_next ) {
1241         if(unlikely(rc->hash == hash && !strcmp(rc->name, name)))
1242             return rc;
1243     }
1244
1245     return NULL;
1246 }
1247
1248 static inline int rrdcalc_exists(RRDHOST *host, const char *chart, const char *name, uint32_t hash_chart, uint32_t hash_name) {
1249     RRDCALC *rc;
1250
1251     if(unlikely(!chart)) {
1252         error("attempt to find RRDCALC '%s' without giving a chart name", name);
1253         return 1;
1254     }
1255
1256     if(unlikely(!hash_chart)) hash_chart = simple_hash(chart);
1257     if(unlikely(!hash_name))  hash_name  = simple_hash(name);
1258
1259     // make sure it does not already exist
1260     for(rc = host->alarms; rc ; rc = rc->next) {
1261         if (unlikely(rc->chart && rc->hash == hash_name && rc->hash_chart == hash_chart && !strcmp(name, rc->name) && !strcmp(chart, rc->chart))) {
1262             debug(D_HEALTH, "Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
1263             error("Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
1264             return 1;
1265         }
1266     }
1267
1268     return 0;
1269 }
1270
1271 static inline uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const char *name, uint32_t *next_event_id) {
1272     if(chart && name) {
1273         uint32_t hash_chart = simple_hash(chart);
1274         uint32_t hash_name = simple_hash(name);
1275
1276         // re-use old IDs, by looking them up in the alarm log
1277         ALARM_ENTRY *ae;
1278         for(ae = host->health_log.alarms; ae ;ae = ae->next) {
1279             if(unlikely(ae->hash_name == hash_name && ae->hash_chart == hash_chart && !strcmp(name, ae->name) && !strcmp(chart, ae->chart))) {
1280                 if(next_event_id) *next_event_id = ae->alarm_event_id + 1;
1281                 return ae->alarm_id;
1282             }
1283         }
1284     }
1285
1286     return host->health_log.next_alarm_id++;
1287 }
1288
1289 static inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) {
1290     rrdhost_check_rdlock(host);
1291
1292     if(rc->calculation) {
1293         rc->calculation->status = &rc->status;
1294         rc->calculation->this = &rc->value;
1295         rc->calculation->after = &rc->db_after;
1296         rc->calculation->before = &rc->db_before;
1297         rc->calculation->rrdcalc = rc;
1298     }
1299
1300     if(rc->warning) {
1301         rc->warning->status = &rc->status;
1302         rc->warning->this = &rc->value;
1303         rc->warning->after = &rc->db_after;
1304         rc->warning->before = &rc->db_before;
1305         rc->warning->rrdcalc = rc;
1306     }
1307
1308     if(rc->critical) {
1309         rc->critical->status = &rc->status;
1310         rc->critical->this = &rc->value;
1311         rc->critical->after = &rc->db_after;
1312         rc->critical->before = &rc->db_before;
1313         rc->critical->rrdcalc = rc;
1314     }
1315
1316     // link it to the host
1317     if(likely(host->alarms)) {
1318         // append it
1319         RRDCALC *t;
1320         for(t = host->alarms; t && t->next ; t = t->next) ;
1321         t->next = rc;
1322     }
1323     else {
1324         host->alarms = rc;
1325     }
1326
1327     // link it to its chart
1328     RRDSET *st;
1329     for(st = host->rrdset_root; st ; st = st->next) {
1330         if(rrdcalc_is_matching_this_rrdset(rc, st)) {
1331             rrdsetcalc_link(st, rc);
1332             break;
1333         }
1334     }
1335 }
1336
1337 static inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart) {
1338
1339     debug(D_HEALTH, "Health creating dynamic alarm (from template) '%s.%s'", chart, rt->name);
1340
1341     if(rrdcalc_exists(host, chart, rt->name, 0, 0))
1342         return NULL;
1343
1344     RRDCALC *rc = callocz(1, sizeof(RRDCALC));
1345     rc->next_event_id = 1;
1346     rc->id = rrdcalc_get_unique_id(host, chart, rt->name, &rc->next_event_id);
1347     rc->name = strdupz(rt->name);
1348     rc->hash = simple_hash(rc->name);
1349     rc->chart = strdupz(chart);
1350     rc->hash_chart = simple_hash(rc->chart);
1351
1352     if(rt->dimensions) rc->dimensions = strdupz(rt->dimensions);
1353
1354     rc->green = rt->green;
1355     rc->red = rt->red;
1356     rc->value = NAN;
1357     rc->old_value = NAN;
1358
1359     rc->delay_up_duration = rt->delay_up_duration;
1360     rc->delay_down_duration = rt->delay_down_duration;
1361     rc->delay_max_duration = rt->delay_max_duration;
1362     rc->delay_multiplier = rt->delay_multiplier;
1363
1364     rc->group = rt->group;
1365     rc->after = rt->after;
1366     rc->before = rt->before;
1367     rc->update_every = rt->update_every;
1368     rc->options = rt->options;
1369
1370     if(rt->exec) rc->exec = strdupz(rt->exec);
1371     if(rt->recipient) rc->recipient = strdupz(rt->recipient);
1372     if(rt->source) rc->source = strdupz(rt->source);
1373     if(rt->units) rc->units = strdupz(rt->units);
1374     if(rt->info) rc->info = strdupz(rt->info);
1375
1376     if(rt->calculation) {
1377         rc->calculation = expression_parse(rt->calculation->source, NULL, NULL);
1378         if(!rc->calculation)
1379             error("Health alarm '%s.%s': failed to parse calculation expression '%s'", chart, rt->name, rt->calculation->source);
1380     }
1381     if(rt->warning) {
1382         rc->warning = expression_parse(rt->warning->source, NULL, NULL);
1383         if(!rc->warning)
1384             error("Health alarm '%s.%s': failed to re-parse warning expression '%s'", chart, rt->name, rt->warning->source);
1385     }
1386     if(rt->critical) {
1387         rc->critical = expression_parse(rt->critical->source, NULL, NULL);
1388         if(!rc->critical)
1389             error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", chart, rt->name, rt->critical->source);
1390     }
1391
1392     debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1393           (rc->chart)?rc->chart:"NOCHART",
1394           rc->name,
1395           (rc->exec)?rc->exec:"DEFAULT",
1396           (rc->recipient)?rc->recipient:"DEFAULT",
1397           rc->green,
1398           rc->red,
1399           rc->group,
1400           rc->after,
1401           rc->before,
1402           rc->options,
1403           (rc->dimensions)?rc->dimensions:"NONE",
1404           rc->update_every,
1405           (rc->calculation)?rc->calculation->parsed_as:"NONE",
1406           (rc->warning)?rc->warning->parsed_as:"NONE",
1407           (rc->critical)?rc->critical->parsed_as:"NONE",
1408           rc->source,
1409           rc->delay_up_duration,
1410           rc->delay_down_duration,
1411           rc->delay_max_duration,
1412           rc->delay_multiplier
1413     );
1414
1415     rrdcalc_create_part2(host, rc);
1416     return rc;
1417 }
1418
1419 void rrdcalc_free(RRDHOST *host, RRDCALC *rc) {
1420     if(!rc) return;
1421
1422     debug(D_HEALTH, "Health removing alarm '%s.%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1423
1424     // unlink it from RRDSET
1425     if(rc->rrdset) rrdsetcalc_unlink(rc);
1426
1427     // unlink it from RRDHOST
1428     if(unlikely(rc == host->alarms))
1429         host->alarms = rc->next;
1430
1431     else if(likely(host->alarms)) {
1432         RRDCALC *t, *last = host->alarms;
1433         for(t = last->next; t && t != rc; last = t, t = t->next) ;
1434         if(last->next == rc)
1435             last->next = rc->next;
1436         else
1437             error("Cannot unlink alarm '%s.%s' from host '%s': not found", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1438     }
1439     else
1440         error("Cannot unlink unlink '%s.%s' from host '%s': This host does not have any calculations", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1441
1442     expression_free(rc->calculation);
1443     expression_free(rc->warning);
1444     expression_free(rc->critical);
1445
1446     freez(rc->name);
1447     freez(rc->chart);
1448     freez(rc->family);
1449     freez(rc->dimensions);
1450     freez(rc->exec);
1451     freez(rc->recipient);
1452     freez(rc->source);
1453     freez(rc->units);
1454     freez(rc->info);
1455     freez(rc);
1456 }
1457
1458 // ----------------------------------------------------------------------------
1459 // RRDCALCTEMPLATE management
1460
1461 void rrdcalctemplate_link_matching(RRDSET *st) {
1462     RRDCALCTEMPLATE *rt;
1463
1464     for(rt = st->rrdhost->templates; rt ; rt = rt->next) {
1465         if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context)
1466                 && (!rt->family_pattern || simple_pattern_matches(rt->family_pattern, st->family))) {
1467             RRDCALC *rc = rrdcalc_create(st->rrdhost, rt, st->id);
1468             if(unlikely(!rc))
1469                 error("Health tried to create alarm from template '%s', but it failed", rt->name);
1470
1471 #ifdef NETDATA_INTERNAL_CHECKS
1472             else if(rc->rrdset != st)
1473                 error("Health alarm '%s.%s' should be linked to chart '%s', but it is not", rc->chart?rc->chart:"NOCHART", rc->name, st->id);
1474 #endif
1475         }
1476     }
1477 }
1478
1479 static inline void rrdcalctemplate_free(RRDHOST *host, RRDCALCTEMPLATE *rt) {
1480     debug(D_HEALTH, "Health removing template '%s' of host '%s'", rt->name, host->hostname);
1481
1482     if(host->templates) {
1483         if(host->templates == rt) {
1484             host->templates = rt->next;
1485         }
1486         else {
1487             RRDCALCTEMPLATE *t, *last = host->templates;
1488             for (t = last->next; t && t != rt; last = t, t = t->next ) ;
1489             if(last && last->next == rt) {
1490                 last->next = rt->next;
1491                 rt->next = NULL;
1492             }
1493             else
1494                 error("Cannot find RRDCALCTEMPLATE '%s' linked in host '%s'", rt->name, host->hostname);
1495         }
1496     }
1497
1498     expression_free(rt->calculation);
1499     expression_free(rt->warning);
1500     expression_free(rt->critical);
1501
1502     freez(rt->family_match);
1503     simple_pattern_free(rt->family_pattern);
1504
1505     freez(rt->name);
1506     freez(rt->exec);
1507     freez(rt->recipient);
1508     freez(rt->context);
1509     freez(rt->source);
1510     freez(rt->units);
1511     freez(rt->info);
1512     freez(rt->dimensions);
1513     freez(rt);
1514 }
1515
1516 // ----------------------------------------------------------------------------
1517 // load health configuration
1518
// presumably the maximum length of a line of a health configuration file
// (its use is not visible in this part of the file - TODO confirm)
1519 #define HEALTH_CONF_MAX_LINE 4096
1520
// keyword strings of the health configuration
// NOTE(review): going by their names, "alarm" and "template" open a
// definition and the rest are its settings - confirm against the parser
1521 #define HEALTH_ALARM_KEY "alarm"
1522 #define HEALTH_TEMPLATE_KEY "template"
1523 #define HEALTH_ON_KEY "on"
1524 #define HEALTH_FAMILIES_KEY "families"
1525 #define HEALTH_LOOKUP_KEY "lookup"
1526 #define HEALTH_CALC_KEY "calc"
1527 #define HEALTH_EVERY_KEY "every"
1528 #define HEALTH_GREEN_KEY "green"
1529 #define HEALTH_RED_KEY "red"
1530 #define HEALTH_WARN_KEY "warn"
1531 #define HEALTH_CRIT_KEY "crit"
1532 #define HEALTH_EXEC_KEY "exec"
1533 #define HEALTH_RECIPIENT_KEY "to"
1534 #define HEALTH_UNITS_KEY "units"
1535 #define HEALTH_INFO_KEY "info"
1536 #define HEALTH_DELAY_KEY "delay"
1537 #define HEALTH_OPTIONS_KEY "options"
1538
1539 static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
1540     if(!rc->chart) {
1541         error("Health configuration for alarm '%s' does not have a chart", rc->name);
1542         return 0;
1543     }
1544
1545     if(!rc->update_every) {
1546         error("Health configuration for alarm '%s.%s' has no frequency (parameter 'every'). Ignoring it.", rc->chart?rc->chart:"NOCHART", rc->name);
1547         return 0;
1548     }
1549
1550     if(!RRDCALC_HAS_DB_LOOKUP(rc) && !rc->warning && !rc->critical) {
1551         error("Health configuration for alarm '%s.%s' is useless (no calculation, no warning and no critical evaluation)", rc->chart?rc->chart:"NOCHART", rc->name);
1552         return 0;
1553     }
1554
1555     if (rrdcalc_exists(host, rc->chart, rc->name, rc->hash_chart, rc->hash))
1556         return 0;
1557
1558     rc->id = rrdcalc_get_unique_id(&localhost, rc->chart, rc->name, &rc->next_event_id);
1559
1560     debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1561           rc->chart?rc->chart:"NOCHART",
1562           rc->name,
1563           rc->id,
1564           (rc->exec)?rc->exec:"DEFAULT",
1565           (rc->recipient)?rc->recipient:"DEFAULT",
1566           rc->green,
1567           rc->red,
1568           rc->group,
1569           rc->after,
1570           rc->before,
1571           rc->options,
1572           (rc->dimensions)?rc->dimensions:"NONE",
1573           rc->update_every,
1574           (rc->calculation)?rc->calculation->parsed_as:"NONE",
1575           (rc->warning)?rc->warning->parsed_as:"NONE",
1576           (rc->critical)?rc->critical->parsed_as:"NONE",
1577           rc->source,
1578           rc->delay_up_duration,
1579           rc->delay_down_duration,
1580           rc->delay_max_duration,
1581           rc->delay_multiplier
1582     );
1583
1584     rrdcalc_create_part2(host, rc);
1585     return 1;
1586 }
1587
1588 static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCALCTEMPLATE *rt) {
1589     if(unlikely(!rt->context)) {
1590         error("Health configuration for template '%s' does not have a context", rt->name);
1591         return 0;
1592     }
1593
1594     if(unlikely(!rt->update_every)) {
1595         error("Health configuration for template '%s' has no frequency (parameter 'every'). Ignoring it.", rt->name);
1596         return 0;
1597     }
1598
1599     if(unlikely(!RRDCALCTEMPLATE_HAS_CALCULATION(rt) && !rt->warning && !rt->critical)) {
1600         error("Health configuration for template '%s' is useless (no calculation, no warning and no critical evaluation)", rt->name);
1601         return 0;
1602     }
1603
1604     RRDCALCTEMPLATE *t, *last = NULL;
1605     for (t = host->templates; t ; last = t, t = t->next) {
1606         if(unlikely(t->hash_name == rt->hash_name && !strcmp(t->name, rt->name))) {
1607             error("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname);
1608             return 0;
1609         }
1610     }
1611
1612     debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1613           rt->name,
1614           (rt->context)?rt->context:"NONE",
1615           (rt->exec)?rt->exec:"DEFAULT",
1616           (rt->recipient)?rt->recipient:"DEFAULT",
1617           rt->green,
1618           rt->red,
1619           rt->group,
1620           rt->after,
1621           rt->before,
1622           rt->options,
1623           (rt->dimensions)?rt->dimensions:"NONE",
1624           rt->update_every,
1625           (rt->calculation)?rt->calculation->parsed_as:"NONE",
1626           (rt->warning)?rt->warning->parsed_as:"NONE",
1627           (rt->critical)?rt->critical->parsed_as:"NONE",
1628           rt->source,
1629           rt->delay_up_duration,
1630           rt->delay_down_duration,
1631           rt->delay_max_duration,
1632           rt->delay_multiplier
1633     );
1634
1635     if(likely(last)) {
1636         last->next = rt;
1637     }
1638     else {
1639         rt->next = host->templates;
1640         host->templates = rt;
1641     }
1642
1643     return 1;
1644 }
1645
// Parse a duration like "10", "-30s", "5m", "1.5h", "2d", "1w", "1M" or "1Y"
// into seconds.
//
// The number may be followed by one unit character:
//   Y = 365 days, M = 30 days, w = 7 days, d = days, h = hours, m = minutes,
//   s = seconds. No unit, or any other character, means seconds.
//
// string: the text to parse; it has to start with a digit, '+' or '-'.
// result: receives the duration in seconds, or 0 when parsing fails.
//
// Returns 1 on success and 0 when the string does not start with a number
// or when the duration does not fit in an int.
static inline int health_parse_duration(char *string, int *result) {
    *result = 0;

    // make sure it is a number
    // (the cast is needed: isdigit() is undefined for negative char values)
    if(!*string || !(isdigit((unsigned char)*string) || *string == '+' || *string == '-'))
        return 0;

    char *e = NULL;
    long double n = strtold(string, &e);

    // nothing was converted, e.g. a sign that is not followed by a number
    if(!e || e == string)
        return 0;

    long double seconds;
    switch(*e) {
        case 'Y':
            seconds = n * 86400 * 365;
            break;

        case 'M':
            seconds = n * 86400 * 30;
            break;

        case 'w':
            seconds = n * 86400 * 7;
            break;

        case 'd':
            seconds = n * 86400;
            break;

        case 'h':
            seconds = n * 3600;
            break;

        case 'm':
            seconds = n * 60;
            break;

        case 's':
        default:
            // also reached at the end of the string (no unit given)
            seconds = n;
            break;
    }

    // Converting a NaN, an infinity or a value that does not fit in an int
    // is undefined behavior, so these are rejected ("+inf" and "+nan" pass
    // the check at the top). The limit is the largest int, calculated from
    // the unsigned range so that no additional header is needed.
    const long double int_max = (long double)((unsigned int)-1 / 2);
    if(!(seconds >= -int_max - 1 && seconds <= int_max))
        return 0;

    *result = (int)seconds;
    return 1;
}
1687
/*
 * Parse the value of a 'delay' configuration line.
 *
 * The value is a list of whitespace separated "keyword value" pairs. The
 * keywords (matched case-insensitively) are:
 *
 *   up DURATION          -> *delay_up_duration
 *   down DURATION        -> *delay_down_duration
 *   max DURATION         -> *delay_max_duration
 *   multiplier NUMBER    -> *delay_multiplier (must be finite and > 0)
 *
 * Invalid values and unknown keywords are logged and otherwise ignored.
 *
 * line, path, file - location of the configuration line, used only in messages
 * string           - the text to parse; it is modified in place (whitespace
 *                    is overwritten with NUL bytes to split it into words)
 *
 * After parsing, 'up' and 'down' are set to 0 and the multiplier to 1.0 when
 * they were not (validly) given. When 'max' was not (validly) given, it is
 * raised - never lowered - to cover up * multiplier and down * multiplier.
 *
 * Always returns 1.
 */
static inline int health_parse_delay(
        size_t line, const char *path, const char *file, char *string,
        int *delay_up_duration,
        int *delay_down_duration,
        int *delay_max_duration,
        float *delay_multiplier) {

    char given_up = 0;
    char given_down = 0;
    char given_max = 0;
    char given_multiplier = 0;

    char *s = string;
    while(*s) {
        char *key = s;

        // isolate the keyword
        // (the ctype functions need an unsigned char value - a plain char may be negative)
        while(*s && !isspace((unsigned char)*s)) s++;
        while(*s && isspace((unsigned char)*s)) *s++ = '\0';

        if(!*key) break;

        // isolate its value
        char *value = s;
        while(*s && !isspace((unsigned char)*s)) s++;
        while(*s && isspace((unsigned char)*s)) *s++ = '\0';

        if(!strcasecmp(key, "up")) {
            if (!health_parse_duration(value, delay_up_duration)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_up = 1;
        }
        else if(!strcasecmp(key, "down")) {
            if (!health_parse_duration(value, delay_down_duration)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_down = 1;
        }
        else if(!strcasecmp(key, "multiplier")) {
            // an invalid multiplier is stored here, but it is replaced by 1.0 below
            *delay_multiplier = strtof(value, NULL);
            if(isnan(*delay_multiplier) || isinf(*delay_multiplier) || islessequal(*delay_multiplier, 0)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_multiplier = 1;
        }
        else if(!strcasecmp(key, "max")) {
            if (!health_parse_duration(value, delay_max_duration)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_max = 1;
        }
        else {
            error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
                  line, path, file, key);
        }
    }

    if(!given_up)
        *delay_up_duration = 0;

    if(!given_down)
        *delay_down_duration = 0;

    if(!given_multiplier)
        *delay_multiplier = 1.0;

    // without an explicit max, make sure max covers one multiplied delay
    if(!given_max) {
        if((*delay_max_duration) < (*delay_up_duration) * (*delay_multiplier))
            *delay_max_duration = (*delay_up_duration) * (*delay_multiplier);

        if((*delay_max_duration) < (*delay_down_duration) * (*delay_multiplier))
            *delay_max_duration = (*delay_down_duration) * (*delay_multiplier);
    }

    return 1;
}
1767
1768 static inline uint32_t health_parse_options(const char *s) {
1769     uint32_t options = 0;
1770     char buf[100+1] = "";
1771
1772     while(*s) {
1773         buf[0] = '\0';
1774
1775         // skip spaces
1776         while(*s && isspace(*s))
1777             s++;
1778
1779         // find the next space
1780         size_t count = 0;
1781         while(*s && count < 100 && !isspace(*s))
1782             buf[count++] = *s++;
1783
1784         if(buf[0]) {
1785             buf[count] = '\0';
1786
1787             if(!strcasecmp(buf, "no-clear-notification") || !strcasecmp(buf, "no-clear"))
1788                 options |= RRDCALC_FLAG_NO_CLEAR_NOTIFICATION;
1789             else
1790                 error("Ignoring unknown alarm option '%s'", buf);
1791         }
1792     }
1793
1794     return options;
1795 }
1796
1797 static inline int health_parse_db_lookup(
1798         size_t line, const char *path, const char *file, char *string,
1799         int *group_method, int *after, int *before, int *every,
1800         uint32_t *options, char **dimensions
1801 ) {
1802     debug(D_HEALTH, "Health configuration parsing database lookup %zu@%s/%s: %s", line, path, file, string);
1803
1804     if(*dimensions) freez(*dimensions);
1805     *dimensions = NULL;
1806     *after = 0;
1807     *before = 0;
1808     *every = 0;
1809     *options = 0;
1810
1811     char *s = string, *key;
1812
1813     // first is the group method
1814     key = s;
1815     while(*s && !isspace(*s)) s++;
1816     while(*s && isspace(*s)) *s++ = '\0';
1817     if(!*s) {
1818         error("Health configuration invalid chart calculation at line %zu of file '%s/%s': expected group method followed by the 'after' time, but got '%s'",
1819               line, path, file, key);
1820         return 0;
1821     }
1822
1823     if((*group_method = web_client_api_request_v1_data_group(key, -1)) == -1) {
1824         error("Health configuration at line %zu of file '%s/%s': invalid group method '%s'",
1825               line, path, file, key);
1826         return 0;
1827     }
1828
1829     // then is the 'after' time
1830     key = s;
1831     while(*s && !isspace(*s)) s++;
1832     while(*s && isspace(*s)) *s++ = '\0';
1833
1834     if(!health_parse_duration(key, after)) {
1835         error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' after group method",
1836               line, path, file, key);
1837         return 0;
1838     }
1839
1840     // sane defaults
1841     *every = abs(*after);
1842
1843     // now we may have optional parameters
1844     while(*s) {
1845         key = s;
1846         while(*s && !isspace(*s)) s++;
1847         while(*s && isspace(*s)) *s++ = '\0';
1848         if(!*key) break;
1849
1850         if(!strcasecmp(key, "at")) {
1851             char *value = s;
1852             while(*s && !isspace(*s)) s++;
1853             while(*s && isspace(*s)) *s++ = '\0';
1854
1855             if (!health_parse_duration(value, before)) {
1856                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1857                       line, path, file, value, key);
1858             }
1859         }
1860         else if(!strcasecmp(key, HEALTH_EVERY_KEY)) {
1861             char *value = s;
1862             while(*s && !isspace(*s)) s++;
1863             while(*s && isspace(*s)) *s++ = '\0';
1864
1865             if (!health_parse_duration(value, every)) {
1866                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1867                       line, path, file, value, key);
1868             }
1869         }
1870         else if(!strcasecmp(key, "absolute") || !strcasecmp(key, "abs") || !strcasecmp(key, "absolute_sum")) {
1871             *options |= RRDR_OPTION_ABSOLUTE;
1872         }
1873         else if(!strcasecmp(key, "min2max")) {
1874             *options |= RRDR_OPTION_MIN2MAX;
1875         }
1876         else if(!strcasecmp(key, "null2zero")) {
1877             *options |= RRDR_OPTION_NULL2ZERO;
1878         }
1879         else if(!strcasecmp(key, "percentage")) {
1880             *options |= RRDR_OPTION_PERCENTAGE;
1881         }
1882         else if(!strcasecmp(key, "unaligned")) {
1883             *options |= RRDR_OPTION_NOT_ALIGNED;
1884         }
1885         else if(!strcasecmp(key, "of")) {
1886             if(*s && strcasecmp(s, "all"))
1887                *dimensions = strdupz(s);
1888             break;
1889         }
1890         else {
1891             error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
1892                   line, path, file, key);
1893         }
1894     }
1895
1896     return 1;
1897 }
1898
// Overwrite every TAB character of the string with a space, in place.
// The pointer given is returned, so the call can wrap another expression.
static inline char *tabs2spaces(char *s) {
    for(char *tab = strchr(s, '\t'); tab; tab = strchr(tab + 1, '\t'))
        *tab = ' ';

    return s;
}
1908
// Build the "<line>@<path>/<filename>" text that identifies a configuration
// line. The text is limited to FILENAME_MAX characters and the value returned
// is the copy made by strdupz().
static inline char *health_source_file(size_t line, const char *path, const char *filename) {
    char source[FILENAME_MAX + 1];

    snprintfz(source, FILENAME_MAX, "%zu@%s/%s", line, path, filename);

    return strdupz(source);
}
1914
// Overwrite every single or double quote of the string with a space, in place.
static inline void strip_quotes(char *s) {
    static const char quotes[] = "'\"";

    for(char *q = strpbrk(s, quotes); q; q = strpbrk(q + 1, quotes))
        *q = ' ';
}
1921
1922 int health_readfile(const char *path, const char *filename) {
1923     debug(D_HEALTH, "Health configuration reading file '%s/%s'", path, filename);
1924
1925     static uint32_t
1926             hash_alarm = 0,
1927             hash_template = 0,
1928             hash_on = 0,
1929             hash_families = 0,
1930             hash_calc = 0,
1931             hash_green = 0,
1932             hash_red = 0,
1933             hash_warn = 0,
1934             hash_crit = 0,
1935             hash_exec = 0,
1936             hash_every = 0,
1937             hash_lookup = 0,
1938             hash_units = 0,
1939             hash_info = 0,
1940             hash_recipient = 0,
1941             hash_delay = 0,
1942             hash_options = 0;
1943
1944     char buffer[HEALTH_CONF_MAX_LINE + 1];
1945
1946     if(unlikely(!hash_alarm)) {
1947         hash_alarm = simple_uhash(HEALTH_ALARM_KEY);
1948         hash_template = simple_uhash(HEALTH_TEMPLATE_KEY);
1949         hash_on = simple_uhash(HEALTH_ON_KEY);
1950         hash_families = simple_uhash(HEALTH_FAMILIES_KEY);
1951         hash_calc = simple_uhash(HEALTH_CALC_KEY);
1952         hash_lookup = simple_uhash(HEALTH_LOOKUP_KEY);
1953         hash_green = simple_uhash(HEALTH_GREEN_KEY);
1954         hash_red = simple_uhash(HEALTH_RED_KEY);
1955         hash_warn = simple_uhash(HEALTH_WARN_KEY);
1956         hash_crit = simple_uhash(HEALTH_CRIT_KEY);
1957         hash_exec = simple_uhash(HEALTH_EXEC_KEY);
1958         hash_every = simple_uhash(HEALTH_EVERY_KEY);
1959         hash_units = simple_hash(HEALTH_UNITS_KEY);
1960         hash_info = simple_hash(HEALTH_INFO_KEY);
1961         hash_recipient = simple_hash(HEALTH_RECIPIENT_KEY);
1962         hash_delay = simple_uhash(HEALTH_DELAY_KEY);
1963         hash_options = simple_uhash(HEALTH_OPTIONS_KEY);
1964     }
1965
1966     snprintfz(buffer, HEALTH_CONF_MAX_LINE, "%s/%s", path, filename);
1967     FILE *fp = fopen(buffer, "r");
1968     if(!fp) {
1969         error("Health configuration cannot read file '%s'.", buffer);
1970         return 0;
1971     }
1972
1973     RRDCALC *rc = NULL;
1974     RRDCALCTEMPLATE *rt = NULL;
1975
1976     size_t line = 0, append = 0;
1977     char *s;
1978     while((s = fgets(&buffer[append], (int)(HEALTH_CONF_MAX_LINE - append), fp)) || append) {
1979         int stop_appending = !s;
1980         line++;
1981         s = trim(buffer);
1982         if(!s) continue;
1983
1984         append = strlen(s);
1985         if(!stop_appending && s[append - 1] == '\\') {
1986             s[append - 1] = ' ';
1987             append = &s[append] - buffer;
1988             if(append < HEALTH_CONF_MAX_LINE)
1989                 continue;
1990             else {
1991                 error("Health configuration has too long muli-line at line %zu of file '%s/%s'.", line, path, filename);
1992             }
1993         }
1994         append = 0;
1995
1996         char *key = s;
1997         while(*s && *s != ':') s++;
1998         if(!*s) {
1999             error("Health configuration has invalid line %zu of file '%s/%s'. It does not contain a ':'. Ignoring it.", line, path, filename);
2000             continue;
2001         }
2002         *s = '\0';
2003         s++;
2004
2005         char *value = s;
2006         key = trim(key);
2007         value = trim(value);
2008
2009         if(!key) {
2010             error("Health configuration has invalid line %zu of file '%s/%s'. Keyword is empty. Ignoring it.", line, path, filename);
2011             continue;
2012         }
2013
2014         if(!value) {
2015             error("Health configuration has invalid line %zu of file '%s/%s'. value is empty. Ignoring it.", line, path, filename);
2016             continue;
2017         }
2018
2019         uint32_t hash = simple_uhash(key);
2020
2021         if(hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) {
2022             if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
2023                 rrdcalc_free(&localhost, rc);
2024
2025             if(rt) {
2026                 if (!rrdcalctemplate_add_template_from_config(&localhost, rt))
2027                     rrdcalctemplate_free(&localhost, rt);
2028                 rt = NULL;
2029             }
2030
2031             rc = callocz(1, sizeof(RRDCALC));
2032             rc->next_event_id = 1;
2033             rc->name = tabs2spaces(strdupz(value));
2034             rc->hash = simple_hash(rc->name);
2035             rc->source = health_source_file(line, path, filename);
2036             rc->green = NAN;
2037             rc->red = NAN;
2038             rc->value = NAN;
2039             rc->old_value = NAN;
2040             rc->delay_multiplier = 1.0;
2041
2042             if(rrdvar_fix_name(rc->name))
2043                 error("Health configuration renamed alarm '%s' to '%s'", value, rc->name);
2044         }
2045         else if(hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY)) {
2046             if(rc) {
2047                 if(!rrdcalc_add_alarm_from_config(&localhost, rc))
2048                     rrdcalc_free(&localhost, rc);
2049                 rc = NULL;
2050             }
2051
2052             if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
2053                 rrdcalctemplate_free(&localhost, rt);
2054
2055             rt = callocz(1, sizeof(RRDCALCTEMPLATE));
2056             rt->name = tabs2spaces(strdupz(value));
2057             rt->hash_name = simple_hash(rt->name);
2058             rt->source = health_source_file(line, path, filename);
2059             rt->green = NAN;
2060             rt->red = NAN;
2061             rt->delay_multiplier = 1.0;
2062
2063             if(rrdvar_fix_name(rt->name))
2064                 error("Health configuration renamed template '%s' to '%s'", value, rt->name);
2065         }
2066         else if(rc) {
2067             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
2068                 if(rc->chart) {
2069                     if(strcmp(rc->chart, value))
2070                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2071                                 line, path, filename, rc->name, key, rc->chart, value, value);
2072
2073                     freez(rc->chart);
2074                 }
2075                 rc->chart = tabs2spaces(strdupz(value));
2076                 rc->hash_chart = simple_hash(rc->chart);
2077             }
2078             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
2079                 health_parse_db_lookup(line, path, filename, value, &rc->group, &rc->after, &rc->before,
2080                                        &rc->update_every,
2081                                        &rc->options, &rc->dimensions);
2082             }
2083             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
2084                 if(!health_parse_duration(value, &rc->update_every))
2085                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.",
2086                          line, path, filename, rc->name, key, value);
2087             }
2088             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
2089                 char *e;
2090                 rc->green = strtold(value, &e);
2091                 if(e && *e) {
2092                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
2093                          line, path, filename, rc->name, key, e);
2094                 }
2095             }
2096             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
2097                 char *e;
2098                 rc->red = strtold(value, &e);
2099                 if(e && *e) {
2100                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
2101                          line, path, filename, rc->name, key, e);
2102                 }
2103             }
2104             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
2105                 const char *failed_at = NULL;
2106                 int error = 0;
2107                 rc->calculation = expression_parse(value, &failed_at, &error);
2108                 if(!rc->calculation) {
2109                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
2110                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
2111                 }
2112             }
2113             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
2114                 const char *failed_at = NULL;
2115                 int error = 0;
2116                 rc->warning = expression_parse(value, &failed_at, &error);
2117                 if(!rc->warning) {
2118                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
2119                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
2120                 }
2121             }
2122             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
2123                 const char *failed_at = NULL;
2124                 int error = 0;
2125                 rc->critical = expression_parse(value, &failed_at, &error);
2126                 if(!rc->critical) {
2127                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
2128                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
2129                 }
2130             }
2131             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
2132                 if(rc->exec) {
2133                     if(strcmp(rc->exec, value))
2134                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2135                              line, path, filename, rc->name, key, rc->exec, value, value);
2136
2137                     freez(rc->exec);
2138                 }
2139                 rc->exec = tabs2spaces(strdupz(value));
2140             }
2141             else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
2142                 if(rc->recipient) {
2143                     if(strcmp(rc->recipient, value))
2144                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2145                              line, path, filename, rc->name, key, rc->recipient, value, value);
2146
2147                     freez(rc->recipient);
2148                 }
2149                 rc->recipient = tabs2spaces(strdupz(value));
2150             }
2151             else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
2152                 if(rc->units) {
2153                     if(strcmp(rc->units, value))
2154                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2155                              line, path, filename, rc->name, key, rc->units, value, value);
2156
2157                     freez(rc->units);
2158                 }
2159                 rc->units = tabs2spaces(strdupz(value));
2160                 strip_quotes(rc->units);
2161             }
2162             else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
2163                 if(rc->info) {
2164                     if(strcmp(rc->info, value))
2165                         error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2166                              line, path, filename, rc->name, key, rc->info, value, value);
2167
2168                     freez(rc->info);
2169                 }
2170                 rc->info = tabs2spaces(strdupz(value));
2171                 strip_quotes(rc->info);
2172             }
2173             else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
2174                 health_parse_delay(line, path, filename, value, &rc->delay_up_duration, &rc->delay_down_duration, &rc->delay_max_duration, &rc->delay_multiplier);
2175             }
2176             else if(hash == hash_options && !strcasecmp(key, HEALTH_OPTIONS_KEY)) {
2177                 rc->options |= health_parse_options(value);
2178             }
2179             else {
2180                 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has unknown key '%s'.",
2181                      line, path, filename, rc->name, key);
2182             }
2183         }
2184         else if(rt) {
2185             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
2186                 if(rt->context) {
2187                     if(strcmp(rt->context, value))
2188                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2189                                 line, path, filename, rt->name, key, rt->context, value, value);
2190
2191                     freez(rt->context);
2192                 }
2193                 rt->context = tabs2spaces(strdupz(value));
2194                 rt->hash_context = simple_hash(rt->context);
2195             }
2196             else if(hash == hash_families && !strcasecmp(key, HEALTH_FAMILIES_KEY)) {
2197                 freez(rt->family_match);
2198                 simple_pattern_free(rt->family_pattern);
2199
2200                 rt->family_match = tabs2spaces(strdupz(value));
2201                 rt->family_pattern = simple_pattern_create(rt->family_match, SIMPLE_PATTERN_EXACT);
2202             }
2203             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
2204                 health_parse_db_lookup(line, path, filename, value, &rt->group, &rt->after, &rt->before,
2205                                        &rt->update_every, &rt->options, &rt->dimensions);
2206             }
2207             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
2208                 if(!health_parse_duration(value, &rt->update_every))
2209                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' cannot parse duration: '%s'.",
2210                          line, path, filename, rt->name, key, value);
2211             }
2212             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
2213                 char *e;
2214                 rt->green = strtold(value, &e);
2215                 if(e && *e) {
2216                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
2217                          line, path, filename, rt->name, key, e);
2218                 }
2219             }
2220             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
2221                 char *e;
2222                 rt->red = strtold(value, &e);
2223                 if(e && *e) {
2224                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
2225                          line, path, filename, rt->name, key, e);
2226                 }
2227             }
2228             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
2229                 const char *failed_at = NULL;
2230                 int error = 0;
2231                 rt->calculation = expression_parse(value, &failed_at, &error);
2232                 if(!rt->calculation) {
2233                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
2234                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
2235                 }
2236             }
2237             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
2238                 const char *failed_at = NULL;
2239                 int error = 0;
2240                 rt->warning = expression_parse(value, &failed_at, &error);
2241                 if(!rt->warning) {
2242                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
2243                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
2244                 }
2245             }
2246             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
2247                 const char *failed_at = NULL;
2248                 int error = 0;
2249                 rt->critical = expression_parse(value, &failed_at, &error);
2250                 if(!rt->critical) {
2251                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
2252                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
2253                 }
2254             }
2255             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
2256                 if(rt->exec) {
2257                     if(strcmp(rt->exec, value))
2258                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2259                              line, path, filename, rt->name, key, rt->exec, value, value);
2260
2261                     freez(rt->exec);
2262                 }
2263                 rt->exec = tabs2spaces(strdupz(value));
2264             }
2265             else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
2266                 if(rt->recipient) {
2267                     if(strcmp(rt->recipient, value))
2268                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2269                              line, path, filename, rt->name, key, rt->recipient, value, value);
2270
2271                     freez(rt->recipient);
2272                 }
2273                 rt->recipient = tabs2spaces(strdupz(value));
2274             }
2275             else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
2276                 if(rt->units) {
2277                     if(strcmp(rt->units, value))
2278                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2279                              line, path, filename, rt->name, key, rt->units, value, value);
2280
2281                     freez(rt->units);
2282                 }
2283                 rt->units = tabs2spaces(strdupz(value));
2284                 strip_quotes(rt->units);
2285             }
2286             else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
2287                 if(rt->info) {
2288                     if(strcmp(rt->info, value))
2289                         error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
2290                              line, path, filename, rt->name, key, rt->info, value, value);
2291
2292                     freez(rt->info);
2293                 }
2294                 rt->info = tabs2spaces(strdupz(value));
2295                 strip_quotes(rt->info);
2296             }
2297             else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
2298                 health_parse_delay(line, path, filename, value, &rt->delay_up_duration, &rt->delay_down_duration, &rt->delay_max_duration, &rt->delay_multiplier);
2299             }
2300             else if(hash == hash_options && !strcasecmp(key, HEALTH_OPTIONS_KEY)) {
2301                 rt->options |= health_parse_options(value);
2302             }
2303             else {
2304                 error("Health configuration at line %zu of file '%s/%s' for template '%s' has unknown key '%s'.",
2305                       line, path, filename, rt->name, key);
2306             }
2307         }
2308         else {
2309             error("Health configuration at line %zu of file '%s/%s' has unknown key '%s'. Expected either '" HEALTH_ALARM_KEY "' or '" HEALTH_TEMPLATE_KEY "'.",
2310                   line, path, filename, key);
2311         }
2312     }
2313
2314     if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
2315         rrdcalc_free(&localhost, rc);
2316
2317     if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
2318         rrdcalctemplate_free(&localhost, rt);
2319
2320     fclose(fp);
2321     return 1;
2322 }
2323
2324 void health_readdir(const char *path) {
2325     size_t pathlen = strlen(path);
2326
2327     debug(D_HEALTH, "Health configuration reading directory '%s'", path);
2328
2329     DIR *dir = opendir(path);
2330     if (!dir) {
2331         error("Health configuration cannot open directory '%s'.", path);
2332         return;
2333     }
2334
2335     struct dirent *de = NULL;
2336     while ((de = readdir(dir))) {
2337         size_t len = strlen(de->d_name);
2338
2339         if(de->d_type == DT_DIR
2340            && (
2341                    (de->d_name[0] == '.' && de->d_name[1] == '\0')
2342                    || (de->d_name[0] == '.' && de->d_name[1] == '.' && de->d_name[2] == '\0')
2343            )) {
2344             debug(D_HEALTH, "Ignoring directory '%s'", de->d_name);
2345             continue;
2346         }
2347
2348         else if(de->d_type == DT_DIR) {
2349             char *s = mallocz(pathlen + strlen(de->d_name) + 2);
2350             strcpy(s, path);
2351             strcat(s, "/");
2352             strcat(s, de->d_name);
2353             health_readdir(s);
2354             freez(s);
2355             continue;
2356         }
2357
2358         else if((de->d_type == DT_LNK || de->d_type == DT_REG || de->d_type == DT_UNKNOWN) &&
2359                 len > 5 && !strcmp(&de->d_name[len - 5], ".conf")) {
2360             health_readfile(path, de->d_name);
2361         }
2362
2363         else debug(D_HEALTH, "Ignoring file '%s'", de->d_name);
2364     }
2365
2366     closedir(dir);
2367 }
2368
2369 static inline char *health_config_dir(void) {
2370     char buffer[FILENAME_MAX + 1];
2371     snprintfz(buffer, FILENAME_MAX, "%s/health.d", config_get("global", "config directory", CONFIG_DIR));
2372     return config_get("health", "health configuration directory", buffer);
2373 }
2374
2375 void health_init(void) {
2376     debug(D_HEALTH, "Health configuration initializing");
2377
2378     if(!(health_enabled = config_get_boolean("health", "enabled", 1))) {
2379         debug(D_HEALTH, "Health is disabled.");
2380         return;
2381     }
2382
2383     char *pathname = config_get("health", "health db directory", VARLIB_DIR "/health");
2384     if(mkdir(pathname, 0770) == -1 && errno != EEXIST)
2385         fatal("Cannot create directory '%s'.", pathname);
2386
2387     char filename[FILENAME_MAX + 1];
2388     snprintfz(filename, FILENAME_MAX, "%s/health-log.db", pathname);
2389     health.log_filename = config_get("health", "health db file", filename);
2390
2391     health_alarm_log_load(&localhost);
2392     health_alarm_log_open();
2393
2394     char *path = health_config_dir();
2395
2396     {
2397         char buffer[FILENAME_MAX + 1];
2398         snprintfz(buffer, FILENAME_MAX, "%s/alarm-notify.sh", config_get("global", "plugins directory", PLUGINS_DIR));
2399         health.health_default_exec = config_get("health", "script to execute on alarm", buffer);
2400     }
2401
2402     long n = config_get_number("health", "in memory max health log entries", (long)localhost.health_log.max);
2403     if(n < 10) {
2404         error("Health configuration has invalid max log entries %ld. Using default %u", n, localhost.health_log.max);
2405         config_set_number("health", "in memory max health log entries", (long)localhost.health_log.max);
2406     }
2407     else localhost.health_log.max = (unsigned int)n;
2408
2409     rrdhost_rwlock(&localhost);
2410     health_readdir(path);
2411     rrdhost_unlock(&localhost);
2412 }
2413
2414 // ----------------------------------------------------------------------------
2415 // JSON generation
2416
2417 static inline void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) {
2418     if(value && *value)
2419         buffer_sprintf(wb, "%s\"%s\":\"%s\"%s", prefix, label, value, suffix);
2420     else
2421         buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix);
2422 }
2423
2424 static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) {
2425     buffer_sprintf(wb,
2426             "\n\t{\n"
2427                     "\t\t\"hostname\": \"%s\",\n"
2428                     "\t\t\"unique_id\": %u,\n"
2429                     "\t\t\"alarm_id\": %u,\n"
2430                     "\t\t\"alarm_event_id\": %u,\n"
2431                     "\t\t\"name\": \"%s\",\n"
2432                     "\t\t\"chart\": \"%s\",\n"
2433                     "\t\t\"family\": \"%s\",\n"
2434                     "\t\t\"processed\": %s,\n"
2435                     "\t\t\"updated\": %s,\n"
2436                     "\t\t\"exec_run\": %lu,\n"
2437                     "\t\t\"exec_failed\": %s,\n"
2438                     "\t\t\"exec\": \"%s\",\n"
2439                     "\t\t\"recipient\": \"%s\",\n"
2440                     "\t\t\"exec_code\": %d,\n"
2441                     "\t\t\"source\": \"%s\",\n"
2442                     "\t\t\"units\": \"%s\",\n"
2443                     "\t\t\"info\": \"%s\",\n"
2444                     "\t\t\"when\": %lu,\n"
2445                     "\t\t\"duration\": %lu,\n"
2446                     "\t\t\"non_clear_duration\": %lu,\n"
2447                     "\t\t\"status\": \"%s\",\n"
2448                     "\t\t\"old_status\": \"%s\",\n"
2449                     "\t\t\"delay\": %d,\n"
2450                     "\t\t\"delay_up_to_timestamp\": %lu,\n"
2451                     "\t\t\"updated_by_id\": %u,\n"
2452                     "\t\t\"updates_id\": %u,\n"
2453                     "\t\t\"value_string\": \"%s\",\n"
2454                     "\t\t\"old_value_string\": \"%s\",\n"
2455             , host->hostname
2456             , ae->unique_id
2457             , ae->alarm_id
2458             , ae->alarm_event_id
2459             , ae->name
2460             , ae->chart
2461             , ae->family
2462             , (ae->flags & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false"
2463             , (ae->flags & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false"
2464             , (unsigned long)ae->exec_run_timestamp
2465             , (ae->flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false"
2466             , ae->exec?ae->exec:health.health_default_exec
2467             , ae->recipient?ae->recipient:health.health_default_recipient
2468             , ae->exec_code
2469             , ae->source
2470             , ae->units?ae->units:""
2471             , ae->info?ae->info:""
2472             , (unsigned long)ae->when
2473             , (unsigned long)ae->duration
2474             , (unsigned long)ae->non_clear_duration
2475             , rrdcalc_status2string(ae->new_status)
2476             , rrdcalc_status2string(ae->old_status)
2477             , ae->delay
2478             , (unsigned long)ae->delay_up_to_timestamp
2479             , ae->updated_by_id
2480             , ae->updates_id
2481             , ae->new_value_string
2482             , ae->old_value_string
2483     );
2484
2485     if(unlikely(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION)) {
2486         buffer_strcat(wb, "\t\t\"no_clear_notification\": true,\n");
2487     }
2488
2489     buffer_strcat(wb, "\t\t\"value\":");
2490     buffer_rrd_value(wb, ae->new_value);
2491     buffer_strcat(wb, ",\n");
2492
2493     buffer_strcat(wb, "\t\t\"old_value\":");
2494     buffer_rrd_value(wb, ae->old_value);
2495     buffer_strcat(wb, "\n");
2496
2497     buffer_strcat(wb, "\t}");
2498 }
2499
2500 void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after) {
2501     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
2502
2503     buffer_strcat(wb, "[");
2504
2505     unsigned int max = host->health_log.max;
2506     unsigned int count = 0;
2507     ALARM_ENTRY *ae;
2508     for(ae = host->health_log.alarms; ae && count < max ; count++, ae = ae->next) {
2509         if(ae->unique_id > after) {
2510             if(likely(count)) buffer_strcat(wb, ",");
2511             health_alarm_entry2json_nolock(wb, ae, host);
2512         }
2513     }
2514
2515     buffer_strcat(wb, "\n]\n");
2516
2517     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2518 }
2519
2520 static inline void health_rrdcalc2json_nolock(BUFFER *wb, RRDCALC *rc) {
2521     buffer_sprintf(wb,
2522            "\t\t\"%s.%s\": {\n"
2523                    "\t\t\t\"id\": %lu,\n"
2524                    "\t\t\t\"name\": \"%s\",\n"
2525                    "\t\t\t\"chart\": \"%s\",\n"
2526                    "\t\t\t\"family\": \"%s\",\n"
2527                    "\t\t\t\"active\": %s,\n"
2528                    "\t\t\t\"exec\": \"%s\",\n"
2529                    "\t\t\t\"recipient\": \"%s\",\n"
2530                    "\t\t\t\"source\": \"%s\",\n"
2531                    "\t\t\t\"units\": \"%s\",\n"
2532                    "\t\t\t\"info\": \"%s\",\n"
2533                                    "\t\t\t\"status\": \"%s\",\n"
2534                    "\t\t\t\"last_status_change\": %lu,\n"
2535                    "\t\t\t\"last_updated\": %lu,\n"
2536                    "\t\t\t\"next_update\": %lu,\n"
2537                    "\t\t\t\"update_every\": %d,\n"
2538                    "\t\t\t\"delay_up_duration\": %d,\n"
2539                    "\t\t\t\"delay_down_duration\": %d,\n"
2540                    "\t\t\t\"delay_max_duration\": %d,\n"
2541                    "\t\t\t\"delay_multiplier\": %f,\n"
2542                    "\t\t\t\"delay\": %d,\n"
2543                    "\t\t\t\"delay_up_to_timestamp\": %lu,\n"
2544            , rc->chart, rc->name
2545            , (unsigned long)rc->id
2546            , rc->name
2547            , rc->chart
2548            , (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:""
2549            , (rc->rrdset)?"true":"false"
2550            , rc->exec?rc->exec:health.health_default_exec
2551            , rc->recipient?rc->recipient:health.health_default_recipient
2552            , rc->source
2553            , rc->units?rc->units:""
2554            , rc->info?rc->info:""
2555            , rrdcalc_status2string(rc->status)
2556            , (unsigned long)rc->last_status_change
2557            , (unsigned long)rc->last_updated
2558            , (unsigned long)rc->next_update
2559            , rc->update_every
2560            , rc->delay_up_duration
2561            , rc->delay_down_duration
2562            , rc->delay_max_duration
2563            , rc->delay_multiplier
2564            , rc->delay_last
2565            , (unsigned long)rc->delay_up_to_timestamp
2566     );
2567
2568     if(unlikely(rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)) {
2569         buffer_strcat(wb, "\t\t\t\"no_clear_notification\": true,\n");
2570     }
2571
2572     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
2573         if(rc->dimensions && *rc->dimensions)
2574             health_string2json(wb, "\t\t\t", "lookup_dimensions", rc->dimensions, ",\n");
2575
2576         buffer_sprintf(wb,
2577                        "\t\t\t\"db_after\": %lu,\n"
2578                        "\t\t\t\"db_before\": %lu,\n"
2579                        "\t\t\t\"lookup_method\": \"%s\",\n"
2580                        "\t\t\t\"lookup_after\": %d,\n"
2581                        "\t\t\t\"lookup_before\": %d,\n"
2582                        "\t\t\t\"lookup_options\": \"",
2583                        (unsigned long) rc->db_after,
2584                        (unsigned long) rc->db_before,
2585                        group_method2string(rc->group),
2586                        rc->after,
2587                        rc->before
2588         );
2589         buffer_data_options2string(wb, rc->options);
2590         buffer_strcat(wb, "\",\n");
2591     }
2592
2593     if(rc->calculation) {
2594         health_string2json(wb, "\t\t\t", "calc", rc->calculation->source, ",\n");
2595         health_string2json(wb, "\t\t\t", "calc_parsed", rc->calculation->parsed_as, ",\n");
2596     }
2597
2598     if(rc->warning) {
2599         health_string2json(wb, "\t\t\t", "warn", rc->warning->source, ",\n");
2600         health_string2json(wb, "\t\t\t", "warn_parsed", rc->warning->parsed_as, ",\n");
2601     }
2602
2603     if(rc->critical) {
2604         health_string2json(wb, "\t\t\t", "crit", rc->critical->source, ",\n");
2605         health_string2json(wb, "\t\t\t", "crit_parsed", rc->critical->parsed_as, ",\n");
2606     }
2607
2608     buffer_strcat(wb, "\t\t\t\"green\":");
2609     buffer_rrd_value(wb, rc->green);
2610     buffer_strcat(wb, ",\n");
2611
2612     buffer_strcat(wb, "\t\t\t\"red\":");
2613     buffer_rrd_value(wb, rc->red);
2614     buffer_strcat(wb, ",\n");
2615
2616     buffer_strcat(wb, "\t\t\t\"value\":");
2617     buffer_rrd_value(wb, rc->value);
2618     buffer_strcat(wb, "\n");
2619
2620     buffer_strcat(wb, "\t\t}");
2621 }
2622
2623 //void health_rrdcalctemplate2json_nolock(BUFFER *wb, RRDCALCTEMPLATE *rt) {
2624 //
2625 //}
2626
2627 void health_alarms2json(RRDHOST *host, BUFFER *wb, int all) {
2628     int i;
2629
2630     rrdhost_rdlock(&localhost);
2631     buffer_sprintf(wb, "{\n\t\"hostname\": \"%s\","
2632                         "\n\t\"latest_alarm_log_unique_id\": %u,"
2633                         "\n\t\"status\": %s,"
2634                         "\n\t\"now\": %lu,"
2635                         "\n\t\"alarms\": {\n",
2636                         host->hostname,
2637                         (host->health_log.next_log_id > 0)?(host->health_log.next_log_id - 1):0,
2638                         health_enabled?"true":"false",
2639                         (unsigned long)now_realtime_sec());
2640
2641     RRDCALC *rc;
2642     for(i = 0, rc = host->alarms; rc ; rc = rc->next) {
2643         if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
2644             continue;
2645
2646         if(likely(!all && !(rc->status == RRDCALC_STATUS_WARNING || rc->status == RRDCALC_STATUS_CRITICAL)))
2647             continue;
2648
2649         if(likely(i)) buffer_strcat(wb, ",\n");
2650         health_rrdcalc2json_nolock(wb, rc);
2651         i++;
2652     }
2653
2654 //    buffer_strcat(wb, "\n\t},\n\t\"templates\": {");
2655 //    RRDCALCTEMPLATE *rt;
2656 //    for(rt = host->templates; rt ; rt = rt->next)
2657 //        health_rrdcalctemplate2json_nolock(wb, rt);
2658
2659     buffer_strcat(wb, "\n\t}\n}\n");
2660     rrdhost_unlock(&localhost);
2661 }
2662
2663
2664 // ----------------------------------------------------------------------------
2665 // re-load health configuration
2666
2667 static inline void health_free_all_nolock(RRDHOST *host) {
2668     while(host->templates)
2669         rrdcalctemplate_free(host, host->templates);
2670
2671     while(host->alarms)
2672         rrdcalc_free(host, host->alarms);
2673 }
2674
2675 void health_reload(void) {
2676     if(!health_enabled) {
2677         error("Health reload is requested, but health is not enabled.");
2678         return;
2679     }
2680
2681     char *path = health_config_dir();
2682
2683     // free all running alarms
2684     rrdhost_rwlock(&localhost);
2685     health_free_all_nolock(&localhost);
2686     rrdhost_unlock(&localhost);
2687
2688     // invalidate all previous entries in the alarm log
2689     ALARM_ENTRY *t;
2690     for(t = localhost.health_log.alarms ; t ; t = t->next) {
2691         if(t->new_status != RRDCALC_STATUS_REMOVED)
2692             t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
2693     }
2694
2695     // reset all thresholds to all charts
2696     RRDSET *st;
2697     for(st = localhost.rrdset_root; st ; st = st->next) {
2698         st->green = NAN;
2699         st->red = NAN;
2700     }
2701
2702     // load the new alarms
2703     rrdhost_rwlock(&localhost);
2704     health_readdir(path);
2705     rrdhost_unlock(&localhost);
2706
2707     // link the loaded alarms to their charts
2708     for(st = localhost.rrdset_root; st ; st = st->next) {
2709         rrdhost_rwlock(&localhost);
2710
2711         rrdsetcalc_link_matching(st);
2712         rrdcalctemplate_link_matching(st);
2713
2714         rrdhost_unlock(&localhost);
2715     }
2716 }
2717
2718 // ----------------------------------------------------------------------------
2719 // health main thread and friends
2720
2721 static inline int rrdcalc_value2status(calculated_number n) {
2722     if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED;
2723     if(n) return RRDCALC_STATUS_RAISED;
2724     return RRDCALC_STATUS_CLEAR;
2725 }
2726
2727 #define ALARM_EXEC_COMMAND_LENGTH 8192
2728
2729 static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
2730     ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
2731
2732     if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
2733         // do not send notifications for internal statuses
2734         debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
2735         goto done;
2736     }
2737
2738     if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
2739         // do not send notifications for disabled statuses
2740         debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
2741         // mark it as run, so that we will send the same alarm if it happens again
2742         goto done;
2743     }
2744
2745     // find the previous notification for the same alarm
2746     // which we have run the exec script
2747     // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set
2748     if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
2749         uint32_t id = ae->alarm_id;
2750         ALARM_ENTRY *t;
2751         for(t = ae->next; t ; t = t->next) {
2752             if(t->alarm_id == id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
2753                 break;
2754         }
2755
2756         if(likely(t)) {
2757             // we have executed this alarm notification in the past
2758             if(t && t->new_status == ae->new_status) {
2759                 // don't send the notification for the same status again
2760                 debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name
2761                       , rrdcalc_status2string(ae->new_status));
2762                 goto done;
2763             }
2764         }
2765         else {
2766             // we have not executed this alarm notification in the past
2767             // so, don't send CLEAR notifications
2768             if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
2769                 debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
2770                       , ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
2771                 goto done;
2772             }
2773         }
2774     }
2775
2776     static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1];
2777     pid_t command_pid;
2778
2779     const char *exec = ae->exec;
2780     if(!exec) exec = health.health_default_exec;
2781
2782     const char *recipient = ae->recipient;
2783     if(!recipient) recipient = health.health_default_recipient;
2784
2785     snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s' '%s' '%s'",
2786               exec,
2787               recipient,
2788               host->hostname,
2789               ae->unique_id,
2790               ae->alarm_id,
2791               ae->alarm_event_id,
2792               (unsigned long)ae->when,
2793               ae->name,
2794               ae->chart?ae->chart:"NOCAHRT",
2795               ae->family?ae->family:"NOFAMILY",
2796               rrdcalc_status2string(ae->new_status),
2797               rrdcalc_status2string(ae->old_status),
2798               ae->new_value,
2799               ae->old_value,
2800               ae->source?ae->source:"UNKNOWN",
2801               (uint32_t)ae->duration,
2802               (uint32_t)ae->non_clear_duration,
2803               ae->units?ae->units:"",
2804               ae->info?ae->info:"",
2805               ae->new_value_string,
2806               ae->old_value_string
2807     );
2808
2809     ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
2810     ae->exec_run_timestamp = now_realtime_sec();
2811
2812     debug(D_HEALTH, "executing command '%s'", command_to_run);
2813     FILE *fp = mypopen(command_to_run, &command_pid);
2814     if(!fp) {
2815         error("HEALTH: Cannot popen(\"%s\", \"r\").", command_to_run);
2816         goto done;
2817     }
2818     debug(D_HEALTH, "HEALTH reading from command");
2819     char *s = fgets(command_to_run, FILENAME_MAX, fp);
2820     (void)s;
2821     ae->exec_code = mypclose(fp, command_pid);
2822     debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
2823
2824     if(ae->exec_code != 0)
2825         ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
2826
2827 done:
2828     health_alarm_log_save(host, ae);
2829     return;
2830 }
2831
2832 static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
2833     debug(D_HEALTH, "Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
2834          ae->chart?ae->chart:"NOCHART", ae->name,
2835          ae->new_value,
2836          rrdcalc_status2string(ae->old_status),
2837          rrdcalc_status2string(ae->new_status)
2838     );
2839
2840     health_alarm_execute(host, ae);
2841 }
2842
2843 static inline void health_alarm_log_process(RRDHOST *host) {
2844     static uint32_t stop_at_id = 0;
2845     uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
2846     time_t now = now_realtime_sec();
2847
2848     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
2849
2850     ALARM_ENTRY *ae;
2851     for(ae = host->health_log.alarms; ae && ae->unique_id >= stop_at_id ; ae = ae->next) {
2852         if(unlikely(
2853             !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
2854             !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
2855             )) {
2856
2857             if(unlikely(ae->unique_id < first_waiting))
2858                 first_waiting = ae->unique_id;
2859
2860             if(likely(now >= ae->delay_up_to_timestamp))
2861                 health_process_notifications(host, ae);
2862         }
2863     }
2864
2865     // remember this for the next iteration
2866     stop_at_id = first_waiting;
2867
2868     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2869
2870     if(host->health_log.count <= host->health_log.max)
2871         return;
2872
2873     // cleanup excess entries in the log
2874     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
2875
2876     ALARM_ENTRY *last = NULL;
2877     unsigned int count = host->health_log.max * 2 / 3;
2878     for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
2879
2880     if(ae && last && last->next == ae)
2881         last->next = NULL;
2882     else
2883         ae = NULL;
2884
2885     while(ae) {
2886         debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
2887
2888         ALARM_ENTRY *t = ae->next;
2889
2890         freez(ae->name);
2891         freez(ae->chart);
2892         freez(ae->family);
2893         freez(ae->exec);
2894         freez(ae->recipient);
2895         freez(ae->source);
2896         freez(ae->units);
2897         freez(ae->info);
2898         freez(ae->old_value_string);
2899         freez(ae->new_value_string);
2900         freez(ae);
2901
2902         ae = t;
2903         host->health_log.count--;
2904     }
2905
2906     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2907 }
2908
2909 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
2910     if(unlikely(!rc->rrdset)) {
2911         debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
2912         return 0;
2913     }
2914
2915     if(unlikely(rc->next_update > now)) {
2916         if (unlikely(*next_run > rc->next_update)) {
2917             // update the next_run time of the main loop
2918             // to run this alarm precisely the time required
2919             *next_run = rc->next_update;
2920         }
2921
2922         debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
2923         return 0;
2924     }
2925
2926     if(unlikely(!rc->update_every)) {
2927         debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
2928         return 0;
2929     }
2930
2931     if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
2932         debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
2933         return 0;
2934     }
2935
2936     int update_every = rc->rrdset->update_every;
2937     time_t first = rrdset_first_entry_t(rc->rrdset);
2938     time_t last = rrdset_last_entry_t(rc->rrdset);
2939
2940     if(unlikely(now + update_every < first /* || now - update_every > last */)) {
2941         debug(D_HEALTH
2942               , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
2943               , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first
2944               , (unsigned long) last);
2945         return 0;
2946     }
2947
2948     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
2949         time_t needed = now + rc->before + rc->after;
2950
2951         if(needed + update_every < first || needed - update_every > last) {
2952             debug(D_HEALTH
2953                   , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
2954                   , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first
2955                   , (unsigned long) last);
2956             return 0;
2957         }
2958     }
2959
2960     return 1;
2961 }
2962
2963 void *health_main(void *ptr) {
2964     struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
2965
2966     info("HEALTH thread created with task id %d", gettid());
2967
2968     if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
2969         error("Cannot set pthread cancel type to DEFERRED.");
2970
2971     if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
2972         error("Cannot set pthread cancel state to ENABLE.");
2973
2974     int min_run_every = (int)config_get_number("health", "run at least every seconds", 10);
2975     if(min_run_every < 1) min_run_every = 1;
2976
2977     BUFFER *wb = buffer_create(100);
2978
2979     unsigned int loop = 0;
2980     while(health_enabled && !netdata_exit) {
2981         loop++;
2982         debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
2983
2984         int oldstate, runnable = 0;
2985         time_t now = now_realtime_sec();
2986         time_t next_run = now + min_run_every;
2987         RRDCALC *rc;
2988
2989         if(unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
2990             error("Cannot set pthread cancel state to DISABLE.");
2991
2992         rrdhost_rdlock(&localhost);
2993
2994         // the first loop is to lookup values from the db
2995         for(rc = localhost.alarms; rc; rc = rc->next) {
2996             if(unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
2997                 if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))
2998                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE;
2999                 continue;
3000             }
3001
3002             runnable++;
3003             rc->old_value = rc->value;
3004             rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE;
3005
3006             // 1. if there is database lookup, do it
3007             // 2. if there is calculation expression, run it
3008
3009             if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
3010                 /* time_t old_db_timestamp = rc->db_before; */
3011                 int value_is_null = 0;
3012
3013                 int ret = rrd2value(rc->rrdset, wb, &rc->value,
3014                                     rc->dimensions, 1, rc->after, rc->before, rc->group,
3015                                     rc->options, &rc->db_after, &rc->db_before, &value_is_null);
3016
3017                 if (unlikely(ret != 200)) {
3018                     // database lookup failed
3019                     rc->value = NAN;
3020
3021                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
3022
3023                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) {
3024                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
3025                         error("Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
3026                     }
3027                 }
3028                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))
3029                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
3030
3031                 /* - RRDCALC_FLAG_DB_STALE not currently used
3032                 if (unlikely(old_db_timestamp == rc->db_before)) {
3033                     // database is stale
3034
3035                     debug(D_HEALTH, "Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
3036
3037                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
3038                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
3039                         error("Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
3040                     }
3041                 }
3042                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
3043                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
3044                 */
3045
3046                 if (unlikely(value_is_null)) {
3047                     // collected value is null
3048
3049                     rc->value = NAN;
3050
3051                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
3052                           rc->chart?rc->chart:"NOCHART", rc->name);
3053
3054                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) {
3055                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
3056                         error("Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
3057                               rc->chart?rc->chart:"NOCHART", rc->name);
3058                     }
3059                 }
3060                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))
3061                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
3062
3063                 debug(D_HEALTH, "Health alarm '%s.%s': database lookup gave value "
3064                         CALCULATED_NUMBER_FORMAT, rc->chart?rc->chart:"NOCHART", rc->name, rc->value);
3065             }
3066
3067             if(unlikely(rc->calculation)) {
3068                 if (unlikely(!expression_evaluate(rc->calculation))) {
3069                     // calculation failed
3070
3071                     rc->value = NAN;
3072
3073                     debug(D_HEALTH, "Health alarm '%s.%s': expression '%s' failed: %s",
3074                           rc->chart?rc->chart:"NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
3075
3076                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) {
3077                         rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
3078                         error("Health alarm '%s.%s': expression '%s' failed: %s",
3079                               rc->chart?rc->chart:"NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
3080                     }
3081                 }
3082                 else {
3083                     if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))
3084                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
3085
3086                     debug(D_HEALTH, "Health alarm '%s.%s': expression '%s' gave value "
3087                             CALCULATED_NUMBER_FORMAT
3088                             ": %s (source: %s)",
3089                           rc->chart?rc->chart:"NOCHART", rc->name,
3090                           rc->calculation->parsed_as,
3091                           rc->calculation->result,
3092                           buffer_tostring(rc->calculation->error_msg),
3093                           rc->source
3094                     );
3095
3096                     rc->value = rc->calculation->result;
3097                 }
3098             }
3099         }
3100         rrdhost_unlock(&localhost);
3101
3102         if(unlikely(runnable && !netdata_exit)) {
3103             rrdhost_rdlock(&localhost);
3104
3105             for(rc = localhost.alarms; rc; rc = rc->next) {
3106                 if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
3107                     continue;
3108
3109                 int warning_status  = RRDCALC_STATUS_UNDEFINED;
3110                 int critical_status = RRDCALC_STATUS_UNDEFINED;
3111
3112                 if(likely(rc->warning)) {
3113                     if(unlikely(!expression_evaluate(rc->warning))) {
3114                         // calculation failed
3115
3116                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression failed with error: %s",
3117                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
3118
3119                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) {
3120                             rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
3121                             error("Health alarm '%s.%s': warning expression failed with error: %s",
3122                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
3123                         }
3124                     }
3125                     else {
3126                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))
3127                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
3128
3129                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression gave value "
3130                                 CALCULATED_NUMBER_FORMAT
3131                                 ": %s (source: %s)",
3132                               rc->chart?rc->chart:"NOCHART", rc->name,
3133                               rc->warning->result,
3134                               buffer_tostring(rc->warning->error_msg),
3135                               rc->source
3136                         );
3137
3138                         warning_status = rrdcalc_value2status(rc->warning->result);
3139                     }
3140                 }
3141
3142                 if(likely(rc->critical)) {
3143                     if(unlikely(!expression_evaluate(rc->critical))) {
3144                         // calculation failed
3145
3146                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression failed with error: %s",
3147                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
3148
3149                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) {
3150                             rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
3151                             error("Health alarm '%s.%s': critical expression failed with error: %s",
3152                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
3153                         }
3154                     }
3155                     else {
3156                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))
3157                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
3158
3159                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression gave value "
3160                                 CALCULATED_NUMBER_FORMAT
3161                                 ": %s (source: %s)",
3162                               rc->chart?rc->chart:"NOCHART", rc->name,
3163                               rc->critical->result,
3164                               buffer_tostring(rc->critical->error_msg),
3165                               rc->source
3166                         );
3167
3168                         critical_status = rrdcalc_value2status(rc->critical->result);
3169                     }
3170                 }
3171
3172                 int status = RRDCALC_STATUS_UNDEFINED;
3173
3174                 switch(warning_status) {
3175                     case RRDCALC_STATUS_CLEAR:
3176                         status = RRDCALC_STATUS_CLEAR;
3177                         break;
3178
3179                     case RRDCALC_STATUS_RAISED:
3180                         status = RRDCALC_STATUS_WARNING;
3181                         break;
3182
3183                     default:
3184                         break;
3185                 }
3186
3187                 switch(critical_status) {
3188                     case RRDCALC_STATUS_CLEAR:
3189                         if(status == RRDCALC_STATUS_UNDEFINED)
3190                             status = RRDCALC_STATUS_CLEAR;
3191                         break;
3192
3193                     case RRDCALC_STATUS_RAISED:
3194                         status = RRDCALC_STATUS_CRITICAL;
3195                         break;
3196
3197                     default:
3198                         break;
3199                 }
3200
3201                 if(status != rc->status) {
3202                     int delay = 0;
3203
3204                     if(now > rc->delay_up_to_timestamp) {
3205                         rc->delay_up_current = rc->delay_up_duration;
3206                         rc->delay_down_current = rc->delay_down_duration;
3207                         rc->delay_last = 0;
3208                         rc->delay_up_to_timestamp = 0;
3209                     }
3210                     else {
3211                         rc->delay_up_current = (int)(rc->delay_up_current * rc->delay_multiplier);
3212                         if(rc->delay_up_current > rc->delay_max_duration) rc->delay_up_current = rc->delay_max_duration;
3213
3214                         rc->delay_down_current = (int)(rc->delay_down_current * rc->delay_multiplier);
3215                         if(rc->delay_down_current > rc->delay_max_duration) rc->delay_down_current = rc->delay_max_duration;
3216                     }
3217
3218                     if(status > rc->status)
3219                         delay = rc->delay_up_current;
3220                     else
3221                         delay = rc->delay_down_current;
3222
3223                     // The check below is intentionally disabled, because we do need to send raising alarms:
3224                     // if(now + delay < rc->delay_up_to_timestamp)
3225                     //    delay = (int)(rc->delay_up_to_timestamp - now);
3226
3227                     rc->delay_last = delay;
3228                     rc->delay_up_to_timestamp = now + delay;
3229                     health_alarm_log(
3230                             &localhost,
3231                             rc->id,
3232                             rc->next_event_id++,
3233                             now,
3234                             rc->name,
3235                             rc->rrdset->id,
3236                             rc->rrdset->family,
3237                             rc->exec,
3238                             rc->recipient,
3239                             now - rc->last_status_change,
3240                             rc->old_value,
3241                             rc->value,
3242                             rc->status,
3243                             status,
3244                             rc->source,
3245                             rc->units,
3246                             rc->info,
3247                             rc->delay_last,
3248                             (rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)?HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION:0
3249                     );
3250                     rc->last_status_change = now;
3251                     rc->status = status;
3252                 }
3253
3254                 rc->last_updated = now;
3255                 rc->next_update = now + rc->update_every;
3256
3257                 if (next_run > rc->next_update)
3258                     next_run = rc->next_update;
3259             }
3260
3261             rrdhost_unlock(&localhost);
3262         }
3263
3264         if (unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
3265             error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
3266
3267         if(unlikely(netdata_exit))
3268             break;
3269
3270         // execute notifications
3271         // and cleanup
3272         health_alarm_log_process(&localhost);
3273
3274         if(unlikely(netdata_exit))
3275             break;
3276         
3277         now = now_realtime_sec();
3278         if(now < next_run) {
3279             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs",
3280                   loop, (int) (next_run - now));
3281             sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now));
3282         }
3283         else {
3284             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
3285         }
3286     }
3287
3288     buffer_free(wb);
3289
3290     info("HEALTH thread exiting");
3291
3292     static_thread->enabled = 0;
3293     pthread_exit(NULL);
3294     return NULL;
3295 }