// arthur.barton.de Git - netdata.git / blob - src/health.c
// commit subject: properly rotate the health log file
// [netdata.git] / src / health.c
1 #include "common.h"
2
3 #define RRDVAR_MAX_LENGTH 1024
4
// Health defaults and the state of the alarm log file, kept in one place.
struct health_options {
    const char *health_default_exec;        // default notification command
    const char *health_default_recipient;   // default notification recipient
    const char *log_filename;               // path of the alarm log file
    size_t log_entries_written;             // log lines counted so far (saved or read back)
    FILE *log_fp;                           // open alarm log stream, NULL while closed
};
12
13 static struct health_options health = {
14     .health_default_exec = PLUGINS_DIR "/alarm-notify.sh",
15     .health_default_recipient = "root",
16     .log_filename = VARLIB_DIR "/health/alarm_log.db",
17     .log_entries_written = 0,
18     .log_fp = NULL
19 };
20
21 int health_enabled = 1;
22
23 // ----------------------------------------------------------------------------
24 // health alarm log load/save
25 // no need for locking - only one thread is reading / writing the alarms log
26
27 static inline int health_alarm_log_open(void) {
28     if(health.log_fp)
29         fclose(health.log_fp);
30
31     health.log_fp = fopen(health.log_filename, "a");
32
33     if(health.log_fp) {
34         if (setvbuf(health.log_fp, NULL, _IOLBF, 0) != 0)
35             error("Health: cannot set line buffering on health log file.");
36         return 0;
37     }
38
39     error("Health: cannot open health log file '%s'. Health data will be lost in case of netdata or server crash.", health.log_filename);
40     return -1;
41 }
42
43 static inline void health_alarm_log_close(void) {
44     if(health.log_fp) {
45         fclose(health.log_fp);
46         health.log_fp = NULL;
47     }
48 }
49
50 static inline void health_log_rotate(void) {
51     static size_t rotate_every = 0;
52
53     if(unlikely(rotate_every == 0)) {
54         rotate_every = (size_t)config_get_number("health", "rotate log every lines", 100);
55         if(rotate_every < 100) rotate_every = 100;
56     }
57
58     if(unlikely(health.log_entries_written > rotate_every)) {
59         health_alarm_log_close();
60
61         char old_filename[FILENAME_MAX + 1];
62         snprintfz(old_filename, FILENAME_MAX, "%s.old", health.log_filename);
63
64         if(unlink(old_filename) == -1 && errno != ENOENT)
65             error("Health: cannot remove old alarms log file '%s'", old_filename);
66
67         if(link(health.log_filename, old_filename) == -1 && errno != ENOENT)
68             error("Health: cannot move file '%s' to '%s'.", health.log_filename, old_filename);
69
70         if(unlink(health.log_filename) == -1 && errno != ENOENT)
71             error("Health: cannot remove old alarms log file '%s'", health.log_filename);
72
73         health_alarm_log_open();
74     }
75 }
76
77 static inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
78     health_log_rotate();
79
80     if(likely(health.log_fp)) {
81         if(unlikely(fprintf(health.log_fp
82                 , "%c\t%s"
83                   "\t%08x\t%08x\t%08x\t%08x\t%08x"
84                   "\t%08x\t%08x\t%08x"
85                   "\t%08x\t%08x\t%08x"
86                   "\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s"
87                   "\t%d\t%d\t%d\t%d"
88                   "\t%Lf\t%Lf"
89                   "\n"
90                 , (ae->flags & HEALTH_ENTRY_FLAG_SAVED)?'U':'A'
91                 , host->hostname
92
93                 , ae->unique_id
94                 , ae->alarm_id
95                 , ae->alarm_event_id
96                 , ae->updated_by_id
97                 , ae->updates_id
98
99                 , (uint32_t)ae->when
100                 , (uint32_t)ae->duration
101                 , (uint32_t)ae->non_clear_duration
102                 , (uint32_t)ae->flags
103                 , (uint32_t)ae->exec_run_timestamp
104                 , (uint32_t)ae->delay_up_to_timestamp
105
106                 , (ae->name)?ae->name:""
107                 , (ae->chart)?ae->chart:""
108                 , (ae->family)?ae->family:""
109                 , (ae->exec)?ae->exec:""
110                 , (ae->recipient)?ae->recipient:""
111                 , (ae->source)?ae->source:""
112                 , (ae->units)?ae->units:""
113                 , (ae->info)?ae->info:""
114
115                 , ae->exec_code
116                 , ae->new_status
117                 , ae->old_status
118                 , ae->delay
119
120                 , (long double)ae->new_value
121                 , (long double)ae->old_value
122         ) < 0))
123             error("Health: failed to save alarm log entry. Health data may be lost in case of abnormal restart.");
124         else {
125             ae->flags |= HEALTH_ENTRY_FLAG_SAVED;
126             health.log_entries_written++;
127         }
128     }
129 }
130
131 static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filename) {
132     uint32_t max_unique_id = 0, max_alarm_id = 0;
133     ssize_t loaded = -1, updated = -1, errored = -1, duplicate = -1;
134
135     errno = 0;
136
137     char *s, *buf = mallocz(65536 + 1);
138     size_t line = 0, len = 0;
139     loaded = updated = errored = duplicate = 0;
140
141     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
142
143     while((s = fgets_trim_len(buf, 65536, fp, &len))) {
144         health.log_entries_written++;
145         line++;
146
147         int max_entries = 30, entries = 0;
148         char *pointers[max_entries];
149
150         pointers[entries++] = s++;
151         while(*s) {
152             if(unlikely(*s == '\t')) {
153                 *s = '\0';
154                 pointers[entries++] = ++s;
155                 if(entries >= max_entries) {
156                     error("Health: line %zu of file '%s' has more than %d entries. Ignoring excessive entries.", line, filename, max_entries);
157                     break;
158                 }
159             }
160             else s++;
161         }
162
163         if(likely(*pointers[0] == 'U' || *pointers[0] == 'A')) {
164             ALARM_ENTRY *ae = NULL;
165
166             if(entries < 26) {
167                 error("Health: line %zu of file '%s' should have at least 26 entries, but it has %d. Ignoring line.", line, filename, entries);
168                 errored++;
169                 continue;
170             }
171
172             // check that we have valid ids
173             uint32_t unique_id = (uint32_t)strtoul(pointers[2], NULL, 16);
174             if(!unique_id) {
175                 error("Health: line %zu of file '%s' states alarm entry with unique id %u (%s). Ignoring line.", line, filename, unique_id, pointers[2]);
176                 errored++;
177                 continue;
178             }
179
180             uint32_t alarm_id = (uint32_t)strtoul(pointers[3], NULL, 16);
181             if(!alarm_id) {
182                 error("Health: line %zu of file '%s' states alarm entry for alarm id %u (%s). Ignoring line.", line, filename, alarm_id, pointers[3]);
183                 errored++;
184                 continue;
185             }
186
187             // find a possible overwrite
188             for(ae = host->health_log.alarms; ae; ae = ae->next) {
189                 if(unlikely(ae->unique_id == unique_id)) {
190                     if(unlikely(*pointers[0] == 'A')) {
191                         error("Health: line %zu of file '%s' adds duplicate alarm log entry with unique id %u."
192                               , line, filename, unique_id);
193                         *pointers[0] = 'U';
194                         duplicate++;
195                     }
196                     break;
197                 }
198             }
199
200             // if not found, create a new one
201             if(likely(!ae)) {
202
203                 // if it is an update, but we haven't found it, make it an addition
204                 if(unlikely(*pointers[0] == 'U')) {
205                     *pointers[0] = 'A';
206                     error("Health: line %zu of file '%s' updates alarm log entry with unique id %u, but it is not found.", line, filename, unique_id);
207                 }
208
209                 // alarms should be added in the right order
210                 if(unlikely(unique_id < max_unique_id)) {
211                     error("Health: line %zu of file '%s' has alarm log entry with %u in wrong order.", line
212                           , filename, ae->unique_id);
213                 }
214
215                 ae = callocz(1, sizeof(ALARM_ENTRY));
216             }
217
218             // check for a possible host missmatch
219             if(strcmp(pointers[1], host->hostname))
220                 error("Health: line %zu of file '%s' provides an alarm for host '%s' but this is named '%s'.", line, filename, pointers[1], host->hostname);
221
222             ae->unique_id               = unique_id;
223             ae->alarm_id                = alarm_id;
224             ae->alarm_event_id          = (uint32_t)strtoul(pointers[4], NULL, 16);
225             ae->updated_by_id           = (uint32_t)strtoul(pointers[5], NULL, 16);
226             ae->updates_id              = (uint32_t)strtoul(pointers[6], NULL, 16);
227
228             ae->when                    = (uint32_t)strtoul(pointers[7], NULL, 16);
229             ae->duration                = (uint32_t)strtoul(pointers[8], NULL, 16);
230             ae->non_clear_duration      = (uint32_t)strtoul(pointers[9], NULL, 16);
231
232             ae->flags                   = (uint32_t)strtoul(pointers[10], NULL, 16);
233             ae->flags |= HEALTH_ENTRY_FLAG_SAVED;
234
235             ae->exec_run_timestamp      = (uint32_t)strtoul(pointers[11], NULL, 16);
236             ae->delay_up_to_timestamp   = (uint32_t)strtoul(pointers[12], NULL, 16);
237
238             if(unlikely(ae->name)) freez(ae->name);
239             ae->name = strdupz(pointers[13]);
240
241             if(unlikely(ae->chart)) freez(ae->chart);
242             ae->chart = strdupz(pointers[14]);
243
244             if(unlikely(ae->family)) freez(ae->family);
245             ae->family = strdupz(pointers[15]);
246
247             if(unlikely(ae->exec)) freez(ae->exec);
248             ae->exec = strdupz(pointers[16]);
249             if(!*ae->exec) { freez(ae->exec); ae->exec = NULL; }
250
251             if(unlikely(ae->recipient)) freez(ae->recipient);
252             ae->recipient = strdupz(pointers[17]);
253             if(!*ae->recipient) { freez(ae->recipient); ae->recipient = NULL; }
254
255             if(unlikely(ae->source)) freez(ae->source);
256             ae->source = strdupz(pointers[18]);
257             if(!*ae->source) { freez(ae->source); ae->source = NULL; }
258
259             if(unlikely(ae->units)) freez(ae->units);
260             ae->units = strdupz(pointers[19]);
261             if(!*ae->units) { freez(ae->units); ae->units = NULL; }
262
263             if(unlikely(ae->info)) freez(ae->info);
264             ae->info = strdupz(pointers[20]);
265             if(!*ae->info) { freez(ae->info); ae->info = NULL; }
266
267             ae->exec_code   = atoi(pointers[21]);
268             ae->new_status  = atoi(pointers[22]);
269             ae->old_status  = atoi(pointers[23]);
270             ae->delay       = atoi(pointers[24]);
271
272             ae->new_value   = strtold(pointers[25], NULL);
273             ae->old_value   = strtold(pointers[26], NULL);
274
275             // add it to host if not already there
276             if(unlikely(*pointers[0] == 'A')) {
277                 ae->next = host->health_log.alarms;
278                 host->health_log.alarms = ae;
279                 loaded++;
280             }
281             else updated++;
282
283             if(unlikely(ae->unique_id > max_unique_id))
284                 max_unique_id = ae->unique_id;
285
286             if(unlikely(ae->alarm_id >= max_alarm_id))
287                 max_alarm_id = ae->alarm_id;
288         }
289         else {
290             error("Health: line %zu of file '%s' is invalid (unrecognized entry type '%s').", line, filename, pointers[0]);
291             errored++;
292         }
293     }
294
295     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
296
297     freez(buf);
298
299     if(!max_unique_id) max_unique_id = (uint32_t)time(NULL);
300     if(!max_alarm_id)  max_alarm_id  = (uint32_t)time(NULL);
301
302     host->health_log.next_log_id = max_unique_id + 1;
303     host->health_log.next_alarm_id = max_alarm_id + 1;
304
305     info("Health: loaded file '%s' with %zd new alarm entries, updated %zd alarms, errors %zd entries, duplicate %zd", filename, loaded, updated, errored, duplicate);
306     return loaded;
307 }
308
309 static inline void health_alarm_log_load(RRDHOST *host) {
310     health_alarm_log_close();
311
312     char buffer[FILENAME_MAX + 1];
313     snprintfz(buffer, FILENAME_MAX, "%s.old", health.log_filename);
314     FILE *fp = fopen(buffer, "r");
315     if(!fp)
316         error("Health: cannot open health file: %s", health.log_filename);
317     else {
318         health_alarm_log_read(host, fp, health.log_filename);
319         fclose(fp);
320     }
321
322     health.log_entries_written = 0;
323     fp = fopen(health.log_filename, "r");
324     if(!fp)
325         error("Health: cannot open health file: %s", health.log_filename);
326     else {
327         health_alarm_log_read(host, fp, health.log_filename);
328         fclose(fp);
329     }
330
331     health_alarm_log_open();
332 }
333
334
335 // ----------------------------------------------------------------------------
336 // health alarm log management
337
338 static inline void health_alarm_log(RRDHOST *host,
339                 uint32_t alarm_id, uint32_t alarm_event_id,
340                 time_t when,
341                 const char *name, const char *chart, const char *family,
342                 const char *exec, const char *recipient, time_t duration,
343                 calculated_number old_value, calculated_number new_value,
344                 int old_status, int new_status,
345                 const char *source,
346                 const char *units,
347                 const char *info,
348                 int delay
349 ) {
350     debug(D_HEALTH, "Health adding alarm log entry with id: %u", host->health_log.next_log_id);
351
352     ALARM_ENTRY *ae = callocz(1, sizeof(ALARM_ENTRY));
353     ae->name = strdupz(name);
354     ae->hash_name = simple_hash(ae->name);
355
356     if(chart) {
357         ae->chart = strdupz(chart);
358         ae->hash_chart = simple_hash(ae->chart);
359     }
360
361     if(family)
362         ae->family = strdupz(family);
363
364     if(exec) ae->exec = strdupz(exec);
365     if(recipient) ae->recipient = strdupz(recipient);
366     if(source) ae->source = strdupz(source);
367     if(units) ae->units = strdupz(units);
368     if(info) ae->info = strdupz(info);
369
370     ae->unique_id = host->health_log.next_log_id++;
371     ae->alarm_id = alarm_id;
372     ae->alarm_event_id = alarm_event_id;
373     ae->when = when;
374     ae->old_value = old_value;
375     ae->new_value = new_value;
376     ae->old_status = old_status;
377     ae->new_status = new_status;
378     ae->duration = duration;
379     ae->delay = delay;
380     ae->delay_up_to_timestamp = when + delay;
381
382     if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL)
383         ae->non_clear_duration += ae->duration;
384
385     // link it
386     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
387     ae->next = host->health_log.alarms;
388     host->health_log.alarms = ae;
389     host->health_log.count++;
390     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
391
392     // match previous alarms
393     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
394     ALARM_ENTRY *t;
395     for(t = host->health_log.alarms ; t ; t = t->next) {
396         if(t != ae && t->alarm_id == ae->alarm_id) {
397             if(!(t->flags & HEALTH_ENTRY_FLAG_UPDATED) && !t->updated_by_id) {
398                 t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
399                 t->updated_by_id = ae->unique_id;
400                 ae->updates_id = t->unique_id;
401
402                 if((t->new_status == RRDCALC_STATUS_WARNING || t->new_status == RRDCALC_STATUS_CRITICAL) &&
403                    (t->old_status == RRDCALC_STATUS_WARNING || t->old_status == RRDCALC_STATUS_CRITICAL))
404                     ae->non_clear_duration += t->non_clear_duration;
405
406                 health_alarm_log_save(host, t);
407             }
408
409             // no need to continue
410             break;
411         }
412     }
413     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
414
415     health_alarm_log_save(host, ae);
416 }
417
418 // ----------------------------------------------------------------------------
419 // RRDVAR management
420
// Sanitize a variable name in place: every character that is not
// alphanumeric, a dot or an underscore is replaced with an underscore.
// Returns the number of characters replaced.
static inline int rrdvar_fix_name(char *variable) {
    int fixed = 0;

    for(; *variable; variable++) {
        // the <ctype.h> functions are undefined for negative values,
        // so bytes above 127 must not reach isalnum() as a signed char
        unsigned char c = (unsigned char)*variable;

        if(!isalnum(c) && c != '.' && c != '_') {
            *variable = '_';
            fixed++;
        }
    }

    return fixed;
}
434
435 int rrdvar_compare(void* a, void* b) {
436     if(((RRDVAR *)a)->hash < ((RRDVAR *)b)->hash) return -1;
437     else if(((RRDVAR *)a)->hash > ((RRDVAR *)b)->hash) return 1;
438     else return strcmp(((RRDVAR *)a)->name, ((RRDVAR *)b)->name);
439 }
440
441 static inline RRDVAR *rrdvar_index_add(avl_tree_lock *tree, RRDVAR *rv) {
442     RRDVAR *ret = (RRDVAR *)avl_insert_lock(tree, (avl *)(rv));
443     if(ret != rv)
444         debug(D_VARIABLES, "Request to insert RRDVAR '%s' into index failed. Already exists.", rv->name);
445
446     return ret;
447 }
448
449 static inline RRDVAR *rrdvar_index_del(avl_tree_lock *tree, RRDVAR *rv) {
450     RRDVAR *ret = (RRDVAR *)avl_remove_lock(tree, (avl *)(rv));
451     if(!ret)
452         error("Request to remove RRDVAR '%s' from index failed. Not Found.", rv->name);
453
454     return ret;
455 }
456
457 static inline RRDVAR *rrdvar_index_find(avl_tree_lock *tree, const char *name, uint32_t hash) {
458     RRDVAR tmp;
459     tmp.name = (char *)name;
460     tmp.hash = (hash)?hash:simple_hash(tmp.name);
461
462     return (RRDVAR *)avl_search_lock(tree, (avl *)&tmp);
463 }
464
465 static inline void rrdvar_free(RRDHOST *host, avl_tree_lock *tree, RRDVAR *rv) {
466     (void)host;
467
468     if(!rv) return;
469
470     if(tree)
471         rrdvar_index_del(tree, rv);
472
473     freez(rv->name);
474     freez(rv);
475 }
476
477 static inline RRDVAR *rrdvar_create_and_index(const char *scope, avl_tree_lock *tree, const char *name, int type, calculated_number *value) {
478     char *variable = strdupz(name);
479     rrdvar_fix_name(variable);
480     uint32_t hash = simple_hash(variable);
481
482     RRDVAR *rv = rrdvar_index_find(tree, variable, hash);
483     if(unlikely(!rv)) {
484         debug(D_VARIABLES, "Variable '%s' not found in scope '%s'. Creating a new one.", variable, scope);
485
486         rv = callocz(1, sizeof(RRDVAR));
487         rv->name = variable;
488         rv->hash = hash;
489         rv->type = type;
490         rv->value = value;
491
492         RRDVAR *ret = rrdvar_index_add(tree, rv);
493         if(unlikely(ret != rv)) {
494             debug(D_VARIABLES, "Variable '%s' in scope '%s' already exists", variable, scope);
495             rrdvar_free(NULL, NULL, rv);
496             rv = NULL;
497         }
498         else
499             debug(D_VARIABLES, "Variable '%s' created in scope '%s'", variable, scope);
500     }
501     else {
502         // already exists
503         freez(variable);
504         rv = NULL;
505     }
506
507     return rv;
508 }
509
510 // ----------------------------------------------------------------------------
511 // RRDVAR lookup
512
513 calculated_number rrdvar2number(RRDVAR *rv) {
514     switch(rv->type) {
515         case RRDVAR_TYPE_CALCULATED: {
516             calculated_number *n = (calculated_number *)rv->value;
517             return *n;
518         }
519
520         case RRDVAR_TYPE_TIME_T: {
521             time_t *n = (time_t *)rv->value;
522             return *n;
523         }
524
525         case RRDVAR_TYPE_COLLECTED: {
526             collected_number *n = (collected_number *)rv->value;
527             return *n;
528         }
529
530         case RRDVAR_TYPE_TOTAL: {
531             total_number *n = (total_number *)rv->value;
532             return *n;
533         }
534
535         case RRDVAR_TYPE_INT: {
536             int *n = (int *)rv->value;
537             return *n;
538         }
539
540         default:
541             error("I don't know how to convert RRDVAR type %d to calculated_number", rv->type);
542             return NAN;
543     }
544 }
545
546 void dump_variable(void *data) {
547     RRDVAR *rv = (RRDVAR *)data;
548     debug(D_HEALTH, "%50s : %20.5Lf", rv->name, rrdvar2number(rv));
549 }
550
551 int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, calculated_number *result) {
552     RRDSET *st = rc->rrdset;
553     RRDVAR *rv;
554
555     if(!st) return 0;
556
557     rv = rrdvar_index_find(&st->variables_root_index, variable, hash);
558     if(rv) {
559         *result = rrdvar2number(rv);
560         return 1;
561     }
562
563     rv = rrdvar_index_find(&st->rrdfamily->variables_root_index, variable, hash);
564     if(rv) {
565         *result = rrdvar2number(rv);
566         return 1;
567     }
568
569     rv = rrdvar_index_find(&st->rrdhost->variables_root_index, variable, hash);
570     if(rv) {
571         *result = rrdvar2number(rv);
572         return 1;
573     }
574
575     debug(D_HEALTH, "Available local chart '%s' variables:", st->id);
576     avl_traverse_lock(&st->variables_root_index, dump_variable);
577
578     debug(D_HEALTH, "Available family '%s' variables:", st->rrdfamily->family);
579     avl_traverse_lock(&st->rrdfamily->variables_root_index, dump_variable);
580
581     debug(D_HEALTH, "Available host '%s' variables:", st->rrdhost->hostname);
582     avl_traverse_lock(&st->rrdhost->variables_root_index, dump_variable);
583
584     return 0;
585 }
586
587 // ----------------------------------------------------------------------------
588 // RRDSETVAR management
589
590 RRDSETVAR *rrdsetvar_create(RRDSET *st, const char *variable, int type, void *value, uint32_t options) {
591     debug(D_VARIABLES, "RRDVARSET create for chart id '%s' name '%s' with variable name '%s'", st->id, st->name, variable);
592     RRDSETVAR *rs = (RRDSETVAR *)callocz(1, sizeof(RRDSETVAR));
593
594     char buffer[RRDVAR_MAX_LENGTH + 1];
595     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->id, variable);
596     rs->fullid = strdupz(buffer);
597
598     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, variable);
599     rs->fullname = strdupz(buffer);
600
601     rs->variable = strdupz(variable);
602
603     rs->type = type;
604     rs->value = value;
605     rs->options = options;
606     rs->rrdset = st;
607
608     rs->local       = rrdvar_create_and_index("local",  &st->variables_root_index, rs->variable, rs->type, rs->value);
609     rs->family      = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullid, rs->type, rs->value);
610     rs->host        = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullid, rs->type, rs->value);
611     rs->family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullname, rs->type, rs->value);
612     rs->host_name   = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullname, rs->type, rs->value);
613
614     rs->next = st->variables;
615     st->variables = rs;
616
617     return rs;
618 }
619
620 void rrdsetvar_rename_all(RRDSET *st) {
621     debug(D_VARIABLES, "RRDSETVAR rename for chart id '%s' name '%s'", st->id, st->name);
622
623     // only these 2 can change name
624     // rs->family_name
625     // rs->host_name
626
627     char buffer[RRDVAR_MAX_LENGTH + 1];
628     RRDSETVAR *rs, *next = st->variables;
629     while((rs = next)) {
630         next = rs->next;
631
632         snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rs->variable);
633
634         if (strcmp(buffer, rs->fullname)) {
635             // name changed
636             rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
637             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_name);
638
639             freez(rs->fullname);
640             rs->fullname = strdupz(st->name);
641             rs->family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullname, rs->type, rs->value);
642             rs->host_name   = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullname, rs->type, rs->value);
643         }
644     }
645
646     rrdsetcalc_link_matching(st);
647 }
648
649 void rrdsetvar_free(RRDSETVAR *rs) {
650     RRDSET *st = rs->rrdset;
651     debug(D_VARIABLES, "RRDSETVAR free for chart id '%s' name '%s', variable '%s'", st->id, st->name, rs->variable);
652
653     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local);
654     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family);
655     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host);
656     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
657     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_name);
658
659     if(st->variables == rs) {
660         st->variables = rs->next;
661     }
662     else {
663         RRDSETVAR *t;
664         for (t = st->variables; t && t->next != rs; t = t->next);
665         if(!t) error("RRDSETVAR '%s' not found in chart '%s' variables linked list", rs->fullname, st->id);
666         else t->next = rs->next;
667     }
668
669     freez(rs->fullid);
670     freez(rs->fullname);
671     freez(rs->variable);
672     freez(rs);
673 }
674
675 // ----------------------------------------------------------------------------
676 // RRDDIMVAR management
677
678 #define RRDDIMVAR_ID_MAX 1024
679
680 RRDDIMVAR *rrddimvar_create(RRDDIM *rd, int type, const char *prefix, const char *suffix, void *value, uint32_t options) {
681     RRDSET *st = rd->rrdset;
682
683     debug(D_VARIABLES, "RRDDIMSET create for chart id '%s' name '%s', dimension id '%s', name '%s%s%s'", st->id, st->name, rd->id, (prefix)?prefix:"", rd->name, (suffix)?suffix:"");
684
685     if(!prefix) prefix = "";
686     if(!suffix) suffix = "";
687
688     char buffer[RRDDIMVAR_ID_MAX + 1];
689     RRDDIMVAR *rs = (RRDDIMVAR *)callocz(1, sizeof(RRDDIMVAR));
690
691     rs->prefix = strdupz(prefix);
692     rs->suffix = strdupz(suffix);
693
694     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->id, rs->suffix);
695     rs->id = strdupz(buffer);
696
697     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
698     rs->name = strdupz(buffer);
699
700     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->id, rs->id);
701     rs->fullidid = strdupz(buffer);
702
703     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->id, rs->name);
704     rs->fullidname = strdupz(buffer);
705
706     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->name, rs->id);
707     rs->fullnameid = strdupz(buffer);
708
709     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->name, rs->name);
710     rs->fullnamename = strdupz(buffer);
711
712     rs->type = type;
713     rs->value = value;
714     rs->options = options;
715     rs->rrddim = rd;
716
717     rs->local_id     = rrdvar_create_and_index("local", &st->variables_root_index, rs->id, rs->type, rs->value);
718     rs->local_name   = rrdvar_create_and_index("local", &st->variables_root_index, rs->name, rs->type, rs->value);
719
720     rs->family_id    = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->id, rs->type, rs->value);
721     rs->family_name  = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->name, rs->type, rs->value);
722
723     rs->host_fullidid     = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullidid, rs->type, rs->value);
724     rs->host_fullidname   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullidname, rs->type, rs->value);
725     rs->host_fullnameid   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullnameid, rs->type, rs->value);
726     rs->host_fullnamename = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullnamename, rs->type, rs->value);
727
728     rs->next = rd->variables;
729     rd->variables = rs;
730
731     return rs;
732 }
733
734 void rrddimvar_rename_all(RRDDIM *rd) {
735     RRDSET *st = rd->rrdset;
736     debug(D_VARIABLES, "RRDDIMSET rename for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
737
738     RRDDIMVAR *rs, *next = rd->variables;
739     while((rs = next)) {
740         next = rs->next;
741
742         if (strcmp(rd->name, rs->name)) {
743             char buffer[RRDDIMVAR_ID_MAX + 1];
744             // name changed
745
746             // name
747             rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_name);
748             freez(rs->name);
749             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
750             rs->name = strdupz(buffer);
751             rs->local_name = rrdvar_create_and_index("local", &st->variables_root_index, rs->name, rs->type, rs->value);
752
753             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidname);
754             freez(rs->fullidname);
755             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->name);
756             rs->fullidname = strdupz(buffer);
757             rs->host_fullidname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index,
758                                                              rs->fullidname, rs->type, rs->value);
759
760             // fullnameid
761             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnameid);
762             freez(rs->fullnameid);
763             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->id);
764             rs->fullnameid = strdupz(buffer);
765             rs->host_fullnameid = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index,
766                                                           rs->fullnameid, rs->type, rs->value);
767
768             // fullnamename
769             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnamename);
770             freez(rs->fullnamename);
771             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->name);
772             rs->fullnamename = strdupz(buffer);
773             rs->host_fullnamename = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index,
774                                                           rs->fullnamename, rs->type, rs->value);
775         }
776     }
777 }
778
779 void rrddimvar_free(RRDDIMVAR *rs) {
780     RRDDIM *rd = rs->rrddim;
781     RRDSET *st = rd->rrdset;
782     debug(D_VARIABLES, "RRDDIMSET free for chart id '%s' name '%s', dimension id '%s', name '%s', prefix='%s', suffix='%s'", st->id, st->name, rd->id, rd->name, rs->prefix, rs->suffix);
783
784     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_id);
785     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_name);
786
787     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_id);
788     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
789
790     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidid);
791     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidname);
792     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnameid);
793     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnamename);
794
795     if(rd->variables == rs) {
796         debug(D_VARIABLES, "RRDDIMSET removing first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
797         rd->variables = rs->next;
798     }
799     else {
800         debug(D_VARIABLES, "RRDDIMSET removing non-first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
801         RRDDIMVAR *t;
802         for (t = rd->variables; t && t->next != rs; t = t->next) ;
803         if(!t) error("RRDDIMVAR '%s' not found in dimension '%s/%s' variables linked list", rs->name, st->id, rd->id);
804         else t->next = rs->next;
805     }
806
807     freez(rs->prefix);
808     freez(rs->suffix);
809     freez(rs->id);
810     freez(rs->name);
811     freez(rs->fullidid);
812     freez(rs->fullidname);
813     freez(rs->fullnameid);
814     freez(rs->fullnamename);
815     freez(rs);
816 }
817
818 // ----------------------------------------------------------------------------
819 // RRDCALC management
820
821 static inline const char *rrdcalc_status2string(int status) {
822     switch(status) {
823         case RRDCALC_STATUS_REMOVED:
824             return "REMOVED";
825
826         case RRDCALC_STATUS_UNDEFINED:
827             return "UNDEFINED";
828
829         case RRDCALC_STATUS_UNINITIALIZED:
830             return "UNINITIALIZED";
831
832         case RRDCALC_STATUS_CLEAR:
833             return "CLEAR";
834
835         case RRDCALC_STATUS_RAISED:
836             return "RAISED";
837
838         case RRDCALC_STATUS_WARNING:
839             return "WARNING";
840
841         case RRDCALC_STATUS_CRITICAL:
842             return "CRITICAL";
843
844         default:
845             error("Unknown alarm status %d", status);
846             return "UNKNOWN";
847     }
848 }
849
850 static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
851     debug(D_HEALTH, "Health linking alarm '%s.%s' to chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, st->rrdhost->hostname);
852
853     rc->last_status_change = time(NULL);
854     rc->rrdset = st;
855
856     rc->rrdset_next = st->alarms;
857     rc->rrdset_prev = NULL;
858     
859     if(rc->rrdset_next)
860         rc->rrdset_next->rrdset_prev = rc;
861
862     st->alarms = rc;
863
864     if(rc->update_every < rc->rrdset->update_every) {
865         error("Health alarm '%s.%s' has update every %d, less than chart update every %d. Setting alarm update frequency to %d.", rc->rrdset->id, rc->name, rc->update_every, rc->rrdset->update_every, rc->rrdset->update_every);
866         rc->update_every = rc->rrdset->update_every;
867     }
868
869     if(!isnan(rc->green) && isnan(st->green)) {
870         debug(D_HEALTH, "Health alarm '%s.%s' green threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->green, rc->green);
871         st->green = rc->green;
872     }
873
874     if(!isnan(rc->red) && isnan(st->red)) {
875         debug(D_HEALTH, "Health alarm '%s.%s' red threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->red, rc->red);
876         st->red = rc->red;
877     }
878
879     rc->local  = rrdvar_create_and_index("local",  &st->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
880     rc->family = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
881
882     char fullname[RRDVAR_MAX_LENGTH + 1];
883     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->id, rc->name);
884     rc->hostid   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
885
886     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rc->name);
887     rc->hostname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
888
889         if(!rc->units) rc->units = strdupz(st->units);
890
891     {
892         time_t now = time(NULL);
893         health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_UNINITIALIZED, rc->source, rc->units, rc->info, 0);
894     }
895 }
896
897 static inline int rrdcalc_is_matching_this_rrdset(RRDCALC *rc, RRDSET *st) {
898     if(     (rc->hash_chart == st->hash      && !strcmp(rc->chart, st->id)) ||
899             (rc->hash_chart == st->hash_name && !strcmp(rc->chart, st->name)))
900         return 1;
901
902     return 0;
903 }
904
905 // this has to be called while the RRDHOST is locked
906 inline void rrdsetcalc_link_matching(RRDSET *st) {
907     // debug(D_HEALTH, "find matching alarms for chart '%s'", st->id);
908
909     RRDCALC *rc;
910     for(rc = st->rrdhost->alarms; rc ; rc = rc->next) {
911         if(unlikely(rc->rrdset))
912             continue;
913
914         if(unlikely(rrdcalc_is_matching_this_rrdset(rc, st)))
915             rrdsetcalc_link(st, rc);
916     }
917 }
918
919 // this has to be called while the RRDHOST is locked
920 inline void rrdsetcalc_unlink(RRDCALC *rc) {
921     RRDSET *st = rc->rrdset;
922
923     if(!st) {
924         debug(D_HEALTH, "Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
925         error("Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
926         return;
927     }
928
929     {
930         time_t now = time(NULL);
931         health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0);
932     }
933
934     RRDHOST *host = st->rrdhost;
935
936     debug(D_HEALTH, "Health unlinking alarm '%s.%s' from chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, host->hostname);
937
938     // unlink it
939     if(rc->rrdset_prev)
940         rc->rrdset_prev->rrdset_next = rc->rrdset_next;
941
942     if(rc->rrdset_next)
943         rc->rrdset_next->rrdset_prev = rc->rrdset_prev;
944
945     if(st->alarms == rc)
946         st->alarms = rc->rrdset_next;
947
948     rc->rrdset_prev = rc->rrdset_next = NULL;
949
950     rrdvar_free(st->rrdhost, &st->variables_root_index, rc->local);
951     rc->local = NULL;
952
953     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rc->family);
954     rc->family = NULL;
955
956     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostid);
957     rc->hostid = NULL;
958
959     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostname);
960     rc->hostname = NULL;
961
962     rc->rrdset = NULL;
963
964     // RRDCALC will remain in RRDHOST
965     // so that if the matching chart is found in the future
966     // it will be applied automatically
967 }
968
969 RRDCALC *rrdcalc_find(RRDSET *st, const char *name) {
970     RRDCALC *rc;
971     uint32_t hash = simple_hash(name);
972
973     for( rc = st->alarms; rc ; rc = rc->rrdset_next ) {
974         if(unlikely(rc->hash == hash && !strcmp(rc->name, name)))
975             return rc;
976     }
977
978     return NULL;
979 }
980
981 static inline int rrdcalc_exists(RRDHOST *host, const char *chart, const char *name, uint32_t hash_chart, uint32_t hash_name) {
982     RRDCALC *rc;
983
984     if(unlikely(!chart)) {
985         error("attempt to find RRDCALC '%s' without giving a chart name", name);
986         return 1;
987     }
988
989     if(unlikely(!hash_chart)) hash_chart = simple_hash(chart);
990     if(unlikely(!hash_name))  hash_name  = simple_hash(name);
991
992     // make sure it does not already exist
993     for(rc = host->alarms; rc ; rc = rc->next) {
994         if (unlikely(rc->chart && rc->hash == hash_name && rc->hash_chart == hash_chart && !strcmp(name, rc->name) && !strcmp(chart, rc->chart))) {
995             debug(D_HEALTH, "Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
996             error("Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
997             return 1;
998         }
999     }
1000
1001     return 0;
1002 }
1003
1004 static inline uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const char *name, uint32_t *next_event_id) {
1005     if(chart && name) {
1006         uint32_t hash_chart = simple_hash(chart);
1007         uint32_t hash_name = simple_hash(name);
1008
1009         // re-use old IDs, by looking them up in the alarm log
1010         ALARM_ENTRY *ae;
1011         for(ae = host->health_log.alarms; ae ;ae = ae->next) {
1012             if(unlikely(ae->hash_name == hash_name && ae->hash_chart == hash_chart && !strcmp(name, ae->name) && !strcmp(chart, ae->chart))) {
1013                 if(next_event_id) *next_event_id = ae->alarm_event_id + 1;
1014                 return ae->alarm_id;
1015             }
1016         }
1017     }
1018
1019     return host->health_log.next_alarm_id++;
1020 }
1021
1022 static inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) {
1023     rrdhost_check_rdlock(host);
1024
1025     if(rc->calculation) {
1026         rc->calculation->status = &rc->status;
1027         rc->calculation->this = &rc->value;
1028         rc->calculation->after = &rc->db_after;
1029         rc->calculation->before = &rc->db_before;
1030         rc->calculation->rrdcalc = rc;
1031     }
1032
1033     if(rc->warning) {
1034         rc->warning->status = &rc->status;
1035         rc->warning->this = &rc->value;
1036         rc->warning->after = &rc->db_after;
1037         rc->warning->before = &rc->db_before;
1038         rc->warning->rrdcalc = rc;
1039     }
1040
1041     if(rc->critical) {
1042         rc->critical->status = &rc->status;
1043         rc->critical->this = &rc->value;
1044         rc->critical->after = &rc->db_after;
1045         rc->critical->before = &rc->db_before;
1046         rc->critical->rrdcalc = rc;
1047     }
1048
1049     // link it to the host
1050     if(likely(host->alarms)) {
1051         // append it
1052         RRDCALC *t;
1053         for(t = host->alarms; t && t->next ; t = t->next) ;
1054         t->next = rc;
1055     }
1056     else {
1057         host->alarms = rc;
1058     }
1059
1060     // link it to its chart
1061     RRDSET *st;
1062     for(st = host->rrdset_root; st ; st = st->next) {
1063         if(rrdcalc_is_matching_this_rrdset(rc, st)) {
1064             rrdsetcalc_link(st, rc);
1065             break;
1066         }
1067     }
1068 }
1069
1070 static inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart) {
1071
1072     debug(D_HEALTH, "Health creating dynamic alarm (from template) '%s.%s'", chart, rt->name);
1073
1074     if(rrdcalc_exists(host, chart, rt->name, 0, 0))
1075         return NULL;
1076
1077     RRDCALC *rc = callocz(1, sizeof(RRDCALC));
1078     rc->next_event_id = 1;
1079     rc->id = rrdcalc_get_unique_id(host, chart, rt->name, &rc->next_event_id);
1080     rc->name = strdupz(rt->name);
1081     rc->hash = simple_hash(rc->name);
1082     rc->chart = strdupz(chart);
1083     rc->hash_chart = simple_hash(rc->chart);
1084
1085     if(rt->dimensions) rc->dimensions = strdupz(rt->dimensions);
1086
1087     rc->green = rt->green;
1088     rc->red = rt->red;
1089     rc->value = NAN;
1090     rc->old_value = NAN;
1091
1092     rc->delay_up_duration = rt->delay_up_duration;
1093     rc->delay_down_duration = rt->delay_down_duration;
1094     rc->delay_max_duration = rt->delay_max_duration;
1095     rc->delay_multiplier = rt->delay_multiplier;
1096
1097     rc->group = rt->group;
1098     rc->after = rt->after;
1099     rc->before = rt->before;
1100     rc->update_every = rt->update_every;
1101     rc->options = rt->options;
1102
1103     if(rt->exec) rc->exec = strdupz(rt->exec);
1104     if(rt->recipient) rc->recipient = strdupz(rt->recipient);
1105     if(rt->source) rc->source = strdupz(rt->source);
1106     if(rt->units) rc->units = strdupz(rt->units);
1107     if(rt->info) rc->info = strdupz(rt->info);
1108
1109     if(rt->calculation) {
1110         rc->calculation = expression_parse(rt->calculation->source, NULL, NULL);
1111         if(!rc->calculation)
1112             error("Health alarm '%s.%s': failed to parse calculation expression '%s'", chart, rt->name, rt->calculation->source);
1113     }
1114     if(rt->warning) {
1115         rc->warning = expression_parse(rt->warning->source, NULL, NULL);
1116         if(!rc->warning)
1117             error("Health alarm '%s.%s': failed to re-parse warning expression '%s'", chart, rt->name, rt->warning->source);
1118     }
1119     if(rt->critical) {
1120         rc->critical = expression_parse(rt->critical->source, NULL, NULL);
1121         if(!rc->critical)
1122             error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", chart, rt->name, rt->critical->source);
1123     }
1124
1125     debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1126           (rc->chart)?rc->chart:"NOCHART",
1127           rc->name,
1128           (rc->exec)?rc->exec:"DEFAULT",
1129           (rc->recipient)?rc->recipient:"DEFAULT",
1130           rc->green,
1131           rc->red,
1132           rc->group,
1133           rc->after,
1134           rc->before,
1135           rc->options,
1136           (rc->dimensions)?rc->dimensions:"NONE",
1137           rc->update_every,
1138           (rc->calculation)?rc->calculation->parsed_as:"NONE",
1139           (rc->warning)?rc->warning->parsed_as:"NONE",
1140           (rc->critical)?rc->critical->parsed_as:"NONE",
1141           rc->source,
1142           rc->delay_up_duration,
1143           rc->delay_down_duration,
1144           rc->delay_max_duration,
1145           rc->delay_multiplier
1146     );
1147
1148     rrdcalc_create_part2(host, rc);
1149     return rc;
1150 }
1151
1152 void rrdcalc_free(RRDHOST *host, RRDCALC *rc) {
1153     if(!rc) return;
1154
1155     debug(D_HEALTH, "Health removing alarm '%s.%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1156
1157     // unlink it from RRDSET
1158     if(rc->rrdset) rrdsetcalc_unlink(rc);
1159
1160     // unlink it from RRDHOST
1161     if(unlikely(rc == host->alarms))
1162         host->alarms = rc->next;
1163
1164     else if(likely(host->alarms)) {
1165         RRDCALC *t, *last = host->alarms;
1166         for(t = last->next; t && t != rc; last = t, t = t->next) ;
1167         if(last->next == rc)
1168             last->next = rc->next;
1169         else
1170             error("Cannot unlink alarm '%s.%s' from host '%s': not found", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1171     }
1172     else
1173         error("Cannot unlink unlink '%s.%s' from host '%s': This host does not have any calculations", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1174
1175     expression_free(rc->calculation);
1176     expression_free(rc->warning);
1177     expression_free(rc->critical);
1178
1179     freez(rc->name);
1180     freez(rc->chart);
1181     freez(rc->family);
1182     freez(rc->dimensions);
1183     freez(rc->exec);
1184     freez(rc->recipient);
1185     freez(rc->source);
1186     freez(rc->units);
1187     freez(rc->info);
1188     freez(rc);
1189 }
1190
1191 // ----------------------------------------------------------------------------
1192 // RRDCALCTEMPLATE management
1193
1194 void rrdcalctemplate_link_matching(RRDSET *st) {
1195     RRDCALCTEMPLATE *rt;
1196
1197     for(rt = st->rrdhost->templates; rt ; rt = rt->next) {
1198         if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context)) {
1199             RRDCALC *rc = rrdcalc_create(st->rrdhost, rt, st->id);
1200             if(unlikely(!rc))
1201                 error("Health tried to create alarm from template '%s', but it failed", rt->name);
1202
1203 #ifdef NETDATA_INTERNAL_CHECKS
1204             else if(rc->rrdset != st)
1205                 error("Health alarm '%s.%s' should be linked to chart '%s', but it is not", rc->chart?rc->chart:"NOCHART", rc->name, st->id);
1206 #endif
1207         }
1208     }
1209 }
1210
1211 static inline void rrdcalctemplate_free(RRDHOST *host, RRDCALCTEMPLATE *rt) {
1212     debug(D_HEALTH, "Health removing template '%s' of host '%s'", rt->name, host->hostname);
1213
1214     if(host->templates) {
1215         if(host->templates == rt) {
1216             host->templates = rt->next;
1217         }
1218         else {
1219             RRDCALCTEMPLATE *t, *last = host->templates;
1220             for (t = last->next; t && t != rt; last = t, t = t->next ) ;
1221             if(last && last->next == rt) {
1222                 last->next = rt->next;
1223                 rt->next = NULL;
1224             }
1225             else
1226                 error("Cannot find RRDCALCTEMPLATE '%s' linked in host '%s'", rt->name, host->hostname);
1227         }
1228     }
1229
1230     expression_free(rt->calculation);
1231     expression_free(rt->warning);
1232     expression_free(rt->critical);
1233
1234     freez(rt->name);
1235     freez(rt->exec);
1236     freez(rt->recipient);
1237     freez(rt->context);
1238     freez(rt->source);
1239     freez(rt->units);
1240     freez(rt->info);
1241     freez(rt->dimensions);
1242     freez(rt);
1243 }
1244
1245 // ----------------------------------------------------------------------------
1246 // load health configuration
1247
// size of the working buffer of health_readfile()
// (the buffer is allocated with one more byte than this)
#define HEALTH_CONF_MAX_LINE 4096

// the keywords of the health configuration files
// health_readfile() hashes each of them once, on its first call
#define HEALTH_ALARM_KEY "alarm"
#define HEALTH_TEMPLATE_KEY "template"
#define HEALTH_ON_KEY "on"
#define HEALTH_LOOKUP_KEY "lookup"
#define HEALTH_CALC_KEY "calc"
// 'every' is also accepted as an option of a database lookup (health_parse_db_lookup)
#define HEALTH_EVERY_KEY "every"
#define HEALTH_GREEN_KEY "green"
#define HEALTH_RED_KEY "red"
#define HEALTH_WARN_KEY "warn"
#define HEALTH_CRIT_KEY "crit"
#define HEALTH_EXEC_KEY "exec"
#define HEALTH_RECIPIENT_KEY "to"
#define HEALTH_UNITS_KEY "units"
#define HEALTH_INFO_KEY "info"
#define HEALTH_DELAY_KEY "delay"
1265
1266 static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
1267     if(!rc->chart) {
1268         error("Health configuration for alarm '%s' does not have a chart", rc->name);
1269         return 0;
1270     }
1271
1272     if(!rc->update_every) {
1273         error("Health configuration for alarm '%s.%s' has no frequency (parameter 'every'). Ignoring it.", rc->chart?rc->chart:"NOCHART", rc->name);
1274         return 0;
1275     }
1276
1277     if(!RRDCALC_HAS_DB_LOOKUP(rc) && !rc->warning && !rc->critical) {
1278         error("Health configuration for alarm '%s.%s' is useless (no calculation, no warning and no critical evaluation)", rc->chart?rc->chart:"NOCHART", rc->name);
1279         return 0;
1280     }
1281
1282     if (rrdcalc_exists(host, rc->chart, rc->name, rc->hash_chart, rc->hash))
1283         return 0;
1284
1285     rc->id = rrdcalc_get_unique_id(&localhost, rc->chart, rc->name, &rc->next_event_id);
1286
1287     debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1288           rc->chart?rc->chart:"NOCHART",
1289           rc->name,
1290           rc->id,
1291           (rc->exec)?rc->exec:"DEFAULT",
1292           (rc->recipient)?rc->recipient:"DEFAULT",
1293           rc->green,
1294           rc->red,
1295           rc->group,
1296           rc->after,
1297           rc->before,
1298           rc->options,
1299           (rc->dimensions)?rc->dimensions:"NONE",
1300           rc->update_every,
1301           (rc->calculation)?rc->calculation->parsed_as:"NONE",
1302           (rc->warning)?rc->warning->parsed_as:"NONE",
1303           (rc->critical)?rc->critical->parsed_as:"NONE",
1304           rc->source,
1305           rc->delay_up_duration,
1306           rc->delay_down_duration,
1307           rc->delay_max_duration,
1308           rc->delay_multiplier
1309     );
1310
1311     rrdcalc_create_part2(host, rc);
1312     return 1;
1313 }
1314
1315 static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCALCTEMPLATE *rt) {
1316     if(unlikely(!rt->context)) {
1317         error("Health configuration for template '%s' does not have a context", rt->name);
1318         return 0;
1319     }
1320
1321     if(unlikely(!rt->update_every)) {
1322         error("Health configuration for template '%s' has no frequency (parameter 'every'). Ignoring it.", rt->name);
1323         return 0;
1324     }
1325
1326     if(unlikely(!RRDCALCTEMPLATE_HAS_CALCULATION(rt) && !rt->warning && !rt->critical)) {
1327         error("Health configuration for template '%s' is useless (no calculation, no warning and no critical evaluation)", rt->name);
1328         return 0;
1329     }
1330
1331     RRDCALCTEMPLATE *t, *last = NULL;
1332     for (t = host->templates; t ; last = t, t = t->next) {
1333         if(unlikely(t->hash_name == rt->hash_name && !strcmp(t->name, rt->name))) {
1334             error("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname);
1335             return 0;
1336         }
1337     }
1338
1339     debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1340           rt->name,
1341           (rt->context)?rt->context:"NONE",
1342           (rt->exec)?rt->exec:"DEFAULT",
1343           (rt->recipient)?rt->recipient:"DEFAULT",
1344           rt->green,
1345           rt->red,
1346           rt->group,
1347           rt->after,
1348           rt->before,
1349           rt->options,
1350           (rt->dimensions)?rt->dimensions:"NONE",
1351           rt->update_every,
1352           (rt->calculation)?rt->calculation->parsed_as:"NONE",
1353           (rt->warning)?rt->warning->parsed_as:"NONE",
1354           (rt->critical)?rt->critical->parsed_as:"NONE",
1355           rt->source,
1356           rt->delay_up_duration,
1357           rt->delay_down_duration,
1358           rt->delay_max_duration,
1359           rt->delay_multiplier
1360     );
1361
1362     if(likely(last)) {
1363         last->next = rt;
1364     }
1365     else {
1366         rt->next = host->templates;
1367         host->templates = rt;
1368     }
1369
1370     return 1;
1371 }
1372
// Parse a duration: a number (as accepted by strtold(), with an optional
// sign) that may be followed by a unit, and store it in *result in seconds.
//
// Only the first character after the number is examined:
//   Y = 365 days, M = 30 days, w = 7 days, d = days, h = hours, m = minutes,
//   s = seconds. Any other character is treated like 's'.
//
// Returns 1 on success. Returns 0, with *result set to 0, when the string
// does not start with a digit or a sign, when no number follows the sign,
// or when the number is not finite.
static inline int health_parse_duration(char *string, int *result) {
    // the <ctype.h> functions are defined only for unsigned char values and EOF
    unsigned char first = (unsigned char)*string;

    // make sure it is a number
    if(!first || !(isdigit(first) || first == '+' || first == '-')) {
        *result = 0;
        return 0;
    }

    char *e = NULL;
    long double n = strtold(string, &e);

    // e == string: a sign alone, nothing was converted
    // inf / nan: cannot be converted to an int
    if(e == string || isnan(n) || isinf(n)) {
        *result = 0;
        return 0;
    }

    if(e && *e) {
        switch (*e) {
            case 'Y':
                *result = (int) (n * 86400 * 365);
                break;
            case 'M':
                *result = (int) (n * 86400 * 30);
                break;
            case 'w':
                *result = (int) (n * 86400 * 7);
                break;
            case 'd':
                *result = (int) (n * 86400);
                break;
            case 'h':
                *result = (int) (n * 3600);
                break;
            case 'm':
                *result = (int) (n * 60);
                break;

            // an unknown unit is taken as seconds
            default:
            case 's':
                *result = (int) (n);
                break;
        }
    }
    else
       *result = (int)(n);

    return 1;
}
1414
// Parse the value of a 'delay' configuration line: a sequence of
// "keyword value" pairs separated by whitespace, with keywords
// up, down, multiplier and max (case insensitive).
//
// 'string' is modified: the whitespace after each word is overwritten with
// '\0'. Parsing stops at a word that starts with whitespace, so the string is
// expected not to have leading blanks.
//
// Invalid values and unknown keywords are logged and ignored.
// On return:
//  - up and down are 0 unless a valid value was given,
//  - multiplier is 1.0 unless a valid (finite, positive) value was given,
//  - max, when not validly given, is raised to at least up * multiplier and
//    down * multiplier.
// line, path and file are used only in the error messages.
// Always returns 1.
static inline int health_parse_delay(
        size_t line, const char *path, const char *file, char *string,
        int *delay_up_duration,
        int *delay_down_duration,
        int *delay_max_duration,
        float *delay_multiplier) {

    char given_up = 0;
    char given_down = 0;
    char given_max = 0;
    char given_multiplier = 0;

    char *s = string;
    while(*s) {
        char *key = s;

        // isspace() is defined only for unsigned char values and EOF,
        // so every char is converted before it is tested
        while(*s && !isspace((unsigned char)*s)) s++;
        while(*s && isspace((unsigned char)*s)) *s++ = '\0';

        if(!*key) break;

        char *value = s;
        while(*s && !isspace((unsigned char)*s)) s++;
        while(*s && isspace((unsigned char)*s)) *s++ = '\0';

        if(!strcasecmp(key, "up")) {
            if (!health_parse_duration(value, delay_up_duration)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_up = 1;
        }
        else if(!strcasecmp(key, "down")) {
            if (!health_parse_duration(value, delay_down_duration)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_down = 1;
        }
        else if(!strcasecmp(key, "multiplier")) {
            *delay_multiplier = strtof(value, NULL);
            if(isnan(*delay_multiplier) || isinf(*delay_multiplier) || islessequal(*delay_multiplier, 0)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_multiplier = 1;
        }
        else if(!strcasecmp(key, "max")) {
            if (!health_parse_duration(value, delay_max_duration)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_max = 1;
        }
        else {
            error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
                  line, path, file, key);
        }
    }

    // anything not (validly) given gets its default
    if(!given_up)
        *delay_up_duration = 0;

    if(!given_down)
        *delay_down_duration = 0;

    if(!given_multiplier)
        *delay_multiplier = 1.0;

    // without an explicit max, the max cannot be lower than one multiplied delay
    if(!given_max) {
        if((*delay_max_duration) < (*delay_up_duration) * (*delay_multiplier))
            *delay_max_duration = (*delay_up_duration) * (*delay_multiplier);

        if((*delay_max_duration) < (*delay_down_duration) * (*delay_multiplier))
            *delay_max_duration = (*delay_down_duration) * (*delay_multiplier);
    }

    return 1;
}
1494
1495 static inline int health_parse_db_lookup(
1496         size_t line, const char *path, const char *file, char *string,
1497         int *group_method, int *after, int *before, int *every,
1498         uint32_t *options, char **dimensions
1499 ) {
1500     debug(D_HEALTH, "Health configuration parsing database lookup %zu@%s/%s: %s", line, path, file, string);
1501
1502     if(*dimensions) freez(*dimensions);
1503     *dimensions = NULL;
1504     *after = 0;
1505     *before = 0;
1506     *every = 0;
1507     *options = 0;
1508
1509     char *s = string, *key;
1510
1511     // first is the group method
1512     key = s;
1513     while(*s && !isspace(*s)) s++;
1514     while(*s && isspace(*s)) *s++ = '\0';
1515     if(!*s) {
1516         error("Health configuration invalid chart calculation at line %zu of file '%s/%s': expected group method followed by the 'after' time, but got '%s'",
1517               line, path, file, key);
1518         return 0;
1519     }
1520
1521     if((*group_method = web_client_api_request_v1_data_group(key, -1)) == -1) {
1522         error("Health configuration at line %zu of file '%s/%s': invalid group method '%s'",
1523               line, path, file, key);
1524         return 0;
1525     }
1526
1527     // then is the 'after' time
1528     key = s;
1529     while(*s && !isspace(*s)) s++;
1530     while(*s && isspace(*s)) *s++ = '\0';
1531
1532     if(!health_parse_duration(key, after)) {
1533         error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' after group method",
1534               line, path, file, key);
1535         return 0;
1536     }
1537
1538     // sane defaults
1539     *every = abs(*after);
1540
1541     // now we may have optional parameters
1542     while(*s) {
1543         key = s;
1544         while(*s && !isspace(*s)) s++;
1545         while(*s && isspace(*s)) *s++ = '\0';
1546         if(!*key) break;
1547
1548         if(!strcasecmp(key, "at")) {
1549             char *value = s;
1550             while(*s && !isspace(*s)) s++;
1551             while(*s && isspace(*s)) *s++ = '\0';
1552
1553             if (!health_parse_duration(value, before)) {
1554                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1555                       line, path, file, value, key);
1556             }
1557         }
1558         else if(!strcasecmp(key, HEALTH_EVERY_KEY)) {
1559             char *value = s;
1560             while(*s && !isspace(*s)) s++;
1561             while(*s && isspace(*s)) *s++ = '\0';
1562
1563             if (!health_parse_duration(value, every)) {
1564                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1565                       line, path, file, value, key);
1566             }
1567         }
1568         else if(!strcasecmp(key, "absolute") || !strcasecmp(key, "abs") || !strcasecmp(key, "absolute_sum")) {
1569             *options |= RRDR_OPTION_ABSOLUTE;
1570         }
1571         else if(!strcasecmp(key, "min2max")) {
1572             *options |= RRDR_OPTION_MIN2MAX;
1573         }
1574         else if(!strcasecmp(key, "null2zero")) {
1575             *options |= RRDR_OPTION_NULL2ZERO;
1576         }
1577         else if(!strcasecmp(key, "percentage")) {
1578             *options |= RRDR_OPTION_PERCENTAGE;
1579         }
1580         else if(!strcasecmp(key, "unaligned")) {
1581             *options |= RRDR_OPTION_NOT_ALIGNED;
1582         }
1583         else if(!strcasecmp(key, "of")) {
1584             if(*s && strcasecmp(s, "all"))
1585                *dimensions = strdupz(s);
1586             break;
1587         }
1588         else {
1589             error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
1590                   line, path, file, key);
1591         }
1592     }
1593
1594     return 1;
1595 }
1596
// Replace, in place, every tab of the string with a space.
// Returns the string it was given.
static inline char *tabs2spaces(char *s) {
    char *p;

    for(p = s; *p != '\0'; p++) {
        if(*p == '\t')
            *p = ' ';
    }

    return s;
}
1606
// Build the "source" label of a configuration entry, formatted as
// "<line>@<path>/<filename>", and return it as a newly allocated string
// (allocated with strdupz(); the caller owns it).
static inline char *health_source_file(size_t line, const char *path, const char *filename) {
    char source[FILENAME_MAX + 1];

    // format into a bounded scratch buffer first, then duplicate it
    snprintfz(source, FILENAME_MAX, "%zu@%s/%s", line, path, filename);

    char *copy = strdupz(source);
    return copy;
}
1612
// Overwrite every single quote (') and double quote (") found in the
// NUL-terminated string 's' with a space. The string is edited in place
// and keeps its length.
static inline void strip_quotes(char *s) {
    char *c = s;

    while(*c) {
        switch(*c) {
            case '\'':
            case '"':
                *c = ' ';
                break;

            default:
                break;
        }

        c++;
    }
}
1619
1620 int health_readfile(const char *path, const char *filename) {
1621     debug(D_HEALTH, "Health configuration reading file '%s/%s'", path, filename);
1622
1623     static uint32_t hash_alarm = 0, hash_template = 0, hash_on = 0, hash_calc = 0, hash_green = 0, hash_red = 0, hash_warn = 0, hash_crit = 0, hash_exec = 0, hash_every = 0, hash_lookup = 0, hash_units = 0, hash_info = 0, hash_recipient = 0, hash_delay = 0;
1624     char buffer[HEALTH_CONF_MAX_LINE + 1];
1625
1626     if(unlikely(!hash_alarm)) {
1627         hash_alarm = simple_uhash(HEALTH_ALARM_KEY);
1628         hash_template = simple_uhash(HEALTH_TEMPLATE_KEY);
1629         hash_on = simple_uhash(HEALTH_ON_KEY);
1630         hash_calc = simple_uhash(HEALTH_CALC_KEY);
1631         hash_lookup = simple_uhash(HEALTH_LOOKUP_KEY);
1632         hash_green = simple_uhash(HEALTH_GREEN_KEY);
1633         hash_red = simple_uhash(HEALTH_RED_KEY);
1634         hash_warn = simple_uhash(HEALTH_WARN_KEY);
1635         hash_crit = simple_uhash(HEALTH_CRIT_KEY);
1636         hash_exec = simple_uhash(HEALTH_EXEC_KEY);
1637         hash_every = simple_uhash(HEALTH_EVERY_KEY);
1638         hash_units = simple_hash(HEALTH_UNITS_KEY);
1639         hash_info = simple_hash(HEALTH_INFO_KEY);
1640         hash_recipient = simple_hash(HEALTH_RECIPIENT_KEY);
1641         hash_delay = simple_uhash(HEALTH_DELAY_KEY);
1642     }
1643
1644     snprintfz(buffer, HEALTH_CONF_MAX_LINE, "%s/%s", path, filename);
1645     FILE *fp = fopen(buffer, "r");
1646     if(!fp) {
1647         error("Health configuration cannot read file '%s'.", buffer);
1648         return 0;
1649     }
1650
1651     RRDCALC *rc = NULL;
1652     RRDCALCTEMPLATE *rt = NULL;
1653
1654     size_t line = 0, append = 0;
1655     char *s;
1656     while((s = fgets(&buffer[append], (int)(HEALTH_CONF_MAX_LINE - append), fp)) || append) {
1657         int stop_appending = !s;
1658         line++;
1659         // info("Line %zu of file '%s/%s': '%s'", line, path, filename, s);
1660         s = trim(buffer);
1661         if(!s) continue;
1662         // info("Trimmed line %zu of file '%s/%s': '%s'", line, path, filename, s);
1663
1664         append = strlen(s);
1665         if(!stop_appending && s[append - 1] == '\\') {
1666             s[append - 1] = ' ';
1667             append = &s[append] - buffer;
1668             if(append < HEALTH_CONF_MAX_LINE)
1669                 continue;
1670             else {
1671                 error("Health configuration has too long muli-line at line %zu of file '%s/%s'.", line, path, filename);
1672             }
1673         }
1674         append = 0;
1675
1676         char *key = s;
1677         while(*s && *s != ':') s++;
1678         if(!*s) {
1679             error("Health configuration has invalid line %zu of file '%s/%s'. It does not contain a ':'. Ignoring it.", line, path, filename);
1680             continue;
1681         }
1682         *s = '\0';
1683         s++;
1684
1685         char *value = s;
1686         key = trim(key);
1687         value = trim(value);
1688
1689         if(!key) {
1690             error("Health configuration has invalid line %zu of file '%s/%s'. Keyword is empty. Ignoring it.", line, path, filename);
1691             continue;
1692         }
1693
1694         if(!value) {
1695             error("Health configuration has invalid line %zu of file '%s/%s'. value is empty. Ignoring it.", line, path, filename);
1696             continue;
1697         }
1698
1699         // info("Health file '%s/%s', key '%s', value '%s'", path, filename, key, value);
1700         uint32_t hash = simple_uhash(key);
1701
1702         if(hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) {
1703             if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
1704                 rrdcalc_free(&localhost, rc);
1705
1706             if(rt) {
1707                 if (!rrdcalctemplate_add_template_from_config(&localhost, rt))
1708                     rrdcalctemplate_free(&localhost, rt);
1709                 rt = NULL;
1710             }
1711
1712             rc = callocz(1, sizeof(RRDCALC));
1713             rc->next_event_id = 1;
1714             rc->name = tabs2spaces(strdupz(value));
1715             rc->hash = simple_hash(rc->name);
1716             rc->source = health_source_file(line, path, filename);
1717             rc->green = NAN;
1718             rc->red = NAN;
1719             rc->value = NAN;
1720             rc->old_value = NAN;
1721             rc->delay_multiplier = 1.0;
1722
1723             if(rrdvar_fix_name(rc->name))
1724                 error("Health configuration renamed alarm '%s' to '%s'", value, rc->name);
1725         }
1726         else if(hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY)) {
1727             if(rc) {
1728                 if(!rrdcalc_add_alarm_from_config(&localhost, rc))
1729                     rrdcalc_free(&localhost, rc);
1730                 rc = NULL;
1731             }
1732
1733             if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
1734                 rrdcalctemplate_free(&localhost, rt);
1735
1736             rt = callocz(1, sizeof(RRDCALCTEMPLATE));
1737             rt->name = tabs2spaces(strdupz(value));
1738             rt->hash_name = simple_hash(rt->name);
1739             rt->source = health_source_file(line, path, filename);
1740             rt->green = NAN;
1741             rt->red = NAN;
1742             rt->delay_multiplier = 1.0;
1743
1744             if(rrdvar_fix_name(rt->name))
1745                 error("Health configuration renamed template '%s' to '%s'", value, rt->name);
1746         }
1747         else if(rc) {
1748             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
1749                 if(rc->chart) {
1750                     if(strcmp(rc->chart, value))
1751                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1752                              line, path, filename, rc->name, key, rc->chart, value, value);
1753
1754                     freez(rc->chart);
1755                 }
1756                 rc->chart = tabs2spaces(strdupz(value));
1757                 rc->hash_chart = simple_hash(rc->chart);
1758             }
1759             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
1760                 health_parse_db_lookup(line, path, filename, value, &rc->group, &rc->after, &rc->before,
1761                                        &rc->update_every,
1762                                        &rc->options, &rc->dimensions);
1763             }
1764             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
1765                 if(!health_parse_duration(value, &rc->update_every))
1766                     info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.",
1767                          line, path, filename, rc->name, key, value);
1768             }
1769             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
1770                 char *e;
1771                 rc->green = strtold(value, &e);
1772                 if(e && *e) {
1773                     info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1774                          line, path, filename, rc->name, key, e);
1775                 }
1776             }
1777             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
1778                 char *e;
1779                 rc->red = strtold(value, &e);
1780                 if(e && *e) {
1781                     info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1782                          line, path, filename, rc->name, key, e);
1783                 }
1784             }
1785             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
1786                 const char *failed_at = NULL;
1787                 int error = 0;
1788                 rc->calculation = expression_parse(value, &failed_at, &error);
1789                 if(!rc->calculation) {
1790                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1791                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1792                 }
1793             }
1794             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
1795                 const char *failed_at = NULL;
1796                 int error = 0;
1797                 rc->warning = expression_parse(value, &failed_at, &error);
1798                 if(!rc->warning) {
1799                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1800                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1801                 }
1802             }
1803             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
1804                 const char *failed_at = NULL;
1805                 int error = 0;
1806                 rc->critical = expression_parse(value, &failed_at, &error);
1807                 if(!rc->critical) {
1808                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1809                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1810                 }
1811             }
1812             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
1813                 if(rc->exec) {
1814                     if(strcmp(rc->exec, value))
1815                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1816                              line, path, filename, rc->name, key, rc->exec, value, value);
1817
1818                     freez(rc->exec);
1819                 }
1820                 rc->exec = tabs2spaces(strdupz(value));
1821             }
1822             else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
1823                 if(rc->recipient) {
1824                     if(strcmp(rc->recipient, value))
1825                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1826                              line, path, filename, rc->name, key, rc->recipient, value, value);
1827
1828                     freez(rc->recipient);
1829                 }
1830                 rc->recipient = tabs2spaces(strdupz(value));
1831             }
1832             else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
1833                 if(rc->units) {
1834                     if(strcmp(rc->units, value))
1835                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1836                              line, path, filename, rc->name, key, rc->units, value, value);
1837
1838                     freez(rc->units);
1839                 }
1840                 rc->units = tabs2spaces(strdupz(value));
1841                 strip_quotes(rc->units);
1842             }
1843             else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
1844                 if(rc->info) {
1845                     if(strcmp(rc->info, value))
1846                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1847                              line, path, filename, rc->name, key, rc->info, value, value);
1848
1849                     freez(rc->info);
1850                 }
1851                 rc->info = tabs2spaces(strdupz(value));
1852                 strip_quotes(rc->info);
1853             }
1854             else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
1855                 health_parse_delay(line, path, filename, value, &rc->delay_up_duration, &rc->delay_down_duration, &rc->delay_max_duration, &rc->delay_multiplier);
1856             }
1857             else {
1858                 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has unknown key '%s'.",
1859                      line, path, filename, rc->name, key);
1860             }
1861         }
1862         else if(rt) {
1863             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
1864                 if(rt->context) {
1865                     if(strcmp(rt->context, value))
1866                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1867                              line, path, filename, rt->name, key, rt->context, value, value);
1868
1869                     freez(rt->context);
1870                 }
1871                 rt->context = tabs2spaces(strdupz(value));
1872                 rt->hash_context = simple_hash(rt->context);
1873             }
1874             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
1875                 health_parse_db_lookup(line, path, filename, value, &rt->group, &rt->after, &rt->before,
1876                                        &rt->update_every,
1877                                        &rt->options, &rt->dimensions);
1878             }
1879             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
1880                 if(!health_parse_duration(value, &rt->update_every))
1881                     info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' cannot parse duration: '%s'.",
1882                          line, path, filename, rt->name, key, value);
1883             }
1884             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
1885                 char *e;
1886                 rt->green = strtold(value, &e);
1887                 if(e && *e) {
1888                     info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
1889                          line, path, filename, rt->name, key, e);
1890                 }
1891             }
1892             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
1893                 char *e;
1894                 rt->red = strtold(value, &e);
1895                 if(e && *e) {
1896                     info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
1897                          line, path, filename, rt->name, key, e);
1898                 }
1899             }
1900             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
1901                 const char *failed_at = NULL;
1902                 int error = 0;
1903                 rt->calculation = expression_parse(value, &failed_at, &error);
1904                 if(!rt->calculation) {
1905                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1906                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1907                 }
1908             }
1909             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
1910                 const char *failed_at = NULL;
1911                 int error = 0;
1912                 rt->warning = expression_parse(value, &failed_at, &error);
1913                 if(!rt->warning) {
1914                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1915                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1916                 }
1917             }
1918             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
1919                 const char *failed_at = NULL;
1920                 int error = 0;
1921                 rt->critical = expression_parse(value, &failed_at, &error);
1922                 if(!rt->critical) {
1923                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1924                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1925                 }
1926             }
1927             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
1928                 if(rt->exec) {
1929                     if(strcmp(rt->exec, value))
1930                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1931                              line, path, filename, rt->name, key, rt->exec, value, value);
1932
1933                     freez(rt->exec);
1934                 }
1935                 rt->exec = tabs2spaces(strdupz(value));
1936             }
1937             else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
1938                 if(rt->recipient) {
1939                     if(strcmp(rt->recipient, value))
1940                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1941                              line, path, filename, rt->name, key, rt->recipient, value, value);
1942
1943                     freez(rt->recipient);
1944                 }
1945                 rt->recipient = tabs2spaces(strdupz(value));
1946             }
1947             else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
1948                 if(rt->units) {
1949                     if(strcmp(rt->units, value))
1950                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1951                              line, path, filename, rt->name, key, rt->units, value, value);
1952
1953                     freez(rt->units);
1954                 }
1955                 rt->units = tabs2spaces(strdupz(value));
1956                 strip_quotes(rt->units);
1957             }
1958             else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
1959                 if(rt->info) {
1960                     if(strcmp(rt->info, value))
1961                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1962                              line, path, filename, rt->name, key, rt->info, value, value);
1963
1964                     freez(rt->info);
1965                 }
1966                 rt->info = tabs2spaces(strdupz(value));
1967                 strip_quotes(rt->info);
1968             }
1969             else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
1970                 health_parse_delay(line, path, filename, value, &rt->delay_up_duration, &rt->delay_down_duration, &rt->delay_max_duration, &rt->delay_multiplier);
1971             }
1972             else {
1973                 error("Health configuration at line %zu of file '%s/%s' for template '%s' has unknown key '%s'.",
1974                       line, path, filename, rt->name, key);
1975             }
1976         }
1977         else {
1978             error("Health configuration at line %zu of file '%s/%s' has unknown key '%s'. Expected either '" HEALTH_ALARM_KEY "' or '" HEALTH_TEMPLATE_KEY "'.",
1979                   line, path, filename, key);
1980         }
1981     }
1982
1983     if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
1984         rrdcalc_free(&localhost, rc);
1985
1986     if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
1987         rrdcalctemplate_free(&localhost, rt);
1988
1989     fclose(fp);
1990     return 1;
1991 }
1992
1993 void health_readdir(const char *path) {
1994     size_t pathlen = strlen(path);
1995
1996     debug(D_HEALTH, "Health configuration reading directory '%s'", path);
1997
1998     DIR *dir = opendir(path);
1999     if (!dir) {
2000         error("Health configuration cannot open directory '%s'.", path);
2001         return;
2002     }
2003
2004     struct dirent *de = NULL;
2005     while ((de = readdir(dir))) {
2006         size_t len = strlen(de->d_name);
2007
2008         if(de->d_type == DT_DIR
2009            && (
2010                    (de->d_name[0] == '.' && de->d_name[1] == '\0')
2011                    || (de->d_name[0] == '.' && de->d_name[1] == '.' && de->d_name[2] == '\0')
2012            )) {
2013             debug(D_HEALTH, "Ignoring directory '%s'", de->d_name);
2014             continue;
2015         }
2016
2017         else if(de->d_type == DT_DIR) {
2018             char *s = mallocz(pathlen + strlen(de->d_name) + 2);
2019             strcpy(s, path);
2020             strcat(s, "/");
2021             strcat(s, de->d_name);
2022             health_readdir(s);
2023             freez(s);
2024             continue;
2025         }
2026
2027         else if((de->d_type == DT_LNK || de->d_type == DT_REG || de->d_type == DT_UNKNOWN) &&
2028                 len > 5 && !strcmp(&de->d_name[len - 5], ".conf")) {
2029             health_readfile(path, de->d_name);
2030         }
2031
2032         else debug(D_HEALTH, "Ignoring file '%s'", de->d_name);
2033     }
2034
2035     closedir(dir);
2036 }
2037
2038 static inline char *health_config_dir(void) {
2039     char buffer[FILENAME_MAX + 1];
2040     snprintfz(buffer, FILENAME_MAX, "%s/health.d", config_get("global", "config directory", CONFIG_DIR));
2041     return config_get("health", "health configuration directory", buffer);
2042 }
2043
2044 void health_init(void) {
2045     debug(D_HEALTH, "Health configuration initializing");
2046
2047     if(!(health_enabled = config_get_boolean("health", "enabled", 1))) {
2048         debug(D_HEALTH, "Health is disabled.");
2049         return;
2050     }
2051
2052     char *pathname = config_get("health", "health db directory", VARLIB_DIR "/health");
2053     if(mkdir(pathname, 0770) == -1 && errno != EEXIST)
2054         fatal("Cannot create directory '%s'.", pathname);
2055
2056     char filename[FILENAME_MAX + 1];
2057     snprintfz(filename, FILENAME_MAX, "%s/health-log.db", pathname);
2058     health.log_filename = config_get("health", "health db file", filename);
2059
2060     health_alarm_log_load(&localhost);
2061     health_alarm_log_open();
2062
2063     char *path = health_config_dir();
2064
2065     {
2066         char buffer[FILENAME_MAX + 1];
2067         snprintfz(buffer, FILENAME_MAX, "%s/alarm-notify.sh", config_get("global", "plugins directory", PLUGINS_DIR));
2068         health.health_default_exec = config_get("health", "script to execute on alarm", buffer);
2069     }
2070
2071     long n = config_get_number("health", "in memory max health log entries", (long)localhost.health_log.max);
2072     if(n < 10) {
2073         error("Health configuration has invalid max log entries %ld. Using default %u", n, localhost.health_log.max);
2074         config_set_number("health", "in memory max health log entries", (long)localhost.health_log.max);
2075     }
2076     else localhost.health_log.max = (unsigned int)n;
2077
2078     rrdhost_rwlock(&localhost);
2079     health_readdir(path);
2080     rrdhost_unlock(&localhost);
2081 }
2082
2083 // ----------------------------------------------------------------------------
2084 // JSON generation
2085
2086 static inline void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) {
2087     if(value && *value)
2088         buffer_sprintf(wb, "%s\"%s\":\"%s\"%s", prefix, label, value, suffix);
2089     else
2090         buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix);
2091 }
2092
2093 static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) {
2094     buffer_sprintf(wb, "\n\t{\n"
2095                            "\t\t\"hostname\": \"%s\",\n"
2096                            "\t\t\"unique_id\": %u,\n"
2097                            "\t\t\"alarm_id\": %u,\n"
2098                            "\t\t\"alarm_event_id\": %u,\n"
2099                            "\t\t\"name\": \"%s\",\n"
2100                            "\t\t\"chart\": \"%s\",\n"
2101                            "\t\t\"family\": \"%s\",\n"
2102                            "\t\t\"processed\": %s,\n"
2103                            "\t\t\"updated\": %s,\n"
2104                            "\t\t\"exec_run\": %lu,\n"
2105                            "\t\t\"exec_failed\": %s,\n"
2106                            "\t\t\"exec\": \"%s\",\n"
2107                            "\t\t\"recipient\": \"%s\",\n"
2108                            "\t\t\"exec_code\": %d,\n"
2109                            "\t\t\"source\": \"%s\",\n"
2110                            "\t\t\"units\": \"%s\",\n"
2111                            "\t\t\"info\": \"%s\",\n"
2112                            "\t\t\"when\": %lu,\n"
2113                            "\t\t\"duration\": %lu,\n"
2114                            "\t\t\"non_clear_duration\": %lu,\n"
2115                            "\t\t\"status\": \"%s\",\n"
2116                            "\t\t\"old_status\": \"%s\",\n"
2117                            "\t\t\"delay\": %d,\n"
2118                            "\t\t\"delay_up_to_timestamp\": %lu,\n"
2119                            "\t\t\"updated_by_id\": %u,\n"
2120                            "\t\t\"updates_id\": %u,\n",
2121                    host->hostname,
2122                    ae->unique_id,
2123                    ae->alarm_id,
2124                    ae->alarm_event_id,
2125                    ae->name,
2126                    ae->chart,
2127                    ae->family,
2128                    (ae->flags & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false",
2129                    (ae->flags & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false",
2130                    (unsigned long)ae->exec_run_timestamp,
2131                    (ae->flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false",
2132                    ae->exec?ae->exec:health.health_default_exec,
2133                    ae->recipient?ae->recipient:health.health_default_recipient,
2134                    ae->exec_code,
2135                    ae->source,
2136                    ae->units?ae->units:"",
2137                    ae->info?ae->info:"",
2138                    (unsigned long)ae->when,
2139                    (unsigned long)ae->duration,
2140                    (unsigned long)ae->non_clear_duration,
2141                    rrdcalc_status2string(ae->new_status),
2142                    rrdcalc_status2string(ae->old_status),
2143                    ae->delay,
2144                    (unsigned long)ae->delay_up_to_timestamp,
2145                    ae->updated_by_id,
2146                    ae->updates_id
2147     );
2148
2149     buffer_strcat(wb, "\t\t\"value\":");
2150     buffer_rrd_value(wb, ae->new_value);
2151     buffer_strcat(wb, ",\n");
2152
2153     buffer_strcat(wb, "\t\t\"old_value\":");
2154     buffer_rrd_value(wb, ae->old_value);
2155     buffer_strcat(wb, "\n");
2156
2157     buffer_strcat(wb, "\t}");
2158 }
2159
2160 void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after) {
2161     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
2162
2163     buffer_strcat(wb, "[");
2164
2165     unsigned int max = host->health_log.max;
2166     unsigned int count = 0;
2167     ALARM_ENTRY *ae;
2168     for(ae = host->health_log.alarms; ae && count < max ; count++, ae = ae->next) {
2169         if(ae->unique_id > after) {
2170             if(likely(count)) buffer_strcat(wb, ",");
2171             health_alarm_entry2json_nolock(wb, ae, host);
2172         }
2173     }
2174
2175     buffer_strcat(wb, "\n]\n");
2176
2177     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2178 }
2179
2180 static inline void health_rrdcalc2json_nolock(BUFFER *wb, RRDCALC *rc) {
2181     buffer_sprintf(wb,
2182            "\t\t\"%s.%s\": {\n"
2183                    "\t\t\t\"id\": %lu,\n"
2184                    "\t\t\t\"name\": \"%s\",\n"
2185                    "\t\t\t\"chart\": \"%s\",\n"
2186                    "\t\t\t\"family\": \"%s\",\n"
2187                    "\t\t\t\"active\": %s,\n"
2188                    "\t\t\t\"exec\": \"%s\",\n"
2189                    "\t\t\t\"recipient\": \"%s\",\n"
2190                    "\t\t\t\"source\": \"%s\",\n"
2191                    "\t\t\t\"units\": \"%s\",\n"
2192                    "\t\t\t\"info\": \"%s\",\n"
2193                                    "\t\t\t\"status\": \"%s\",\n"
2194                    "\t\t\t\"last_status_change\": %lu,\n"
2195                    "\t\t\t\"last_updated\": %lu,\n"
2196                    "\t\t\t\"next_update\": %lu,\n"
2197                    "\t\t\t\"update_every\": %d,\n"
2198                    "\t\t\t\"delay_up_duration\": %d,\n"
2199                    "\t\t\t\"delay_down_duration\": %d,\n"
2200                    "\t\t\t\"delay_max_duration\": %d,\n"
2201                    "\t\t\t\"delay_multiplier\": %f,\n"
2202                    "\t\t\t\"delay\": %d,\n"
2203                    "\t\t\t\"delay_up_to_timestamp\": %lu,\n"
2204             , rc->chart, rc->name
2205             , (unsigned long)rc->id
2206             , rc->name
2207             , rc->chart
2208             , (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:""
2209             , (rc->rrdset)?"true":"false"
2210             , rc->exec?rc->exec:health.health_default_exec
2211             , rc->recipient?rc->recipient:health.health_default_recipient
2212             , rc->source
2213             , rc->units?rc->units:""
2214             , rc->info?rc->info:""
2215             , rrdcalc_status2string(rc->status)
2216             , (unsigned long)rc->last_status_change
2217             , (unsigned long)rc->last_updated
2218             , (unsigned long)rc->next_update
2219             , rc->update_every
2220             , rc->delay_up_duration
2221             , rc->delay_down_duration
2222             , rc->delay_max_duration
2223             , rc->delay_multiplier
2224             , rc->delay_last
2225             , (unsigned long)rc->delay_up_to_timestamp
2226     );
2227
2228     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
2229         if(rc->dimensions && *rc->dimensions)
2230             health_string2json(wb, "\t\t\t", "lookup_dimensions", rc->dimensions, ",\n");
2231
2232         buffer_sprintf(wb,
2233                        "\t\t\t\"db_after\": %lu,\n"
2234                        "\t\t\t\"db_before\": %lu,\n"
2235                        "\t\t\t\"lookup_method\": \"%s\",\n"
2236                        "\t\t\t\"lookup_after\": %d,\n"
2237                        "\t\t\t\"lookup_before\": %d,\n"
2238                        "\t\t\t\"lookup_options\": \"",
2239                        (unsigned long) rc->db_after,
2240                        (unsigned long) rc->db_before,
2241                        group_method2string(rc->group),
2242                        rc->after,
2243                        rc->before
2244         );
2245         buffer_data_options2string(wb, rc->options);
2246         buffer_strcat(wb, "\",\n");
2247     }
2248
2249     if(rc->calculation) {
2250         health_string2json(wb, "\t\t\t", "calc", rc->calculation->source, ",\n");
2251         health_string2json(wb, "\t\t\t", "calc_parsed", rc->calculation->parsed_as, ",\n");
2252     }
2253
2254     if(rc->warning) {
2255         health_string2json(wb, "\t\t\t", "warn", rc->warning->source, ",\n");
2256         health_string2json(wb, "\t\t\t", "warn_parsed", rc->warning->parsed_as, ",\n");
2257     }
2258
2259     if(rc->critical) {
2260         health_string2json(wb, "\t\t\t", "crit", rc->critical->source, ",\n");
2261         health_string2json(wb, "\t\t\t", "crit_parsed", rc->critical->parsed_as, ",\n");
2262     }
2263
2264     buffer_strcat(wb, "\t\t\t\"green\":");
2265     buffer_rrd_value(wb, rc->green);
2266     buffer_strcat(wb, ",\n");
2267
2268     buffer_strcat(wb, "\t\t\t\"red\":");
2269     buffer_rrd_value(wb, rc->red);
2270     buffer_strcat(wb, ",\n");
2271
2272     buffer_strcat(wb, "\t\t\t\"value\":");
2273     buffer_rrd_value(wb, rc->value);
2274     buffer_strcat(wb, "\n");
2275
2276     buffer_strcat(wb, "\t\t}");
2277 }
2278
2279 //void health_rrdcalctemplate2json_nolock(BUFFER *wb, RRDCALCTEMPLATE *rt) {
2280 //
2281 //}
2282
2283 void health_alarms2json(RRDHOST *host, BUFFER *wb, int all) {
2284     int i;
2285
2286     rrdhost_rdlock(&localhost);
2287     buffer_sprintf(wb, "{\n\t\"hostname\": \"%s\","
2288                         "\n\t\"latest_alarm_log_unique_id\": %u,"
2289                         "\n\t\"status\": %s,"
2290                         "\n\t\"now\": %lu,"
2291                         "\n\t\"alarms\": {\n",
2292                         host->hostname,
2293                         (host->health_log.next_log_id > 0)?(host->health_log.next_log_id - 1):0,
2294                         health_enabled?"true":"false",
2295                         (unsigned long)time(NULL));
2296
2297     RRDCALC *rc;
2298     for(i = 0, rc = host->alarms; rc ; rc = rc->next) {
2299         if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
2300             continue;
2301
2302         if(likely(!all && !(rc->status == RRDCALC_STATUS_WARNING || rc->status == RRDCALC_STATUS_CRITICAL)))
2303             continue;
2304
2305         if(likely(i)) buffer_strcat(wb, ",\n");
2306         health_rrdcalc2json_nolock(wb, rc);
2307         i++;
2308     }
2309
2310 //    buffer_strcat(wb, "\n\t},\n\t\"templates\": {");
2311 //    RRDCALCTEMPLATE *rt;
2312 //    for(rt = host->templates; rt ; rt = rt->next)
2313 //        health_rrdcalctemplate2json_nolock(wb, rt);
2314
2315     buffer_strcat(wb, "\n\t}\n}\n");
2316     rrdhost_unlock(&localhost);
2317 }
2318
2319
2320 // ----------------------------------------------------------------------------
2321 // re-load health configuration
2322
2323 static inline void health_free_all_nolock(RRDHOST *host) {
2324     while(host->templates)
2325         rrdcalctemplate_free(host, host->templates);
2326
2327     while(host->alarms)
2328         rrdcalc_free(host, host->alarms);
2329 }
2330
2331 void health_reload(void) {
2332     if(!health_enabled) {
2333         error("Health reload is requested, but health is not enabled.");
2334         return;
2335     }
2336
2337     char *path = health_config_dir();
2338
2339     // free all running alarms
2340     rrdhost_rwlock(&localhost);
2341     health_free_all_nolock(&localhost);
2342     rrdhost_unlock(&localhost);
2343
2344     // invalidate all previous entries in the alarm log
2345     ALARM_ENTRY *t;
2346     for(t = localhost.health_log.alarms ; t ; t = t->next) {
2347         if(t->new_status != RRDCALC_STATUS_REMOVED)
2348             t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
2349     }
2350
2351     // reset all thresholds to all charts
2352     RRDSET *st;
2353     for(st = localhost.rrdset_root; st ; st = st->next) {
2354         st->green = NAN;
2355         st->red = NAN;
2356     }
2357
2358     // load the new alarms
2359     rrdhost_rwlock(&localhost);
2360     health_readdir(path);
2361     rrdhost_unlock(&localhost);
2362
2363     // link the loaded alarms to their charts
2364     for(st = localhost.rrdset_root; st ; st = st->next) {
2365         rrdhost_rwlock(&localhost);
2366
2367         rrdsetcalc_link_matching(st);
2368         rrdcalctemplate_link_matching(st);
2369
2370         rrdhost_unlock(&localhost);
2371     }
2372 }
2373
2374 // ----------------------------------------------------------------------------
2375 // health main thread and friends
2376
2377 static inline int rrdcalc_value2status(calculated_number n) {
2378     if(isnan(n)) return RRDCALC_STATUS_UNDEFINED;
2379     if(n) return RRDCALC_STATUS_RAISED;
2380     return RRDCALC_STATUS_CLEAR;
2381 }
2382
2383 static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
2384     ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
2385
2386     // find the previous notification for the same alarm
2387     ALARM_ENTRY *t;
2388     for(t = ae->next; t ;t = t->next) {
2389         if(t->alarm_id == ae->alarm_id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
2390             break;
2391     }
2392
2393     if(t && t->new_status == ae->new_status) {
2394         // don't send the same notification again
2395         info("Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
2396         goto done;
2397     }
2398
2399     if((ae->old_status == RRDCALC_STATUS_UNDEFINED && ae->new_status == RRDCALC_STATUS_UNINITIALIZED)
2400         || (ae->old_status == RRDCALC_STATUS_UNINITIALIZED && ae->new_status == RRDCALC_STATUS_CLEAR)) {
2401         info("Health not sending notification for first initialization of alarm '%s.%s' status %s", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
2402         goto done;
2403     }
2404
2405     char buffer[FILENAME_MAX + 1];
2406     pid_t command_pid;
2407
2408     const char *exec = ae->exec;
2409     if(!exec) exec = health.health_default_exec;
2410
2411     const char *recipient = ae->recipient;
2412     if(!recipient) recipient = health.health_default_recipient;
2413
2414     snprintfz(buffer, FILENAME_MAX, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s'",
2415               exec,
2416               recipient,
2417               host->hostname,
2418               ae->unique_id,
2419               ae->alarm_id,
2420               ae->alarm_event_id,
2421               (unsigned long)ae->when,
2422               ae->name,
2423               ae->chart?ae->chart:"NOCAHRT",
2424               ae->family?ae->family:"NOFAMILY",
2425               rrdcalc_status2string(ae->new_status),
2426               rrdcalc_status2string(ae->old_status),
2427               ae->new_value,
2428               ae->old_value,
2429               ae->source?ae->source:"UNKNOWN",
2430               (uint32_t)ae->duration,
2431               (uint32_t)ae->non_clear_duration,
2432               ae->units?ae->units:"",
2433               ae->info?ae->info:""
2434     );
2435
2436     ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
2437     ae->exec_run_timestamp = time(NULL);
2438
2439     debug(D_HEALTH, "executing command '%s'", buffer);
2440     FILE *fp = mypopen(buffer, &command_pid);
2441     if(!fp) {
2442         error("HEALTH: Cannot popen(\"%s\", \"r\").", buffer);
2443         goto done;
2444     }
2445     debug(D_HEALTH, "HEALTH reading from command");
2446     char *s = fgets(buffer, FILENAME_MAX, fp);
2447     (void)s;
2448     ae->exec_code = mypclose(fp, command_pid);
2449     debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
2450
2451     if(ae->exec_code != 0)
2452         ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
2453
2454 done:
2455     health_alarm_log_save(host, ae);
2456     return;
2457 }
2458
2459 static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
2460     info("Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
2461          ae->chart?ae->chart:"NOCHART", ae->name,
2462          ae->new_value,
2463          rrdcalc_status2string(ae->old_status),
2464          rrdcalc_status2string(ae->new_status)
2465     );
2466
2467     health_alarm_execute(host, ae);
2468 }
2469
2470 static inline void health_alarm_log_process(RRDHOST *host) {
2471     static uint32_t stop_at_id = 0;
2472     uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
2473     time_t now = time(NULL);
2474
2475     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
2476
2477     ALARM_ENTRY *ae;
2478     for(ae = host->health_log.alarms; ae && ae->unique_id >= stop_at_id ; ae = ae->next) {
2479         if(unlikely(
2480             !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
2481             !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
2482             )) {
2483
2484             if(unlikely(ae->unique_id < first_waiting))
2485                 first_waiting = ae->unique_id;
2486
2487             if(likely(now >= ae->delay_up_to_timestamp))
2488                 health_process_notifications(host, ae);
2489         }
2490     }
2491
2492     // remember this for the next iteration
2493     stop_at_id = first_waiting;
2494
2495     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2496
2497     if(host->health_log.count <= host->health_log.max)
2498         return;
2499
2500     // cleanup excess entries in the log
2501     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
2502
2503     ALARM_ENTRY *last = NULL;
2504     unsigned int count = host->health_log.max * 2 / 3;
2505     for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
2506
2507     if(ae && last && last->next == ae)
2508         last->next = NULL;
2509     else
2510         ae = NULL;
2511
2512     while(ae) {
2513         debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
2514
2515         ALARM_ENTRY *t = ae->next;
2516
2517         freez(ae->name);
2518         freez(ae->chart);
2519         freez(ae->family);
2520         freez(ae->exec);
2521         freez(ae->recipient);
2522         freez(ae->source);
2523         freez(ae->units);
2524         freez(ae->info);
2525         freez(ae);
2526
2527         ae = t;
2528         host->health_log.count--;
2529     }
2530
2531     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2532 }
2533
2534 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
2535     if (unlikely(!rc->rrdset)) {
2536         debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
2537         return 0;
2538     }
2539
2540     if (unlikely(!rc->rrdset->last_collected_time.tv_sec)) {
2541         debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not yet collected.", rc->chart?rc->chart:"NOCHART", rc->name);
2542         return 0;
2543     }
2544
2545     if (unlikely(!rc->update_every)) {
2546         debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
2547         return 0;
2548     }
2549
2550     if (unlikely(rc->next_update > now)) {
2551         if (unlikely(*next_run > rc->next_update))
2552             *next_run = rc->next_update;
2553
2554         debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
2555         return 0;
2556     }
2557
2558     // FIXME
2559     // we should check that the DB lookup is possible
2560     // i.e.
2561     // - the duration of the chart includes the required timeframe
2562     // we SHOULD NOT check the dimensions - there might be alarms that refer non-existing dimensions (e.g. cpu steal)
2563
2564     return 1;
2565 }
2566
2567 void *health_main(void *ptr) {
2568     (void)ptr;
2569
2570     info("HEALTH thread created with task id %d", gettid());
2571
2572     if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
2573         error("Cannot set pthread cancel type to DEFERRED.");
2574
2575     if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
2576         error("Cannot set pthread cancel state to ENABLE.");
2577
2578     int min_run_every = (int)config_get_number("health", "run at least every seconds", 10);
2579     if(min_run_every < 1) min_run_every = 1;
2580
2581     BUFFER *wb = buffer_create(100);
2582
2583     unsigned int loop = 0;
2584     while(health_enabled && !netdata_exit) {
2585         loop++;
2586         debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
2587
2588         int oldstate, runnable = 0;
2589         time_t now = time(NULL);
2590         time_t next_run = now + min_run_every;
2591         RRDCALC *rc;
2592
2593         if (unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
2594             error("Cannot set pthread cancel state to DISABLE.");
2595
2596         rrdhost_rdlock(&localhost);
2597
2598         // the first loop is to lookup values from the db
2599         for (rc = localhost.alarms; rc; rc = rc->next) {
2600             if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run)))
2601                 continue;
2602
2603             runnable++;
2604             rc->old_value = rc->value;
2605
2606             // 1. if there is database lookup, do it
2607             // 2. if there is calculation expression, run it
2608
2609             if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
2610                 time_t old_db_timestamp = rc->db_before;
2611                 int value_is_null = 0;
2612
2613                 int ret = rrd2value(rc->rrdset, wb, &rc->value,
2614                                     rc->dimensions, 1, rc->after, rc->before, rc->group,
2615                                     rc->options, &rc->db_after, &rc->db_before, &value_is_null);
2616
2617                 if (unlikely(ret != 200)) {
2618                     // database lookup failed
2619                     rc->value = NAN;
2620
2621                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
2622
2623                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) {
2624                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
2625                         error("Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
2626                     }
2627                 }
2628                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))
2629                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
2630
2631                 if (unlikely(old_db_timestamp == rc->db_before)) {
2632                     // database is stale
2633
2634                     debug(D_HEALTH, "Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
2635
2636                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
2637                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
2638                         error("Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
2639                     }
2640                 }
2641                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
2642                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
2643
2644                 if (unlikely(value_is_null)) {
2645                     // collected value is null
2646
2647                     rc->value = NAN;
2648
2649                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
2650                           rc->chart?rc->chart:"NOCHART", rc->name);
2651
2652                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) {
2653                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
2654                         error("Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
2655                               rc->chart?rc->chart:"NOCHART", rc->name);
2656                     }
2657                 }
2658                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))
2659                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
2660
2661                 debug(D_HEALTH, "Health alarm '%s.%s': database lookup gave value "
2662                         CALCULATED_NUMBER_FORMAT, rc->chart?rc->chart:"NOCHART", rc->name, rc->value);
2663             }
2664
2665             if(unlikely(rc->calculation)) {
2666                 if (unlikely(!expression_evaluate(rc->calculation))) {
2667                     // calculation failed
2668
2669                     rc->value = NAN;
2670
2671                     debug(D_HEALTH, "Health alarm '%s.%s': failed to evaluate calculation with error: %s",
2672                           rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->calculation->error_msg));
2673
2674                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) {
2675                         rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
2676                         error("Health alarm '%s.%s': failed to evaluate calculation with error: %s",
2677                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->calculation->error_msg));
2678                     }
2679                 }
2680                 else {
2681                     if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))
2682                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
2683
2684                     debug(D_HEALTH, "Health alarm '%s.%s': calculation expression gave value "
2685                             CALCULATED_NUMBER_FORMAT
2686                             ": %s (source: %s)",
2687                           rc->chart?rc->chart:"NOCHART", rc->name,
2688                           rc->calculation->result,
2689                           buffer_tostring(rc->calculation->error_msg),
2690                           rc->source
2691                     );
2692
2693                     rc->value = rc->calculation->result;
2694                 }
2695             }
2696         }
2697         rrdhost_unlock(&localhost);
2698
2699         if (unlikely(runnable && !netdata_exit)) {
2700             rrdhost_rdlock(&localhost);
2701
2702             for (rc = localhost.alarms; rc; rc = rc->next) {
2703                 if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run)))
2704                     continue;
2705
2706                 int warning_status  = RRDCALC_STATUS_UNDEFINED;
2707                 int critical_status = RRDCALC_STATUS_UNDEFINED;
2708
2709                 if(likely(rc->warning)) {
2710                     if(unlikely(!expression_evaluate(rc->warning))) {
2711                         // calculation failed
2712
2713                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression failed with error: %s",
2714                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
2715
2716                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) {
2717                             rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
2718                             error("Health alarm '%s.%s': warning expression failed with error: %s",
2719                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
2720                         }
2721                     }
2722                     else {
2723                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))
2724                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
2725
2726                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression gave value "
2727                                 CALCULATED_NUMBER_FORMAT
2728                                 ": %s (source: %s)",
2729                               rc->chart?rc->chart:"NOCHART", rc->name,
2730                               rc->warning->result,
2731                               buffer_tostring(rc->warning->error_msg),
2732                               rc->source
2733                         );
2734
2735                         warning_status = rrdcalc_value2status(rc->warning->result);
2736                     }
2737                 }
2738
2739                 if(likely(rc->critical)) {
2740                     if(unlikely(!expression_evaluate(rc->critical))) {
2741                         // calculation failed
2742
2743                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression failed with error: %s",
2744                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
2745
2746                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) {
2747                             rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
2748                             error("Health alarm '%s.%s': critical expression failed with error: %s",
2749                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
2750                         }
2751                     }
2752                     else {
2753                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))
2754                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
2755
2756                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression gave value "
2757                                 CALCULATED_NUMBER_FORMAT
2758                                 ": %s (source: %s)",
2759                               rc->chart?rc->chart:"NOCHART", rc->name,
2760                               rc->critical->result,
2761                               buffer_tostring(rc->critical->error_msg),
2762                               rc->source
2763                         );
2764
2765                         critical_status = rrdcalc_value2status(rc->critical->result);
2766                     }
2767                 }
2768
2769                 int status = RRDCALC_STATUS_UNDEFINED;
2770
2771                 switch(warning_status) {
2772                     case RRDCALC_STATUS_CLEAR:
2773                         status = RRDCALC_STATUS_CLEAR;
2774                         break;
2775
2776                     case RRDCALC_STATUS_RAISED:
2777                         status = RRDCALC_STATUS_WARNING;
2778                         break;
2779
2780                     default:
2781                         break;
2782                 }
2783
2784                 switch(critical_status) {
2785                     case RRDCALC_STATUS_CLEAR:
2786                         if(status == RRDCALC_STATUS_UNDEFINED)
2787                             status = RRDCALC_STATUS_CLEAR;
2788                         break;
2789
2790                     case RRDCALC_STATUS_RAISED:
2791                         status = RRDCALC_STATUS_CRITICAL;
2792                         break;
2793
2794                     default:
2795                         break;
2796                 }
2797
2798                 if(status != rc->status) {
2799                     int delay = 0;
2800
2801                     if(now > rc->delay_up_to_timestamp) {
2802                         rc->delay_up_current = rc->delay_up_duration;
2803                         rc->delay_down_current = rc->delay_down_duration;
2804                         rc->delay_last = 0;
2805                         rc->delay_up_to_timestamp = 0;
2806                     }
2807                     else {
2808                         rc->delay_up_current = (int)(rc->delay_up_current * rc->delay_multiplier);
2809                         if(rc->delay_up_current > rc->delay_max_duration) rc->delay_up_current = rc->delay_max_duration;
2810
2811                         rc->delay_down_current = (int)(rc->delay_down_current * rc->delay_multiplier);
2812                         if(rc->delay_down_current > rc->delay_max_duration) rc->delay_down_current = rc->delay_max_duration;
2813                     }
2814
2815                     if(status > rc->status)
2816                         delay = rc->delay_up_current;
2817                     else
2818                         delay = rc->delay_down_current;
2819
2820                     // COMMENTED: because we do need to send raising alarms
2821                     // if(now + delay < rc->delay_up_to_timestamp)
2822                     //    delay = (int)(rc->delay_up_to_timestamp - now);
2823
2824                     rc->delay_last = delay;
2825                     rc->delay_up_to_timestamp = now + delay;
2826                     health_alarm_log(&localhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, rc->delay_last);
2827                     rc->last_status_change = now;
2828                     rc->status = status;
2829                 }
2830
2831                 rc->last_updated = now;
2832                 rc->next_update = now + rc->update_every;
2833
2834                 if (next_run > rc->next_update)
2835                     next_run = rc->next_update;
2836             }
2837
2838             rrdhost_unlock(&localhost);
2839         }
2840
2841         if (unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
2842             error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
2843
2844         if(unlikely(netdata_exit))
2845             break;
2846
2847         // execute notifications
2848         // and cleanup
2849         health_alarm_log_process(&localhost);
2850
2851         if(unlikely(netdata_exit))
2852             break;
2853         
2854         now = time(NULL);
2855         if(now < next_run) {
2856             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs",
2857                   loop, (int) (next_run - now));
2858             sleep_usec(1000000 * (unsigned long long) (next_run - now));
2859         }
2860         else {
2861             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
2862         }
2863     }
2864
2865     buffer_free(wb);
2866
2867     info("HEALTH thread exiting");
2868     pthread_exit(NULL);
2869     return NULL;
2870 }