]> arthur.barton.de Git - netdata.git/blob - src/health.c
properly load the alarm log; additional checks; alarms are now sequencial even on...
[netdata.git] / src / health.c
1 #include "common.h"
2
3 #define RRDVAR_MAX_LENGTH 1024
4
5 struct health_options {
6     const char *health_default_exec;
7     const char *health_default_recipient;
8     const char *log_filename;
9     FILE *log_fp;
10 };
11
12 static struct health_options health = {
13     .health_default_exec = PLUGINS_DIR "/alarm-notify.sh",
14     .health_default_recipient = "root",
15     .log_filename = VARLIB_DIR "/health/alarm_log.db",
16     .log_fp = NULL
17 };
18
19 int health_enabled = 1;
20
21 // ----------------------------------------------------------------------------
22 // health alarm log load/save
23 // no need for locking - only one thread is reading / writing the alarms log
24
25 static inline int health_alarm_log_open(void) {
26     if(health.log_fp)
27         fclose(health.log_fp);
28
29     health.log_fp = fopen(health.log_filename, "a");
30
31     if(health.log_fp) {
32         if (setvbuf(health.log_fp, NULL, _IOLBF, 0) != 0)
33             error("Health: cannot set line buffering on health log file.");
34         return 0;
35     }
36
37     error("Health: cannot open health log file '%s'. Health data will be lost in case of netdata or server crash.", health.log_filename);
38     return -1;
39 }
40
41 static inline void health_alarm_log_close(void) {
42     if(health.log_fp) {
43         fclose(health.log_fp);
44         health.log_fp = NULL;
45     }
46 }
47
48 static inline void health_log_recreate(void) {
49     if(health.log_fp != NULL) {
50         health_alarm_log_close();
51
52         // open it with truncate
53         health.log_fp = fopen(health.log_filename, "w");
54         if(health.log_fp) fclose(health.log_fp);
55         else error("Health: cannot truncate health log '%s'", health.log_filename);
56
57         health.log_fp = NULL;
58
59         health_alarm_log_open();
60     }
61 }
62
63 static inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
64     (void)host;
65     (void)ae;
66
67     if(likely(health.log_fp)) {
68         if(unlikely(fprintf(health.log_fp
69                 , "%c\t%s"
70                   "\t%08x\t%08x\t%08x\t%08x\t%08x"
71                   "\t%08x\t%08x\t%08x"
72                   "\t%08x\t%08x\t%08x"
73                   "\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s"
74                   "\t%d\t%d\t%d\t%d"
75                   "\t%Lf\t%Lf"
76                   "\n"
77                 , (ae->flags & HEALTH_ENTRY_FLAG_SAVED)?'U':'A'
78                 , host->hostname
79
80                 , ae->unique_id
81                 , ae->alarm_id
82                 , ae->alarm_event_id
83                 , ae->updated_by_id
84                 , ae->updates_id
85
86                 , (uint32_t)ae->when
87                 , (uint32_t)ae->duration
88                 , (uint32_t)ae->non_clear_duration
89                 , (uint32_t)ae->flags
90                 , (uint32_t)ae->exec_run_timestamp
91                 , (uint32_t)ae->delay_up_to_timestamp
92
93                 , (ae->name)?ae->name:""
94                 , (ae->chart)?ae->chart:""
95                 , (ae->family)?ae->family:""
96                 , (ae->exec)?ae->exec:""
97                 , (ae->recipient)?ae->recipient:""
98                 , (ae->source)?ae->source:""
99                 , (ae->units)?ae->units:""
100                 , (ae->info)?ae->info:""
101
102                 , ae->exec_code
103                 , ae->new_status
104                 , ae->old_status
105                 , ae->delay
106
107                 , (long double)ae->new_value
108                 , (long double)ae->old_value
109         ) < 0))
110             error("Health: failed to save alarm log entry. Health data may be lost in case of abnormal restart.");
111         else
112             ae->flags |= HEALTH_ENTRY_FLAG_SAVED;
113     }
114 }
115
116 static inline ssize_t health_alarm_log_load(RRDHOST *host) {
117     uint32_t max_unique_id = 0, max_alarm_id = 0;
118     ssize_t loaded = -1, updated = -1, errored = -1, duplicate = -1;
119     health_alarm_log_close();
120
121     FILE *fp = fopen(health.log_filename, "r");
122     if(!fp)
123         error("Health: cannot open health file: %s", health.log_filename);
124     else {
125         errno = 0;
126
127         char *s, *buf = mallocz(65536 + 1);
128         size_t line = 0, len = 0;
129         loaded = updated = errored = duplicate = 0;
130
131         pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
132
133         while((s = fgets_trim_len(buf, 65536, fp, &len))) {
134             line++;
135
136             int max_entries = 30, entries = 0;
137             char *pointers[max_entries];
138
139             pointers[entries++] = s++;
140             while(*s) {
141                 if(unlikely(*s == '\t')) {
142                     *s = '\0';
143                     pointers[entries++] = ++s;
144                     if(entries >= max_entries) {
145                         error("Health: line %zu of file '%s' has more than %d entries. Ignoring excessive entries.", line, health.log_filename, max_entries);
146                         break;
147                     }
148                 }
149                 else s++;
150             }
151
152             if(likely(*pointers[0] == 'U' || *pointers[0] == 'A')) {
153                 ALARM_ENTRY *ae = NULL;
154
155                 if(entries < 26) {
156                     error("Health: line %zu of file '%s' should have at least 26 entries, but it has %d. Ignoring line.", line, health.log_filename, entries);
157                     errored++;
158                     continue;
159                 }
160
161                 // check that we have valid ids
162                 uint32_t unique_id = (uint32_t)strtoul(pointers[2], NULL, 16);
163                 if(!unique_id) {
164                     error("Health: line %zu of file '%s' states alarm entry with unique id %u (%s). Ignoring line.", line, health.log_filename, unique_id, pointers[2]);
165                     errored++;
166                     continue;
167                 }
168
169                 uint32_t alarm_id = (uint32_t)strtoul(pointers[3], NULL, 16);
170                 if(!alarm_id) {
171                     error("Health: line %zu of file '%s' states alarm entry for alarm id %u (%s). Ignoring line.", line, health.log_filename, alarm_id, pointers[3]);
172                     errored++;
173                     continue;
174                 }
175
176                 // find a possible overwrite
177                 for(ae = host->health_log.alarms; ae ;ae = ae->next) {
178                     if(unlikely(ae->unique_id == unique_id)) {
179                         if(unlikely(*pointers[0] == 'A')) {
180                             error("Health: line %zu of file '%s' adds duplicate alarm log entry with unique id %u.", line, health.log_filename, unique_id);
181                             *pointers[0] = 'U';
182                             duplicate++;
183                         }
184                         break;
185                     }
186                 }
187
188                 // if not found, create a new one
189                 if(likely(!ae)) {
190                     // if it is an update, but we haven't found it, make it an addition
191                     if(unlikely(*pointers[0] == 'U')) {
192                         *pointers[0] = 'A';
193                         error("Health: line %zu of file '%s' updates alarm log entry with unique id %u, but it is not found.", line, health.log_filename, unique_id);
194                     }
195
196                     ae = callocz(1, sizeof(ALARM_ENTRY));
197                 }
198
199                 // check for a possible host missmatch
200                 if(strcmp(pointers[1], host->hostname))
201                     error("Health: line %zu of file '%s' provides an alarm for host '%s' but this is named '%s'.", line, health.log_filename, pointers[1], host->hostname);
202
203                 ae->unique_id               = unique_id;
204                 ae->alarm_id                = alarm_id;
205                 ae->alarm_event_id          = (uint32_t)strtoul(pointers[4], NULL, 16);
206                 ae->updated_by_id           = (uint32_t)strtoul(pointers[5], NULL, 16);
207                 ae->updates_id              = (uint32_t)strtoul(pointers[6], NULL, 16);
208
209                 ae->when                    = (uint32_t)strtoul(pointers[7], NULL, 16);
210                 ae->duration                = (uint32_t)strtoul(pointers[8], NULL, 16);
211                 ae->non_clear_duration      = (uint32_t)strtoul(pointers[9], NULL, 16);
212
213                 ae->flags                   = (uint32_t)strtoul(pointers[10], NULL, 16);
214                 ae->flags |= HEALTH_ENTRY_FLAG_SAVED;
215
216                 ae->exec_run_timestamp      = (uint32_t)strtoul(pointers[11], NULL, 16);
217                 ae->delay_up_to_timestamp   = (uint32_t)strtoul(pointers[12], NULL, 16);
218
219                 if(unlikely(ae->name)) freez(ae->name);
220                 ae->name = strdupz(pointers[13]);
221
222                 if(unlikely(ae->chart)) freez(ae->chart);
223                 ae->chart = strdupz(pointers[14]);
224
225                 if(unlikely(ae->family)) freez(ae->family);
226                 ae->family = strdupz(pointers[15]);
227
228                 if(unlikely(ae->exec)) freez(ae->exec);
229                 ae->exec = strdupz(pointers[16]);
230                 if(!*ae->exec) { freez(ae->exec); ae->exec = NULL; }
231
232                 if(unlikely(ae->recipient)) freez(ae->recipient);
233                 ae->recipient = strdupz(pointers[17]);
234                 if(!*ae->recipient) { freez(ae->recipient); ae->recipient = NULL; }
235
236                 if(unlikely(ae->source)) freez(ae->source);
237                 ae->source = strdupz(pointers[18]);
238                 if(!*ae->source) { freez(ae->source); ae->source = NULL; }
239
240                 if(unlikely(ae->units)) freez(ae->units);
241                 ae->units = strdupz(pointers[19]);
242                 if(!*ae->units) { freez(ae->units); ae->units = NULL; }
243
244                 if(unlikely(ae->info)) freez(ae->info);
245                 ae->info = strdupz(pointers[20]);
246                 if(!*ae->info) { freez(ae->info); ae->info = NULL; }
247
248                 ae->exec_code   = atoi(pointers[21]);
249                 ae->new_status  = atoi(pointers[22]);
250                 ae->old_status  = atoi(pointers[23]);
251                 ae->delay       = atoi(pointers[24]);
252
253                 ae->new_value   = strtold(pointers[25], NULL);
254                 ae->old_value   = strtold(pointers[26], NULL);
255
256                 // add it to host if not already there
257                 if(unlikely(*pointers[0] == 'A')) {
258                     ae->next = host->health_log.alarms;
259                     host->health_log.alarms = ae;
260                     loaded++;
261                 }
262                 else updated++;
263
264                 if(unlikely(ae->unique_id > max_unique_id))
265                     max_unique_id = ae->unique_id;
266
267                 if(unlikely(ae->alarm_id >= max_alarm_id))
268                     max_alarm_id = ae->alarm_id;
269             }
270             else {
271                 error("Health: line %zu of file '%s' is invalid (unrecognized entry type '%s').", line, health.log_filename, pointers[0]);
272                 errored++;
273             }
274         }
275
276         pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
277
278         freez(buf);
279         fclose(fp);
280     }
281
282     if(!max_unique_id) max_unique_id = (uint32_t)time(NULL);
283     if(!max_alarm_id)  max_alarm_id  = (uint32_t)time(NULL);
284
285     host->health_log.next_log_id = max_unique_id + 1;
286     host->health_log.next_alarm_id = max_alarm_id + 1;
287
288     fprintf(stderr, "Health loaded %zd alarms, updated %zd alarms, errors %zd entries, duplicate %zd\n", loaded, updated, errored, duplicate);
289
290     health_alarm_log_open();
291     return loaded;
292 }
293
294 // ----------------------------------------------------------------------------
295 // health alarm log management
296
297 static inline void health_alarm_log(RRDHOST *host,
298                 uint32_t alarm_id, uint32_t alarm_event_id,
299                 time_t when,
300                 const char *name, const char *chart, const char *family,
301                 const char *exec, const char *recipient, time_t duration,
302                 calculated_number old_value, calculated_number new_value,
303                 int old_status, int new_status,
304                 const char *source,
305                 const char *units,
306                 const char *info,
307                 int delay
308 ) {
309     debug(D_HEALTH, "Health adding alarm log entry with id: %u", host->health_log.next_log_id);
310
311     ALARM_ENTRY *ae = callocz(1, sizeof(ALARM_ENTRY));
312     ae->name = strdupz(name);
313     ae->hash_name = simple_hash(ae->name);
314
315     if(chart) {
316         ae->chart = strdupz(chart);
317         ae->hash_chart = simple_hash(ae->chart);
318     }
319
320     if(family)
321         ae->family = strdupz(family);
322
323     if(exec) ae->exec = strdupz(exec);
324     if(recipient) ae->recipient = strdupz(recipient);
325     if(source) ae->source = strdupz(source);
326     if(units) ae->units = strdupz(units);
327     if(info) ae->info = strdupz(info);
328
329     ae->unique_id = host->health_log.next_log_id++;
330     ae->alarm_id = alarm_id;
331     ae->alarm_event_id = alarm_event_id;
332     ae->when = when;
333     ae->old_value = old_value;
334     ae->new_value = new_value;
335     ae->old_status = old_status;
336     ae->new_status = new_status;
337     ae->duration = duration;
338     ae->delay = delay;
339     ae->delay_up_to_timestamp = when + delay;
340
341     if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL)
342         ae->non_clear_duration += ae->duration;
343
344     // link it
345     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
346     ae->next = host->health_log.alarms;
347     host->health_log.alarms = ae;
348     host->health_log.count++;
349     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
350
351     // match previous alarms
352     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
353     ALARM_ENTRY *t;
354     for(t = host->health_log.alarms ; t ; t = t->next) {
355         if(t != ae && t->alarm_id == ae->alarm_id) {
356             if(!(t->flags & HEALTH_ENTRY_FLAG_UPDATED) && !t->updated_by_id) {
357                 t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
358                 t->updated_by_id = ae->unique_id;
359                 ae->updates_id = t->unique_id;
360
361                 if((t->new_status == RRDCALC_STATUS_WARNING || t->new_status == RRDCALC_STATUS_CRITICAL) &&
362                    (t->old_status == RRDCALC_STATUS_WARNING || t->old_status == RRDCALC_STATUS_CRITICAL))
363                     ae->non_clear_duration += t->non_clear_duration;
364
365                 health_alarm_log_save(host, t);
366             }
367
368             // no need to continue
369             break;
370         }
371     }
372     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
373
374     health_alarm_log_save(host, ae);
375 }
376
377 // ----------------------------------------------------------------------------
378 // RRDVAR management
379
// Rewrites 'variable' in place so that it only contains alphanumerics,
// '.' and '_': every other character is replaced with '_'.
// Returns the number of characters replaced.
static inline int rrdvar_fix_name(char *variable) {
    int fixed = 0;

    for( ; *variable ; variable++) {
        // <ctype.h> functions need a value representable as unsigned char;
        // passing a negative plain char is undefined behaviour
        unsigned char c = (unsigned char)*variable;

        if(!isalnum(c) && c != '.' && c != '_') {
            *variable = '_';
            fixed++;
        }
    }

    return fixed;
}
393
394 int rrdvar_compare(void* a, void* b) {
395     if(((RRDVAR *)a)->hash < ((RRDVAR *)b)->hash) return -1;
396     else if(((RRDVAR *)a)->hash > ((RRDVAR *)b)->hash) return 1;
397     else return strcmp(((RRDVAR *)a)->name, ((RRDVAR *)b)->name);
398 }
399
400 static inline RRDVAR *rrdvar_index_add(avl_tree_lock *tree, RRDVAR *rv) {
401     RRDVAR *ret = (RRDVAR *)avl_insert_lock(tree, (avl *)(rv));
402     if(ret != rv)
403         debug(D_VARIABLES, "Request to insert RRDVAR '%s' into index failed. Already exists.", rv->name);
404
405     return ret;
406 }
407
408 static inline RRDVAR *rrdvar_index_del(avl_tree_lock *tree, RRDVAR *rv) {
409     RRDVAR *ret = (RRDVAR *)avl_remove_lock(tree, (avl *)(rv));
410     if(!ret)
411         error("Request to remove RRDVAR '%s' from index failed. Not Found.", rv->name);
412
413     return ret;
414 }
415
416 static inline RRDVAR *rrdvar_index_find(avl_tree_lock *tree, const char *name, uint32_t hash) {
417     RRDVAR tmp;
418     tmp.name = (char *)name;
419     tmp.hash = (hash)?hash:simple_hash(tmp.name);
420
421     return (RRDVAR *)avl_search_lock(tree, (avl *)&tmp);
422 }
423
424 static inline void rrdvar_free(RRDHOST *host, avl_tree_lock *tree, RRDVAR *rv) {
425     (void)host;
426
427     if(!rv) return;
428
429     if(tree)
430         rrdvar_index_del(tree, rv);
431
432     freez(rv->name);
433     freez(rv);
434 }
435
436 static inline RRDVAR *rrdvar_create_and_index(const char *scope, avl_tree_lock *tree, const char *name, int type, calculated_number *value) {
437     char *variable = strdupz(name);
438     rrdvar_fix_name(variable);
439     uint32_t hash = simple_hash(variable);
440
441     RRDVAR *rv = rrdvar_index_find(tree, variable, hash);
442     if(unlikely(!rv)) {
443         debug(D_VARIABLES, "Variable '%s' not found in scope '%s'. Creating a new one.", variable, scope);
444
445         rv = callocz(1, sizeof(RRDVAR));
446         rv->name = variable;
447         rv->hash = hash;
448         rv->type = type;
449         rv->value = value;
450
451         RRDVAR *ret = rrdvar_index_add(tree, rv);
452         if(unlikely(ret != rv)) {
453             debug(D_VARIABLES, "Variable '%s' in scope '%s' already exists", variable, scope);
454             rrdvar_free(NULL, NULL, rv);
455             rv = NULL;
456         }
457         else
458             debug(D_VARIABLES, "Variable '%s' created in scope '%s'", variable, scope);
459     }
460     else {
461         // already exists
462         freez(variable);
463         rv = NULL;
464     }
465
466     return rv;
467 }
468
469 // ----------------------------------------------------------------------------
470 // RRDVAR lookup
471
472 calculated_number rrdvar2number(RRDVAR *rv) {
473     switch(rv->type) {
474         case RRDVAR_TYPE_CALCULATED: {
475             calculated_number *n = (calculated_number *)rv->value;
476             return *n;
477         }
478
479         case RRDVAR_TYPE_TIME_T: {
480             time_t *n = (time_t *)rv->value;
481             return *n;
482         }
483
484         case RRDVAR_TYPE_COLLECTED: {
485             collected_number *n = (collected_number *)rv->value;
486             return *n;
487         }
488
489         case RRDVAR_TYPE_TOTAL: {
490             total_number *n = (total_number *)rv->value;
491             return *n;
492         }
493
494         case RRDVAR_TYPE_INT: {
495             int *n = (int *)rv->value;
496             return *n;
497         }
498
499         default:
500             error("I don't know how to convert RRDVAR type %d to calculated_number", rv->type);
501             return NAN;
502     }
503 }
504
505 void dump_variable(void *data) {
506     RRDVAR *rv = (RRDVAR *)data;
507     debug(D_HEALTH, "%50s : %20.5Lf", rv->name, rrdvar2number(rv));
508 }
509
510 int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, calculated_number *result) {
511     RRDSET *st = rc->rrdset;
512     RRDVAR *rv;
513
514     if(!st) return 0;
515
516     rv = rrdvar_index_find(&st->variables_root_index, variable, hash);
517     if(rv) {
518         *result = rrdvar2number(rv);
519         return 1;
520     }
521
522     rv = rrdvar_index_find(&st->rrdfamily->variables_root_index, variable, hash);
523     if(rv) {
524         *result = rrdvar2number(rv);
525         return 1;
526     }
527
528     rv = rrdvar_index_find(&st->rrdhost->variables_root_index, variable, hash);
529     if(rv) {
530         *result = rrdvar2number(rv);
531         return 1;
532     }
533
534     debug(D_HEALTH, "Available local chart '%s' variables:", st->id);
535     avl_traverse_lock(&st->variables_root_index, dump_variable);
536
537     debug(D_HEALTH, "Available family '%s' variables:", st->rrdfamily->family);
538     avl_traverse_lock(&st->rrdfamily->variables_root_index, dump_variable);
539
540     debug(D_HEALTH, "Available host '%s' variables:", st->rrdhost->hostname);
541     avl_traverse_lock(&st->rrdhost->variables_root_index, dump_variable);
542
543     return 0;
544 }
545
546 // ----------------------------------------------------------------------------
547 // RRDSETVAR management
548
549 RRDSETVAR *rrdsetvar_create(RRDSET *st, const char *variable, int type, void *value, uint32_t options) {
550     debug(D_VARIABLES, "RRDVARSET create for chart id '%s' name '%s' with variable name '%s'", st->id, st->name, variable);
551     RRDSETVAR *rs = (RRDSETVAR *)callocz(1, sizeof(RRDSETVAR));
552
553     char buffer[RRDVAR_MAX_LENGTH + 1];
554     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->id, variable);
555     rs->fullid = strdupz(buffer);
556
557     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, variable);
558     rs->fullname = strdupz(buffer);
559
560     rs->variable = strdupz(variable);
561
562     rs->type = type;
563     rs->value = value;
564     rs->options = options;
565     rs->rrdset = st;
566
567     rs->local       = rrdvar_create_and_index("local",  &st->variables_root_index, rs->variable, rs->type, rs->value);
568     rs->family      = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullid, rs->type, rs->value);
569     rs->host        = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullid, rs->type, rs->value);
570     rs->family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullname, rs->type, rs->value);
571     rs->host_name   = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullname, rs->type, rs->value);
572
573     rs->next = st->variables;
574     st->variables = rs;
575
576     return rs;
577 }
578
579 void rrdsetvar_rename_all(RRDSET *st) {
580     debug(D_VARIABLES, "RRDSETVAR rename for chart id '%s' name '%s'", st->id, st->name);
581
582     // only these 2 can change name
583     // rs->family_name
584     // rs->host_name
585
586     char buffer[RRDVAR_MAX_LENGTH + 1];
587     RRDSETVAR *rs, *next = st->variables;
588     while((rs = next)) {
589         next = rs->next;
590
591         snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rs->variable);
592
593         if (strcmp(buffer, rs->fullname)) {
594             // name changed
595             rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
596             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_name);
597
598             freez(rs->fullname);
599             rs->fullname = strdupz(st->name);
600             rs->family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullname, rs->type, rs->value);
601             rs->host_name   = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullname, rs->type, rs->value);
602         }
603     }
604
605     rrdsetcalc_link_matching(st);
606 }
607
608 void rrdsetvar_free(RRDSETVAR *rs) {
609     RRDSET *st = rs->rrdset;
610     debug(D_VARIABLES, "RRDSETVAR free for chart id '%s' name '%s', variable '%s'", st->id, st->name, rs->variable);
611
612     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local);
613     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family);
614     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host);
615     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
616     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_name);
617
618     if(st->variables == rs) {
619         st->variables = rs->next;
620     }
621     else {
622         RRDSETVAR *t;
623         for (t = st->variables; t && t->next != rs; t = t->next);
624         if(!t) error("RRDSETVAR '%s' not found in chart '%s' variables linked list", rs->fullname, st->id);
625         else t->next = rs->next;
626     }
627
628     freez(rs->fullid);
629     freez(rs->fullname);
630     freez(rs->variable);
631     freez(rs);
632 }
633
634 // ----------------------------------------------------------------------------
635 // RRDDIMVAR management
636
637 #define RRDDIMVAR_ID_MAX 1024
638
639 RRDDIMVAR *rrddimvar_create(RRDDIM *rd, int type, const char *prefix, const char *suffix, void *value, uint32_t options) {
640     RRDSET *st = rd->rrdset;
641
642     debug(D_VARIABLES, "RRDDIMSET create for chart id '%s' name '%s', dimension id '%s', name '%s%s%s'", st->id, st->name, rd->id, (prefix)?prefix:"", rd->name, (suffix)?suffix:"");
643
644     if(!prefix) prefix = "";
645     if(!suffix) suffix = "";
646
647     char buffer[RRDDIMVAR_ID_MAX + 1];
648     RRDDIMVAR *rs = (RRDDIMVAR *)callocz(1, sizeof(RRDDIMVAR));
649
650     rs->prefix = strdupz(prefix);
651     rs->suffix = strdupz(suffix);
652
653     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->id, rs->suffix);
654     rs->id = strdupz(buffer);
655
656     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
657     rs->name = strdupz(buffer);
658
659     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->id, rs->id);
660     rs->fullidid = strdupz(buffer);
661
662     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->id, rs->name);
663     rs->fullidname = strdupz(buffer);
664
665     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->name, rs->id);
666     rs->fullnameid = strdupz(buffer);
667
668     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->name, rs->name);
669     rs->fullnamename = strdupz(buffer);
670
671     rs->type = type;
672     rs->value = value;
673     rs->options = options;
674     rs->rrddim = rd;
675
676     rs->local_id     = rrdvar_create_and_index("local", &st->variables_root_index, rs->id, rs->type, rs->value);
677     rs->local_name   = rrdvar_create_and_index("local", &st->variables_root_index, rs->name, rs->type, rs->value);
678
679     rs->family_id    = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->id, rs->type, rs->value);
680     rs->family_name  = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->name, rs->type, rs->value);
681
682     rs->host_fullidid     = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullidid, rs->type, rs->value);
683     rs->host_fullidname   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullidname, rs->type, rs->value);
684     rs->host_fullnameid   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullnameid, rs->type, rs->value);
685     rs->host_fullnamename = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullnamename, rs->type, rs->value);
686
687     rs->next = rd->variables;
688     rd->variables = rs;
689
690     return rs;
691 }
692
693 void rrddimvar_rename_all(RRDDIM *rd) {
694     RRDSET *st = rd->rrdset;
695     debug(D_VARIABLES, "RRDDIMSET rename for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
696
697     RRDDIMVAR *rs, *next = rd->variables;
698     while((rs = next)) {
699         next = rs->next;
700
701         if (strcmp(rd->name, rs->name)) {
702             char buffer[RRDDIMVAR_ID_MAX + 1];
703             // name changed
704
705             // name
706             rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_name);
707             freez(rs->name);
708             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
709             rs->name = strdupz(buffer);
710             rs->local_name = rrdvar_create_and_index("local", &st->variables_root_index, rs->name, rs->type, rs->value);
711
712             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidname);
713             freez(rs->fullidname);
714             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->name);
715             rs->fullidname = strdupz(buffer);
716             rs->host_fullidname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index,
717                                                              rs->fullidname, rs->type, rs->value);
718
719             // fullnameid
720             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnameid);
721             freez(rs->fullnameid);
722             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->id);
723             rs->fullnameid = strdupz(buffer);
724             rs->host_fullnameid = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index,
725                                                           rs->fullnameid, rs->type, rs->value);
726
727             // fullnamename
728             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnamename);
729             freez(rs->fullnamename);
730             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->name);
731             rs->fullnamename = strdupz(buffer);
732             rs->host_fullnamename = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index,
733                                                           rs->fullnamename, rs->type, rs->value);
734         }
735     }
736 }
737
738 void rrddimvar_free(RRDDIMVAR *rs) {
739     RRDDIM *rd = rs->rrddim;
740     RRDSET *st = rd->rrdset;
741     debug(D_VARIABLES, "RRDDIMSET free for chart id '%s' name '%s', dimension id '%s', name '%s', prefix='%s', suffix='%s'", st->id, st->name, rd->id, rd->name, rs->prefix, rs->suffix);
742
743     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_id);
744     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_name);
745
746     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_id);
747     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
748
749     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidid);
750     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidname);
751     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnameid);
752     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnamename);
753
754     if(rd->variables == rs) {
755         debug(D_VARIABLES, "RRDDIMSET removing first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
756         rd->variables = rs->next;
757     }
758     else {
759         debug(D_VARIABLES, "RRDDIMSET removing non-first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
760         RRDDIMVAR *t;
761         for (t = rd->variables; t && t->next != rs; t = t->next) ;
762         if(!t) error("RRDDIMVAR '%s' not found in dimension '%s/%s' variables linked list", rs->name, st->id, rd->id);
763         else t->next = rs->next;
764     }
765
766     freez(rs->prefix);
767     freez(rs->suffix);
768     freez(rs->id);
769     freez(rs->name);
770     freez(rs->fullidid);
771     freez(rs->fullidname);
772     freez(rs->fullnameid);
773     freez(rs->fullnamename);
774     freez(rs);
775 }
776
777 // ----------------------------------------------------------------------------
778 // RRDCALC management
779
780 static inline const char *rrdcalc_status2string(int status) {
781     switch(status) {
782         case RRDCALC_STATUS_REMOVED:
783             return "REMOVED";
784
785         case RRDCALC_STATUS_UNDEFINED:
786             return "UNDEFINED";
787
788         case RRDCALC_STATUS_UNINITIALIZED:
789             return "UNINITIALIZED";
790
791         case RRDCALC_STATUS_CLEAR:
792             return "CLEAR";
793
794         case RRDCALC_STATUS_RAISED:
795             return "RAISED";
796
797         case RRDCALC_STATUS_WARNING:
798             return "WARNING";
799
800         case RRDCALC_STATUS_CRITICAL:
801             return "CRITICAL";
802
803         default:
804             error("Unknown alarm status %d", status);
805             return "UNKNOWN";
806     }
807 }
808
809 static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
810     debug(D_HEALTH, "Health linking alarm '%s.%s' to chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, st->rrdhost->hostname);
811
812     rc->last_status_change = time(NULL);
813     rc->rrdset = st;
814
815     rc->rrdset_next = st->alarms;
816     rc->rrdset_prev = NULL;
817     
818     if(rc->rrdset_next)
819         rc->rrdset_next->rrdset_prev = rc;
820
821     st->alarms = rc;
822
823     if(rc->update_every < rc->rrdset->update_every) {
824         error("Health alarm '%s.%s' has update every %d, less than chart update every %d. Setting alarm update frequency to %d.", rc->rrdset->id, rc->name, rc->update_every, rc->rrdset->update_every, rc->rrdset->update_every);
825         rc->update_every = rc->rrdset->update_every;
826     }
827
828     if(!isnan(rc->green) && isnan(st->green)) {
829         debug(D_HEALTH, "Health alarm '%s.%s' green threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->green, rc->green);
830         st->green = rc->green;
831     }
832
833     if(!isnan(rc->red) && isnan(st->red)) {
834         debug(D_HEALTH, "Health alarm '%s.%s' red threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->red, rc->red);
835         st->red = rc->red;
836     }
837
838     rc->local  = rrdvar_create_and_index("local",  &st->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
839     rc->family = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
840
841     char fullname[RRDVAR_MAX_LENGTH + 1];
842     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->id, rc->name);
843     rc->hostid   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
844
845     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rc->name);
846     rc->hostname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
847
848         if(!rc->units) rc->units = strdupz(st->units);
849
850     {
851         time_t now = time(NULL);
852         health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_UNINITIALIZED, rc->source, rc->units, rc->info, 0);
853     }
854 }
855
856 static inline int rrdcalc_is_matching_this_rrdset(RRDCALC *rc, RRDSET *st) {
857     if(     (rc->hash_chart == st->hash      && !strcmp(rc->chart, st->id)) ||
858             (rc->hash_chart == st->hash_name && !strcmp(rc->chart, st->name)))
859         return 1;
860
861     return 0;
862 }
863
864 // this has to be called while the RRDHOST is locked
865 inline void rrdsetcalc_link_matching(RRDSET *st) {
866     // debug(D_HEALTH, "find matching alarms for chart '%s'", st->id);
867
868     RRDCALC *rc;
869     for(rc = st->rrdhost->alarms; rc ; rc = rc->next) {
870         if(unlikely(rc->rrdset))
871             continue;
872
873         if(unlikely(rrdcalc_is_matching_this_rrdset(rc, st)))
874             rrdsetcalc_link(st, rc);
875     }
876 }
877
878 // this has to be called while the RRDHOST is locked
879 inline void rrdsetcalc_unlink(RRDCALC *rc) {
880     RRDSET *st = rc->rrdset;
881
882     if(!st) {
883         debug(D_HEALTH, "Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
884         error("Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
885         return;
886     }
887
888     {
889         time_t now = time(NULL);
890         health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0);
891     }
892
893     RRDHOST *host = st->rrdhost;
894
895     debug(D_HEALTH, "Health unlinking alarm '%s.%s' from chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, host->hostname);
896
897     // unlink it
898     if(rc->rrdset_prev)
899         rc->rrdset_prev->rrdset_next = rc->rrdset_next;
900
901     if(rc->rrdset_next)
902         rc->rrdset_next->rrdset_prev = rc->rrdset_prev;
903
904     if(st->alarms == rc)
905         st->alarms = rc->rrdset_next;
906
907     rc->rrdset_prev = rc->rrdset_next = NULL;
908
909     rrdvar_free(st->rrdhost, &st->variables_root_index, rc->local);
910     rc->local = NULL;
911
912     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rc->family);
913     rc->family = NULL;
914
915     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostid);
916     rc->hostid = NULL;
917
918     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostname);
919     rc->hostname = NULL;
920
921     rc->rrdset = NULL;
922
923     // RRDCALC will remain in RRDHOST
924     // so that if the matching chart is found in the future
925     // it will be applied automatically
926 }
927
928 RRDCALC *rrdcalc_find(RRDSET *st, const char *name) {
929     RRDCALC *rc;
930     uint32_t hash = simple_hash(name);
931
932     for( rc = st->alarms; rc ; rc = rc->rrdset_next ) {
933         if(unlikely(rc->hash == hash && !strcmp(rc->name, name)))
934             return rc;
935     }
936
937     return NULL;
938 }
939
940 static inline int rrdcalc_exists(RRDHOST *host, const char *chart, const char *name, uint32_t hash_chart, uint32_t hash_name) {
941     RRDCALC *rc;
942
943     if(unlikely(!chart)) {
944         error("attempt to find RRDCALC '%s' without giving a chart name", name);
945         return 1;
946     }
947
948     if(unlikely(!hash_chart)) hash_chart = simple_hash(chart);
949     if(unlikely(!hash_name))  hash_name  = simple_hash(name);
950
951     // make sure it does not already exist
952     for(rc = host->alarms; rc ; rc = rc->next) {
953         if (unlikely(rc->chart && rc->hash == hash_name && rc->hash_chart == hash_chart && !strcmp(name, rc->name) && !strcmp(chart, rc->chart))) {
954             debug(D_HEALTH, "Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
955             error("Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
956             return 1;
957         }
958     }
959
960     return 0;
961 }
962
963 static inline uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const char *name, uint32_t *next_event_id) {
964     if(chart && name) {
965         uint32_t hash_chart = simple_hash(chart);
966         uint32_t hash_name = simple_hash(name);
967
968         // re-use old IDs, by looking them up in the alarm log
969         ALARM_ENTRY *ae;
970         for(ae = host->health_log.alarms; ae ;ae = ae->next) {
971             if(unlikely(ae->hash_name == hash_name && ae->hash_chart == hash_chart && !strcmp(name, ae->name) && !strcmp(chart, ae->chart))) {
972                 if(next_event_id) *next_event_id = ae->alarm_event_id + 1;
973                 return ae->alarm_id;
974             }
975         }
976     }
977
978     return host->health_log.next_alarm_id++;
979 }
980
981 static inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) {
982     rrdhost_check_rdlock(host);
983
984     if(rc->calculation) {
985         rc->calculation->status = &rc->status;
986         rc->calculation->this = &rc->value;
987         rc->calculation->after = &rc->db_after;
988         rc->calculation->before = &rc->db_before;
989         rc->calculation->rrdcalc = rc;
990     }
991
992     if(rc->warning) {
993         rc->warning->status = &rc->status;
994         rc->warning->this = &rc->value;
995         rc->warning->after = &rc->db_after;
996         rc->warning->before = &rc->db_before;
997         rc->warning->rrdcalc = rc;
998     }
999
1000     if(rc->critical) {
1001         rc->critical->status = &rc->status;
1002         rc->critical->this = &rc->value;
1003         rc->critical->after = &rc->db_after;
1004         rc->critical->before = &rc->db_before;
1005         rc->critical->rrdcalc = rc;
1006     }
1007
1008     // link it to the host
1009     if(likely(host->alarms)) {
1010         // append it
1011         RRDCALC *t;
1012         for(t = host->alarms; t && t->next ; t = t->next) ;
1013         t->next = rc;
1014     }
1015     else {
1016         host->alarms = rc;
1017     }
1018
1019     // link it to its chart
1020     RRDSET *st;
1021     for(st = host->rrdset_root; st ; st = st->next) {
1022         if(rrdcalc_is_matching_this_rrdset(rc, st)) {
1023             rrdsetcalc_link(st, rc);
1024             break;
1025         }
1026     }
1027 }
1028
1029 static inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart) {
1030
1031     debug(D_HEALTH, "Health creating dynamic alarm (from template) '%s.%s'", chart, rt->name);
1032
1033     if(rrdcalc_exists(host, chart, rt->name, 0, 0))
1034         return NULL;
1035
1036     RRDCALC *rc = callocz(1, sizeof(RRDCALC));
1037     rc->next_event_id = 1;
1038     rc->id = rrdcalc_get_unique_id(host, chart, rt->name, &rc->next_event_id);
1039     rc->name = strdupz(rt->name);
1040     rc->hash = simple_hash(rc->name);
1041     rc->chart = strdupz(chart);
1042     rc->hash_chart = simple_hash(rc->chart);
1043
1044     if(rt->dimensions) rc->dimensions = strdupz(rt->dimensions);
1045
1046     rc->green = rt->green;
1047     rc->red = rt->red;
1048     rc->value = NAN;
1049     rc->old_value = NAN;
1050
1051     rc->delay_up_duration = rt->delay_up_duration;
1052     rc->delay_down_duration = rt->delay_down_duration;
1053     rc->delay_max_duration = rt->delay_max_duration;
1054     rc->delay_multiplier = rt->delay_multiplier;
1055
1056     rc->group = rt->group;
1057     rc->after = rt->after;
1058     rc->before = rt->before;
1059     rc->update_every = rt->update_every;
1060     rc->options = rt->options;
1061
1062     if(rt->exec) rc->exec = strdupz(rt->exec);
1063     if(rt->recipient) rc->recipient = strdupz(rt->recipient);
1064     if(rt->source) rc->source = strdupz(rt->source);
1065     if(rt->units) rc->units = strdupz(rt->units);
1066     if(rt->info) rc->info = strdupz(rt->info);
1067
1068     if(rt->calculation) {
1069         rc->calculation = expression_parse(rt->calculation->source, NULL, NULL);
1070         if(!rc->calculation)
1071             error("Health alarm '%s.%s': failed to parse calculation expression '%s'", chart, rt->name, rt->calculation->source);
1072     }
1073     if(rt->warning) {
1074         rc->warning = expression_parse(rt->warning->source, NULL, NULL);
1075         if(!rc->warning)
1076             error("Health alarm '%s.%s': failed to re-parse warning expression '%s'", chart, rt->name, rt->warning->source);
1077     }
1078     if(rt->critical) {
1079         rc->critical = expression_parse(rt->critical->source, NULL, NULL);
1080         if(!rc->critical)
1081             error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", chart, rt->name, rt->critical->source);
1082     }
1083
1084     debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1085           (rc->chart)?rc->chart:"NOCHART",
1086           rc->name,
1087           (rc->exec)?rc->exec:"DEFAULT",
1088           (rc->recipient)?rc->recipient:"DEFAULT",
1089           rc->green,
1090           rc->red,
1091           rc->group,
1092           rc->after,
1093           rc->before,
1094           rc->options,
1095           (rc->dimensions)?rc->dimensions:"NONE",
1096           rc->update_every,
1097           (rc->calculation)?rc->calculation->parsed_as:"NONE",
1098           (rc->warning)?rc->warning->parsed_as:"NONE",
1099           (rc->critical)?rc->critical->parsed_as:"NONE",
1100           rc->source,
1101           rc->delay_up_duration,
1102           rc->delay_down_duration,
1103           rc->delay_max_duration,
1104           rc->delay_multiplier
1105     );
1106
1107     rrdcalc_create_part2(host, rc);
1108     return rc;
1109 }
1110
1111 void rrdcalc_free(RRDHOST *host, RRDCALC *rc) {
1112     if(!rc) return;
1113
1114     debug(D_HEALTH, "Health removing alarm '%s.%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1115
1116     // unlink it from RRDSET
1117     if(rc->rrdset) rrdsetcalc_unlink(rc);
1118
1119     // unlink it from RRDHOST
1120     if(unlikely(rc == host->alarms))
1121         host->alarms = rc->next;
1122
1123     else if(likely(host->alarms)) {
1124         RRDCALC *t, *last = host->alarms;
1125         for(t = last->next; t && t != rc; last = t, t = t->next) ;
1126         if(last->next == rc)
1127             last->next = rc->next;
1128         else
1129             error("Cannot unlink alarm '%s.%s' from host '%s': not found", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1130     }
1131     else
1132         error("Cannot unlink unlink '%s.%s' from host '%s': This host does not have any calculations", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1133
1134     expression_free(rc->calculation);
1135     expression_free(rc->warning);
1136     expression_free(rc->critical);
1137
1138     freez(rc->name);
1139     freez(rc->chart);
1140     freez(rc->family);
1141     freez(rc->dimensions);
1142     freez(rc->exec);
1143     freez(rc->recipient);
1144     freez(rc->source);
1145     freez(rc->units);
1146     freez(rc->info);
1147     freez(rc);
1148 }
1149
1150 // ----------------------------------------------------------------------------
1151 // RRDCALCTEMPLATE management
1152
1153 void rrdcalctemplate_link_matching(RRDSET *st) {
1154     RRDCALCTEMPLATE *rt;
1155
1156     for(rt = st->rrdhost->templates; rt ; rt = rt->next) {
1157         if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context)) {
1158             RRDCALC *rc = rrdcalc_create(st->rrdhost, rt, st->id);
1159             if(unlikely(!rc))
1160                 error("Health tried to create alarm from template '%s', but it failed", rt->name);
1161
1162 #ifdef NETDATA_INTERNAL_CHECKS
1163             else if(rc->rrdset != st)
1164                 error("Health alarm '%s.%s' should be linked to chart '%s', but it is not", rc->chart?rc->chart:"NOCHART", rc->name, st->id);
1165 #endif
1166         }
1167     }
1168 }
1169
1170 static inline void rrdcalctemplate_free(RRDHOST *host, RRDCALCTEMPLATE *rt) {
1171     debug(D_HEALTH, "Health removing template '%s' of host '%s'", rt->name, host->hostname);
1172
1173     if(host->templates) {
1174         if(host->templates == rt) {
1175             host->templates = rt->next;
1176         }
1177         else {
1178             RRDCALCTEMPLATE *t, *last = host->templates;
1179             for (t = last->next; t && t != rt; last = t, t = t->next ) ;
1180             if(last && last->next == rt) {
1181                 last->next = rt->next;
1182                 rt->next = NULL;
1183             }
1184             else
1185                 error("Cannot find RRDCALCTEMPLATE '%s' linked in host '%s'", rt->name, host->hostname);
1186         }
1187     }
1188
1189     expression_free(rt->calculation);
1190     expression_free(rt->warning);
1191     expression_free(rt->critical);
1192
1193     freez(rt->name);
1194     freez(rt->exec);
1195     freez(rt->recipient);
1196     freez(rt->context);
1197     freez(rt->source);
1198     freez(rt->units);
1199     freez(rt->info);
1200     freez(rt->dimensions);
1201     freez(rt);
1202 }
1203
1204 // ----------------------------------------------------------------------------
1205 // load health configuration
1206
// size of the buffer health_readfile() uses to read a configuration line
// (lines ending with a backslash are joined in the same buffer)
#define HEALTH_CONF_MAX_LINE 4096

// keywords of the health configuration files;
// health_readfile() calculates the hash of each one of them, once.
// HEALTH_EVERY_KEY is also accepted as an option of the database lookup
// (see health_parse_db_lookup()).
#define HEALTH_ALARM_KEY "alarm"
#define HEALTH_TEMPLATE_KEY "template"
#define HEALTH_ON_KEY "on"
#define HEALTH_LOOKUP_KEY "lookup"
#define HEALTH_CALC_KEY "calc"
#define HEALTH_EVERY_KEY "every"
#define HEALTH_GREEN_KEY "green"
#define HEALTH_RED_KEY "red"
#define HEALTH_WARN_KEY "warn"
#define HEALTH_CRIT_KEY "crit"
#define HEALTH_EXEC_KEY "exec"
#define HEALTH_RECIPIENT_KEY "to"
#define HEALTH_UNITS_KEY "units"
#define HEALTH_INFO_KEY "info"
#define HEALTH_DELAY_KEY "delay"
1224
1225 static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
1226     if(!rc->chart) {
1227         error("Health configuration for alarm '%s' does not have a chart", rc->name);
1228         return 0;
1229     }
1230
1231     if(!rc->update_every) {
1232         error("Health configuration for alarm '%s.%s' has no frequency (parameter 'every'). Ignoring it.", rc->chart?rc->chart:"NOCHART", rc->name);
1233         return 0;
1234     }
1235
1236     if(!RRDCALC_HAS_DB_LOOKUP(rc) && !rc->warning && !rc->critical) {
1237         error("Health configuration for alarm '%s.%s' is useless (no calculation, no warning and no critical evaluation)", rc->chart?rc->chart:"NOCHART", rc->name);
1238         return 0;
1239     }
1240
1241     if (rrdcalc_exists(host, rc->chart, rc->name, rc->hash_chart, rc->hash))
1242         return 0;
1243
1244     rc->id = rrdcalc_get_unique_id(&localhost, rc->chart, rc->name, &rc->next_event_id);
1245
1246     debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1247           rc->chart?rc->chart:"NOCHART",
1248           rc->name,
1249           rc->id,
1250           (rc->exec)?rc->exec:"DEFAULT",
1251           (rc->recipient)?rc->recipient:"DEFAULT",
1252           rc->green,
1253           rc->red,
1254           rc->group,
1255           rc->after,
1256           rc->before,
1257           rc->options,
1258           (rc->dimensions)?rc->dimensions:"NONE",
1259           rc->update_every,
1260           (rc->calculation)?rc->calculation->parsed_as:"NONE",
1261           (rc->warning)?rc->warning->parsed_as:"NONE",
1262           (rc->critical)?rc->critical->parsed_as:"NONE",
1263           rc->source,
1264           rc->delay_up_duration,
1265           rc->delay_down_duration,
1266           rc->delay_max_duration,
1267           rc->delay_multiplier
1268     );
1269
1270     rrdcalc_create_part2(host, rc);
1271     return 1;
1272 }
1273
1274 static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCALCTEMPLATE *rt) {
1275     if(unlikely(!rt->context)) {
1276         error("Health configuration for template '%s' does not have a context", rt->name);
1277         return 0;
1278     }
1279
1280     if(unlikely(!rt->update_every)) {
1281         error("Health configuration for template '%s' has no frequency (parameter 'every'). Ignoring it.", rt->name);
1282         return 0;
1283     }
1284
1285     if(unlikely(!RRDCALCTEMPLATE_HAS_CALCULATION(rt) && !rt->warning && !rt->critical)) {
1286         error("Health configuration for template '%s' is useless (no calculation, no warning and no critical evaluation)", rt->name);
1287         return 0;
1288     }
1289
1290     RRDCALCTEMPLATE *t, *last = NULL;
1291     for (t = host->templates; t ; last = t, t = t->next) {
1292         if(unlikely(t->hash_name == rt->hash_name && !strcmp(t->name, rt->name))) {
1293             error("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname);
1294             return 0;
1295         }
1296     }
1297
1298     debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1299           rt->name,
1300           (rt->context)?rt->context:"NONE",
1301           (rt->exec)?rt->exec:"DEFAULT",
1302           (rt->recipient)?rt->recipient:"DEFAULT",
1303           rt->green,
1304           rt->red,
1305           rt->group,
1306           rt->after,
1307           rt->before,
1308           rt->options,
1309           (rt->dimensions)?rt->dimensions:"NONE",
1310           rt->update_every,
1311           (rt->calculation)?rt->calculation->parsed_as:"NONE",
1312           (rt->warning)?rt->warning->parsed_as:"NONE",
1313           (rt->critical)?rt->critical->parsed_as:"NONE",
1314           rt->source,
1315           rt->delay_up_duration,
1316           rt->delay_down_duration,
1317           rt->delay_max_duration,
1318           rt->delay_multiplier
1319     );
1320
1321     if(likely(last)) {
1322         last->next = rt;
1323     }
1324     else {
1325         rt->next = host->templates;
1326         host->templates = rt;
1327     }
1328
1329     return 1;
1330 }
1331
// Parse a duration like "10", "-5m", "1.5h" or "2d" and store it, in seconds,
// at *result.
//
// The string has to start with a digit, '+' or '-'. The number may be followed
// by a unit: Y (365 days), M (30 days), w (weeks), d (days), h (hours),
// m (minutes) or s (seconds). Only the first character after the number is
// examined and anything that is not a known unit is treated as seconds.
//
// Returns 1 on success. Returns 0, with *result set to zero, when the string
// does not hold a number (this includes a sign without digits, like "+" or
// "-abc") or when the number is infinite or not-a-number.
static inline int health_parse_duration(char *string, int *result) {
    *result = 0;

    // make sure it looks like a number
    // (the <ctype.h> functions need the character as unsigned char)
    unsigned char first = (unsigned char)*string;
    if(!first || !(isdigit(first) || first == '+' || first == '-'))
        return 0;

    char *e = NULL;
    long double n = strtold(string, &e);

    // strtold() leaves the end pointer at the start when nothing was parsed
    if(!e || e == string)
        return 0;

    // "+inf" and "-nan" are numbers for strtold(), but they are not durations
    // and they cannot be converted to int
    if(isnan(n) || isinf(n))
        return 0;

    long double seconds;
    switch(*e) {
        case 'Y':
            seconds = n * 86400 * 365;
            break;

        case 'M':
            seconds = n * 86400 * 30;
            break;

        case 'w':
            seconds = n * 86400 * 7;
            break;

        case 'd':
            seconds = n * 86400;
            break;

        case 'h':
            seconds = n * 3600;
            break;

        case 'm':
            seconds = n * 60;
            break;

        // no unit, 's', or anything unknown: the number is in seconds
        default:
        case 's':
            seconds = n;
            break;
    }

    *result = (int) seconds;
    return 1;
}
1373
// Parse the value of a 'delay' configuration line.
//
// The string is a list of whitespace separated 'keyword value' pairs, with
// keywords 'up', 'down', 'max' (durations) and 'multiplier' (a positive
// number). The string is modified in place (whitespace is replaced with
// string terminators). Invalid values and unknown keywords are logged and
// ignored.
//
// Durations not given are set to zero and the multiplier defaults to 1.0.
// When 'max' is not given, *delay_max_duration is raised to be at least
// the up and the down durations multiplied by the multiplier.
//
// Always returns 1.
static inline int health_parse_delay(
        size_t line, const char *path, const char *file, char *string,
        int *delay_up_duration,
        int *delay_down_duration,
        int *delay_max_duration,
        float *delay_multiplier) {

    int seen_up = 0, seen_down = 0, seen_max = 0, seen_multiplier = 0;

    char *cursor = string;
    while(*cursor) {
        // isolate the keyword
        char *key = cursor;
        while(*cursor && !isspace(*cursor)) cursor++;
        while(*cursor && isspace(*cursor)) *cursor++ = '\0';

        if(!*key) break;

        // isolate its value
        char *value = cursor;
        while(*cursor && !isspace(*cursor)) cursor++;
        while(*cursor && isspace(*cursor)) *cursor++ = '\0';

        // the keywords that accept a duration
        int *duration = NULL, *seen = NULL;
        if(strcasecmp(key, "up") == 0) {
            duration = delay_up_duration;
            seen = &seen_up;
        }
        else if(strcasecmp(key, "down") == 0) {
            duration = delay_down_duration;
            seen = &seen_down;
        }
        else if(strcasecmp(key, "max") == 0) {
            duration = delay_max_duration;
            seen = &seen_max;
        }

        if(duration) {
            if(health_parse_duration(value, duration))
                *seen = 1;
            else
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
        }
        else if(strcasecmp(key, "multiplier") == 0) {
            *delay_multiplier = strtof(value, NULL);

            if(isnan(*delay_multiplier) || isinf(*delay_multiplier) || islessequal(*delay_multiplier, 0))
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            else
                seen_multiplier = 1;
        }
        else
            error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
                  line, path, file, key);
    }

    // defaults for everything not given (or given with an invalid value)
    if(!seen_up)
        *delay_up_duration = 0;

    if(!seen_down)
        *delay_down_duration = 0;

    if(!seen_multiplier)
        *delay_multiplier = 1.0;

    if(!seen_max) {
        // the maximum cannot be lower than a single multiplied delay
        if((*delay_max_duration) < (*delay_up_duration) * (*delay_multiplier))
            *delay_max_duration = (*delay_up_duration) * (*delay_multiplier);

        if((*delay_max_duration) < (*delay_down_duration) * (*delay_multiplier))
            *delay_max_duration = (*delay_down_duration) * (*delay_multiplier);
    }

    return 1;
}
1453
1454 static inline int health_parse_db_lookup(
1455         size_t line, const char *path, const char *file, char *string,
1456         int *group_method, int *after, int *before, int *every,
1457         uint32_t *options, char **dimensions
1458 ) {
1459     debug(D_HEALTH, "Health configuration parsing database lookup %zu@%s/%s: %s", line, path, file, string);
1460
1461     if(*dimensions) freez(*dimensions);
1462     *dimensions = NULL;
1463     *after = 0;
1464     *before = 0;
1465     *every = 0;
1466     *options = 0;
1467
1468     char *s = string, *key;
1469
1470     // first is the group method
1471     key = s;
1472     while(*s && !isspace(*s)) s++;
1473     while(*s && isspace(*s)) *s++ = '\0';
1474     if(!*s) {
1475         error("Health configuration invalid chart calculation at line %zu of file '%s/%s': expected group method followed by the 'after' time, but got '%s'",
1476               line, path, file, key);
1477         return 0;
1478     }
1479
1480     if((*group_method = web_client_api_request_v1_data_group(key, -1)) == -1) {
1481         error("Health configuration at line %zu of file '%s/%s': invalid group method '%s'",
1482               line, path, file, key);
1483         return 0;
1484     }
1485
1486     // then is the 'after' time
1487     key = s;
1488     while(*s && !isspace(*s)) s++;
1489     while(*s && isspace(*s)) *s++ = '\0';
1490
1491     if(!health_parse_duration(key, after)) {
1492         error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' after group method",
1493               line, path, file, key);
1494         return 0;
1495     }
1496
1497     // sane defaults
1498     *every = abs(*after);
1499
1500     // now we may have optional parameters
1501     while(*s) {
1502         key = s;
1503         while(*s && !isspace(*s)) s++;
1504         while(*s && isspace(*s)) *s++ = '\0';
1505         if(!*key) break;
1506
1507         if(!strcasecmp(key, "at")) {
1508             char *value = s;
1509             while(*s && !isspace(*s)) s++;
1510             while(*s && isspace(*s)) *s++ = '\0';
1511
1512             if (!health_parse_duration(value, before)) {
1513                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1514                       line, path, file, value, key);
1515             }
1516         }
1517         else if(!strcasecmp(key, HEALTH_EVERY_KEY)) {
1518             char *value = s;
1519             while(*s && !isspace(*s)) s++;
1520             while(*s && isspace(*s)) *s++ = '\0';
1521
1522             if (!health_parse_duration(value, every)) {
1523                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1524                       line, path, file, value, key);
1525             }
1526         }
1527         else if(!strcasecmp(key, "absolute") || !strcasecmp(key, "abs") || !strcasecmp(key, "absolute_sum")) {
1528             *options |= RRDR_OPTION_ABSOLUTE;
1529         }
1530         else if(!strcasecmp(key, "min2max")) {
1531             *options |= RRDR_OPTION_MIN2MAX;
1532         }
1533         else if(!strcasecmp(key, "null2zero")) {
1534             *options |= RRDR_OPTION_NULL2ZERO;
1535         }
1536         else if(!strcasecmp(key, "percentage")) {
1537             *options |= RRDR_OPTION_PERCENTAGE;
1538         }
1539         else if(!strcasecmp(key, "unaligned")) {
1540             *options |= RRDR_OPTION_NOT_ALIGNED;
1541         }
1542         else if(!strcasecmp(key, "of")) {
1543             if(*s && strcasecmp(s, "all"))
1544                *dimensions = strdupz(s);
1545             break;
1546         }
1547         else {
1548             error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
1549                   line, path, file, key);
1550         }
1551     }
1552
1553     return 1;
1554 }
1555
// Replace, in place, every tab of the string s with a space.
// Returns s.
static inline char *tabs2spaces(char *s) {
    for(char *p = s; *p ; p++) {
        if(unlikely(*p == '\t'))
            *p = ' ';
    }

    return s;
}
1565
// Build the string 'LINE@PATH/FILENAME', identifying a line of a
// configuration file.
// Returns a newly allocated copy of it (allocated with strdupz()).
static inline char *health_source_file(size_t line, const char *path, const char *filename) {
    char source[FILENAME_MAX + 1];

    snprintfz(source, FILENAME_MAX, "%zu@%s/%s", line, path, filename);

    return strdupz(source);
}
1571
// Replace, in place, every single and double quote of the string s
// with a space.
static inline void strip_quotes(char *s) {
    for( ; *s ; s++) {
        switch(*s) {
            case '\'':
            case '"':
                *s = ' ';
                break;

            default:
                break;
        }
    }
}
1578
1579 int health_readfile(const char *path, const char *filename) {
1580     debug(D_HEALTH, "Health configuration reading file '%s/%s'", path, filename);
1581
1582     static uint32_t hash_alarm = 0, hash_template = 0, hash_on = 0, hash_calc = 0, hash_green = 0, hash_red = 0, hash_warn = 0, hash_crit = 0, hash_exec = 0, hash_every = 0, hash_lookup = 0, hash_units = 0, hash_info = 0, hash_recipient = 0, hash_delay = 0;
1583     char buffer[HEALTH_CONF_MAX_LINE + 1];
1584
1585     if(unlikely(!hash_alarm)) {
1586         hash_alarm = simple_uhash(HEALTH_ALARM_KEY);
1587         hash_template = simple_uhash(HEALTH_TEMPLATE_KEY);
1588         hash_on = simple_uhash(HEALTH_ON_KEY);
1589         hash_calc = simple_uhash(HEALTH_CALC_KEY);
1590         hash_lookup = simple_uhash(HEALTH_LOOKUP_KEY);
1591         hash_green = simple_uhash(HEALTH_GREEN_KEY);
1592         hash_red = simple_uhash(HEALTH_RED_KEY);
1593         hash_warn = simple_uhash(HEALTH_WARN_KEY);
1594         hash_crit = simple_uhash(HEALTH_CRIT_KEY);
1595         hash_exec = simple_uhash(HEALTH_EXEC_KEY);
1596         hash_every = simple_uhash(HEALTH_EVERY_KEY);
1597         hash_units = simple_hash(HEALTH_UNITS_KEY);
1598         hash_info = simple_hash(HEALTH_INFO_KEY);
1599         hash_recipient = simple_hash(HEALTH_RECIPIENT_KEY);
1600         hash_delay = simple_uhash(HEALTH_DELAY_KEY);
1601     }
1602
1603     snprintfz(buffer, HEALTH_CONF_MAX_LINE, "%s/%s", path, filename);
1604     FILE *fp = fopen(buffer, "r");
1605     if(!fp) {
1606         error("Health configuration cannot read file '%s'.", buffer);
1607         return 0;
1608     }
1609
1610     RRDCALC *rc = NULL;
1611     RRDCALCTEMPLATE *rt = NULL;
1612
1613     size_t line = 0, append = 0;
1614     char *s;
1615     while((s = fgets(&buffer[append], (int)(HEALTH_CONF_MAX_LINE - append), fp)) || append) {
1616         int stop_appending = !s;
1617         line++;
1618         // info("Line %zu of file '%s/%s': '%s'", line, path, filename, s);
1619         s = trim(buffer);
1620         if(!s) continue;
1621         // info("Trimmed line %zu of file '%s/%s': '%s'", line, path, filename, s);
1622
1623         append = strlen(s);
1624         if(!stop_appending && s[append - 1] == '\\') {
1625             s[append - 1] = ' ';
1626             append = &s[append] - buffer;
1627             if(append < HEALTH_CONF_MAX_LINE)
1628                 continue;
1629             else {
1630                 error("Health configuration has too long muli-line at line %zu of file '%s/%s'.", line, path, filename);
1631             }
1632         }
1633         append = 0;
1634
1635         char *key = s;
1636         while(*s && *s != ':') s++;
1637         if(!*s) {
1638             error("Health configuration has invalid line %zu of file '%s/%s'. It does not contain a ':'. Ignoring it.", line, path, filename);
1639             continue;
1640         }
1641         *s = '\0';
1642         s++;
1643
1644         char *value = s;
1645         key = trim(key);
1646         value = trim(value);
1647
1648         if(!key) {
1649             error("Health configuration has invalid line %zu of file '%s/%s'. Keyword is empty. Ignoring it.", line, path, filename);
1650             continue;
1651         }
1652
1653         if(!value) {
1654             error("Health configuration has invalid line %zu of file '%s/%s'. value is empty. Ignoring it.", line, path, filename);
1655             continue;
1656         }
1657
1658         // info("Health file '%s/%s', key '%s', value '%s'", path, filename, key, value);
1659         uint32_t hash = simple_uhash(key);
1660
1661         if(hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) {
1662             if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
1663                 rrdcalc_free(&localhost, rc);
1664
1665             if(rt) {
1666                 if (!rrdcalctemplate_add_template_from_config(&localhost, rt))
1667                     rrdcalctemplate_free(&localhost, rt);
1668                 rt = NULL;
1669             }
1670
1671             rc = callocz(1, sizeof(RRDCALC));
1672             rc->next_event_id = 1;
1673             rc->name = tabs2spaces(strdupz(value));
1674             rc->hash = simple_hash(rc->name);
1675             rc->source = health_source_file(line, path, filename);
1676             rc->green = NAN;
1677             rc->red = NAN;
1678             rc->value = NAN;
1679             rc->old_value = NAN;
1680             rc->delay_multiplier = 1.0;
1681
1682             if(rrdvar_fix_name(rc->name))
1683                 error("Health configuration renamed alarm '%s' to '%s'", value, rc->name);
1684         }
1685         else if(hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY)) {
1686             if(rc) {
1687                 if(!rrdcalc_add_alarm_from_config(&localhost, rc))
1688                     rrdcalc_free(&localhost, rc);
1689                 rc = NULL;
1690             }
1691
1692             if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
1693                 rrdcalctemplate_free(&localhost, rt);
1694
1695             rt = callocz(1, sizeof(RRDCALCTEMPLATE));
1696             rt->name = tabs2spaces(strdupz(value));
1697             rt->hash_name = simple_hash(rt->name);
1698             rt->source = health_source_file(line, path, filename);
1699             rt->green = NAN;
1700             rt->red = NAN;
1701             rt->delay_multiplier = 1.0;
1702
1703             if(rrdvar_fix_name(rt->name))
1704                 error("Health configuration renamed template '%s' to '%s'", value, rt->name);
1705         }
1706         else if(rc) {
1707             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
1708                 if(rc->chart) {
1709                     if(strcmp(rc->chart, value))
1710                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1711                              line, path, filename, rc->name, key, rc->chart, value, value);
1712
1713                     freez(rc->chart);
1714                 }
1715                 rc->chart = tabs2spaces(strdupz(value));
1716                 rc->hash_chart = simple_hash(rc->chart);
1717             }
1718             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
1719                 health_parse_db_lookup(line, path, filename, value, &rc->group, &rc->after, &rc->before,
1720                                        &rc->update_every,
1721                                        &rc->options, &rc->dimensions);
1722             }
1723             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
1724                 if(!health_parse_duration(value, &rc->update_every))
1725                     info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.",
1726                          line, path, filename, rc->name, key, value);
1727             }
1728             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
1729                 char *e;
1730                 rc->green = strtold(value, &e);
1731                 if(e && *e) {
1732                     info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1733                          line, path, filename, rc->name, key, e);
1734                 }
1735             }
1736             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
1737                 char *e;
1738                 rc->red = strtold(value, &e);
1739                 if(e && *e) {
1740                     info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1741                          line, path, filename, rc->name, key, e);
1742                 }
1743             }
1744             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
1745                 const char *failed_at = NULL;
1746                 int error = 0;
1747                 rc->calculation = expression_parse(value, &failed_at, &error);
1748                 if(!rc->calculation) {
1749                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1750                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1751                 }
1752             }
1753             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
1754                 const char *failed_at = NULL;
1755                 int error = 0;
1756                 rc->warning = expression_parse(value, &failed_at, &error);
1757                 if(!rc->warning) {
1758                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1759                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1760                 }
1761             }
1762             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
1763                 const char *failed_at = NULL;
1764                 int error = 0;
1765                 rc->critical = expression_parse(value, &failed_at, &error);
1766                 if(!rc->critical) {
1767                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1768                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1769                 }
1770             }
1771             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
1772                 if(rc->exec) {
1773                     if(strcmp(rc->exec, value))
1774                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1775                              line, path, filename, rc->name, key, rc->exec, value, value);
1776
1777                     freez(rc->exec);
1778                 }
1779                 rc->exec = tabs2spaces(strdupz(value));
1780             }
1781             else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
1782                 if(rc->recipient) {
1783                     if(strcmp(rc->recipient, value))
1784                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1785                              line, path, filename, rc->name, key, rc->recipient, value, value);
1786
1787                     freez(rc->recipient);
1788                 }
1789                 rc->recipient = tabs2spaces(strdupz(value));
1790             }
1791             else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
1792                 if(rc->units) {
1793                     if(strcmp(rc->units, value))
1794                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1795                              line, path, filename, rc->name, key, rc->units, value, value);
1796
1797                     freez(rc->units);
1798                 }
1799                 rc->units = tabs2spaces(strdupz(value));
1800                 strip_quotes(rc->units);
1801             }
1802             else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
1803                 if(rc->info) {
1804                     if(strcmp(rc->info, value))
1805                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1806                              line, path, filename, rc->name, key, rc->info, value, value);
1807
1808                     freez(rc->info);
1809                 }
1810                 rc->info = tabs2spaces(strdupz(value));
1811                 strip_quotes(rc->info);
1812             }
1813             else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
1814                 health_parse_delay(line, path, filename, value, &rc->delay_up_duration, &rc->delay_down_duration, &rc->delay_max_duration, &rc->delay_multiplier);
1815             }
1816             else {
1817                 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has unknown key '%s'.",
1818                      line, path, filename, rc->name, key);
1819             }
1820         }
1821         else if(rt) {
1822             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
1823                 if(rt->context) {
1824                     if(strcmp(rt->context, value))
1825                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1826                              line, path, filename, rt->name, key, rt->context, value, value);
1827
1828                     freez(rt->context);
1829                 }
1830                 rt->context = tabs2spaces(strdupz(value));
1831                 rt->hash_context = simple_hash(rt->context);
1832             }
1833             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
1834                 health_parse_db_lookup(line, path, filename, value, &rt->group, &rt->after, &rt->before,
1835                                        &rt->update_every,
1836                                        &rt->options, &rt->dimensions);
1837             }
1838             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
1839                 if(!health_parse_duration(value, &rt->update_every))
1840                     info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' cannot parse duration: '%s'.",
1841                          line, path, filename, rt->name, key, value);
1842             }
1843             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
1844                 char *e;
1845                 rt->green = strtold(value, &e);
1846                 if(e && *e) {
1847                     info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
1848                          line, path, filename, rt->name, key, e);
1849                 }
1850             }
1851             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
1852                 char *e;
1853                 rt->red = strtold(value, &e);
1854                 if(e && *e) {
1855                     info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
1856                          line, path, filename, rt->name, key, e);
1857                 }
1858             }
1859             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
1860                 const char *failed_at = NULL;
1861                 int error = 0;
1862                 rt->calculation = expression_parse(value, &failed_at, &error);
1863                 if(!rt->calculation) {
1864                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1865                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1866                 }
1867             }
1868             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
1869                 const char *failed_at = NULL;
1870                 int error = 0;
1871                 rt->warning = expression_parse(value, &failed_at, &error);
1872                 if(!rt->warning) {
1873                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1874                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1875                 }
1876             }
1877             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
1878                 const char *failed_at = NULL;
1879                 int error = 0;
1880                 rt->critical = expression_parse(value, &failed_at, &error);
1881                 if(!rt->critical) {
1882                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1883                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1884                 }
1885             }
1886             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
1887                 if(rt->exec) {
1888                     if(strcmp(rt->exec, value))
1889                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1890                              line, path, filename, rt->name, key, rt->exec, value, value);
1891
1892                     freez(rt->exec);
1893                 }
1894                 rt->exec = tabs2spaces(strdupz(value));
1895             }
1896             else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
1897                 if(rt->recipient) {
1898                     if(strcmp(rt->recipient, value))
1899                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1900                              line, path, filename, rt->name, key, rt->recipient, value, value);
1901
1902                     freez(rt->recipient);
1903                 }
1904                 rt->recipient = tabs2spaces(strdupz(value));
1905             }
1906             else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
1907                 if(rt->units) {
1908                     if(strcmp(rt->units, value))
1909                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1910                              line, path, filename, rt->name, key, rt->units, value, value);
1911
1912                     freez(rt->units);
1913                 }
1914                 rt->units = tabs2spaces(strdupz(value));
1915                 strip_quotes(rt->units);
1916             }
1917             else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
1918                 if(rt->info) {
1919                     if(strcmp(rt->info, value))
1920                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1921                              line, path, filename, rt->name, key, rt->info, value, value);
1922
1923                     freez(rt->info);
1924                 }
1925                 rt->info = tabs2spaces(strdupz(value));
1926                 strip_quotes(rt->info);
1927             }
1928             else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
1929                 health_parse_delay(line, path, filename, value, &rt->delay_up_duration, &rt->delay_down_duration, &rt->delay_max_duration, &rt->delay_multiplier);
1930             }
1931             else {
1932                 error("Health configuration at line %zu of file '%s/%s' for template '%s' has unknown key '%s'.",
1933                       line, path, filename, rt->name, key);
1934             }
1935         }
1936         else {
1937             error("Health configuration at line %zu of file '%s/%s' has unknown key '%s'. Expected either '" HEALTH_ALARM_KEY "' or '" HEALTH_TEMPLATE_KEY "'.",
1938                   line, path, filename, key);
1939         }
1940     }
1941
1942     if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
1943         rrdcalc_free(&localhost, rc);
1944
1945     if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
1946         rrdcalctemplate_free(&localhost, rt);
1947
1948     fclose(fp);
1949     return 1;
1950 }
1951
1952 void health_readdir(const char *path) {
1953     size_t pathlen = strlen(path);
1954
1955     debug(D_HEALTH, "Health configuration reading directory '%s'", path);
1956
1957     DIR *dir = opendir(path);
1958     if (!dir) {
1959         error("Health configuration cannot open directory '%s'.", path);
1960         return;
1961     }
1962
1963     struct dirent *de = NULL;
1964     while ((de = readdir(dir))) {
1965         size_t len = strlen(de->d_name);
1966
1967         if(de->d_type == DT_DIR
1968            && (
1969                    (de->d_name[0] == '.' && de->d_name[1] == '\0')
1970                    || (de->d_name[0] == '.' && de->d_name[1] == '.' && de->d_name[2] == '\0')
1971            )) {
1972             debug(D_HEALTH, "Ignoring directory '%s'", de->d_name);
1973             continue;
1974         }
1975
1976         else if(de->d_type == DT_DIR) {
1977             char *s = mallocz(pathlen + strlen(de->d_name) + 2);
1978             strcpy(s, path);
1979             strcat(s, "/");
1980             strcat(s, de->d_name);
1981             health_readdir(s);
1982             freez(s);
1983             continue;
1984         }
1985
1986         else if((de->d_type == DT_LNK || de->d_type == DT_REG || de->d_type == DT_UNKNOWN) &&
1987                 len > 5 && !strcmp(&de->d_name[len - 5], ".conf")) {
1988             health_readfile(path, de->d_name);
1989         }
1990
1991         else debug(D_HEALTH, "Ignoring file '%s'", de->d_name);
1992     }
1993
1994     closedir(dir);
1995 }
1996
1997 static inline char *health_config_dir(void) {
1998     char buffer[FILENAME_MAX + 1];
1999     snprintfz(buffer, FILENAME_MAX, "%s/health.d", config_get("global", "config directory", CONFIG_DIR));
2000     return config_get("health", "health configuration directory", buffer);
2001 }
2002
2003 void health_init(void) {
2004     debug(D_HEALTH, "Health configuration initializing");
2005
2006     if(!(health_enabled = config_get_boolean("health", "enabled", 1))) {
2007         debug(D_HEALTH, "Health is disabled.");
2008         return;
2009     }
2010
2011     char *pathname = config_get("health", "health db directory", VARLIB_DIR "/health");
2012     if(mkdir(pathname, 0770) == -1 && errno != EEXIST)
2013         fatal("Cannot create directory '%s'.", pathname);
2014
2015     char filename[FILENAME_MAX + 1];
2016     snprintfz(filename, FILENAME_MAX, "%s/health-log.db", pathname);
2017     health.log_filename = config_get("health", "health db file", filename);
2018
2019     health_alarm_log_load(&localhost);
2020     health_alarm_log_open();
2021
2022     char *path = health_config_dir();
2023
2024     {
2025         char buffer[FILENAME_MAX + 1];
2026         snprintfz(buffer, FILENAME_MAX, "%s/alarm-notify.sh", config_get("global", "plugins directory", PLUGINS_DIR));
2027         health.health_default_exec = config_get("health", "script to execute on alarm", buffer);
2028     }
2029
2030     long n = config_get_number("health", "in memory max health log entries", (long)localhost.health_log.max);
2031     if(n < 10) {
2032         error("Health configuration has invalid max log entries %ld. Using default %u", n, localhost.health_log.max);
2033         config_set_number("health", "in memory max health log entries", (long)localhost.health_log.max);
2034     }
2035     else localhost.health_log.max = (unsigned int)n;
2036
2037     rrdhost_rwlock(&localhost);
2038     health_readdir(path);
2039     rrdhost_unlock(&localhost);
2040 }
2041
2042 // ----------------------------------------------------------------------------
2043 // JSON generation
2044
2045 static inline void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) {
2046     if(value && *value)
2047         buffer_sprintf(wb, "%s\"%s\":\"%s\"%s", prefix, label, value, suffix);
2048     else
2049         buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix);
2050 }
2051
// Append the alarm log entry 'ae' to 'wb' as one JSON object.
//
// wb   - the buffer the JSON text is appended to
// ae   - the alarm log entry to serialize
// host - only its hostname is used, for the "hostname" member
//
// No lock is taken here; health_alarm_log2json() calls this while holding
// the read lock of the alarm log.
//
// String members are inserted with plain %s, so no JSON escaping is applied
// to them. A missing exec / recipient falls back to the health defaults and
// a missing units / info is written as an empty string.
static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) {
    // the order of the arguments below must follow the order of the
    // conversions in the format string
    buffer_sprintf(wb, "\n\t{\n"
                           "\t\t\"hostname\": \"%s\",\n"
                           "\t\t\"unique_id\": %u,\n"
                           "\t\t\"alarm_id\": %u,\n"
                           "\t\t\"alarm_event_id\": %u,\n"
                           "\t\t\"name\": \"%s\",\n"
                           "\t\t\"chart\": \"%s\",\n"
                           "\t\t\"family\": \"%s\",\n"
                           "\t\t\"processed\": %s,\n"
                           "\t\t\"updated\": %s,\n"
                           "\t\t\"exec_run\": %lu,\n"
                           "\t\t\"exec_failed\": %s,\n"
                           "\t\t\"exec\": \"%s\",\n"
                           "\t\t\"recipient\": \"%s\",\n"
                           "\t\t\"exec_code\": %d,\n"
                           "\t\t\"source\": \"%s\",\n"
                           "\t\t\"units\": \"%s\",\n"
                           "\t\t\"info\": \"%s\",\n"
                           "\t\t\"when\": %lu,\n"
                           "\t\t\"duration\": %lu,\n"
                           "\t\t\"non_clear_duration\": %lu,\n"
                           "\t\t\"status\": \"%s\",\n"
                           "\t\t\"old_status\": \"%s\",\n"
                           "\t\t\"delay\": %d,\n"
                           "\t\t\"delay_up_to_timestamp\": %lu,\n"
                           "\t\t\"updated_by_id\": %u,\n"
                           "\t\t\"updates_id\": %u,\n",
                   host->hostname,
                   ae->unique_id,
                   ae->alarm_id,
                   ae->alarm_event_id,
                   ae->name,
                   ae->chart,
                   ae->family,
                   (ae->flags & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false",
                   (ae->flags & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false",
                   (unsigned long)ae->exec_run_timestamp,
                   (ae->flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false",
                   ae->exec?ae->exec:health.health_default_exec,
                   ae->recipient?ae->recipient:health.health_default_recipient,
                   ae->exec_code,
                   ae->source,
                   ae->units?ae->units:"",
                   ae->info?ae->info:"",
                   (unsigned long)ae->when,
                   (unsigned long)ae->duration,
                   (unsigned long)ae->non_clear_duration,
                   rrdcalc_status2string(ae->new_status),
                   rrdcalc_status2string(ae->old_status),
                   ae->delay,
                   (unsigned long)ae->delay_up_to_timestamp,
                   ae->updated_by_id,
                   ae->updates_id
    );

    // the two values are formatted by buffer_rrd_value()
    buffer_strcat(wb, "\t\t\"value\":");
    buffer_rrd_value(wb, ae->new_value);
    buffer_strcat(wb, ",\n");

    // last member of the object: no trailing comma
    buffer_strcat(wb, "\t\t\"old_value\":");
    buffer_rrd_value(wb, ae->old_value);
    buffer_strcat(wb, "\n");

    buffer_strcat(wb, "\t}");
}
2118
2119 void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after) {
2120     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
2121
2122     buffer_strcat(wb, "[");
2123
2124     unsigned int max = host->health_log.max;
2125     unsigned int count = 0;
2126     ALARM_ENTRY *ae;
2127     for(ae = host->health_log.alarms; ae && count < max ; count++, ae = ae->next) {
2128         if(ae->unique_id > after) {
2129             if(likely(count)) buffer_strcat(wb, ",");
2130             health_alarm_entry2json_nolock(wb, ae, host);
2131         }
2132     }
2133
2134     buffer_strcat(wb, "\n]\n");
2135
2136     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2137 }
2138
2139 static inline void health_rrdcalc2json_nolock(BUFFER *wb, RRDCALC *rc) {
2140     buffer_sprintf(wb,
2141            "\t\t\"%s.%s\": {\n"
2142                    "\t\t\t\"id\": %lu,\n"
2143                    "\t\t\t\"name\": \"%s\",\n"
2144                    "\t\t\t\"chart\": \"%s\",\n"
2145                    "\t\t\t\"family\": \"%s\",\n"
2146                    "\t\t\t\"active\": %s,\n"
2147                    "\t\t\t\"exec\": \"%s\",\n"
2148                    "\t\t\t\"recipient\": \"%s\",\n"
2149                    "\t\t\t\"source\": \"%s\",\n"
2150                    "\t\t\t\"units\": \"%s\",\n"
2151                    "\t\t\t\"info\": \"%s\",\n"
2152                                    "\t\t\t\"status\": \"%s\",\n"
2153                    "\t\t\t\"last_status_change\": %lu,\n"
2154                    "\t\t\t\"last_updated\": %lu,\n"
2155                    "\t\t\t\"next_update\": %lu,\n"
2156                    "\t\t\t\"update_every\": %d,\n"
2157                    "\t\t\t\"delay_up_duration\": %d,\n"
2158                    "\t\t\t\"delay_down_duration\": %d,\n"
2159                    "\t\t\t\"delay_max_duration\": %d,\n"
2160                    "\t\t\t\"delay_multiplier\": %f,\n"
2161                    "\t\t\t\"delay\": %d,\n"
2162                    "\t\t\t\"delay_up_to_timestamp\": %lu,\n"
2163             , rc->chart, rc->name
2164             , (unsigned long)rc->id
2165             , rc->name
2166             , rc->chart
2167             , (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:""
2168             , (rc->rrdset)?"true":"false"
2169             , rc->exec?rc->exec:health.health_default_exec
2170             , rc->recipient?rc->recipient:health.health_default_recipient
2171             , rc->source
2172             , rc->units?rc->units:""
2173             , rc->info?rc->info:""
2174             , rrdcalc_status2string(rc->status)
2175             , (unsigned long)rc->last_status_change
2176             , (unsigned long)rc->last_updated
2177             , (unsigned long)rc->next_update
2178             , rc->update_every
2179             , rc->delay_up_duration
2180             , rc->delay_down_duration
2181             , rc->delay_max_duration
2182             , rc->delay_multiplier
2183             , rc->delay_last
2184             , (unsigned long)rc->delay_up_to_timestamp
2185     );
2186
2187     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
2188         if(rc->dimensions && *rc->dimensions)
2189             health_string2json(wb, "\t\t\t", "lookup_dimensions", rc->dimensions, ",\n");
2190
2191         buffer_sprintf(wb,
2192                        "\t\t\t\"db_after\": %lu,\n"
2193                        "\t\t\t\"db_before\": %lu,\n"
2194                        "\t\t\t\"lookup_method\": \"%s\",\n"
2195                        "\t\t\t\"lookup_after\": %d,\n"
2196                        "\t\t\t\"lookup_before\": %d,\n"
2197                        "\t\t\t\"lookup_options\": \"",
2198                        (unsigned long) rc->db_after,
2199                        (unsigned long) rc->db_before,
2200                        group_method2string(rc->group),
2201                        rc->after,
2202                        rc->before
2203         );
2204         buffer_data_options2string(wb, rc->options);
2205         buffer_strcat(wb, "\",\n");
2206     }
2207
2208     if(rc->calculation) {
2209         health_string2json(wb, "\t\t\t", "calc", rc->calculation->source, ",\n");
2210         health_string2json(wb, "\t\t\t", "calc_parsed", rc->calculation->parsed_as, ",\n");
2211     }
2212
2213     if(rc->warning) {
2214         health_string2json(wb, "\t\t\t", "warn", rc->warning->source, ",\n");
2215         health_string2json(wb, "\t\t\t", "warn_parsed", rc->warning->parsed_as, ",\n");
2216     }
2217
2218     if(rc->critical) {
2219         health_string2json(wb, "\t\t\t", "crit", rc->critical->source, ",\n");
2220         health_string2json(wb, "\t\t\t", "crit_parsed", rc->critical->parsed_as, ",\n");
2221     }
2222
2223     buffer_strcat(wb, "\t\t\t\"green\":");
2224     buffer_rrd_value(wb, rc->green);
2225     buffer_strcat(wb, ",\n");
2226
2227     buffer_strcat(wb, "\t\t\t\"red\":");
2228     buffer_rrd_value(wb, rc->red);
2229     buffer_strcat(wb, ",\n");
2230
2231     buffer_strcat(wb, "\t\t\t\"value\":");
2232     buffer_rrd_value(wb, rc->value);
2233     buffer_strcat(wb, "\n");
2234
2235     buffer_strcat(wb, "\t\t}");
2236 }
2237
2238 //void health_rrdcalctemplate2json_nolock(BUFFER *wb, RRDCALCTEMPLATE *rt) {
2239 //
2240 //}
2241
2242 void health_alarms2json(RRDHOST *host, BUFFER *wb, int all) {
2243     int i;
2244
2245     rrdhost_rdlock(&localhost);
2246     buffer_sprintf(wb, "{\n\t\"hostname\": \"%s\","
2247                         "\n\t\"latest_alarm_log_unique_id\": %u,"
2248                         "\n\t\"status\": %s,"
2249                         "\n\t\"now\": %lu,"
2250                         "\n\t\"alarms\": {\n",
2251                         host->hostname,
2252                         (host->health_log.next_log_id > 0)?(host->health_log.next_log_id - 1):0,
2253                         health_enabled?"true":"false",
2254                         (unsigned long)time(NULL));
2255
2256     RRDCALC *rc;
2257     for(i = 0, rc = host->alarms; rc ; rc = rc->next) {
2258         if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
2259             continue;
2260
2261         if(likely(!all && !(rc->status == RRDCALC_STATUS_WARNING || rc->status == RRDCALC_STATUS_CRITICAL)))
2262             continue;
2263
2264         if(likely(i)) buffer_strcat(wb, ",\n");
2265         health_rrdcalc2json_nolock(wb, rc);
2266         i++;
2267     }
2268
2269 //    buffer_strcat(wb, "\n\t},\n\t\"templates\": {");
2270 //    RRDCALCTEMPLATE *rt;
2271 //    for(rt = host->templates; rt ; rt = rt->next)
2272 //        health_rrdcalctemplate2json_nolock(wb, rt);
2273
2274     buffer_strcat(wb, "\n\t}\n}\n");
2275     rrdhost_unlock(&localhost);
2276 }
2277
2278
2279 // ----------------------------------------------------------------------------
2280 // re-load health configuration
2281
2282 static inline void health_free_all_nolock(RRDHOST *host) {
2283     while(host->templates)
2284         rrdcalctemplate_free(host, host->templates);
2285
2286     while(host->alarms)
2287         rrdcalc_free(host, host->alarms);
2288 }
2289
2290 void health_reload(void) {
2291     if(!health_enabled) {
2292         error("Health reload is requested, but health is not enabled.");
2293         return;
2294     }
2295
2296     char *path = health_config_dir();
2297
2298     // free all running alarms
2299     rrdhost_rwlock(&localhost);
2300     health_free_all_nolock(&localhost);
2301     rrdhost_unlock(&localhost);
2302
2303     // invalidate all previous entries in the alarm log
2304     ALARM_ENTRY *t;
2305     for(t = localhost.health_log.alarms ; t ; t = t->next) {
2306         if(t->new_status != RRDCALC_STATUS_REMOVED)
2307             t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
2308     }
2309
2310     // reset all thresholds to all charts
2311     RRDSET *st;
2312     for(st = localhost.rrdset_root; st ; st = st->next) {
2313         st->green = NAN;
2314         st->red = NAN;
2315     }
2316
2317     // load the new alarms
2318     rrdhost_rwlock(&localhost);
2319     health_readdir(path);
2320     rrdhost_unlock(&localhost);
2321
2322     // link the loaded alarms to their charts
2323     for(st = localhost.rrdset_root; st ; st = st->next) {
2324         rrdhost_rwlock(&localhost);
2325
2326         rrdsetcalc_link_matching(st);
2327         rrdcalctemplate_link_matching(st);
2328
2329         rrdhost_unlock(&localhost);
2330     }
2331 }
2332
2333 // ----------------------------------------------------------------------------
2334 // health main thread and friends
2335
2336 static inline int rrdcalc_value2status(calculated_number n) {
2337     if(isnan(n)) return RRDCALC_STATUS_UNDEFINED;
2338     if(n) return RRDCALC_STATUS_RAISED;
2339     return RRDCALC_STATUS_CLEAR;
2340 }
2341
2342 static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
2343     ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
2344
2345     // find the previous notification for the same alarm
2346     ALARM_ENTRY *t;
2347     for(t = ae->next; t ;t = t->next) {
2348         if(t->alarm_id == ae->alarm_id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
2349             break;
2350     }
2351
2352     if(t && t->new_status == ae->new_status) {
2353         // don't send the same notification again
2354         info("Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
2355         goto done;
2356     }
2357
2358     if((ae->old_status == RRDCALC_STATUS_UNDEFINED && ae->new_status == RRDCALC_STATUS_UNINITIALIZED)
2359         || (ae->old_status == RRDCALC_STATUS_UNINITIALIZED && ae->new_status == RRDCALC_STATUS_CLEAR)) {
2360         info("Health not sending notification for first initialization of alarm '%s.%s' status %s", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
2361         goto done;
2362     }
2363
2364     char buffer[FILENAME_MAX + 1];
2365     pid_t command_pid;
2366
2367     const char *exec = ae->exec;
2368     if(!exec) exec = health.health_default_exec;
2369
2370     const char *recipient = ae->recipient;
2371     if(!recipient) recipient = health.health_default_recipient;
2372
2373     snprintfz(buffer, FILENAME_MAX, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s'",
2374               exec,
2375               recipient,
2376               host->hostname,
2377               ae->unique_id,
2378               ae->alarm_id,
2379               ae->alarm_event_id,
2380               (unsigned long)ae->when,
2381               ae->name,
2382               ae->chart?ae->chart:"NOCAHRT",
2383               ae->family?ae->family:"NOFAMILY",
2384               rrdcalc_status2string(ae->new_status),
2385               rrdcalc_status2string(ae->old_status),
2386               ae->new_value,
2387               ae->old_value,
2388               ae->source?ae->source:"UNKNOWN",
2389               (uint32_t)ae->duration,
2390               (uint32_t)ae->non_clear_duration,
2391               ae->units?ae->units:"",
2392               ae->info?ae->info:""
2393     );
2394
2395     ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
2396     ae->exec_run_timestamp = time(NULL);
2397
2398     debug(D_HEALTH, "executing command '%s'", buffer);
2399     FILE *fp = mypopen(buffer, &command_pid);
2400     if(!fp) {
2401         error("HEALTH: Cannot popen(\"%s\", \"r\").", buffer);
2402         goto done;
2403     }
2404     debug(D_HEALTH, "HEALTH reading from command");
2405     char *s = fgets(buffer, FILENAME_MAX, fp);
2406     (void)s;
2407     ae->exec_code = mypclose(fp, command_pid);
2408     debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
2409
2410     if(ae->exec_code != 0)
2411         ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
2412
2413 done:
2414     health_alarm_log_save(host, ae);
2415     return;
2416 }
2417
2418 static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
2419     info("Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
2420          ae->chart?ae->chart:"NOCHART", ae->name,
2421          ae->new_value,
2422          rrdcalc_status2string(ae->old_status),
2423          rrdcalc_status2string(ae->new_status)
2424     );
2425
2426     health_alarm_execute(host, ae);
2427 }
2428
2429 static inline void health_alarm_log_process(RRDHOST *host) {
2430     static uint32_t stop_at_id = 0;
2431     uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
2432     time_t now = time(NULL);
2433
2434     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
2435
2436     ALARM_ENTRY *ae;
2437     for(ae = host->health_log.alarms; ae && ae->unique_id >= stop_at_id ; ae = ae->next) {
2438         if(unlikely(
2439             !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
2440             !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
2441             )) {
2442
2443             if(unlikely(ae->unique_id < first_waiting))
2444                 first_waiting = ae->unique_id;
2445
2446             if(likely(now >= ae->delay_up_to_timestamp))
2447                 health_process_notifications(host, ae);
2448         }
2449     }
2450
2451     // remember this for the next iteration
2452     stop_at_id = first_waiting;
2453
2454     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2455
2456     if(host->health_log.count <= host->health_log.max)
2457         return;
2458
2459     // cleanup excess entries in the log
2460     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
2461
2462     ALARM_ENTRY *last = NULL;
2463     unsigned int count = host->health_log.max * 2 / 3;
2464     for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
2465
2466     if(ae && last && last->next == ae)
2467         last->next = NULL;
2468     else
2469         ae = NULL;
2470
2471     while(ae) {
2472         debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
2473
2474         ALARM_ENTRY *t = ae->next;
2475
2476         freez(ae->name);
2477         freez(ae->chart);
2478         freez(ae->family);
2479         freez(ae->exec);
2480         freez(ae->recipient);
2481         freez(ae->source);
2482         freez(ae->units);
2483         freez(ae->info);
2484         freez(ae);
2485
2486         ae = t;
2487         host->health_log.count--;
2488     }
2489
2490     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2491 }
2492
2493 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
2494     if (unlikely(!rc->rrdset)) {
2495         debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
2496         return 0;
2497     }
2498
2499     if (unlikely(!rc->rrdset->last_collected_time.tv_sec)) {
2500         debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not yet collected.", rc->chart?rc->chart:"NOCHART", rc->name);
2501         return 0;
2502     }
2503
2504     if (unlikely(!rc->update_every)) {
2505         debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
2506         return 0;
2507     }
2508
2509     if (unlikely(rc->next_update > now)) {
2510         if (unlikely(*next_run > rc->next_update))
2511             *next_run = rc->next_update;
2512
2513         debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
2514         return 0;
2515     }
2516
2517     // FIXME
2518     // we should check that the DB lookup is possible
2519     // i.e.
2520     // - the duration of the chart includes the required timeframe
2521     // we SHOULD NOT check the dimensions - there might be alarms that refer non-existing dimensions (e.g. cpu steal)
2522
2523     return 1;
2524 }
2525
2526 void *health_main(void *ptr) {
2527     (void)ptr;
2528
2529     info("HEALTH thread created with task id %d", gettid());
2530
2531     if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
2532         error("Cannot set pthread cancel type to DEFERRED.");
2533
2534     if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
2535         error("Cannot set pthread cancel state to ENABLE.");
2536
2537     int min_run_every = (int)config_get_number("health", "run at least every seconds", 10);
2538     if(min_run_every < 1) min_run_every = 1;
2539
2540     BUFFER *wb = buffer_create(100);
2541
2542     unsigned int loop = 0;
2543     while(health_enabled && !netdata_exit) {
2544         loop++;
2545         debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
2546
2547         int oldstate, runnable = 0;
2548         time_t now = time(NULL);
2549         time_t next_run = now + min_run_every;
2550         RRDCALC *rc;
2551
2552         if (unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
2553             error("Cannot set pthread cancel state to DISABLE.");
2554
2555         rrdhost_rdlock(&localhost);
2556
2557         // the first loop is to lookup values from the db
2558         for (rc = localhost.alarms; rc; rc = rc->next) {
2559             if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run)))
2560                 continue;
2561
2562             runnable++;
2563             rc->old_value = rc->value;
2564
2565             // 1. if there is database lookup, do it
2566             // 2. if there is calculation expression, run it
2567
2568             if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
2569                 time_t old_db_timestamp = rc->db_before;
2570                 int value_is_null = 0;
2571
2572                 int ret = rrd2value(rc->rrdset, wb, &rc->value,
2573                                     rc->dimensions, 1, rc->after, rc->before, rc->group,
2574                                     rc->options, &rc->db_after, &rc->db_before, &value_is_null);
2575
2576                 if (unlikely(ret != 200)) {
2577                     // database lookup failed
2578                     rc->value = NAN;
2579
2580                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
2581
2582                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) {
2583                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
2584                         error("Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
2585                     }
2586                 }
2587                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))
2588                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
2589
2590                 if (unlikely(old_db_timestamp == rc->db_before)) {
2591                     // database is stale
2592
2593                     debug(D_HEALTH, "Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
2594
2595                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
2596                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
2597                         error("Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
2598                     }
2599                 }
2600                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
2601                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
2602
2603                 if (unlikely(value_is_null)) {
2604                     // collected value is null
2605
2606                     rc->value = NAN;
2607
2608                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
2609                           rc->chart?rc->chart:"NOCHART", rc->name);
2610
2611                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) {
2612                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
2613                         error("Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
2614                               rc->chart?rc->chart:"NOCHART", rc->name);
2615                     }
2616                 }
2617                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))
2618                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
2619
2620                 debug(D_HEALTH, "Health alarm '%s.%s': database lookup gave value "
2621                         CALCULATED_NUMBER_FORMAT, rc->chart?rc->chart:"NOCHART", rc->name, rc->value);
2622             }
2623
2624             if(unlikely(rc->calculation)) {
2625                 if (unlikely(!expression_evaluate(rc->calculation))) {
2626                     // calculation failed
2627
2628                     rc->value = NAN;
2629
2630                     debug(D_HEALTH, "Health alarm '%s.%s': failed to evaluate calculation with error: %s",
2631                           rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->calculation->error_msg));
2632
2633                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) {
2634                         rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
2635                         error("Health alarm '%s.%s': failed to evaluate calculation with error: %s",
2636                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->calculation->error_msg));
2637                     }
2638                 }
2639                 else {
2640                     if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))
2641                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
2642
2643                     debug(D_HEALTH, "Health alarm '%s.%s': calculation expression gave value "
2644                             CALCULATED_NUMBER_FORMAT
2645                             ": %s (source: %s)",
2646                           rc->chart?rc->chart:"NOCHART", rc->name,
2647                           rc->calculation->result,
2648                           buffer_tostring(rc->calculation->error_msg),
2649                           rc->source
2650                     );
2651
2652                     rc->value = rc->calculation->result;
2653                 }
2654             }
2655         }
2656         rrdhost_unlock(&localhost);
2657
2658         if (unlikely(runnable && !netdata_exit)) {
2659             rrdhost_rdlock(&localhost);
2660
2661             for (rc = localhost.alarms; rc; rc = rc->next) {
2662                 if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run)))
2663                     continue;
2664
2665                 int warning_status  = RRDCALC_STATUS_UNDEFINED;
2666                 int critical_status = RRDCALC_STATUS_UNDEFINED;
2667
2668                 if(likely(rc->warning)) {
2669                     if(unlikely(!expression_evaluate(rc->warning))) {
2670                         // calculation failed
2671
2672                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression failed with error: %s",
2673                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
2674
2675                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) {
2676                             rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
2677                             error("Health alarm '%s.%s': warning expression failed with error: %s",
2678                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
2679                         }
2680                     }
2681                     else {
2682                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))
2683                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
2684
2685                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression gave value "
2686                                 CALCULATED_NUMBER_FORMAT
2687                                 ": %s (source: %s)",
2688                               rc->chart?rc->chart:"NOCHART", rc->name,
2689                               rc->warning->result,
2690                               buffer_tostring(rc->warning->error_msg),
2691                               rc->source
2692                         );
2693
2694                         warning_status = rrdcalc_value2status(rc->warning->result);
2695                     }
2696                 }
2697
2698                 if(likely(rc->critical)) {
2699                     if(unlikely(!expression_evaluate(rc->critical))) {
2700                         // calculation failed
2701
2702                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression failed with error: %s",
2703                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
2704
2705                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) {
2706                             rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
2707                             error("Health alarm '%s.%s': critical expression failed with error: %s",
2708                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
2709                         }
2710                     }
2711                     else {
2712                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))
2713                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
2714
2715                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression gave value "
2716                                 CALCULATED_NUMBER_FORMAT
2717                                 ": %s (source: %s)",
2718                               rc->chart?rc->chart:"NOCHART", rc->name,
2719                               rc->critical->result,
2720                               buffer_tostring(rc->critical->error_msg),
2721                               rc->source
2722                         );
2723
2724                         critical_status = rrdcalc_value2status(rc->critical->result);
2725                     }
2726                 }
2727
2728                 int status = RRDCALC_STATUS_UNDEFINED;
2729
2730                 switch(warning_status) {
2731                     case RRDCALC_STATUS_CLEAR:
2732                         status = RRDCALC_STATUS_CLEAR;
2733                         break;
2734
2735                     case RRDCALC_STATUS_RAISED:
2736                         status = RRDCALC_STATUS_WARNING;
2737                         break;
2738
2739                     default:
2740                         break;
2741                 }
2742
2743                 switch(critical_status) {
2744                     case RRDCALC_STATUS_CLEAR:
2745                         if(status == RRDCALC_STATUS_UNDEFINED)
2746                             status = RRDCALC_STATUS_CLEAR;
2747                         break;
2748
2749                     case RRDCALC_STATUS_RAISED:
2750                         status = RRDCALC_STATUS_CRITICAL;
2751                         break;
2752
2753                     default:
2754                         break;
2755                 }
2756
2757                 if(status != rc->status) {
2758                     int delay = 0;
2759
2760                     if(now > rc->delay_up_to_timestamp) {
2761                         rc->delay_up_current = rc->delay_up_duration;
2762                         rc->delay_down_current = rc->delay_down_duration;
2763                         rc->delay_last = 0;
2764                         rc->delay_up_to_timestamp = 0;
2765                     }
2766                     else {
2767                         rc->delay_up_current = (int)(rc->delay_up_current * rc->delay_multiplier);
2768                         if(rc->delay_up_current > rc->delay_max_duration) rc->delay_up_current = rc->delay_max_duration;
2769
2770                         rc->delay_down_current = (int)(rc->delay_down_current * rc->delay_multiplier);
2771                         if(rc->delay_down_current > rc->delay_max_duration) rc->delay_down_current = rc->delay_max_duration;
2772                     }
2773
2774                     if(status > rc->status)
2775                         delay = rc->delay_up_current;
2776                     else
2777                         delay = rc->delay_down_current;
2778
2779                     // COMMENTED: because we do need to send raising alarms
2780                     // if(now + delay < rc->delay_up_to_timestamp)
2781                     //    delay = (int)(rc->delay_up_to_timestamp - now);
2782
2783                     rc->delay_last = delay;
2784                     rc->delay_up_to_timestamp = now + delay;
2785                     health_alarm_log(&localhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, rc->delay_last);
2786                     rc->last_status_change = now;
2787                     rc->status = status;
2788                 }
2789
2790                 rc->last_updated = now;
2791                 rc->next_update = now + rc->update_every;
2792
2793                 if (next_run > rc->next_update)
2794                     next_run = rc->next_update;
2795             }
2796
2797             rrdhost_unlock(&localhost);
2798         }
2799
2800         if (unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
2801             error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
2802
2803         if(unlikely(netdata_exit))
2804             break;
2805
2806         // execute notifications
2807         // and cleanup
2808         health_alarm_log_process(&localhost);
2809
2810         if(unlikely(netdata_exit))
2811             break;
2812         
2813         now = time(NULL);
2814         if(now < next_run) {
2815             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs",
2816                   loop, (int) (next_run - now));
2817             sleep_usec(1000000 * (unsigned long long) (next_run - now));
2818         }
2819         else {
2820             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
2821         }
2822     }
2823
2824     buffer_free(wb);
2825
2826     info("HEALTH thread exiting");
2827     pthread_exit(NULL);
2828     return NULL;
2829 }