arthur.barton.de Git - netdata.git/blob - src/health.c
health log is saved and loaded back
[netdata.git] / src / health.c
1 #include "common.h"
2
3 #define RRDVAR_MAX_LENGTH 1024
4
5 struct health_options {
6     const char *health_default_exec;
7     const char *health_default_recipient;
8     const char *log_filename;
9     FILE *log_fp;
10 };
11
12 static struct health_options health = {
13     .health_default_exec = PLUGINS_DIR "/alarm-notify.sh",
14     .health_default_recipient = "root",
15     .log_filename = VARLIB_DIR "/health/alarm_log.db",
16     .log_fp = NULL
17 };
18
19 int health_enabled = 1;
20
21 // ----------------------------------------------------------------------------
22 // health alarm log load/save
23 // no need for locking - only one thread is reading / writing the alarms log
24
25 static inline int health_alarm_log_open(void) {
26     if(health.log_fp)
27         fclose(health.log_fp);
28
29     health.log_fp = fopen(health.log_filename, "a");
30
31     if(health.log_fp) {
32         if (setvbuf(health.log_fp, NULL, _IOLBF, 0) != 0)
33             error("Cannot set line buffering on health log file.");
34         return 0;
35     }
36
37     error("Cannot open health log file '%s'. Health data will be lost in case of netdata or server crash.", health.log_filename);
38     return -1;
39 }
40
41 static inline void health_alarm_log_close(void) {
42     if(health.log_fp) {
43         fclose(health.log_fp);
44         health.log_fp = NULL;
45     }
46 }
47
48 static inline void health_log_recreate(void) {
49     if(health.log_fp != NULL) {
50         health_alarm_log_close();
51
52         // open it with truncate
53         health.log_fp = fopen(health.log_filename, "w");
54         if(health.log_fp) fclose(health.log_fp);
55         else error("Cannot truncate health log '%s'", health.log_filename);
56
57         health.log_fp = NULL;
58
59         health_alarm_log_open();
60     }
61 }
62
63 static inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
64     (void)host;
65     (void)ae;
66
67     if(likely(health.log_fp)) {
68         if(unlikely(fprintf(health.log_fp
69                 , "%c\t%s"
70                   "\t%08x\t%08x\t%08x\t%08x\t%08x"
71                   "\t%08x\t%08x\t%08x"
72                   "\t%08x\t%08x\t%08x"
73                   "\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s"
74                   "\t%d\t%d\t%d\t%d"
75                   "\t%Lf\t%Lf"
76                   "\n"
77                 , (ae->flags & HEALTH_ENTRY_FLAG_SAVED)?'U':'A'
78                 , host->hostname
79
80                 , ae->unique_id
81                 , ae->alarm_id
82                 , ae->alarm_event_id
83                 , ae->updated_by_id
84                 , ae->updates_id
85
86                 , (uint32_t)ae->when
87                 , (uint32_t)ae->duration
88                 , (uint32_t)ae->non_clear_duration
89                 , (uint32_t)ae->flags
90                 , (uint32_t)ae->exec_run_timestamp
91                 , (uint32_t)ae->delay_up_to_timestamp
92
93                 , (ae->name)?ae->name:""
94                 , (ae->chart)?ae->chart:""
95                 , (ae->family)?ae->family:""
96                 , (ae->exec)?ae->exec:""
97                 , (ae->recipient)?ae->recipient:""
98                 , (ae->source)?ae->source:""
99                 , (ae->units)?ae->units:""
100                 , (ae->info)?ae->info:""
101
102                 , ae->exec_code
103                 , ae->new_status
104                 , ae->old_status
105                 , ae->delay
106
107                 , (long double)ae->new_value
108                 , (long double)ae->old_value
109         ) < 0))
110             error("Health: failed to save alarm log entry. Health data may be lost in case of abnormal restart.");
111         else
112             ae->flags |= HEALTH_ENTRY_FLAG_SAVED;
113     }
114 }
115
116 static inline void health_alarm_log_load(RRDHOST *host) {
117     (void)host;
118
119     health_alarm_log_close();
120
121     FILE *fp = fopen(health.log_filename, "r");
122     if(!fp)
123         error("Registry: cannot open health file: %s", health.log_filename);
124     else {
125         errno = 0;
126
127         char *s, *buf = mallocz(65536 + 1);
128         size_t line = 0;
129         size_t len = 0;
130
131         while((s = fgets_trim_len(buf, 65536, fp, &len))) {
132             line++;
133             // fprintf(stderr, "line %zu: '%s'\n", line, s);
134
135             int max_entries = 30, entries = 0;
136             char *pointers[max_entries];
137
138             pointers[entries++] = s;
139             while(*s) {
140                 if(unlikely(*s == '\t')) {
141                     *s = '\0';
142                     pointers[entries++] = s;
143                     if(entries > max_entries) {
144                         error("Line %zd of file '%s' has more than %d entries. Ignoring excessive entries.", line, health.log_filename, max_entries);
145                         break;
146                     }
147                 }
148                 s++;
149             }
150
151             if(likely(*pointers[0] == 'U' || *pointers[0] == 'A')) {
152                 ALARM_ENTRY *ae = NULL;
153
154                 if(entries < 26) {
155                     error("Line %zd of file '%s' should have at least 26 entries, but it has %d. Ignoring line.", line, health.log_filename, entries);
156                     continue;
157                 }
158
159                 // if this is an update, find it
160                 if(unlikely(*pointers[0] == 'U')) {
161                     uint32_t unique_id = (uint32_t)strtoul(pointers[2], NULL, 16);
162
163                     // fprintf(stderr, "searching for alarm entry with unique id %u\n", unique_id);
164
165                     // find it
166                     for(ae = host->health_log.alarms; ae ;ae = ae->next) {
167                         if(unlikely(ae->unique_id == unique_id)) break;
168                     }
169
170                     if(!ae) {
171                         *pointers[0] = 'A';
172                         error("Line %zd of file '%s' updates alarm log entry with unique id %u, but it is not found.", line, health.log_filename, unique_id);
173                     }
174                 }
175
176                 // create a new one
177                 if(likely(!ae)) {
178                     ae = callocz(1, sizeof(ALARM_ENTRY));
179                 }
180
181                 if(strcmp(pointers[1], host->hostname))
182                     error("Line %zd of file '%s' provides an alarm for host '%s' but this is named '%s'.", line, health.log_filename, pointers[1], host->hostname);
183
184                 ae->unique_id               = (uint32_t)strtoul(pointers[2], NULL, 16);
185                 ae->alarm_id                = (uint32_t)strtoul(pointers[3], NULL, 16);
186                 ae->alarm_event_id          = (uint32_t)strtoul(pointers[4], NULL, 16);
187                 ae->updated_by_id           = (uint32_t)strtoul(pointers[5], NULL, 16);
188                 ae->updates_id              = (uint32_t)strtoul(pointers[6], NULL, 16);
189
190                 ae->when                    = (uint32_t)strtoul(pointers[7], NULL, 16);
191                 ae->duration                = (uint32_t)strtoul(pointers[8], NULL, 16);
192                 ae->non_clear_duration      = (uint32_t)strtoul(pointers[9], NULL, 16);
193                 ae->flags                   = (uint32_t)strtoul(pointers[10], NULL, 16);
194                 ae->exec_run_timestamp      = (uint32_t)strtoul(pointers[11], NULL, 16);
195                 ae->delay_up_to_timestamp   = (uint32_t)strtoul(pointers[12], NULL, 16);
196
197                 if(unlikely(ae->name)) freez(ae->name);
198                 ae->name = strdupz(pointers[13]);
199
200                 if(unlikely(ae->chart)) freez(ae->chart);
201                 ae->chart = strdupz(pointers[14]);
202
203                 if(unlikely(ae->family)) freez(ae->family);
204                 ae->family = strdupz(pointers[15]);
205
206                 if(unlikely(ae->exec)) freez(ae->exec);
207                 ae->exec = strdupz(pointers[16]);
208                 if(!*ae->exec) { freez(ae->exec); ae->exec = NULL; }
209
210                 if(unlikely(ae->recipient)) freez(ae->recipient);
211                 ae->recipient = strdupz(pointers[17]);
212                 if(!*ae->recipient) { freez(ae->recipient); ae->recipient = NULL; }
213
214                 if(unlikely(ae->source)) freez(ae->source);
215                 ae->source = strdupz(pointers[18]);
216                 if(!*ae->source) { freez(ae->source); ae->source = NULL; }
217
218                 if(unlikely(ae->units)) freez(ae->units);
219                 ae->units = strdupz(pointers[19]);
220                 if(!*ae->units) { freez(ae->units); ae->units = NULL; }
221
222                 if(unlikely(ae->info)) freez(ae->info);
223                 ae->info = strdupz(pointers[20]);
224                 if(!*ae->info) { freez(ae->info); ae->info = NULL; }
225
226                 ae->exec_code   = atoi(pointers[21]);
227                 ae->new_status  = atoi(pointers[22]);
228                 ae->old_status  = atoi(pointers[23]);
229                 ae->delay       = atoi(pointers[24]);
230
231                 ae->new_value   = strtold(pointers[25], NULL);
232                 ae->old_value   = strtold(pointers[26], NULL);
233
234                 // add it to host if not already there
235                 if(unlikely(*pointers[0] == 'A')) {
236                     ae->next = host->health_log.alarms;
237                     host->health_log.alarms = ae;
238                 }
239
240                 if(unlikely(ae->unique_id >= host->health_log.next_log_id))
241                     host->health_log.next_log_id = ae->unique_id + 1;
242
243                 if(unlikely(ae->alarm_id >= host->health_log.next_alarm_id))
244                     host->health_log.next_alarm_id = ae->alarm_id + 1;
245             }
246             else {
247                 error("Line %zd of file '%s' is invalid (unrecognized entry type '%s').", line, health.log_filename, pointers[0]);
248             }
249         }
250
251         freez(buf);
252         fclose(fp);
253     }
254
255     health_alarm_log_open();
256 }
257
258 // ----------------------------------------------------------------------------
259 // health alarm log management
260
261 static inline void health_alarm_log(RRDHOST *host,
262                 uint32_t alarm_id, uint32_t alarm_event_id,
263                 time_t when,
264                 const char *name, const char *chart, const char *family,
265                 const char *exec, const char *recipient, time_t duration,
266                 calculated_number old_value, calculated_number new_value,
267                 int old_status, int new_status,
268                 const char *source,
269                 const char *units,
270                 const char *info,
271                 int delay
272 ) {
273     debug(D_HEALTH, "Health adding alarm log entry with id: %u", host->health_log.next_log_id);
274
275     ALARM_ENTRY *ae = callocz(1, sizeof(ALARM_ENTRY));
276     ae->name = strdupz(name);
277     ae->hash_name = simple_hash(ae->name);
278
279     if(chart) {
280         ae->chart = strdupz(chart);
281         ae->hash_chart = simple_hash(ae->chart);
282     }
283
284     if(family)
285         ae->family = strdupz(family);
286
287     if(exec) ae->exec = strdupz(exec);
288     if(recipient) ae->recipient = strdupz(recipient);
289     if(source) ae->source = strdupz(source);
290     if(units) ae->units = strdupz(units);
291     if(info) ae->info = strdupz(info);
292
293     ae->unique_id = host->health_log.next_log_id++;
294     ae->alarm_id = alarm_id;
295     ae->alarm_event_id = alarm_event_id;
296     ae->when = when;
297     ae->old_value = old_value;
298     ae->new_value = new_value;
299     ae->old_status = old_status;
300     ae->new_status = new_status;
301     ae->duration = duration;
302     ae->delay = delay;
303     ae->delay_up_to_timestamp = when + delay;
304
305     if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL)
306         ae->non_clear_duration += ae->duration;
307
308     // link it
309     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
310     ae->next = host->health_log.alarms;
311     host->health_log.alarms = ae;
312     host->health_log.count++;
313     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
314
315     // match previous alarms
316     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
317     ALARM_ENTRY *t;
318     for(t = host->health_log.alarms ; t ; t = t->next) {
319         if(t != ae && t->alarm_id == ae->alarm_id) {
320             if(!(t->flags & HEALTH_ENTRY_FLAG_UPDATED) && !t->updated_by_id) {
321                 t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
322                 t->updated_by_id = ae->unique_id;
323                 ae->updates_id = t->unique_id;
324
325                 if((t->new_status == RRDCALC_STATUS_WARNING || t->new_status == RRDCALC_STATUS_CRITICAL) &&
326                    (t->old_status == RRDCALC_STATUS_WARNING || t->old_status == RRDCALC_STATUS_CRITICAL))
327                     ae->non_clear_duration += t->non_clear_duration;
328
329                 health_alarm_log_save(host, t);
330             }
331             else {
332                 // no need to continue
333                 break;
334             }
335         }
336     }
337     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
338
339     health_alarm_log_save(host, ae);
340 }
341
342 // ----------------------------------------------------------------------------
343 // RRDVAR management
344
// Sanitises a variable name in place: every character that is not
// alphanumeric, '.' or '_' is replaced with '_'.
// Returns the number of characters replaced.
static inline int rrdvar_fix_name(char *variable) {
    int fixed = 0;

    for( ; *variable ; variable++) {
        // the <ctype.h> functions are only defined for values representable
        // as unsigned char (or EOF), so bytes >= 0x80 must not be passed as
        // a (possibly negative) plain char
        unsigned char c = (unsigned char)*variable;

        if(isalnum(c) || c == '.' || c == '_')
            continue;

        *variable = '_';
        fixed++;
    }

    return fixed;
}
358
359 int rrdvar_compare(void* a, void* b) {
360     if(((RRDVAR *)a)->hash < ((RRDVAR *)b)->hash) return -1;
361     else if(((RRDVAR *)a)->hash > ((RRDVAR *)b)->hash) return 1;
362     else return strcmp(((RRDVAR *)a)->name, ((RRDVAR *)b)->name);
363 }
364
365 static inline RRDVAR *rrdvar_index_add(avl_tree_lock *tree, RRDVAR *rv) {
366     RRDVAR *ret = (RRDVAR *)avl_insert_lock(tree, (avl *)(rv));
367     if(ret != rv)
368         debug(D_VARIABLES, "Request to insert RRDVAR '%s' into index failed. Already exists.", rv->name);
369
370     return ret;
371 }
372
373 static inline RRDVAR *rrdvar_index_del(avl_tree_lock *tree, RRDVAR *rv) {
374     RRDVAR *ret = (RRDVAR *)avl_remove_lock(tree, (avl *)(rv));
375     if(!ret)
376         error("Request to remove RRDVAR '%s' from index failed. Not Found.", rv->name);
377
378     return ret;
379 }
380
381 static inline RRDVAR *rrdvar_index_find(avl_tree_lock *tree, const char *name, uint32_t hash) {
382     RRDVAR tmp;
383     tmp.name = (char *)name;
384     tmp.hash = (hash)?hash:simple_hash(tmp.name);
385
386     return (RRDVAR *)avl_search_lock(tree, (avl *)&tmp);
387 }
388
389 static inline void rrdvar_free(RRDHOST *host, avl_tree_lock *tree, RRDVAR *rv) {
390     (void)host;
391
392     if(!rv) return;
393
394     if(tree)
395         rrdvar_index_del(tree, rv);
396
397     freez(rv->name);
398     freez(rv);
399 }
400
401 static inline RRDVAR *rrdvar_create_and_index(const char *scope, avl_tree_lock *tree, const char *name, int type, calculated_number *value) {
402     char *variable = strdupz(name);
403     rrdvar_fix_name(variable);
404     uint32_t hash = simple_hash(variable);
405
406     RRDVAR *rv = rrdvar_index_find(tree, variable, hash);
407     if(unlikely(!rv)) {
408         debug(D_VARIABLES, "Variable '%s' not found in scope '%s'. Creating a new one.", variable, scope);
409
410         rv = callocz(1, sizeof(RRDVAR));
411         rv->name = variable;
412         rv->hash = hash;
413         rv->type = type;
414         rv->value = value;
415
416         RRDVAR *ret = rrdvar_index_add(tree, rv);
417         if(unlikely(ret != rv)) {
418             debug(D_VARIABLES, "Variable '%s' in scope '%s' already exists", variable, scope);
419             rrdvar_free(NULL, NULL, rv);
420             rv = NULL;
421         }
422         else
423             debug(D_VARIABLES, "Variable '%s' created in scope '%s'", variable, scope);
424     }
425     else {
426         // already exists
427         freez(variable);
428         rv = NULL;
429     }
430
431     return rv;
432 }
433
434 // ----------------------------------------------------------------------------
435 // RRDVAR lookup
436
437 calculated_number rrdvar2number(RRDVAR *rv) {
438     switch(rv->type) {
439         case RRDVAR_TYPE_CALCULATED: {
440             calculated_number *n = (calculated_number *)rv->value;
441             return *n;
442         }
443
444         case RRDVAR_TYPE_TIME_T: {
445             time_t *n = (time_t *)rv->value;
446             return *n;
447         }
448
449         case RRDVAR_TYPE_COLLECTED: {
450             collected_number *n = (collected_number *)rv->value;
451             return *n;
452         }
453
454         case RRDVAR_TYPE_TOTAL: {
455             total_number *n = (total_number *)rv->value;
456             return *n;
457         }
458
459         case RRDVAR_TYPE_INT: {
460             int *n = (int *)rv->value;
461             return *n;
462         }
463
464         default:
465             error("I don't know how to convert RRDVAR type %d to calculated_number", rv->type);
466             return NAN;
467     }
468 }
469
470 void dump_variable(void *data) {
471     RRDVAR *rv = (RRDVAR *)data;
472     debug(D_HEALTH, "%50s : %20.5Lf", rv->name, rrdvar2number(rv));
473 }
474
475 int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, calculated_number *result) {
476     RRDSET *st = rc->rrdset;
477     RRDVAR *rv;
478
479     if(!st) return 0;
480
481     rv = rrdvar_index_find(&st->variables_root_index, variable, hash);
482     if(rv) {
483         *result = rrdvar2number(rv);
484         return 1;
485     }
486
487     rv = rrdvar_index_find(&st->rrdfamily->variables_root_index, variable, hash);
488     if(rv) {
489         *result = rrdvar2number(rv);
490         return 1;
491     }
492
493     rv = rrdvar_index_find(&st->rrdhost->variables_root_index, variable, hash);
494     if(rv) {
495         *result = rrdvar2number(rv);
496         return 1;
497     }
498
499     debug(D_HEALTH, "Available local chart '%s' variables:", st->id);
500     avl_traverse_lock(&st->variables_root_index, dump_variable);
501
502     debug(D_HEALTH, "Available family '%s' variables:", st->rrdfamily->family);
503     avl_traverse_lock(&st->rrdfamily->variables_root_index, dump_variable);
504
505     debug(D_HEALTH, "Available host '%s' variables:", st->rrdhost->hostname);
506     avl_traverse_lock(&st->rrdhost->variables_root_index, dump_variable);
507
508     return 0;
509 }
510
511 // ----------------------------------------------------------------------------
512 // RRDSETVAR management
513
514 RRDSETVAR *rrdsetvar_create(RRDSET *st, const char *variable, int type, void *value, uint32_t options) {
515     debug(D_VARIABLES, "RRDVARSET create for chart id '%s' name '%s' with variable name '%s'", st->id, st->name, variable);
516     RRDSETVAR *rs = (RRDSETVAR *)callocz(1, sizeof(RRDSETVAR));
517
518     char buffer[RRDVAR_MAX_LENGTH + 1];
519     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->id, variable);
520     rs->fullid = strdupz(buffer);
521
522     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, variable);
523     rs->fullname = strdupz(buffer);
524
525     rs->variable = strdupz(variable);
526
527     rs->type = type;
528     rs->value = value;
529     rs->options = options;
530     rs->rrdset = st;
531
532     rs->local       = rrdvar_create_and_index("local",  &st->variables_root_index, rs->variable, rs->type, rs->value);
533     rs->family      = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullid, rs->type, rs->value);
534     rs->host        = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullid, rs->type, rs->value);
535     rs->family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullname, rs->type, rs->value);
536     rs->host_name   = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullname, rs->type, rs->value);
537
538     rs->next = st->variables;
539     st->variables = rs;
540
541     return rs;
542 }
543
544 void rrdsetvar_rename_all(RRDSET *st) {
545     debug(D_VARIABLES, "RRDSETVAR rename for chart id '%s' name '%s'", st->id, st->name);
546
547     // only these 2 can change name
548     // rs->family_name
549     // rs->host_name
550
551     char buffer[RRDVAR_MAX_LENGTH + 1];
552     RRDSETVAR *rs, *next = st->variables;
553     while((rs = next)) {
554         next = rs->next;
555
556         snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rs->variable);
557
558         if (strcmp(buffer, rs->fullname)) {
559             // name changed
560             rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
561             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_name);
562
563             freez(rs->fullname);
564             rs->fullname = strdupz(st->name);
565             rs->family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullname, rs->type, rs->value);
566             rs->host_name   = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullname, rs->type, rs->value);
567         }
568     }
569
570     rrdsetcalc_link_matching(st);
571 }
572
573 void rrdsetvar_free(RRDSETVAR *rs) {
574     RRDSET *st = rs->rrdset;
575     debug(D_VARIABLES, "RRDSETVAR free for chart id '%s' name '%s', variable '%s'", st->id, st->name, rs->variable);
576
577     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local);
578     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family);
579     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host);
580     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
581     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_name);
582
583     if(st->variables == rs) {
584         st->variables = rs->next;
585     }
586     else {
587         RRDSETVAR *t;
588         for (t = st->variables; t && t->next != rs; t = t->next);
589         if(!t) error("RRDSETVAR '%s' not found in chart '%s' variables linked list", rs->fullname, st->id);
590         else t->next = rs->next;
591     }
592
593     freez(rs->fullid);
594     freez(rs->fullname);
595     freez(rs->variable);
596     freez(rs);
597 }
598
599 // ----------------------------------------------------------------------------
600 // RRDDIMVAR management
601
602 #define RRDDIMVAR_ID_MAX 1024
603
604 RRDDIMVAR *rrddimvar_create(RRDDIM *rd, int type, const char *prefix, const char *suffix, void *value, uint32_t options) {
605     RRDSET *st = rd->rrdset;
606
607     debug(D_VARIABLES, "RRDDIMSET create for chart id '%s' name '%s', dimension id '%s', name '%s%s%s'", st->id, st->name, rd->id, (prefix)?prefix:"", rd->name, (suffix)?suffix:"");
608
609     if(!prefix) prefix = "";
610     if(!suffix) suffix = "";
611
612     char buffer[RRDDIMVAR_ID_MAX + 1];
613     RRDDIMVAR *rs = (RRDDIMVAR *)callocz(1, sizeof(RRDDIMVAR));
614
615     rs->prefix = strdupz(prefix);
616     rs->suffix = strdupz(suffix);
617
618     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->id, rs->suffix);
619     rs->id = strdupz(buffer);
620
621     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
622     rs->name = strdupz(buffer);
623
624     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->id, rs->id);
625     rs->fullidid = strdupz(buffer);
626
627     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->id, rs->name);
628     rs->fullidname = strdupz(buffer);
629
630     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->name, rs->id);
631     rs->fullnameid = strdupz(buffer);
632
633     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->name, rs->name);
634     rs->fullnamename = strdupz(buffer);
635
636     rs->type = type;
637     rs->value = value;
638     rs->options = options;
639     rs->rrddim = rd;
640
641     rs->local_id     = rrdvar_create_and_index("local", &st->variables_root_index, rs->id, rs->type, rs->value);
642     rs->local_name   = rrdvar_create_and_index("local", &st->variables_root_index, rs->name, rs->type, rs->value);
643
644     rs->family_id    = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->id, rs->type, rs->value);
645     rs->family_name  = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->name, rs->type, rs->value);
646
647     rs->host_fullidid     = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullidid, rs->type, rs->value);
648     rs->host_fullidname   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullidname, rs->type, rs->value);
649     rs->host_fullnameid   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullnameid, rs->type, rs->value);
650     rs->host_fullnamename = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullnamename, rs->type, rs->value);
651
652     rs->next = rd->variables;
653     rd->variables = rs;
654
655     return rs;
656 }
657
658 void rrddimvar_rename_all(RRDDIM *rd) {
659     RRDSET *st = rd->rrdset;
660     debug(D_VARIABLES, "RRDDIMSET rename for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
661
662     RRDDIMVAR *rs, *next = rd->variables;
663     while((rs = next)) {
664         next = rs->next;
665
666         if (strcmp(rd->name, rs->name)) {
667             char buffer[RRDDIMVAR_ID_MAX + 1];
668             // name changed
669
670             // name
671             rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_name);
672             freez(rs->name);
673             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
674             rs->name = strdupz(buffer);
675             rs->local_name = rrdvar_create_and_index("local", &st->variables_root_index, rs->name, rs->type, rs->value);
676
677             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidname);
678             freez(rs->fullidname);
679             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->name);
680             rs->fullidname = strdupz(buffer);
681             rs->host_fullidname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index,
682                                                              rs->fullidname, rs->type, rs->value);
683
684             // fullnameid
685             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnameid);
686             freez(rs->fullnameid);
687             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->id);
688             rs->fullnameid = strdupz(buffer);
689             rs->host_fullnameid = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index,
690                                                           rs->fullnameid, rs->type, rs->value);
691
692             // fullnamename
693             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnamename);
694             freez(rs->fullnamename);
695             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->name);
696             rs->fullnamename = strdupz(buffer);
697             rs->host_fullnamename = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index,
698                                                           rs->fullnamename, rs->type, rs->value);
699         }
700     }
701 }
702
703 void rrddimvar_free(RRDDIMVAR *rs) {
704     RRDDIM *rd = rs->rrddim;
705     RRDSET *st = rd->rrdset;
706     debug(D_VARIABLES, "RRDDIMSET free for chart id '%s' name '%s', dimension id '%s', name '%s', prefix='%s', suffix='%s'", st->id, st->name, rd->id, rd->name, rs->prefix, rs->suffix);
707
708     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_id);
709     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_name);
710
711     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_id);
712     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
713
714     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidid);
715     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidname);
716     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnameid);
717     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnamename);
718
719     if(rd->variables == rs) {
720         debug(D_VARIABLES, "RRDDIMSET removing first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
721         rd->variables = rs->next;
722     }
723     else {
724         debug(D_VARIABLES, "RRDDIMSET removing non-first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
725         RRDDIMVAR *t;
726         for (t = rd->variables; t && t->next != rs; t = t->next) ;
727         if(!t) error("RRDDIMVAR '%s' not found in dimension '%s/%s' variables linked list", rs->name, st->id, rd->id);
728         else t->next = rs->next;
729     }
730
731     freez(rs->prefix);
732     freez(rs->suffix);
733     freez(rs->id);
734     freez(rs->name);
735     freez(rs->fullidid);
736     freez(rs->fullidname);
737     freez(rs->fullnameid);
738     freez(rs->fullnamename);
739     freez(rs);
740 }
741
742 // ----------------------------------------------------------------------------
743 // RRDCALC management
744
745 static inline const char *rrdcalc_status2string(int status) {
746     switch(status) {
747         case RRDCALC_STATUS_REMOVED:
748             return "REMOVED";
749
750         case RRDCALC_STATUS_UNDEFINED:
751             return "UNDEFINED";
752
753         case RRDCALC_STATUS_UNINITIALIZED:
754             return "UNINITIALIZED";
755
756         case RRDCALC_STATUS_CLEAR:
757             return "CLEAR";
758
759         case RRDCALC_STATUS_RAISED:
760             return "RAISED";
761
762         case RRDCALC_STATUS_WARNING:
763             return "WARNING";
764
765         case RRDCALC_STATUS_CRITICAL:
766             return "CRITICAL";
767
768         default:
769             error("Unknown alarm status %d", status);
770             return "UNKNOWN";
771     }
772 }
773
774 static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
775     debug(D_HEALTH, "Health linking alarm '%s.%s' to chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, st->rrdhost->hostname);
776
777     rc->last_status_change = time(NULL);
778     rc->rrdset = st;
779
780     rc->rrdset_next = st->alarms;
781     rc->rrdset_prev = NULL;
782     
783     if(rc->rrdset_next)
784         rc->rrdset_next->rrdset_prev = rc;
785
786     st->alarms = rc;
787
788     if(rc->update_every < rc->rrdset->update_every) {
789         error("Health alarm '%s.%s' has update every %d, less than chart update every %d. Setting alarm update frequency to %d.", rc->rrdset->id, rc->name, rc->update_every, rc->rrdset->update_every, rc->rrdset->update_every);
790         rc->update_every = rc->rrdset->update_every;
791     }
792
793     if(!isnan(rc->green) && isnan(st->green)) {
794         debug(D_HEALTH, "Health alarm '%s.%s' green threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->green, rc->green);
795         st->green = rc->green;
796     }
797
798     if(!isnan(rc->red) && isnan(st->red)) {
799         debug(D_HEALTH, "Health alarm '%s.%s' red threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->red, rc->red);
800         st->red = rc->red;
801     }
802
803     rc->local  = rrdvar_create_and_index("local",  &st->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
804     rc->family = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
805
806     char fullname[RRDVAR_MAX_LENGTH + 1];
807     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->id, rc->name);
808     rc->hostid   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
809
810     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rc->name);
811     rc->hostname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
812
813         if(!rc->units) rc->units = strdupz(st->units);
814
815     {
816         time_t now = time(NULL);
817         health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_UNINITIALIZED, rc->source, rc->units, rc->info, 0);
818     }
819 }
820
821 static inline int rrdcalc_is_matching_this_rrdset(RRDCALC *rc, RRDSET *st) {
822     if(     (rc->hash_chart == st->hash      && !strcmp(rc->chart, st->id)) ||
823             (rc->hash_chart == st->hash_name && !strcmp(rc->chart, st->name)))
824         return 1;
825
826     return 0;
827 }
828
829 // this has to be called while the RRDHOST is locked
830 inline void rrdsetcalc_link_matching(RRDSET *st) {
831     // debug(D_HEALTH, "find matching alarms for chart '%s'", st->id);
832
833     RRDCALC *rc;
834     for(rc = st->rrdhost->alarms; rc ; rc = rc->next) {
835         if(unlikely(rc->rrdset))
836             continue;
837
838         if(unlikely(rrdcalc_is_matching_this_rrdset(rc, st)))
839             rrdsetcalc_link(st, rc);
840     }
841 }
842
843 // this has to be called while the RRDHOST is locked
844 inline void rrdsetcalc_unlink(RRDCALC *rc) {
845     RRDSET *st = rc->rrdset;
846
847     if(!st) {
848         debug(D_HEALTH, "Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
849         error("Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
850         return;
851     }
852
853     {
854         time_t now = time(NULL);
855         health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0);
856     }
857
858     RRDHOST *host = st->rrdhost;
859
860     debug(D_HEALTH, "Health unlinking alarm '%s.%s' from chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, host->hostname);
861
862     // unlink it
863     if(rc->rrdset_prev)
864         rc->rrdset_prev->rrdset_next = rc->rrdset_next;
865
866     if(rc->rrdset_next)
867         rc->rrdset_next->rrdset_prev = rc->rrdset_prev;
868
869     if(st->alarms == rc)
870         st->alarms = rc->rrdset_next;
871
872     rc->rrdset_prev = rc->rrdset_next = NULL;
873
874     rrdvar_free(st->rrdhost, &st->variables_root_index, rc->local);
875     rc->local = NULL;
876
877     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rc->family);
878     rc->family = NULL;
879
880     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostid);
881     rc->hostid = NULL;
882
883     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostname);
884     rc->hostname = NULL;
885
886     rc->rrdset = NULL;
887
888     // RRDCALC will remain in RRDHOST
889     // so that if the matching chart is found in the future
890     // it will be applied automatically
891 }
892
893 RRDCALC *rrdcalc_find(RRDSET *st, const char *name) {
894     RRDCALC *rc;
895     uint32_t hash = simple_hash(name);
896
897     for( rc = st->alarms; rc ; rc = rc->rrdset_next ) {
898         if(unlikely(rc->hash == hash && !strcmp(rc->name, name)))
899             return rc;
900     }
901
902     return NULL;
903 }
904
905 static inline int rrdcalc_exists(RRDHOST *host, const char *chart, const char *name, uint32_t hash_chart, uint32_t hash_name) {
906     RRDCALC *rc;
907
908     if(unlikely(!chart)) {
909         error("attempt to find RRDCALC '%s' without giving a chart name", name);
910         return 1;
911     }
912
913     if(unlikely(!hash_chart)) hash_chart = simple_hash(chart);
914     if(unlikely(!hash_name))  hash_name  = simple_hash(name);
915
916     // make sure it does not already exist
917     for(rc = host->alarms; rc ; rc = rc->next) {
918         if (unlikely(rc->chart && rc->hash == hash_name && rc->hash_chart == hash_chart && !strcmp(name, rc->name) && !strcmp(chart, rc->chart))) {
919             debug(D_HEALTH, "Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
920             error("Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
921             return 1;
922         }
923     }
924
925     return 0;
926 }
927
928 static inline uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const char *name, uint32_t *next_event_id) {
929     if(chart && name) {
930         uint32_t hash_chart = simple_hash(chart);
931         uint32_t hash_name = simple_hash(name);
932
933         // re-use old IDs, by looking them up in the alarm log
934         ALARM_ENTRY *ae;
935         for(ae = host->health_log.alarms; ae ;ae = ae->next) {
936             if(unlikely(ae->hash_name == hash_name && ae->hash_chart == hash_chart && !strcmp(name, ae->name) && !strcmp(chart, ae->chart))) {
937                 if(next_event_id) *next_event_id = ae->alarm_event_id + 1;
938                 return ae->alarm_id;
939             }
940         }
941     }
942
943     return host->health_log.next_alarm_id++;
944 }
945
946 static inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) {
947     rrdhost_check_rdlock(host);
948
949     if(rc->calculation) {
950         rc->calculation->status = &rc->status;
951         rc->calculation->this = &rc->value;
952         rc->calculation->after = &rc->db_after;
953         rc->calculation->before = &rc->db_before;
954         rc->calculation->rrdcalc = rc;
955     }
956
957     if(rc->warning) {
958         rc->warning->status = &rc->status;
959         rc->warning->this = &rc->value;
960         rc->warning->after = &rc->db_after;
961         rc->warning->before = &rc->db_before;
962         rc->warning->rrdcalc = rc;
963     }
964
965     if(rc->critical) {
966         rc->critical->status = &rc->status;
967         rc->critical->this = &rc->value;
968         rc->critical->after = &rc->db_after;
969         rc->critical->before = &rc->db_before;
970         rc->critical->rrdcalc = rc;
971     }
972
973     // link it to the host
974     if(likely(host->alarms)) {
975         // append it
976         RRDCALC *t;
977         for(t = host->alarms; t && t->next ; t = t->next) ;
978         t->next = rc;
979     }
980     else {
981         host->alarms = rc;
982     }
983
984     // link it to its chart
985     RRDSET *st;
986     for(st = host->rrdset_root; st ; st = st->next) {
987         if(rrdcalc_is_matching_this_rrdset(rc, st)) {
988             rrdsetcalc_link(st, rc);
989             break;
990         }
991     }
992 }
993
994 static inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart) {
995
996     debug(D_HEALTH, "Health creating dynamic alarm (from template) '%s.%s'", chart, rt->name);
997
998     if(rrdcalc_exists(host, chart, rt->name, 0, 0))
999         return NULL;
1000
1001     RRDCALC *rc = callocz(1, sizeof(RRDCALC));
1002     rc->next_event_id = 1;
1003     rc->id = rrdcalc_get_unique_id(host, chart, rt->name, &rc->next_event_id);
1004     rc->name = strdupz(rt->name);
1005     rc->hash = simple_hash(rc->name);
1006     rc->chart = strdupz(chart);
1007     rc->hash_chart = simple_hash(rc->chart);
1008
1009     if(rt->dimensions) rc->dimensions = strdupz(rt->dimensions);
1010
1011     rc->green = rt->green;
1012     rc->red = rt->red;
1013     rc->value = NAN;
1014     rc->old_value = NAN;
1015
1016     rc->delay_up_duration = rt->delay_up_duration;
1017     rc->delay_down_duration = rt->delay_down_duration;
1018     rc->delay_max_duration = rt->delay_max_duration;
1019     rc->delay_multiplier = rt->delay_multiplier;
1020
1021     rc->group = rt->group;
1022     rc->after = rt->after;
1023     rc->before = rt->before;
1024     rc->update_every = rt->update_every;
1025     rc->options = rt->options;
1026
1027     if(rt->exec) rc->exec = strdupz(rt->exec);
1028     if(rt->recipient) rc->recipient = strdupz(rt->recipient);
1029     if(rt->source) rc->source = strdupz(rt->source);
1030     if(rt->units) rc->units = strdupz(rt->units);
1031     if(rt->info) rc->info = strdupz(rt->info);
1032
1033     if(rt->calculation) {
1034         rc->calculation = expression_parse(rt->calculation->source, NULL, NULL);
1035         if(!rc->calculation)
1036             error("Health alarm '%s.%s': failed to parse calculation expression '%s'", chart, rt->name, rt->calculation->source);
1037     }
1038     if(rt->warning) {
1039         rc->warning = expression_parse(rt->warning->source, NULL, NULL);
1040         if(!rc->warning)
1041             error("Health alarm '%s.%s': failed to re-parse warning expression '%s'", chart, rt->name, rt->warning->source);
1042     }
1043     if(rt->critical) {
1044         rc->critical = expression_parse(rt->critical->source, NULL, NULL);
1045         if(!rc->critical)
1046             error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", chart, rt->name, rt->critical->source);
1047     }
1048
1049     debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1050           (rc->chart)?rc->chart:"NOCHART",
1051           rc->name,
1052           (rc->exec)?rc->exec:"DEFAULT",
1053           (rc->recipient)?rc->recipient:"DEFAULT",
1054           rc->green,
1055           rc->red,
1056           rc->group,
1057           rc->after,
1058           rc->before,
1059           rc->options,
1060           (rc->dimensions)?rc->dimensions:"NONE",
1061           rc->update_every,
1062           (rc->calculation)?rc->calculation->parsed_as:"NONE",
1063           (rc->warning)?rc->warning->parsed_as:"NONE",
1064           (rc->critical)?rc->critical->parsed_as:"NONE",
1065           rc->source,
1066           rc->delay_up_duration,
1067           rc->delay_down_duration,
1068           rc->delay_max_duration,
1069           rc->delay_multiplier
1070     );
1071
1072     rrdcalc_create_part2(host, rc);
1073     return rc;
1074 }
1075
1076 void rrdcalc_free(RRDHOST *host, RRDCALC *rc) {
1077     if(!rc) return;
1078
1079     debug(D_HEALTH, "Health removing alarm '%s.%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1080
1081     // unlink it from RRDSET
1082     if(rc->rrdset) rrdsetcalc_unlink(rc);
1083
1084     // unlink it from RRDHOST
1085     if(unlikely(rc == host->alarms))
1086         host->alarms = rc->next;
1087
1088     else if(likely(host->alarms)) {
1089         RRDCALC *t, *last = host->alarms;
1090         for(t = last->next; t && t != rc; last = t, t = t->next) ;
1091         if(last->next == rc)
1092             last->next = rc->next;
1093         else
1094             error("Cannot unlink alarm '%s.%s' from host '%s': not found", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1095     }
1096     else
1097         error("Cannot unlink unlink '%s.%s' from host '%s': This host does not have any calculations", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
1098
1099     expression_free(rc->calculation);
1100     expression_free(rc->warning);
1101     expression_free(rc->critical);
1102
1103     freez(rc->name);
1104     freez(rc->chart);
1105     freez(rc->family);
1106     freez(rc->dimensions);
1107     freez(rc->exec);
1108     freez(rc->recipient);
1109     freez(rc->source);
1110     freez(rc->units);
1111     freez(rc->info);
1112     freez(rc);
1113 }
1114
1115 // ----------------------------------------------------------------------------
1116 // RRDCALCTEMPLATE management
1117
1118 void rrdcalctemplate_link_matching(RRDSET *st) {
1119     RRDCALCTEMPLATE *rt;
1120
1121     for(rt = st->rrdhost->templates; rt ; rt = rt->next) {
1122         if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context)) {
1123             RRDCALC *rc = rrdcalc_create(st->rrdhost, rt, st->id);
1124             if(unlikely(!rc))
1125                 error("Health tried to create alarm from template '%s', but it failed", rt->name);
1126
1127 #ifdef NETDATA_INTERNAL_CHECKS
1128             else if(rc->rrdset != st)
1129                 error("Health alarm '%s.%s' should be linked to chart '%s', but it is not", rc->chart?rc->chart:"NOCHART", rc->name, st->id);
1130 #endif
1131         }
1132     }
1133 }
1134
1135 static inline void rrdcalctemplate_free(RRDHOST *host, RRDCALCTEMPLATE *rt) {
1136     debug(D_HEALTH, "Health removing template '%s' of host '%s'", rt->name, host->hostname);
1137
1138     if(host->templates) {
1139         if(host->templates == rt) {
1140             host->templates = rt->next;
1141         }
1142         else {
1143             RRDCALCTEMPLATE *t, *last = host->templates;
1144             for (t = last->next; t && t != rt; last = t, t = t->next ) ;
1145             if(last && last->next == rt) {
1146                 last->next = rt->next;
1147                 rt->next = NULL;
1148             }
1149             else
1150                 error("Cannot find RRDCALCTEMPLATE '%s' linked in host '%s'", rt->name, host->hostname);
1151         }
1152     }
1153
1154     expression_free(rt->calculation);
1155     expression_free(rt->warning);
1156     expression_free(rt->critical);
1157
1158     freez(rt->name);
1159     freez(rt->exec);
1160     freez(rt->recipient);
1161     freez(rt->context);
1162     freez(rt->source);
1163     freez(rt->units);
1164     freez(rt->info);
1165     freez(rt->dimensions);
1166     freez(rt);
1167 }
1168
1169 // ----------------------------------------------------------------------------
1170 // load health configuration
1171
// Capacity of the buffer health_readfile() reads configuration lines into
// (it is declared as HEALTH_CONF_MAX_LINE + 1 bytes). Lines continued with a
// trailing backslash are appended into the same buffer, so the joined line
// has to fit in it too.
1172 #define HEALTH_CONF_MAX_LINE 4096
1173
// Keywords of the health configuration files. health_readfile() splits every line
// at the first ':' into a key and a value, and hashes these keywords once to
// recognise the key. The "alarm" key is compared with strcasecmp(), so it matches
// in any letter case (NOTE(review): the comparisons of the other keys are outside
// this view - presumably they are done the same way).
// HEALTH_EVERY_KEY is also accepted by health_parse_db_lookup() as an option
// of a "lookup" line.
1174 #define HEALTH_ALARM_KEY "alarm"
1175 #define HEALTH_TEMPLATE_KEY "template"
1176 #define HEALTH_ON_KEY "on"
1177 #define HEALTH_LOOKUP_KEY "lookup"
1178 #define HEALTH_CALC_KEY "calc"
1179 #define HEALTH_EVERY_KEY "every"
1180 #define HEALTH_GREEN_KEY "green"
1181 #define HEALTH_RED_KEY "red"
1182 #define HEALTH_WARN_KEY "warn"
1183 #define HEALTH_CRIT_KEY "crit"
1184 #define HEALTH_EXEC_KEY "exec"
1185 #define HEALTH_RECIPIENT_KEY "to"
1186 #define HEALTH_UNITS_KEY "units"
1187 #define HEALTH_INFO_KEY "info"
1188 #define HEALTH_DELAY_KEY "delay"
1189
1190 static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
1191     if(!rc->chart) {
1192         error("Health configuration for alarm '%s' does not have a chart", rc->name);
1193         return 0;
1194     }
1195
1196     if(!rc->update_every) {
1197         error("Health configuration for alarm '%s.%s' has no frequency (parameter 'every'). Ignoring it.", rc->chart?rc->chart:"NOCHART", rc->name);
1198         return 0;
1199     }
1200
1201     if(!RRDCALC_HAS_DB_LOOKUP(rc) && !rc->warning && !rc->critical) {
1202         error("Health configuration for alarm '%s.%s' is useless (no calculation, no warning and no critical evaluation)", rc->chart?rc->chart:"NOCHART", rc->name);
1203         return 0;
1204     }
1205
1206     if (rrdcalc_exists(host, rc->chart, rc->name, rc->hash_chart, rc->hash))
1207         return 0;
1208
1209     rc->id = rrdcalc_get_unique_id(&localhost, rc->chart, rc->name, &rc->next_event_id);
1210
1211     debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1212           rc->chart?rc->chart:"NOCHART",
1213           rc->name,
1214           rc->id,
1215           (rc->exec)?rc->exec:"DEFAULT",
1216           (rc->recipient)?rc->recipient:"DEFAULT",
1217           rc->green,
1218           rc->red,
1219           rc->group,
1220           rc->after,
1221           rc->before,
1222           rc->options,
1223           (rc->dimensions)?rc->dimensions:"NONE",
1224           rc->update_every,
1225           (rc->calculation)?rc->calculation->parsed_as:"NONE",
1226           (rc->warning)?rc->warning->parsed_as:"NONE",
1227           (rc->critical)?rc->critical->parsed_as:"NONE",
1228           rc->source,
1229           rc->delay_up_duration,
1230           rc->delay_down_duration,
1231           rc->delay_max_duration,
1232           rc->delay_multiplier
1233     );
1234
1235     rrdcalc_create_part2(host, rc);
1236     return 1;
1237 }
1238
1239 static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCALCTEMPLATE *rt) {
1240     if(unlikely(!rt->context)) {
1241         error("Health configuration for template '%s' does not have a context", rt->name);
1242         return 0;
1243     }
1244
1245     if(unlikely(!rt->update_every)) {
1246         error("Health configuration for template '%s' has no frequency (parameter 'every'). Ignoring it.", rt->name);
1247         return 0;
1248     }
1249
1250     if(unlikely(!RRDCALCTEMPLATE_HAS_CALCULATION(rt) && !rt->warning && !rt->critical)) {
1251         error("Health configuration for template '%s' is useless (no calculation, no warning and no critical evaluation)", rt->name);
1252         return 0;
1253     }
1254
1255     RRDCALCTEMPLATE *t, *last = NULL;
1256     for (t = host->templates; t ; last = t, t = t->next) {
1257         if(unlikely(t->hash_name == rt->hash_name && !strcmp(t->name, rt->name))) {
1258             error("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname);
1259             return 0;
1260         }
1261     }
1262
1263     debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1264           rt->name,
1265           (rt->context)?rt->context:"NONE",
1266           (rt->exec)?rt->exec:"DEFAULT",
1267           (rt->recipient)?rt->recipient:"DEFAULT",
1268           rt->green,
1269           rt->red,
1270           rt->group,
1271           rt->after,
1272           rt->before,
1273           rt->options,
1274           (rt->dimensions)?rt->dimensions:"NONE",
1275           rt->update_every,
1276           (rt->calculation)?rt->calculation->parsed_as:"NONE",
1277           (rt->warning)?rt->warning->parsed_as:"NONE",
1278           (rt->critical)?rt->critical->parsed_as:"NONE",
1279           rt->source,
1280           rt->delay_up_duration,
1281           rt->delay_down_duration,
1282           rt->delay_max_duration,
1283           rt->delay_multiplier
1284     );
1285
1286     if(likely(last)) {
1287         last->next = rt;
1288     }
1289     else {
1290         rt->next = host->templates;
1291         host->templates = rt;
1292     }
1293
1294     return 1;
1295 }
1296
// Parse a duration like "10", "-5m", "2h" or "1.5d" and store it, in seconds, in *result.
//
// The text has to start with a digit or a sign. The number may be followed by
// one of these unit letters (anything after the letter is ignored):
//   Y = 365 days, M = 30 days, w = weeks, d = days, h = hours, m = minutes, s = seconds
// Without a unit, or with any other character after the number, the number is
// taken as seconds. Fractions are accepted; the result is truncated to an integer.
//
// Returns 1 on success. Returns 0, with *result set to 0, when the text does not
// start like a number, when no number can be converted (e.g. a sign alone),
// or when the duration is not a finite value that fits in an int.
static inline int health_parse_duration(char *string, int *result) {
    *result = 0;

    // make sure it is a number
    // (the cast keeps isdigit() defined for bytes above 0x7f on platforms where char is signed)
    if(!(isdigit((unsigned char)*string) || *string == '+' || *string == '-'))
        return 0;

    char *e = NULL;
    long double n = strtold(string, &e);   // strtold() returns long double, keep its full precision

    // nothing was converted, e.g. a sign that is not followed by a number
    if(e == string)
        return 0;

    long double seconds;
    switch (*e) {
        case 'Y':
            seconds = n * 86400 * 365;
            break;
        case 'M':
            seconds = n * 86400 * 30;
            break;
        case 'w':
            seconds = n * 86400 * 7;
            break;
        case 'd':
            seconds = n * 86400;
            break;
        case 'h':
            seconds = n * 3600;
            break;
        case 'm':
            seconds = n * 60;
            break;

        // the end of the text and unknown units are taken as seconds
        default:
        case 's':
            seconds = n;
            break;
    }

    // Converting a floating point value that does not fit in an int (or is NaN or
    // infinite, which strtold() returns for "+nan" and "+inf") is undefined behavior.
    // 'limit' is INT_MAX + 1, computed from the width of unsigned int; the test is
    // written so that NaN fails it.
    const long double limit = (long double)(~0U >> 1) + 1.0L;
    if(!(seconds > -limit - 1.0L && seconds < limit))
        return 0;

    *result = (int)seconds;
    return 1;
}
1338
// Parse the value of a 'delay' configuration line: a list of "keyword value" pairs
// separated by whitespace. The keywords, matched in any letter case, are:
//   up <duration>, down <duration>, max <duration>, multiplier <number greater than zero>
//
// line, path and file are used only in the error messages.
// 'string' is modified: its separators are overwritten with '\0'.
//
// Invalid values and unknown keywords are logged with error() and skipped.
// Settings that were not given (or were invalid) get their defaults:
//   up and down become 0, the multiplier becomes 1.0,
//   and max keeps the value it had on entry, raised to at least
//   up * multiplier and down * multiplier.
//
// Always returns 1.
static inline int health_parse_delay(
        size_t line, const char *path, const char *file, char *string,
        int *delay_up_duration,
        int *delay_down_duration,
        int *delay_max_duration,
        float *delay_multiplier) {

    char given_up = 0;
    char given_down = 0;
    char given_max = 0;
    char given_multiplier = 0;

    // the <ctype.h> functions are defined only for unsigned char values and EOF,
    // so every character of the (user edited) text is cast before it is classified

    char *s = string;
    while(*s) {
        // isolate the keyword
        char *key = s;

        while(*s && !isspace((unsigned char)*s)) s++;
        while(*s && isspace((unsigned char)*s)) *s++ = '\0';

        if(!*key) break;

        // isolate its value (it is an empty string when the text ends at the keyword)
        char *value = s;
        while(*s && !isspace((unsigned char)*s)) s++;
        while(*s && isspace((unsigned char)*s)) *s++ = '\0';

        if(!strcasecmp(key, "up")) {
            if (!health_parse_duration(value, delay_up_duration)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_up = 1;
        }
        else if(!strcasecmp(key, "down")) {
            if (!health_parse_duration(value, delay_down_duration)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_down = 1;
        }
        else if(!strcasecmp(key, "multiplier")) {
            *delay_multiplier = strtof(value, NULL);
            if(isnan(*delay_multiplier) || isinf(*delay_multiplier) || islessequal(*delay_multiplier, 0)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_multiplier = 1;
        }
        else if(!strcasecmp(key, "max")) {
            if (!health_parse_duration(value, delay_max_duration)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_max = 1;
        }
        else {
            error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
                  line, path, file, key);
        }
    }

    // apply the defaults; this also overwrites whatever an invalid value left behind
    if(!given_up)
        *delay_up_duration = 0;

    if(!given_down)
        *delay_down_duration = 0;

    if(!given_multiplier)
        *delay_multiplier = 1.0;

    // without an explicit max, make sure it is not smaller than the multiplied delays
    if(!given_max) {
        if((*delay_max_duration) < (*delay_up_duration) * (*delay_multiplier))
            *delay_max_duration = (*delay_up_duration) * (*delay_multiplier);

        if((*delay_max_duration) < (*delay_down_duration) * (*delay_multiplier))
            *delay_max_duration = (*delay_down_duration) * (*delay_multiplier);
    }

    return 1;
}
1418
1419 static inline int health_parse_db_lookup(
1420         size_t line, const char *path, const char *file, char *string,
1421         int *group_method, int *after, int *before, int *every,
1422         uint32_t *options, char **dimensions
1423 ) {
1424     debug(D_HEALTH, "Health configuration parsing database lookup %zu@%s/%s: %s", line, path, file, string);
1425
1426     if(*dimensions) freez(*dimensions);
1427     *dimensions = NULL;
1428     *after = 0;
1429     *before = 0;
1430     *every = 0;
1431     *options = 0;
1432
1433     char *s = string, *key;
1434
1435     // first is the group method
1436     key = s;
1437     while(*s && !isspace(*s)) s++;
1438     while(*s && isspace(*s)) *s++ = '\0';
1439     if(!*s) {
1440         error("Health configuration invalid chart calculation at line %zu of file '%s/%s': expected group method followed by the 'after' time, but got '%s'",
1441               line, path, file, key);
1442         return 0;
1443     }
1444
1445     if((*group_method = web_client_api_request_v1_data_group(key, -1)) == -1) {
1446         error("Health configuration at line %zu of file '%s/%s': invalid group method '%s'",
1447               line, path, file, key);
1448         return 0;
1449     }
1450
1451     // then is the 'after' time
1452     key = s;
1453     while(*s && !isspace(*s)) s++;
1454     while(*s && isspace(*s)) *s++ = '\0';
1455
1456     if(!health_parse_duration(key, after)) {
1457         error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' after group method",
1458               line, path, file, key);
1459         return 0;
1460     }
1461
1462     // sane defaults
1463     *every = abs(*after);
1464
1465     // now we may have optional parameters
1466     while(*s) {
1467         key = s;
1468         while(*s && !isspace(*s)) s++;
1469         while(*s && isspace(*s)) *s++ = '\0';
1470         if(!*key) break;
1471
1472         if(!strcasecmp(key, "at")) {
1473             char *value = s;
1474             while(*s && !isspace(*s)) s++;
1475             while(*s && isspace(*s)) *s++ = '\0';
1476
1477             if (!health_parse_duration(value, before)) {
1478                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1479                       line, path, file, value, key);
1480             }
1481         }
1482         else if(!strcasecmp(key, HEALTH_EVERY_KEY)) {
1483             char *value = s;
1484             while(*s && !isspace(*s)) s++;
1485             while(*s && isspace(*s)) *s++ = '\0';
1486
1487             if (!health_parse_duration(value, every)) {
1488                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1489                       line, path, file, value, key);
1490             }
1491         }
1492         else if(!strcasecmp(key, "absolute") || !strcasecmp(key, "abs") || !strcasecmp(key, "absolute_sum")) {
1493             *options |= RRDR_OPTION_ABSOLUTE;
1494         }
1495         else if(!strcasecmp(key, "min2max")) {
1496             *options |= RRDR_OPTION_MIN2MAX;
1497         }
1498         else if(!strcasecmp(key, "null2zero")) {
1499             *options |= RRDR_OPTION_NULL2ZERO;
1500         }
1501         else if(!strcasecmp(key, "percentage")) {
1502             *options |= RRDR_OPTION_PERCENTAGE;
1503         }
1504         else if(!strcasecmp(key, "unaligned")) {
1505             *options |= RRDR_OPTION_NOT_ALIGNED;
1506         }
1507         else if(!strcasecmp(key, "of")) {
1508             if(*s && strcasecmp(s, "all"))
1509                *dimensions = strdupz(s);
1510             break;
1511         }
1512         else {
1513             error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
1514                   line, path, file, key);
1515         }
1516     }
1517
1518     return 1;
1519 }
1520
// Replace, in place, every tab of the string 's' with a space.
// Returns 's', so the call can be nested in an expression.
static inline char *tabs2spaces(char *s) {
    char *tab = strchr(s, '\t');

    while(tab) {
        *tab = ' ';
        tab = strchr(tab + 1, '\t');
    }

    return s;
}
1530
// Build the text "<line>@<path>/<filename>", which identifies the place
// in the configuration a definition was read from.
// The text is limited to FILENAME_MAX bytes and is returned in memory
// allocated by strdupz(), which the caller owns.
static inline char *health_source_file(size_t line, const char *path, const char *filename) {
    char source[FILENAME_MAX + 1];

    snprintfz(source, FILENAME_MAX, "%zu@%s/%s", line, path, filename);

    return strdupz(source);
}
1536
// Replace, in place, every single quote and every double quote
// of the string 's' with a space.
static inline void strip_quotes(char *s) {
    for( ; *s ; s++) {
        switch(*s) {
            case '\'':
            case '"':
                *s = ' ';
                break;

            default:
                break;
        }
    }
}
1543
1544 int health_readfile(const char *path, const char *filename) {
1545     debug(D_HEALTH, "Health configuration reading file '%s/%s'", path, filename);
1546
1547     static uint32_t hash_alarm = 0, hash_template = 0, hash_on = 0, hash_calc = 0, hash_green = 0, hash_red = 0, hash_warn = 0, hash_crit = 0, hash_exec = 0, hash_every = 0, hash_lookup = 0, hash_units = 0, hash_info = 0, hash_recipient = 0, hash_delay = 0;
1548     char buffer[HEALTH_CONF_MAX_LINE + 1];
1549
1550     if(unlikely(!hash_alarm)) {
1551         hash_alarm = simple_uhash(HEALTH_ALARM_KEY);
1552         hash_template = simple_uhash(HEALTH_TEMPLATE_KEY);
1553         hash_on = simple_uhash(HEALTH_ON_KEY);
1554         hash_calc = simple_uhash(HEALTH_CALC_KEY);
1555         hash_lookup = simple_uhash(HEALTH_LOOKUP_KEY);
1556         hash_green = simple_uhash(HEALTH_GREEN_KEY);
1557         hash_red = simple_uhash(HEALTH_RED_KEY);
1558         hash_warn = simple_uhash(HEALTH_WARN_KEY);
1559         hash_crit = simple_uhash(HEALTH_CRIT_KEY);
1560         hash_exec = simple_uhash(HEALTH_EXEC_KEY);
1561         hash_every = simple_uhash(HEALTH_EVERY_KEY);
1562         hash_units = simple_hash(HEALTH_UNITS_KEY);
1563         hash_info = simple_hash(HEALTH_INFO_KEY);
1564         hash_recipient = simple_hash(HEALTH_RECIPIENT_KEY);
1565         hash_delay = simple_uhash(HEALTH_DELAY_KEY);
1566     }
1567
1568     snprintfz(buffer, HEALTH_CONF_MAX_LINE, "%s/%s", path, filename);
1569     FILE *fp = fopen(buffer, "r");
1570     if(!fp) {
1571         error("Health configuration cannot read file '%s'.", buffer);
1572         return 0;
1573     }
1574
1575     RRDCALC *rc = NULL;
1576     RRDCALCTEMPLATE *rt = NULL;
1577
1578     size_t line = 0, append = 0;
1579     char *s;
1580     while((s = fgets(&buffer[append], (int)(HEALTH_CONF_MAX_LINE - append), fp)) || append) {
1581         int stop_appending = !s;
1582         line++;
1583         // info("Line %zu of file '%s/%s': '%s'", line, path, filename, s);
1584         s = trim(buffer);
1585         if(!s) continue;
1586         // info("Trimmed line %zu of file '%s/%s': '%s'", line, path, filename, s);
1587
1588         append = strlen(s);
1589         if(!stop_appending && s[append - 1] == '\\') {
1590             s[append - 1] = ' ';
1591             append = &s[append] - buffer;
1592             if(append < HEALTH_CONF_MAX_LINE)
1593                 continue;
1594             else {
1595                 error("Health configuration has too long muli-line at line %zu of file '%s/%s'.", line, path, filename);
1596             }
1597         }
1598         append = 0;
1599
1600         char *key = s;
1601         while(*s && *s != ':') s++;
1602         if(!*s) {
1603             error("Health configuration has invalid line %zu of file '%s/%s'. It does not contain a ':'. Ignoring it.", line, path, filename);
1604             continue;
1605         }
1606         *s = '\0';
1607         s++;
1608
1609         char *value = s;
1610         key = trim(key);
1611         value = trim(value);
1612
1613         if(!key) {
1614             error("Health configuration has invalid line %zu of file '%s/%s'. Keyword is empty. Ignoring it.", line, path, filename);
1615             continue;
1616         }
1617
1618         if(!value) {
1619             error("Health configuration has invalid line %zu of file '%s/%s'. value is empty. Ignoring it.", line, path, filename);
1620             continue;
1621         }
1622
1623         // info("Health file '%s/%s', key '%s', value '%s'", path, filename, key, value);
1624         uint32_t hash = simple_uhash(key);
1625
1626         if(hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) {
1627             if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
1628                 rrdcalc_free(&localhost, rc);
1629
1630             if(rt) {
1631                 if (!rrdcalctemplate_add_template_from_config(&localhost, rt))
1632                     rrdcalctemplate_free(&localhost, rt);
1633                 rt = NULL;
1634             }
1635
1636             rc = callocz(1, sizeof(RRDCALC));
1637             rc->next_event_id = 1;
1638             rc->name = tabs2spaces(strdupz(value));
1639             rc->hash = simple_hash(rc->name);
1640             rc->source = health_source_file(line, path, filename);
1641             rc->green = NAN;
1642             rc->red = NAN;
1643             rc->value = NAN;
1644             rc->old_value = NAN;
1645             rc->delay_multiplier = 1.0;
1646
1647             if(rrdvar_fix_name(rc->name))
1648                 error("Health configuration renamed alarm '%s' to '%s'", value, rc->name);
1649         }
1650         else if(hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY)) {
1651             if(rc) {
1652                 if(!rrdcalc_add_alarm_from_config(&localhost, rc))
1653                     rrdcalc_free(&localhost, rc);
1654                 rc = NULL;
1655             }
1656
1657             if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
1658                 rrdcalctemplate_free(&localhost, rt);
1659
1660             rt = callocz(1, sizeof(RRDCALCTEMPLATE));
1661             rt->name = tabs2spaces(strdupz(value));
1662             rt->hash_name = simple_hash(rt->name);
1663             rt->source = health_source_file(line, path, filename);
1664             rt->green = NAN;
1665             rt->red = NAN;
1666             rt->delay_multiplier = 1.0;
1667
1668             if(rrdvar_fix_name(rt->name))
1669                 error("Health configuration renamed template '%s' to '%s'", value, rt->name);
1670         }
1671         else if(rc) {
1672             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
1673                 if(rc->chart) {
1674                     if(strcmp(rc->chart, value))
1675                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1676                              line, path, filename, rc->name, key, rc->chart, value, value);
1677
1678                     freez(rc->chart);
1679                 }
1680                 rc->chart = tabs2spaces(strdupz(value));
1681                 rc->hash_chart = simple_hash(rc->chart);
1682             }
1683             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
1684                 health_parse_db_lookup(line, path, filename, value, &rc->group, &rc->after, &rc->before,
1685                                        &rc->update_every,
1686                                        &rc->options, &rc->dimensions);
1687             }
1688             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
1689                 if(!health_parse_duration(value, &rc->update_every))
1690                     info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.",
1691                          line, path, filename, rc->name, key, value);
1692             }
1693             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
1694                 char *e;
1695                 rc->green = strtold(value, &e);
1696                 if(e && *e) {
1697                     info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1698                          line, path, filename, rc->name, key, e);
1699                 }
1700             }
1701             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
1702                 char *e;
1703                 rc->red = strtold(value, &e);
1704                 if(e && *e) {
1705                     info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1706                          line, path, filename, rc->name, key, e);
1707                 }
1708             }
1709             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
1710                 const char *failed_at = NULL;
1711                 int error = 0;
1712                 rc->calculation = expression_parse(value, &failed_at, &error);
1713                 if(!rc->calculation) {
1714                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1715                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1716                 }
1717             }
1718             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
1719                 const char *failed_at = NULL;
1720                 int error = 0;
1721                 rc->warning = expression_parse(value, &failed_at, &error);
1722                 if(!rc->warning) {
1723                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1724                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1725                 }
1726             }
1727             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
1728                 const char *failed_at = NULL;
1729                 int error = 0;
1730                 rc->critical = expression_parse(value, &failed_at, &error);
1731                 if(!rc->critical) {
1732                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1733                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1734                 }
1735             }
1736             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
1737                 if(rc->exec) {
1738                     if(strcmp(rc->exec, value))
1739                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1740                              line, path, filename, rc->name, key, rc->exec, value, value);
1741
1742                     freez(rc->exec);
1743                 }
1744                 rc->exec = tabs2spaces(strdupz(value));
1745             }
1746             else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
1747                 if(rc->recipient) {
1748                     if(strcmp(rc->recipient, value))
1749                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1750                              line, path, filename, rc->name, key, rc->recipient, value, value);
1751
1752                     freez(rc->recipient);
1753                 }
1754                 rc->recipient = tabs2spaces(strdupz(value));
1755             }
1756             else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
1757                 if(rc->units) {
1758                     if(strcmp(rc->units, value))
1759                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1760                              line, path, filename, rc->name, key, rc->units, value, value);
1761
1762                     freez(rc->units);
1763                 }
1764                 rc->units = tabs2spaces(strdupz(value));
1765                 strip_quotes(rc->units);
1766             }
1767             else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
1768                 if(rc->info) {
1769                     if(strcmp(rc->info, value))
1770                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1771                              line, path, filename, rc->name, key, rc->info, value, value);
1772
1773                     freez(rc->info);
1774                 }
1775                 rc->info = tabs2spaces(strdupz(value));
1776                 strip_quotes(rc->info);
1777             }
1778             else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
1779                 health_parse_delay(line, path, filename, value, &rc->delay_up_duration, &rc->delay_down_duration, &rc->delay_max_duration, &rc->delay_multiplier);
1780             }
1781             else {
1782                 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has unknown key '%s'.",
1783                      line, path, filename, rc->name, key);
1784             }
1785         }
1786         else if(rt) {
1787             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
1788                 if(rt->context) {
1789                     if(strcmp(rt->context, value))
1790                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1791                              line, path, filename, rt->name, key, rt->context, value, value);
1792
1793                     freez(rt->context);
1794                 }
1795                 rt->context = tabs2spaces(strdupz(value));
1796                 rt->hash_context = simple_hash(rt->context);
1797             }
1798             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
1799                 health_parse_db_lookup(line, path, filename, value, &rt->group, &rt->after, &rt->before,
1800                                        &rt->update_every,
1801                                        &rt->options, &rt->dimensions);
1802             }
1803             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
1804                 if(!health_parse_duration(value, &rt->update_every))
1805                     info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' cannot parse duration: '%s'.",
1806                          line, path, filename, rt->name, key, value);
1807             }
1808             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
1809                 char *e;
1810                 rt->green = strtold(value, &e);
1811                 if(e && *e) {
1812                     info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
1813                          line, path, filename, rt->name, key, e);
1814                 }
1815             }
1816             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
1817                 char *e;
1818                 rt->red = strtold(value, &e);
1819                 if(e && *e) {
1820                     info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
1821                          line, path, filename, rt->name, key, e);
1822                 }
1823             }
1824             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
1825                 const char *failed_at = NULL;
1826                 int error = 0;
1827                 rt->calculation = expression_parse(value, &failed_at, &error);
1828                 if(!rt->calculation) {
1829                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1830                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1831                 }
1832             }
1833             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
1834                 const char *failed_at = NULL;
1835                 int error = 0;
1836                 rt->warning = expression_parse(value, &failed_at, &error);
1837                 if(!rt->warning) {
1838                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1839                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1840                 }
1841             }
1842             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
1843                 const char *failed_at = NULL;
1844                 int error = 0;
1845                 rt->critical = expression_parse(value, &failed_at, &error);
1846                 if(!rt->critical) {
1847                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1848                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1849                 }
1850             }
1851             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
1852                 if(rt->exec) {
1853                     if(strcmp(rt->exec, value))
1854                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1855                              line, path, filename, rt->name, key, rt->exec, value, value);
1856
1857                     freez(rt->exec);
1858                 }
1859                 rt->exec = tabs2spaces(strdupz(value));
1860             }
1861             else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
1862                 if(rt->recipient) {
1863                     if(strcmp(rt->recipient, value))
1864                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1865                              line, path, filename, rt->name, key, rt->recipient, value, value);
1866
1867                     freez(rt->recipient);
1868                 }
1869                 rt->recipient = tabs2spaces(strdupz(value));
1870             }
1871             else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
1872                 if(rt->units) {
1873                     if(strcmp(rt->units, value))
1874                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1875                              line, path, filename, rt->name, key, rt->units, value, value);
1876
1877                     freez(rt->units);
1878                 }
1879                 rt->units = tabs2spaces(strdupz(value));
1880                 strip_quotes(rt->units);
1881             }
1882             else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
1883                 if(rt->info) {
1884                     if(strcmp(rt->info, value))
1885                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1886                              line, path, filename, rt->name, key, rt->info, value, value);
1887
1888                     freez(rt->info);
1889                 }
1890                 rt->info = tabs2spaces(strdupz(value));
1891                 strip_quotes(rt->info);
1892             }
1893             else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
1894                 health_parse_delay(line, path, filename, value, &rt->delay_up_duration, &rt->delay_down_duration, &rt->delay_max_duration, &rt->delay_multiplier);
1895             }
1896             else {
1897                 error("Health configuration at line %zu of file '%s/%s' for template '%s' has unknown key '%s'.",
1898                       line, path, filename, rt->name, key);
1899             }
1900         }
1901         else {
1902             error("Health configuration at line %zu of file '%s/%s' has unknown key '%s'. Expected either '" HEALTH_ALARM_KEY "' or '" HEALTH_TEMPLATE_KEY "'.",
1903                   line, path, filename, key);
1904         }
1905     }
1906
1907     if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
1908         rrdcalc_free(&localhost, rc);
1909
1910     if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
1911         rrdcalctemplate_free(&localhost, rt);
1912
1913     fclose(fp);
1914     return 1;
1915 }
1916
1917 void health_readdir(const char *path) {
1918     size_t pathlen = strlen(path);
1919
1920     debug(D_HEALTH, "Health configuration reading directory '%s'", path);
1921
1922     DIR *dir = opendir(path);
1923     if (!dir) {
1924         error("Health configuration cannot open directory '%s'.", path);
1925         return;
1926     }
1927
1928     struct dirent *de = NULL;
1929     while ((de = readdir(dir))) {
1930         size_t len = strlen(de->d_name);
1931
1932         if(de->d_type == DT_DIR
1933            && (
1934                    (de->d_name[0] == '.' && de->d_name[1] == '\0')
1935                    || (de->d_name[0] == '.' && de->d_name[1] == '.' && de->d_name[2] == '\0')
1936            )) {
1937             debug(D_HEALTH, "Ignoring directory '%s'", de->d_name);
1938             continue;
1939         }
1940
1941         else if(de->d_type == DT_DIR) {
1942             char *s = mallocz(pathlen + strlen(de->d_name) + 2);
1943             strcpy(s, path);
1944             strcat(s, "/");
1945             strcat(s, de->d_name);
1946             health_readdir(s);
1947             freez(s);
1948             continue;
1949         }
1950
1951         else if((de->d_type == DT_LNK || de->d_type == DT_REG || de->d_type == DT_UNKNOWN) &&
1952                 len > 5 && !strcmp(&de->d_name[len - 5], ".conf")) {
1953             health_readfile(path, de->d_name);
1954         }
1955
1956         else debug(D_HEALTH, "Ignoring file '%s'", de->d_name);
1957     }
1958
1959     closedir(dir);
1960 }
1961
1962 static inline char *health_config_dir(void) {
1963     char buffer[FILENAME_MAX + 1];
1964     snprintfz(buffer, FILENAME_MAX, "%s/health.d", config_get("global", "config directory", CONFIG_DIR));
1965     return config_get("health", "health configuration directory", buffer);
1966 }
1967
1968 void health_init(void) {
1969     debug(D_HEALTH, "Health configuration initializing");
1970
1971     if(!(health_enabled = config_get_boolean("health", "enabled", 1))) {
1972         debug(D_HEALTH, "Health is disabled.");
1973         return;
1974     }
1975
1976     char *pathname = config_get("health", "health db directory", VARLIB_DIR "/health");
1977     if(mkdir(pathname, 0770) == -1 && errno != EEXIST)
1978         fatal("Cannot create directory '%s'.", pathname);
1979
1980     char filename[FILENAME_MAX + 1];
1981     snprintfz(filename, FILENAME_MAX, "%s/health-log.db", pathname);
1982     health.log_filename = config_get("health", "health db file", filename);
1983
1984     health_alarm_log_load(&localhost);
1985     health_alarm_log_open();
1986
1987     char *path = health_config_dir();
1988
1989     {
1990         char buffer[FILENAME_MAX + 1];
1991         snprintfz(buffer, FILENAME_MAX, "%s/alarm-notify.sh", config_get("global", "plugins directory", PLUGINS_DIR));
1992         health.health_default_exec = config_get("health", "script to execute on alarm", buffer);
1993     }
1994
1995     long n = config_get_number("health", "in memory max health log entries", (long)localhost.health_log.max);
1996     if(n < 10) {
1997         error("Health configuration has invalid max log entries %ld. Using default %u", n, localhost.health_log.max);
1998         config_set_number("health", "in memory max health log entries", (long)localhost.health_log.max);
1999     }
2000     else localhost.health_log.max = (unsigned int)n;
2001
2002     rrdhost_rwlock(&localhost);
2003     health_readdir(path);
2004     rrdhost_unlock(&localhost);
2005 }
2006
2007 // ----------------------------------------------------------------------------
2008 // JSON generation
2009
2010 static inline void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) {
2011     if(value && *value)
2012         buffer_sprintf(wb, "%s\"%s\":\"%s\"%s", prefix, label, value, suffix);
2013     else
2014         buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix);
2015 }
2016
2017 static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) {
2018     buffer_sprintf(wb, "\n\t{\n"
2019                            "\t\t\"hostname\": \"%s\",\n"
2020                            "\t\t\"unique_id\": %u,\n"
2021                            "\t\t\"alarm_id\": %u,\n"
2022                            "\t\t\"alarm_event_id\": %u,\n"
2023                            "\t\t\"name\": \"%s\",\n"
2024                            "\t\t\"chart\": \"%s\",\n"
2025                            "\t\t\"family\": \"%s\",\n"
2026                            "\t\t\"processed\": %s,\n"
2027                            "\t\t\"updated\": %s,\n"
2028                            "\t\t\"exec_run\": %lu,\n"
2029                            "\t\t\"exec_failed\": %s,\n"
2030                            "\t\t\"exec\": \"%s\",\n"
2031                            "\t\t\"recipient\": \"%s\",\n"
2032                            "\t\t\"exec_code\": %d,\n"
2033                            "\t\t\"source\": \"%s\",\n"
2034                            "\t\t\"units\": \"%s\",\n"
2035                            "\t\t\"info\": \"%s\",\n"
2036                            "\t\t\"when\": %lu,\n"
2037                            "\t\t\"duration\": %lu,\n"
2038                            "\t\t\"non_clear_duration\": %lu,\n"
2039                            "\t\t\"status\": \"%s\",\n"
2040                            "\t\t\"old_status\": \"%s\",\n"
2041                            "\t\t\"delay\": %d,\n"
2042                            "\t\t\"delay_up_to_timestamp\": %lu,\n"
2043                            "\t\t\"updated_by_id\": %u,\n"
2044                            "\t\t\"updates_id\": %u,\n",
2045                    host->hostname,
2046                    ae->unique_id,
2047                    ae->alarm_id,
2048                    ae->alarm_event_id,
2049                    ae->name,
2050                    ae->chart,
2051                    ae->family,
2052                    (ae->flags & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false",
2053                    (ae->flags & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false",
2054                    (unsigned long)ae->exec_run_timestamp,
2055                    (ae->flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false",
2056                    ae->exec?ae->exec:health.health_default_exec,
2057                    ae->recipient?ae->recipient:health.health_default_recipient,
2058                    ae->exec_code,
2059                    ae->source,
2060                    ae->units?ae->units:"",
2061                    ae->info?ae->info:"",
2062                    (unsigned long)ae->when,
2063                    (unsigned long)ae->duration,
2064                    (unsigned long)ae->non_clear_duration,
2065                    rrdcalc_status2string(ae->new_status),
2066                    rrdcalc_status2string(ae->old_status),
2067                    ae->delay,
2068                    (unsigned long)ae->delay_up_to_timestamp,
2069                    ae->updated_by_id,
2070                    ae->updates_id
2071     );
2072
2073     buffer_strcat(wb, "\t\t\"value\":");
2074     buffer_rrd_value(wb, ae->new_value);
2075     buffer_strcat(wb, ",\n");
2076
2077     buffer_strcat(wb, "\t\t\"old_value\":");
2078     buffer_rrd_value(wb, ae->old_value);
2079     buffer_strcat(wb, "\n");
2080
2081     buffer_strcat(wb, "\t}");
2082 }
2083
2084 void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after) {
2085     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
2086
2087     buffer_strcat(wb, "[");
2088
2089     unsigned int max = host->health_log.max;
2090     unsigned int count = 0;
2091     ALARM_ENTRY *ae;
2092     for(ae = host->health_log.alarms; ae && count < max ; count++, ae = ae->next) {
2093         if(ae->unique_id > after) {
2094             if(likely(count)) buffer_strcat(wb, ",");
2095             health_alarm_entry2json_nolock(wb, ae, host);
2096         }
2097     }
2098
2099     buffer_strcat(wb, "\n]\n");
2100
2101     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2102 }
2103
2104 static inline void health_rrdcalc2json_nolock(BUFFER *wb, RRDCALC *rc) {
2105     buffer_sprintf(wb,
2106            "\t\t\"%s.%s\": {\n"
2107                    "\t\t\t\"id\": %lu,\n"
2108                    "\t\t\t\"name\": \"%s\",\n"
2109                    "\t\t\t\"chart\": \"%s\",\n"
2110                    "\t\t\t\"family\": \"%s\",\n"
2111                    "\t\t\t\"active\": %s,\n"
2112                    "\t\t\t\"exec\": \"%s\",\n"
2113                    "\t\t\t\"recipient\": \"%s\",\n"
2114                    "\t\t\t\"source\": \"%s\",\n"
2115                    "\t\t\t\"units\": \"%s\",\n"
2116                    "\t\t\t\"info\": \"%s\",\n"
2117                                    "\t\t\t\"status\": \"%s\",\n"
2118                    "\t\t\t\"last_status_change\": %lu,\n"
2119                    "\t\t\t\"last_updated\": %lu,\n"
2120                    "\t\t\t\"next_update\": %lu,\n"
2121                    "\t\t\t\"update_every\": %d,\n"
2122                    "\t\t\t\"delay_up_duration\": %d,\n"
2123                    "\t\t\t\"delay_down_duration\": %d,\n"
2124                    "\t\t\t\"delay_max_duration\": %d,\n"
2125                    "\t\t\t\"delay_multiplier\": %f,\n"
2126                    "\t\t\t\"delay\": %d,\n"
2127                    "\t\t\t\"delay_up_to_timestamp\": %lu,\n"
2128             , rc->chart, rc->name
2129             , (unsigned long)rc->id
2130             , rc->name
2131             , rc->chart
2132             , (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:""
2133             , (rc->rrdset)?"true":"false"
2134             , rc->exec?rc->exec:health.health_default_exec
2135             , rc->recipient?rc->recipient:health.health_default_recipient
2136             , rc->source
2137             , rc->units?rc->units:""
2138             , rc->info?rc->info:""
2139             , rrdcalc_status2string(rc->status)
2140             , (unsigned long)rc->last_status_change
2141             , (unsigned long)rc->last_updated
2142             , (unsigned long)rc->next_update
2143             , rc->update_every
2144             , rc->delay_up_duration
2145             , rc->delay_down_duration
2146             , rc->delay_max_duration
2147             , rc->delay_multiplier
2148             , rc->delay_last
2149             , (unsigned long)rc->delay_up_to_timestamp
2150     );
2151
2152     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
2153         if(rc->dimensions && *rc->dimensions)
2154             health_string2json(wb, "\t\t\t", "lookup_dimensions", rc->dimensions, ",\n");
2155
2156         buffer_sprintf(wb,
2157                        "\t\t\t\"db_after\": %lu,\n"
2158                        "\t\t\t\"db_before\": %lu,\n"
2159                        "\t\t\t\"lookup_method\": \"%s\",\n"
2160                        "\t\t\t\"lookup_after\": %d,\n"
2161                        "\t\t\t\"lookup_before\": %d,\n"
2162                        "\t\t\t\"lookup_options\": \"",
2163                        (unsigned long) rc->db_after,
2164                        (unsigned long) rc->db_before,
2165                        group_method2string(rc->group),
2166                        rc->after,
2167                        rc->before
2168         );
2169         buffer_data_options2string(wb, rc->options);
2170         buffer_strcat(wb, "\",\n");
2171     }
2172
2173     if(rc->calculation) {
2174         health_string2json(wb, "\t\t\t", "calc", rc->calculation->source, ",\n");
2175         health_string2json(wb, "\t\t\t", "calc_parsed", rc->calculation->parsed_as, ",\n");
2176     }
2177
2178     if(rc->warning) {
2179         health_string2json(wb, "\t\t\t", "warn", rc->warning->source, ",\n");
2180         health_string2json(wb, "\t\t\t", "warn_parsed", rc->warning->parsed_as, ",\n");
2181     }
2182
2183     if(rc->critical) {
2184         health_string2json(wb, "\t\t\t", "crit", rc->critical->source, ",\n");
2185         health_string2json(wb, "\t\t\t", "crit_parsed", rc->critical->parsed_as, ",\n");
2186     }
2187
2188     buffer_strcat(wb, "\t\t\t\"green\":");
2189     buffer_rrd_value(wb, rc->green);
2190     buffer_strcat(wb, ",\n");
2191
2192     buffer_strcat(wb, "\t\t\t\"red\":");
2193     buffer_rrd_value(wb, rc->red);
2194     buffer_strcat(wb, ",\n");
2195
2196     buffer_strcat(wb, "\t\t\t\"value\":");
2197     buffer_rrd_value(wb, rc->value);
2198     buffer_strcat(wb, "\n");
2199
2200     buffer_strcat(wb, "\t\t}");
2201 }
2202
2203 //void health_rrdcalctemplate2json_nolock(BUFFER *wb, RRDCALCTEMPLATE *rt) {
2204 //
2205 //}
2206
2207 void health_alarms2json(RRDHOST *host, BUFFER *wb, int all) {
2208     int i;
2209
2210     rrdhost_rdlock(&localhost);
2211     buffer_sprintf(wb, "{\n\t\"hostname\": \"%s\","
2212                         "\n\t\"latest_alarm_log_unique_id\": %u,"
2213                         "\n\t\"status\": %s,"
2214                         "\n\t\"now\": %lu,"
2215                         "\n\t\"alarms\": {\n",
2216                         host->hostname,
2217                         (host->health_log.next_log_id > 0)?(host->health_log.next_log_id - 1):0,
2218                         health_enabled?"true":"false",
2219                         (unsigned long)time(NULL));
2220
2221     RRDCALC *rc;
2222     for(i = 0, rc = host->alarms; rc ; rc = rc->next) {
2223         if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
2224             continue;
2225
2226         if(likely(!all && !(rc->status == RRDCALC_STATUS_WARNING || rc->status == RRDCALC_STATUS_CRITICAL)))
2227             continue;
2228
2229         if(likely(i)) buffer_strcat(wb, ",\n");
2230         health_rrdcalc2json_nolock(wb, rc);
2231         i++;
2232     }
2233
2234 //    buffer_strcat(wb, "\n\t},\n\t\"templates\": {");
2235 //    RRDCALCTEMPLATE *rt;
2236 //    for(rt = host->templates; rt ; rt = rt->next)
2237 //        health_rrdcalctemplate2json_nolock(wb, rt);
2238
2239     buffer_strcat(wb, "\n\t}\n}\n");
2240     rrdhost_unlock(&localhost);
2241 }
2242
2243
2244 // ----------------------------------------------------------------------------
2245 // re-load health configuration
2246
2247 static inline void health_free_all_nolock(RRDHOST *host) {
2248     while(host->templates)
2249         rrdcalctemplate_free(host, host->templates);
2250
2251     while(host->alarms)
2252         rrdcalc_free(host, host->alarms);
2253 }
2254
2255 void health_reload(void) {
2256     if(!health_enabled) {
2257         error("Health reload is requested, but health is not enabled.");
2258         return;
2259     }
2260
2261     char *path = health_config_dir();
2262
2263     // free all running alarms
2264     rrdhost_rwlock(&localhost);
2265     health_free_all_nolock(&localhost);
2266     rrdhost_unlock(&localhost);
2267
2268     // invalidate all previous entries in the alarm log
2269     ALARM_ENTRY *t;
2270     for(t = localhost.health_log.alarms ; t ; t = t->next) {
2271         if(t->new_status != RRDCALC_STATUS_REMOVED)
2272             t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
2273     }
2274
2275     // reset all thresholds to all charts
2276     RRDSET *st;
2277     for(st = localhost.rrdset_root; st ; st = st->next) {
2278         st->green = NAN;
2279         st->red = NAN;
2280     }
2281
2282     // load the new alarms
2283     rrdhost_rwlock(&localhost);
2284     health_readdir(path);
2285     rrdhost_unlock(&localhost);
2286
2287     // link the loaded alarms to their charts
2288     for(st = localhost.rrdset_root; st ; st = st->next) {
2289         rrdhost_rwlock(&localhost);
2290
2291         rrdsetcalc_link_matching(st);
2292         rrdcalctemplate_link_matching(st);
2293
2294         rrdhost_unlock(&localhost);
2295     }
2296 }
2297
2298 // ----------------------------------------------------------------------------
2299 // health main thread and friends
2300
2301 static inline int rrdcalc_value2status(calculated_number n) {
2302     if(isnan(n)) return RRDCALC_STATUS_UNDEFINED;
2303     if(n) return RRDCALC_STATUS_RAISED;
2304     return RRDCALC_STATUS_CLEAR;
2305 }
2306
2307 static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
2308     ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
2309
2310     // find the previous notification for the same alarm
2311     ALARM_ENTRY *t;
2312     for(t = ae->next; t ;t = t->next) {
2313         if(t->alarm_id == ae->alarm_id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
2314             break;
2315     }
2316
2317     if(t && t->new_status == ae->new_status) {
2318         // don't send the same notification again
2319         info("Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
2320         goto done;
2321     }
2322
2323     if((ae->old_status == RRDCALC_STATUS_UNDEFINED && ae->new_status == RRDCALC_STATUS_UNINITIALIZED)
2324         || (ae->old_status == RRDCALC_STATUS_UNINITIALIZED && ae->new_status == RRDCALC_STATUS_CLEAR)) {
2325         info("Health not sending notification for first initialization of alarm '%s.%s' status %s", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
2326         goto done;
2327     }
2328
2329     char buffer[FILENAME_MAX + 1];
2330     pid_t command_pid;
2331
2332     const char *exec = ae->exec;
2333     if(!exec) exec = health.health_default_exec;
2334
2335     const char *recipient = ae->recipient;
2336     if(!recipient) recipient = health.health_default_recipient;
2337
2338     snprintfz(buffer, FILENAME_MAX, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s'",
2339               exec,
2340               recipient,
2341               host->hostname,
2342               ae->unique_id,
2343               ae->alarm_id,
2344               ae->alarm_event_id,
2345               (unsigned long)ae->when,
2346               ae->name,
2347               ae->chart?ae->chart:"NOCAHRT",
2348               ae->family?ae->family:"NOFAMILY",
2349               rrdcalc_status2string(ae->new_status),
2350               rrdcalc_status2string(ae->old_status),
2351               ae->new_value,
2352               ae->old_value,
2353               ae->source?ae->source:"UNKNOWN",
2354               (uint32_t)ae->duration,
2355               (uint32_t)ae->non_clear_duration,
2356               ae->units?ae->units:"",
2357               ae->info?ae->info:""
2358     );
2359
2360     ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
2361     ae->exec_run_timestamp = time(NULL);
2362
2363     debug(D_HEALTH, "executing command '%s'", buffer);
2364     FILE *fp = mypopen(buffer, &command_pid);
2365     if(!fp) {
2366         error("HEALTH: Cannot popen(\"%s\", \"r\").", buffer);
2367         goto done;
2368     }
2369     debug(D_HEALTH, "HEALTH reading from command");
2370     char *s = fgets(buffer, FILENAME_MAX, fp);
2371     (void)s;
2372     ae->exec_code = mypclose(fp, command_pid);
2373     debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
2374
2375     if(ae->exec_code != 0)
2376         ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
2377
2378 done:
2379     health_alarm_log_save(host, ae);
2380     return;
2381 }
2382
2383 static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
2384     info("Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
2385          ae->chart?ae->chart:"NOCHART", ae->name,
2386          ae->new_value,
2387          rrdcalc_status2string(ae->old_status),
2388          rrdcalc_status2string(ae->new_status)
2389     );
2390
2391     health_alarm_execute(host, ae);
2392 }
2393
2394 static inline void health_alarm_log_process(RRDHOST *host) {
2395     static uint32_t stop_at_id = 0;
2396     uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
2397     time_t now = time(NULL);
2398
2399     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
2400
2401     ALARM_ENTRY *ae;
2402     for(ae = host->health_log.alarms; ae && ae->unique_id >= stop_at_id ; ae = ae->next) {
2403         if(unlikely(
2404             !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
2405             !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
2406             )) {
2407
2408             if(unlikely(ae->unique_id < first_waiting))
2409                 first_waiting = ae->unique_id;
2410
2411             if(likely(now >= ae->delay_up_to_timestamp))
2412                 health_process_notifications(host, ae);
2413         }
2414     }
2415
2416     // remember this for the next iteration
2417     stop_at_id = first_waiting;
2418
2419     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2420
2421     if(host->health_log.count <= host->health_log.max)
2422         return;
2423
2424     // cleanup excess entries in the log
2425     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
2426
2427     ALARM_ENTRY *last = NULL;
2428     unsigned int count = host->health_log.max * 2 / 3;
2429     for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
2430
2431     if(ae && last && last->next == ae)
2432         last->next = NULL;
2433     else
2434         ae = NULL;
2435
2436     while(ae) {
2437         debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
2438
2439         ALARM_ENTRY *t = ae->next;
2440
2441         freez(ae->name);
2442         freez(ae->chart);
2443         freez(ae->family);
2444         freez(ae->exec);
2445         freez(ae->recipient);
2446         freez(ae->source);
2447         freez(ae->units);
2448         freez(ae->info);
2449         freez(ae);
2450
2451         ae = t;
2452         host->health_log.count--;
2453     }
2454
2455     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2456 }
2457
2458 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
2459     if (unlikely(!rc->rrdset)) {
2460         debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
2461         return 0;
2462     }
2463
2464     if (unlikely(!rc->rrdset->last_collected_time.tv_sec)) {
2465         debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not yet collected.", rc->chart?rc->chart:"NOCHART", rc->name);
2466         return 0;
2467     }
2468
2469     if (unlikely(!rc->update_every)) {
2470         debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
2471         return 0;
2472     }
2473
2474     if (unlikely(rc->next_update > now)) {
2475         if (unlikely(*next_run > rc->next_update))
2476             *next_run = rc->next_update;
2477
2478         debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
2479         return 0;
2480     }
2481
2482     // FIXME
2483     // we should check that the DB lookup is possible
2484     // i.e.
2485     // - the duration of the chart includes the required timeframe
2486     // we SHOULD NOT check the dimensions - there might be alarms that refer non-existing dimensions (e.g. cpu steal)
2487
2488     return 1;
2489 }
2490
2491 void *health_main(void *ptr) {
2492     (void)ptr;
2493
2494     info("HEALTH thread created with task id %d", gettid());
2495
2496     if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
2497         error("Cannot set pthread cancel type to DEFERRED.");
2498
2499     if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
2500         error("Cannot set pthread cancel state to ENABLE.");
2501
2502     int min_run_every = (int)config_get_number("health", "run at least every seconds", 10);
2503     if(min_run_every < 1) min_run_every = 1;
2504
2505     BUFFER *wb = buffer_create(100);
2506
2507     unsigned int loop = 0;
2508     while(health_enabled && !netdata_exit) {
2509         loop++;
2510         debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
2511
2512         int oldstate, runnable = 0;
2513         time_t now = time(NULL);
2514         time_t next_run = now + min_run_every;
2515         RRDCALC *rc;
2516
2517         if (unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
2518             error("Cannot set pthread cancel state to DISABLE.");
2519
2520         rrdhost_rdlock(&localhost);
2521
2522         // the first loop is to lookup values from the db
2523         for (rc = localhost.alarms; rc; rc = rc->next) {
2524             if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run)))
2525                 continue;
2526
2527             runnable++;
2528             rc->old_value = rc->value;
2529
2530             // 1. if there is database lookup, do it
2531             // 2. if there is calculation expression, run it
2532
2533             if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
2534                 time_t old_db_timestamp = rc->db_before;
2535                 int value_is_null = 0;
2536
2537                 int ret = rrd2value(rc->rrdset, wb, &rc->value,
2538                                     rc->dimensions, 1, rc->after, rc->before, rc->group,
2539                                     rc->options, &rc->db_after, &rc->db_before, &value_is_null);
2540
2541                 if (unlikely(ret != 200)) {
2542                     // database lookup failed
2543                     rc->value = NAN;
2544
2545                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
2546
2547                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) {
2548                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
2549                         error("Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
2550                     }
2551                 }
2552                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))
2553                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
2554
2555                 if (unlikely(old_db_timestamp == rc->db_before)) {
2556                     // database is stale
2557
2558                     debug(D_HEALTH, "Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
2559
2560                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
2561                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
2562                         error("Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
2563                     }
2564                 }
2565                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
2566                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
2567
2568                 if (unlikely(value_is_null)) {
2569                     // collected value is null
2570
2571                     rc->value = NAN;
2572
2573                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
2574                           rc->chart?rc->chart:"NOCHART", rc->name);
2575
2576                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) {
2577                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
2578                         error("Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
2579                               rc->chart?rc->chart:"NOCHART", rc->name);
2580                     }
2581                 }
2582                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))
2583                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
2584
2585                 debug(D_HEALTH, "Health alarm '%s.%s': database lookup gave value "
2586                         CALCULATED_NUMBER_FORMAT, rc->chart?rc->chart:"NOCHART", rc->name, rc->value);
2587             }
2588
2589             if(unlikely(rc->calculation)) {
2590                 if (unlikely(!expression_evaluate(rc->calculation))) {
2591                     // calculation failed
2592
2593                     rc->value = NAN;
2594
2595                     debug(D_HEALTH, "Health alarm '%s.%s': failed to evaluate calculation with error: %s",
2596                           rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->calculation->error_msg));
2597
2598                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) {
2599                         rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
2600                         error("Health alarm '%s.%s': failed to evaluate calculation with error: %s",
2601                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->calculation->error_msg));
2602                     }
2603                 }
2604                 else {
2605                     if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))
2606                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
2607
2608                     debug(D_HEALTH, "Health alarm '%s.%s': calculation expression gave value "
2609                             CALCULATED_NUMBER_FORMAT
2610                             ": %s (source: %s)",
2611                           rc->chart?rc->chart:"NOCHART", rc->name,
2612                           rc->calculation->result,
2613                           buffer_tostring(rc->calculation->error_msg),
2614                           rc->source
2615                     );
2616
2617                     rc->value = rc->calculation->result;
2618                 }
2619             }
2620         }
2621         rrdhost_unlock(&localhost);
2622
2623         if (unlikely(runnable && !netdata_exit)) {
2624             rrdhost_rdlock(&localhost);
2625
2626             for (rc = localhost.alarms; rc; rc = rc->next) {
2627                 if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run)))
2628                     continue;
2629
2630                 int warning_status  = RRDCALC_STATUS_UNDEFINED;
2631                 int critical_status = RRDCALC_STATUS_UNDEFINED;
2632
2633                 if(likely(rc->warning)) {
2634                     if(unlikely(!expression_evaluate(rc->warning))) {
2635                         // calculation failed
2636
2637                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression failed with error: %s",
2638                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
2639
2640                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) {
2641                             rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
2642                             error("Health alarm '%s.%s': warning expression failed with error: %s",
2643                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
2644                         }
2645                     }
2646                     else {
2647                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))
2648                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
2649
2650                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression gave value "
2651                                 CALCULATED_NUMBER_FORMAT
2652                                 ": %s (source: %s)",
2653                               rc->chart?rc->chart:"NOCHART", rc->name,
2654                               rc->warning->result,
2655                               buffer_tostring(rc->warning->error_msg),
2656                               rc->source
2657                         );
2658
2659                         warning_status = rrdcalc_value2status(rc->warning->result);
2660                     }
2661                 }
2662
2663                 if(likely(rc->critical)) {
2664                     if(unlikely(!expression_evaluate(rc->critical))) {
2665                         // calculation failed
2666
2667                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression failed with error: %s",
2668                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
2669
2670                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) {
2671                             rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
2672                             error("Health alarm '%s.%s': critical expression failed with error: %s",
2673                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
2674                         }
2675                     }
2676                     else {
2677                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))
2678                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
2679
2680                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression gave value "
2681                                 CALCULATED_NUMBER_FORMAT
2682                                 ": %s (source: %s)",
2683                               rc->chart?rc->chart:"NOCHART", rc->name,
2684                               rc->critical->result,
2685                               buffer_tostring(rc->critical->error_msg),
2686                               rc->source
2687                         );
2688
2689                         critical_status = rrdcalc_value2status(rc->critical->result);
2690                     }
2691                 }
2692
2693                 int status = RRDCALC_STATUS_UNDEFINED;
2694
2695                 switch(warning_status) {
2696                     case RRDCALC_STATUS_CLEAR:
2697                         status = RRDCALC_STATUS_CLEAR;
2698                         break;
2699
2700                     case RRDCALC_STATUS_RAISED:
2701                         status = RRDCALC_STATUS_WARNING;
2702                         break;
2703
2704                     default:
2705                         break;
2706                 }
2707
2708                 switch(critical_status) {
2709                     case RRDCALC_STATUS_CLEAR:
2710                         if(status == RRDCALC_STATUS_UNDEFINED)
2711                             status = RRDCALC_STATUS_CLEAR;
2712                         break;
2713
2714                     case RRDCALC_STATUS_RAISED:
2715                         status = RRDCALC_STATUS_CRITICAL;
2716                         break;
2717
2718                     default:
2719                         break;
2720                 }
2721
2722                 if(status != rc->status) {
2723                     int delay = 0;
2724
2725                     if(now > rc->delay_up_to_timestamp) {
2726                         rc->delay_up_current = rc->delay_up_duration;
2727                         rc->delay_down_current = rc->delay_down_duration;
2728                         rc->delay_last = 0;
2729                         rc->delay_up_to_timestamp = 0;
2730                     }
2731                     else {
2732                         rc->delay_up_current = (int)(rc->delay_up_current * rc->delay_multiplier);
2733                         if(rc->delay_up_current > rc->delay_max_duration) rc->delay_up_current = rc->delay_max_duration;
2734
2735                         rc->delay_down_current = (int)(rc->delay_down_current * rc->delay_multiplier);
2736                         if(rc->delay_down_current > rc->delay_max_duration) rc->delay_down_current = rc->delay_max_duration;
2737                     }
2738
2739                     if(status > rc->status)
2740                         delay = rc->delay_up_current;
2741                     else
2742                         delay = rc->delay_down_current;
2743
2744                     // COMMENTED: because we do need to send raising alarms
2745                     // if(now + delay < rc->delay_up_to_timestamp)
2746                     //    delay = (int)(rc->delay_up_to_timestamp - now);
2747
2748                     rc->delay_last = delay;
2749                     rc->delay_up_to_timestamp = now + delay;
2750                     health_alarm_log(&localhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, rc->delay_last);
2751                     rc->last_status_change = now;
2752                     rc->status = status;
2753                 }
2754
2755                 rc->last_updated = now;
2756                 rc->next_update = now + rc->update_every;
2757
2758                 if (next_run > rc->next_update)
2759                     next_run = rc->next_update;
2760             }
2761
2762             rrdhost_unlock(&localhost);
2763         }
2764
2765         if (unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
2766             error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
2767
2768         if(unlikely(netdata_exit))
2769             break;
2770
2771         // execute notifications
2772         // and cleanup
2773         health_alarm_log_process(&localhost);
2774
2775         if(unlikely(netdata_exit))
2776             break;
2777         
2778         now = time(NULL);
2779         if(now < next_run) {
2780             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs",
2781                   loop, (int) (next_run - now));
2782             sleep_usec(1000000 * (unsigned long long) (next_run - now));
2783         }
2784         else {
2785             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
2786         }
2787     }
2788
2789     buffer_free(wb);
2790
2791     info("HEALTH thread exiting");
2792     pthread_exit(NULL);
2793     return NULL;
2794 }