]> arthur.barton.de Git - netdata.git/blob - src/health.c
prevent running alarms on charts that have not been collected yet
[netdata.git] / src / health.c
1 #include "common.h"
2
3 #define RRDVAR_MAX_LENGTH 1024
4
5 static const char *health_default_exec = PLUGINS_DIR "/alarm-email.sh";
6 int health_enabled = 1;
7
8 // ----------------------------------------------------------------------------
9 // RRDVAR management
10
// Sanitize a variable name in place: every character that is not
// alphanumeric (per isalnum()), '.' or '_' is overwritten with '_'.
// Returns the number of characters that were replaced.
static inline int rrdvar_fix_name(char *variable) {
    int fixed = 0;

    for( ; *variable ; variable++) {
        // <ctype.h> classifiers require a value representable as unsigned char (or EOF);
        // handing them a negative plain char is undefined behavior
        unsigned char c = (unsigned char)*variable;

        if(!isalnum(c) && c != '.' && c != '_') {
            *variable = '_';
            fixed++;
        }
    }

    return fixed;
}
24
25 int rrdvar_compare(void* a, void* b) {
26     if(((RRDVAR *)a)->hash < ((RRDVAR *)b)->hash) return -1;
27     else if(((RRDVAR *)a)->hash > ((RRDVAR *)b)->hash) return 1;
28     else return strcmp(((RRDVAR *)a)->name, ((RRDVAR *)b)->name);
29 }
30
31 static inline RRDVAR *rrdvar_index_add(avl_tree_lock *tree, RRDVAR *rv) {
32     RRDVAR *ret = (RRDVAR *)avl_insert_lock(tree, (avl *)(rv));
33     if(ret != rv)
34         debug(D_VARIABLES, "Request to insert RRDVAR '%s' into index failed. Already exists.", rv->name);
35
36     return ret;
37 }
38
39 static inline RRDVAR *rrdvar_index_del(avl_tree_lock *tree, RRDVAR *rv) {
40     RRDVAR *ret = (RRDVAR *)avl_remove_lock(tree, (avl *)(rv));
41     if(!ret)
42         error("Request to remove RRDVAR '%s' from index failed. Not Found.", rv->name);
43
44     return ret;
45 }
46
47 static inline RRDVAR *rrdvar_index_find(avl_tree_lock *tree, const char *name, uint32_t hash) {
48     RRDVAR tmp;
49     tmp.name = (char *)name;
50     tmp.hash = (hash)?hash:simple_hash(tmp.name);
51
52     return (RRDVAR *)avl_search_lock(tree, (avl *)&tmp);
53 }
54
55 static inline void rrdvar_free(RRDHOST *host, avl_tree_lock *tree, RRDVAR *rv) {
56     (void)host;
57
58     if(!rv) return;
59
60     if(tree)
61         rrdvar_index_del(tree, rv);
62
63     freez(rv->name);
64     freez(rv);
65 }
66
67 static inline RRDVAR *rrdvar_create_and_index(const char *scope, avl_tree_lock *tree, const char *name, int type, calculated_number *value) {
68     char *variable = strdupz(name);
69     rrdvar_fix_name(variable);
70     uint32_t hash = simple_hash(variable);
71
72     RRDVAR *rv = rrdvar_index_find(tree, variable, hash);
73     if(unlikely(!rv)) {
74         debug(D_VARIABLES, "Variable '%s' not found in scope '%s'. Creating a new one.", variable, scope);
75
76         rv = callocz(1, sizeof(RRDVAR));
77         rv->name = variable;
78         rv->hash = hash;
79         rv->type = type;
80         rv->value = value;
81
82         RRDVAR *ret = rrdvar_index_add(tree, rv);
83         if(unlikely(ret != rv)) {
84             debug(D_VARIABLES, "Variable '%s' in scope '%s' already exists", variable, scope);
85             rrdvar_free(NULL, NULL, rv);
86             rv = NULL;
87         }
88         else
89             debug(D_VARIABLES, "Variable '%s' created in scope '%s'", variable, scope);
90     }
91     else {
92         // already exists
93         freez(variable);
94         rv = NULL;
95     }
96
97     return rv;
98 }
99
100 // ----------------------------------------------------------------------------
101 // RRDVAR lookup
102
103 calculated_number rrdvar2number(RRDVAR *rv) {
104     switch(rv->type) {
105         case RRDVAR_TYPE_CALCULATED: {
106             calculated_number *n = (calculated_number *)rv->value;
107             return *n;
108         }
109
110         case RRDVAR_TYPE_TIME_T: {
111             time_t *n = (time_t *)rv->value;
112             return *n;
113         }
114
115         case RRDVAR_TYPE_COLLECTED: {
116             collected_number *n = (collected_number *)rv->value;
117             return *n;
118         }
119
120         case RRDVAR_TYPE_TOTAL: {
121             total_number *n = (total_number *)rv->value;
122             return *n;
123         }
124
125         case RRDVAR_TYPE_INT: {
126             int *n = (int *)rv->value;
127             return *n;
128         }
129
130         default:
131             error("I don't know how to convert RRDVAR type %d to calculated_number", rv->type);
132             return NAN;
133     }
134 }
135
136 void dump_variable(void *data) {
137     RRDVAR *rv = (RRDVAR *)data;
138     debug(D_HEALTH, "%50s : %20.5Lf", rv->name, rrdvar2number(rv));
139 }
140
141 int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, calculated_number *result) {
142     RRDSET *st = rc->rrdset;
143     RRDVAR *rv;
144
145     if(!st) return 0;
146
147     rv = rrdvar_index_find(&st->variables_root_index, variable, hash);
148     if(rv) {
149         *result = rrdvar2number(rv);
150         return 1;
151     }
152
153     rv = rrdvar_index_find(&st->rrdfamily->variables_root_index, variable, hash);
154     if(rv) {
155         *result = rrdvar2number(rv);
156         return 1;
157     }
158
159     rv = rrdvar_index_find(&st->rrdhost->variables_root_index, variable, hash);
160     if(rv) {
161         *result = rrdvar2number(rv);
162         return 1;
163     }
164
165     debug(D_HEALTH, "Available local chart '%s' variables:", st->id);
166     avl_traverse_lock(&st->variables_root_index, dump_variable);
167
168     debug(D_HEALTH, "Available family '%s' variables:", st->rrdfamily->family);
169     avl_traverse_lock(&st->rrdfamily->variables_root_index, dump_variable);
170
171     debug(D_HEALTH, "Available host '%s' variables:", st->rrdhost->hostname);
172     avl_traverse_lock(&st->rrdhost->variables_root_index, dump_variable);
173
174     return 0;
175 }
176
177 // ----------------------------------------------------------------------------
178 // RRDSETVAR management
179
180 RRDSETVAR *rrdsetvar_create(RRDSET *st, const char *variable, int type, void *value, uint32_t options) {
181     debug(D_VARIABLES, "RRDVARSET create for chart id '%s' name '%s' with variable name '%s'", st->id, st->name, variable);
182     RRDSETVAR *rs = (RRDSETVAR *)callocz(1, sizeof(RRDSETVAR));
183
184     char buffer[RRDVAR_MAX_LENGTH + 1];
185     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->id, variable);
186     rs->fullid = strdupz(buffer);
187
188     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, variable);
189     rs->fullname = strdupz(buffer);
190
191     rs->variable = strdupz(variable);
192
193     rs->type = type;
194     rs->value = value;
195     rs->options = options;
196     rs->rrdset = st;
197
198     rs->local       = rrdvar_create_and_index("local",  &st->variables_root_index, rs->variable, rs->type, rs->value);
199     rs->family      = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullid, rs->type, rs->value);
200     rs->host        = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullid, rs->type, rs->value);
201     rs->family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullname, rs->type, rs->value);
202     rs->host_name   = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullname, rs->type, rs->value);
203
204     rs->next = st->variables;
205     st->variables = rs;
206
207     return rs;
208 }
209
210 void rrdsetvar_rename_all(RRDSET *st) {
211     debug(D_VARIABLES, "RRDSETVAR rename for chart id '%s' name '%s'", st->id, st->name);
212
213     // only these 2 can change name
214     // rs->family_name
215     // rs->host_name
216
217     char buffer[RRDVAR_MAX_LENGTH + 1];
218     RRDSETVAR *rs, *next = st->variables;
219     while((rs = next)) {
220         next = rs->next;
221
222         snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rs->variable);
223
224         if (strcmp(buffer, rs->fullname)) {
225             // name changed
226             rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
227             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_name);
228
229             freez(rs->fullname);
230             rs->fullname = strdupz(st->name);
231             rs->family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullname, rs->type, rs->value);
232             rs->host_name   = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullname, rs->type, rs->value);
233         }
234     }
235
236     rrdsetcalc_link_matching(st);
237 }
238
239 void rrdsetvar_free(RRDSETVAR *rs) {
240     RRDSET *st = rs->rrdset;
241     debug(D_VARIABLES, "RRDSETVAR free for chart id '%s' name '%s', variable '%s'", st->id, st->name, rs->variable);
242
243     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local);
244     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family);
245     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host);
246     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
247     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_name);
248
249     if(st->variables == rs) {
250         st->variables = rs->next;
251     }
252     else {
253         RRDSETVAR *t;
254         for (t = st->variables; t && t->next != rs; t = t->next);
255         if(!t) error("RRDSETVAR '%s' not found in chart '%s' variables linked list", rs->fullname, st->id);
256         else t->next = rs->next;
257     }
258
259     freez(rs->fullid);
260     freez(rs->fullname);
261     freez(rs->variable);
262     freez(rs);
263 }
264
265 // ----------------------------------------------------------------------------
266 // RRDDIMVAR management
267
268 #define RRDDIMVAR_ID_MAX 1024
269
270 RRDDIMVAR *rrddimvar_create(RRDDIM *rd, int type, const char *prefix, const char *suffix, void *value, uint32_t options) {
271     RRDSET *st = rd->rrdset;
272
273     debug(D_VARIABLES, "RRDDIMSET create for chart id '%s' name '%s', dimension id '%s', name '%s%s%s'", st->id, st->name, rd->id, (prefix)?prefix:"", rd->name, (suffix)?suffix:"");
274
275     if(!prefix) prefix = "";
276     if(!suffix) suffix = "";
277
278     char buffer[RRDDIMVAR_ID_MAX + 1];
279     RRDDIMVAR *rs = (RRDDIMVAR *)callocz(1, sizeof(RRDDIMVAR));
280
281     rs->prefix = strdupz(prefix);
282     rs->suffix = strdupz(suffix);
283
284     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->id, rs->suffix);
285     rs->id = strdupz(buffer);
286
287     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
288     rs->name = strdupz(buffer);
289
290     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->id, rs->id);
291     rs->fullidid = strdupz(buffer);
292
293     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->id, rs->name);
294     rs->fullidname = strdupz(buffer);
295
296     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->name, rs->id);
297     rs->fullnameid = strdupz(buffer);
298
299     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->name, rs->name);
300     rs->fullnamename = strdupz(buffer);
301
302     rs->type = type;
303     rs->value = value;
304     rs->options = options;
305     rs->rrddim = rd;
306
307     rs->local_id     = rrdvar_create_and_index("local", &st->variables_root_index, rs->id, rs->type, rs->value);
308     rs->local_name   = rrdvar_create_and_index("local", &st->variables_root_index, rs->name, rs->type, rs->value);
309
310     rs->family_id    = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->id, rs->type, rs->value);
311     rs->family_name  = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->name, rs->type, rs->value);
312
313     rs->host_fullidid     = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullidid, rs->type, rs->value);
314     rs->host_fullidname   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullidname, rs->type, rs->value);
315     rs->host_fullnameid   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullnameid, rs->type, rs->value);
316     rs->host_fullnamename = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullnamename, rs->type, rs->value);
317
318     rs->next = rd->variables;
319     rd->variables = rs;
320
321     return rs;
322 }
323
324 void rrddimvar_rename_all(RRDDIM *rd) {
325     RRDSET *st = rd->rrdset;
326     debug(D_VARIABLES, "RRDDIMSET rename for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
327
328     RRDDIMVAR *rs, *next = rd->variables;
329     while((rs = next)) {
330         next = rs->next;
331
332         if (strcmp(rd->name, rs->name)) {
333             char buffer[RRDDIMVAR_ID_MAX + 1];
334             // name changed
335
336             // name
337             rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_name);
338             freez(rs->name);
339             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
340             rs->name = strdupz(buffer);
341             rs->local_name = rrdvar_create_and_index("local", &st->variables_root_index, rs->name, rs->type, rs->value);
342
343             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidname);
344             freez(rs->fullidname);
345             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->name);
346             rs->fullidname = strdupz(buffer);
347             rs->host_fullidname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index,
348                                                              rs->fullidname, rs->type, rs->value);
349
350             // fullnameid
351             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnameid);
352             freez(rs->fullnameid);
353             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->id);
354             rs->fullnameid = strdupz(buffer);
355             rs->host_fullnameid = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index,
356                                                           rs->fullnameid, rs->type, rs->value);
357
358             // fullnamename
359             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnamename);
360             freez(rs->fullnamename);
361             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->name);
362             rs->fullnamename = strdupz(buffer);
363             rs->host_fullnamename = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index,
364                                                           rs->fullnamename, rs->type, rs->value);
365         }
366     }
367 }
368
369 void rrddimvar_free(RRDDIMVAR *rs) {
370     RRDDIM *rd = rs->rrddim;
371     RRDSET *st = rd->rrdset;
372     debug(D_VARIABLES, "RRDDIMSET free for chart id '%s' name '%s', dimension id '%s', name '%s', prefix='%s', suffix='%s'", st->id, st->name, rd->id, rd->name, rs->prefix, rs->suffix);
373
374     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_id);
375     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_name);
376
377     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_id);
378     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
379
380     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidid);
381     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidname);
382     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnameid);
383     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnamename);
384
385     if(rd->variables == rs) {
386         debug(D_VARIABLES, "RRDDIMSET removing first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
387         rd->variables = rs->next;
388     }
389     else {
390         debug(D_VARIABLES, "RRDDIMSET removing non-first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
391         RRDDIMVAR *t;
392         for (t = rd->variables; t && t->next != rs; t = t->next) ;
393         if(!t) error("RRDDIMVAR '%s' not found in dimension '%s/%s' variables linked list", rs->name, st->id, rd->id);
394         else t->next = rs->next;
395     }
396
397     freez(rs->prefix);
398     freez(rs->suffix);
399     freez(rs->id);
400     freez(rs->name);
401     freez(rs->fullidid);
402     freez(rs->fullidname);
403     freez(rs->fullnameid);
404     freez(rs->fullnamename);
405     freez(rs);
406 }
407
408 // ----------------------------------------------------------------------------
409 // RRDCALC management
410
411 static inline const char *rrdcalc_status2string(int status) {
412     switch(status) {
413         case RRDCALC_STATUS_UNINITIALIZED:
414             return "UNINITIALIZED";
415
416         case RRDCALC_STATUS_UNDEFINED:
417             return "UNDEFINED";
418
419         case RRDCALC_STATUS_CLEAR:
420             return "CLEAR";
421
422         case RRDCALC_STATUS_RAISED:
423             return "RAISED";
424
425         case RRDCALC_STATUS_WARNING:
426             return "WARNING";
427
428         case RRDCALC_STATUS_CRITICAL:
429             return "CRITICAL";
430
431         default:
432             return "UNKNOWN";
433     }
434 }
435
436 static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
437     debug(D_HEALTH, "Health linking alarm '%s.%s' to chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, st->rrdhost->hostname);
438
439     rc->last_status_change = time(NULL);
440     rc->rrdset = st;
441
442     rc->rrdset_next = st->alarms;
443     rc->rrdset_prev = NULL;
444     st->alarms = rc;
445
446     if(rc->update_every < rc->rrdset->update_every) {
447         error("Health alarm '%s.%s' has update every %d, less than chart update every %d. Setting alarm update frequency to %d.", rc->rrdset->id, rc->name, rc->update_every, rc->rrdset->update_every, rc->rrdset->update_every);
448         rc->update_every = rc->rrdset->update_every;
449     }
450
451     if(!isnan(rc->green) && isnan(st->green)) {
452         debug(D_HEALTH, "Health alarm '%s.%s' green threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->green, rc->green);
453         st->green = rc->green;
454     }
455
456     if(!isnan(rc->red) && isnan(st->red)) {
457         debug(D_HEALTH, "Health alarm '%s.%s' red threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->red, rc->red);
458         st->red = rc->red;
459     }
460
461     rc->local  = rrdvar_create_and_index("local",  &st->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
462     rc->family = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
463
464     char fullname[RRDVAR_MAX_LENGTH + 1];
465     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->id, rc->name);
466     rc->hostid   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
467
468     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rc->name);
469     rc->hostname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
470
471         if(!rc->units) rc->units = strdupz(st->units);
472 }
473
474 static inline int rrdcalc_is_matching_this_rrdset(RRDCALC *rc, RRDSET *st) {
475     if(     (rc->hash_chart == st->hash      && !strcmp(rc->chart, st->id)) ||
476             (rc->hash_chart == st->hash_name && !strcmp(rc->chart, st->name)))
477         return 1;
478
479     return 0;
480 }
481
482 // this has to be called while the RRDHOST is locked
483 inline void rrdsetcalc_link_matching(RRDSET *st) {
484     // debug(D_HEALTH, "find matching alarms for chart '%s'", st->id);
485
486     RRDCALC *rc;
487     for(rc = st->rrdhost->alarms; rc ; rc = rc->next) {
488         if(unlikely(rc->rrdset))
489             continue;
490
491         if(unlikely(rrdcalc_is_matching_this_rrdset(rc, st)))
492             rrdsetcalc_link(st, rc);
493     }
494 }
495
496 // this has to be called while the RRDHOST is locked
497 inline void rrdsetcalc_unlink(RRDCALC *rc) {
498     RRDSET *st = rc->rrdset;
499
500     if(!st) {
501         error("Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
502         return;
503     }
504
505     RRDHOST *host = st->rrdhost;
506
507     debug(D_HEALTH, "Health unlinking alarm '%s.%s' from chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, host->hostname);
508
509     // unlink it
510     if(rc->rrdset_prev)
511         rc->rrdset_prev->rrdset_next = rc->rrdset_next;
512
513     if(rc->rrdset_next)
514         rc->rrdset_next->rrdset_prev = rc->rrdset_prev;
515
516     if(st->alarms == rc)
517         st->alarms = rc->rrdset_next;
518
519     rc->rrdset_prev = rc->rrdset_next = NULL;
520
521     rrdvar_free(st->rrdhost, &st->variables_root_index, rc->local);
522     rc->local = NULL;
523
524     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rc->family);
525     rc->family = NULL;
526
527     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostid);
528     rc->hostid = NULL;
529
530     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostname);
531     rc->hostname = NULL;
532
533     rc->rrdset = NULL;
534
535     // RRDCALC will remain in RRDHOST
536     // so that if the matching chart is found in the future
537     // it will be applied automatically
538 }
539
540 RRDCALC *rrdcalc_find(RRDSET *st, const char *name) {
541     RRDCALC *rc;
542     uint32_t hash = simple_hash(name);
543
544     for( rc = st->alarms; rc ; rc = rc->rrdset_next ) {
545         if(unlikely(rc->hash == hash && !strcmp(rc->name, name)))
546             return rc;
547     }
548
549     return NULL;
550 }
551
552 static inline int rrdcalc_exists(RRDHOST *host, const char *name, uint32_t hash) {
553     RRDCALC *rc;
554
555     // make sure it does not already exist
556     for(rc = host->alarms; rc ; rc = rc->next) {
557         if (unlikely(rc->hash == hash && !strcmp(name, rc->name))) {
558             error("Health alarm '%s' already exists in host '%s'.", name, host->hostname);
559             return 1;
560         }
561     }
562
563     return 0;
564 }
565
566 static inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) {
567     rrdhost_check_rdlock(host);
568
569     if(rc->calculation) {
570         rc->calculation->this = &rc->value;
571         rc->calculation->after = &rc->db_after;
572         rc->calculation->before = &rc->db_before;
573         rc->calculation->rrdcalc = rc;
574     }
575
576     if(rc->warning) {
577         rc->warning->this = &rc->value;
578         rc->warning->after = &rc->db_after;
579         rc->warning->before = &rc->db_before;
580         rc->warning->rrdcalc = rc;
581     }
582
583     if(rc->critical) {
584         rc->critical->this = &rc->value;
585         rc->critical->after = &rc->db_after;
586         rc->critical->before = &rc->db_before;
587         rc->critical->rrdcalc = rc;
588     }
589
590     // link it to the host
591     if(likely(host->alarms)) {
592         // append it
593         RRDCALC *t;
594         for(t = host->alarms; t && t->next ; t = t->next) ;
595         t->next = rc;
596     }
597     else {
598         host->alarms = rc;
599     }
600
601     // link it to its chart
602     RRDSET *st;
603     for(st = host->rrdset_root; st ; st = st->next) {
604         if(rrdcalc_is_matching_this_rrdset(rc, st)) {
605             rrdsetcalc_link(st, rc);
606             break;
607         }
608     }
609 }
610
// Build the full name of an alarm into `fullname` (a buffer of `len` bytes):
// "<chart>.<name>", or just "<name>" when chart is NULL. The result is
// sanitized with rrdvar_fix_name() and its simple_hash() is returned.
static inline uint32_t rrdcalc_fullname(char *fullname, size_t len, const char *chart, const char *name) {
    const char *prefix = "";
    const char *separator = "";

    if(chart) {
        prefix = chart;
        separator = ".";
    }

    snprintfz(fullname, len - 1, "%s%s%s", prefix, separator, name);
    rrdvar_fix_name(fullname);

    return simple_hash(fullname);
}
616
617 static inline RRDCALC *rrdcalc_create(RRDHOST *host, const char *name, const char *chart, const char *dimensions,
618                         const char *units, const char *info,
619                         int group_method, int after, int before, int update_every, uint32_t options,
620                         calculated_number green, calculated_number red,
621                         const char *exec, const char *source,
622                         const char *calc, const char *warn, const char *crit) {
623
624     char fullname[RRDVAR_MAX_LENGTH + 1];
625     uint32_t hash = rrdcalc_fullname(fullname, RRDVAR_MAX_LENGTH + 1, chart, name);
626
627     if(rrdcalc_exists(host, fullname, hash))
628         return NULL;
629
630     RRDCALC *rc = callocz(1, sizeof(RRDCALC));
631
632     rc->name = strdupz(name);
633     rc->hash = simple_hash(rc->name);
634
635     rc->chart = strdupz(chart);
636     rc->hash_chart = simple_hash(rc->chart);
637
638     if(dimensions) rc->dimensions = strdupz(dimensions);
639
640     rc->green = green;
641     rc->red = red;
642     rc->value = NAN;
643     rc->old_value = NAN;
644
645     rc->group = group_method;
646     rc->after = after;
647     rc->before = before;
648     rc->update_every = update_every;
649     rc->options = options;
650
651     if(exec) rc->exec = strdupz(exec);
652     if(source) rc->source = strdupz(source);
653     if(units) rc->units = strdupz(units);
654     if(info) rc->info = strdupz(info);
655
656     if(calc) {
657         rc->calculation = expression_parse(calc, NULL, NULL);
658         if(!rc->calculation)
659             error("Health alarm '%s.%s': failed to parse calculation expression '%s'", chart, name, calc);
660     }
661     if(warn) {
662         rc->warning = expression_parse(warn, NULL, NULL);
663         if(!rc->warning)
664             error("Health alarm '%s.%s': failed to re-parse warning expression '%s'", chart, name, warn);
665     }
666     if(crit) {
667         rc->critical = expression_parse(crit, NULL, NULL);
668         if(!rc->critical)
669             error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", chart, name, crit);
670     }
671
672     debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s",
673           (rc->chart)?rc->chart:"NOCHART",
674           rc->name,
675           (rc->exec)?rc->exec:"DEFAULT",
676           rc->green,
677           rc->red,
678           rc->group,
679           rc->after,
680           rc->before,
681           rc->options,
682           (rc->dimensions)?rc->dimensions:"NONE",
683           rc->update_every,
684           (rc->calculation)?rc->calculation->parsed_as:"NONE",
685           (rc->warning)?rc->warning->parsed_as:"NONE",
686           (rc->critical)?rc->critical->parsed_as:"NONE",
687           rc->source
688     );
689
690     rrdcalc_create_part2(host, rc);
691     return rc;
692 }
693
694 void rrdcalc_free(RRDHOST *host, RRDCALC *rc) {
695     if(!rc) return;
696
697     debug(D_HEALTH, "Health removing alarm '%s.%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
698
699     // unlink it from RRDSET
700     if(rc->rrdset) rrdsetcalc_unlink(rc);
701
702     // unlink it from RRDHOST
703     if(unlikely(rc == host->alarms))
704         host->alarms = rc->next;
705
706     else if(likely(host->alarms)) {
707         RRDCALC *t, *last = host->alarms;
708
709         for(t = last->next; t && t != rc; last = t, t = t->next) ;
710         if(last && last->next == rc)
711             last->next = rc->next;
712         else
713             error("Cannot unlink alarm '%s.%s' from host '%s': not found", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
714     }
715     else
716         error("Cannot unlink unlink '%s.%s' from host '%s': This host does not have any calculations", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
717
718     expression_free(rc->calculation);
719     expression_free(rc->warning);
720     expression_free(rc->critical);
721
722     freez(rc->name);
723     freez(rc->chart);
724     freez(rc->family);
725     freez(rc->dimensions);
726     freez(rc->exec);
727     freez(rc->source);
728     freez(rc->units);
729     freez(rc->info);
730     freez(rc);
731 }
732
733 // ----------------------------------------------------------------------------
734 // RRDCALCTEMPLATE management
735
736 void rrdcalctemplate_link_matching(RRDSET *st) {
737     RRDCALCTEMPLATE *rt;
738
739     for(rt = st->rrdhost->templates; rt ; rt = rt->next) {
740         if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context)) {
741             RRDCALC *rc = rrdcalc_create(st->rrdhost, rt->name, st->id,
742                            rt->dimensions, rt->units, rt->info, rt->group, rt->after, rt->before, rt->update_every, rt->options,
743                            rt->green, rt->red, rt->exec, rt->source,
744                            (rt->calculation)?rt->calculation->source:NULL,
745                            (rt->warning)?rt->warning->source:NULL,
746                            (rt->critical)?rt->critical->source:NULL);
747
748             if(unlikely(!rc))
749                 error("Health tried to create alarm from template '%s', but it failed", rt->name);
750
751 #ifdef NETDATA_INTERNAL_CHECKS
752             else if(rc->rrdset != st)
753                 error("Health alarm '%s.%s' should be linked to chart '%s', but it is not", rc->chart?rc->chart:"NOCHART", rc->name, st->id);
754 #endif
755         }
756     }
757 }
758
759 static inline void rrdcalctemplate_free(RRDHOST *host, RRDCALCTEMPLATE *rt) {
760     debug(D_HEALTH, "Health removing template '%s' of host '%s'", rt->name, host->hostname);
761
762     if(host->templates) {
763         if(host->templates == rt) {
764             host->templates = rt->next;
765         }
766         else {
767             RRDCALCTEMPLATE *t, *last = host->templates;
768             for (t = last->next; t && t != rt; last = t, t = t->next ) ;
769             if(last && last->next == rt) {
770                 last->next = rt->next;
771                 rt->next = NULL;
772             }
773             else
774                 error("Cannot find RRDCALCTEMPLATE '%s' linked in host '%s'", rt->name, host->hostname);
775         }
776     }
777
778     expression_free(rt->calculation);
779     expression_free(rt->warning);
780     expression_free(rt->critical);
781
782     freez(rt->name);
783     freez(rt->exec);
784     freez(rt->context);
785     freez(rt->source);
786     freez(rt->units);
787     freez(rt->info);
788     freez(rt->dimensions);
789     freez(rt);
790 }
791
792 // ----------------------------------------------------------------------------
793 // load health configuration
794
795 #define HEALTH_CONF_MAX_LINE 4096
796
797 #define HEALTH_ALARM_KEY "alarm"
798 #define HEALTH_TEMPLATE_KEY "template"
799 #define HEALTH_ON_KEY "on"
800 #define HEALTH_LOOKUP_KEY "lookup"
801 #define HEALTH_CALC_KEY "calc"
802 #define HEALTH_EVERY_KEY "every"
803 #define HEALTH_GREEN_KEY "green"
804 #define HEALTH_RED_KEY "red"
805 #define HEALTH_WARN_KEY "warn"
806 #define HEALTH_CRIT_KEY "crit"
807 #define HEALTH_EXEC_KEY "exec"
808 #define HEALTH_UNITS_KEY "units"
809 #define HEALTH_INFO_KEY "info"
810
811 static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
812     {
813         char fullname[RRDVAR_MAX_LENGTH + 1];
814         uint32_t hash = rrdcalc_fullname(fullname, RRDVAR_MAX_LENGTH + 1, rc->chart, rc->name);
815
816         if (rrdcalc_exists(host, fullname, hash))
817             return 0;
818     }
819
820     if(!rc->chart) {
821         error("Health configuration for alarm '%s' does not have a chart", rc->name);
822         return 0;
823     }
824
825     if(!rc->update_every) {
826         error("Health configuration for alarm '%s.%s' has no frequency (parameter 'every'). Ignoring it.", rc->chart?rc->chart:"NOCHART", rc->name);
827         return 0;
828     }
829
830     if(!RRDCALC_HAS_DB_LOOKUP(rc) && !rc->warning && !rc->critical) {
831         error("Health configuration for alarm '%s.%s' is useless (no calculation, no warning and no critical evaluation)", rc->chart?rc->chart:"NOCHART", rc->name);
832         return 0;
833     }
834
835     debug(D_HEALTH, "Health configuration adding alarm '%s.%s': exec '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s",
836           rc->chart?rc->chart:"NOCHART",
837           rc->name,
838           (rc->exec)?rc->exec:"DEFAULT",
839           rc->green,
840           rc->red,
841           rc->group,
842           rc->after,
843           rc->before,
844           rc->options,
845           (rc->dimensions)?rc->dimensions:"NONE",
846           rc->update_every,
847           (rc->calculation)?rc->calculation->parsed_as:"NONE",
848           (rc->warning)?rc->warning->parsed_as:"NONE",
849           (rc->critical)?rc->critical->parsed_as:"NONE",
850           rc->source
851     );
852
853     rrdcalc_create_part2(host, rc);
854     return 1;
855 }
856
857 static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCALCTEMPLATE *rt) {
858     if(unlikely(!rt->context)) {
859         error("Health configuration for template '%s' does not have a context", rt->name);
860         return 0;
861     }
862
863     if(unlikely(!rt->update_every)) {
864         error("Health configuration for template '%s' has no frequency (parameter 'every'). Ignoring it.", rt->name);
865         return 0;
866     }
867
868     if(unlikely(!RRDCALCTEMPLATE_HAS_CALCULATION(rt) && !rt->warning && !rt->critical)) {
869         error("Health configuration for template '%s' is useless (no calculation, no warning and no critical evaluation)", rt->name);
870         return 0;
871     }
872
873     RRDCALCTEMPLATE *t, *last = NULL;
874     for (t = host->templates; t ; last = t, t = t->next) {
875         if(unlikely(t->hash_name == rt->hash_name && !strcmp(t->name, rt->name))) {
876             error("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname);
877             return 0;
878         }
879     }
880
881     debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s'",
882           rt->name,
883           (rt->context)?rt->context:"NONE",
884           (rt->exec)?rt->exec:"DEFAULT",
885           rt->green,
886           rt->red,
887           rt->group,
888           rt->after,
889           rt->before,
890           rt->options,
891           (rt->dimensions)?rt->dimensions:"NONE",
892           rt->update_every,
893           (rt->calculation)?rt->calculation->parsed_as:"NONE",
894           (rt->warning)?rt->warning->parsed_as:"NONE",
895           (rt->critical)?rt->critical->parsed_as:"NONE",
896           rt->source
897     );
898
899     if(likely(last)) {
900         last->next = rt;
901     }
902     else {
903         rt->next = host->templates;
904         host->templates = rt;
905     }
906
907     return 1;
908 }
909
// Parse a duration: a number optionally followed by a unit character.
//
// Units: 'Y' (365 days), 'M' (30 days), 'w' (weeks), 'd' (days), 'h' (hours),
// 'm' (minutes), 's' (seconds). Anything else after the number, or nothing at all,
// is taken as seconds.
//
// The duration, in seconds, is stored in *result (truncated to int).
// Returns 1 on success. Returns 0, with *result set to 0, when the string does not
// start with a digit or a sign, or when no number could be converted from it
// (e.g. a sign alone).
static inline int health_parse_duration(char *string, int *result) {
    // make sure it is a number
    // (the ctype functions need a value representable as unsigned char)
    unsigned char first = (unsigned char)*string;
    if(!first || !(isdigit(first) || first == '+' || first == '-')) {
        *result = 0;
        return 0;
    }

    char *e = NULL;
    long double n = strtold(string, &e);

    // nothing has been converted (e.g. "-" or "+x")
    if(e == string) {
        *result = 0;
        return 0;
    }

    if(e && *e) {
        switch (*e) {
            case 'Y':
                *result = (int) (n * 86400 * 365);
                break;
            case 'M':
                *result = (int) (n * 86400 * 30);
                break;
            case 'w':
                *result = (int) (n * 86400 * 7);
                break;
            case 'd':
                *result = (int) (n * 86400);
                break;
            case 'h':
                *result = (int) (n * 3600);
                break;
            case 'm':
                *result = (int) (n * 60);
                break;

            default:
            case 's':
                *result = (int) (n);
                break;
        }
    }
    else
       *result = (int)(n);

    return 1;
}
951
952 static inline int health_parse_db_lookup(
953         size_t line, const char *path, const char *file, char *string,
954         int *group_method, int *after, int *before, int *every,
955         uint32_t *options, char **dimensions
956 ) {
957     debug(D_HEALTH, "Health configuration parsing database lookup %zu@%s/%s: %s", line, path, file, string);
958
959     if(*dimensions) freez(*dimensions);
960     *dimensions = NULL;
961     *after = 0;
962     *before = 0;
963     *every = 0;
964     *options = 0;
965
966     char *s = string, *key;
967
968     // first is the group method
969     key = s;
970     while(*s && !isspace(*s)) s++;
971     while(*s && isspace(*s)) *s++ = '\0';
972     if(!*s) {
973         error("Health configuration invalid chart calculation at line %zu of file '%s/%s': expected group method followed by the 'after' time, but got '%s'",
974               line, path, file, key);
975         return 0;
976     }
977
978     if((*group_method = web_client_api_request_v1_data_group(key, -1)) == -1) {
979         error("Health configuration at line %zu of file '%s/%s': invalid group method '%s'",
980               line, path, file, key);
981         return 0;
982     }
983
984     // then is the 'after' time
985     key = s;
986     while(*s && !isspace(*s)) s++;
987     while(*s && isspace(*s)) *s++ = '\0';
988
989     if(!health_parse_duration(key, after)) {
990         error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' after group method",
991               line, path, file, key);
992         return 0;
993     }
994
995     // sane defaults
996     *every = abs(*after);
997
998     // now we may have optional parameters
999     while(*s) {
1000         key = s;
1001         while(*s && !isspace(*s)) s++;
1002         while(*s && isspace(*s)) *s++ = '\0';
1003         if(!*key) break;
1004
1005         if(!strcasecmp(key, "at")) {
1006             char *value = s;
1007             while(*s && !isspace(*s)) s++;
1008             while(*s && isspace(*s)) *s++ = '\0';
1009
1010             if (!health_parse_duration(value, before)) {
1011                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1012                       line, path, file, value, key);
1013             }
1014         }
1015         else if(!strcasecmp(key, HEALTH_EVERY_KEY)) {
1016             char *value = s;
1017             while(*s && !isspace(*s)) s++;
1018             while(*s && isspace(*s)) *s++ = '\0';
1019
1020             if (!health_parse_duration(value, every)) {
1021                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1022                       line, path, file, value, key);
1023             }
1024         }
1025         else if(!strcasecmp(key, "absolute") || !strcasecmp(key, "abs") || !strcasecmp(key, "absolute_sum")) {
1026             *options |= RRDR_OPTION_ABSOLUTE;
1027         }
1028         else if(!strcasecmp(key, "min2max")) {
1029             *options |= RRDR_OPTION_MIN2MAX;
1030         }
1031         else if(!strcasecmp(key, "null2zero")) {
1032             *options |= RRDR_OPTION_NULL2ZERO;
1033         }
1034         else if(!strcasecmp(key, "percentage")) {
1035             *options |= RRDR_OPTION_PERCENTAGE;
1036         }
1037         else if(!strcasecmp(key, "unaligned")) {
1038             *options |= RRDR_OPTION_NOT_ALIGNED;
1039         }
1040         else if(!strcasecmp(key, "of")) {
1041             if(*s && strcasecmp(s, "all"))
1042                *dimensions = strdupz(s);
1043             break;
1044         }
1045         else {
1046             error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
1047                   line, path, file, key);
1048         }
1049     }
1050
1051     return 1;
1052 }
1053
// Build the "LINE@PATH/FILENAME" string that identifies where an alarm or a template
// has been defined. The returned string is a copy made with strdupz().
static inline char *health_source_file(size_t line, const char *path, const char *filename) {
    char source[FILENAME_MAX + 1];

    snprintfz(source, sizeof(source) - 1, "%zu@%s/%s", line, path, filename);

    return strdupz(source);
}
1059
// Replace, in place, every single and double quote of the string with a space.
static inline void strip_quotes(char *s) {
    for(char *p = s; *p != '\0'; p++) {
        switch(*p) {
            case '\'':
            case '"':
                *p = ' ';
                break;

            default:
                break;
        }
    }
}
1066
1067 int health_readfile(const char *path, const char *filename) {
1068     debug(D_HEALTH, "Health configuration reading file '%s/%s'", path, filename);
1069
1070     static uint32_t hash_alarm = 0, hash_template = 0, hash_on = 0, hash_calc = 0, hash_green = 0, hash_red = 0, hash_warn = 0, hash_crit = 0, hash_exec = 0, hash_every = 0, hash_lookup = 0, hash_units = 0, hash_info = 0;
1071     char buffer[HEALTH_CONF_MAX_LINE + 1];
1072
1073     if(unlikely(!hash_alarm)) {
1074         hash_alarm = simple_uhash(HEALTH_ALARM_KEY);
1075         hash_template = simple_uhash(HEALTH_TEMPLATE_KEY);
1076         hash_on = simple_uhash(HEALTH_ON_KEY);
1077         hash_calc = simple_uhash(HEALTH_CALC_KEY);
1078         hash_lookup = simple_uhash(HEALTH_LOOKUP_KEY);
1079         hash_green = simple_uhash(HEALTH_GREEN_KEY);
1080         hash_red = simple_uhash(HEALTH_RED_KEY);
1081         hash_warn = simple_uhash(HEALTH_WARN_KEY);
1082         hash_crit = simple_uhash(HEALTH_CRIT_KEY);
1083         hash_exec = simple_uhash(HEALTH_EXEC_KEY);
1084         hash_every = simple_uhash(HEALTH_EVERY_KEY);
1085         hash_units = simple_hash(HEALTH_UNITS_KEY);
1086         hash_info = simple_hash(HEALTH_INFO_KEY);
1087     }
1088
1089     snprintfz(buffer, HEALTH_CONF_MAX_LINE, "%s/%s", path, filename);
1090     FILE *fp = fopen(buffer, "r");
1091     if(!fp) {
1092         error("Health configuration cannot read file '%s'.", buffer);
1093         return 0;
1094     }
1095
1096     RRDCALC *rc = NULL;
1097     RRDCALCTEMPLATE *rt = NULL;
1098
1099     size_t line = 0, append = 0;
1100     char *s;
1101     while((s = fgets(&buffer[append], (int)(HEALTH_CONF_MAX_LINE - append), fp)) || append) {
1102         int stop_appending = !s;
1103         line++;
1104         // info("Line %zu of file '%s/%s': '%s'", line, path, filename, s);
1105         s = trim(buffer);
1106         if(!s) continue;
1107         // info("Trimmed line %zu of file '%s/%s': '%s'", line, path, filename, s);
1108
1109         append = strlen(s);
1110         if(!stop_appending && s[append - 1] == '\\') {
1111             s[append - 1] = ' ';
1112             append = &s[append] - buffer;
1113             if(append < HEALTH_CONF_MAX_LINE)
1114                 continue;
1115             else {
1116                 error("Health configuration has too long muli-line at line %zu of file '%s/%s'.", line, path, filename);
1117             }
1118         }
1119         append = 0;
1120
1121         char *key = s;
1122         while(*s && *s != ':') s++;
1123         if(!*s) {
1124             error("Health configuration has invalid line %zu of file '%s/%s'. It does not contain a ':'. Ignoring it.", line, path, filename);
1125             continue;
1126         }
1127         *s = '\0';
1128         s++;
1129
1130         char *value = s;
1131         key = trim(key);
1132         value = trim(value);
1133
1134         if(!key) {
1135             error("Health configuration has invalid line %zu of file '%s/%s'. Keyword is empty. Ignoring it.", line, path, filename);
1136             continue;
1137         }
1138
1139         if(!value) {
1140             error("Health configuration has invalid line %zu of file '%s/%s'. value is empty. Ignoring it.", line, path, filename);
1141             continue;
1142         }
1143
1144         // info("Health file '%s/%s', key '%s', value '%s'", path, filename, key, value);
1145         uint32_t hash = simple_uhash(key);
1146
1147         if(hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) {
1148             if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
1149                 rrdcalc_free(&localhost, rc);
1150
1151             if(rt) {
1152                 if (!rrdcalctemplate_add_template_from_config(&localhost, rt))
1153                     rrdcalctemplate_free(&localhost, rt);
1154                 rt = NULL;
1155             }
1156
1157             rc = callocz(1, sizeof(RRDCALC));
1158             rc->name = strdupz(value);
1159             rc->hash = simple_hash(rc->name);
1160             rc->source = health_source_file(line, path, filename);
1161             rc->green = NAN;
1162             rc->red = NAN;
1163             rc->value = NAN;
1164             rc->old_value = NAN;
1165
1166             if(rrdvar_fix_name(rc->name))
1167                 error("Health configuration renamed alarm '%s' to '%s'", value, rc->name);
1168         }
1169         else if(hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY)) {
1170             if(rc) {
1171                 if(!rrdcalc_add_alarm_from_config(&localhost, rc))
1172                     rrdcalc_free(&localhost, rc);
1173                 rc = NULL;
1174             }
1175
1176             if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
1177                 rrdcalctemplate_free(&localhost, rt);
1178
1179             rt = callocz(1, sizeof(RRDCALCTEMPLATE));
1180             rt->name = strdupz(value);
1181             rt->hash_name = simple_hash(rt->name);
1182             rt->source = health_source_file(line, path, filename);
1183             rt->green = NAN;
1184             rt->red = NAN;
1185
1186             if(rrdvar_fix_name(rt->name))
1187                 error("Health configuration renamed template '%s' to '%s'", value, rt->name);
1188         }
1189         else if(rc) {
1190             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
1191                 if(rc->chart) {
1192                     if(strcmp(rc->chart, value))
1193                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1194                              line, path, filename, rc->name, key, rc->chart, value, value);
1195
1196                     freez(rc->chart);
1197                 }
1198                 rc->chart = strdupz(value);
1199                 rc->hash_chart = simple_hash(rc->chart);
1200             }
1201             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
1202                 health_parse_db_lookup(line, path, filename, value, &rc->group, &rc->after, &rc->before,
1203                                        &rc->update_every,
1204                                        &rc->options, &rc->dimensions);
1205             }
1206             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
1207                 if(!health_parse_duration(value, &rc->update_every))
1208                     info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.",
1209                          line, path, filename, rc->name, key, value);
1210             }
1211             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
1212                 char *e;
1213                 rc->green = strtold(value, &e);
1214                 if(e && *e) {
1215                     info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1216                          line, path, filename, rc->name, key, e);
1217                 }
1218             }
1219             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
1220                 char *e;
1221                 rc->red = strtold(value, &e);
1222                 if(e && *e) {
1223                     info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1224                          line, path, filename, rc->name, key, e);
1225                 }
1226             }
1227             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
1228                 const char *failed_at = NULL;
1229                 int error = 0;
1230                 rc->calculation = expression_parse(value, &failed_at, &error);
1231                 if(!rc->calculation) {
1232                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1233                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1234                 }
1235             }
1236             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
1237                 const char *failed_at = NULL;
1238                 int error = 0;
1239                 rc->warning = expression_parse(value, &failed_at, &error);
1240                 if(!rc->warning) {
1241                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1242                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1243                 }
1244             }
1245             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
1246                 const char *failed_at = NULL;
1247                 int error = 0;
1248                 rc->critical = expression_parse(value, &failed_at, &error);
1249                 if(!rc->critical) {
1250                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1251                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1252                 }
1253             }
1254             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
1255                 if(rc->exec) {
1256                     if(strcmp(rc->exec, value))
1257                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1258                              line, path, filename, rc->name, key, rc->exec, value, value);
1259
1260                     freez(rc->exec);
1261                 }
1262                 rc->exec = strdupz(value);
1263             }
1264             else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
1265                 if(rc->units) {
1266                     if(strcmp(rc->units, value))
1267                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1268                              line, path, filename, rc->name, key, rc->units, value, value);
1269
1270                     freez(rc->units);
1271                 }
1272                 rc->units = strdupz(value);
1273                 strip_quotes(rc->units);
1274             }
1275             else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
1276                 if(rc->info) {
1277                     if(strcmp(rc->info, value))
1278                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1279                              line, path, filename, rc->name, key, rc->info, value, value);
1280
1281                     freez(rc->info);
1282                 }
1283                 rc->info = strdupz(value);
1284                 strip_quotes(rc->info);
1285             }
1286             else {
1287                 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has unknown key '%s'.",
1288                      line, path, filename, rc->name, key);
1289             }
1290         }
1291         else if(rt) {
1292             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
1293                 if(rt->context) {
1294                     if(strcmp(rt->context, value))
1295                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1296                              line, path, filename, rt->name, key, rt->context, value, value);
1297
1298                     freez(rt->context);
1299                 }
1300                 rt->context = strdupz(value);
1301                 rt->hash_context = simple_hash(rt->context);
1302             }
1303             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
1304                 health_parse_db_lookup(line, path, filename, value, &rt->group, &rt->after, &rt->before,
1305                                        &rt->update_every,
1306                                        &rt->options, &rt->dimensions);
1307             }
1308             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
1309                 if(!health_parse_duration(value, &rt->update_every))
1310                     info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' cannot parse duration: '%s'.",
1311                          line, path, filename, rt->name, key, value);
1312             }
1313             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
1314                 char *e;
1315                 rt->green = strtold(value, &e);
1316                 if(e && *e) {
1317                     info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
1318                          line, path, filename, rt->name, key, e);
1319                 }
1320             }
1321             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
1322                 char *e;
1323                 rt->red = strtold(value, &e);
1324                 if(e && *e) {
1325                     info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
1326                          line, path, filename, rt->name, key, e);
1327                 }
1328             }
1329             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
1330                 const char *failed_at = NULL;
1331                 int error = 0;
1332                 rt->calculation = expression_parse(value, &failed_at, &error);
1333                 if(!rt->calculation) {
1334                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1335                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1336                 }
1337             }
1338             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
1339                 const char *failed_at = NULL;
1340                 int error = 0;
1341                 rt->warning = expression_parse(value, &failed_at, &error);
1342                 if(!rt->warning) {
1343                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1344                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1345                 }
1346             }
1347             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
1348                 const char *failed_at = NULL;
1349                 int error = 0;
1350                 rt->critical = expression_parse(value, &failed_at, &error);
1351                 if(!rt->critical) {
1352                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1353                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1354                 }
1355             }
1356             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
1357                 if(rt->exec) {
1358                     if(strcmp(rt->exec, value))
1359                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1360                              line, path, filename, rt->name, key, rt->exec, value, value);
1361
1362                     freez(rt->exec);
1363                 }
1364                 rt->exec = strdupz(value);
1365             }
1366             else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
1367                 if(rt->units) {
1368                     if(strcmp(rt->units, value))
1369                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1370                              line, path, filename, rt->name, key, rt->units, value, value);
1371
1372                     freez(rt->units);
1373                 }
1374                 rt->units = strdupz(value);
1375                 strip_quotes(rt->units);
1376             }
1377             else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
1378                 if(rt->info) {
1379                     if(strcmp(rt->info, value))
1380                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1381                              line, path, filename, rt->name, key, rt->info, value, value);
1382
1383                     freez(rt->info);
1384                 }
1385                 rt->info = strdupz(value);
1386                 strip_quotes(rt->info);
1387             }
1388             else {
1389                 error("Health configuration at line %zu of file '%s/%s' for template '%s' has unknown key '%s'.",
1390                       line, path, filename, rt->name, key);
1391             }
1392         }
1393         else {
1394             error("Health configuration at line %zu of file '%s/%s' has unknown key '%s'. Expected either '" HEALTH_ALARM_KEY "' or '" HEALTH_TEMPLATE_KEY "'.",
1395                   line, path, filename, key);
1396         }
1397     }
1398
1399     if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
1400         rrdcalc_free(&localhost, rc);
1401
1402     if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
1403         rrdcalctemplate_free(&localhost, rt);
1404
1405     fclose(fp);
1406     return 1;
1407 }
1408
1409 void health_readdir(const char *path) {
1410     size_t pathlen = strlen(path);
1411
1412     debug(D_HEALTH, "Health configuration reading directory '%s'", path);
1413
1414     DIR *dir = opendir(path);
1415     if (!dir) {
1416         error("Health configuration cannot open directory '%s'.", path);
1417         return;
1418     }
1419
1420     struct dirent *de = NULL;
1421     while ((de = readdir(dir))) {
1422         size_t len = strlen(de->d_name);
1423
1424         if(de->d_type == DT_DIR
1425            && (
1426                    (de->d_name[0] == '.' && de->d_name[1] == '\0')
1427                    || (de->d_name[0] == '.' && de->d_name[1] == '.' && de->d_name[2] == '\0')
1428            ))
1429             continue;
1430
1431         else if(de->d_type == DT_DIR) {
1432             char *s = mallocz(pathlen + strlen(de->d_name) + 2);
1433             strcpy(s, path);
1434             strcat(s, "/");
1435             strcat(s, de->d_name);
1436             health_readdir(s);
1437             freez(s);
1438             continue;
1439         }
1440
1441         else if((de->d_type == DT_LNK || de->d_type == DT_REG) &&
1442                 len > 5 && !strcmp(&de->d_name[len - 5], ".conf")) {
1443             health_readfile(path, de->d_name);
1444         }
1445     }
1446
1447     closedir(dir);
1448 }
1449
1450 static inline char *health_config_dir(void) {
1451     char buffer[FILENAME_MAX + 1];
1452     snprintfz(buffer, FILENAME_MAX, "%s/health.d", config_get("global", "config directory", CONFIG_DIR));
1453     return config_get("health", "health configuration directory", buffer);
1454 }
1455
1456 void health_init(void) {
1457     debug(D_HEALTH, "Health configuration initializing");
1458
1459     if(!(health_enabled = config_get_boolean("health", "enabled", 1))) {
1460         debug(D_HEALTH, "Health is disabled.");
1461         return;
1462     }
1463
1464     char *path = health_config_dir();
1465
1466     {
1467         char buffer[FILENAME_MAX + 1];
1468         snprintfz(buffer, FILENAME_MAX, "%s/alarm-email.sh", config_get("global", "plugins directory", PLUGINS_DIR));
1469         health_default_exec = config_get("health", "script to execute on alarm", buffer);
1470     }
1471
1472     long n = config_get_number("health", "in memory max health log entries", (long)localhost.health_log.max);
1473     if(n < 2) {
1474         error("Health configuration has invalid max log entries %ld. Using default %u", n, localhost.health_log.max);
1475         config_set_number("health", "in memory max health log entries", (long)localhost.health_log.max);
1476     }
1477     else localhost.health_log.max = (unsigned int)n;
1478
1479     rrdhost_rwlock(&localhost);
1480     health_readdir(path);
1481     rrdhost_unlock(&localhost);
1482 }
1483
1484 // ----------------------------------------------------------------------------
1485 // JSON generation
1486
1487 static inline void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) {
1488     if(value && *value)
1489         buffer_sprintf(wb, "%s\"%s\":\"%s\"%s", prefix, label, value, suffix);
1490     else
1491         buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix);
1492 }
1493
1494 static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae) {
1495     buffer_sprintf(wb, "\n\t{\n"
1496                            "\t\t\"id\":%u,\n"
1497                            "\t\t\"name\":\"%s\",\n"
1498                            "\t\t\"chart\":\"%s\",\n"
1499                            "\t\t\"family\":\"%s\",\n"
1500                            "\t\t\"processed\":%s,\n"
1501                            "\t\t\"updated\":%s,\n"
1502                            "\t\t\"exec_run\":%s,\n"
1503                            "\t\t\"exec_failed\":%s,\n"
1504                            "\t\t\"exec\":\"%s\",\n"
1505                            "\t\t\"exec_code\":%d,\n"
1506                            "\t\t\"source\":\"%s\",\n"
1507                            "\t\t\"units\":\"%s\",\n"
1508                            "\t\t\"info\":\"%s\",\n"
1509                            "\t\t\"when\":%lu,\n"
1510                            "\t\t\"duration\":%lu,\n"
1511                            "\t\t\"non_clear_duration\":%lu,\n"
1512                            "\t\t\"status\":\"%s\",\n"
1513                            "\t\t\"old_status\":\"%s\",\n",
1514                    ae->id,
1515                    ae->name,
1516                    ae->chart,
1517                    ae->family,
1518                    (ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_PROCESSED)?"true":"false",
1519                    (ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_UPDATED)?"true":"false",
1520                    (ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_EXEC_RUN)?"true":"false",
1521                    (ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_EXEC_FAILED)?"true":"false",
1522                    ae->exec?ae->exec:health_default_exec,
1523                    ae->exec_code,
1524                    ae->source,
1525                    ae->units?ae->units:"",
1526                    ae->info?ae->info:"",
1527                    (unsigned long)ae->when,
1528                    (unsigned long)ae->duration,
1529                    (unsigned long)ae->non_clear_duration,
1530                    rrdcalc_status2string(ae->new_status),
1531                    rrdcalc_status2string(ae->old_status)
1532     );
1533
1534     buffer_strcat(wb, "\t\t\"value\":");
1535     buffer_rrd_value(wb, ae->new_value);
1536     buffer_strcat(wb, ",\n");
1537
1538     buffer_strcat(wb, "\t\t\"old_value\":");
1539     buffer_rrd_value(wb, ae->old_value);
1540     buffer_strcat(wb, "\n");
1541
1542     buffer_strcat(wb, "\t}");
1543 }
1544
1545 void health_alarm_log2json(RRDHOST *host, BUFFER *wb) {
1546     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
1547
1548     buffer_strcat(wb, "[");
1549
1550     unsigned int max = host->health_log.max;
1551     unsigned int count = 0;
1552     ALARM_ENTRY *ae;
1553     for(ae = host->health_log.alarms; ae && count < max ; count++, ae = ae->next) {
1554         if(likely(count)) buffer_strcat(wb, ",");
1555         health_alarm_entry2json_nolock(wb, ae);
1556     }
1557
1558     buffer_strcat(wb, "\n]\n");
1559
1560     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
1561 }
1562
1563 static inline void health_rrdcalc2json_nolock(BUFFER *wb, RRDCALC *rc) {
1564     buffer_sprintf(wb,
1565            "\t\t\"%s.%s\": {\n"
1566                    "\t\t\t\"name\": \"%s\",\n"
1567                    "\t\t\t\"chart\": \"%s\",\n"
1568                    "\t\t\t\"family\": \"%s\",\n"
1569                    "\t\t\t\"active\": %s,\n"
1570                    "\t\t\t\"exec\": \"%s\",\n"
1571                    "\t\t\t\"source\": \"%s\",\n"
1572                    "\t\t\t\"units\": \"%s\",\n"
1573                    "\t\t\t\"info\": \"%s\",\n"
1574                                    "\t\t\t\"status\": \"%s\",\n"
1575                    "\t\t\t\"last_status_change\": %lu,\n"
1576                    "\t\t\t\"last_updated\": %lu,\n"
1577                    "\t\t\t\"next_update\": %lu,\n"
1578                    "\t\t\t\"update_every\": %d,\n"
1579             , rc->chart, rc->name
1580             , rc->name
1581             , rc->chart
1582             , (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:""
1583             , (rc->rrdset)?"true":"false"
1584             , rc->exec?rc->exec:health_default_exec
1585             , rc->source
1586             , rc->units?rc->units:""
1587             , rc->info?rc->info:""
1588             , rrdcalc_status2string(rc->status)
1589             , (unsigned long)rc->last_status_change
1590             , (unsigned long)rc->last_updated
1591             , (unsigned long)rc->next_update
1592             , rc->update_every
1593     );
1594
1595     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
1596         if(rc->dimensions && *rc->dimensions)
1597             health_string2json(wb, "\t\t\t", "lookup_dimensions", rc->dimensions, ",\n");
1598
1599         buffer_sprintf(wb,
1600                        "\t\t\t\"db_after\": %lu,\n"
1601                        "\t\t\t\"db_before\": %lu,\n"
1602                        "\t\t\t\"lookup_method\": \"%s\",\n"
1603                        "\t\t\t\"lookup_after\": %d,\n"
1604                        "\t\t\t\"lookup_before\": %d,\n"
1605                        "\t\t\t\"lookup_options\": \"",
1606                        (unsigned long) rc->db_after,
1607                        (unsigned long) rc->db_before,
1608                        group_method2string(rc->group),
1609                        rc->after,
1610                        rc->before
1611         );
1612         buffer_data_options2string(wb, rc->options);
1613         buffer_strcat(wb, "\",\n");
1614     }
1615
1616     if(rc->calculation) {
1617         health_string2json(wb, "\t\t\t", "calc", rc->calculation->source, ",\n");
1618         health_string2json(wb, "\t\t\t", "calc_parsed", rc->calculation->parsed_as, ",\n");
1619     }
1620
1621     if(rc->warning) {
1622         health_string2json(wb, "\t\t\t", "warn", rc->warning->source, ",\n");
1623         health_string2json(wb, "\t\t\t", "warn_parsed", rc->warning->parsed_as, ",\n");
1624     }
1625
1626     if(rc->critical) {
1627         health_string2json(wb, "\t\t\t", "crit", rc->critical->source, ",\n");
1628         health_string2json(wb, "\t\t\t", "crit_parsed", rc->critical->parsed_as, ",\n");
1629     }
1630
1631     buffer_strcat(wb, "\t\t\t\"green\":");
1632     buffer_rrd_value(wb, rc->green);
1633     buffer_strcat(wb, ",\n");
1634
1635     buffer_strcat(wb, "\t\t\t\"red\":");
1636     buffer_rrd_value(wb, rc->red);
1637     buffer_strcat(wb, ",\n");
1638
1639     buffer_strcat(wb, "\t\t\t\"value\":");
1640     buffer_rrd_value(wb, rc->value);
1641     buffer_strcat(wb, "\n");
1642
1643     buffer_strcat(wb, "\t\t}");
1644 }
1645
1646 //void health_rrdcalctemplate2json_nolock(BUFFER *wb, RRDCALCTEMPLATE *rt) {
1647 //
1648 //}
1649
1650 void health_alarms2json(RRDHOST *host, BUFFER *wb, int all) {
1651     int i;
1652     rrdhost_rdlock(&localhost);
1653
1654     buffer_strcat(wb, "{\n\t\"alarms\": {\n");
1655     RRDCALC *rc;
1656     for(i = 0, rc = host->alarms; rc ; rc = rc->next) {
1657         if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
1658             continue;
1659
1660         if(likely(!all && !(rc->status == RRDCALC_STATUS_WARNING || rc->status == RRDCALC_STATUS_CRITICAL)))
1661             continue;
1662
1663         if(likely(i)) buffer_strcat(wb, ",\n");
1664         health_rrdcalc2json_nolock(wb, rc);
1665         i++;
1666     }
1667
1668 //    buffer_strcat(wb, "\n\t},\n\t\"templates\": {");
1669
1670 //    RRDCALCTEMPLATE *rt;
1671 //    for(rt = host->templates; rt ; rt = rt->next)
1672 //        health_rrdcalctemplate2json_nolock(wb, rt);
1673
1674     buffer_sprintf(wb, "\n\t},\n\t\"now\": %lu\n}\n", (unsigned long)time(NULL));
1675     rrdhost_unlock(&localhost);
1676 }
1677
1678
1679 // ----------------------------------------------------------------------------
1680 // re-load health configuration
1681
1682 static inline void health_free_all_nolock(RRDHOST *host) {
1683     while(host->templates)
1684         rrdcalctemplate_free(host, host->templates);
1685
1686     while(host->alarms)
1687         rrdcalc_free(host, host->alarms);
1688 }
1689
1690 void health_reload(void) {
1691     if(!health_enabled) {
1692         error("Health reload is requested, but health is not enabled.");
1693         return;
1694     }
1695
1696     char *path = health_config_dir();
1697
1698     rrdhost_rwlock(&localhost);
1699     health_free_all_nolock(&localhost);
1700     rrdhost_unlock(&localhost);
1701
1702     RRDSET *st;
1703     for(st = localhost.rrdset_root; st ; st = st->next) {
1704         st->green = NAN;
1705         st->red = NAN;
1706     }
1707
1708     rrdhost_rwlock(&localhost);
1709     health_readdir(path);
1710     rrdhost_unlock(&localhost);
1711
1712     for(st = localhost.rrdset_root; st ; st = st->next) {
1713         rrdhost_rwlock(&localhost);
1714
1715         rrdsetcalc_link_matching(st);
1716         rrdcalctemplate_link_matching(st);
1717
1718         rrdhost_unlock(&localhost);
1719     }
1720 }
1721
1722
1723 // ----------------------------------------------------------------------------
1724 // health main thread and friends
1725
1726 static inline int rrdcalc_value2status(calculated_number n) {
1727     if(isnan(n)) return RRDCALC_STATUS_UNDEFINED;
1728     if(n) return RRDCALC_STATUS_RAISED;
1729     return RRDCALC_STATUS_CLEAR;
1730 }
1731
1732 static inline void health_alarm_execute(ALARM_ENTRY *ae) {
1733     if(ae->old_status == RRDCALC_STATUS_UNINITIALIZED && ae->new_status == RRDCALC_STATUS_CLEAR)
1734         return;
1735
1736     char buffer[FILENAME_MAX + 1];
1737     pid_t command_pid;
1738
1739     const char *exec = ae->exec;
1740     if(!exec) exec = health_default_exec;
1741
1742     snprintfz(buffer, FILENAME_MAX, "exec %s '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s'",
1743               exec,
1744               ae->name,
1745               ae->chart?ae->chart:"NOCAHRT",
1746               ae->family?ae->family:"NOFAMILY",
1747               rrdcalc_status2string(ae->new_status),
1748               rrdcalc_status2string(ae->old_status),
1749               ae->new_value,
1750               ae->old_value,
1751               ae->source?ae->source:"UNKNOWN",
1752               (uint32_t)ae->duration,
1753               (uint32_t)ae->non_clear_duration,
1754               ae->units?ae->units:"",
1755               ae->info?ae->info:""
1756     );
1757
1758     ae->notifications |= HEALTH_ENTRY_NOTIFICATIONS_EXEC_RUN;
1759
1760     debug(D_HEALTH, "executing command '%s'", buffer);
1761     FILE *fp = mypopen(buffer, &command_pid);
1762     if(!fp) {
1763         error("HEALTH: Cannot popen(\"%s\", \"r\").", buffer);
1764         return;
1765     }
1766     debug(D_HEALTH, "HEALTH reading from command");
1767     char *s = fgets(buffer, FILENAME_MAX, fp);
1768     (void)s;
1769     debug(D_HEALTH, "HEALTH closing command");
1770     ae->exec_code = mypclose(fp, command_pid);
1771     debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
1772
1773     if(ae->exec_code != 0)
1774         ae->notifications |= HEALTH_ENTRY_NOTIFICATIONS_EXEC_FAILED;
1775 }
1776
1777 static inline void health_process_notifications(ALARM_ENTRY *ae) {
1778     info("Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
1779          ae->chart?ae->chart:"NOCHART", ae->name,
1780          ae->new_value,
1781          rrdcalc_status2string(ae->old_status),
1782          rrdcalc_status2string(ae->new_status)
1783     );
1784
1785     health_alarm_execute(ae);
1786 }
1787
1788 static inline void health_alarm_log(RRDHOST *host, time_t when,
1789                 const char *name, const char *chart, const char *family,
1790                 const char *exec, time_t duration,
1791                 calculated_number old_value, calculated_number new_value,
1792                 int old_status, int new_status,
1793                 const char *source,
1794                 const char *units,
1795                 const char *info
1796 ) {
1797     ALARM_ENTRY *ae = callocz(1, sizeof(ALARM_ENTRY));
1798     ae->name = strdupz(name);
1799     ae->hash_name = simple_hash(ae->name);
1800
1801     if(chart) {
1802         ae->chart = strdupz(chart);
1803         ae->hash_chart = simple_hash(ae->chart);
1804     }
1805
1806     if(family)
1807         ae->family = strdupz(family);
1808
1809     if(exec) ae->exec = strdupz(exec);
1810     if(source) ae->source = strdupz(source);
1811     if(units) ae->units = strdupz(units);
1812     if(info) ae->info = strdupz(info);
1813
1814     ae->id = host->health_log.nextid++;
1815     ae->when = when;
1816     ae->old_value = old_value;
1817     ae->new_value = new_value;
1818     ae->old_status = old_status;
1819     ae->new_status = new_status;
1820     ae->duration = duration;
1821
1822     if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL)
1823         ae->non_clear_duration += ae->duration;
1824
1825     // link it
1826     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
1827     ae->next = host->health_log.alarms;
1828     host->health_log.alarms = ae;
1829     host->health_log.count++;
1830     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
1831
1832     // match previous alarms
1833     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
1834     ALARM_ENTRY *t;
1835     for(t = host->health_log.alarms ; t ; t = t->next) {
1836         if(t != ae &&
1837                 t->hash_name == ae->hash_name &&
1838                 t->hash_chart == ae->hash_chart &&
1839                 !strcmp(t->name, ae->name) &&
1840                 t->chart && ae->chart && !strcmp(t->chart, ae->chart)) {
1841
1842             if(!(t->notifications & HEALTH_ENTRY_NOTIFICATIONS_UPDATED) && !t->updated_by) {
1843                 t->notifications |= HEALTH_ENTRY_NOTIFICATIONS_UPDATED;
1844                 t->updated_by = ae;
1845
1846                 if((t->new_status == RRDCALC_STATUS_WARNING || t->new_status == RRDCALC_STATUS_CRITICAL) &&
1847                    (t->old_status == RRDCALC_STATUS_WARNING || t->old_status == RRDCALC_STATUS_CRITICAL))
1848                     ae->non_clear_duration += t->non_clear_duration;
1849             }
1850             else {
1851                 // no need to continue
1852                 break;
1853             }
1854         }
1855     }
1856     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
1857 }
1858
1859 static inline void health_alarm_log_process(RRDHOST *host) {
1860     static uint32_t last_processed = 0;
1861     ALARM_ENTRY *ae;
1862
1863     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
1864
1865     for(ae = host->health_log.alarms; ae ;ae = ae->next) {
1866         if(last_processed >= ae->id) break;
1867
1868         if(!(ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_PROCESSED) &&
1869                 !(ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_UPDATED)) {
1870             ae->notifications |= HEALTH_ENTRY_NOTIFICATIONS_PROCESSED;
1871             health_process_notifications(ae);
1872         }
1873     }
1874
1875     if(host->health_log.alarms)
1876         last_processed = host->health_log.alarms->id;
1877
1878     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
1879
1880     if(host->health_log.count <= host->health_log.max)
1881         return;
1882
1883     // cleanup excess entries in the log
1884     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
1885
1886     ALARM_ENTRY *last = NULL;
1887     unsigned int count = host->health_log.max;
1888     for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
1889
1890     if(ae && last && last->next == ae)
1891         last->next = NULL;
1892     else
1893         ae = NULL;
1894
1895     while(ae) {
1896         ALARM_ENTRY *t = ae->next;
1897
1898         freez(ae->name);
1899         freez(ae->chart);
1900         freez(ae->family);
1901         freez(ae->exec);
1902         freez(ae->source);
1903         freez(ae->units);
1904         freez(ae->info);
1905         freez(ae);
1906
1907         ae = t;
1908     }
1909
1910     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
1911 }
1912
1913 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
1914     if (unlikely(!rc->rrdset)) {
1915         debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
1916         return 0;
1917     }
1918
1919     if (unlikely(!rc->rrdset->last_collected_time.tv_sec)) {
1920         debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not yet collected.", rc->chart?rc->chart:"NOCHART", rc->name);
1921         return 0;
1922     }
1923
1924     if (unlikely(!rc->update_every)) {
1925         debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
1926         return 0;
1927     }
1928
1929     if (unlikely(rc->next_update > now)) {
1930         if (unlikely(*next_run > rc->next_update))
1931             *next_run = rc->next_update;
1932
1933         debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
1934         return 0;
1935     }
1936
1937     // FIXME
1938     // we should check that the DB lookup is possible
1939     // i.e.
1940     // - the duration of the chart includes the required timeframe
1941     // we SHOULD NOT check the dimensions - there might be alarms that refer non-existing dimensions (e.g. cpu steal)
1942
1943     return 1;
1944 }
1945
1946 void *health_main(void *ptr) {
1947     (void)ptr;
1948
1949     info("HEALTH thread created with task id %d", gettid());
1950
1951     if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
1952         error("Cannot set pthread cancel type to DEFERRED.");
1953
1954     if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
1955         error("Cannot set pthread cancel state to ENABLE.");
1956
1957     int min_run_every = (int)config_get_number("health", "run at least every seconds", 10);
1958     if(min_run_every < 1) min_run_every = 1;
1959
1960     BUFFER *wb = buffer_create(100);
1961
1962     unsigned int loop = 0;
1963     while(health_enabled) {
1964         loop++;
1965         debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
1966
1967         int oldstate, runnable = 0;
1968         time_t now = time(NULL);
1969         time_t next_run = now + min_run_every;
1970         RRDCALC *rc;
1971
1972         if (unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
1973             error("Cannot set pthread cancel state to DISABLE.");
1974
1975         rrdhost_rdlock(&localhost);
1976
1977         // the first loop is to lookup values from the db
1978         for (rc = localhost.alarms; rc; rc = rc->next) {
1979             if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run)))
1980                 continue;
1981
1982             runnable++;
1983             rc->old_value = rc->value;
1984
1985             // 1. if there is database lookup, do it
1986             // 2. if there is calculation expression, run it
1987
1988             if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
1989                 time_t old_db_timestamp = rc->db_before;
1990                 int value_is_null = 0;
1991
1992                 int ret = rrd2value(rc->rrdset, wb, &rc->value,
1993                                     rc->dimensions, 1, rc->after, rc->before, rc->group,
1994                                     rc->options, &rc->db_after, &rc->db_before, &value_is_null);
1995
1996                 if (unlikely(ret != 200)) {
1997                     // database lookup failed
1998                     rc->value = NAN;
1999
2000                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
2001
2002                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) {
2003                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
2004                         error("Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
2005                     }
2006                 }
2007                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))
2008                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
2009
2010                 if (unlikely(old_db_timestamp == rc->db_before)) {
2011                     // database is stale
2012
2013                     debug(D_HEALTH, "Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
2014
2015                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
2016                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
2017                         error("Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
2018                     }
2019                 }
2020                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
2021                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
2022
2023                 if (unlikely(value_is_null)) {
2024                     // collected value is null
2025
2026                     rc->value = NAN;
2027
2028                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
2029                           rc->chart?rc->chart:"NOCHART", rc->name);
2030
2031                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) {
2032                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
2033                         error("Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
2034                               rc->chart?rc->chart:"NOCHART", rc->name);
2035                     }
2036                 }
2037                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))
2038                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
2039
2040                 debug(D_HEALTH, "Health alarm '%s.%s': database lookup gave value "
2041                         CALCULATED_NUMBER_FORMAT, rc->chart?rc->chart:"NOCHART", rc->name, rc->value);
2042             }
2043
2044             if(unlikely(rc->calculation)) {
2045                 if (unlikely(!expression_evaluate(rc->calculation))) {
2046                     // calculation failed
2047
2048                     rc->value = NAN;
2049
2050                     debug(D_HEALTH, "Health alarm '%s.%s': failed to evaluate calculation with error: %s",
2051                           rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->calculation->error_msg));
2052
2053                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) {
2054                         rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
2055                         error("Health alarm '%s.%s': failed to evaluate calculation with error: %s",
2056                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->calculation->error_msg));
2057                     }
2058                 }
2059                 else {
2060                     if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))
2061                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
2062
2063                     debug(D_HEALTH, "Health alarm '%s.%s': calculation expression gave value "
2064                             CALCULATED_NUMBER_FORMAT
2065                             ": %s (source: %s)",
2066                           rc->chart?rc->chart:"NOCHART", rc->name,
2067                           rc->calculation->result,
2068                           buffer_tostring(rc->calculation->error_msg),
2069                           rc->source
2070                     );
2071
2072                     rc->value = rc->calculation->result;
2073                 }
2074             }
2075         }
2076         rrdhost_unlock(&localhost);
2077
2078         if (runnable) {
2079             rrdhost_rdlock(&localhost);
2080
2081             for (rc = localhost.alarms; rc; rc = rc->next) {
2082                 if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run)))
2083                     continue;
2084
2085                 int warning_status  = RRDCALC_STATUS_UNDEFINED;
2086                 int critical_status = RRDCALC_STATUS_UNDEFINED;
2087
2088                 if(likely(rc->warning)) {
2089                     if(unlikely(!expression_evaluate(rc->warning))) {
2090                         // calculation failed
2091
2092                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression failed with error: %s",
2093                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
2094
2095                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) {
2096                             rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
2097                             error("Health alarm '%s.%s': warning expression failed with error: %s",
2098                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
2099                         }
2100                     }
2101                     else {
2102                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))
2103                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
2104
2105                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression gave value "
2106                                 CALCULATED_NUMBER_FORMAT
2107                                 ": %s (source: %s)",
2108                               rc->chart?rc->chart:"NOCHART", rc->name,
2109                               rc->warning->result,
2110                               buffer_tostring(rc->warning->error_msg),
2111                               rc->source
2112                         );
2113
2114                         warning_status = rrdcalc_value2status(rc->warning->result);
2115                     }
2116                 }
2117
2118                 if(likely(rc->critical)) {
2119                     if(unlikely(!expression_evaluate(rc->critical))) {
2120                         // calculation failed
2121
2122                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression failed with error: %s",
2123                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
2124
2125                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) {
2126                             rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
2127                             error("Health alarm '%s.%s': critical expression failed with error: %s",
2128                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
2129                         }
2130                     }
2131                     else {
2132                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))
2133                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
2134
2135                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression gave value "
2136                                 CALCULATED_NUMBER_FORMAT
2137                                 ": %s (source: %s)",
2138                               rc->chart?rc->chart:"NOCHART", rc->name,
2139                               rc->critical->result,
2140                               buffer_tostring(rc->critical->error_msg),
2141                               rc->source
2142                         );
2143
2144                         critical_status = rrdcalc_value2status(rc->critical->result);
2145                     }
2146                 }
2147
2148                 int status = RRDCALC_STATUS_UNDEFINED;
2149
2150                 switch(warning_status) {
2151                     case RRDCALC_STATUS_CLEAR:
2152                         status = RRDCALC_STATUS_CLEAR;
2153                         break;
2154
2155                     case RRDCALC_STATUS_RAISED:
2156                         status = RRDCALC_STATUS_WARNING;
2157                         break;
2158
2159                     default:
2160                         break;
2161                 }
2162
2163                 switch(critical_status) {
2164                     case RRDCALC_STATUS_CLEAR:
2165                         if(status == RRDCALC_STATUS_UNDEFINED)
2166                             status = RRDCALC_STATUS_CLEAR;
2167                         break;
2168
2169                     case RRDCALC_STATUS_RAISED:
2170                         status = RRDCALC_STATUS_CRITICAL;
2171                         break;
2172
2173                     default:
2174                         break;
2175                 }
2176
2177                 if(status != rc->status) {
2178                     health_alarm_log(&localhost, time(NULL), rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info);
2179                     rc->last_status_change = now;
2180                     rc->status = status;
2181                 }
2182
2183                 rc->last_updated = now;
2184                 rc->next_update = now + rc->update_every;
2185
2186                 if (next_run > rc->next_update)
2187                     next_run = rc->next_update;
2188             }
2189
2190             rrdhost_unlock(&localhost);
2191         }
2192
2193         if (unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
2194             error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
2195
2196         // execute notifications
2197         // and cleanup
2198         health_alarm_log_process(&localhost);
2199
2200         now = time(NULL);
2201         if(now < next_run) {
2202             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs",
2203                   loop, (int) (next_run - now));
2204             sleep_usec(1000000 * (unsigned long long) (next_run - now));
2205         }
2206         else {
2207             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
2208         }
2209     }
2210
2211     buffer_free(wb);
2212
2213     info("HEALTH thread exiting");
2214     pthread_exit(NULL);
2215     return NULL;
2216 }