arthur.barton.de Git - netdata.git/blob - src/health.c
alarms status API ready
[netdata.git] / src / health.c
1 #include "common.h"
2
// size used for the stack buffers in which variable names are composed
// (buffers in this file are declared as RRDVAR_MAX_LENGTH + 1 bytes)
#define RRDVAR_MAX_LENGTH 1024

// path of the alarm-email.sh script under PLUGINS_DIR
// NOTE(review): presumably the command used when an alarm defines no 'exec' - confirm
static const char *health_default_exec = PLUGINS_DIR "/alarm-email.sh";

// initialized to 1
// NOTE(review): presumably the global on/off switch of health monitoring - confirm
int health_enabled = 1;

// the alarm log, starting empty: no entries, ids starting at 0, max set to 1000
ALARM_LOG health_log = {
        .nextid = 0,
        .count = 0,
        .max = 1000,
        .alarms = NULL
};
14
15 // ----------------------------------------------------------------------------
16 // RRDVAR management
17
// Sanitize a variable name in place.
//
// Every character that is not alphanumeric, '.' or '_' is overwritten
// with '_'. The length of the string never changes.
//
// Returns the number of characters that were replaced.
static inline int rrdvar_fix_name(char *variable) {
    int fixed = 0;

    for( ; *variable ; variable++) {
        // <ctype.h> classifiers are only defined for values representable
        // as unsigned char (or EOF); a plain char may be negative, so
        // convert before classifying
        unsigned char c = (unsigned char)*variable;

        if(!isalnum(c) && c != '.' && c != '_') {
            *variable = '_';
            fixed++;
        }
    }

    return fixed;
}
31
32 int rrdvar_compare(void* a, void* b) {
33     if(((RRDVAR *)a)->hash < ((RRDVAR *)b)->hash) return -1;
34     else if(((RRDVAR *)a)->hash > ((RRDVAR *)b)->hash) return 1;
35     else return strcmp(((RRDVAR *)a)->name, ((RRDVAR *)b)->name);
36 }
37
38 static inline RRDVAR *rrdvar_index_add(avl_tree_lock *tree, RRDVAR *rv) {
39     RRDVAR *ret = (RRDVAR *)avl_insert_lock(tree, (avl *)(rv));
40     if(ret != rv)
41         debug(D_VARIABLES, "Request to insert RRDVAR '%s' into index failed. Already exists.", rv->name);
42
43     return ret;
44 }
45
46 static inline RRDVAR *rrdvar_index_del(avl_tree_lock *tree, RRDVAR *rv) {
47     RRDVAR *ret = (RRDVAR *)avl_remove_lock(tree, (avl *)(rv));
48     if(!ret)
49         error("Request to remove RRDVAR '%s' from index failed. Not Found.", rv->name);
50
51     return ret;
52 }
53
54 static inline RRDVAR *rrdvar_index_find(avl_tree_lock *tree, const char *name, uint32_t hash) {
55     RRDVAR tmp;
56     tmp.name = (char *)name;
57     tmp.hash = (hash)?hash:simple_hash(tmp.name);
58
59     return (RRDVAR *)avl_search_lock(tree, (avl *)&tmp);
60 }
61
62 static inline void rrdvar_free(RRDHOST *host, avl_tree_lock *tree, RRDVAR *rv) {
63     (void)host;
64
65     if(!rv) return;
66
67     if(tree)
68         rrdvar_index_del(tree, rv);
69
70     freez(rv->name);
71     freez(rv);
72 }
73
74 static inline RRDVAR *rrdvar_create_and_index(const char *scope, avl_tree_lock *tree, const char *name, int type, calculated_number *value) {
75     char *variable = strdupz(name);
76     rrdvar_fix_name(variable);
77     uint32_t hash = simple_hash(variable);
78
79     RRDVAR *rv = rrdvar_index_find(tree, variable, hash);
80     if(unlikely(!rv)) {
81         debug(D_VARIABLES, "Variable '%s' not found in scope '%s'. Creating a new one.", variable, scope);
82
83         rv = callocz(1, sizeof(RRDVAR));
84         rv->name = variable;
85         rv->hash = hash;
86         rv->type = type;
87         rv->value = value;
88
89         RRDVAR *ret = rrdvar_index_add(tree, rv);
90         if(unlikely(ret != rv)) {
91             debug(D_VARIABLES, "Variable '%s' in scope '%s' already exists", variable, scope);
92             rrdvar_free(NULL, NULL, rv);
93             rv = NULL;
94         }
95         else
96             debug(D_VARIABLES, "Variable '%s' created in scope '%s'", variable, scope);
97     }
98     else {
99         // already exists
100         freez(variable);
101         rv = NULL;
102     }
103
104     return rv;
105 }
106
107 // ----------------------------------------------------------------------------
108 // RRDVAR lookup
109
110 calculated_number rrdvar2number(RRDVAR *rv) {
111     switch(rv->type) {
112         case RRDVAR_TYPE_CALCULATED: {
113             calculated_number *n = (calculated_number *)rv->value;
114             return *n;
115         }
116
117         case RRDVAR_TYPE_TIME_T: {
118             time_t *n = (time_t *)rv->value;
119             return *n;
120         }
121
122         case RRDVAR_TYPE_COLLECTED: {
123             collected_number *n = (collected_number *)rv->value;
124             return *n;
125         }
126
127         case RRDVAR_TYPE_TOTAL: {
128             total_number *n = (total_number *)rv->value;
129             return *n;
130         }
131
132         case RRDVAR_TYPE_INT: {
133             int *n = (int *)rv->value;
134             return *n;
135         }
136
137         default:
138             error("I don't know how to convert RRDVAR type %d to calculated_number", rv->type);
139             return NAN;
140     }
141 }
142
143 void dump_variable(void *data) {
144     RRDVAR *rv = (RRDVAR *)data;
145     debug(D_HEALTH, "%50s : %20.5Lf", rv->name, rrdvar2number(rv));
146 }
147
148 int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, calculated_number *result) {
149     RRDSET *st = rc->rrdset;
150     RRDVAR *rv;
151
152     if(!st) return 0;
153
154     rv = rrdvar_index_find(&st->variables_root_index, variable, hash);
155     if(rv) {
156         *result = rrdvar2number(rv);
157         return 1;
158     }
159
160     rv = rrdvar_index_find(&st->rrdfamily->variables_root_index, variable, hash);
161     if(rv) {
162         *result = rrdvar2number(rv);
163         return 1;
164     }
165
166     rv = rrdvar_index_find(&st->rrdhost->variables_root_index, variable, hash);
167     if(rv) {
168         *result = rrdvar2number(rv);
169         return 1;
170     }
171
172     debug(D_HEALTH, "Available local chart '%s' variables:", st->id);
173     avl_traverse_lock(&st->variables_root_index, dump_variable);
174
175     debug(D_HEALTH, "Available family '%s' variables:", st->rrdfamily->family);
176     avl_traverse_lock(&st->rrdfamily->variables_root_index, dump_variable);
177
178     debug(D_HEALTH, "Available host '%s' variables:", st->rrdhost->hostname);
179     avl_traverse_lock(&st->rrdhost->variables_root_index, dump_variable);
180
181     return 0;
182 }
183
184 // ----------------------------------------------------------------------------
185 // RRDSETVAR management
186
187 RRDSETVAR *rrdsetvar_create(RRDSET *st, const char *variable, int type, void *value, uint32_t options) {
188     debug(D_VARIABLES, "RRDVARSET create for chart id '%s' name '%s' with variable name '%s'", st->id, st->name, variable);
189     RRDSETVAR *rs = (RRDSETVAR *)callocz(1, sizeof(RRDSETVAR));
190
191     char buffer[RRDVAR_MAX_LENGTH + 1];
192     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->id, variable);
193     rs->fullid = strdupz(buffer);
194
195     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, variable);
196     rs->fullname = strdupz(buffer);
197
198     rs->variable = strdupz(variable);
199
200     rs->type = type;
201     rs->value = value;
202     rs->options = options;
203     rs->rrdset = st;
204
205     rs->local       = rrdvar_create_and_index("local",  &st->variables_root_index, rs->variable, rs->type, rs->value);
206     rs->family      = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullid, rs->type, rs->value);
207     rs->host        = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullid, rs->type, rs->value);
208     rs->family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullname, rs->type, rs->value);
209     rs->host_name   = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullname, rs->type, rs->value);
210
211     rs->next = st->variables;
212     st->variables = rs;
213
214     return rs;
215 }
216
217 void rrdsetvar_rename_all(RRDSET *st) {
218     debug(D_VARIABLES, "RRDSETVAR rename for chart id '%s' name '%s'", st->id, st->name);
219
220     // only these 2 can change name
221     // rs->family_name
222     // rs->host_name
223
224     char buffer[RRDVAR_MAX_LENGTH + 1];
225     RRDSETVAR *rs, *next = st->variables;
226     while((rs = next)) {
227         next = rs->next;
228
229         snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rs->variable);
230
231         if (strcmp(buffer, rs->fullname)) {
232             // name changed
233             rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
234             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_name);
235
236             freez(rs->fullname);
237             rs->fullname = strdupz(st->name);
238             rs->family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullname, rs->type, rs->value);
239             rs->host_name   = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullname, rs->type, rs->value);
240         }
241     }
242
243     rrdsetcalc_link_matching(st);
244 }
245
246 void rrdsetvar_free(RRDSETVAR *rs) {
247     RRDSET *st = rs->rrdset;
248     debug(D_VARIABLES, "RRDSETVAR free for chart id '%s' name '%s', variable '%s'", st->id, st->name, rs->variable);
249
250     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local);
251     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family);
252     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host);
253     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
254     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_name);
255
256     if(st->variables == rs) {
257         st->variables = rs->next;
258     }
259     else {
260         RRDSETVAR *t;
261         for (t = st->variables; t && t->next != rs; t = t->next);
262         if(!t) error("RRDSETVAR '%s' not found in chart '%s' variables linked list", rs->fullname, st->id);
263         else t->next = rs->next;
264     }
265
266     freez(rs->fullid);
267     freez(rs->fullname);
268     freez(rs->variable);
269     freez(rs);
270 }
271
272 // ----------------------------------------------------------------------------
273 // RRDDIMVAR management
274
275 #define RRDDIMVAR_ID_MAX 1024
276
277 RRDDIMVAR *rrddimvar_create(RRDDIM *rd, int type, const char *prefix, const char *suffix, void *value, uint32_t options) {
278     RRDSET *st = rd->rrdset;
279
280     debug(D_VARIABLES, "RRDDIMSET create for chart id '%s' name '%s', dimension id '%s', name '%s%s%s'", st->id, st->name, rd->id, (prefix)?prefix:"", rd->name, (suffix)?suffix:"");
281
282     if(!prefix) prefix = "";
283     if(!suffix) suffix = "";
284
285     char buffer[RRDDIMVAR_ID_MAX + 1];
286     RRDDIMVAR *rs = (RRDDIMVAR *)callocz(1, sizeof(RRDDIMVAR));
287
288     rs->prefix = strdupz(prefix);
289     rs->suffix = strdupz(suffix);
290
291     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->id, rs->suffix);
292     rs->id = strdupz(buffer);
293
294     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
295     rs->name = strdupz(buffer);
296
297     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->id, rs->id);
298     rs->fullidid = strdupz(buffer);
299
300     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->id, rs->name);
301     rs->fullidname = strdupz(buffer);
302
303     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->name, rs->id);
304     rs->fullnameid = strdupz(buffer);
305
306     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->name, rs->name);
307     rs->fullnamename = strdupz(buffer);
308
309     rs->type = type;
310     rs->value = value;
311     rs->options = options;
312     rs->rrddim = rd;
313
314     rs->local_id     = rrdvar_create_and_index("local", &st->variables_root_index, rs->id, rs->type, rs->value);
315     rs->local_name   = rrdvar_create_and_index("local", &st->variables_root_index, rs->name, rs->type, rs->value);
316
317     rs->family_id    = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->id, rs->type, rs->value);
318     rs->family_name  = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->name, rs->type, rs->value);
319
320     rs->host_fullidid     = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullidid, rs->type, rs->value);
321     rs->host_fullidname   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullidname, rs->type, rs->value);
322     rs->host_fullnameid   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullnameid, rs->type, rs->value);
323     rs->host_fullnamename = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullnamename, rs->type, rs->value);
324
325     rs->next = rd->variables;
326     rd->variables = rs;
327
328     return rs;
329 }
330
331 void rrddimvar_rename_all(RRDDIM *rd) {
332     RRDSET *st = rd->rrdset;
333     debug(D_VARIABLES, "RRDDIMSET rename for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
334
335     RRDDIMVAR *rs, *next = rd->variables;
336     while((rs = next)) {
337         next = rs->next;
338
339         if (strcmp(rd->name, rs->name)) {
340             char buffer[RRDDIMVAR_ID_MAX + 1];
341             // name changed
342
343             // name
344             rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_name);
345             freez(rs->name);
346             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
347             rs->name = strdupz(buffer);
348             rs->local_name = rrdvar_create_and_index("local", &st->variables_root_index, rs->name, rs->type, rs->value);
349
350             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidname);
351             freez(rs->fullidname);
352             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->name);
353             rs->fullidname = strdupz(buffer);
354             rs->host_fullidname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index,
355                                                              rs->fullidname, rs->type, rs->value);
356
357             // fullnameid
358             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnameid);
359             freez(rs->fullnameid);
360             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->id);
361             rs->fullnameid = strdupz(buffer);
362             rs->host_fullnameid = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index,
363                                                           rs->fullnameid, rs->type, rs->value);
364
365             // fullnamename
366             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnamename);
367             freez(rs->fullnamename);
368             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->name);
369             rs->fullnamename = strdupz(buffer);
370             rs->host_fullnamename = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index,
371                                                           rs->fullnamename, rs->type, rs->value);
372         }
373     }
374 }
375
376 void rrddimvar_free(RRDDIMVAR *rs) {
377     RRDDIM *rd = rs->rrddim;
378     RRDSET *st = rd->rrdset;
379     debug(D_VARIABLES, "RRDDIMSET free for chart id '%s' name '%s', dimension id '%s', name '%s', prefix='%s', suffix='%s'", st->id, st->name, rd->id, rd->name, rs->prefix, rs->suffix);
380
381     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_id);
382     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_name);
383
384     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_id);
385     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
386
387     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidid);
388     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidname);
389     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnameid);
390     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnamename);
391
392     if(rd->variables == rs) {
393         debug(D_VARIABLES, "RRDDIMSET removing first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
394         rd->variables = rs->next;
395     }
396     else {
397         debug(D_VARIABLES, "RRDDIMSET removing non-first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
398         RRDDIMVAR *t;
399         for (t = rd->variables; t && t->next != rs; t = t->next) ;
400         if(!t) error("RRDDIMVAR '%s' not found in dimension '%s/%s' variables linked list", rs->name, st->id, rd->id);
401         else t->next = rs->next;
402     }
403
404     freez(rs->prefix);
405     freez(rs->suffix);
406     freez(rs->id);
407     freez(rs->name);
408     freez(rs->fullidid);
409     freez(rs->fullidname);
410     freez(rs->fullnameid);
411     freez(rs->fullnamename);
412     freez(rs);
413 }
414
415 // ----------------------------------------------------------------------------
416 // RRDCALC management
417
418 static inline const char *rrdcalc_status2string(int status) {
419     switch(status) {
420         case RRDCALC_STATUS_UNINITIALIZED:
421             return "UNINITIALIZED";
422
423         case RRDCALC_STATUS_UNDEFINED:
424             return "UNDEFINED";
425
426         case RRDCALC_STATUS_CLEAR:
427             return "CLEAR";
428
429         case RRDCALC_STATUS_RAISED:
430             return "RAISED";
431
432         case RRDCALC_STATUS_WARNING:
433             return "WARNING";
434
435         case RRDCALC_STATUS_CRITICAL:
436             return "CRITICAL";
437
438         default:
439             return "UNKNOWN";
440     }
441 }
442
443 static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
444     debug(D_HEALTH, "Health linking alarm '%s.%s' to chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, st->rrdhost->hostname);
445
446     rc->rrdset = st;
447
448     rc->rrdset_next = st->alarms;
449     rc->rrdset_prev = NULL;
450     st->alarms = rc;
451
452     if(rc->update_every < rc->rrdset->update_every) {
453         error("Health alarm '%s.%s' has update every %d, less than chart update every %d. Setting alarm update frequency to %d.", rc->rrdset->id, rc->name, rc->update_every, rc->rrdset->update_every, rc->rrdset->update_every);
454         rc->update_every = rc->rrdset->update_every;
455     }
456
457     if(!isnan(rc->green) && isnan(st->green)) {
458         debug(D_HEALTH, "Health alarm '%s.%s' green threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->green, rc->green);
459         st->green = rc->green;
460     }
461
462     if(!isnan(rc->red) && isnan(st->red)) {
463         debug(D_HEALTH, "Health alarm '%s.%s' red threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->red, rc->red);
464         st->red = rc->red;
465     }
466
467     rc->local  = rrdvar_create_and_index("local",  &st->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
468     rc->family = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
469
470     char fullname[RRDVAR_MAX_LENGTH + 1];
471     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->id, rc->name);
472     rc->hostid   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
473
474     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rc->name);
475     rc->hostname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
476 }
477
478 static inline int rrdcalc_is_matching_this_rrdset(RRDCALC *rc, RRDSET *st) {
479     if(     (rc->hash_chart == st->hash      && !strcmp(rc->chart, st->id)) ||
480             (rc->hash_chart == st->hash_name && !strcmp(rc->chart, st->name)))
481         return 1;
482
483     return 0;
484 }
485
486 // this has to be called while the RRDHOST is locked
487 inline void rrdsetcalc_link_matching(RRDSET *st) {
488     // debug(D_HEALTH, "find matching alarms for chart '%s'", st->id);
489
490     RRDCALC *rc;
491     for(rc = st->rrdhost->alarms; rc ; rc = rc->next) {
492         if(rc->rrdset) continue;
493
494         if(rrdcalc_is_matching_this_rrdset(rc, st))
495             rrdsetcalc_link(st, rc);
496     }
497 }
498
499 // this has to be called while the RRDHOST is locked
500 inline void rrdsetcalc_unlink(RRDCALC *rc) {
501     RRDSET *st = rc->rrdset;
502
503     if(!st) {
504         error("Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
505         return;
506     }
507
508     RRDHOST *host = st->rrdhost;
509
510     debug(D_HEALTH, "Health unlinking alarm '%s.%s' from chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, host->hostname);
511
512     // unlink it
513     if(rc->rrdset_prev)
514         rc->rrdset_prev->rrdset_next = rc->rrdset_next;
515
516     if(rc->rrdset_next)
517         rc->rrdset_next->rrdset_prev = rc->rrdset_prev;
518
519     if(st->alarms == rc)
520         st->alarms = rc->rrdset_next;
521
522     rc->rrdset_prev = rc->rrdset_next = NULL;
523
524     rrdvar_free(st->rrdhost, &st->variables_root_index, rc->local);
525     rc->local = NULL;
526
527     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rc->family);
528     rc->family = NULL;
529
530     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostid);
531     rc->hostid = NULL;
532
533     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostname);
534     rc->hostname = NULL;
535
536     rc->rrdset = NULL;
537
538     // RRDCALC will remain in RRDHOST
539     // so that if the matching chart is found in the future
540     // it will be applied automatically
541 }
542
543 RRDCALC *rrdcalc_find(RRDSET *st, const char *name) {
544     RRDCALC *rc;
545     uint32_t hash = simple_hash(name);
546
547     for( rc = st->alarms; rc ; rc = rc->rrdset_next ) {
548         if(rc->hash == hash && !strcmp(rc->name, name))
549             return rc;
550     }
551
552     return NULL;
553 }
554
555 static inline int rrdcalc_exists(RRDHOST *host, const char *name, uint32_t hash) {
556     RRDCALC *rc;
557
558     // make sure it does not already exist
559     for(rc = host->alarms; rc ; rc = rc->next) {
560         if (rc->hash == hash && !strcmp(name, rc->name)) {
561             error("Health alarm '%s' already exists in host '%s'.", name, host->hostname);
562             return 1;
563         }
564     }
565
566     return 0;
567 }
568
569 static inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) {
570     rrdhost_check_rdlock(host);
571
572     if(rc->calculation) {
573         rc->calculation->this = &rc->value;
574         rc->calculation->after = &rc->db_after;
575         rc->calculation->before = &rc->db_before;
576         rc->calculation->rrdcalc = rc;
577     }
578
579     if(rc->warning) {
580         rc->warning->this = &rc->value;
581         rc->warning->after = &rc->db_after;
582         rc->warning->before = &rc->db_before;
583         rc->warning->rrdcalc = rc;
584     }
585
586     if(rc->critical) {
587         rc->critical->this = &rc->value;
588         rc->critical->after = &rc->db_after;
589         rc->critical->before = &rc->db_before;
590         rc->critical->rrdcalc = rc;
591     }
592
593     // link it to the host
594     rc->next = host->alarms;
595     host->alarms = rc;
596
597     // link it to its chart
598     RRDSET *st;
599     for(st = host->rrdset_root; st ; st = st->next) {
600         if(rrdcalc_is_matching_this_rrdset(rc, st)) {
601             rrdsetcalc_link(st, rc);
602             break;
603         }
604     }
605 }
606
// Build the full name of an alarm into 'fullname' (a buffer of 'len'
// bytes): "<chart>.<name>", or just "<name>" when 'chart' is NULL.
// The result is sanitized with rrdvar_fix_name().
// Returns the hash of the resulting full name.
static inline uint32_t rrdcalc_fullname(char *fullname, size_t len, const char *chart, const char *name) {
    const char *chart_part = "";
    const char *separator  = "";

    if(chart) {
        chart_part = chart;
        separator  = ".";
    }

    snprintfz(fullname, len - 1, "%s%s%s", chart_part, separator, name);
    rrdvar_fix_name(fullname);

    return simple_hash(fullname);
}
612
613 static inline RRDCALC *rrdcalc_create(RRDHOST *host, const char *name, const char *chart, const char *dimensions, int group_method,
614                         int after, int before, int update_every, uint32_t options,
615                         calculated_number green, calculated_number red,
616                         const char *exec, const char *source,
617                         const char *calc, const char *warn, const char *crit) {
618
619     char fullname[RRDVAR_MAX_LENGTH + 1];
620     uint32_t hash = rrdcalc_fullname(fullname, RRDVAR_MAX_LENGTH + 1, chart, name);
621
622     if(rrdcalc_exists(host, fullname, hash))
623         return NULL;
624
625     RRDCALC *rc = callocz(1, sizeof(RRDCALC));
626
627     rc->name = strdupz(name);
628     rc->hash = simple_hash(rc->name);
629
630     rc->chart = strdupz(chart);
631     rc->hash_chart = simple_hash(rc->chart);
632
633     if(dimensions) rc->dimensions = strdupz(dimensions);
634
635     rc->green = green;
636     rc->red = red;
637     rc->value = NAN;
638     rc->old_value = NAN;
639
640     rc->group = group_method;
641     rc->after = after;
642     rc->before = before;
643     rc->update_every = update_every;
644     rc->options = options;
645
646     if(exec) rc->exec = strdupz(exec);
647     if(source) rc->source = strdupz(source);
648
649     if(calc) {
650         rc->calculation = expression_parse(calc, NULL, NULL);
651         if(!rc->calculation)
652             error("Health alarm '%s.%s': failed to parse calculation expression '%s'", chart, name, calc);
653     }
654     if(warn) {
655         rc->warning = expression_parse(warn, NULL, NULL);
656         if(!rc->warning)
657             error("Health alarm '%s.%s': failed to re-parse warning expression '%s'", chart, name, warn);
658     }
659     if(crit) {
660         rc->critical = expression_parse(crit, NULL, NULL);
661         if(!rc->critical)
662             error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", chart, name, crit);
663     }
664
665     debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s",
666           (rc->chart)?rc->chart:"NOCHART",
667           rc->name,
668           (rc->exec)?rc->exec:"DEFAULT",
669           rc->green,
670           rc->red,
671           rc->group,
672           rc->after,
673           rc->before,
674           rc->options,
675           (rc->dimensions)?rc->dimensions:"NONE",
676           rc->update_every,
677           (rc->calculation)?rc->calculation->parsed_as:"NONE",
678           (rc->warning)?rc->warning->parsed_as:"NONE",
679           (rc->critical)?rc->critical->parsed_as:"NONE",
680           rc->source
681     );
682
683     rrdcalc_create_part2(host, rc);
684     return rc;
685 }
686
687 void rrdcalc_free(RRDHOST *host, RRDCALC *rc) {
688     if(!rc) return;
689
690     debug(D_HEALTH, "Health removing alarm '%s.%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
691
692     // unlink it from RRDSET
693     if(rc->rrdset) rrdsetcalc_unlink(rc);
694
695     // unlink it from RRDHOST
696     if(rc == host->alarms)
697         host->alarms = rc->next;
698
699     else if(host->alarms) {
700         RRDCALC *t, *last = host->alarms;
701
702         for(t = last->next; t && t != rc; last = t, t = t->next) ;
703         if(last && last->next == rc)
704             last->next = rc->next;
705         else
706             error("Cannot unlink alarm '%s.%s' from host '%s': not found", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
707     }
708     else
709         error("Cannot unlink unlink '%s.%s' from host '%s': This host does not have any calculations", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
710
711     expression_free(rc->calculation);
712     expression_free(rc->warning);
713     expression_free(rc->critical);
714
715     freez(rc->source);
716     freez(rc->name);
717     freez(rc->chart);
718     freez(rc->dimensions);
719     freez(rc->exec);
720     freez(rc);
721 }
722
723 // ----------------------------------------------------------------------------
724 // RRDCALCTEMPLATE management
725
726 void rrdcalctemplate_link_matching(RRDSET *st) {
727     RRDCALCTEMPLATE *rt;
728
729     for(rt = st->rrdhost->templates; rt ; rt = rt->next) {
730         if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context)) {
731
732             RRDCALC *rc = rrdcalc_create(st->rrdhost, rt->name, st->id,
733                            rt->dimensions, rt->group, rt->after, rt->before, rt->update_every, rt->options,
734                            rt->green, rt->red, rt->exec, rt->source,
735                            (rt->calculation)?rt->calculation->source:NULL,
736                            (rt->warning)?rt->warning->source:NULL,
737                            (rt->critical)?rt->critical->source:NULL);
738
739             if(!rc)
740                 error("Health tried to create alarm from template '%s', but it failed", rt->name);
741
742 #ifdef NETDATA_INTERNAL_CHECKS
743             else if(rc->rrdset != st)
744                 error("Health alarm '%s.%s' should be linked to chart '%s', but it is not", rc->chart?rc->chart:"NOCHART", rc->name, st->id);
745 #else
746             (void)rc;
747 #endif
748         }
749     }
750 }
751
752 static inline void rrdcalctemplate_free(RRDHOST *host, RRDCALCTEMPLATE *rt) {
753     debug(D_HEALTH, "Health removing template '%s' of host '%s'", rt->name, host->hostname);
754
755     if(host->templates) {
756         if(host->templates == rt) {
757             host->templates = rt->next;
758         }
759         else {
760             RRDCALCTEMPLATE *t, *last = host->templates;
761             for (t = last->next; t && t != rt; last = t, t = t->next ) ;
762             if(last && last->next == rt) {
763                 last->next = rt->next;
764                 rt->next = NULL;
765             }
766             else
767                 error("Cannot find RRDCALCTEMPLATE '%s' linked in host '%s'", rt->name, host->hostname);
768         }
769     }
770
771     expression_free(rt->calculation);
772     expression_free(rt->warning);
773     expression_free(rt->critical);
774
775     freez(rt->dimensions);
776     freez(rt->context);
777     freez(rt->name);
778     freez(rt->exec);
779     freez(rt->source);
780     freez(rt);
781 }
782
783 // ----------------------------------------------------------------------------
784 // load health configuration
785
// NOTE(review): presumably the maximum length of a health configuration
// line - the parser is outside this view, confirm
#define HEALTH_CONF_MAX_LINE 4096

// keywords of the health configuration
// NOTE(review): presumably matched against the key of each configuration
// line by the loader - the parser is outside this view, confirm
#define HEALTH_ALARM_KEY "alarm"
#define HEALTH_TEMPLATE_KEY "template"
#define HEALTH_ON_KEY "on"
#define HEALTH_LOOKUP_KEY "lookup"
#define HEALTH_CALC_KEY "calc"
#define HEALTH_EVERY_KEY "every"
#define HEALTH_GREEN_KEY "green"
#define HEALTH_RED_KEY "red"
#define HEALTH_WARN_KEY "warn"
#define HEALTH_CRIT_KEY "crit"
#define HEALTH_EXEC_KEY "exec"
799
800 static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
801     {
802         char fullname[RRDVAR_MAX_LENGTH + 1];
803         uint32_t hash = rrdcalc_fullname(fullname, RRDVAR_MAX_LENGTH + 1, rc->chart, rc->name);
804
805         if (rrdcalc_exists(host, fullname, hash))
806             return 0;
807     }
808
809     if(!rc->chart) {
810         error("Health configuration for alarm '%s' does not have a chart", rc->name);
811         return 0;
812     }
813
814     if(!rc->update_every) {
815         error("Health configuration for alarm '%s.%s' has no frequency (parameter 'every'). Ignoring it.", rc->chart?rc->chart:"NOCHART", rc->name);
816         return 0;
817     }
818
819     if(!RRDCALC_HAS_DB_LOOKUP(rc) && !rc->warning && !rc->critical) {
820         error("Health configuration for alarm '%s.%s' is useless (no calculation, no warning and no critical evaluation)", rc->chart?rc->chart:"NOCHART", rc->name);
821         return 0;
822     }
823
824     debug(D_HEALTH, "Health configuration adding alarm '%s.%s': exec '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s",
825           rc->chart?rc->chart:"NOCHART",
826           rc->name,
827           (rc->exec)?rc->exec:"DEFAULT",
828           rc->green,
829           rc->red,
830           rc->group,
831           rc->after,
832           rc->before,
833           rc->options,
834           (rc->dimensions)?rc->dimensions:"NONE",
835           rc->update_every,
836           (rc->calculation)?rc->calculation->parsed_as:"NONE",
837           (rc->warning)?rc->warning->parsed_as:"NONE",
838           (rc->critical)?rc->critical->parsed_as:"NONE",
839           rc->source
840     );
841
842     rrdcalc_create_part2(host, rc);
843     return 1;
844 }
845
846 static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCALCTEMPLATE *rt) {
847     if(!rt->context) {
848         error("Health configuration for template '%s' does not have a context", rt->name);
849         return 0;
850     }
851
852     if(!rt->update_every) {
853         error("Health configuration for template '%s' has no frequency (parameter 'every'). Ignoring it.", rt->name);
854         return 0;
855     }
856
857     if(!RRDCALCTEMPLATE_HAS_CALCULATION(rt) && !rt->warning && !rt->critical) {
858         error("Health configuration for template '%s' is useless (no calculation, no warning and no critical evaluation)", rt->name);
859         return 0;
860     }
861
862     RRDCALCTEMPLATE *t;
863     for (t = host->templates; t ; t = t->next) {
864         if(t->hash_name == rt->hash_name && !strcmp(t->name, rt->name)) {
865             error("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname);
866             return 0;
867         }
868     }
869
870     debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s'",
871           rt->name,
872           (rt->context)?rt->context:"NONE",
873           (rt->exec)?rt->exec:"DEFAULT",
874           rt->green,
875           rt->red,
876           rt->group,
877           rt->after,
878           rt->before,
879           rt->options,
880           (rt->dimensions)?rt->dimensions:"NONE",
881           rt->update_every,
882           (rt->calculation)?rt->calculation->parsed_as:"NONE",
883           (rt->warning)?rt->warning->parsed_as:"NONE",
884           (rt->critical)?rt->critical->parsed_as:"NONE",
885           rt->source
886     );
887
888     rt->next = host->templates;
889     host->templates = rt;
890     return 1;
891 }
892
// Parse a duration such as "10", "-5m" or "1.5h" into seconds.
//
// The text must start with a digit or a sign and is converted with strtold().
// The first character after the number selects the unit:
//   Y = 365 days, M = 30 days, w = weeks, d = days, h = hours, m = minutes,
//   s, no suffix, or any other character = seconds.
// The product is truncated to int.
//
// Returns 1 and stores the seconds in *result on success.
// Returns 0 and stores 0 in *result when the text is not a number, including
// the case of a sign that is not followed by anything strtold() can convert.
//
// NOTE(review): values that do not fit in an int are not detected here.
static inline int health_parse_duration(char *string, int *result) {
    // the <ctype.h> functions need a value representable as unsigned char
    unsigned char first = (unsigned char)*string;

    // make sure it is a number
    if(!first || !(isdigit(first) || first == '+' || first == '-')) {
        *result = 0;
        return 0;
    }

    char *e = NULL;
    long double n = strtold(string, &e); // long double is what strtold() returns

    // nothing was converted (e.g. "-" or "+x"): this is not a duration
    if(!e || e == string) {
        *result = 0;
        return 0;
    }

    switch(*e) {
        case 'Y':
            *result = (int) (n * 86400 * 365);
            break;

        case 'M':
            *result = (int) (n * 86400 * 30);
            break;

        case 'w':
            *result = (int) (n * 86400 * 7);
            break;

        case 'd':
            *result = (int) (n * 86400);
            break;

        case 'h':
            *result = (int) (n * 3600);
            break;

        case 'm':
            *result = (int) (n * 60);
            break;

        // 's', end of string and unknown suffixes all mean seconds
        case 's':
        default:
            *result = (int) (n);
            break;
    }

    return 1;
}
934
935 static inline int health_parse_db_lookup(
936         size_t line, const char *path, const char *file, char *string,
937         int *group_method, int *after, int *before, int *every,
938         uint32_t *options, char **dimensions
939 ) {
940     debug(D_HEALTH, "Health configuration parsing database lookup %zu@%s/%s: %s", line, path, file, string);
941
942     if(*dimensions) freez(*dimensions);
943     *dimensions = NULL;
944     *after = 0;
945     *before = 0;
946     *every = 0;
947     *options = 0;
948
949     char *s = string, *key;
950
951     // first is the group method
952     key = s;
953     while(*s && !isspace(*s)) s++;
954     while(*s && isspace(*s)) *s++ = '\0';
955     if(!*s) {
956         error("Health configuration invalid chart calculation at line %zu of file '%s/%s': expected group method followed by the 'after' time, but got '%s'",
957               line, path, file, key);
958         return 0;
959     }
960
961     if((*group_method = web_client_api_request_v1_data_group(key, -1)) == -1) {
962         error("Health configuration at line %zu of file '%s/%s': invalid group method '%s'",
963               line, path, file, key);
964         return 0;
965     }
966
967     // then is the 'after' time
968     key = s;
969     while(*s && !isspace(*s)) s++;
970     while(*s && isspace(*s)) *s++ = '\0';
971
972     if(!health_parse_duration(key, after)) {
973         error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' after group method",
974               line, path, file, key);
975         return 0;
976     }
977
978     // sane defaults
979     *every = abs(*after);
980
981     // now we may have optional parameters
982     while(*s) {
983         key = s;
984         while(*s && !isspace(*s)) s++;
985         while(*s && isspace(*s)) *s++ = '\0';
986         if(!*key) break;
987
988         if(!strcasecmp(key, "at")) {
989             char *value = s;
990             while(*s && !isspace(*s)) s++;
991             while(*s && isspace(*s)) *s++ = '\0';
992
993             if (!health_parse_duration(value, before)) {
994                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
995                       line, path, file, value, key);
996             }
997         }
998         else if(!strcasecmp(key, HEALTH_EVERY_KEY)) {
999             char *value = s;
1000             while(*s && !isspace(*s)) s++;
1001             while(*s && isspace(*s)) *s++ = '\0';
1002
1003             if (!health_parse_duration(value, every)) {
1004                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1005                       line, path, file, value, key);
1006             }
1007         }
1008         else if(!strcasecmp(key, "absolute") || !strcasecmp(key, "abs") || !strcasecmp(key, "absolute_sum")) {
1009             *options |= RRDR_OPTION_ABSOLUTE;
1010         }
1011         else if(!strcasecmp(key, "min2max")) {
1012             *options |= RRDR_OPTION_MIN2MAX;
1013         }
1014         else if(!strcasecmp(key, "null2zero")) {
1015             *options |= RRDR_OPTION_NULL2ZERO;
1016         }
1017         else if(!strcasecmp(key, "percentage")) {
1018             *options |= RRDR_OPTION_PERCENTAGE;
1019         }
1020         else if(!strcasecmp(key, "unaligned")) {
1021             *options |= RRDR_OPTION_NOT_ALIGNED;
1022         }
1023         else if(!strcasecmp(key, "of")) {
1024             if(*s && strcasecmp(s, "all"))
1025                *dimensions = strdupz(s);
1026             break;
1027         }
1028         else {
1029             error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
1030                   line, path, file, key);
1031         }
1032     }
1033
1034     return 1;
1035 }
1036
// Build the "LINE@PATH/FILENAME" text that identifies where a definition was
// read from, and return the copy made of it by strdupz().
static inline char *health_source_file(size_t line, const char *path, const char *filename) {
    char source[FILENAME_MAX + 1];

    snprintfz(source, FILENAME_MAX, "%zu@%s/%s", line, path, filename);

    return strdupz(source);
}
1042
1043 int health_readfile(const char *path, const char *filename) {
1044     debug(D_HEALTH, "Health configuration reading file '%s/%s'", path, filename);
1045
1046     static uint32_t hash_alarm = 0, hash_template = 0, hash_on = 0, hash_calc = 0, hash_green = 0, hash_red = 0, hash_warn = 0, hash_crit = 0, hash_exec = 0, hash_every = 0, hash_lookup = 0;
1047     char buffer[HEALTH_CONF_MAX_LINE + 1];
1048
1049     if(unlikely(!hash_alarm)) {
1050         hash_alarm = simple_uhash(HEALTH_ALARM_KEY);
1051         hash_template = simple_uhash(HEALTH_TEMPLATE_KEY);
1052         hash_on = simple_uhash(HEALTH_ON_KEY);
1053         hash_calc = simple_uhash(HEALTH_CALC_KEY);
1054         hash_lookup = simple_uhash(HEALTH_LOOKUP_KEY);
1055         hash_green = simple_uhash(HEALTH_GREEN_KEY);
1056         hash_red = simple_uhash(HEALTH_RED_KEY);
1057         hash_warn = simple_uhash(HEALTH_WARN_KEY);
1058         hash_crit = simple_uhash(HEALTH_CRIT_KEY);
1059         hash_exec = simple_uhash(HEALTH_EXEC_KEY);
1060         hash_every = simple_uhash(HEALTH_EVERY_KEY);
1061     }
1062
1063     snprintfz(buffer, HEALTH_CONF_MAX_LINE, "%s/%s", path, filename);
1064     FILE *fp = fopen(buffer, "r");
1065     if(!fp) {
1066         error("Health configuration cannot read file '%s'.", buffer);
1067         return 0;
1068     }
1069
1070     RRDCALC *rc = NULL;
1071     RRDCALCTEMPLATE *rt = NULL;
1072
1073     size_t line = 0, append = 0;
1074     char *s;
1075     while((s = fgets(&buffer[append], (int)(HEALTH_CONF_MAX_LINE - append), fp)) || append) {
1076         int stop_appending = !s;
1077         line++;
1078         // info("Line %zu of file '%s/%s': '%s'", line, path, filename, s);
1079         s = trim(buffer);
1080         if(!s) continue;
1081         // info("Trimmed line %zu of file '%s/%s': '%s'", line, path, filename, s);
1082
1083         append = strlen(s);
1084         if(!stop_appending && s[append - 1] == '\\') {
1085             s[append - 1] = ' ';
1086             append = &s[append] - buffer;
1087             if(append < HEALTH_CONF_MAX_LINE)
1088                 continue;
1089             else {
1090                 error("Health configuration has too long muli-line at line %zu of file '%s/%s'.", line, path, filename);
1091             }
1092         }
1093         append = 0;
1094
1095         char *key = s;
1096         while(*s && *s != ':') s++;
1097         if(!*s) {
1098             error("Health configuration has invalid line %zu of file '%s/%s'. It does not contain a ':'. Ignoring it.", line, path, filename);
1099             continue;
1100         }
1101         *s = '\0';
1102         s++;
1103
1104         char *value = s;
1105         key = trim(key);
1106         value = trim(value);
1107
1108         if(!key) {
1109             error("Health configuration has invalid line %zu of file '%s/%s'. Keyword is empty. Ignoring it.", line, path, filename);
1110             continue;
1111         }
1112
1113         if(!value) {
1114             error("Health configuration has invalid line %zu of file '%s/%s'. value is empty. Ignoring it.", line, path, filename);
1115             continue;
1116         }
1117
1118         // info("Health file '%s/%s', key '%s', value '%s'", path, filename, key, value);
1119         uint32_t hash = simple_uhash(key);
1120
1121         if(hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) {
1122             if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
1123                 rrdcalc_free(&localhost, rc);
1124
1125             if(rt) {
1126                 if (!rrdcalctemplate_add_template_from_config(&localhost, rt))
1127                     rrdcalctemplate_free(&localhost, rt);
1128                 rt = NULL;
1129             }
1130
1131             rc = callocz(1, sizeof(RRDCALC));
1132             rc->name = strdupz(value);
1133             rc->hash = simple_hash(rc->name);
1134             rc->source = health_source_file(line, path, filename);
1135             rc->green = NAN;
1136             rc->red = NAN;
1137             rc->value = NAN;
1138             rc->old_value = NAN;
1139
1140             if(rrdvar_fix_name(rc->name))
1141                 error("Health configuration renamed alarm '%s' to '%s'", value, rc->name);
1142         }
1143         else if(hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY)) {
1144             if(rc) {
1145                 if(!rrdcalc_add_alarm_from_config(&localhost, rc))
1146                     rrdcalc_free(&localhost, rc);
1147                 rc = NULL;
1148             }
1149
1150             if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
1151                 rrdcalctemplate_free(&localhost, rt);
1152
1153             rt = callocz(1, sizeof(RRDCALCTEMPLATE));
1154             rt->name = strdupz(value);
1155             rt->hash_name = simple_hash(rt->name);
1156             rt->source = health_source_file(line, path, filename);
1157             rt->green = NAN;
1158             rt->red = NAN;
1159
1160             if(rrdvar_fix_name(rt->name))
1161                 error("Health configuration renamed template '%s' to '%s'", value, rt->name);
1162         }
1163         else if(rc) {
1164             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
1165                 if(rc->chart) {
1166                     if(strcmp(rc->chart, value))
1167                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1168                              line, path, filename, rc->name, key, rc->chart, value, value);
1169
1170                     freez(rc->chart);
1171                 }
1172                 rc->chart = strdupz(value);
1173                 rc->hash_chart = simple_hash(rc->chart);
1174             }
1175             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
1176                 health_parse_db_lookup(line, path, filename, value, &rc->group, &rc->after, &rc->before,
1177                                        &rc->update_every,
1178                                        &rc->options, &rc->dimensions);
1179             }
1180             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
1181                 if(!health_parse_duration(value, &rc->update_every))
1182                     info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.",
1183                          line, path, filename, rc->name, key, value);
1184             }
1185             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
1186                 char *e;
1187                 rc->green = strtold(value, &e);
1188                 if(e && *e) {
1189                     info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1190                          line, path, filename, rc->name, key, e);
1191                 }
1192             }
1193             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
1194                 char *e;
1195                 rc->red = strtold(value, &e);
1196                 if(e && *e) {
1197                     info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1198                          line, path, filename, rc->name, key, e);
1199                 }
1200             }
1201             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
1202                 const char *failed_at = NULL;
1203                 int error = 0;
1204                 rc->calculation = expression_parse(value, &failed_at, &error);
1205                 if(!rc->calculation) {
1206                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1207                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1208                 }
1209             }
1210             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
1211                 const char *failed_at = NULL;
1212                 int error = 0;
1213                 rc->warning = expression_parse(value, &failed_at, &error);
1214                 if(!rc->warning) {
1215                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1216                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1217                 }
1218             }
1219             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
1220                 const char *failed_at = NULL;
1221                 int error = 0;
1222                 rc->critical = expression_parse(value, &failed_at, &error);
1223                 if(!rc->critical) {
1224                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1225                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1226                 }
1227             }
1228             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
1229                 if(rc->exec) {
1230                     if(strcmp(rc->exec, value))
1231                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1232                              line, path, filename, rc->name, key, rc->exec, value, value);
1233
1234                     freez(rc->exec);
1235                 }
1236                 rc->exec = strdupz(value);
1237             }
1238             else {
1239                 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has unknown key '%s'.",
1240                      line, path, filename, rc->name, key);
1241             }
1242         }
1243         else if(rt) {
1244             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
1245                 if(rt->context) {
1246                     if(strcmp(rt->context, value))
1247                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1248                              line, path, filename, rt->name, key, rt->context, value, value);
1249
1250                     freez(rt->context);
1251                 }
1252                 rt->context = strdupz(value);
1253                 rt->hash_context = simple_hash(rt->context);
1254             }
1255             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
1256                 health_parse_db_lookup(line, path, filename, value, &rt->group, &rt->after, &rt->before,
1257                                        &rt->update_every,
1258                                        &rt->options, &rt->dimensions);
1259             }
1260             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
1261                 if(!health_parse_duration(value, &rt->update_every))
1262                     info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' cannot parse duration: '%s'.",
1263                          line, path, filename, rt->name, key, value);
1264             }
1265             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
1266                 char *e;
1267                 rt->green = strtold(value, &e);
1268                 if(e && *e) {
1269                     info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
1270                          line, path, filename, rt->name, key, e);
1271                 }
1272             }
1273             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
1274                 char *e;
1275                 rt->red = strtold(value, &e);
1276                 if(e && *e) {
1277                     info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
1278                          line, path, filename, rt->name, key, e);
1279                 }
1280             }
1281             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
1282                 const char *failed_at = NULL;
1283                 int error = 0;
1284                 rt->calculation = expression_parse(value, &failed_at, &error);
1285                 if(!rt->calculation) {
1286                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1287                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1288                 }
1289             }
1290             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
1291                 const char *failed_at = NULL;
1292                 int error = 0;
1293                 rt->warning = expression_parse(value, &failed_at, &error);
1294                 if(!rt->warning) {
1295                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1296                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1297                 }
1298             }
1299             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
1300                 const char *failed_at = NULL;
1301                 int error = 0;
1302                 rt->critical = expression_parse(value, &failed_at, &error);
1303                 if(!rt->critical) {
1304                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1305                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1306                 }
1307             }
1308             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
1309                 if(rt->exec) {
1310                     if(strcmp(rt->exec, value))
1311                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1312                              line, path, filename, rt->name, key, rt->exec, value, value);
1313
1314                     freez(rt->exec);
1315                 }
1316                 rt->exec = strdupz(value);
1317             }
1318             else {
1319                 error("Health configuration at line %zu of file '%s/%s' for template '%s' has unknown key '%s'.",
1320                       line, path, filename, rt->name, key);
1321             }
1322         }
1323         else {
1324             error("Health configuration at line %zu of file '%s/%s' has unknown key '%s'. Expected either '" HEALTH_ALARM_KEY "' or '" HEALTH_TEMPLATE_KEY "'.",
1325                   line, path, filename, key);
1326         }
1327     }
1328
1329     if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
1330         rrdcalc_free(&localhost, rc);
1331
1332     if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
1333         rrdcalctemplate_free(&localhost, rt);
1334
1335     fclose(fp);
1336     return 1;
1337 }
1338
1339 void health_readdir(const char *path) {
1340     size_t pathlen = strlen(path);
1341
1342     debug(D_HEALTH, "Health configuration reading directory '%s'", path);
1343
1344     DIR *dir = opendir(path);
1345     if (!dir) {
1346         error("Health configuration cannot open directory '%s'.", path);
1347         return;
1348     }
1349
1350     struct dirent *de = NULL;
1351     while ((de = readdir(dir))) {
1352         size_t len = strlen(de->d_name);
1353
1354         if(de->d_type == DT_DIR
1355            && (
1356                    (de->d_name[0] == '.' && de->d_name[1] == '\0')
1357                    || (de->d_name[0] == '.' && de->d_name[1] == '.' && de->d_name[2] == '\0')
1358            ))
1359             continue;
1360
1361         else if(de->d_type == DT_DIR) {
1362             char *s = mallocz(pathlen + strlen(de->d_name) + 2);
1363             strcpy(s, path);
1364             strcat(s, "/");
1365             strcat(s, de->d_name);
1366             health_readdir(s);
1367             freez(s);
1368             continue;
1369         }
1370
1371         else if((de->d_type == DT_LNK || de->d_type == DT_REG) &&
1372                 len > 5 && !strcmp(&de->d_name[len - 5], ".conf")) {
1373             health_readfile(path, de->d_name);
1374         }
1375     }
1376
1377     closedir(dir);
1378 }
1379
1380 static inline char *health_config_dir(void) {
1381     char buffer[FILENAME_MAX + 1];
1382     snprintfz(buffer, FILENAME_MAX, "%s/health.d", config_get("global", "config directory", CONFIG_DIR));
1383     return config_get("health", "health configuration directory", buffer);
1384 }
1385
1386 void health_init(void) {
1387     debug(D_HEALTH, "Health configuration initializing");
1388
1389     if(!(health_enabled = config_get_boolean("health", "enabled", 1))) {
1390         debug(D_HEALTH, "Health is disabled.");
1391         return;
1392     }
1393
1394     char *path = health_config_dir();
1395
1396     {
1397         char buffer[FILENAME_MAX + 1];
1398         snprintfz(buffer, FILENAME_MAX, "%s/alarm-email.sh", config_get("global", "plugins directory", PLUGINS_DIR));
1399         health_default_exec = config_get("health", "script to execute on alarm", buffer);
1400     }
1401
1402     long n = config_get_number("health", "in memory max health log entries", (long)health_log.max);
1403     if(n < 2) {
1404         error("Health configuration has invalid max log entries %ld. Using default %u", n, health_log.max);
1405         config_set_number("health", "in memory max health log entries", (long)health_log.max);
1406     }
1407     else health_log.max = (unsigned int)n;
1408
1409     rrdhost_rwlock(&localhost);
1410     health_readdir(path);
1411     rrdhost_unlock(&localhost);
1412 }
1413
1414 // ----------------------------------------------------------------------------
1415 // JSON generation
1416
1417 static inline void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) {
1418     if(value && *value)
1419         buffer_sprintf(wb, "%s\"%s\":\"%s\"%s", prefix, label, value, suffix);
1420     else
1421         buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix);
1422 }
1423
1424 static inline void health_rrdcalc2json_nolock(BUFFER *wb, RRDCALC *rc) {
1425
1426     buffer_sprintf(wb,
1427            "\t\t\"%s.%s\": {\n"
1428                    "\t\t\t\"name\": \"%s\",\n"
1429                    "\t\t\t\"chart\": \"%s\",\n"
1430                    "\t\t\t\"exec\": \"%s\",\n"
1431                    "\t\t\t\"source\": \"%s\",\n"
1432                    "\t\t\t\"status\": \"%s\",\n"
1433                    "\t\t\t\"last_status_change\": %lu,\n"
1434                    "\t\t\t\"last_updated\": %lu,\n"
1435                    "\t\t\t\"next_update\": %lu,\n"
1436                    "\t\t\t\"update_every\": %d,\n"
1437             , rc->chart, rc->name
1438             , rc->name
1439             , rc->chart
1440             , rc->exec?rc->exec:health_default_exec
1441             , rc->source
1442             , rrdcalc_status2string(rc->status)
1443             , (unsigned long)rc->last_status_change
1444             , (unsigned long)rc->last_updated
1445             , (unsigned long)rc->next_update
1446             , rc->update_every
1447     );
1448
1449     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
1450         if(rc->dimensions && *rc->dimensions)
1451             health_string2json(wb, "\t\t\t", "lookup_dimensions", rc->dimensions, ",\n");
1452
1453         buffer_sprintf(wb,
1454                        "\t\t\t\"db_after\": %lu,\n"
1455                        "\t\t\t\"db_before\": %lu,\n"
1456                        "\t\t\t\"lookup_method\": \"%s\",\n"
1457                        "\t\t\t\"lookup_after\": %d,\n"
1458                        "\t\t\t\"lookup_before\": %d,\n"
1459                        "\t\t\t\"lookup_options\": \"",
1460                        (unsigned long) rc->db_after,
1461                        (unsigned long) rc->db_before,
1462                        group_method2string(rc->group),
1463                        rc->after,
1464                        rc->before
1465         );
1466         buffer_data_options2string(wb, rc->options);
1467         buffer_strcat(wb, "\",\n");
1468     }
1469
1470     if(rc->calculation) {
1471         health_string2json(wb, "\t\t\t", "calc", rc->calculation->source, ",\n");
1472         health_string2json(wb, "\t\t\t", "calc_parsed", rc->calculation->parsed_as, ",\n");
1473     }
1474
1475     if(rc->warning) {
1476         health_string2json(wb, "\t\t\t", "warn", rc->warning->source, ",\n");
1477         health_string2json(wb, "\t\t\t", "warn_parsed", rc->warning->parsed_as, ",\n");
1478     }
1479
1480     if(rc->critical) {
1481         health_string2json(wb, "\t\t\t", "crit", rc->critical->source, ",\n");
1482         health_string2json(wb, "\t\t\t", "crit_parsed", rc->critical->parsed_as, ",\n");
1483     }
1484
1485     buffer_strcat(wb, "\t\t\t\"green\":");
1486     buffer_rrd_value(wb, rc->green);
1487     buffer_strcat(wb, ",\n");
1488
1489     buffer_strcat(wb, "\t\t\t\"red\":");
1490     buffer_rrd_value(wb, rc->red);
1491     buffer_strcat(wb, ",\n");
1492
1493     buffer_strcat(wb, "\t\t\t\"value\":");
1494     buffer_rrd_value(wb, rc->value);
1495     buffer_strcat(wb, "\n");
1496
1497     buffer_strcat(wb, "\t\t}");
1498 }
1499
1500 //void health_rrdcalctemplate2json_nolock(BUFFER *wb, RRDCALCTEMPLATE *rt) {
1501 //
1502 //}
1503
1504 void health_alarms2json(RRDHOST *host, BUFFER *wb) {
1505     int i;
1506     rrdhost_rdlock(&localhost);
1507
1508     buffer_strcat(wb, "{\n\t\"alarms\": {\n");
1509     RRDCALC *rc;
1510     for(i = 0, rc = host->alarms; rc ; rc = rc->next, i++) {
1511         if(likely(i)) buffer_strcat(wb, ",\n");
1512         health_rrdcalc2json_nolock(wb, rc);
1513     }
1514
1515     buffer_strcat(wb, "\n\t},\n\t\"templates\": {");
1516
1517 //    RRDCALCTEMPLATE *rt;
1518 //    for(rt = host->templates; rt ; rt = rt->next)
1519 //        health_rrdcalctemplate2json_nolock(wb, rt);
1520
1521     buffer_strcat(wb, "\n\t}");
1522     buffer_sprintf(wb, ",\n\t\"now\": %lu", (unsigned long)time(NULL));
1523     buffer_strcat(wb, "\n}\n");
1524     rrdhost_unlock(&localhost);
1525 }
1526
1527
1528 // ----------------------------------------------------------------------------
1529 // re-load health configuration
1530
1531 static inline void health_free_all_nolock(RRDHOST *host) {
1532     while(host->templates)
1533         rrdcalctemplate_free(host, host->templates);
1534
1535     while(host->alarms)
1536         rrdcalc_free(host, host->alarms);
1537 }
1538
1539 void health_reload(void) {
1540     if(!health_enabled) {
1541         error("Health reload is requested, but health is not enabled.");
1542         return;
1543     }
1544
1545     char *path = health_config_dir();
1546
1547     rrdhost_rwlock(&localhost);
1548     health_free_all_nolock(&localhost);
1549     rrdhost_unlock(&localhost);
1550
1551     RRDSET *st;
1552     for(st = localhost.rrdset_root; st ; st = st->next) {
1553         st->green = NAN;
1554         st->red = NAN;
1555     }
1556
1557     rrdhost_rwlock(&localhost);
1558     health_readdir(path);
1559     rrdhost_unlock(&localhost);
1560
1561     for(st = localhost.rrdset_root; st ; st = st->next) {
1562         rrdhost_rwlock(&localhost);
1563
1564         rrdsetcalc_link_matching(st);
1565         rrdcalctemplate_link_matching(st);
1566
1567         rrdhost_unlock(&localhost);
1568     }
1569 }
1570
1571
1572 // ----------------------------------------------------------------------------
1573 // health main thread and friends
1574
1575 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
1576     if (unlikely(!rc->rrdset)) {
1577         debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
1578         return 0;
1579     }
1580
1581     if (unlikely(!rc->update_every)) {
1582         debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
1583         return 0;
1584     }
1585
1586     if (unlikely(rc->next_update > now)) {
1587         if (*next_run > rc->next_update)
1588             *next_run = rc->next_update;
1589
1590         debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
1591         return 0;
1592     }
1593
1594     return 1;
1595 }
1596
1597 static inline int rrdcalc_value2status(calculated_number n) {
1598     if(isnan(n)) return RRDCALC_STATUS_UNDEFINED;
1599     if(n) return RRDCALC_STATUS_RAISED;
1600     return RRDCALC_STATUS_CLEAR;
1601 }
1602
1603 static inline void health_alarm_execute(ALARM_ENTRY *ae) {
1604     if(ae->old_status == RRDCALC_STATUS_UNINITIALIZED && ae->new_status == RRDCALC_STATUS_CLEAR)
1605         return;
1606
1607     char buffer[FILENAME_MAX + 1];
1608     pid_t command_pid;
1609
1610     const char *exec = ae->exec;
1611     if(!exec) exec = health_default_exec;
1612
1613     snprintfz(buffer, FILENAME_MAX, "exec %s '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u'",
1614               exec,
1615               ae->name,
1616               ae->chart?ae->chart:"NOCAHRT",
1617               rrdcalc_status2string(ae->new_status),
1618               rrdcalc_status2string(ae->old_status),
1619               ae->new_value,
1620               ae->old_value,
1621               ae->source?ae->source:"UNKNOWN",
1622               (uint32_t)ae->duration,
1623               (uint32_t)ae->non_clear_duration
1624     );
1625
1626     debug(D_HEALTH, "executing command '%s'", buffer);
1627     FILE *fp = mypopen(buffer, &command_pid);
1628     if(!fp) {
1629         error("HEALTH: Cannot popen(\"%s\", \"r\").", buffer);
1630         return;
1631     }
1632     debug(D_HEALTH, "HEALTH reading from command");
1633     char *s = fgets(buffer, FILENAME_MAX, fp);
1634     (void)s;
1635     debug(D_HEALTH, "HEALTH closing command");
1636     mypclose(fp, command_pid);
1637     debug(D_HEALTH, "closed command");
1638 }
1639
1640 static inline void health_process_notifications(ALARM_ENTRY *ae) {
1641     info("Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
1642          ae->chart?ae->chart:"NOCHART", ae->name,
1643          ae->new_value,
1644          rrdcalc_status2string(ae->old_status),
1645          rrdcalc_status2string(ae->new_status)
1646     );
1647
1648     health_alarm_execute(ae);
1649 }
1650
1651 static inline void health_alarm_log(time_t when,
1652                 const char *name, const char *chart, const char *exec,
1653                 time_t duration,
1654                 calculated_number old_value, calculated_number new_value,
1655                 int old_status, int new_status,
1656                 const char *source
1657 ) {
1658     ALARM_ENTRY *ae = callocz(1, sizeof(ALARM_ENTRY));
1659     ae->name = strdupz(name);
1660     ae->hash_name = simple_hash(ae->name);
1661
1662     if(chart) {
1663         ae->chart = strdupz(chart);
1664         ae->hash_chart = simple_hash(ae->chart);
1665     }
1666
1667     if(exec) ae->exec = strdupz(exec);
1668     if(source) ae->source = strdupz(source);
1669
1670     ae->id = health_log.nextid++;
1671     ae->when = when;
1672     ae->old_value = old_value;
1673     ae->new_value = new_value;
1674     ae->old_status = old_status;
1675     ae->new_status = new_status;
1676     ae->duration = duration;
1677
1678     if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL)
1679         ae->non_clear_duration += ae->duration;
1680
1681     // link it
1682     ae->next = health_log.alarms;
1683     health_log.alarms = ae;
1684     health_log.count++;
1685
1686     // match previous alarms
1687     ALARM_ENTRY *t;
1688     for(t = health_log.alarms ; t ; t = t->next) {
1689         if(t != ae &&
1690                 t->hash_name == ae->hash_name &&
1691                 t->hash_chart == ae->hash_chart &&
1692                 !strcmp(t->name, ae->name) &&
1693                 t->chart && ae->chart && !strcmp(t->chart, ae->chart)) {
1694
1695             if(!(t->notifications & HEALTH_ENTRY_NOTIFICATIONS_UPDATED) && !t->updated_by) {
1696                 t->notifications |= HEALTH_ENTRY_NOTIFICATIONS_UPDATED;
1697                 t->updated_by = ae;
1698
1699                 if((t->new_status == RRDCALC_STATUS_WARNING || t->new_status == RRDCALC_STATUS_CRITICAL) &&
1700                    (t->old_status == RRDCALC_STATUS_WARNING || t->old_status == RRDCALC_STATUS_CRITICAL))
1701                     ae->non_clear_duration += t->non_clear_duration;
1702             }
1703             else {
1704                 // no need to continue
1705                 break;
1706             }
1707         }
1708     }
1709 }
1710
1711 static inline void health_alarm_log_process(void) {
1712     static uint32_t last_processed = 0;
1713     ALARM_ENTRY *ae;
1714
1715     for(ae = health_log.alarms; ae ;ae = ae->next) {
1716         if(last_processed >= ae->id) break;
1717
1718         if(!(ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_PROCESSED) &&
1719                 !(ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_UPDATED)) {
1720             ae->notifications |= HEALTH_ENTRY_NOTIFICATIONS_PROCESSED;
1721             health_process_notifications(ae);
1722         }
1723     }
1724
1725     if(health_log.alarms)
1726         last_processed = health_log.alarms->id;
1727
1728     if(health_log.count <= health_log.max)
1729         return;
1730
1731     // cleanup excess entries in the log
1732     ALARM_ENTRY *last = NULL;
1733     unsigned int count = health_log.max;
1734     for(ae = health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
1735     if(!ae || !last || last->next != ae) return;
1736     last->next = NULL;
1737
1738     while(ae) {
1739         ALARM_ENTRY *t = ae->next;
1740
1741         freez(ae->chart);
1742         freez(ae->name);
1743         freez(ae->exec);
1744         freez(ae);
1745
1746         ae = t;
1747     }
1748 }
1749
1750 void *health_main(void *ptr) {
1751     (void)ptr;
1752
1753     info("HEALTH thread created with task id %d", gettid());
1754
1755     if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
1756         error("Cannot set pthread cancel type to DEFERRED.");
1757
1758     if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
1759         error("Cannot set pthread cancel state to ENABLE.");
1760
1761     int min_run_every = (int)config_get_number("health", "run at least every seconds", 10);
1762     if(min_run_every < 1) min_run_every = 1;
1763
1764     BUFFER *wb = buffer_create(100);
1765
1766     unsigned int loop = 0;
1767     while(health_enabled) {
1768         loop++;
1769         debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
1770
1771         int oldstate, runnable = 0;
1772         time_t now = time(NULL);
1773         time_t next_run = now + min_run_every;
1774         RRDCALC *rc;
1775
1776         if (unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
1777             error("Cannot set pthread cancel state to DISABLE.");
1778
1779         rrdhost_rdlock(&localhost);
1780
1781         // the first loop is to lookup values from the db
1782         for (rc = localhost.alarms; rc; rc = rc->next) {
1783             if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run)))
1784                 continue;
1785
1786             runnable++;
1787             rc->old_value = rc->value;
1788
1789             // 1. if there is database lookup, do it
1790             // 2. if there is calculation expression, run it
1791
1792             if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
1793                 time_t old_db_timestamp = rc->db_before;
1794                 int value_is_null = 0;
1795
1796                 int ret = rrd2value(rc->rrdset, wb, &rc->value,
1797                                     rc->dimensions, 1, rc->after, rc->before, rc->group,
1798                                     rc->options, &rc->db_after, &rc->db_before, &value_is_null);
1799
1800                 if (unlikely(ret != 200)) {
1801                     // database lookup failed
1802                     rc->value = NAN;
1803
1804                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
1805
1806                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) {
1807                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
1808                         error("Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
1809                     }
1810                 }
1811                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))
1812                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
1813
1814                 if (unlikely(old_db_timestamp == rc->db_before)) {
1815                     // database is stale
1816
1817                     debug(D_HEALTH, "Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
1818
1819                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
1820                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
1821                         error("Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
1822                     }
1823                 }
1824                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
1825                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
1826
1827                 if (unlikely(value_is_null)) {
1828                     // collected value is null
1829
1830                     rc->value = NAN;
1831
1832                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
1833                           rc->chart?rc->chart:"NOCHART", rc->name);
1834
1835                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) {
1836                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
1837                         error("Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
1838                               rc->chart?rc->chart:"NOCHART", rc->name);
1839                     }
1840                 }
1841                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))
1842                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
1843
1844                 debug(D_HEALTH, "Health alarm '%s.%s': database lookup gave value "
1845                         CALCULATED_NUMBER_FORMAT, rc->chart?rc->chart:"NOCHART", rc->name, rc->value);
1846             }
1847
1848             if(unlikely(rc->calculation)) {
1849                 if (unlikely(!expression_evaluate(rc->calculation))) {
1850                     // calculation failed
1851
1852                     rc->value = NAN;
1853
1854                     debug(D_HEALTH, "Health alarm '%s.%s': failed to evaluate calculation with error: %s",
1855                           rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->calculation->error_msg));
1856
1857                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) {
1858                         rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
1859                         error("Health alarm '%s.%s': failed to evaluate calculation with error: %s",
1860                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->calculation->error_msg));
1861                     }
1862                 }
1863                 else {
1864                     if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))
1865                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
1866
1867                     debug(D_HEALTH, "Health alarm '%s.%s': calculation expression gave value "
1868                             CALCULATED_NUMBER_FORMAT
1869                             ": %s (source: %s)",
1870                           rc->chart?rc->chart:"NOCHART", rc->name,
1871                           rc->calculation->result,
1872                           buffer_tostring(rc->calculation->error_msg),
1873                           rc->source
1874                     );
1875
1876                     rc->value = rc->calculation->result;
1877                 }
1878             }
1879         }
1880         rrdhost_unlock(&localhost);
1881
1882         if (runnable) {
1883             rrdhost_rdlock(&localhost);
1884
1885             for (rc = localhost.alarms; rc; rc = rc->next) {
1886                 if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run)))
1887                     continue;
1888
1889                 int warning_status  = RRDCALC_STATUS_UNDEFINED;
1890                 int critical_status = RRDCALC_STATUS_UNDEFINED;
1891
1892                 if(unlikely(rc->warning)) {
1893                     if(unlikely(!expression_evaluate(rc->warning))) {
1894                         // calculation failed
1895
1896                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression failed with error: %s",
1897                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
1898
1899                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) {
1900                             rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
1901                             error("Health alarm '%s.%s': warning expression failed with error: %s",
1902                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
1903                         }
1904                     }
1905                     else {
1906                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))
1907                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
1908
1909                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression gave value "
1910                                 CALCULATED_NUMBER_FORMAT
1911                                 ": %s (source: %s)",
1912                               rc->chart?rc->chart:"NOCHART", rc->name,
1913                               rc->warning->result,
1914                               buffer_tostring(rc->warning->error_msg),
1915                               rc->source
1916                         );
1917
1918                         warning_status = rrdcalc_value2status(rc->warning->result);
1919                     }
1920                 }
1921
1922                 if(unlikely(rc->critical)) {
1923                     if(unlikely(!expression_evaluate(rc->critical))) {
1924                         // calculation failed
1925
1926                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression failed with error: %s",
1927                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
1928
1929                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) {
1930                             rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
1931                             error("Health alarm '%s.%s': critical expression failed with error: %s",
1932                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
1933                         }
1934                     }
1935                     else {
1936                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))
1937                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
1938
1939                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression gave value "
1940                                 CALCULATED_NUMBER_FORMAT
1941                                 ": %s (source: %s)",
1942                               rc->chart?rc->chart:"NOCHART", rc->name,
1943                               rc->critical->result,
1944                               buffer_tostring(rc->critical->error_msg),
1945                               rc->source
1946                         );
1947
1948                         critical_status = rrdcalc_value2status(rc->critical->result);
1949                     }
1950                 }
1951
1952                 int status = RRDCALC_STATUS_UNDEFINED;
1953
1954                 switch(warning_status) {
1955                     case RRDCALC_STATUS_CLEAR:
1956                         status = RRDCALC_STATUS_CLEAR;
1957                         break;
1958
1959                     case RRDCALC_STATUS_RAISED:
1960                         status = RRDCALC_STATUS_WARNING;
1961                         break;
1962
1963                     default:
1964                         break;
1965                 }
1966
1967                 switch(critical_status) {
1968                     case RRDCALC_STATUS_CLEAR:
1969                         if(status == RRDCALC_STATUS_UNDEFINED)
1970                             status = RRDCALC_STATUS_CLEAR;
1971                         break;
1972
1973                     case RRDCALC_STATUS_RAISED:
1974                         status = RRDCALC_STATUS_CRITICAL;
1975                         break;
1976
1977                     default:
1978                         break;
1979                 }
1980
1981                 if(status != rc->status) {
1982                     health_alarm_log(time(NULL), rc->name, rc->rrdset->id, rc->exec, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source);
1983                     rc->last_status_change = now;
1984                     rc->status = status;
1985                 }
1986
1987                 rc->last_updated = now;
1988                 rc->next_update = now + rc->update_every;
1989
1990                 if (next_run > rc->next_update)
1991                     next_run = rc->next_update;
1992             }
1993
1994             rrdhost_unlock(&localhost);
1995         }
1996
1997         if (unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
1998             error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
1999
2000         // execute notifications
2001         // and cleanup
2002         health_alarm_log_process();
2003
2004         now = time(NULL);
2005         if(now < next_run) {
2006             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs",
2007                   loop, (int) (next_run - now));
2008             sleep_usec(1000000 * (unsigned long long) (next_run - now));
2009         }
2010         else {
2011             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
2012         }
2013     }
2014
2015     buffer_free(wb);
2016
2017     info("HEALTH thread exiting");
2018     pthread_exit(NULL);
2019     return NULL;
2020 }