]> arthur.barton.de Git - netdata.git/blob - src/health.c
Prepare release 1.3.0
[netdata.git] / src / health.c
1 #include "common.h"
2
// Size used for the temporary buffers in which variable names are composed
// in this file (the buffers are declared with RRDVAR_MAX_LENGTH + 1 bytes).
#define RRDVAR_MAX_LENGTH 1024

// Path of the alarm-email.sh script inside PLUGINS_DIR.
// NOTE(review): presumably the default notification command - confirm at the point of use.
static const char *health_default_exec = PLUGINS_DIR "/alarm-email.sh";
// Initialised to 1; NOTE(review): presumably the global on/off switch for health monitoring - confirm.
int health_enabled = 1;
7
8 // ----------------------------------------------------------------------------
9 // RRDVAR management
10
// Sanitize a variable name in place.
// Every character that is not alphanumeric, '.' or '_' is replaced by '_'.
//
// variable: NUL terminated, writable string
// returns:  the number of characters that were replaced
static inline int rrdvar_fix_name(char *variable) {
    int fixed = 0;

    for(; *variable; variable++) {
        // <ctype.h> classifiers are only defined for values representable
        // as unsigned char (or EOF); a plain char may be negative
        unsigned char c = (unsigned char)*variable;

        if(!isalnum(c) && c != '.' && c != '_') {
            *variable = '_';
            fixed++;
        }
    }

    return fixed;
}
24
25 int rrdvar_compare(void* a, void* b) {
26     if(((RRDVAR *)a)->hash < ((RRDVAR *)b)->hash) return -1;
27     else if(((RRDVAR *)a)->hash > ((RRDVAR *)b)->hash) return 1;
28     else return strcmp(((RRDVAR *)a)->name, ((RRDVAR *)b)->name);
29 }
30
31 static inline RRDVAR *rrdvar_index_add(avl_tree_lock *tree, RRDVAR *rv) {
32     RRDVAR *ret = (RRDVAR *)avl_insert_lock(tree, (avl *)(rv));
33     if(ret != rv)
34         debug(D_VARIABLES, "Request to insert RRDVAR '%s' into index failed. Already exists.", rv->name);
35
36     return ret;
37 }
38
39 static inline RRDVAR *rrdvar_index_del(avl_tree_lock *tree, RRDVAR *rv) {
40     RRDVAR *ret = (RRDVAR *)avl_remove_lock(tree, (avl *)(rv));
41     if(!ret)
42         error("Request to remove RRDVAR '%s' from index failed. Not Found.", rv->name);
43
44     return ret;
45 }
46
47 static inline RRDVAR *rrdvar_index_find(avl_tree_lock *tree, const char *name, uint32_t hash) {
48     RRDVAR tmp;
49     tmp.name = (char *)name;
50     tmp.hash = (hash)?hash:simple_hash(tmp.name);
51
52     return (RRDVAR *)avl_search_lock(tree, (avl *)&tmp);
53 }
54
55 static inline void rrdvar_free(RRDHOST *host, avl_tree_lock *tree, RRDVAR *rv) {
56     (void)host;
57
58     if(!rv) return;
59
60     if(tree)
61         rrdvar_index_del(tree, rv);
62
63     freez(rv->name);
64     freez(rv);
65 }
66
67 static inline RRDVAR *rrdvar_create_and_index(const char *scope, avl_tree_lock *tree, const char *name, int type, calculated_number *value) {
68     char *variable = strdupz(name);
69     rrdvar_fix_name(variable);
70     uint32_t hash = simple_hash(variable);
71
72     RRDVAR *rv = rrdvar_index_find(tree, variable, hash);
73     if(unlikely(!rv)) {
74         debug(D_VARIABLES, "Variable '%s' not found in scope '%s'. Creating a new one.", variable, scope);
75
76         rv = callocz(1, sizeof(RRDVAR));
77         rv->name = variable;
78         rv->hash = hash;
79         rv->type = type;
80         rv->value = value;
81
82         RRDVAR *ret = rrdvar_index_add(tree, rv);
83         if(unlikely(ret != rv)) {
84             debug(D_VARIABLES, "Variable '%s' in scope '%s' already exists", variable, scope);
85             rrdvar_free(NULL, NULL, rv);
86             rv = NULL;
87         }
88         else
89             debug(D_VARIABLES, "Variable '%s' created in scope '%s'", variable, scope);
90     }
91     else {
92         // already exists
93         freez(variable);
94         rv = NULL;
95     }
96
97     return rv;
98 }
99
100 // ----------------------------------------------------------------------------
101 // RRDVAR lookup
102
103 calculated_number rrdvar2number(RRDVAR *rv) {
104     switch(rv->type) {
105         case RRDVAR_TYPE_CALCULATED: {
106             calculated_number *n = (calculated_number *)rv->value;
107             return *n;
108         }
109
110         case RRDVAR_TYPE_TIME_T: {
111             time_t *n = (time_t *)rv->value;
112             return *n;
113         }
114
115         case RRDVAR_TYPE_COLLECTED: {
116             collected_number *n = (collected_number *)rv->value;
117             return *n;
118         }
119
120         case RRDVAR_TYPE_TOTAL: {
121             total_number *n = (total_number *)rv->value;
122             return *n;
123         }
124
125         case RRDVAR_TYPE_INT: {
126             int *n = (int *)rv->value;
127             return *n;
128         }
129
130         default:
131             error("I don't know how to convert RRDVAR type %d to calculated_number", rv->type);
132             return NAN;
133     }
134 }
135
136 void dump_variable(void *data) {
137     RRDVAR *rv = (RRDVAR *)data;
138     debug(D_HEALTH, "%50s : %20.5Lf", rv->name, rrdvar2number(rv));
139 }
140
141 int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, calculated_number *result) {
142     RRDSET *st = rc->rrdset;
143     RRDVAR *rv;
144
145     if(!st) return 0;
146
147     rv = rrdvar_index_find(&st->variables_root_index, variable, hash);
148     if(rv) {
149         *result = rrdvar2number(rv);
150         return 1;
151     }
152
153     rv = rrdvar_index_find(&st->rrdfamily->variables_root_index, variable, hash);
154     if(rv) {
155         *result = rrdvar2number(rv);
156         return 1;
157     }
158
159     rv = rrdvar_index_find(&st->rrdhost->variables_root_index, variable, hash);
160     if(rv) {
161         *result = rrdvar2number(rv);
162         return 1;
163     }
164
165     debug(D_HEALTH, "Available local chart '%s' variables:", st->id);
166     avl_traverse_lock(&st->variables_root_index, dump_variable);
167
168     debug(D_HEALTH, "Available family '%s' variables:", st->rrdfamily->family);
169     avl_traverse_lock(&st->rrdfamily->variables_root_index, dump_variable);
170
171     debug(D_HEALTH, "Available host '%s' variables:", st->rrdhost->hostname);
172     avl_traverse_lock(&st->rrdhost->variables_root_index, dump_variable);
173
174     return 0;
175 }
176
177 // ----------------------------------------------------------------------------
178 // RRDSETVAR management
179
180 RRDSETVAR *rrdsetvar_create(RRDSET *st, const char *variable, int type, void *value, uint32_t options) {
181     debug(D_VARIABLES, "RRDVARSET create for chart id '%s' name '%s' with variable name '%s'", st->id, st->name, variable);
182     RRDSETVAR *rs = (RRDSETVAR *)callocz(1, sizeof(RRDSETVAR));
183
184     char buffer[RRDVAR_MAX_LENGTH + 1];
185     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->id, variable);
186     rs->fullid = strdupz(buffer);
187
188     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, variable);
189     rs->fullname = strdupz(buffer);
190
191     rs->variable = strdupz(variable);
192
193     rs->type = type;
194     rs->value = value;
195     rs->options = options;
196     rs->rrdset = st;
197
198     rs->local       = rrdvar_create_and_index("local",  &st->variables_root_index, rs->variable, rs->type, rs->value);
199     rs->family      = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullid, rs->type, rs->value);
200     rs->host        = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullid, rs->type, rs->value);
201     rs->family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullname, rs->type, rs->value);
202     rs->host_name   = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullname, rs->type, rs->value);
203
204     rs->next = st->variables;
205     st->variables = rs;
206
207     return rs;
208 }
209
210 void rrdsetvar_rename_all(RRDSET *st) {
211     debug(D_VARIABLES, "RRDSETVAR rename for chart id '%s' name '%s'", st->id, st->name);
212
213     // only these 2 can change name
214     // rs->family_name
215     // rs->host_name
216
217     char buffer[RRDVAR_MAX_LENGTH + 1];
218     RRDSETVAR *rs, *next = st->variables;
219     while((rs = next)) {
220         next = rs->next;
221
222         snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rs->variable);
223
224         if (strcmp(buffer, rs->fullname)) {
225             // name changed
226             rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
227             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_name);
228
229             freez(rs->fullname);
230             rs->fullname = strdupz(st->name);
231             rs->family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullname, rs->type, rs->value);
232             rs->host_name   = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullname, rs->type, rs->value);
233         }
234     }
235
236     rrdsetcalc_link_matching(st);
237 }
238
239 void rrdsetvar_free(RRDSETVAR *rs) {
240     RRDSET *st = rs->rrdset;
241     debug(D_VARIABLES, "RRDSETVAR free for chart id '%s' name '%s', variable '%s'", st->id, st->name, rs->variable);
242
243     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local);
244     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family);
245     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host);
246     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
247     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_name);
248
249     if(st->variables == rs) {
250         st->variables = rs->next;
251     }
252     else {
253         RRDSETVAR *t;
254         for (t = st->variables; t && t->next != rs; t = t->next);
255         if(!t) error("RRDSETVAR '%s' not found in chart '%s' variables linked list", rs->fullname, st->id);
256         else t->next = rs->next;
257     }
258
259     freez(rs->fullid);
260     freez(rs->fullname);
261     freez(rs->variable);
262     freez(rs);
263 }
264
265 // ----------------------------------------------------------------------------
266 // RRDDIMVAR management
267
268 #define RRDDIMVAR_ID_MAX 1024
269
270 RRDDIMVAR *rrddimvar_create(RRDDIM *rd, int type, const char *prefix, const char *suffix, void *value, uint32_t options) {
271     RRDSET *st = rd->rrdset;
272
273     debug(D_VARIABLES, "RRDDIMSET create for chart id '%s' name '%s', dimension id '%s', name '%s%s%s'", st->id, st->name, rd->id, (prefix)?prefix:"", rd->name, (suffix)?suffix:"");
274
275     if(!prefix) prefix = "";
276     if(!suffix) suffix = "";
277
278     char buffer[RRDDIMVAR_ID_MAX + 1];
279     RRDDIMVAR *rs = (RRDDIMVAR *)callocz(1, sizeof(RRDDIMVAR));
280
281     rs->prefix = strdupz(prefix);
282     rs->suffix = strdupz(suffix);
283
284     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->id, rs->suffix);
285     rs->id = strdupz(buffer);
286
287     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
288     rs->name = strdupz(buffer);
289
290     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->id, rs->id);
291     rs->fullidid = strdupz(buffer);
292
293     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->id, rs->name);
294     rs->fullidname = strdupz(buffer);
295
296     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->name, rs->id);
297     rs->fullnameid = strdupz(buffer);
298
299     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->name, rs->name);
300     rs->fullnamename = strdupz(buffer);
301
302     rs->type = type;
303     rs->value = value;
304     rs->options = options;
305     rs->rrddim = rd;
306
307     rs->local_id     = rrdvar_create_and_index("local", &st->variables_root_index, rs->id, rs->type, rs->value);
308     rs->local_name   = rrdvar_create_and_index("local", &st->variables_root_index, rs->name, rs->type, rs->value);
309
310     rs->family_id    = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->id, rs->type, rs->value);
311     rs->family_name  = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->name, rs->type, rs->value);
312
313     rs->host_fullidid     = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullidid, rs->type, rs->value);
314     rs->host_fullidname   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullidname, rs->type, rs->value);
315     rs->host_fullnameid   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullnameid, rs->type, rs->value);
316     rs->host_fullnamename = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullnamename, rs->type, rs->value);
317
318     rs->next = rd->variables;
319     rd->variables = rs;
320
321     return rs;
322 }
323
324 void rrddimvar_rename_all(RRDDIM *rd) {
325     RRDSET *st = rd->rrdset;
326     debug(D_VARIABLES, "RRDDIMSET rename for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
327
328     RRDDIMVAR *rs, *next = rd->variables;
329     while((rs = next)) {
330         next = rs->next;
331
332         if (strcmp(rd->name, rs->name)) {
333             char buffer[RRDDIMVAR_ID_MAX + 1];
334             // name changed
335
336             // name
337             rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_name);
338             freez(rs->name);
339             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
340             rs->name = strdupz(buffer);
341             rs->local_name = rrdvar_create_and_index("local", &st->variables_root_index, rs->name, rs->type, rs->value);
342
343             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidname);
344             freez(rs->fullidname);
345             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->name);
346             rs->fullidname = strdupz(buffer);
347             rs->host_fullidname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index,
348                                                              rs->fullidname, rs->type, rs->value);
349
350             // fullnameid
351             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnameid);
352             freez(rs->fullnameid);
353             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->id);
354             rs->fullnameid = strdupz(buffer);
355             rs->host_fullnameid = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index,
356                                                           rs->fullnameid, rs->type, rs->value);
357
358             // fullnamename
359             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnamename);
360             freez(rs->fullnamename);
361             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->name);
362             rs->fullnamename = strdupz(buffer);
363             rs->host_fullnamename = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index,
364                                                           rs->fullnamename, rs->type, rs->value);
365         }
366     }
367 }
368
369 void rrddimvar_free(RRDDIMVAR *rs) {
370     RRDDIM *rd = rs->rrddim;
371     RRDSET *st = rd->rrdset;
372     debug(D_VARIABLES, "RRDDIMSET free for chart id '%s' name '%s', dimension id '%s', name '%s', prefix='%s', suffix='%s'", st->id, st->name, rd->id, rd->name, rs->prefix, rs->suffix);
373
374     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_id);
375     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_name);
376
377     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_id);
378     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
379
380     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidid);
381     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidname);
382     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnameid);
383     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnamename);
384
385     if(rd->variables == rs) {
386         debug(D_VARIABLES, "RRDDIMSET removing first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
387         rd->variables = rs->next;
388     }
389     else {
390         debug(D_VARIABLES, "RRDDIMSET removing non-first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
391         RRDDIMVAR *t;
392         for (t = rd->variables; t && t->next != rs; t = t->next) ;
393         if(!t) error("RRDDIMVAR '%s' not found in dimension '%s/%s' variables linked list", rs->name, st->id, rd->id);
394         else t->next = rs->next;
395     }
396
397     freez(rs->prefix);
398     freez(rs->suffix);
399     freez(rs->id);
400     freez(rs->name);
401     freez(rs->fullidid);
402     freez(rs->fullidname);
403     freez(rs->fullnameid);
404     freez(rs->fullnamename);
405     freez(rs);
406 }
407
408 // ----------------------------------------------------------------------------
409 // RRDCALC management
410
411 static inline const char *rrdcalc_status2string(int status) {
412     switch(status) {
413         case RRDCALC_STATUS_UNINITIALIZED:
414             return "UNINITIALIZED";
415
416         case RRDCALC_STATUS_UNDEFINED:
417             return "UNDEFINED";
418
419         case RRDCALC_STATUS_CLEAR:
420             return "CLEAR";
421
422         case RRDCALC_STATUS_RAISED:
423             return "RAISED";
424
425         case RRDCALC_STATUS_WARNING:
426             return "WARNING";
427
428         case RRDCALC_STATUS_CRITICAL:
429             return "CRITICAL";
430
431         default:
432             return "UNKNOWN";
433     }
434 }
435
436 static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
437     debug(D_HEALTH, "Health linking alarm '%s.%s' to chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, st->rrdhost->hostname);
438
439     rc->last_status_change = time(NULL);
440     rc->rrdset = st;
441
442     rc->rrdset_next = st->alarms;
443     rc->rrdset_prev = NULL;
444     st->alarms = rc;
445
446     if(rc->update_every < rc->rrdset->update_every) {
447         error("Health alarm '%s.%s' has update every %d, less than chart update every %d. Setting alarm update frequency to %d.", rc->rrdset->id, rc->name, rc->update_every, rc->rrdset->update_every, rc->rrdset->update_every);
448         rc->update_every = rc->rrdset->update_every;
449     }
450
451     if(!isnan(rc->green) && isnan(st->green)) {
452         debug(D_HEALTH, "Health alarm '%s.%s' green threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->green, rc->green);
453         st->green = rc->green;
454     }
455
456     if(!isnan(rc->red) && isnan(st->red)) {
457         debug(D_HEALTH, "Health alarm '%s.%s' red threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->red, rc->red);
458         st->red = rc->red;
459     }
460
461     rc->local  = rrdvar_create_and_index("local",  &st->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
462     rc->family = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
463
464     char fullname[RRDVAR_MAX_LENGTH + 1];
465     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->id, rc->name);
466     rc->hostid   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
467
468     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rc->name);
469     rc->hostname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
470
471         if(!rc->units) rc->units = strdupz(st->units);
472 }
473
474 static inline int rrdcalc_is_matching_this_rrdset(RRDCALC *rc, RRDSET *st) {
475     if(     (rc->hash_chart == st->hash      && !strcmp(rc->chart, st->id)) ||
476             (rc->hash_chart == st->hash_name && !strcmp(rc->chart, st->name)))
477         return 1;
478
479     return 0;
480 }
481
482 // this has to be called while the RRDHOST is locked
483 inline void rrdsetcalc_link_matching(RRDSET *st) {
484     // debug(D_HEALTH, "find matching alarms for chart '%s'", st->id);
485
486     RRDCALC *rc;
487     for(rc = st->rrdhost->alarms; rc ; rc = rc->next) {
488         if(rc->rrdset) continue;
489
490         if(rrdcalc_is_matching_this_rrdset(rc, st))
491             rrdsetcalc_link(st, rc);
492     }
493 }
494
495 // this has to be called while the RRDHOST is locked
496 inline void rrdsetcalc_unlink(RRDCALC *rc) {
497     RRDSET *st = rc->rrdset;
498
499     if(!st) {
500         error("Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
501         return;
502     }
503
504     RRDHOST *host = st->rrdhost;
505
506     debug(D_HEALTH, "Health unlinking alarm '%s.%s' from chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, host->hostname);
507
508     // unlink it
509     if(rc->rrdset_prev)
510         rc->rrdset_prev->rrdset_next = rc->rrdset_next;
511
512     if(rc->rrdset_next)
513         rc->rrdset_next->rrdset_prev = rc->rrdset_prev;
514
515     if(st->alarms == rc)
516         st->alarms = rc->rrdset_next;
517
518     rc->rrdset_prev = rc->rrdset_next = NULL;
519
520     rrdvar_free(st->rrdhost, &st->variables_root_index, rc->local);
521     rc->local = NULL;
522
523     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rc->family);
524     rc->family = NULL;
525
526     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostid);
527     rc->hostid = NULL;
528
529     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostname);
530     rc->hostname = NULL;
531
532     rc->rrdset = NULL;
533
534     // RRDCALC will remain in RRDHOST
535     // so that if the matching chart is found in the future
536     // it will be applied automatically
537 }
538
539 RRDCALC *rrdcalc_find(RRDSET *st, const char *name) {
540     RRDCALC *rc;
541     uint32_t hash = simple_hash(name);
542
543     for( rc = st->alarms; rc ; rc = rc->rrdset_next ) {
544         if(rc->hash == hash && !strcmp(rc->name, name))
545             return rc;
546     }
547
548     return NULL;
549 }
550
551 static inline int rrdcalc_exists(RRDHOST *host, const char *name, uint32_t hash) {
552     RRDCALC *rc;
553
554     // make sure it does not already exist
555     for(rc = host->alarms; rc ; rc = rc->next) {
556         if (rc->hash == hash && !strcmp(name, rc->name)) {
557             error("Health alarm '%s' already exists in host '%s'.", name, host->hostname);
558             return 1;
559         }
560     }
561
562     return 0;
563 }
564
565 static inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) {
566     rrdhost_check_rdlock(host);
567
568     if(rc->calculation) {
569         rc->calculation->this = &rc->value;
570         rc->calculation->after = &rc->db_after;
571         rc->calculation->before = &rc->db_before;
572         rc->calculation->rrdcalc = rc;
573     }
574
575     if(rc->warning) {
576         rc->warning->this = &rc->value;
577         rc->warning->after = &rc->db_after;
578         rc->warning->before = &rc->db_before;
579         rc->warning->rrdcalc = rc;
580     }
581
582     if(rc->critical) {
583         rc->critical->this = &rc->value;
584         rc->critical->after = &rc->db_after;
585         rc->critical->before = &rc->db_before;
586         rc->critical->rrdcalc = rc;
587     }
588
589     // link it to the host
590     rc->next = host->alarms;
591     host->alarms = rc;
592
593     // link it to its chart
594     RRDSET *st;
595     for(st = host->rrdset_root; st ; st = st->next) {
596         if(rrdcalc_is_matching_this_rrdset(rc, st)) {
597             rrdsetcalc_link(st, rc);
598             break;
599         }
600     }
601 }
602
// Compose the full name of an alarm into 'fullname' and return its hash.
// The full name is '<chart>.<name>', or just '<name>' when chart is NULL,
// sanitized with rrdvar_fix_name().
// len: passed to snprintfz() as len - 1
static inline uint32_t rrdcalc_fullname(char *fullname, size_t len, const char *chart, const char *name) {
    const char *owner     = chart ? chart : "";
    const char *separator = chart ? "."   : "";

    snprintfz(fullname, len - 1, "%s%s%s", owner, separator, name);
    rrdvar_fix_name(fullname);

    return simple_hash(fullname);
}
608
609 static inline RRDCALC *rrdcalc_create(RRDHOST *host, const char *name, const char *chart, const char *dimensions,
610                         const char *units, const char *info,
611                         int group_method, int after, int before, int update_every, uint32_t options,
612                         calculated_number green, calculated_number red,
613                         const char *exec, const char *source,
614                         const char *calc, const char *warn, const char *crit) {
615
616     char fullname[RRDVAR_MAX_LENGTH + 1];
617     uint32_t hash = rrdcalc_fullname(fullname, RRDVAR_MAX_LENGTH + 1, chart, name);
618
619     if(rrdcalc_exists(host, fullname, hash))
620         return NULL;
621
622     RRDCALC *rc = callocz(1, sizeof(RRDCALC));
623
624     rc->name = strdupz(name);
625     rc->hash = simple_hash(rc->name);
626
627     rc->chart = strdupz(chart);
628     rc->hash_chart = simple_hash(rc->chart);
629
630     if(dimensions) rc->dimensions = strdupz(dimensions);
631
632     rc->green = green;
633     rc->red = red;
634     rc->value = NAN;
635     rc->old_value = NAN;
636
637     rc->group = group_method;
638     rc->after = after;
639     rc->before = before;
640     rc->update_every = update_every;
641     rc->options = options;
642
643     if(exec) rc->exec = strdupz(exec);
644     if(source) rc->source = strdupz(source);
645     if(units) rc->units = strdupz(units);
646     if(info) rc->info = strdupz(info);
647
648     if(calc) {
649         rc->calculation = expression_parse(calc, NULL, NULL);
650         if(!rc->calculation)
651             error("Health alarm '%s.%s': failed to parse calculation expression '%s'", chart, name, calc);
652     }
653     if(warn) {
654         rc->warning = expression_parse(warn, NULL, NULL);
655         if(!rc->warning)
656             error("Health alarm '%s.%s': failed to re-parse warning expression '%s'", chart, name, warn);
657     }
658     if(crit) {
659         rc->critical = expression_parse(crit, NULL, NULL);
660         if(!rc->critical)
661             error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", chart, name, crit);
662     }
663
664     debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s",
665           (rc->chart)?rc->chart:"NOCHART",
666           rc->name,
667           (rc->exec)?rc->exec:"DEFAULT",
668           rc->green,
669           rc->red,
670           rc->group,
671           rc->after,
672           rc->before,
673           rc->options,
674           (rc->dimensions)?rc->dimensions:"NONE",
675           rc->update_every,
676           (rc->calculation)?rc->calculation->parsed_as:"NONE",
677           (rc->warning)?rc->warning->parsed_as:"NONE",
678           (rc->critical)?rc->critical->parsed_as:"NONE",
679           rc->source
680     );
681
682     rrdcalc_create_part2(host, rc);
683     return rc;
684 }
685
686 void rrdcalc_free(RRDHOST *host, RRDCALC *rc) {
687     if(!rc) return;
688
689     debug(D_HEALTH, "Health removing alarm '%s.%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
690
691     // unlink it from RRDSET
692     if(rc->rrdset) rrdsetcalc_unlink(rc);
693
694     // unlink it from RRDHOST
695     if(rc == host->alarms)
696         host->alarms = rc->next;
697
698     else if(host->alarms) {
699         RRDCALC *t, *last = host->alarms;
700
701         for(t = last->next; t && t != rc; last = t, t = t->next) ;
702         if(last && last->next == rc)
703             last->next = rc->next;
704         else
705             error("Cannot unlink alarm '%s.%s' from host '%s': not found", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
706     }
707     else
708         error("Cannot unlink unlink '%s.%s' from host '%s': This host does not have any calculations", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
709
710     expression_free(rc->calculation);
711     expression_free(rc->warning);
712     expression_free(rc->critical);
713
714     freez(rc->name);
715     freez(rc->chart);
716     freez(rc->family);
717     freez(rc->dimensions);
718     freez(rc->exec);
719     freez(rc->source);
720     freez(rc->units);
721     freez(rc->info);
722     freez(rc);
723 }
724
725 // ----------------------------------------------------------------------------
726 // RRDCALCTEMPLATE management
727
728 void rrdcalctemplate_link_matching(RRDSET *st) {
729     RRDCALCTEMPLATE *rt;
730
731     for(rt = st->rrdhost->templates; rt ; rt = rt->next) {
732         if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context)) {
733
734             RRDCALC *rc = rrdcalc_create(st->rrdhost, rt->name, st->id,
735                            rt->dimensions, rt->units, rt->info, rt->group, rt->after, rt->before, rt->update_every, rt->options,
736                            rt->green, rt->red, rt->exec, rt->source,
737                            (rt->calculation)?rt->calculation->source:NULL,
738                            (rt->warning)?rt->warning->source:NULL,
739                            (rt->critical)?rt->critical->source:NULL);
740
741             if(!rc)
742                 error("Health tried to create alarm from template '%s', but it failed", rt->name);
743
744 #ifdef NETDATA_INTERNAL_CHECKS
745             else if(rc->rrdset != st)
746                 error("Health alarm '%s.%s' should be linked to chart '%s', but it is not", rc->chart?rc->chart:"NOCHART", rc->name, st->id);
747 #else
748             (void)rc;
749 #endif
750         }
751     }
752 }
753
754 static inline void rrdcalctemplate_free(RRDHOST *host, RRDCALCTEMPLATE *rt) {
755     debug(D_HEALTH, "Health removing template '%s' of host '%s'", rt->name, host->hostname);
756
757     if(host->templates) {
758         if(host->templates == rt) {
759             host->templates = rt->next;
760         }
761         else {
762             RRDCALCTEMPLATE *t, *last = host->templates;
763             for (t = last->next; t && t != rt; last = t, t = t->next ) ;
764             if(last && last->next == rt) {
765                 last->next = rt->next;
766                 rt->next = NULL;
767             }
768             else
769                 error("Cannot find RRDCALCTEMPLATE '%s' linked in host '%s'", rt->name, host->hostname);
770         }
771     }
772
773     expression_free(rt->calculation);
774     expression_free(rt->warning);
775     expression_free(rt->critical);
776
777     freez(rt->name);
778     freez(rt->exec);
779     freez(rt->context);
780     freez(rt->source);
781     freez(rt->units);
782     freez(rt->info);
783     freez(rt->dimensions);
784     freez(rt);
785 }
786
787 // ----------------------------------------------------------------------------
788 // load health configuration
789
// NOTE(review): presumably the maximum length of a line of a health
// configuration file - the reader is outside this view, confirm there.
#define HEALTH_CONF_MAX_LINE 4096

// Keyword strings of the health configuration.
// NOTE(review): presumably the keys the configuration parser recognizes on
// each line ('alarm' / 'template' starting a new definition) - the parser is
// outside this view, confirm there.
#define HEALTH_ALARM_KEY "alarm"
#define HEALTH_TEMPLATE_KEY "template"
#define HEALTH_ON_KEY "on"
#define HEALTH_LOOKUP_KEY "lookup"
#define HEALTH_CALC_KEY "calc"
#define HEALTH_EVERY_KEY "every"
#define HEALTH_GREEN_KEY "green"
#define HEALTH_RED_KEY "red"
#define HEALTH_WARN_KEY "warn"
#define HEALTH_CRIT_KEY "crit"
#define HEALTH_EXEC_KEY "exec"
#define HEALTH_UNITS_KEY "units"
#define HEALTH_INFO_KEY "info"
805
806 static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
807     {
808         char fullname[RRDVAR_MAX_LENGTH + 1];
809         uint32_t hash = rrdcalc_fullname(fullname, RRDVAR_MAX_LENGTH + 1, rc->chart, rc->name);
810
811         if (rrdcalc_exists(host, fullname, hash))
812             return 0;
813     }
814
815     if(!rc->chart) {
816         error("Health configuration for alarm '%s' does not have a chart", rc->name);
817         return 0;
818     }
819
820     if(!rc->update_every) {
821         error("Health configuration for alarm '%s.%s' has no frequency (parameter 'every'). Ignoring it.", rc->chart?rc->chart:"NOCHART", rc->name);
822         return 0;
823     }
824
825     if(!RRDCALC_HAS_DB_LOOKUP(rc) && !rc->warning && !rc->critical) {
826         error("Health configuration for alarm '%s.%s' is useless (no calculation, no warning and no critical evaluation)", rc->chart?rc->chart:"NOCHART", rc->name);
827         return 0;
828     }
829
830     debug(D_HEALTH, "Health configuration adding alarm '%s.%s': exec '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s",
831           rc->chart?rc->chart:"NOCHART",
832           rc->name,
833           (rc->exec)?rc->exec:"DEFAULT",
834           rc->green,
835           rc->red,
836           rc->group,
837           rc->after,
838           rc->before,
839           rc->options,
840           (rc->dimensions)?rc->dimensions:"NONE",
841           rc->update_every,
842           (rc->calculation)?rc->calculation->parsed_as:"NONE",
843           (rc->warning)?rc->warning->parsed_as:"NONE",
844           (rc->critical)?rc->critical->parsed_as:"NONE",
845           rc->source
846     );
847
848     rrdcalc_create_part2(host, rc);
849     return 1;
850 }
851
852 static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCALCTEMPLATE *rt) {
853     if(!rt->context) {
854         error("Health configuration for template '%s' does not have a context", rt->name);
855         return 0;
856     }
857
858     if(!rt->update_every) {
859         error("Health configuration for template '%s' has no frequency (parameter 'every'). Ignoring it.", rt->name);
860         return 0;
861     }
862
863     if(!RRDCALCTEMPLATE_HAS_CALCULATION(rt) && !rt->warning && !rt->critical) {
864         error("Health configuration for template '%s' is useless (no calculation, no warning and no critical evaluation)", rt->name);
865         return 0;
866     }
867
868     RRDCALCTEMPLATE *t;
869     for (t = host->templates; t ; t = t->next) {
870         if(t->hash_name == rt->hash_name && !strcmp(t->name, rt->name)) {
871             error("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname);
872             return 0;
873         }
874     }
875
876     debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s'",
877           rt->name,
878           (rt->context)?rt->context:"NONE",
879           (rt->exec)?rt->exec:"DEFAULT",
880           rt->green,
881           rt->red,
882           rt->group,
883           rt->after,
884           rt->before,
885           rt->options,
886           (rt->dimensions)?rt->dimensions:"NONE",
887           rt->update_every,
888           (rt->calculation)?rt->calculation->parsed_as:"NONE",
889           (rt->warning)?rt->warning->parsed_as:"NONE",
890           (rt->critical)?rt->critical->parsed_as:"NONE",
891           rt->source
892     );
893
894     rt->next = host->templates;
895     host->templates = rt;
896     return 1;
897 }
898
// Parse a duration like "10", "-5m", "1.5h" or "2d" into seconds.
//
// The number may be followed by a unit: Y (365 days), M (30 days), w, d, h,
// m, s. No unit, or any other character, means seconds.
//
// On success stores the seconds in *result and returns 1. Values that do not
// fit in an int are clamped to the int range.
// Returns 0 and sets *result to 0 when the string does not hold a number.
static inline int health_parse_duration(char *string, int *result) {
    *result = 0;

    // make sure it looks like a number
    // (the cast is needed: <ctype.h> functions must not be given negative char values)
    if(!*string || !(isdigit((unsigned char)*string) || *string == '+' || *string == '-'))
        return 0;

    char *e = NULL;
    long double n = strtold(string, &e);

    // nothing was converted, e.g. the string is just a sign
    if(e == string)
        return 0;

    switch (*e) {
        case 'Y':
            n = n * 86400 * 365;
            break;

        case 'M':
            n = n * 86400 * 30;
            break;

        case 'w':
            n = n * 86400 * 7;
            break;

        case 'd':
            n = n * 86400;
            break;

        case 'h':
            n = n * 3600;
            break;

        case 'm':
            n = n * 60;
            break;

        default:
        case 's':
            // seconds: nothing to scale
            break;
    }

    // NaN (strtold() accepts "+nan") cannot be converted to an int
    if(n != n)
        return 0;

    // converting an out of range value to int is undefined, so clamp it;
    // the limit is the biggest int, computed here to avoid depending on <limits.h>
    const long double limit = (long double)(((unsigned int)-1) >> 1);
    if(n > limit) n = limit;
    else if(n < -limit) n = -limit;

    *result = (int)n;
    return 1;
}
940
941 static inline int health_parse_db_lookup(
942         size_t line, const char *path, const char *file, char *string,
943         int *group_method, int *after, int *before, int *every,
944         uint32_t *options, char **dimensions
945 ) {
946     debug(D_HEALTH, "Health configuration parsing database lookup %zu@%s/%s: %s", line, path, file, string);
947
948     if(*dimensions) freez(*dimensions);
949     *dimensions = NULL;
950     *after = 0;
951     *before = 0;
952     *every = 0;
953     *options = 0;
954
955     char *s = string, *key;
956
957     // first is the group method
958     key = s;
959     while(*s && !isspace(*s)) s++;
960     while(*s && isspace(*s)) *s++ = '\0';
961     if(!*s) {
962         error("Health configuration invalid chart calculation at line %zu of file '%s/%s': expected group method followed by the 'after' time, but got '%s'",
963               line, path, file, key);
964         return 0;
965     }
966
967     if((*group_method = web_client_api_request_v1_data_group(key, -1)) == -1) {
968         error("Health configuration at line %zu of file '%s/%s': invalid group method '%s'",
969               line, path, file, key);
970         return 0;
971     }
972
973     // then is the 'after' time
974     key = s;
975     while(*s && !isspace(*s)) s++;
976     while(*s && isspace(*s)) *s++ = '\0';
977
978     if(!health_parse_duration(key, after)) {
979         error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' after group method",
980               line, path, file, key);
981         return 0;
982     }
983
984     // sane defaults
985     *every = abs(*after);
986
987     // now we may have optional parameters
988     while(*s) {
989         key = s;
990         while(*s && !isspace(*s)) s++;
991         while(*s && isspace(*s)) *s++ = '\0';
992         if(!*key) break;
993
994         if(!strcasecmp(key, "at")) {
995             char *value = s;
996             while(*s && !isspace(*s)) s++;
997             while(*s && isspace(*s)) *s++ = '\0';
998
999             if (!health_parse_duration(value, before)) {
1000                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1001                       line, path, file, value, key);
1002             }
1003         }
1004         else if(!strcasecmp(key, HEALTH_EVERY_KEY)) {
1005             char *value = s;
1006             while(*s && !isspace(*s)) s++;
1007             while(*s && isspace(*s)) *s++ = '\0';
1008
1009             if (!health_parse_duration(value, every)) {
1010                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1011                       line, path, file, value, key);
1012             }
1013         }
1014         else if(!strcasecmp(key, "absolute") || !strcasecmp(key, "abs") || !strcasecmp(key, "absolute_sum")) {
1015             *options |= RRDR_OPTION_ABSOLUTE;
1016         }
1017         else if(!strcasecmp(key, "min2max")) {
1018             *options |= RRDR_OPTION_MIN2MAX;
1019         }
1020         else if(!strcasecmp(key, "null2zero")) {
1021             *options |= RRDR_OPTION_NULL2ZERO;
1022         }
1023         else if(!strcasecmp(key, "percentage")) {
1024             *options |= RRDR_OPTION_PERCENTAGE;
1025         }
1026         else if(!strcasecmp(key, "unaligned")) {
1027             *options |= RRDR_OPTION_NOT_ALIGNED;
1028         }
1029         else if(!strcasecmp(key, "of")) {
1030             if(*s && strcasecmp(s, "all"))
1031                *dimensions = strdupz(s);
1032             break;
1033         }
1034         else {
1035             error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
1036                   line, path, file, key);
1037         }
1038     }
1039
1040     return 1;
1041 }
1042
// Build the "LINE@PATH/FILENAME" text that identifies where an alarm or
// template was defined. Returns a copy made with strdupz().
static inline char *health_source_file(size_t line, const char *path, const char *filename) {
    char source[FILENAME_MAX + 1];

    snprintfz(source, FILENAME_MAX, "%zu@%s/%s", line, path, filename);

    return strdupz(source);
}
1048
// Overwrite, in place, every single and double quote of s with a space.
static inline void strip_quotes(char *s) {
    char *c;

    for(c = s; *c != '\0'; c++) {
        switch(*c) {
            case '\'':
            case '"':
                *c = ' ';
                break;

            default:
                break;
        }
    }
}
1055
1056 int health_readfile(const char *path, const char *filename) {
1057     debug(D_HEALTH, "Health configuration reading file '%s/%s'", path, filename);
1058
1059     static uint32_t hash_alarm = 0, hash_template = 0, hash_on = 0, hash_calc = 0, hash_green = 0, hash_red = 0, hash_warn = 0, hash_crit = 0, hash_exec = 0, hash_every = 0, hash_lookup = 0, hash_units = 0, hash_info = 0;
1060     char buffer[HEALTH_CONF_MAX_LINE + 1];
1061
1062     if(unlikely(!hash_alarm)) {
1063         hash_alarm = simple_uhash(HEALTH_ALARM_KEY);
1064         hash_template = simple_uhash(HEALTH_TEMPLATE_KEY);
1065         hash_on = simple_uhash(HEALTH_ON_KEY);
1066         hash_calc = simple_uhash(HEALTH_CALC_KEY);
1067         hash_lookup = simple_uhash(HEALTH_LOOKUP_KEY);
1068         hash_green = simple_uhash(HEALTH_GREEN_KEY);
1069         hash_red = simple_uhash(HEALTH_RED_KEY);
1070         hash_warn = simple_uhash(HEALTH_WARN_KEY);
1071         hash_crit = simple_uhash(HEALTH_CRIT_KEY);
1072         hash_exec = simple_uhash(HEALTH_EXEC_KEY);
1073         hash_every = simple_uhash(HEALTH_EVERY_KEY);
1074         hash_units = simple_hash(HEALTH_UNITS_KEY);
1075         hash_info = simple_hash(HEALTH_INFO_KEY);
1076     }
1077
1078     snprintfz(buffer, HEALTH_CONF_MAX_LINE, "%s/%s", path, filename);
1079     FILE *fp = fopen(buffer, "r");
1080     if(!fp) {
1081         error("Health configuration cannot read file '%s'.", buffer);
1082         return 0;
1083     }
1084
1085     RRDCALC *rc = NULL;
1086     RRDCALCTEMPLATE *rt = NULL;
1087
1088     size_t line = 0, append = 0;
1089     char *s;
1090     while((s = fgets(&buffer[append], (int)(HEALTH_CONF_MAX_LINE - append), fp)) || append) {
1091         int stop_appending = !s;
1092         line++;
1093         // info("Line %zu of file '%s/%s': '%s'", line, path, filename, s);
1094         s = trim(buffer);
1095         if(!s) continue;
1096         // info("Trimmed line %zu of file '%s/%s': '%s'", line, path, filename, s);
1097
1098         append = strlen(s);
1099         if(!stop_appending && s[append - 1] == '\\') {
1100             s[append - 1] = ' ';
1101             append = &s[append] - buffer;
1102             if(append < HEALTH_CONF_MAX_LINE)
1103                 continue;
1104             else {
1105                 error("Health configuration has too long muli-line at line %zu of file '%s/%s'.", line, path, filename);
1106             }
1107         }
1108         append = 0;
1109
1110         char *key = s;
1111         while(*s && *s != ':') s++;
1112         if(!*s) {
1113             error("Health configuration has invalid line %zu of file '%s/%s'. It does not contain a ':'. Ignoring it.", line, path, filename);
1114             continue;
1115         }
1116         *s = '\0';
1117         s++;
1118
1119         char *value = s;
1120         key = trim(key);
1121         value = trim(value);
1122
1123         if(!key) {
1124             error("Health configuration has invalid line %zu of file '%s/%s'. Keyword is empty. Ignoring it.", line, path, filename);
1125             continue;
1126         }
1127
1128         if(!value) {
1129             error("Health configuration has invalid line %zu of file '%s/%s'. value is empty. Ignoring it.", line, path, filename);
1130             continue;
1131         }
1132
1133         // info("Health file '%s/%s', key '%s', value '%s'", path, filename, key, value);
1134         uint32_t hash = simple_uhash(key);
1135
1136         if(hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) {
1137             if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
1138                 rrdcalc_free(&localhost, rc);
1139
1140             if(rt) {
1141                 if (!rrdcalctemplate_add_template_from_config(&localhost, rt))
1142                     rrdcalctemplate_free(&localhost, rt);
1143                 rt = NULL;
1144             }
1145
1146             rc = callocz(1, sizeof(RRDCALC));
1147             rc->name = strdupz(value);
1148             rc->hash = simple_hash(rc->name);
1149             rc->source = health_source_file(line, path, filename);
1150             rc->green = NAN;
1151             rc->red = NAN;
1152             rc->value = NAN;
1153             rc->old_value = NAN;
1154
1155             if(rrdvar_fix_name(rc->name))
1156                 error("Health configuration renamed alarm '%s' to '%s'", value, rc->name);
1157         }
1158         else if(hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY)) {
1159             if(rc) {
1160                 if(!rrdcalc_add_alarm_from_config(&localhost, rc))
1161                     rrdcalc_free(&localhost, rc);
1162                 rc = NULL;
1163             }
1164
1165             if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
1166                 rrdcalctemplate_free(&localhost, rt);
1167
1168             rt = callocz(1, sizeof(RRDCALCTEMPLATE));
1169             rt->name = strdupz(value);
1170             rt->hash_name = simple_hash(rt->name);
1171             rt->source = health_source_file(line, path, filename);
1172             rt->green = NAN;
1173             rt->red = NAN;
1174
1175             if(rrdvar_fix_name(rt->name))
1176                 error("Health configuration renamed template '%s' to '%s'", value, rt->name);
1177         }
1178         else if(rc) {
1179             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
1180                 if(rc->chart) {
1181                     if(strcmp(rc->chart, value))
1182                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1183                              line, path, filename, rc->name, key, rc->chart, value, value);
1184
1185                     freez(rc->chart);
1186                 }
1187                 rc->chart = strdupz(value);
1188                 rc->hash_chart = simple_hash(rc->chart);
1189             }
1190             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
1191                 health_parse_db_lookup(line, path, filename, value, &rc->group, &rc->after, &rc->before,
1192                                        &rc->update_every,
1193                                        &rc->options, &rc->dimensions);
1194             }
1195             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
1196                 if(!health_parse_duration(value, &rc->update_every))
1197                     info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.",
1198                          line, path, filename, rc->name, key, value);
1199             }
1200             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
1201                 char *e;
1202                 rc->green = strtold(value, &e);
1203                 if(e && *e) {
1204                     info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1205                          line, path, filename, rc->name, key, e);
1206                 }
1207             }
1208             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
1209                 char *e;
1210                 rc->red = strtold(value, &e);
1211                 if(e && *e) {
1212                     info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1213                          line, path, filename, rc->name, key, e);
1214                 }
1215             }
1216             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
1217                 const char *failed_at = NULL;
1218                 int error = 0;
1219                 rc->calculation = expression_parse(value, &failed_at, &error);
1220                 if(!rc->calculation) {
1221                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1222                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1223                 }
1224             }
1225             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
1226                 const char *failed_at = NULL;
1227                 int error = 0;
1228                 rc->warning = expression_parse(value, &failed_at, &error);
1229                 if(!rc->warning) {
1230                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1231                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1232                 }
1233             }
1234             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
1235                 const char *failed_at = NULL;
1236                 int error = 0;
1237                 rc->critical = expression_parse(value, &failed_at, &error);
1238                 if(!rc->critical) {
1239                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1240                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1241                 }
1242             }
1243             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
1244                 if(rc->exec) {
1245                     if(strcmp(rc->exec, value))
1246                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1247                              line, path, filename, rc->name, key, rc->exec, value, value);
1248
1249                     freez(rc->exec);
1250                 }
1251                 rc->exec = strdupz(value);
1252             }
1253             else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
1254                 if(rc->units) {
1255                     if(strcmp(rc->units, value))
1256                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1257                              line, path, filename, rc->name, key, rc->units, value, value);
1258
1259                     freez(rc->units);
1260                 }
1261                 rc->units = strdupz(value);
1262                 strip_quotes(rc->units);
1263             }
1264             else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
1265                 if(rc->info) {
1266                     if(strcmp(rc->info, value))
1267                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1268                              line, path, filename, rc->name, key, rc->info, value, value);
1269
1270                     freez(rc->info);
1271                 }
1272                 rc->info = strdupz(value);
1273                 strip_quotes(rc->info);
1274             }
1275             else {
1276                 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has unknown key '%s'.",
1277                      line, path, filename, rc->name, key);
1278             }
1279         }
1280         else if(rt) {
1281             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
1282                 if(rt->context) {
1283                     if(strcmp(rt->context, value))
1284                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1285                              line, path, filename, rt->name, key, rt->context, value, value);
1286
1287                     freez(rt->context);
1288                 }
1289                 rt->context = strdupz(value);
1290                 rt->hash_context = simple_hash(rt->context);
1291             }
1292             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
1293                 health_parse_db_lookup(line, path, filename, value, &rt->group, &rt->after, &rt->before,
1294                                        &rt->update_every,
1295                                        &rt->options, &rt->dimensions);
1296             }
1297             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
1298                 if(!health_parse_duration(value, &rt->update_every))
1299                     info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' cannot parse duration: '%s'.",
1300                          line, path, filename, rt->name, key, value);
1301             }
1302             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
1303                 char *e;
1304                 rt->green = strtold(value, &e);
1305                 if(e && *e) {
1306                     info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
1307                          line, path, filename, rt->name, key, e);
1308                 }
1309             }
1310             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
1311                 char *e;
1312                 rt->red = strtold(value, &e);
1313                 if(e && *e) {
1314                     info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
1315                          line, path, filename, rt->name, key, e);
1316                 }
1317             }
1318             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
1319                 const char *failed_at = NULL;
1320                 int error = 0;
1321                 rt->calculation = expression_parse(value, &failed_at, &error);
1322                 if(!rt->calculation) {
1323                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1324                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1325                 }
1326             }
1327             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
1328                 const char *failed_at = NULL;
1329                 int error = 0;
1330                 rt->warning = expression_parse(value, &failed_at, &error);
1331                 if(!rt->warning) {
1332                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1333                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1334                 }
1335             }
1336             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
1337                 const char *failed_at = NULL;
1338                 int error = 0;
1339                 rt->critical = expression_parse(value, &failed_at, &error);
1340                 if(!rt->critical) {
1341                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1342                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1343                 }
1344             }
1345             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
1346                 if(rt->exec) {
1347                     if(strcmp(rt->exec, value))
1348                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1349                              line, path, filename, rt->name, key, rt->exec, value, value);
1350
1351                     freez(rt->exec);
1352                 }
1353                 rt->exec = strdupz(value);
1354             }
1355             else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
1356                 if(rt->units) {
1357                     if(strcmp(rt->units, value))
1358                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1359                              line, path, filename, rt->name, key, rt->units, value, value);
1360
1361                     freez(rt->units);
1362                 }
1363                 rt->units = strdupz(value);
1364                 strip_quotes(rt->units);
1365             }
1366             else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
1367                 if(rt->info) {
1368                     if(strcmp(rt->info, value))
1369                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1370                              line, path, filename, rt->name, key, rt->info, value, value);
1371
1372                     freez(rt->info);
1373                 }
1374                 rt->info = strdupz(value);
1375                 strip_quotes(rt->info);
1376             }
1377             else {
1378                 error("Health configuration at line %zu of file '%s/%s' for template '%s' has unknown key '%s'.",
1379                       line, path, filename, rt->name, key);
1380             }
1381         }
1382         else {
1383             error("Health configuration at line %zu of file '%s/%s' has unknown key '%s'. Expected either '" HEALTH_ALARM_KEY "' or '" HEALTH_TEMPLATE_KEY "'.",
1384                   line, path, filename, key);
1385         }
1386     }
1387
1388     if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
1389         rrdcalc_free(&localhost, rc);
1390
1391     if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
1392         rrdcalctemplate_free(&localhost, rt);
1393
1394     fclose(fp);
1395     return 1;
1396 }
1397
1398 void health_readdir(const char *path) {
1399     size_t pathlen = strlen(path);
1400
1401     debug(D_HEALTH, "Health configuration reading directory '%s'", path);
1402
1403     DIR *dir = opendir(path);
1404     if (!dir) {
1405         error("Health configuration cannot open directory '%s'.", path);
1406         return;
1407     }
1408
1409     struct dirent *de = NULL;
1410     while ((de = readdir(dir))) {
1411         size_t len = strlen(de->d_name);
1412
1413         if(de->d_type == DT_DIR
1414            && (
1415                    (de->d_name[0] == '.' && de->d_name[1] == '\0')
1416                    || (de->d_name[0] == '.' && de->d_name[1] == '.' && de->d_name[2] == '\0')
1417            ))
1418             continue;
1419
1420         else if(de->d_type == DT_DIR) {
1421             char *s = mallocz(pathlen + strlen(de->d_name) + 2);
1422             strcpy(s, path);
1423             strcat(s, "/");
1424             strcat(s, de->d_name);
1425             health_readdir(s);
1426             freez(s);
1427             continue;
1428         }
1429
1430         else if((de->d_type == DT_LNK || de->d_type == DT_REG) &&
1431                 len > 5 && !strcmp(&de->d_name[len - 5], ".conf")) {
1432             health_readfile(path, de->d_name);
1433         }
1434     }
1435
1436     closedir(dir);
1437 }
1438
1439 static inline char *health_config_dir(void) {
1440     char buffer[FILENAME_MAX + 1];
1441     snprintfz(buffer, FILENAME_MAX, "%s/health.d", config_get("global", "config directory", CONFIG_DIR));
1442     return config_get("health", "health configuration directory", buffer);
1443 }
1444
1445 void health_init(void) {
1446     debug(D_HEALTH, "Health configuration initializing");
1447
1448     if(!(health_enabled = config_get_boolean("health", "enabled", 1))) {
1449         debug(D_HEALTH, "Health is disabled.");
1450         return;
1451     }
1452
1453     char *path = health_config_dir();
1454
1455     {
1456         char buffer[FILENAME_MAX + 1];
1457         snprintfz(buffer, FILENAME_MAX, "%s/alarm-email.sh", config_get("global", "plugins directory", PLUGINS_DIR));
1458         health_default_exec = config_get("health", "script to execute on alarm", buffer);
1459     }
1460
1461     long n = config_get_number("health", "in memory max health log entries", (long)localhost.health_log.max);
1462     if(n < 2) {
1463         error("Health configuration has invalid max log entries %ld. Using default %u", n, localhost.health_log.max);
1464         config_set_number("health", "in memory max health log entries", (long)localhost.health_log.max);
1465     }
1466     else localhost.health_log.max = (unsigned int)n;
1467
1468     rrdhost_rwlock(&localhost);
1469     health_readdir(path);
1470     rrdhost_unlock(&localhost);
1471 }
1472
1473 // ----------------------------------------------------------------------------
1474 // JSON generation
1475
1476 static inline void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) {
1477     if(value && *value)
1478         buffer_sprintf(wb, "%s\"%s\":\"%s\"%s", prefix, label, value, suffix);
1479     else
1480         buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix);
1481 }
1482
1483 static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae) {
1484     buffer_sprintf(wb, "\n\t{\n"
1485                            "\t\t\"id\":%u,\n"
1486                            "\t\t\"name\":\"%s\",\n"
1487                            "\t\t\"chart\":\"%s\",\n"
1488                            "\t\t\"family\":\"%s\",\n"
1489                            "\t\t\"processed\":%s,\n"
1490                            "\t\t\"updated\":%s,\n"
1491                            "\t\t\"exec_run\":%s,\n"
1492                            "\t\t\"exec_failed\":%s,\n"
1493                            "\t\t\"exec\":\"%s\",\n"
1494                            "\t\t\"exec_code\":%d,\n"
1495                            "\t\t\"source\":\"%s\",\n"
1496                            "\t\t\"units\":\"%s\",\n"
1497                            "\t\t\"info\":\"%s\",\n"
1498                            "\t\t\"when\":%lu,\n"
1499                            "\t\t\"duration\":%lu,\n"
1500                            "\t\t\"non_clear_duration\":%lu,\n"
1501                            "\t\t\"status\":\"%s\",\n"
1502                            "\t\t\"old_status\":\"%s\",\n",
1503                    ae->id,
1504                    ae->name,
1505                    ae->chart,
1506                    ae->family,
1507                    (ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_PROCESSED)?"true":"false",
1508                    (ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_UPDATED)?"true":"false",
1509                    (ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_EXEC_RUN)?"true":"false",
1510                    (ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_EXEC_FAILED)?"true":"false",
1511                    ae->exec?ae->exec:health_default_exec,
1512                    ae->exec_code,
1513                    ae->source,
1514                    ae->units?ae->units:"",
1515                    ae->info?ae->info:"",
1516                    (unsigned long)ae->when,
1517                    (unsigned long)ae->duration,
1518                    (unsigned long)ae->non_clear_duration,
1519                    rrdcalc_status2string(ae->new_status),
1520                    rrdcalc_status2string(ae->old_status)
1521     );
1522
1523     buffer_strcat(wb, "\t\t\"value\":");
1524     buffer_rrd_value(wb, ae->new_value);
1525     buffer_strcat(wb, ",\n");
1526
1527     buffer_strcat(wb, "\t\t\"old_value\":");
1528     buffer_rrd_value(wb, ae->old_value);
1529     buffer_strcat(wb, "\n");
1530
1531     buffer_strcat(wb, "\t}");
1532 }
1533
1534 void health_alarm_log2json(RRDHOST *host, BUFFER *wb) {
1535     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
1536
1537     buffer_strcat(wb, "[");
1538
1539     unsigned int max = host->health_log.max;
1540     unsigned int count = 0;
1541     ALARM_ENTRY *ae;
1542     for(ae = host->health_log.alarms; ae && count < max ; count++, ae = ae->next) {
1543         if(likely(count)) buffer_strcat(wb, ",");
1544         health_alarm_entry2json_nolock(wb, ae);
1545     }
1546
1547     buffer_strcat(wb, "\n]\n");
1548
1549     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
1550 }
1551
1552 static inline void health_rrdcalc2json_nolock(BUFFER *wb, RRDCALC *rc) {
1553     buffer_sprintf(wb,
1554            "\t\t\"%s.%s\": {\n"
1555                    "\t\t\t\"name\": \"%s\",\n"
1556                    "\t\t\t\"chart\": \"%s\",\n"
1557                    "\t\t\t\"family\": \"%s\",\n"
1558                    "\t\t\t\"active\": %s,\n"
1559                    "\t\t\t\"exec\": \"%s\",\n"
1560                    "\t\t\t\"source\": \"%s\",\n"
1561                    "\t\t\t\"units\": \"%s\",\n"
1562                    "\t\t\t\"info\": \"%s\",\n"
1563                                    "\t\t\t\"status\": \"%s\",\n"
1564                    "\t\t\t\"last_status_change\": %lu,\n"
1565                    "\t\t\t\"last_updated\": %lu,\n"
1566                    "\t\t\t\"next_update\": %lu,\n"
1567                    "\t\t\t\"update_every\": %d,\n"
1568             , rc->chart, rc->name
1569             , rc->name
1570             , rc->chart
1571             , (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:""
1572             , (rc->rrdset)?"true":"false"
1573             , rc->exec?rc->exec:health_default_exec
1574             , rc->source
1575             , rc->units?rc->units:""
1576             , rc->info?rc->info:""
1577             , rrdcalc_status2string(rc->status)
1578             , (unsigned long)rc->last_status_change
1579             , (unsigned long)rc->last_updated
1580             , (unsigned long)rc->next_update
1581             , rc->update_every
1582     );
1583
1584     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
1585         if(rc->dimensions && *rc->dimensions)
1586             health_string2json(wb, "\t\t\t", "lookup_dimensions", rc->dimensions, ",\n");
1587
1588         buffer_sprintf(wb,
1589                        "\t\t\t\"db_after\": %lu,\n"
1590                        "\t\t\t\"db_before\": %lu,\n"
1591                        "\t\t\t\"lookup_method\": \"%s\",\n"
1592                        "\t\t\t\"lookup_after\": %d,\n"
1593                        "\t\t\t\"lookup_before\": %d,\n"
1594                        "\t\t\t\"lookup_options\": \"",
1595                        (unsigned long) rc->db_after,
1596                        (unsigned long) rc->db_before,
1597                        group_method2string(rc->group),
1598                        rc->after,
1599                        rc->before
1600         );
1601         buffer_data_options2string(wb, rc->options);
1602         buffer_strcat(wb, "\",\n");
1603     }
1604
1605     if(rc->calculation) {
1606         health_string2json(wb, "\t\t\t", "calc", rc->calculation->source, ",\n");
1607         health_string2json(wb, "\t\t\t", "calc_parsed", rc->calculation->parsed_as, ",\n");
1608     }
1609
1610     if(rc->warning) {
1611         health_string2json(wb, "\t\t\t", "warn", rc->warning->source, ",\n");
1612         health_string2json(wb, "\t\t\t", "warn_parsed", rc->warning->parsed_as, ",\n");
1613     }
1614
1615     if(rc->critical) {
1616         health_string2json(wb, "\t\t\t", "crit", rc->critical->source, ",\n");
1617         health_string2json(wb, "\t\t\t", "crit_parsed", rc->critical->parsed_as, ",\n");
1618     }
1619
1620     buffer_strcat(wb, "\t\t\t\"green\":");
1621     buffer_rrd_value(wb, rc->green);
1622     buffer_strcat(wb, ",\n");
1623
1624     buffer_strcat(wb, "\t\t\t\"red\":");
1625     buffer_rrd_value(wb, rc->red);
1626     buffer_strcat(wb, ",\n");
1627
1628     buffer_strcat(wb, "\t\t\t\"value\":");
1629     buffer_rrd_value(wb, rc->value);
1630     buffer_strcat(wb, "\n");
1631
1632     buffer_strcat(wb, "\t\t}");
1633 }
1634
1635 //void health_rrdcalctemplate2json_nolock(BUFFER *wb, RRDCALCTEMPLATE *rt) {
1636 //
1637 //}
1638
1639 void health_alarms2json(RRDHOST *host, BUFFER *wb, int all) {
1640     int i;
1641     rrdhost_rdlock(&localhost);
1642
1643     buffer_strcat(wb, "{\n\t\"alarms\": {\n");
1644     RRDCALC *rc;
1645     for(i = 0, rc = host->alarms; rc ; rc = rc->next) {
1646         if(!rc->rrdset)
1647             continue;
1648
1649         if(!all && !(rc->status == RRDCALC_STATUS_WARNING || rc->status == RRDCALC_STATUS_CRITICAL))
1650             continue;
1651
1652         if(likely(i)) buffer_strcat(wb, ",\n");
1653         health_rrdcalc2json_nolock(wb, rc);
1654         i++;
1655     }
1656
1657 //    buffer_strcat(wb, "\n\t},\n\t\"templates\": {");
1658
1659 //    RRDCALCTEMPLATE *rt;
1660 //    for(rt = host->templates; rt ; rt = rt->next)
1661 //        health_rrdcalctemplate2json_nolock(wb, rt);
1662
1663     buffer_sprintf(wb, "\n\t},\n\t\"now\": %lu\n}\n", (unsigned long)time(NULL));
1664     rrdhost_unlock(&localhost);
1665 }
1666
1667
1668 // ----------------------------------------------------------------------------
1669 // re-load health configuration
1670
1671 static inline void health_free_all_nolock(RRDHOST *host) {
1672     while(host->templates)
1673         rrdcalctemplate_free(host, host->templates);
1674
1675     while(host->alarms)
1676         rrdcalc_free(host, host->alarms);
1677 }
1678
1679 void health_reload(void) {
1680     if(!health_enabled) {
1681         error("Health reload is requested, but health is not enabled.");
1682         return;
1683     }
1684
1685     char *path = health_config_dir();
1686
1687     rrdhost_rwlock(&localhost);
1688     health_free_all_nolock(&localhost);
1689     rrdhost_unlock(&localhost);
1690
1691     RRDSET *st;
1692     for(st = localhost.rrdset_root; st ; st = st->next) {
1693         st->green = NAN;
1694         st->red = NAN;
1695     }
1696
1697     rrdhost_rwlock(&localhost);
1698     health_readdir(path);
1699     rrdhost_unlock(&localhost);
1700
1701     for(st = localhost.rrdset_root; st ; st = st->next) {
1702         rrdhost_rwlock(&localhost);
1703
1704         rrdsetcalc_link_matching(st);
1705         rrdcalctemplate_link_matching(st);
1706
1707         rrdhost_unlock(&localhost);
1708     }
1709 }
1710
1711
1712 // ----------------------------------------------------------------------------
1713 // health main thread and friends
1714
1715 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
1716     if (unlikely(!rc->rrdset)) {
1717         debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
1718         return 0;
1719     }
1720
1721     if (unlikely(!rc->update_every)) {
1722         debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
1723         return 0;
1724     }
1725
1726     if (unlikely(rc->next_update > now)) {
1727         if (*next_run > rc->next_update)
1728             *next_run = rc->next_update;
1729
1730         debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
1731         return 0;
1732     }
1733
1734     return 1;
1735 }
1736
1737 static inline int rrdcalc_value2status(calculated_number n) {
1738     if(isnan(n)) return RRDCALC_STATUS_UNDEFINED;
1739     if(n) return RRDCALC_STATUS_RAISED;
1740     return RRDCALC_STATUS_CLEAR;
1741 }
1742
1743 static inline void health_alarm_execute(ALARM_ENTRY *ae) {
1744     if(ae->old_status == RRDCALC_STATUS_UNINITIALIZED && ae->new_status == RRDCALC_STATUS_CLEAR)
1745         return;
1746
1747     char buffer[FILENAME_MAX + 1];
1748     pid_t command_pid;
1749
1750     const char *exec = ae->exec;
1751     if(!exec) exec = health_default_exec;
1752
1753     snprintfz(buffer, FILENAME_MAX, "exec %s '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s'",
1754               exec,
1755               ae->name,
1756               ae->chart?ae->chart:"NOCAHRT",
1757               ae->family?ae->family:"NOFAMILY",
1758               rrdcalc_status2string(ae->new_status),
1759               rrdcalc_status2string(ae->old_status),
1760               ae->new_value,
1761               ae->old_value,
1762               ae->source?ae->source:"UNKNOWN",
1763               (uint32_t)ae->duration,
1764               (uint32_t)ae->non_clear_duration,
1765               ae->units?ae->units:"",
1766               ae->info?ae->info:""
1767     );
1768
1769     ae->notifications |= HEALTH_ENTRY_NOTIFICATIONS_EXEC_RUN;
1770
1771     debug(D_HEALTH, "executing command '%s'", buffer);
1772     FILE *fp = mypopen(buffer, &command_pid);
1773     if(!fp) {
1774         error("HEALTH: Cannot popen(\"%s\", \"r\").", buffer);
1775         return;
1776     }
1777     debug(D_HEALTH, "HEALTH reading from command");
1778     char *s = fgets(buffer, FILENAME_MAX, fp);
1779     (void)s;
1780     debug(D_HEALTH, "HEALTH closing command");
1781     ae->exec_code = mypclose(fp, command_pid);
1782     debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
1783
1784     if(ae->exec_code != 0)
1785         ae->notifications |= HEALTH_ENTRY_NOTIFICATIONS_EXEC_FAILED;
1786 }
1787
1788 static inline void health_process_notifications(ALARM_ENTRY *ae) {
1789     info("Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
1790          ae->chart?ae->chart:"NOCHART", ae->name,
1791          ae->new_value,
1792          rrdcalc_status2string(ae->old_status),
1793          rrdcalc_status2string(ae->new_status)
1794     );
1795
1796     health_alarm_execute(ae);
1797 }
1798
1799 static inline void health_alarm_log(RRDHOST *host, time_t when,
1800                 const char *name, const char *chart, const char *family,
1801                 const char *exec, time_t duration,
1802                 calculated_number old_value, calculated_number new_value,
1803                 int old_status, int new_status,
1804                 const char *source,
1805                 const char *units,
1806                 const char *info
1807 ) {
1808     ALARM_ENTRY *ae = callocz(1, sizeof(ALARM_ENTRY));
1809     ae->name = strdupz(name);
1810     ae->hash_name = simple_hash(ae->name);
1811
1812     if(chart) {
1813         ae->chart = strdupz(chart);
1814         ae->hash_chart = simple_hash(ae->chart);
1815     }
1816
1817     if(family)
1818         ae->family = strdupz(family);
1819
1820     if(exec) ae->exec = strdupz(exec);
1821     if(source) ae->source = strdupz(source);
1822     if(units) ae->units = strdupz(units);
1823     if(info) ae->info = strdupz(info);
1824
1825     ae->id = host->health_log.nextid++;
1826     ae->when = when;
1827     ae->old_value = old_value;
1828     ae->new_value = new_value;
1829     ae->old_status = old_status;
1830     ae->new_status = new_status;
1831     ae->duration = duration;
1832
1833     if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL)
1834         ae->non_clear_duration += ae->duration;
1835
1836     // link it
1837     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
1838     ae->next = host->health_log.alarms;
1839     host->health_log.alarms = ae;
1840     host->health_log.count++;
1841     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
1842
1843     // match previous alarms
1844     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
1845     ALARM_ENTRY *t;
1846     for(t = host->health_log.alarms ; t ; t = t->next) {
1847         if(t != ae &&
1848                 t->hash_name == ae->hash_name &&
1849                 t->hash_chart == ae->hash_chart &&
1850                 !strcmp(t->name, ae->name) &&
1851                 t->chart && ae->chart && !strcmp(t->chart, ae->chart)) {
1852
1853             if(!(t->notifications & HEALTH_ENTRY_NOTIFICATIONS_UPDATED) && !t->updated_by) {
1854                 t->notifications |= HEALTH_ENTRY_NOTIFICATIONS_UPDATED;
1855                 t->updated_by = ae;
1856
1857                 if((t->new_status == RRDCALC_STATUS_WARNING || t->new_status == RRDCALC_STATUS_CRITICAL) &&
1858                    (t->old_status == RRDCALC_STATUS_WARNING || t->old_status == RRDCALC_STATUS_CRITICAL))
1859                     ae->non_clear_duration += t->non_clear_duration;
1860             }
1861             else {
1862                 // no need to continue
1863                 break;
1864             }
1865         }
1866     }
1867     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
1868 }
1869
1870 static inline void health_alarm_log_process(RRDHOST *host) {
1871     static uint32_t last_processed = 0;
1872     ALARM_ENTRY *ae;
1873
1874     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
1875
1876     for(ae = host->health_log.alarms; ae ;ae = ae->next) {
1877         if(last_processed >= ae->id) break;
1878
1879         if(!(ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_PROCESSED) &&
1880                 !(ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_UPDATED)) {
1881             ae->notifications |= HEALTH_ENTRY_NOTIFICATIONS_PROCESSED;
1882             health_process_notifications(ae);
1883         }
1884     }
1885
1886     if(host->health_log.alarms)
1887         last_processed = host->health_log.alarms->id;
1888
1889     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
1890
1891     if(host->health_log.count <= host->health_log.max)
1892         return;
1893
1894     // cleanup excess entries in the log
1895     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
1896
1897     ALARM_ENTRY *last = NULL;
1898     unsigned int count = host->health_log.max;
1899     for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
1900
1901     if(ae && last && last->next == ae)
1902         last->next = NULL;
1903     else
1904         ae = NULL;
1905
1906     while(ae) {
1907         ALARM_ENTRY *t = ae->next;
1908
1909         freez(ae->name);
1910         freez(ae->chart);
1911         freez(ae->family);
1912         freez(ae->exec);
1913         freez(ae->source);
1914         freez(ae->units);
1915         freez(ae->info);
1916         freez(ae);
1917
1918         ae = t;
1919     }
1920
1921     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
1922 }
1923
1924 void *health_main(void *ptr) {
1925     (void)ptr;
1926
1927     info("HEALTH thread created with task id %d", gettid());
1928
1929     if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
1930         error("Cannot set pthread cancel type to DEFERRED.");
1931
1932     if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
1933         error("Cannot set pthread cancel state to ENABLE.");
1934
1935     int min_run_every = (int)config_get_number("health", "run at least every seconds", 10);
1936     if(min_run_every < 1) min_run_every = 1;
1937
1938     BUFFER *wb = buffer_create(100);
1939
1940     unsigned int loop = 0;
1941     while(health_enabled) {
1942         loop++;
1943         debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
1944
1945         int oldstate, runnable = 0;
1946         time_t now = time(NULL);
1947         time_t next_run = now + min_run_every;
1948         RRDCALC *rc;
1949
1950         if (unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
1951             error("Cannot set pthread cancel state to DISABLE.");
1952
1953         rrdhost_rdlock(&localhost);
1954
1955         // the first loop is to lookup values from the db
1956         for (rc = localhost.alarms; rc; rc = rc->next) {
1957             if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run)))
1958                 continue;
1959
1960             runnable++;
1961             rc->old_value = rc->value;
1962
1963             // 1. if there is database lookup, do it
1964             // 2. if there is calculation expression, run it
1965
1966             if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
1967                 time_t old_db_timestamp = rc->db_before;
1968                 int value_is_null = 0;
1969
1970                 int ret = rrd2value(rc->rrdset, wb, &rc->value,
1971                                     rc->dimensions, 1, rc->after, rc->before, rc->group,
1972                                     rc->options, &rc->db_after, &rc->db_before, &value_is_null);
1973
1974                 if (unlikely(ret != 200)) {
1975                     // database lookup failed
1976                     rc->value = NAN;
1977
1978                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
1979
1980                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) {
1981                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
1982                         error("Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
1983                     }
1984                 }
1985                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))
1986                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
1987
1988                 if (unlikely(old_db_timestamp == rc->db_before)) {
1989                     // database is stale
1990
1991                     debug(D_HEALTH, "Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
1992
1993                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
1994                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
1995                         error("Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
1996                     }
1997                 }
1998                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
1999                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
2000
2001                 if (unlikely(value_is_null)) {
2002                     // collected value is null
2003
2004                     rc->value = NAN;
2005
2006                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
2007                           rc->chart?rc->chart:"NOCHART", rc->name);
2008
2009                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) {
2010                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
2011                         error("Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
2012                               rc->chart?rc->chart:"NOCHART", rc->name);
2013                     }
2014                 }
2015                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))
2016                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
2017
2018                 debug(D_HEALTH, "Health alarm '%s.%s': database lookup gave value "
2019                         CALCULATED_NUMBER_FORMAT, rc->chart?rc->chart:"NOCHART", rc->name, rc->value);
2020             }
2021
2022             if(unlikely(rc->calculation)) {
2023                 if (unlikely(!expression_evaluate(rc->calculation))) {
2024                     // calculation failed
2025
2026                     rc->value = NAN;
2027
2028                     debug(D_HEALTH, "Health alarm '%s.%s': failed to evaluate calculation with error: %s",
2029                           rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->calculation->error_msg));
2030
2031                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) {
2032                         rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
2033                         error("Health alarm '%s.%s': failed to evaluate calculation with error: %s",
2034                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->calculation->error_msg));
2035                     }
2036                 }
2037                 else {
2038                     if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))
2039                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
2040
2041                     debug(D_HEALTH, "Health alarm '%s.%s': calculation expression gave value "
2042                             CALCULATED_NUMBER_FORMAT
2043                             ": %s (source: %s)",
2044                           rc->chart?rc->chart:"NOCHART", rc->name,
2045                           rc->calculation->result,
2046                           buffer_tostring(rc->calculation->error_msg),
2047                           rc->source
2048                     );
2049
2050                     rc->value = rc->calculation->result;
2051                 }
2052             }
2053         }
2054         rrdhost_unlock(&localhost);
2055
2056         if (runnable) {
2057             rrdhost_rdlock(&localhost);
2058
2059             for (rc = localhost.alarms; rc; rc = rc->next) {
2060                 if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run)))
2061                     continue;
2062
2063                 int warning_status  = RRDCALC_STATUS_UNDEFINED;
2064                 int critical_status = RRDCALC_STATUS_UNDEFINED;
2065
2066                 if(unlikely(rc->warning)) {
2067                     if(unlikely(!expression_evaluate(rc->warning))) {
2068                         // calculation failed
2069
2070                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression failed with error: %s",
2071                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
2072
2073                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) {
2074                             rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
2075                             error("Health alarm '%s.%s': warning expression failed with error: %s",
2076                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
2077                         }
2078                     }
2079                     else {
2080                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))
2081                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
2082
2083                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression gave value "
2084                                 CALCULATED_NUMBER_FORMAT
2085                                 ": %s (source: %s)",
2086                               rc->chart?rc->chart:"NOCHART", rc->name,
2087                               rc->warning->result,
2088                               buffer_tostring(rc->warning->error_msg),
2089                               rc->source
2090                         );
2091
2092                         warning_status = rrdcalc_value2status(rc->warning->result);
2093                     }
2094                 }
2095
2096                 if(unlikely(rc->critical)) {
2097                     if(unlikely(!expression_evaluate(rc->critical))) {
2098                         // calculation failed
2099
2100                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression failed with error: %s",
2101                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
2102
2103                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) {
2104                             rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
2105                             error("Health alarm '%s.%s': critical expression failed with error: %s",
2106                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
2107                         }
2108                     }
2109                     else {
2110                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))
2111                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
2112
2113                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression gave value "
2114                                 CALCULATED_NUMBER_FORMAT
2115                                 ": %s (source: %s)",
2116                               rc->chart?rc->chart:"NOCHART", rc->name,
2117                               rc->critical->result,
2118                               buffer_tostring(rc->critical->error_msg),
2119                               rc->source
2120                         );
2121
2122                         critical_status = rrdcalc_value2status(rc->critical->result);
2123                     }
2124                 }
2125
2126                 int status = RRDCALC_STATUS_UNDEFINED;
2127
2128                 switch(warning_status) {
2129                     case RRDCALC_STATUS_CLEAR:
2130                         status = RRDCALC_STATUS_CLEAR;
2131                         break;
2132
2133                     case RRDCALC_STATUS_RAISED:
2134                         status = RRDCALC_STATUS_WARNING;
2135                         break;
2136
2137                     default:
2138                         break;
2139                 }
2140
2141                 switch(critical_status) {
2142                     case RRDCALC_STATUS_CLEAR:
2143                         if(status == RRDCALC_STATUS_UNDEFINED)
2144                             status = RRDCALC_STATUS_CLEAR;
2145                         break;
2146
2147                     case RRDCALC_STATUS_RAISED:
2148                         status = RRDCALC_STATUS_CRITICAL;
2149                         break;
2150
2151                     default:
2152                         break;
2153                 }
2154
2155                 if(status != rc->status) {
2156                     health_alarm_log(&localhost, time(NULL), rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info);
2157                     rc->last_status_change = now;
2158                     rc->status = status;
2159                 }
2160
2161                 rc->last_updated = now;
2162                 rc->next_update = now + rc->update_every;
2163
2164                 if (next_run > rc->next_update)
2165                     next_run = rc->next_update;
2166             }
2167
2168             rrdhost_unlock(&localhost);
2169         }
2170
2171         if (unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
2172             error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
2173
2174         // execute notifications
2175         // and cleanup
2176         health_alarm_log_process(&localhost);
2177
2178         now = time(NULL);
2179         if(now < next_run) {
2180             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs",
2181                   loop, (int) (next_run - now));
2182             sleep_usec(1000000 * (unsigned long long) (next_run - now));
2183         }
2184         else {
2185             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
2186         }
2187     }
2188
2189     buffer_free(wb);
2190
2191     info("HEALTH thread exiting");
2192     pthread_exit(NULL);
2193     return NULL;
2194 }