]> arthur.barton.de Git - netdata.git/blob - src/health.c
basic health monitoring information is now rendered on the dashboard
[netdata.git] / src / health.c
1 #include "common.h"
2
3 #define RRDVAR_MAX_LENGTH 1024
4
5 static const char *health_default_exec = PLUGINS_DIR "/alarm-email.sh";
6 int health_enabled = 1;
7
8 // ----------------------------------------------------------------------------
9 // RRDVAR management
10
// Sanitize a variable name in place: every character that is not
// alphanumeric, '.' or '_' is overwritten with '_'.
// Returns the number of characters that were replaced.
static inline int rrdvar_fix_name(char *variable) {
    int fixed = 0;

    for( ; *variable ; variable++) {
        // <ctype.h> classifiers are only defined for values representable as
        // unsigned char (or EOF); a plain char may be negative for bytes >= 0x80
        unsigned char c = (unsigned char)*variable;

        if(!isalnum(c) && c != '.' && c != '_') {
            *variable = '_';
            fixed++;
        }
    }

    return fixed;
}
24
25 int rrdvar_compare(void* a, void* b) {
26     if(((RRDVAR *)a)->hash < ((RRDVAR *)b)->hash) return -1;
27     else if(((RRDVAR *)a)->hash > ((RRDVAR *)b)->hash) return 1;
28     else return strcmp(((RRDVAR *)a)->name, ((RRDVAR *)b)->name);
29 }
30
31 static inline RRDVAR *rrdvar_index_add(avl_tree_lock *tree, RRDVAR *rv) {
32     RRDVAR *ret = (RRDVAR *)avl_insert_lock(tree, (avl *)(rv));
33     if(ret != rv)
34         debug(D_VARIABLES, "Request to insert RRDVAR '%s' into index failed. Already exists.", rv->name);
35
36     return ret;
37 }
38
39 static inline RRDVAR *rrdvar_index_del(avl_tree_lock *tree, RRDVAR *rv) {
40     RRDVAR *ret = (RRDVAR *)avl_remove_lock(tree, (avl *)(rv));
41     if(!ret)
42         error("Request to remove RRDVAR '%s' from index failed. Not Found.", rv->name);
43
44     return ret;
45 }
46
47 static inline RRDVAR *rrdvar_index_find(avl_tree_lock *tree, const char *name, uint32_t hash) {
48     RRDVAR tmp;
49     tmp.name = (char *)name;
50     tmp.hash = (hash)?hash:simple_hash(tmp.name);
51
52     return (RRDVAR *)avl_search_lock(tree, (avl *)&tmp);
53 }
54
55 static inline void rrdvar_free(RRDHOST *host, avl_tree_lock *tree, RRDVAR *rv) {
56     (void)host;
57
58     if(!rv) return;
59
60     if(tree)
61         rrdvar_index_del(tree, rv);
62
63     freez(rv->name);
64     freez(rv);
65 }
66
67 static inline RRDVAR *rrdvar_create_and_index(const char *scope, avl_tree_lock *tree, const char *name, int type, calculated_number *value) {
68     char *variable = strdupz(name);
69     rrdvar_fix_name(variable);
70     uint32_t hash = simple_hash(variable);
71
72     RRDVAR *rv = rrdvar_index_find(tree, variable, hash);
73     if(unlikely(!rv)) {
74         debug(D_VARIABLES, "Variable '%s' not found in scope '%s'. Creating a new one.", variable, scope);
75
76         rv = callocz(1, sizeof(RRDVAR));
77         rv->name = variable;
78         rv->hash = hash;
79         rv->type = type;
80         rv->value = value;
81
82         RRDVAR *ret = rrdvar_index_add(tree, rv);
83         if(unlikely(ret != rv)) {
84             debug(D_VARIABLES, "Variable '%s' in scope '%s' already exists", variable, scope);
85             rrdvar_free(NULL, NULL, rv);
86             rv = NULL;
87         }
88         else
89             debug(D_VARIABLES, "Variable '%s' created in scope '%s'", variable, scope);
90     }
91     else {
92         // already exists
93         freez(variable);
94         rv = NULL;
95     }
96
97     return rv;
98 }
99
100 // ----------------------------------------------------------------------------
101 // RRDVAR lookup
102
103 calculated_number rrdvar2number(RRDVAR *rv) {
104     switch(rv->type) {
105         case RRDVAR_TYPE_CALCULATED: {
106             calculated_number *n = (calculated_number *)rv->value;
107             return *n;
108         }
109
110         case RRDVAR_TYPE_TIME_T: {
111             time_t *n = (time_t *)rv->value;
112             return *n;
113         }
114
115         case RRDVAR_TYPE_COLLECTED: {
116             collected_number *n = (collected_number *)rv->value;
117             return *n;
118         }
119
120         case RRDVAR_TYPE_TOTAL: {
121             total_number *n = (total_number *)rv->value;
122             return *n;
123         }
124
125         case RRDVAR_TYPE_INT: {
126             int *n = (int *)rv->value;
127             return *n;
128         }
129
130         default:
131             error("I don't know how to convert RRDVAR type %d to calculated_number", rv->type);
132             return NAN;
133     }
134 }
135
136 void dump_variable(void *data) {
137     RRDVAR *rv = (RRDVAR *)data;
138     debug(D_HEALTH, "%50s : %20.5Lf", rv->name, rrdvar2number(rv));
139 }
140
141 int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, calculated_number *result) {
142     RRDSET *st = rc->rrdset;
143     RRDVAR *rv;
144
145     if(!st) return 0;
146
147     rv = rrdvar_index_find(&st->variables_root_index, variable, hash);
148     if(rv) {
149         *result = rrdvar2number(rv);
150         return 1;
151     }
152
153     rv = rrdvar_index_find(&st->rrdfamily->variables_root_index, variable, hash);
154     if(rv) {
155         *result = rrdvar2number(rv);
156         return 1;
157     }
158
159     rv = rrdvar_index_find(&st->rrdhost->variables_root_index, variable, hash);
160     if(rv) {
161         *result = rrdvar2number(rv);
162         return 1;
163     }
164
165     debug(D_HEALTH, "Available local chart '%s' variables:", st->id);
166     avl_traverse_lock(&st->variables_root_index, dump_variable);
167
168     debug(D_HEALTH, "Available family '%s' variables:", st->rrdfamily->family);
169     avl_traverse_lock(&st->rrdfamily->variables_root_index, dump_variable);
170
171     debug(D_HEALTH, "Available host '%s' variables:", st->rrdhost->hostname);
172     avl_traverse_lock(&st->rrdhost->variables_root_index, dump_variable);
173
174     return 0;
175 }
176
177 // ----------------------------------------------------------------------------
178 // RRDSETVAR management
179
180 RRDSETVAR *rrdsetvar_create(RRDSET *st, const char *variable, int type, void *value, uint32_t options) {
181     debug(D_VARIABLES, "RRDVARSET create for chart id '%s' name '%s' with variable name '%s'", st->id, st->name, variable);
182     RRDSETVAR *rs = (RRDSETVAR *)callocz(1, sizeof(RRDSETVAR));
183
184     char buffer[RRDVAR_MAX_LENGTH + 1];
185     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->id, variable);
186     rs->fullid = strdupz(buffer);
187
188     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, variable);
189     rs->fullname = strdupz(buffer);
190
191     rs->variable = strdupz(variable);
192
193     rs->type = type;
194     rs->value = value;
195     rs->options = options;
196     rs->rrdset = st;
197
198     rs->local       = rrdvar_create_and_index("local",  &st->variables_root_index, rs->variable, rs->type, rs->value);
199     rs->family      = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullid, rs->type, rs->value);
200     rs->host        = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullid, rs->type, rs->value);
201     rs->family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullname, rs->type, rs->value);
202     rs->host_name   = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullname, rs->type, rs->value);
203
204     rs->next = st->variables;
205     st->variables = rs;
206
207     return rs;
208 }
209
210 void rrdsetvar_rename_all(RRDSET *st) {
211     debug(D_VARIABLES, "RRDSETVAR rename for chart id '%s' name '%s'", st->id, st->name);
212
213     // only these 2 can change name
214     // rs->family_name
215     // rs->host_name
216
217     char buffer[RRDVAR_MAX_LENGTH + 1];
218     RRDSETVAR *rs, *next = st->variables;
219     while((rs = next)) {
220         next = rs->next;
221
222         snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rs->variable);
223
224         if (strcmp(buffer, rs->fullname)) {
225             // name changed
226             rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
227             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_name);
228
229             freez(rs->fullname);
230             rs->fullname = strdupz(st->name);
231             rs->family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullname, rs->type, rs->value);
232             rs->host_name   = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullname, rs->type, rs->value);
233         }
234     }
235
236     rrdsetcalc_link_matching(st);
237 }
238
239 void rrdsetvar_free(RRDSETVAR *rs) {
240     RRDSET *st = rs->rrdset;
241     debug(D_VARIABLES, "RRDSETVAR free for chart id '%s' name '%s', variable '%s'", st->id, st->name, rs->variable);
242
243     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local);
244     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family);
245     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host);
246     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
247     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_name);
248
249     if(st->variables == rs) {
250         st->variables = rs->next;
251     }
252     else {
253         RRDSETVAR *t;
254         for (t = st->variables; t && t->next != rs; t = t->next);
255         if(!t) error("RRDSETVAR '%s' not found in chart '%s' variables linked list", rs->fullname, st->id);
256         else t->next = rs->next;
257     }
258
259     freez(rs->fullid);
260     freez(rs->fullname);
261     freez(rs->variable);
262     freez(rs);
263 }
264
265 // ----------------------------------------------------------------------------
266 // RRDDIMVAR management
267
268 #define RRDDIMVAR_ID_MAX 1024
269
270 RRDDIMVAR *rrddimvar_create(RRDDIM *rd, int type, const char *prefix, const char *suffix, void *value, uint32_t options) {
271     RRDSET *st = rd->rrdset;
272
273     debug(D_VARIABLES, "RRDDIMSET create for chart id '%s' name '%s', dimension id '%s', name '%s%s%s'", st->id, st->name, rd->id, (prefix)?prefix:"", rd->name, (suffix)?suffix:"");
274
275     if(!prefix) prefix = "";
276     if(!suffix) suffix = "";
277
278     char buffer[RRDDIMVAR_ID_MAX + 1];
279     RRDDIMVAR *rs = (RRDDIMVAR *)callocz(1, sizeof(RRDDIMVAR));
280
281     rs->prefix = strdupz(prefix);
282     rs->suffix = strdupz(suffix);
283
284     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->id, rs->suffix);
285     rs->id = strdupz(buffer);
286
287     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
288     rs->name = strdupz(buffer);
289
290     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->id, rs->id);
291     rs->fullidid = strdupz(buffer);
292
293     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->id, rs->name);
294     rs->fullidname = strdupz(buffer);
295
296     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->name, rs->id);
297     rs->fullnameid = strdupz(buffer);
298
299     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->name, rs->name);
300     rs->fullnamename = strdupz(buffer);
301
302     rs->type = type;
303     rs->value = value;
304     rs->options = options;
305     rs->rrddim = rd;
306
307     rs->local_id     = rrdvar_create_and_index("local", &st->variables_root_index, rs->id, rs->type, rs->value);
308     rs->local_name   = rrdvar_create_and_index("local", &st->variables_root_index, rs->name, rs->type, rs->value);
309
310     rs->family_id    = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->id, rs->type, rs->value);
311     rs->family_name  = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->name, rs->type, rs->value);
312
313     rs->host_fullidid     = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullidid, rs->type, rs->value);
314     rs->host_fullidname   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullidname, rs->type, rs->value);
315     rs->host_fullnameid   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullnameid, rs->type, rs->value);
316     rs->host_fullnamename = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullnamename, rs->type, rs->value);
317
318     rs->next = rd->variables;
319     rd->variables = rs;
320
321     return rs;
322 }
323
324 void rrddimvar_rename_all(RRDDIM *rd) {
325     RRDSET *st = rd->rrdset;
326     debug(D_VARIABLES, "RRDDIMSET rename for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
327
328     RRDDIMVAR *rs, *next = rd->variables;
329     while((rs = next)) {
330         next = rs->next;
331
332         if (strcmp(rd->name, rs->name)) {
333             char buffer[RRDDIMVAR_ID_MAX + 1];
334             // name changed
335
336             // name
337             rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_name);
338             freez(rs->name);
339             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
340             rs->name = strdupz(buffer);
341             rs->local_name = rrdvar_create_and_index("local", &st->variables_root_index, rs->name, rs->type, rs->value);
342
343             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidname);
344             freez(rs->fullidname);
345             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->name);
346             rs->fullidname = strdupz(buffer);
347             rs->host_fullidname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index,
348                                                              rs->fullidname, rs->type, rs->value);
349
350             // fullnameid
351             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnameid);
352             freez(rs->fullnameid);
353             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->id);
354             rs->fullnameid = strdupz(buffer);
355             rs->host_fullnameid = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index,
356                                                           rs->fullnameid, rs->type, rs->value);
357
358             // fullnamename
359             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnamename);
360             freez(rs->fullnamename);
361             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->name);
362             rs->fullnamename = strdupz(buffer);
363             rs->host_fullnamename = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index,
364                                                           rs->fullnamename, rs->type, rs->value);
365         }
366     }
367 }
368
369 void rrddimvar_free(RRDDIMVAR *rs) {
370     RRDDIM *rd = rs->rrddim;
371     RRDSET *st = rd->rrdset;
372     debug(D_VARIABLES, "RRDDIMSET free for chart id '%s' name '%s', dimension id '%s', name '%s', prefix='%s', suffix='%s'", st->id, st->name, rd->id, rd->name, rs->prefix, rs->suffix);
373
374     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_id);
375     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_name);
376
377     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_id);
378     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
379
380     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidid);
381     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidname);
382     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnameid);
383     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnamename);
384
385     if(rd->variables == rs) {
386         debug(D_VARIABLES, "RRDDIMSET removing first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
387         rd->variables = rs->next;
388     }
389     else {
390         debug(D_VARIABLES, "RRDDIMSET removing non-first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
391         RRDDIMVAR *t;
392         for (t = rd->variables; t && t->next != rs; t = t->next) ;
393         if(!t) error("RRDDIMVAR '%s' not found in dimension '%s/%s' variables linked list", rs->name, st->id, rd->id);
394         else t->next = rs->next;
395     }
396
397     freez(rs->prefix);
398     freez(rs->suffix);
399     freez(rs->id);
400     freez(rs->name);
401     freez(rs->fullidid);
402     freez(rs->fullidname);
403     freez(rs->fullnameid);
404     freez(rs->fullnamename);
405     freez(rs);
406 }
407
408 // ----------------------------------------------------------------------------
409 // RRDCALC management
410
411 static inline const char *rrdcalc_status2string(int status) {
412     switch(status) {
413         case RRDCALC_STATUS_UNINITIALIZED:
414             return "UNINITIALIZED";
415
416         case RRDCALC_STATUS_UNDEFINED:
417             return "UNDEFINED";
418
419         case RRDCALC_STATUS_CLEAR:
420             return "CLEAR";
421
422         case RRDCALC_STATUS_RAISED:
423             return "RAISED";
424
425         case RRDCALC_STATUS_WARNING:
426             return "WARNING";
427
428         case RRDCALC_STATUS_CRITICAL:
429             return "CRITICAL";
430
431         default:
432             return "UNKNOWN";
433     }
434 }
435
436 static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
437     debug(D_HEALTH, "Health linking alarm '%s.%s' to chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, st->rrdhost->hostname);
438
439     rc->last_status_change = time(NULL);
440     rc->rrdset = st;
441
442     rc->rrdset_next = st->alarms;
443     rc->rrdset_prev = NULL;
444     st->alarms = rc;
445
446     if(rc->update_every < rc->rrdset->update_every) {
447         error("Health alarm '%s.%s' has update every %d, less than chart update every %d. Setting alarm update frequency to %d.", rc->rrdset->id, rc->name, rc->update_every, rc->rrdset->update_every, rc->rrdset->update_every);
448         rc->update_every = rc->rrdset->update_every;
449     }
450
451     if(!isnan(rc->green) && isnan(st->green)) {
452         debug(D_HEALTH, "Health alarm '%s.%s' green threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->green, rc->green);
453         st->green = rc->green;
454     }
455
456     if(!isnan(rc->red) && isnan(st->red)) {
457         debug(D_HEALTH, "Health alarm '%s.%s' red threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->red, rc->red);
458         st->red = rc->red;
459     }
460
461     rc->local  = rrdvar_create_and_index("local",  &st->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
462     rc->family = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
463
464     char fullname[RRDVAR_MAX_LENGTH + 1];
465     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->id, rc->name);
466     rc->hostid   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
467
468     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rc->name);
469     rc->hostname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
470 }
471
472 static inline int rrdcalc_is_matching_this_rrdset(RRDCALC *rc, RRDSET *st) {
473     if(     (rc->hash_chart == st->hash      && !strcmp(rc->chart, st->id)) ||
474             (rc->hash_chart == st->hash_name && !strcmp(rc->chart, st->name)))
475         return 1;
476
477     return 0;
478 }
479
480 // this has to be called while the RRDHOST is locked
481 inline void rrdsetcalc_link_matching(RRDSET *st) {
482     // debug(D_HEALTH, "find matching alarms for chart '%s'", st->id);
483
484     RRDCALC *rc;
485     for(rc = st->rrdhost->alarms; rc ; rc = rc->next) {
486         if(rc->rrdset) continue;
487
488         if(rrdcalc_is_matching_this_rrdset(rc, st))
489             rrdsetcalc_link(st, rc);
490     }
491 }
492
493 // this has to be called while the RRDHOST is locked
494 inline void rrdsetcalc_unlink(RRDCALC *rc) {
495     RRDSET *st = rc->rrdset;
496
497     if(!st) {
498         error("Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
499         return;
500     }
501
502     RRDHOST *host = st->rrdhost;
503
504     debug(D_HEALTH, "Health unlinking alarm '%s.%s' from chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, host->hostname);
505
506     // unlink it
507     if(rc->rrdset_prev)
508         rc->rrdset_prev->rrdset_next = rc->rrdset_next;
509
510     if(rc->rrdset_next)
511         rc->rrdset_next->rrdset_prev = rc->rrdset_prev;
512
513     if(st->alarms == rc)
514         st->alarms = rc->rrdset_next;
515
516     rc->rrdset_prev = rc->rrdset_next = NULL;
517
518     rrdvar_free(st->rrdhost, &st->variables_root_index, rc->local);
519     rc->local = NULL;
520
521     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rc->family);
522     rc->family = NULL;
523
524     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostid);
525     rc->hostid = NULL;
526
527     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostname);
528     rc->hostname = NULL;
529
530     rc->rrdset = NULL;
531
532     // RRDCALC will remain in RRDHOST
533     // so that if the matching chart is found in the future
534     // it will be applied automatically
535 }
536
537 RRDCALC *rrdcalc_find(RRDSET *st, const char *name) {
538     RRDCALC *rc;
539     uint32_t hash = simple_hash(name);
540
541     for( rc = st->alarms; rc ; rc = rc->rrdset_next ) {
542         if(rc->hash == hash && !strcmp(rc->name, name))
543             return rc;
544     }
545
546     return NULL;
547 }
548
549 static inline int rrdcalc_exists(RRDHOST *host, const char *name, uint32_t hash) {
550     RRDCALC *rc;
551
552     // make sure it does not already exist
553     for(rc = host->alarms; rc ; rc = rc->next) {
554         if (rc->hash == hash && !strcmp(name, rc->name)) {
555             error("Health alarm '%s' already exists in host '%s'.", name, host->hostname);
556             return 1;
557         }
558     }
559
560     return 0;
561 }
562
563 static inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) {
564     rrdhost_check_rdlock(host);
565
566     if(rc->calculation) {
567         rc->calculation->this = &rc->value;
568         rc->calculation->after = &rc->db_after;
569         rc->calculation->before = &rc->db_before;
570         rc->calculation->rrdcalc = rc;
571     }
572
573     if(rc->warning) {
574         rc->warning->this = &rc->value;
575         rc->warning->after = &rc->db_after;
576         rc->warning->before = &rc->db_before;
577         rc->warning->rrdcalc = rc;
578     }
579
580     if(rc->critical) {
581         rc->critical->this = &rc->value;
582         rc->critical->after = &rc->db_after;
583         rc->critical->before = &rc->db_before;
584         rc->critical->rrdcalc = rc;
585     }
586
587     // link it to the host
588     rc->next = host->alarms;
589     host->alarms = rc;
590
591     // link it to its chart
592     RRDSET *st;
593     for(st = host->rrdset_root; st ; st = st->next) {
594         if(rrdcalc_is_matching_this_rrdset(rc, st)) {
595             rrdsetcalc_link(st, rc);
596             break;
597         }
598     }
599 }
600
// Build the full name of an alarm, "<chart>.<name>" (or just "<name>" when
// chart is NULL), into fullname, sanitize it with rrdvar_fix_name() and
// return its simple_hash(). len is the size of the fullname buffer.
static inline uint32_t rrdcalc_fullname(char *fullname, size_t len, const char *chart, const char *name) {
    const char *prefix    = chart ? chart : "";
    const char *separator = chart ? "."   : "";

    snprintfz(fullname, len - 1, "%s%s%s", prefix, separator, name);
    rrdvar_fix_name(fullname);

    return simple_hash(fullname);
}
606
607 static inline RRDCALC *rrdcalc_create(RRDHOST *host, const char *name, const char *chart, const char *dimensions, int group_method,
608                         int after, int before, int update_every, uint32_t options,
609                         calculated_number green, calculated_number red,
610                         const char *exec, const char *source,
611                         const char *calc, const char *warn, const char *crit) {
612
613     char fullname[RRDVAR_MAX_LENGTH + 1];
614     uint32_t hash = rrdcalc_fullname(fullname, RRDVAR_MAX_LENGTH + 1, chart, name);
615
616     if(rrdcalc_exists(host, fullname, hash))
617         return NULL;
618
619     RRDCALC *rc = callocz(1, sizeof(RRDCALC));
620
621     rc->name = strdupz(name);
622     rc->hash = simple_hash(rc->name);
623
624     rc->chart = strdupz(chart);
625     rc->hash_chart = simple_hash(rc->chart);
626
627     if(dimensions) rc->dimensions = strdupz(dimensions);
628
629     rc->green = green;
630     rc->red = red;
631     rc->value = NAN;
632     rc->old_value = NAN;
633
634     rc->group = group_method;
635     rc->after = after;
636     rc->before = before;
637     rc->update_every = update_every;
638     rc->options = options;
639
640     if(exec) rc->exec = strdupz(exec);
641     if(source) rc->source = strdupz(source);
642
643     if(calc) {
644         rc->calculation = expression_parse(calc, NULL, NULL);
645         if(!rc->calculation)
646             error("Health alarm '%s.%s': failed to parse calculation expression '%s'", chart, name, calc);
647     }
648     if(warn) {
649         rc->warning = expression_parse(warn, NULL, NULL);
650         if(!rc->warning)
651             error("Health alarm '%s.%s': failed to re-parse warning expression '%s'", chart, name, warn);
652     }
653     if(crit) {
654         rc->critical = expression_parse(crit, NULL, NULL);
655         if(!rc->critical)
656             error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", chart, name, crit);
657     }
658
659     debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s",
660           (rc->chart)?rc->chart:"NOCHART",
661           rc->name,
662           (rc->exec)?rc->exec:"DEFAULT",
663           rc->green,
664           rc->red,
665           rc->group,
666           rc->after,
667           rc->before,
668           rc->options,
669           (rc->dimensions)?rc->dimensions:"NONE",
670           rc->update_every,
671           (rc->calculation)?rc->calculation->parsed_as:"NONE",
672           (rc->warning)?rc->warning->parsed_as:"NONE",
673           (rc->critical)?rc->critical->parsed_as:"NONE",
674           rc->source
675     );
676
677     rrdcalc_create_part2(host, rc);
678     return rc;
679 }
680
681 void rrdcalc_free(RRDHOST *host, RRDCALC *rc) {
682     if(!rc) return;
683
684     debug(D_HEALTH, "Health removing alarm '%s.%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
685
686     // unlink it from RRDSET
687     if(rc->rrdset) rrdsetcalc_unlink(rc);
688
689     // unlink it from RRDHOST
690     if(rc == host->alarms)
691         host->alarms = rc->next;
692
693     else if(host->alarms) {
694         RRDCALC *t, *last = host->alarms;
695
696         for(t = last->next; t && t != rc; last = t, t = t->next) ;
697         if(last && last->next == rc)
698             last->next = rc->next;
699         else
700             error("Cannot unlink alarm '%s.%s' from host '%s': not found", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
701     }
702     else
703         error("Cannot unlink unlink '%s.%s' from host '%s': This host does not have any calculations", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
704
705     expression_free(rc->calculation);
706     expression_free(rc->warning);
707     expression_free(rc->critical);
708
709     freez(rc->source);
710     freez(rc->name);
711     freez(rc->chart);
712     freez(rc->dimensions);
713     freez(rc->exec);
714     freez(rc);
715 }
716
717 // ----------------------------------------------------------------------------
718 // RRDCALCTEMPLATE management
719
// Instantiate, for the chart st, every alarm template of its host whose
// context is the context of the chart.
//
// Each matching template is turned into an alarm on the chart id through
// rrdcalc_create(); the expressions are handed over as their source text.
// A failed creation is logged as an error.
void rrdcalctemplate_link_matching(RRDSET *st) {
    RRDCALCTEMPLATE *rt;

    for(rt = st->rrdhost->templates; rt ; rt = rt->next) {
        // hash comparison first, strcmp() only on hash matches
        if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context)) {

            RRDCALC *rc = rrdcalc_create(st->rrdhost, rt->name, st->id,
                           rt->dimensions, rt->group, rt->after, rt->before, rt->update_every, rt->options,
                           rt->green, rt->red, rt->exec, rt->source,
                           (rt->calculation)?rt->calculation->source:NULL,
                           (rt->warning)?rt->warning->source:NULL,
                           (rt->critical)?rt->critical->source:NULL);

            if(!rc)
                error("Health tried to create alarm from template '%s', but it failed", rt->name);

            // with internal checks enabled, also verify that the new alarm
            // ended up linked to this chart; otherwise rc is only marked as used
#ifdef NETDATA_INTERNAL_CHECKS
            else if(rc->rrdset != st)
                error("Health alarm '%s.%s' should be linked to chart '%s', but it is not", rc->chart?rc->chart:"NOCHART", rc->name, st->id);
#else
            (void)rc;
#endif
        }
    }
}
745
746 static inline void rrdcalctemplate_free(RRDHOST *host, RRDCALCTEMPLATE *rt) {
747     debug(D_HEALTH, "Health removing template '%s' of host '%s'", rt->name, host->hostname);
748
749     if(host->templates) {
750         if(host->templates == rt) {
751             host->templates = rt->next;
752         }
753         else {
754             RRDCALCTEMPLATE *t, *last = host->templates;
755             for (t = last->next; t && t != rt; last = t, t = t->next ) ;
756             if(last && last->next == rt) {
757                 last->next = rt->next;
758                 rt->next = NULL;
759             }
760             else
761                 error("Cannot find RRDCALCTEMPLATE '%s' linked in host '%s'", rt->name, host->hostname);
762         }
763     }
764
765     expression_free(rt->calculation);
766     expression_free(rt->warning);
767     expression_free(rt->critical);
768
769     freez(rt->dimensions);
770     freez(rt->context);
771     freez(rt->name);
772     freez(rt->exec);
773     freez(rt->source);
774     freez(rt);
775 }
776
777 // ----------------------------------------------------------------------------
778 // load health configuration
779
780 #define HEALTH_CONF_MAX_LINE 4096
781
782 #define HEALTH_ALARM_KEY "alarm"
783 #define HEALTH_TEMPLATE_KEY "template"
784 #define HEALTH_ON_KEY "on"
785 #define HEALTH_LOOKUP_KEY "lookup"
786 #define HEALTH_CALC_KEY "calc"
787 #define HEALTH_EVERY_KEY "every"
788 #define HEALTH_GREEN_KEY "green"
789 #define HEALTH_RED_KEY "red"
790 #define HEALTH_WARN_KEY "warn"
791 #define HEALTH_CRIT_KEY "crit"
792 #define HEALTH_EXEC_KEY "exec"
793
794 static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
795     {
796         char fullname[RRDVAR_MAX_LENGTH + 1];
797         uint32_t hash = rrdcalc_fullname(fullname, RRDVAR_MAX_LENGTH + 1, rc->chart, rc->name);
798
799         if (rrdcalc_exists(host, fullname, hash))
800             return 0;
801     }
802
803     if(!rc->chart) {
804         error("Health configuration for alarm '%s' does not have a chart", rc->name);
805         return 0;
806     }
807
808     if(!rc->update_every) {
809         error("Health configuration for alarm '%s.%s' has no frequency (parameter 'every'). Ignoring it.", rc->chart?rc->chart:"NOCHART", rc->name);
810         return 0;
811     }
812
813     if(!RRDCALC_HAS_DB_LOOKUP(rc) && !rc->warning && !rc->critical) {
814         error("Health configuration for alarm '%s.%s' is useless (no calculation, no warning and no critical evaluation)", rc->chart?rc->chart:"NOCHART", rc->name);
815         return 0;
816     }
817
818     debug(D_HEALTH, "Health configuration adding alarm '%s.%s': exec '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s",
819           rc->chart?rc->chart:"NOCHART",
820           rc->name,
821           (rc->exec)?rc->exec:"DEFAULT",
822           rc->green,
823           rc->red,
824           rc->group,
825           rc->after,
826           rc->before,
827           rc->options,
828           (rc->dimensions)?rc->dimensions:"NONE",
829           rc->update_every,
830           (rc->calculation)?rc->calculation->parsed_as:"NONE",
831           (rc->warning)?rc->warning->parsed_as:"NONE",
832           (rc->critical)?rc->critical->parsed_as:"NONE",
833           rc->source
834     );
835
836     rrdcalc_create_part2(host, rc);
837     return 1;
838 }
839
840 static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCALCTEMPLATE *rt) {
841     if(!rt->context) {
842         error("Health configuration for template '%s' does not have a context", rt->name);
843         return 0;
844     }
845
846     if(!rt->update_every) {
847         error("Health configuration for template '%s' has no frequency (parameter 'every'). Ignoring it.", rt->name);
848         return 0;
849     }
850
851     if(!RRDCALCTEMPLATE_HAS_CALCULATION(rt) && !rt->warning && !rt->critical) {
852         error("Health configuration for template '%s' is useless (no calculation, no warning and no critical evaluation)", rt->name);
853         return 0;
854     }
855
856     RRDCALCTEMPLATE *t;
857     for (t = host->templates; t ; t = t->next) {
858         if(t->hash_name == rt->hash_name && !strcmp(t->name, rt->name)) {
859             error("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname);
860             return 0;
861         }
862     }
863
864     debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s'",
865           rt->name,
866           (rt->context)?rt->context:"NONE",
867           (rt->exec)?rt->exec:"DEFAULT",
868           rt->green,
869           rt->red,
870           rt->group,
871           rt->after,
872           rt->before,
873           rt->options,
874           (rt->dimensions)?rt->dimensions:"NONE",
875           rt->update_every,
876           (rt->calculation)?rt->calculation->parsed_as:"NONE",
877           (rt->warning)?rt->warning->parsed_as:"NONE",
878           (rt->critical)?rt->critical->parsed_as:"NONE",
879           rt->source
880     );
881
882     rt->next = host->templates;
883     host->templates = rt;
884     return 1;
885 }
886
// Parse a duration such as "10", "-5m", "1.5h" or "2d" into seconds.
//
// The string must start with a digit, '+' or '-'. The number may be followed
// by a unit: Y (365 days), M (30 days), w, d, h, m or s. A missing or unknown
// unit is treated as seconds. Fractions of a second are truncated.
//
// Returns 1 and stores the seconds in *result on success.
// Returns 0 and stores 0 in *result when the string is not a number or the
// duration does not fit in an int.
static inline int health_parse_duration(char *string, int *result) {
    *result = 0;

    // make sure it looks like a number
    // (the cast is required: <ctype.h> functions are undefined for negative chars)
    unsigned char first = (unsigned char)*string;
    if(!first || !(isdigit(first) || first == '+' || first == '-'))
        return 0;

    char *e = NULL;
    long double n = strtold(string, &e);

    // nothing was converted, e.g. a lonely "+" or "-"
    if(e == string)
        return 0;

    long double seconds;
    switch(*e) {
        case 'Y':
            seconds = n * 86400 * 365;
            break;

        case 'M':
            seconds = n * 86400 * 30;
            break;

        case 'w':
            seconds = n * 86400 * 7;
            break;

        case 'd':
            seconds = n * 86400;
            break;

        case 'h':
            seconds = n * 3600;
            break;

        case 'm':
            seconds = n * 60;
            break;

        default:    // no unit, or an unknown one
        case 's':
            seconds = n;
            break;
    }

    // Converting a long double that an int cannot represent is undefined
    // behavior, so reject such values. The comparison is also false for NaN;
    // strtold() accepts "inf" and "nan" after a sign.
    const long double max_seconds = (long double)(~0U >> 1);    // the largest int
    if(!(seconds >= -max_seconds && seconds <= max_seconds))
        return 0;

    *result = (int)seconds;
    return 1;
}
928
929 static inline int health_parse_db_lookup(
930         size_t line, const char *path, const char *file, char *string,
931         int *group_method, int *after, int *before, int *every,
932         uint32_t *options, char **dimensions
933 ) {
934     debug(D_HEALTH, "Health configuration parsing database lookup %zu@%s/%s: %s", line, path, file, string);
935
936     if(*dimensions) freez(*dimensions);
937     *dimensions = NULL;
938     *after = 0;
939     *before = 0;
940     *every = 0;
941     *options = 0;
942
943     char *s = string, *key;
944
945     // first is the group method
946     key = s;
947     while(*s && !isspace(*s)) s++;
948     while(*s && isspace(*s)) *s++ = '\0';
949     if(!*s) {
950         error("Health configuration invalid chart calculation at line %zu of file '%s/%s': expected group method followed by the 'after' time, but got '%s'",
951               line, path, file, key);
952         return 0;
953     }
954
955     if((*group_method = web_client_api_request_v1_data_group(key, -1)) == -1) {
956         error("Health configuration at line %zu of file '%s/%s': invalid group method '%s'",
957               line, path, file, key);
958         return 0;
959     }
960
961     // then is the 'after' time
962     key = s;
963     while(*s && !isspace(*s)) s++;
964     while(*s && isspace(*s)) *s++ = '\0';
965
966     if(!health_parse_duration(key, after)) {
967         error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' after group method",
968               line, path, file, key);
969         return 0;
970     }
971
972     // sane defaults
973     *every = abs(*after);
974
975     // now we may have optional parameters
976     while(*s) {
977         key = s;
978         while(*s && !isspace(*s)) s++;
979         while(*s && isspace(*s)) *s++ = '\0';
980         if(!*key) break;
981
982         if(!strcasecmp(key, "at")) {
983             char *value = s;
984             while(*s && !isspace(*s)) s++;
985             while(*s && isspace(*s)) *s++ = '\0';
986
987             if (!health_parse_duration(value, before)) {
988                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
989                       line, path, file, value, key);
990             }
991         }
992         else if(!strcasecmp(key, HEALTH_EVERY_KEY)) {
993             char *value = s;
994             while(*s && !isspace(*s)) s++;
995             while(*s && isspace(*s)) *s++ = '\0';
996
997             if (!health_parse_duration(value, every)) {
998                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
999                       line, path, file, value, key);
1000             }
1001         }
1002         else if(!strcasecmp(key, "absolute") || !strcasecmp(key, "abs") || !strcasecmp(key, "absolute_sum")) {
1003             *options |= RRDR_OPTION_ABSOLUTE;
1004         }
1005         else if(!strcasecmp(key, "min2max")) {
1006             *options |= RRDR_OPTION_MIN2MAX;
1007         }
1008         else if(!strcasecmp(key, "null2zero")) {
1009             *options |= RRDR_OPTION_NULL2ZERO;
1010         }
1011         else if(!strcasecmp(key, "percentage")) {
1012             *options |= RRDR_OPTION_PERCENTAGE;
1013         }
1014         else if(!strcasecmp(key, "unaligned")) {
1015             *options |= RRDR_OPTION_NOT_ALIGNED;
1016         }
1017         else if(!strcasecmp(key, "of")) {
1018             if(*s && strcasecmp(s, "all"))
1019                *dimensions = strdupz(s);
1020             break;
1021         }
1022         else {
1023             error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
1024                   line, path, file, key);
1025         }
1026     }
1027
1028     return 1;
1029 }
1030
// Build the "LINE@PATH/FILENAME" string that identifies where an alarm or a
// template has been defined. Returns a copy made with strdupz().
static inline char *health_source_file(size_t line, const char *path, const char *filename) {
    char source[FILENAME_MAX + 1];

    snprintfz(source, FILENAME_MAX, "%zu@%s/%s", line, path, filename);

    char *copy = strdupz(source);
    return copy;
}
1036
1037 int health_readfile(const char *path, const char *filename) {
1038     debug(D_HEALTH, "Health configuration reading file '%s/%s'", path, filename);
1039
1040     static uint32_t hash_alarm = 0, hash_template = 0, hash_on = 0, hash_calc = 0, hash_green = 0, hash_red = 0, hash_warn = 0, hash_crit = 0, hash_exec = 0, hash_every = 0, hash_lookup = 0;
1041     char buffer[HEALTH_CONF_MAX_LINE + 1];
1042
1043     if(unlikely(!hash_alarm)) {
1044         hash_alarm = simple_uhash(HEALTH_ALARM_KEY);
1045         hash_template = simple_uhash(HEALTH_TEMPLATE_KEY);
1046         hash_on = simple_uhash(HEALTH_ON_KEY);
1047         hash_calc = simple_uhash(HEALTH_CALC_KEY);
1048         hash_lookup = simple_uhash(HEALTH_LOOKUP_KEY);
1049         hash_green = simple_uhash(HEALTH_GREEN_KEY);
1050         hash_red = simple_uhash(HEALTH_RED_KEY);
1051         hash_warn = simple_uhash(HEALTH_WARN_KEY);
1052         hash_crit = simple_uhash(HEALTH_CRIT_KEY);
1053         hash_exec = simple_uhash(HEALTH_EXEC_KEY);
1054         hash_every = simple_uhash(HEALTH_EVERY_KEY);
1055     }
1056
1057     snprintfz(buffer, HEALTH_CONF_MAX_LINE, "%s/%s", path, filename);
1058     FILE *fp = fopen(buffer, "r");
1059     if(!fp) {
1060         error("Health configuration cannot read file '%s'.", buffer);
1061         return 0;
1062     }
1063
1064     RRDCALC *rc = NULL;
1065     RRDCALCTEMPLATE *rt = NULL;
1066
1067     size_t line = 0, append = 0;
1068     char *s;
1069     while((s = fgets(&buffer[append], (int)(HEALTH_CONF_MAX_LINE - append), fp)) || append) {
1070         int stop_appending = !s;
1071         line++;
1072         // info("Line %zu of file '%s/%s': '%s'", line, path, filename, s);
1073         s = trim(buffer);
1074         if(!s) continue;
1075         // info("Trimmed line %zu of file '%s/%s': '%s'", line, path, filename, s);
1076
1077         append = strlen(s);
1078         if(!stop_appending && s[append - 1] == '\\') {
1079             s[append - 1] = ' ';
1080             append = &s[append] - buffer;
1081             if(append < HEALTH_CONF_MAX_LINE)
1082                 continue;
1083             else {
1084                 error("Health configuration has too long muli-line at line %zu of file '%s/%s'.", line, path, filename);
1085             }
1086         }
1087         append = 0;
1088
1089         char *key = s;
1090         while(*s && *s != ':') s++;
1091         if(!*s) {
1092             error("Health configuration has invalid line %zu of file '%s/%s'. It does not contain a ':'. Ignoring it.", line, path, filename);
1093             continue;
1094         }
1095         *s = '\0';
1096         s++;
1097
1098         char *value = s;
1099         key = trim(key);
1100         value = trim(value);
1101
1102         if(!key) {
1103             error("Health configuration has invalid line %zu of file '%s/%s'. Keyword is empty. Ignoring it.", line, path, filename);
1104             continue;
1105         }
1106
1107         if(!value) {
1108             error("Health configuration has invalid line %zu of file '%s/%s'. value is empty. Ignoring it.", line, path, filename);
1109             continue;
1110         }
1111
1112         // info("Health file '%s/%s', key '%s', value '%s'", path, filename, key, value);
1113         uint32_t hash = simple_uhash(key);
1114
1115         if(hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) {
1116             if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
1117                 rrdcalc_free(&localhost, rc);
1118
1119             if(rt) {
1120                 if (!rrdcalctemplate_add_template_from_config(&localhost, rt))
1121                     rrdcalctemplate_free(&localhost, rt);
1122                 rt = NULL;
1123             }
1124
1125             rc = callocz(1, sizeof(RRDCALC));
1126             rc->name = strdupz(value);
1127             rc->hash = simple_hash(rc->name);
1128             rc->source = health_source_file(line, path, filename);
1129             rc->green = NAN;
1130             rc->red = NAN;
1131             rc->value = NAN;
1132             rc->old_value = NAN;
1133
1134             if(rrdvar_fix_name(rc->name))
1135                 error("Health configuration renamed alarm '%s' to '%s'", value, rc->name);
1136         }
1137         else if(hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY)) {
1138             if(rc) {
1139                 if(!rrdcalc_add_alarm_from_config(&localhost, rc))
1140                     rrdcalc_free(&localhost, rc);
1141                 rc = NULL;
1142             }
1143
1144             if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
1145                 rrdcalctemplate_free(&localhost, rt);
1146
1147             rt = callocz(1, sizeof(RRDCALCTEMPLATE));
1148             rt->name = strdupz(value);
1149             rt->hash_name = simple_hash(rt->name);
1150             rt->source = health_source_file(line, path, filename);
1151             rt->green = NAN;
1152             rt->red = NAN;
1153
1154             if(rrdvar_fix_name(rt->name))
1155                 error("Health configuration renamed template '%s' to '%s'", value, rt->name);
1156         }
1157         else if(rc) {
1158             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
1159                 if(rc->chart) {
1160                     if(strcmp(rc->chart, value))
1161                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1162                              line, path, filename, rc->name, key, rc->chart, value, value);
1163
1164                     freez(rc->chart);
1165                 }
1166                 rc->chart = strdupz(value);
1167                 rc->hash_chart = simple_hash(rc->chart);
1168             }
1169             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
1170                 health_parse_db_lookup(line, path, filename, value, &rc->group, &rc->after, &rc->before,
1171                                        &rc->update_every,
1172                                        &rc->options, &rc->dimensions);
1173             }
1174             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
1175                 if(!health_parse_duration(value, &rc->update_every))
1176                     info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.",
1177                          line, path, filename, rc->name, key, value);
1178             }
1179             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
1180                 char *e;
1181                 rc->green = strtold(value, &e);
1182                 if(e && *e) {
1183                     info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1184                          line, path, filename, rc->name, key, e);
1185                 }
1186             }
1187             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
1188                 char *e;
1189                 rc->red = strtold(value, &e);
1190                 if(e && *e) {
1191                     info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1192                          line, path, filename, rc->name, key, e);
1193                 }
1194             }
1195             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
1196                 const char *failed_at = NULL;
1197                 int error = 0;
1198                 rc->calculation = expression_parse(value, &failed_at, &error);
1199                 if(!rc->calculation) {
1200                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1201                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1202                 }
1203             }
1204             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
1205                 const char *failed_at = NULL;
1206                 int error = 0;
1207                 rc->warning = expression_parse(value, &failed_at, &error);
1208                 if(!rc->warning) {
1209                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1210                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1211                 }
1212             }
1213             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
1214                 const char *failed_at = NULL;
1215                 int error = 0;
1216                 rc->critical = expression_parse(value, &failed_at, &error);
1217                 if(!rc->critical) {
1218                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1219                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1220                 }
1221             }
1222             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
1223                 if(rc->exec) {
1224                     if(strcmp(rc->exec, value))
1225                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1226                              line, path, filename, rc->name, key, rc->exec, value, value);
1227
1228                     freez(rc->exec);
1229                 }
1230                 rc->exec = strdupz(value);
1231             }
1232             else {
1233                 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has unknown key '%s'.",
1234                      line, path, filename, rc->name, key);
1235             }
1236         }
1237         else if(rt) {
1238             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
1239                 if(rt->context) {
1240                     if(strcmp(rt->context, value))
1241                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1242                              line, path, filename, rt->name, key, rt->context, value, value);
1243
1244                     freez(rt->context);
1245                 }
1246                 rt->context = strdupz(value);
1247                 rt->hash_context = simple_hash(rt->context);
1248             }
1249             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
1250                 health_parse_db_lookup(line, path, filename, value, &rt->group, &rt->after, &rt->before,
1251                                        &rt->update_every,
1252                                        &rt->options, &rt->dimensions);
1253             }
1254             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
1255                 if(!health_parse_duration(value, &rt->update_every))
1256                     info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' cannot parse duration: '%s'.",
1257                          line, path, filename, rt->name, key, value);
1258             }
1259             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
1260                 char *e;
1261                 rt->green = strtold(value, &e);
1262                 if(e && *e) {
1263                     info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
1264                          line, path, filename, rt->name, key, e);
1265                 }
1266             }
1267             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
1268                 char *e;
1269                 rt->red = strtold(value, &e);
1270                 if(e && *e) {
1271                     info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
1272                          line, path, filename, rt->name, key, e);
1273                 }
1274             }
1275             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
1276                 const char *failed_at = NULL;
1277                 int error = 0;
1278                 rt->calculation = expression_parse(value, &failed_at, &error);
1279                 if(!rt->calculation) {
1280                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1281                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1282                 }
1283             }
1284             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
1285                 const char *failed_at = NULL;
1286                 int error = 0;
1287                 rt->warning = expression_parse(value, &failed_at, &error);
1288                 if(!rt->warning) {
1289                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1290                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1291                 }
1292             }
1293             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
1294                 const char *failed_at = NULL;
1295                 int error = 0;
1296                 rt->critical = expression_parse(value, &failed_at, &error);
1297                 if(!rt->critical) {
1298                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1299                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1300                 }
1301             }
1302             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
1303                 if(rt->exec) {
1304                     if(strcmp(rt->exec, value))
1305                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1306                              line, path, filename, rt->name, key, rt->exec, value, value);
1307
1308                     freez(rt->exec);
1309                 }
1310                 rt->exec = strdupz(value);
1311             }
1312             else {
1313                 error("Health configuration at line %zu of file '%s/%s' for template '%s' has unknown key '%s'.",
1314                       line, path, filename, rt->name, key);
1315             }
1316         }
1317         else {
1318             error("Health configuration at line %zu of file '%s/%s' has unknown key '%s'. Expected either '" HEALTH_ALARM_KEY "' or '" HEALTH_TEMPLATE_KEY "'.",
1319                   line, path, filename, key);
1320         }
1321     }
1322
1323     if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
1324         rrdcalc_free(&localhost, rc);
1325
1326     if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
1327         rrdcalctemplate_free(&localhost, rt);
1328
1329     fclose(fp);
1330     return 1;
1331 }
1332
1333 void health_readdir(const char *path) {
1334     size_t pathlen = strlen(path);
1335
1336     debug(D_HEALTH, "Health configuration reading directory '%s'", path);
1337
1338     DIR *dir = opendir(path);
1339     if (!dir) {
1340         error("Health configuration cannot open directory '%s'.", path);
1341         return;
1342     }
1343
1344     struct dirent *de = NULL;
1345     while ((de = readdir(dir))) {
1346         size_t len = strlen(de->d_name);
1347
1348         if(de->d_type == DT_DIR
1349            && (
1350                    (de->d_name[0] == '.' && de->d_name[1] == '\0')
1351                    || (de->d_name[0] == '.' && de->d_name[1] == '.' && de->d_name[2] == '\0')
1352            ))
1353             continue;
1354
1355         else if(de->d_type == DT_DIR) {
1356             char *s = mallocz(pathlen + strlen(de->d_name) + 2);
1357             strcpy(s, path);
1358             strcat(s, "/");
1359             strcat(s, de->d_name);
1360             health_readdir(s);
1361             freez(s);
1362             continue;
1363         }
1364
1365         else if((de->d_type == DT_LNK || de->d_type == DT_REG) &&
1366                 len > 5 && !strcmp(&de->d_name[len - 5], ".conf")) {
1367             health_readfile(path, de->d_name);
1368         }
1369     }
1370
1371     closedir(dir);
1372 }
1373
1374 static inline char *health_config_dir(void) {
1375     char buffer[FILENAME_MAX + 1];
1376     snprintfz(buffer, FILENAME_MAX, "%s/health.d", config_get("global", "config directory", CONFIG_DIR));
1377     return config_get("health", "health configuration directory", buffer);
1378 }
1379
1380 void health_init(void) {
1381     debug(D_HEALTH, "Health configuration initializing");
1382
1383     if(!(health_enabled = config_get_boolean("health", "enabled", 1))) {
1384         debug(D_HEALTH, "Health is disabled.");
1385         return;
1386     }
1387
1388     char *path = health_config_dir();
1389
1390     {
1391         char buffer[FILENAME_MAX + 1];
1392         snprintfz(buffer, FILENAME_MAX, "%s/alarm-email.sh", config_get("global", "plugins directory", PLUGINS_DIR));
1393         health_default_exec = config_get("health", "script to execute on alarm", buffer);
1394     }
1395
1396     long n = config_get_number("health", "in memory max health log entries", (long)localhost.health_log.max);
1397     if(n < 2) {
1398         error("Health configuration has invalid max log entries %ld. Using default %u", n, localhost.health_log.max);
1399         config_set_number("health", "in memory max health log entries", (long)localhost.health_log.max);
1400     }
1401     else localhost.health_log.max = (unsigned int)n;
1402
1403     rrdhost_rwlock(&localhost);
1404     health_readdir(path);
1405     rrdhost_unlock(&localhost);
1406 }
1407
1408 // ----------------------------------------------------------------------------
1409 // JSON generation
1410
1411 static inline void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) {
1412     if(value && *value)
1413         buffer_sprintf(wb, "%s\"%s\":\"%s\"%s", prefix, label, value, suffix);
1414     else
1415         buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix);
1416 }
1417
1418 static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae) {
1419     buffer_sprintf(wb, "\n\t{\n"
1420                            "\t\t\"id\":%u,\n"
1421                            "\t\t\"name\":\"%s\",\n"
1422                            "\t\t\"chart\":\"%s\",\n"
1423                            "\t\t\"family\":\"%s\",\n"
1424                            "\t\t\"processed\":%s,\n"
1425                            "\t\t\"updated\":%s,\n"
1426                            "\t\t\"exec_run\":%s,\n"
1427                            "\t\t\"exec_failed\":%s,\n"
1428                            "\t\t\"exec\":\"%s\",\n"
1429                            "\t\t\"exec_code\":%d,\n"
1430                            "\t\t\"source\":\"%s\",\n"
1431                            "\t\t\"when\":%lu,\n"
1432                            "\t\t\"duration\":%lu,\n"
1433                            "\t\t\"non_clear_duration\":%lu,\n"
1434                            "\t\t\"status\":\"%s\",\n"
1435                            "\t\t\"old_status\":\"%s\",\n",
1436                    ae->id,
1437                    ae->name,
1438                    ae->chart,
1439                    ae->family,
1440                    (ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_PROCESSED)?"true":"false",
1441                    (ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_UPDATED)?"true":"false",
1442                    (ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_EXEC_RUN)?"true":"false",
1443                    (ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_EXEC_FAILED)?"true":"false",
1444                    ae->exec?ae->exec:health_default_exec,
1445                    ae->exec_code,
1446                    ae->source,
1447                    (unsigned long)ae->when,
1448                    (unsigned long)ae->duration,
1449                    (unsigned long)ae->non_clear_duration,
1450                    rrdcalc_status2string(ae->new_status),
1451                    rrdcalc_status2string(ae->old_status)
1452     );
1453
1454     buffer_strcat(wb, "\t\t\"value\":");
1455     buffer_rrd_value(wb, ae->new_value);
1456     buffer_strcat(wb, ",\n");
1457
1458     buffer_strcat(wb, "\t\t\"old_value\":");
1459     buffer_rrd_value(wb, ae->old_value);
1460     buffer_strcat(wb, "\n");
1461
1462     buffer_strcat(wb, "\t}");
1463 }
1464
1465 void health_alarm_log2json(RRDHOST *host, BUFFER *wb) {
1466     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
1467
1468     buffer_strcat(wb, "[");
1469
1470     unsigned int max = host->health_log.max;
1471     unsigned int count = 0;
1472     ALARM_ENTRY *ae;
1473     for(ae = host->health_log.alarms; ae && count < max ; count++, ae = ae->next) {
1474         if(likely(count)) buffer_strcat(wb, ",");
1475         health_alarm_entry2json_nolock(wb, ae);
1476     }
1477
1478     buffer_strcat(wb, "\n]\n");
1479
1480     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
1481 }
1482
1483 static inline void health_rrdcalc2json_nolock(BUFFER *wb, RRDCALC *rc) {
1484     buffer_sprintf(wb,
1485            "\t\t\"%s.%s\": {\n"
1486                    "\t\t\t\"name\": \"%s\",\n"
1487                    "\t\t\t\"chart\": \"%s\",\n"
1488                    "\t\t\t\"family\": \"%s\",\n"
1489                    "\t\t\t\"active\": %s,\n"
1490                    "\t\t\t\"exec\": \"%s\",\n"
1491                    "\t\t\t\"source\": \"%s\",\n"
1492                    "\t\t\t\"status\": \"%s\",\n"
1493                    "\t\t\t\"last_status_change\": %lu,\n"
1494                    "\t\t\t\"last_updated\": %lu,\n"
1495                    "\t\t\t\"next_update\": %lu,\n"
1496                    "\t\t\t\"update_every\": %d,\n"
1497             , rc->chart, rc->name
1498             , rc->name
1499             , rc->chart
1500             , (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:""
1501             , (rc->rrdset)?"true":"false"
1502             , rc->exec?rc->exec:health_default_exec
1503             , rc->source
1504             , rrdcalc_status2string(rc->status)
1505             , (unsigned long)rc->last_status_change
1506             , (unsigned long)rc->last_updated
1507             , (unsigned long)rc->next_update
1508             , rc->update_every
1509     );
1510
1511     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
1512         if(rc->dimensions && *rc->dimensions)
1513             health_string2json(wb, "\t\t\t", "lookup_dimensions", rc->dimensions, ",\n");
1514
1515         buffer_sprintf(wb,
1516                        "\t\t\t\"db_after\": %lu,\n"
1517                        "\t\t\t\"db_before\": %lu,\n"
1518                        "\t\t\t\"lookup_method\": \"%s\",\n"
1519                        "\t\t\t\"lookup_after\": %d,\n"
1520                        "\t\t\t\"lookup_before\": %d,\n"
1521                        "\t\t\t\"lookup_options\": \"",
1522                        (unsigned long) rc->db_after,
1523                        (unsigned long) rc->db_before,
1524                        group_method2string(rc->group),
1525                        rc->after,
1526                        rc->before
1527         );
1528         buffer_data_options2string(wb, rc->options);
1529         buffer_strcat(wb, "\",\n");
1530     }
1531
1532     if(rc->calculation) {
1533         health_string2json(wb, "\t\t\t", "calc", rc->calculation->source, ",\n");
1534         health_string2json(wb, "\t\t\t", "calc_parsed", rc->calculation->parsed_as, ",\n");
1535     }
1536
1537     if(rc->warning) {
1538         health_string2json(wb, "\t\t\t", "warn", rc->warning->source, ",\n");
1539         health_string2json(wb, "\t\t\t", "warn_parsed", rc->warning->parsed_as, ",\n");
1540     }
1541
1542     if(rc->critical) {
1543         health_string2json(wb, "\t\t\t", "crit", rc->critical->source, ",\n");
1544         health_string2json(wb, "\t\t\t", "crit_parsed", rc->critical->parsed_as, ",\n");
1545     }
1546
1547     buffer_strcat(wb, "\t\t\t\"green\":");
1548     buffer_rrd_value(wb, rc->green);
1549     buffer_strcat(wb, ",\n");
1550
1551     buffer_strcat(wb, "\t\t\t\"red\":");
1552     buffer_rrd_value(wb, rc->red);
1553     buffer_strcat(wb, ",\n");
1554
1555     buffer_strcat(wb, "\t\t\t\"value\":");
1556     buffer_rrd_value(wb, rc->value);
1557     buffer_strcat(wb, "\n");
1558
1559     buffer_strcat(wb, "\t\t}");
1560 }
1561
1562 //void health_rrdcalctemplate2json_nolock(BUFFER *wb, RRDCALCTEMPLATE *rt) {
1563 //
1564 //}
1565
1566 void health_alarms2json(RRDHOST *host, BUFFER *wb, int all) {
1567     int i;
1568     rrdhost_rdlock(&localhost);
1569
1570     buffer_strcat(wb, "{\n\t\"alarms\": {\n");
1571     RRDCALC *rc;
1572     for(i = 0, rc = host->alarms; rc ; rc = rc->next) {
1573         if(!rc->rrdset)
1574             continue;
1575
1576         if(!all && !(rc->status == RRDCALC_STATUS_WARNING || rc->status == RRDCALC_STATUS_CRITICAL))
1577             continue;
1578
1579         if(likely(i)) buffer_strcat(wb, ",\n");
1580         health_rrdcalc2json_nolock(wb, rc);
1581         i++;
1582     }
1583
1584 //    buffer_strcat(wb, "\n\t},\n\t\"templates\": {");
1585
1586 //    RRDCALCTEMPLATE *rt;
1587 //    for(rt = host->templates; rt ; rt = rt->next)
1588 //        health_rrdcalctemplate2json_nolock(wb, rt);
1589
1590     buffer_sprintf(wb, "\n\t},\n\t\"now\": %lu\n}\n", (unsigned long)time(NULL));
1591     rrdhost_unlock(&localhost);
1592 }
1593
1594
1595 // ----------------------------------------------------------------------------
1596 // re-load health configuration
1597
1598 static inline void health_free_all_nolock(RRDHOST *host) {
1599     while(host->templates)
1600         rrdcalctemplate_free(host, host->templates);
1601
1602     while(host->alarms)
1603         rrdcalc_free(host, host->alarms);
1604 }
1605
1606 void health_reload(void) {
1607     if(!health_enabled) {
1608         error("Health reload is requested, but health is not enabled.");
1609         return;
1610     }
1611
1612     char *path = health_config_dir();
1613
1614     rrdhost_rwlock(&localhost);
1615     health_free_all_nolock(&localhost);
1616     rrdhost_unlock(&localhost);
1617
1618     RRDSET *st;
1619     for(st = localhost.rrdset_root; st ; st = st->next) {
1620         st->green = NAN;
1621         st->red = NAN;
1622     }
1623
1624     rrdhost_rwlock(&localhost);
1625     health_readdir(path);
1626     rrdhost_unlock(&localhost);
1627
1628     for(st = localhost.rrdset_root; st ; st = st->next) {
1629         rrdhost_rwlock(&localhost);
1630
1631         rrdsetcalc_link_matching(st);
1632         rrdcalctemplate_link_matching(st);
1633
1634         rrdhost_unlock(&localhost);
1635     }
1636 }
1637
1638
1639 // ----------------------------------------------------------------------------
1640 // health main thread and friends
1641
1642 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
1643     if (unlikely(!rc->rrdset)) {
1644         debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
1645         return 0;
1646     }
1647
1648     if (unlikely(!rc->update_every)) {
1649         debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
1650         return 0;
1651     }
1652
1653     if (unlikely(rc->next_update > now)) {
1654         if (*next_run > rc->next_update)
1655             *next_run = rc->next_update;
1656
1657         debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
1658         return 0;
1659     }
1660
1661     return 1;
1662 }
1663
1664 static inline int rrdcalc_value2status(calculated_number n) {
1665     if(isnan(n)) return RRDCALC_STATUS_UNDEFINED;
1666     if(n) return RRDCALC_STATUS_RAISED;
1667     return RRDCALC_STATUS_CLEAR;
1668 }
1669
1670 static inline void health_alarm_execute(ALARM_ENTRY *ae) {
1671     if(ae->old_status == RRDCALC_STATUS_UNINITIALIZED && ae->new_status == RRDCALC_STATUS_CLEAR)
1672         return;
1673
1674     char buffer[FILENAME_MAX + 1];
1675     pid_t command_pid;
1676
1677     const char *exec = ae->exec;
1678     if(!exec) exec = health_default_exec;
1679
1680     snprintfz(buffer, FILENAME_MAX, "exec %s '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u'",
1681               exec,
1682               ae->name,
1683               ae->chart?ae->chart:"NOCAHRT",
1684               ae->family?ae->family:"NOFAMILY",
1685               rrdcalc_status2string(ae->new_status),
1686               rrdcalc_status2string(ae->old_status),
1687               ae->new_value,
1688               ae->old_value,
1689               ae->source?ae->source:"UNKNOWN",
1690               (uint32_t)ae->duration,
1691               (uint32_t)ae->non_clear_duration
1692     );
1693
1694     ae->notifications |= HEALTH_ENTRY_NOTIFICATIONS_EXEC_RUN;
1695
1696     debug(D_HEALTH, "executing command '%s'", buffer);
1697     FILE *fp = mypopen(buffer, &command_pid);
1698     if(!fp) {
1699         error("HEALTH: Cannot popen(\"%s\", \"r\").", buffer);
1700         return;
1701     }
1702     debug(D_HEALTH, "HEALTH reading from command");
1703     char *s = fgets(buffer, FILENAME_MAX, fp);
1704     (void)s;
1705     debug(D_HEALTH, "HEALTH closing command");
1706     ae->exec_code = mypclose(fp, command_pid);
1707     debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
1708
1709     if(ae->exec_code != 0)
1710         ae->notifications |= HEALTH_ENTRY_NOTIFICATIONS_EXEC_FAILED;
1711 }
1712
1713 static inline void health_process_notifications(ALARM_ENTRY *ae) {
1714     info("Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
1715          ae->chart?ae->chart:"NOCHART", ae->name,
1716          ae->new_value,
1717          rrdcalc_status2string(ae->old_status),
1718          rrdcalc_status2string(ae->new_status)
1719     );
1720
1721     health_alarm_execute(ae);
1722 }
1723
1724 static inline void health_alarm_log(RRDHOST *host, time_t when,
1725                 const char *name, const char *chart, const char *family,
1726                 const char *exec, time_t duration,
1727                 calculated_number old_value, calculated_number new_value,
1728                 int old_status, int new_status,
1729                 const char *source
1730 ) {
1731     ALARM_ENTRY *ae = callocz(1, sizeof(ALARM_ENTRY));
1732     ae->name = strdupz(name);
1733     ae->hash_name = simple_hash(ae->name);
1734
1735     if(chart) {
1736         ae->chart = strdupz(chart);
1737         ae->hash_chart = simple_hash(ae->chart);
1738     }
1739
1740     if(family)
1741         ae->family = strdupz(family);
1742
1743     if(exec) ae->exec = strdupz(exec);
1744     if(source) ae->source = strdupz(source);
1745
1746     ae->id = host->health_log.nextid++;
1747     ae->when = when;
1748     ae->old_value = old_value;
1749     ae->new_value = new_value;
1750     ae->old_status = old_status;
1751     ae->new_status = new_status;
1752     ae->duration = duration;
1753
1754     if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL)
1755         ae->non_clear_duration += ae->duration;
1756
1757     // link it
1758     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
1759     ae->next = host->health_log.alarms;
1760     host->health_log.alarms = ae;
1761     host->health_log.count++;
1762     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
1763
1764     // match previous alarms
1765     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
1766     ALARM_ENTRY *t;
1767     for(t = host->health_log.alarms ; t ; t = t->next) {
1768         if(t != ae &&
1769                 t->hash_name == ae->hash_name &&
1770                 t->hash_chart == ae->hash_chart &&
1771                 !strcmp(t->name, ae->name) &&
1772                 t->chart && ae->chart && !strcmp(t->chart, ae->chart)) {
1773
1774             if(!(t->notifications & HEALTH_ENTRY_NOTIFICATIONS_UPDATED) && !t->updated_by) {
1775                 t->notifications |= HEALTH_ENTRY_NOTIFICATIONS_UPDATED;
1776                 t->updated_by = ae;
1777
1778                 if((t->new_status == RRDCALC_STATUS_WARNING || t->new_status == RRDCALC_STATUS_CRITICAL) &&
1779                    (t->old_status == RRDCALC_STATUS_WARNING || t->old_status == RRDCALC_STATUS_CRITICAL))
1780                     ae->non_clear_duration += t->non_clear_duration;
1781             }
1782             else {
1783                 // no need to continue
1784                 break;
1785             }
1786         }
1787     }
1788     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
1789 }
1790
1791 static inline void health_alarm_log_process(RRDHOST *host) {
1792     static uint32_t last_processed = 0;
1793     ALARM_ENTRY *ae;
1794
1795     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
1796
1797     for(ae = host->health_log.alarms; ae ;ae = ae->next) {
1798         if(last_processed >= ae->id) break;
1799
1800         if(!(ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_PROCESSED) &&
1801                 !(ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_UPDATED)) {
1802             ae->notifications |= HEALTH_ENTRY_NOTIFICATIONS_PROCESSED;
1803             health_process_notifications(ae);
1804         }
1805     }
1806
1807     if(host->health_log.alarms)
1808         last_processed = host->health_log.alarms->id;
1809
1810     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
1811
1812     if(host->health_log.count <= host->health_log.max)
1813         return;
1814
1815     // cleanup excess entries in the log
1816     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
1817
1818     ALARM_ENTRY *last = NULL;
1819     unsigned int count = host->health_log.max;
1820     for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
1821
1822     if(ae && last && last->next == ae)
1823         last->next = NULL;
1824     else
1825         ae = NULL;
1826
1827     while(ae) {
1828         ALARM_ENTRY *t = ae->next;
1829
1830         freez(ae->family);
1831         freez(ae->chart);
1832         freez(ae->name);
1833         freez(ae->exec);
1834         freez(ae);
1835
1836         ae = t;
1837     }
1838
1839     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
1840 }
1841
1842 void *health_main(void *ptr) {
1843     (void)ptr;
1844
1845     info("HEALTH thread created with task id %d", gettid());
1846
1847     if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
1848         error("Cannot set pthread cancel type to DEFERRED.");
1849
1850     if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
1851         error("Cannot set pthread cancel state to ENABLE.");
1852
1853     int min_run_every = (int)config_get_number("health", "run at least every seconds", 10);
1854     if(min_run_every < 1) min_run_every = 1;
1855
1856     BUFFER *wb = buffer_create(100);
1857
1858     unsigned int loop = 0;
1859     while(health_enabled) {
1860         loop++;
1861         debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
1862
1863         int oldstate, runnable = 0;
1864         time_t now = time(NULL);
1865         time_t next_run = now + min_run_every;
1866         RRDCALC *rc;
1867
1868         if (unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
1869             error("Cannot set pthread cancel state to DISABLE.");
1870
1871         rrdhost_rdlock(&localhost);
1872
1873         // the first loop is to lookup values from the db
1874         for (rc = localhost.alarms; rc; rc = rc->next) {
1875             if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run)))
1876                 continue;
1877
1878             runnable++;
1879             rc->old_value = rc->value;
1880
1881             // 1. if there is database lookup, do it
1882             // 2. if there is calculation expression, run it
1883
1884             if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
1885                 time_t old_db_timestamp = rc->db_before;
1886                 int value_is_null = 0;
1887
1888                 int ret = rrd2value(rc->rrdset, wb, &rc->value,
1889                                     rc->dimensions, 1, rc->after, rc->before, rc->group,
1890                                     rc->options, &rc->db_after, &rc->db_before, &value_is_null);
1891
1892                 if (unlikely(ret != 200)) {
1893                     // database lookup failed
1894                     rc->value = NAN;
1895
1896                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
1897
1898                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) {
1899                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
1900                         error("Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
1901                     }
1902                 }
1903                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))
1904                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
1905
1906                 if (unlikely(old_db_timestamp == rc->db_before)) {
1907                     // database is stale
1908
1909                     debug(D_HEALTH, "Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
1910
1911                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
1912                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
1913                         error("Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
1914                     }
1915                 }
1916                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
1917                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
1918
1919                 if (unlikely(value_is_null)) {
1920                     // collected value is null
1921
1922                     rc->value = NAN;
1923
1924                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
1925                           rc->chart?rc->chart:"NOCHART", rc->name);
1926
1927                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) {
1928                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
1929                         error("Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
1930                               rc->chart?rc->chart:"NOCHART", rc->name);
1931                     }
1932                 }
1933                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))
1934                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
1935
1936                 debug(D_HEALTH, "Health alarm '%s.%s': database lookup gave value "
1937                         CALCULATED_NUMBER_FORMAT, rc->chart?rc->chart:"NOCHART", rc->name, rc->value);
1938             }
1939
1940             if(unlikely(rc->calculation)) {
1941                 if (unlikely(!expression_evaluate(rc->calculation))) {
1942                     // calculation failed
1943
1944                     rc->value = NAN;
1945
1946                     debug(D_HEALTH, "Health alarm '%s.%s': failed to evaluate calculation with error: %s",
1947                           rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->calculation->error_msg));
1948
1949                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) {
1950                         rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
1951                         error("Health alarm '%s.%s': failed to evaluate calculation with error: %s",
1952                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->calculation->error_msg));
1953                     }
1954                 }
1955                 else {
1956                     if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))
1957                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
1958
1959                     debug(D_HEALTH, "Health alarm '%s.%s': calculation expression gave value "
1960                             CALCULATED_NUMBER_FORMAT
1961                             ": %s (source: %s)",
1962                           rc->chart?rc->chart:"NOCHART", rc->name,
1963                           rc->calculation->result,
1964                           buffer_tostring(rc->calculation->error_msg),
1965                           rc->source
1966                     );
1967
1968                     rc->value = rc->calculation->result;
1969                 }
1970             }
1971         }
1972         rrdhost_unlock(&localhost);
1973
1974         if (runnable) {
1975             rrdhost_rdlock(&localhost);
1976
1977             for (rc = localhost.alarms; rc; rc = rc->next) {
1978                 if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run)))
1979                     continue;
1980
1981                 int warning_status  = RRDCALC_STATUS_UNDEFINED;
1982                 int critical_status = RRDCALC_STATUS_UNDEFINED;
1983
1984                 if(unlikely(rc->warning)) {
1985                     if(unlikely(!expression_evaluate(rc->warning))) {
1986                         // calculation failed
1987
1988                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression failed with error: %s",
1989                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
1990
1991                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) {
1992                             rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
1993                             error("Health alarm '%s.%s': warning expression failed with error: %s",
1994                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
1995                         }
1996                     }
1997                     else {
1998                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))
1999                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
2000
2001                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression gave value "
2002                                 CALCULATED_NUMBER_FORMAT
2003                                 ": %s (source: %s)",
2004                               rc->chart?rc->chart:"NOCHART", rc->name,
2005                               rc->warning->result,
2006                               buffer_tostring(rc->warning->error_msg),
2007                               rc->source
2008                         );
2009
2010                         warning_status = rrdcalc_value2status(rc->warning->result);
2011                     }
2012                 }
2013
2014                 if(unlikely(rc->critical)) {
2015                     if(unlikely(!expression_evaluate(rc->critical))) {
2016                         // calculation failed
2017
2018                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression failed with error: %s",
2019                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
2020
2021                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) {
2022                             rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
2023                             error("Health alarm '%s.%s': critical expression failed with error: %s",
2024                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
2025                         }
2026                     }
2027                     else {
2028                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))
2029                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
2030
2031                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression gave value "
2032                                 CALCULATED_NUMBER_FORMAT
2033                                 ": %s (source: %s)",
2034                               rc->chart?rc->chart:"NOCHART", rc->name,
2035                               rc->critical->result,
2036                               buffer_tostring(rc->critical->error_msg),
2037                               rc->source
2038                         );
2039
2040                         critical_status = rrdcalc_value2status(rc->critical->result);
2041                     }
2042                 }
2043
2044                 int status = RRDCALC_STATUS_UNDEFINED;
2045
2046                 switch(warning_status) {
2047                     case RRDCALC_STATUS_CLEAR:
2048                         status = RRDCALC_STATUS_CLEAR;
2049                         break;
2050
2051                     case RRDCALC_STATUS_RAISED:
2052                         status = RRDCALC_STATUS_WARNING;
2053                         break;
2054
2055                     default:
2056                         break;
2057                 }
2058
2059                 switch(critical_status) {
2060                     case RRDCALC_STATUS_CLEAR:
2061                         if(status == RRDCALC_STATUS_UNDEFINED)
2062                             status = RRDCALC_STATUS_CLEAR;
2063                         break;
2064
2065                     case RRDCALC_STATUS_RAISED:
2066                         status = RRDCALC_STATUS_CRITICAL;
2067                         break;
2068
2069                     default:
2070                         break;
2071                 }
2072
2073                 if(status != rc->status) {
2074                     health_alarm_log(&localhost, time(NULL), rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source);
2075                     rc->last_status_change = now;
2076                     rc->status = status;
2077                 }
2078
2079                 rc->last_updated = now;
2080                 rc->next_update = now + rc->update_every;
2081
2082                 if (next_run > rc->next_update)
2083                     next_run = rc->next_update;
2084             }
2085
2086             rrdhost_unlock(&localhost);
2087         }
2088
2089         if (unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
2090             error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
2091
2092         // execute notifications
2093         // and cleanup
2094         health_alarm_log_process(&localhost);
2095
2096         now = time(NULL);
2097         if(now < next_run) {
2098             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs",
2099                   loop, (int) (next_run - now));
2100             sleep_usec(1000000 * (unsigned long long) (next_run - now));
2101         }
2102         else {
2103             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
2104         }
2105     }
2106
2107     buffer_free(wb);
2108
2109     info("HEALTH thread exiting");
2110     pthread_exit(NULL);
2111     return NULL;
2112 }