]> arthur.barton.de Git - netdata.git/blob - src/health.c
every host has its own health
[netdata.git] / src / health.c
1 #define NETDATA_HEALTH_INTERNALS
2 #include "common.h"
3
4 #define RRDVAR_MAX_LENGTH 1024
5
6 int health_enabled = 1;
7
8 // ----------------------------------------------------------------------------
9 // RRDVAR management
10
// Sanitize a variable name in place.
//
// Every character that is not alphanumeric, '.' or '_' is overwritten
// with '_'. The string length never changes.
//
// Returns the number of characters that were replaced.
//
// This is a plain external definition, so it links the same way regardless
// of which inline model (gnu89 / C99) the compiler applies.
int rrdvar_fix_name(char *variable) {
    int fixed = 0;

    for( ; *variable ; variable++) {
        // <ctype.h> classifiers are only defined for values representable
        // as unsigned char (or EOF); a plain char may be negative for
        // bytes >= 0x80, so convert before classifying.
        unsigned char c = (unsigned char)*variable;

        if(!isalnum(c) && c != '.' && c != '_') {
            *variable = '_';
            fixed++;
        }
    }

    return fixed;
}
24
25 int rrdvar_compare(void* a, void* b) {
26     if(((RRDVAR *)a)->hash < ((RRDVAR *)b)->hash) return -1;
27     else if(((RRDVAR *)a)->hash > ((RRDVAR *)b)->hash) return 1;
28     else return strcmp(((RRDVAR *)a)->name, ((RRDVAR *)b)->name);
29 }
30
31 static inline RRDVAR *rrdvar_index_add(avl_tree_lock *tree, RRDVAR *rv) {
32     RRDVAR *ret = (RRDVAR *)avl_insert_lock(tree, (avl *)(rv));
33     if(ret != rv)
34         debug(D_VARIABLES, "Request to insert RRDVAR '%s' into index failed. Already exists.", rv->name);
35
36     return ret;
37 }
38
39 static inline RRDVAR *rrdvar_index_del(avl_tree_lock *tree, RRDVAR *rv) {
40     RRDVAR *ret = (RRDVAR *)avl_remove_lock(tree, (avl *)(rv));
41     if(!ret)
42         error("Request to remove RRDVAR '%s' from index failed. Not Found.", rv->name);
43
44     return ret;
45 }
46
47 static inline RRDVAR *rrdvar_index_find(avl_tree_lock *tree, const char *name, uint32_t hash) {
48     RRDVAR tmp;
49     tmp.name = (char *)name;
50     tmp.hash = (hash)?hash:simple_hash(tmp.name);
51
52     return (RRDVAR *)avl_search_lock(tree, (avl *)&tmp);
53 }
54
55 static inline void rrdvar_free(RRDHOST *host, avl_tree_lock *tree, RRDVAR *rv) {
56     (void)host;
57
58     if(!rv) return;
59
60     if(tree) {
61         debug(D_VARIABLES, "Deleting variable '%s'", rv->name);
62         if(unlikely(!rrdvar_index_del(tree, rv)))
63             error("Attempted to delete variable '%s' from host '%s', but it is not found.", rv->name, host->hostname);
64     }
65
66     freez(rv->name);
67     freez(rv);
68 }
69
70 static inline RRDVAR *rrdvar_create_and_index(const char *scope, avl_tree_lock *tree, const char *name, int type, void *value) {
71     char *variable = strdupz(name);
72     rrdvar_fix_name(variable);
73     uint32_t hash = simple_hash(variable);
74
75     RRDVAR *rv = rrdvar_index_find(tree, variable, hash);
76     if(unlikely(!rv)) {
77         debug(D_VARIABLES, "Variable '%s' not found in scope '%s'. Creating a new one.", variable, scope);
78
79         rv = callocz(1, sizeof(RRDVAR));
80         rv->name = variable;
81         rv->hash = hash;
82         rv->type = type;
83         rv->value = value;
84
85         RRDVAR *ret = rrdvar_index_add(tree, rv);
86         if(unlikely(ret != rv)) {
87             debug(D_VARIABLES, "Variable '%s' in scope '%s' already exists", variable, scope);
88             rrdvar_free(NULL, NULL, rv);
89             rv = NULL;
90         }
91         else
92             debug(D_VARIABLES, "Variable '%s' created in scope '%s'", variable, scope);
93     }
94     else {
95         debug(D_VARIABLES, "Variable '%s' is already found in scope '%s'.", variable, scope);
96
97         // already exists
98         freez(variable);
99
100         // this is important
101         // it must return NULL - not the existing variable - or double-free will happen
102         rv = NULL;
103     }
104
105     return rv;
106 }
107
108 // ----------------------------------------------------------------------------
109 // CUSTOM VARIABLES
110
111 RRDVAR *rrdvar_custom_host_variable_create(RRDHOST *host, const char *name) {
112     calculated_number *v = callocz(1, sizeof(calculated_number));
113     *v = NAN;
114     RRDVAR *rv = rrdvar_create_and_index("host", &host->variables_root_index, name, RRDVAR_TYPE_CALCULATED_ALLOCATED, v);
115     if(unlikely(!rv)) {
116         free(v);
117         error("Requested variable '%s' already exists - possibly 2 plugins will be updating it at the same time", name);
118
119         char *variable = strdupz(name);
120         rrdvar_fix_name(variable);
121         uint32_t hash = simple_hash(variable);
122
123         rv = rrdvar_index_find(&host->variables_root_index, variable, hash);
124     }
125
126     return rv;
127 }
128
129 void rrdvar_custom_host_variable_destroy(RRDHOST *host, const char *name) {
130     char *variable = strdupz(name);
131     rrdvar_fix_name(variable);
132     uint32_t hash = simple_hash(variable);
133
134     RRDVAR *rv = rrdvar_index_find(&host->variables_root_index, variable, hash);
135     freez(variable);
136
137     if(!rv) {
138         error("Attempted to remove variable '%s' from host '%s', but it does not exist.", name, host->hostname);
139         return;
140     }
141
142     if(rv->type != RRDVAR_TYPE_CALCULATED_ALLOCATED) {
143         error("Attempted to remove variable '%s' from host '%s', but it does not a custom allocated variable.", name, host->hostname);
144         return;
145     }
146
147     if(!rrdvar_index_del(&host->variables_root_index, rv)) {
148         error("Attempted to remove variable '%s' from host '%s', but it cannot be found.", name, host->hostname);
149         return;
150     }
151
152     freez(rv->name);
153     freez(rv->value);
154     freez(rv);
155 }
156
157 void rrdvar_custom_host_variable_set(RRDVAR *rv, calculated_number value) {
158     if(rv->type != RRDVAR_TYPE_CALCULATED_ALLOCATED)
159         error("requested to set variable '%s' to value " CALCULATED_NUMBER_FORMAT " but the variable is not a custom one.", rv->name, value);
160     else {
161         calculated_number *v = rv->value;
162         *v = value;
163     }
164 }
165
166 // ----------------------------------------------------------------------------
167 // RRDVAR lookup
168
169 static calculated_number rrdvar2number(RRDVAR *rv) {
170     switch(rv->type) {
171         case RRDVAR_TYPE_CALCULATED_ALLOCATED:
172         case RRDVAR_TYPE_CALCULATED: {
173             calculated_number *n = (calculated_number *)rv->value;
174             return *n;
175         }
176
177         case RRDVAR_TYPE_TIME_T: {
178             time_t *n = (time_t *)rv->value;
179             return *n;
180         }
181
182         case RRDVAR_TYPE_COLLECTED: {
183             collected_number *n = (collected_number *)rv->value;
184             return *n;
185         }
186
187         case RRDVAR_TYPE_TOTAL: {
188             total_number *n = (total_number *)rv->value;
189             return *n;
190         }
191
192         case RRDVAR_TYPE_INT: {
193             int *n = (int *)rv->value;
194             return *n;
195         }
196
197         default:
198             error("I don't know how to convert RRDVAR type %d to calculated_number", rv->type);
199             return NAN;
200     }
201 }
202
203 int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, calculated_number *result) {
204     RRDSET *st = rc->rrdset;
205     RRDVAR *rv;
206
207     if(!st) return 0;
208
209     rv = rrdvar_index_find(&st->variables_root_index, variable, hash);
210     if(rv) {
211         *result = rrdvar2number(rv);
212         return 1;
213     }
214
215     rv = rrdvar_index_find(&st->rrdfamily->variables_root_index, variable, hash);
216     if(rv) {
217         *result = rrdvar2number(rv);
218         return 1;
219     }
220
221     rv = rrdvar_index_find(&st->rrdhost->variables_root_index, variable, hash);
222     if(rv) {
223         *result = rrdvar2number(rv);
224         return 1;
225     }
226
227     return 0;
228 }
229
230 // ----------------------------------------------------------------------------
231 // RRDVAR to JSON
232
233 struct variable2json_helper {
234     BUFFER *buf;
235     size_t counter;
236 };
237
238 static int single_variable2json(void *entry, void *data) {
239     struct variable2json_helper *helper = (struct variable2json_helper *)data;
240     RRDVAR *rv = (RRDVAR *)entry;
241     calculated_number value = rrdvar2number(rv);
242
243     if(unlikely(isnan(value) || isinf(value)))
244         buffer_sprintf(helper->buf, "%s\n\t\t\"%s\": null", helper->counter?",":"", rv->name);
245     else
246         buffer_sprintf(helper->buf, "%s\n\t\t\"%s\": %0.5Lf", helper->counter?",":"", rv->name, (long double)value);
247
248     helper->counter++;
249
250     return 0;
251 }
252
253 void health_api_v1_chart_variables2json(RRDSET *st, BUFFER *buf) {
254     struct variable2json_helper helper = {
255             .buf = buf,
256             .counter = 0
257     };
258
259     buffer_sprintf(buf, "{\n\t\"chart\": \"%s\",\n\t\"chart_name\": \"%s\",\n\t\"chart_context\": \"%s\",\n\t\"chart_variables\": {", st->id, st->name, st->context);
260     avl_traverse_lock(&st->variables_root_index, single_variable2json, (void *)&helper);
261     buffer_sprintf(buf, "\n\t},\n\t\"family\": \"%s\",\n\t\"family_variables\": {", st->family);
262     helper.counter = 0;
263     avl_traverse_lock(&st->rrdfamily->variables_root_index, single_variable2json, (void *)&helper);
264     buffer_sprintf(buf, "\n\t},\n\t\"host\": \"%s\",\n\t\"host_variables\": {", st->rrdhost->hostname);
265     helper.counter = 0;
266     avl_traverse_lock(&st->rrdhost->variables_root_index, single_variable2json, (void *)&helper);
267     buffer_strcat(buf, "\n\t}\n}\n");
268 }
269
270
271 // ----------------------------------------------------------------------------
272 // RRDDIMVAR management
273 // DIMENSION VARIABLES
274
275 #define RRDDIMVAR_ID_MAX 1024
276
277 static inline void rrddimvar_free_variables(RRDDIMVAR *rs) {
278     RRDDIM *rd = rs->rrddim;
279     RRDSET *st = rd->rrdset;
280
281     // CHART VARIABLES FOR THIS DIMENSION
282
283     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->var_local_id);
284     rs->var_local_id = NULL;
285
286     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->var_local_name);
287     rs->var_local_name = NULL;
288
289     // FAMILY VARIABLES FOR THIS DIMENSION
290
291     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_id);
292     rs->var_family_id = NULL;
293
294     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_name);
295     rs->var_family_name = NULL;
296
297     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_contextid);
298     rs->var_family_contextid = NULL;
299
300     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_contextname);
301     rs->var_family_contextname = NULL;
302
303     // HOST VARIABLES FOR THIS DIMENSION
304
305     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartidid);
306     rs->var_host_chartidid = NULL;
307
308     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartidname);
309     rs->var_host_chartidname = NULL;
310
311     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartnameid);
312     rs->var_host_chartnameid = NULL;
313
314     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartnamename);
315     rs->var_host_chartnamename = NULL;
316
317     // KEYS
318
319     freez(rs->key_id);
320     rs->key_id = NULL;
321
322     freez(rs->key_name);
323     rs->key_name = NULL;
324
325     freez(rs->key_fullidid);
326     rs->key_fullidid = NULL;
327
328     freez(rs->key_fullidname);
329     rs->key_fullidname = NULL;
330
331     freez(rs->key_contextid);
332     rs->key_contextid = NULL;
333
334     freez(rs->key_contextname);
335     rs->key_contextname = NULL;
336
337     freez(rs->key_fullnameid);
338     rs->key_fullnameid = NULL;
339
340     freez(rs->key_fullnamename);
341     rs->key_fullnamename = NULL;
342 }
343
344 static inline void rrddimvar_create_variables(RRDDIMVAR *rs) {
345     rrddimvar_free_variables(rs);
346
347     RRDDIM *rd = rs->rrddim;
348     RRDSET *st = rd->rrdset;
349
350     char buffer[RRDDIMVAR_ID_MAX + 1];
351
352     // KEYS
353
354     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->id, rs->suffix);
355     rs->key_id = strdupz(buffer);
356
357     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
358     rs->key_name = strdupz(buffer);
359
360     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->key_id);
361     rs->key_fullidid = strdupz(buffer);
362
363     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->key_name);
364     rs->key_fullidname = strdupz(buffer);
365
366     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->context, rs->key_id);
367     rs->key_contextid = strdupz(buffer);
368
369     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->context, rs->key_name);
370     rs->key_contextname = strdupz(buffer);
371
372     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->key_id);
373     rs->key_fullnameid = strdupz(buffer);
374
375     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->key_name);
376     rs->key_fullnamename = strdupz(buffer);
377
378     // CHART VARIABLES FOR THIS DIMENSION
379     // -----------------------------------
380     //
381     // dimensions are available as:
382     // - $id
383     // - $name
384
385     rs->var_local_id           = rrdvar_create_and_index("local", &st->variables_root_index, rs->key_id, rs->type, rs->value);
386     rs->var_local_name         = rrdvar_create_and_index("local", &st->variables_root_index, rs->key_name, rs->type, rs->value);
387
388     // FAMILY VARIABLES FOR THIS DIMENSION
389     // -----------------------------------
390     //
391     // dimensions are available as:
392     // - $id                 (only the first, when multiple overlap)
393     // - $name               (only the first, when multiple overlap)
394     // - $chart-context.id
395     // - $chart-context.name
396
397     rs->var_family_id          = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_id, rs->type, rs->value);
398     rs->var_family_name        = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_name, rs->type, rs->value);
399     rs->var_family_contextid   = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_contextid, rs->type, rs->value);
400     rs->var_family_contextname = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_contextname, rs->type, rs->value);
401
402     // HOST VARIABLES FOR THIS DIMENSION
403     // -----------------------------------
404     //
405     // dimensions are available as:
406     // - $chart-id.id
407     // - $chart-id.name
408     // - $chart-name.id
409     // - $chart-name.name
410
411     rs->var_host_chartidid      = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullidid, rs->type, rs->value);
412     rs->var_host_chartidname    = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullidname, rs->type, rs->value);
413     rs->var_host_chartnameid    = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullnameid, rs->type, rs->value);
414     rs->var_host_chartnamename  = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullnamename, rs->type, rs->value);
415 }
416
417 RRDDIMVAR *rrddimvar_create(RRDDIM *rd, int type, const char *prefix, const char *suffix, void *value, uint32_t options) {
418     RRDSET *st = rd->rrdset;
419
420     debug(D_VARIABLES, "RRDDIMSET create for chart id '%s' name '%s', dimension id '%s', name '%s%s%s'", st->id, st->name, rd->id, (prefix)?prefix:"", rd->name, (suffix)?suffix:"");
421
422     if(!prefix) prefix = "";
423     if(!suffix) suffix = "";
424
425     RRDDIMVAR *rs = (RRDDIMVAR *)callocz(1, sizeof(RRDDIMVAR));
426
427     rs->prefix = strdupz(prefix);
428     rs->suffix = strdupz(suffix);
429
430     rs->type = type;
431     rs->value = value;
432     rs->options = options;
433     rs->rrddim = rd;
434
435     rs->next = rd->variables;
436     rd->variables = rs;
437
438     rrddimvar_create_variables(rs);
439
440     return rs;
441 }
442
443 void rrddimvar_rename_all(RRDDIM *rd) {
444     RRDSET *st = rd->rrdset;
445     debug(D_VARIABLES, "RRDDIMSET rename for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
446
447     RRDDIMVAR *rs, *next = rd->variables;
448     while((rs = next)) {
449         next = rs->next;
450         rrddimvar_create_variables(rs);
451     }
452 }
453
454 void rrddimvar_free(RRDDIMVAR *rs) {
455     RRDDIM *rd = rs->rrddim;
456     RRDSET *st = rd->rrdset;
457     debug(D_VARIABLES, "RRDDIMSET free for chart id '%s' name '%s', dimension id '%s', name '%s', prefix='%s', suffix='%s'", st->id, st->name, rd->id, rd->name, rs->prefix, rs->suffix);
458
459     rrddimvar_free_variables(rs);
460
461     if(rd->variables == rs) {
462         debug(D_VARIABLES, "RRDDIMSET removing first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
463         rd->variables = rs->next;
464     }
465     else {
466         debug(D_VARIABLES, "RRDDIMSET removing non-first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
467         RRDDIMVAR *t;
468         for (t = rd->variables; t && t->next != rs; t = t->next) ;
469         if(!t) error("RRDDIMVAR '%s' not found in dimension '%s/%s' variables linked list", rs->key_name, st->id, rd->id);
470         else t->next = rs->next;
471     }
472
473     freez(rs->prefix);
474     freez(rs->suffix);
475     freez(rs);
476 }
477
478 // ----------------------------------------------------------------------------
479 // RRDSETVAR management
480 // CHART VARIABLES
481
482 static inline void rrdsetvar_free_variables(RRDSETVAR *rs) {
483     RRDSET *st = rs->rrdset;
484
485     // CHART
486
487     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->var_local);
488     rs->var_local = NULL;
489
490     // FAMILY
491
492     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family);
493     rs->var_family = NULL;
494
495     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host);
496     rs->var_host = NULL;
497
498     // HOST
499
500     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_name);
501     rs->var_family_name = NULL;
502
503     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_name);
504     rs->var_host_name = NULL;
505
506     // KEYS
507
508     freez(rs->key_fullid);
509     rs->key_fullid = NULL;
510
511     freez(rs->key_fullname);
512     rs->key_fullname = NULL;
513 }
514
515 static inline void rrdsetvar_create_variables(RRDSETVAR *rs) {
516     rrdsetvar_free_variables(rs);
517
518     RRDSET *st = rs->rrdset;
519
520     // KEYS
521
522     char buffer[RRDVAR_MAX_LENGTH + 1];
523     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->id, rs->variable);
524     rs->key_fullid = strdupz(buffer);
525
526     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rs->variable);
527     rs->key_fullname = strdupz(buffer);
528
529     // CHART
530
531     rs->var_local       = rrdvar_create_and_index("local",  &st->variables_root_index,               rs->variable, rs->type, rs->value);
532
533     // FAMILY
534
535     rs->var_family      = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index,    rs->key_fullid,   rs->type, rs->value);
536     rs->var_family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index,    rs->key_fullname, rs->type, rs->value);
537
538     // HOST
539
540     rs->var_host        = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index,      rs->key_fullid,   rs->type, rs->value);
541     rs->var_host_name   = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index,      rs->key_fullname, rs->type, rs->value);
542
543 }
544
545 RRDSETVAR *rrdsetvar_create(RRDSET *st, const char *variable, int type, void *value, uint32_t options) {
546     debug(D_VARIABLES, "RRDVARSET create for chart id '%s' name '%s' with variable name '%s'", st->id, st->name, variable);
547     RRDSETVAR *rs = (RRDSETVAR *)callocz(1, sizeof(RRDSETVAR));
548
549     rs->variable = strdupz(variable);
550     rs->type = type;
551     rs->value = value;
552     rs->options = options;
553     rs->rrdset = st;
554
555     rs->next = st->variables;
556     st->variables = rs;
557
558     rrdsetvar_create_variables(rs);
559
560     return rs;
561 }
562
563 void rrdsetvar_rename_all(RRDSET *st) {
564     debug(D_VARIABLES, "RRDSETVAR rename for chart id '%s' name '%s'", st->id, st->name);
565
566     RRDSETVAR *rs, *next = st->variables;
567     while((rs = next)) {
568         next = rs->next;
569         rrdsetvar_create_variables(rs);
570     }
571
572     rrdsetcalc_link_matching(st);
573 }
574
575 void rrdsetvar_free(RRDSETVAR *rs) {
576     RRDSET *st = rs->rrdset;
577     debug(D_VARIABLES, "RRDSETVAR free for chart id '%s' name '%s', variable '%s'", st->id, st->name, rs->variable);
578
579     if(st->variables == rs) {
580         st->variables = rs->next;
581     }
582     else {
583         RRDSETVAR *t;
584         for (t = st->variables; t && t->next != rs; t = t->next);
585         if(!t) error("RRDSETVAR '%s' not found in chart '%s' variables linked list", rs->key_fullname, st->id);
586         else t->next = rs->next;
587     }
588
589     rrdsetvar_free_variables(rs);
590
591     freez(rs->variable);
592     freez(rs);
593 }
594
595 // ----------------------------------------------------------------------------
596 // RRDCALC management
597
598 inline const char *rrdcalc_status2string(int status) {
599     switch(status) {
600         case RRDCALC_STATUS_REMOVED:
601             return "REMOVED";
602
603         case RRDCALC_STATUS_UNDEFINED:
604             return "UNDEFINED";
605
606         case RRDCALC_STATUS_UNINITIALIZED:
607             return "UNINITIALIZED";
608
609         case RRDCALC_STATUS_CLEAR:
610             return "CLEAR";
611
612         case RRDCALC_STATUS_RAISED:
613             return "RAISED";
614
615         case RRDCALC_STATUS_WARNING:
616             return "WARNING";
617
618         case RRDCALC_STATUS_CRITICAL:
619             return "CRITICAL";
620
621         default:
622             error("Unknown alarm status %d", status);
623             return "UNKNOWN";
624     }
625 }
626
627 static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
628     debug(D_HEALTH, "Health linking alarm '%s.%s' to chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, st->rrdhost->hostname);
629
630     rc->last_status_change = now_realtime_sec();
631     rc->rrdset = st;
632
633     rc->rrdset_next = st->alarms;
634     rc->rrdset_prev = NULL;
635     
636     if(rc->rrdset_next)
637         rc->rrdset_next->rrdset_prev = rc;
638
639     st->alarms = rc;
640
641     if(rc->update_every < rc->rrdset->update_every) {
642         error("Health alarm '%s.%s' has update every %d, less than chart update every %d. Setting alarm update frequency to %d.", rc->rrdset->id, rc->name, rc->update_every, rc->rrdset->update_every, rc->rrdset->update_every);
643         rc->update_every = rc->rrdset->update_every;
644     }
645
646     if(!isnan(rc->green) && isnan(st->green)) {
647         debug(D_HEALTH, "Health alarm '%s.%s' green threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->green, rc->green);
648         st->green = rc->green;
649     }
650
651     if(!isnan(rc->red) && isnan(st->red)) {
652         debug(D_HEALTH, "Health alarm '%s.%s' red threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->red, rc->red);
653         st->red = rc->red;
654     }
655
656     rc->local  = rrdvar_create_and_index("local",  &st->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
657     rc->family = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
658
659     char fullname[RRDVAR_MAX_LENGTH + 1];
660     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->id, rc->name);
661     rc->hostid   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
662
663     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rc->name);
664     rc->hostname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
665
666         if(!rc->units) rc->units = strdupz(st->units);
667
668     {
669         time_t now = now_realtime_sec();
670         health_alarm_log(
671                 st->rrdhost,
672                 rc->id,
673                 rc->next_event_id++,
674                 now,
675                 rc->name,
676                 rc->rrdset->id,
677                 rc->rrdset->family,
678                 rc->exec,
679                 rc->recipient,
680                 now - rc->last_status_change,
681                 rc->old_value,
682                 rc->value,
683                 rc->status,
684                 RRDCALC_STATUS_UNINITIALIZED,
685                 rc->source,
686                 rc->units,
687                 rc->info,
688                 0,
689                 0
690         );
691     }
692 }
693
694 static inline int rrdcalc_is_matching_this_rrdset(RRDCALC *rc, RRDSET *st) {
695     if(     (rc->hash_chart == st->hash      && !strcmp(rc->chart, st->id)) ||
696             (rc->hash_chart == st->hash_name && !strcmp(rc->chart, st->name)))
697         return 1;
698
699     return 0;
700 }
701
702 // this has to be called while the RRDHOST is locked
703 inline void rrdsetcalc_link_matching(RRDSET *st) {
704     // debug(D_HEALTH, "find matching alarms for chart '%s'", st->id);
705
706     RRDCALC *rc;
707     for(rc = st->rrdhost->alarms; rc ; rc = rc->next) {
708         if(unlikely(rc->rrdset))
709             continue;
710
711         if(unlikely(rrdcalc_is_matching_this_rrdset(rc, st)))
712             rrdsetcalc_link(st, rc);
713     }
714 }
715
716 // this has to be called while the RRDHOST is locked
717 inline void rrdsetcalc_unlink(RRDCALC *rc) {
718     RRDSET *st = rc->rrdset;
719
720     if(!st) {
721         debug(D_HEALTH, "Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
722         error("Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
723         return;
724     }
725
726     {
727         time_t now = now_realtime_sec();
728         health_alarm_log(
729                 st->rrdhost,
730                 rc->id,
731                 rc->next_event_id++,
732                 now,
733                 rc->name,
734                 rc->rrdset->id,
735                 rc->rrdset->family,
736                 rc->exec,
737                 rc->recipient,
738                 now - rc->last_status_change,
739                 rc->old_value,
740                 rc->value,
741                 rc->status,
742                 RRDCALC_STATUS_REMOVED,
743                 rc->source,
744                 rc->units,
745                 rc->info,
746                 0,
747                 0
748         );
749     }
750
751     RRDHOST *host = st->rrdhost;
752
753     debug(D_HEALTH, "Health unlinking alarm '%s.%s' from chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, host->hostname);
754
755     // unlink it
756     if(rc->rrdset_prev)
757         rc->rrdset_prev->rrdset_next = rc->rrdset_next;
758
759     if(rc->rrdset_next)
760         rc->rrdset_next->rrdset_prev = rc->rrdset_prev;
761
762     if(st->alarms == rc)
763         st->alarms = rc->rrdset_next;
764
765     rc->rrdset_prev = rc->rrdset_next = NULL;
766
767     rrdvar_free(st->rrdhost, &st->variables_root_index, rc->local);
768     rc->local = NULL;
769
770     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rc->family);
771     rc->family = NULL;
772
773     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostid);
774     rc->hostid = NULL;
775
776     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostname);
777     rc->hostname = NULL;
778
779     rc->rrdset = NULL;
780
781     // RRDCALC will remain in RRDHOST
782     // so that if the matching chart is found in the future
783     // it will be applied automatically
784 }
785
786 RRDCALC *rrdcalc_find(RRDSET *st, const char *name) {
787     RRDCALC *rc;
788     uint32_t hash = simple_hash(name);
789
790     for( rc = st->alarms; rc ; rc = rc->rrdset_next ) {
791         if(unlikely(rc->hash == hash && !strcmp(rc->name, name)))
792             return rc;
793     }
794
795     return NULL;
796 }
797
798 inline int rrdcalc_exists(RRDHOST *host, const char *chart, const char *name, uint32_t hash_chart, uint32_t hash_name) {
799     RRDCALC *rc;
800
801     if(unlikely(!chart)) {
802         error("attempt to find RRDCALC '%s' without giving a chart name", name);
803         return 1;
804     }
805
806     if(unlikely(!hash_chart)) hash_chart = simple_hash(chart);
807     if(unlikely(!hash_name))  hash_name  = simple_hash(name);
808
809     // make sure it does not already exist
810     for(rc = host->alarms; rc ; rc = rc->next) {
811         if (unlikely(rc->chart && rc->hash == hash_name && rc->hash_chart == hash_chart && !strcmp(name, rc->name) && !strcmp(chart, rc->chart))) {
812             debug(D_HEALTH, "Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
813             error("Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
814             return 1;
815         }
816     }
817
818     return 0;
819 }
820
821 inline uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const char *name, uint32_t *next_event_id) {
822     if(chart && name) {
823         uint32_t hash_chart = simple_hash(chart);
824         uint32_t hash_name = simple_hash(name);
825
826         // re-use old IDs, by looking them up in the alarm log
827         ALARM_ENTRY *ae;
828         for(ae = host->health_log.alarms; ae ;ae = ae->next) {
829             if(unlikely(ae->hash_name == hash_name && ae->hash_chart == hash_chart && !strcmp(name, ae->name) && !strcmp(chart, ae->chart))) {
830                 if(next_event_id) *next_event_id = ae->alarm_event_id + 1;
831                 return ae->alarm_id;
832             }
833         }
834     }
835
836     return host->health_log.next_alarm_id++;
837 }
838
839 inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) {
840     rrdhost_check_rdlock(host);
841
842     if(rc->calculation) {
843         rc->calculation->status = &rc->status;
844         rc->calculation->this = &rc->value;
845         rc->calculation->after = &rc->db_after;
846         rc->calculation->before = &rc->db_before;
847         rc->calculation->rrdcalc = rc;
848     }
849
850     if(rc->warning) {
851         rc->warning->status = &rc->status;
852         rc->warning->this = &rc->value;
853         rc->warning->after = &rc->db_after;
854         rc->warning->before = &rc->db_before;
855         rc->warning->rrdcalc = rc;
856     }
857
858     if(rc->critical) {
859         rc->critical->status = &rc->status;
860         rc->critical->this = &rc->value;
861         rc->critical->after = &rc->db_after;
862         rc->critical->before = &rc->db_before;
863         rc->critical->rrdcalc = rc;
864     }
865
866     // link it to the host
867     if(likely(host->alarms)) {
868         // append it
869         RRDCALC *t;
870         for(t = host->alarms; t && t->next ; t = t->next) ;
871         t->next = rc;
872     }
873     else {
874         host->alarms = rc;
875     }
876
877     // link it to its chart
878     RRDSET *st;
879     for(st = host->rrdset_root; st ; st = st->next) {
880         if(rrdcalc_is_matching_this_rrdset(rc, st)) {
881             rrdsetcalc_link(st, rc);
882             break;
883         }
884     }
885 }
886
887 static inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart) {
888
889     debug(D_HEALTH, "Health creating dynamic alarm (from template) '%s.%s'", chart, rt->name);
890
891     if(rrdcalc_exists(host, chart, rt->name, 0, 0))
892         return NULL;
893
894     RRDCALC *rc = callocz(1, sizeof(RRDCALC));
895     rc->next_event_id = 1;
896     rc->id = rrdcalc_get_unique_id(host, chart, rt->name, &rc->next_event_id);
897     rc->name = strdupz(rt->name);
898     rc->hash = simple_hash(rc->name);
899     rc->chart = strdupz(chart);
900     rc->hash_chart = simple_hash(rc->chart);
901
902     if(rt->dimensions) rc->dimensions = strdupz(rt->dimensions);
903
904     rc->green = rt->green;
905     rc->red = rt->red;
906     rc->value = NAN;
907     rc->old_value = NAN;
908
909     rc->delay_up_duration = rt->delay_up_duration;
910     rc->delay_down_duration = rt->delay_down_duration;
911     rc->delay_max_duration = rt->delay_max_duration;
912     rc->delay_multiplier = rt->delay_multiplier;
913
914     rc->group = rt->group;
915     rc->after = rt->after;
916     rc->before = rt->before;
917     rc->update_every = rt->update_every;
918     rc->options = rt->options;
919
920     if(rt->exec) rc->exec = strdupz(rt->exec);
921     if(rt->recipient) rc->recipient = strdupz(rt->recipient);
922     if(rt->source) rc->source = strdupz(rt->source);
923     if(rt->units) rc->units = strdupz(rt->units);
924     if(rt->info) rc->info = strdupz(rt->info);
925
926     if(rt->calculation) {
927         rc->calculation = expression_parse(rt->calculation->source, NULL, NULL);
928         if(!rc->calculation)
929             error("Health alarm '%s.%s': failed to parse calculation expression '%s'", chart, rt->name, rt->calculation->source);
930     }
931     if(rt->warning) {
932         rc->warning = expression_parse(rt->warning->source, NULL, NULL);
933         if(!rc->warning)
934             error("Health alarm '%s.%s': failed to re-parse warning expression '%s'", chart, rt->name, rt->warning->source);
935     }
936     if(rt->critical) {
937         rc->critical = expression_parse(rt->critical->source, NULL, NULL);
938         if(!rc->critical)
939             error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", chart, rt->name, rt->critical->source);
940     }
941
942     debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
943           (rc->chart)?rc->chart:"NOCHART",
944           rc->name,
945           (rc->exec)?rc->exec:"DEFAULT",
946           (rc->recipient)?rc->recipient:"DEFAULT",
947           rc->green,
948           rc->red,
949           rc->group,
950           rc->after,
951           rc->before,
952           rc->options,
953           (rc->dimensions)?rc->dimensions:"NONE",
954           rc->update_every,
955           (rc->calculation)?rc->calculation->parsed_as:"NONE",
956           (rc->warning)?rc->warning->parsed_as:"NONE",
957           (rc->critical)?rc->critical->parsed_as:"NONE",
958           rc->source,
959           rc->delay_up_duration,
960           rc->delay_down_duration,
961           rc->delay_max_duration,
962           rc->delay_multiplier
963     );
964
965     rrdcalc_create_part2(host, rc);
966     return rc;
967 }
968
969 void rrdcalc_free(RRDHOST *host, RRDCALC *rc) {
970     if(!rc) return;
971
972     debug(D_HEALTH, "Health removing alarm '%s.%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
973
974     // unlink it from RRDSET
975     if(rc->rrdset) rrdsetcalc_unlink(rc);
976
977     // unlink it from RRDHOST
978     if(unlikely(rc == host->alarms))
979         host->alarms = rc->next;
980
981     else if(likely(host->alarms)) {
982         RRDCALC *t, *last = host->alarms;
983         for(t = last->next; t && t != rc; last = t, t = t->next) ;
984         if(last->next == rc)
985             last->next = rc->next;
986         else
987             error("Cannot unlink alarm '%s.%s' from host '%s': not found", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
988     }
989     else
990         error("Cannot unlink unlink '%s.%s' from host '%s': This host does not have any calculations", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
991
992     expression_free(rc->calculation);
993     expression_free(rc->warning);
994     expression_free(rc->critical);
995
996     freez(rc->name);
997     freez(rc->chart);
998     freez(rc->family);
999     freez(rc->dimensions);
1000     freez(rc->exec);
1001     freez(rc->recipient);
1002     freez(rc->source);
1003     freez(rc->units);
1004     freez(rc->info);
1005     freez(rc);
1006 }
1007
1008 // ----------------------------------------------------------------------------
1009 // RRDCALCTEMPLATE management
1010
1011 void rrdcalctemplate_link_matching(RRDSET *st) {
1012     RRDCALCTEMPLATE *rt;
1013
1014     for(rt = st->rrdhost->templates; rt ; rt = rt->next) {
1015         if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context)
1016                 && (!rt->family_pattern || simple_pattern_matches(rt->family_pattern, st->family))) {
1017             RRDCALC *rc = rrdcalc_create(st->rrdhost, rt, st->id);
1018             if(unlikely(!rc))
1019                 error("Health tried to create alarm from template '%s', but it failed", rt->name);
1020
1021 #ifdef NETDATA_INTERNAL_CHECKS
1022             else if(rc->rrdset != st)
1023                 error("Health alarm '%s.%s' should be linked to chart '%s', but it is not", rc->chart?rc->chart:"NOCHART", rc->name, st->id);
1024 #endif
1025         }
1026     }
1027 }
1028
1029 inline void rrdcalctemplate_free(RRDHOST *host, RRDCALCTEMPLATE *rt) {
1030     debug(D_HEALTH, "Health removing template '%s' of host '%s'", rt->name, host->hostname);
1031
1032     if(host->templates) {
1033         if(host->templates == rt) {
1034             host->templates = rt->next;
1035         }
1036         else {
1037             RRDCALCTEMPLATE *t, *last = host->templates;
1038             for (t = last->next; t && t != rt; last = t, t = t->next ) ;
1039             if(last && last->next == rt) {
1040                 last->next = rt->next;
1041                 rt->next = NULL;
1042             }
1043             else
1044                 error("Cannot find RRDCALCTEMPLATE '%s' linked in host '%s'", rt->name, host->hostname);
1045         }
1046     }
1047
1048     expression_free(rt->calculation);
1049     expression_free(rt->warning);
1050     expression_free(rt->critical);
1051
1052     freez(rt->family_match);
1053     simple_pattern_free(rt->family_pattern);
1054
1055     freez(rt->name);
1056     freez(rt->exec);
1057     freez(rt->recipient);
1058     freez(rt->context);
1059     freez(rt->source);
1060     freez(rt->units);
1061     freez(rt->info);
1062     freez(rt->dimensions);
1063     freez(rt);
1064 }
1065
1066 // ----------------------------------------------------------------------------
1067 // health initialization
1068
1069 inline char *health_config_dir(void) {
1070     char buffer[FILENAME_MAX + 1];
1071     snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_config_dir);
1072     return config_get("health", "health configuration directory", buffer);
1073 }
1074
1075 void health_init(void) {
1076     debug(D_HEALTH, "Health configuration initializing");
1077
1078     if(!(health_enabled = config_get_boolean("health", "enabled", 1))) {
1079         debug(D_HEALTH, "Health is disabled.");
1080         return;
1081     }
1082
1083     char pathname[FILENAME_MAX + 1];
1084     snprintfz(pathname, FILENAME_MAX, "%s/health", netdata_configured_varlib_dir);
1085     if(mkdir(pathname, 0770) == -1 && errno != EEXIST)
1086         fatal("Cannot create directory '%s'.", pathname);
1087 }
1088
1089 // ----------------------------------------------------------------------------
1090 // re-load health configuration
1091
1092 inline void health_free_host_nolock(RRDHOST *host) {
1093     while(host->templates)
1094         rrdcalctemplate_free(host, host->templates);
1095
1096     while(host->alarms)
1097         rrdcalc_free(host, host->alarms);
1098 }
1099
1100 void health_reload_host(RRDHOST *host) {
1101     if(!health_enabled) {
1102         error("Health reload is requested, but health is not enabled.");
1103         return;
1104     }
1105
1106     char *path = health_config_dir();
1107
1108     // free all running alarms
1109     rrdhost_rwlock(host);
1110     health_free_host_nolock(host);
1111     rrdhost_unlock(host);
1112
1113     // invalidate all previous entries in the alarm log
1114     ALARM_ENTRY *t;
1115     for(t = host->health_log.alarms ; t ; t = t->next) {
1116         if(t->new_status != RRDCALC_STATUS_REMOVED)
1117             t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
1118     }
1119
1120     // reset all thresholds to all charts
1121     RRDSET *st;
1122     for(st = host->rrdset_root; st ; st = st->next) {
1123         st->green = NAN;
1124         st->red = NAN;
1125     }
1126
1127     // load the new alarms
1128     rrdhost_rwlock(host);
1129     health_readdir(host, path);
1130     rrdhost_unlock(host);
1131
1132     // link the loaded alarms to their charts
1133     for(st = host->rrdset_root; st ; st = st->next) {
1134         rrdhost_rwlock(host);
1135
1136         rrdsetcalc_link_matching(st);
1137         rrdcalctemplate_link_matching(st);
1138
1139         rrdhost_unlock(host);
1140     }
1141 }
1142
1143 void health_reload(void) {
1144     RRDHOST *host;
1145
1146     for(host = localhost; host ; host = host->next)
1147         health_reload_host(host);
1148 }
1149
1150 // ----------------------------------------------------------------------------
1151 // health main thread and friends
1152
1153 static inline int rrdcalc_value2status(calculated_number n) {
1154     if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED;
1155     if(n) return RRDCALC_STATUS_RAISED;
1156     return RRDCALC_STATUS_CLEAR;
1157 }
1158
1159 #define ALARM_EXEC_COMMAND_LENGTH 8192
1160
1161 static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
1162     ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
1163
1164     if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
1165         // do not send notifications for internal statuses
1166         debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
1167         goto done;
1168     }
1169
1170     if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
1171         // do not send notifications for disabled statuses
1172         debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
1173         // mark it as run, so that we will send the same alarm if it happens again
1174         goto done;
1175     }
1176
1177     // find the previous notification for the same alarm
1178     // which we have run the exec script
1179     // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set
1180     if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
1181         uint32_t id = ae->alarm_id;
1182         ALARM_ENTRY *t;
1183         for(t = ae->next; t ; t = t->next) {
1184             if(t->alarm_id == id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
1185                 break;
1186         }
1187
1188         if(likely(t)) {
1189             // we have executed this alarm notification in the past
1190             if(t && t->new_status == ae->new_status) {
1191                 // don't send the notification for the same status again
1192                 debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name
1193                       , rrdcalc_status2string(ae->new_status));
1194                 goto done;
1195             }
1196         }
1197         else {
1198             // we have not executed this alarm notification in the past
1199             // so, don't send CLEAR notifications
1200             if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
1201                 debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
1202                       , ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
1203                 goto done;
1204             }
1205         }
1206     }
1207
1208     static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1];
1209     pid_t command_pid;
1210
1211     const char *exec      = (ae->exec)      ? ae->exec      : host->health_default_exec;
1212     const char *recipient = (ae->recipient) ? ae->recipient : host->health_default_recipient;
1213
1214     snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s' '%s' '%s'",
1215               exec,
1216               recipient,
1217               host->hostname,
1218               ae->unique_id,
1219               ae->alarm_id,
1220               ae->alarm_event_id,
1221               (unsigned long)ae->when,
1222               ae->name,
1223               ae->chart?ae->chart:"NOCAHRT",
1224               ae->family?ae->family:"NOFAMILY",
1225               rrdcalc_status2string(ae->new_status),
1226               rrdcalc_status2string(ae->old_status),
1227               ae->new_value,
1228               ae->old_value,
1229               ae->source?ae->source:"UNKNOWN",
1230               (uint32_t)ae->duration,
1231               (uint32_t)ae->non_clear_duration,
1232               ae->units?ae->units:"",
1233               ae->info?ae->info:"",
1234               ae->new_value_string,
1235               ae->old_value_string
1236     );
1237
1238     ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
1239     ae->exec_run_timestamp = now_realtime_sec();
1240
1241     debug(D_HEALTH, "executing command '%s'", command_to_run);
1242     FILE *fp = mypopen(command_to_run, &command_pid);
1243     if(!fp) {
1244         error("HEALTH: Cannot popen(\"%s\", \"r\").", command_to_run);
1245         goto done;
1246     }
1247     debug(D_HEALTH, "HEALTH reading from command");
1248     char *s = fgets(command_to_run, FILENAME_MAX, fp);
1249     (void)s;
1250     ae->exec_code = mypclose(fp, command_pid);
1251     debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
1252
1253     if(ae->exec_code != 0)
1254         ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
1255
1256 done:
1257     health_alarm_log_save(host, ae);
1258     return;
1259 }
1260
1261 static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
1262     debug(D_HEALTH, "Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
1263          ae->chart?ae->chart:"NOCHART", ae->name,
1264          ae->new_value,
1265          rrdcalc_status2string(ae->old_status),
1266          rrdcalc_status2string(ae->new_status)
1267     );
1268
1269     health_alarm_execute(host, ae);
1270 }
1271
1272 static inline void health_alarm_log_process(RRDHOST *host) {
1273     static uint32_t stop_at_id = 0;
1274     uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
1275     time_t now = now_realtime_sec();
1276
1277     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
1278
1279     ALARM_ENTRY *ae;
1280     for(ae = host->health_log.alarms; ae && ae->unique_id >= stop_at_id ; ae = ae->next) {
1281         if(unlikely(
1282             !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
1283             !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
1284             )) {
1285
1286             if(unlikely(ae->unique_id < first_waiting))
1287                 first_waiting = ae->unique_id;
1288
1289             if(likely(now >= ae->delay_up_to_timestamp))
1290                 health_process_notifications(host, ae);
1291         }
1292     }
1293
1294     // remember this for the next iteration
1295     stop_at_id = first_waiting;
1296
1297     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
1298
1299     if(host->health_log.count <= host->health_log.max)
1300         return;
1301
1302     // cleanup excess entries in the log
1303     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
1304
1305     ALARM_ENTRY *last = NULL;
1306     unsigned int count = host->health_log.max * 2 / 3;
1307     for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
1308
1309     if(ae && last && last->next == ae)
1310         last->next = NULL;
1311     else
1312         ae = NULL;
1313
1314     while(ae) {
1315         debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
1316
1317         ALARM_ENTRY *t = ae->next;
1318
1319         freez(ae->name);
1320         freez(ae->chart);
1321         freez(ae->family);
1322         freez(ae->exec);
1323         freez(ae->recipient);
1324         freez(ae->source);
1325         freez(ae->units);
1326         freez(ae->info);
1327         freez(ae->old_value_string);
1328         freez(ae->new_value_string);
1329         freez(ae);
1330
1331         ae = t;
1332         host->health_log.count--;
1333     }
1334
1335     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
1336 }
1337
1338 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
1339     if(unlikely(!rc->rrdset)) {
1340         debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
1341         return 0;
1342     }
1343
1344     if(unlikely(rc->next_update > now)) {
1345         if (unlikely(*next_run > rc->next_update)) {
1346             // update the next_run time of the main loop
1347             // to run this alarm precisely the time required
1348             *next_run = rc->next_update;
1349         }
1350
1351         debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
1352         return 0;
1353     }
1354
1355     if(unlikely(!rc->update_every)) {
1356         debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
1357         return 0;
1358     }
1359
1360     if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
1361         debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
1362         return 0;
1363     }
1364
1365     int update_every = rc->rrdset->update_every;
1366     time_t first = rrdset_first_entry_t(rc->rrdset);
1367     time_t last = rrdset_last_entry_t(rc->rrdset);
1368
1369     if(unlikely(now + update_every < first /* || now - update_every > last */)) {
1370         debug(D_HEALTH
1371               , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
1372               , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first
1373               , (unsigned long) last);
1374         return 0;
1375     }
1376
1377     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
1378         time_t needed = now + rc->before + rc->after;
1379
1380         if(needed + update_every < first || needed - update_every > last) {
1381             debug(D_HEALTH
1382                   , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
1383                   , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first
1384                   , (unsigned long) last);
1385             return 0;
1386         }
1387     }
1388
1389     return 1;
1390 }
1391
1392 void *health_main(void *ptr) {
1393     struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
1394
1395     info("HEALTH thread created with task id %d", gettid());
1396
1397     if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
1398         error("Cannot set pthread cancel type to DEFERRED.");
1399
1400     if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
1401         error("Cannot set pthread cancel state to ENABLE.");
1402
1403     int min_run_every = (int)config_get_number("health", "run at least every seconds", 10);
1404     if(min_run_every < 1) min_run_every = 1;
1405
1406     BUFFER *wb = buffer_create(100);
1407
1408     unsigned int loop = 0;
1409     while(health_enabled && !netdata_exit) {
1410         loop++;
1411         debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
1412
1413         int oldstate, runnable = 0;
1414         time_t now = now_realtime_sec();
1415         time_t next_run = now + min_run_every;
1416         RRDCALC *rc;
1417
1418         if(unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
1419             error("Cannot set pthread cancel state to DISABLE.");
1420
1421         RRDHOST *host;
1422         for(host = localhost; host ; host = host->next) {
1423             rrdhost_rdlock(host);
1424
1425             // the first loop is to lookup values from the db
1426             for(rc = host->alarms; rc; rc = rc->next) {
1427                 if(unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
1428                     if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))
1429                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE;
1430                     continue;
1431                 }
1432
1433                 runnable++;
1434                 rc->old_value = rc->value;
1435                 rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE;
1436
1437                 // 1. if there is database lookup, do it
1438                 // 2. if there is calculation expression, run it
1439
1440                 if(unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
1441                     /* time_t old_db_timestamp = rc->db_before; */
1442                     int value_is_null = 0;
1443
1444                     int ret = rrd2value(rc->rrdset, wb, &rc->value, rc->dimensions, 1, rc->after, rc->before, rc->group, rc->options, &rc->db_after, &rc->db_before, &value_is_null);
1445
1446                     if(unlikely(ret != 200)) {
1447                         // database lookup failed
1448                         rc->value = NAN;
1449
1450                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret);
1451
1452                         if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) {
1453                             rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
1454                             error("Health on host '%s', alarm '%s.%s': database lookup returned error %d", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret);
1455                         }
1456                     }
1457                     else if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))
1458                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
1459
1460                     /* - RRDCALC_FLAG_DB_STALE not currently used
1461                     if (unlikely(old_db_timestamp == rc->db_before)) {
1462                         // database is stale
1463
1464                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
1465
1466                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
1467                             rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
1468                             error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
1469                         }
1470                     }
1471                     else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
1472                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
1473                     */
1474
1475                     if(unlikely(value_is_null)) {
1476                         // collected value is null
1477
1478                         rc->value = NAN;
1479
1480                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name);
1481
1482                         if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) {
1483                             rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
1484                             error("Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", host->hostname,  rc->chart ? rc->chart : "NOCHART", rc->name);
1485                         }
1486                     }
1487                     else if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))
1488                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
1489
1490                     debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " CALCULATED_NUMBER_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->value);
1491                 }
1492
1493                 if(unlikely(rc->calculation)) {
1494                     if(unlikely(!expression_evaluate(rc->calculation))) {
1495                         // calculation failed
1496
1497                         rc->value = NAN;
1498
1499                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
1500
1501                         if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) {
1502                             rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
1503                             error("Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", rc->chart ? rc->chart : "NOCHART", host->hostname,  rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
1504                         }
1505                     }
1506                     else {
1507                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))
1508                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
1509
1510                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
1511                                 CALCULATED_NUMBER_FORMAT
1512                                 ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name
1513                               , rc->calculation->parsed_as, rc->calculation->result,
1514                                 buffer_tostring(rc->calculation->error_msg), rc->source
1515                         );
1516
1517                         rc->value = rc->calculation->result;
1518                     }
1519                 }
1520             }
1521             rrdhost_unlock(host);
1522
1523             if(unlikely(runnable && !netdata_exit)) {
1524                 rrdhost_rdlock(host);
1525
1526                 for(rc = host->alarms; rc; rc = rc->next) {
1527                     if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
1528                         continue;
1529
1530                     int warning_status = RRDCALC_STATUS_UNDEFINED;
1531                     int critical_status = RRDCALC_STATUS_UNDEFINED;
1532
1533                     if(likely(rc->warning)) {
1534                         if(unlikely(!expression_evaluate(rc->warning))) {
1535                             // calculation failed
1536
1537                             debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
1538
1539                             if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) {
1540                                 rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
1541                                 error("Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
1542                             }
1543                         }
1544                         else {
1545                             if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))
1546                                 rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
1547
1548                             debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source);
1549
1550                             warning_status = rrdcalc_value2status(rc->warning->result);
1551                         }
1552                     }
1553
1554                     if(likely(rc->critical)) {
1555                         if(unlikely(!expression_evaluate(rc->critical))) {
1556                             // calculation failed
1557
1558                             debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
1559
1560                             if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) {
1561                                 rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
1562                                 error("Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
1563                             }
1564                         }
1565                         else {
1566                             if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))
1567                                 rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
1568
1569                             debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name , rc->critical->result, buffer_tostring(rc->critical->error_msg), rc->source);
1570
1571                             critical_status = rrdcalc_value2status(rc->critical->result);
1572                         }
1573                     }
1574
1575                     int status = RRDCALC_STATUS_UNDEFINED;
1576
1577                     switch(warning_status) {
1578                         case RRDCALC_STATUS_CLEAR:
1579                             status = RRDCALC_STATUS_CLEAR;
1580                             break;
1581
1582                         case RRDCALC_STATUS_RAISED:
1583                             status = RRDCALC_STATUS_WARNING;
1584                             break;
1585
1586                         default:
1587                             break;
1588                     }
1589
1590                     switch(critical_status) {
1591                         case RRDCALC_STATUS_CLEAR:
1592                             if(status == RRDCALC_STATUS_UNDEFINED)
1593                                 status = RRDCALC_STATUS_CLEAR;
1594                             break;
1595
1596                         case RRDCALC_STATUS_RAISED:
1597                             status = RRDCALC_STATUS_CRITICAL;
1598                             break;
1599
1600                         default:
1601                             break;
1602                     }
1603
1604                     if(status != rc->status) {
1605                         int delay = 0;
1606
1607                         if(now > rc->delay_up_to_timestamp) {
1608                             rc->delay_up_current = rc->delay_up_duration;
1609                             rc->delay_down_current = rc->delay_down_duration;
1610                             rc->delay_last = 0;
1611                             rc->delay_up_to_timestamp = 0;
1612                         }
1613                         else {
1614                             rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier);
1615                             if(rc->delay_up_current > rc->delay_max_duration)
1616                                 rc->delay_up_current = rc->delay_max_duration;
1617
1618                             rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier);
1619                             if(rc->delay_down_current > rc->delay_max_duration)
1620                                 rc->delay_down_current = rc->delay_max_duration;
1621                         }
1622
1623                         if(status > rc->status)
1624                             delay = rc->delay_up_current;
1625                         else
1626                             delay = rc->delay_down_current;
1627
1628                         // COMMENTED: because we do need to send raising alarms
1629                         // if(now + delay < rc->delay_up_to_timestamp)
1630                         //    delay = (int)(rc->delay_up_to_timestamp - now);
1631
1632                         rc->delay_last = delay;
1633                         rc->delay_up_to_timestamp = now + delay;
1634                         health_alarm_log(
1635                                 host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id
1636                                 , rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change
1637                                 , rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info
1638                                 , rc->delay_last, (rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)
1639                                                   ? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0
1640                         );
1641                         rc->last_status_change = now;
1642                         rc->status = status;
1643                     }
1644
1645                     rc->last_updated = now;
1646                     rc->next_update = now + rc->update_every;
1647
1648                     if(next_run > rc->next_update)
1649                         next_run = rc->next_update;
1650                 }
1651
1652                 rrdhost_unlock(host);
1653             }
1654
1655             if(unlikely(netdata_exit))
1656                 break;
1657
1658             // execute notifications
1659             // and cleanup
1660             health_alarm_log_process(host);
1661
1662             if(unlikely(netdata_exit))
1663                 break;
1664
1665         } /* host loop */
1666
1667         if(unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
1668             error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
1669
1670         if(unlikely(netdata_exit))
1671             break;
1672
1673         now = now_realtime_sec();
1674         if(now < next_run) {
1675             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
1676             sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now));
1677         }
1678         else
1679             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
1680     }
1681
1682     buffer_free(wb);
1683
1684     info("HEALTH thread exiting");
1685
1686     static_thread->enabled = 0;
1687     pthread_exit(NULL);
1688     return NULL;
1689 }