]> arthur.barton.de Git - netdata.git/blob - src/health.c
bd62fbcf8b7fbb8d3ead594422b15df7a61909d4
[netdata.git] / src / health.c
1 #define NETDATA_HEALTH_INTERNALS
2 #include "common.h"
3
4 #define RRDVAR_MAX_LENGTH 1024
5
6 int default_localhost_health_enabled = 1;
7
8 // ----------------------------------------------------------------------------
9 // RRDVAR management
10
// Sanitize a variable name in place.
//
// Every character that is not alphanumeric, '.' or '_' is overwritten
// with '_'.
//
// variable: NUL-terminated, writable string; it is modified in place.
// returns:  the number of characters that were replaced.
inline int rrdvar_fix_name(char *variable) {
    int fixed = 0;

    for( ; *variable ; variable++) {
        // the <ctype.h> classifiers accept only values representable as
        // unsigned char (or EOF) - a negative plain char is undefined behavior
        unsigned char c = (unsigned char)*variable;

        if(!isalnum(c) && c != '.' && c != '_') {
            *variable = '_';
            fixed++;
        }
    }

    return fixed;
}
24
25 int rrdvar_compare(void* a, void* b) {
26     if(((RRDVAR *)a)->hash < ((RRDVAR *)b)->hash) return -1;
27     else if(((RRDVAR *)a)->hash > ((RRDVAR *)b)->hash) return 1;
28     else return strcmp(((RRDVAR *)a)->name, ((RRDVAR *)b)->name);
29 }
30
31 static inline RRDVAR *rrdvar_index_add(avl_tree_lock *tree, RRDVAR *rv) {
32     RRDVAR *ret = (RRDVAR *)avl_insert_lock(tree, (avl *)(rv));
33     if(ret != rv)
34         debug(D_VARIABLES, "Request to insert RRDVAR '%s' into index failed. Already exists.", rv->name);
35
36     return ret;
37 }
38
39 static inline RRDVAR *rrdvar_index_del(avl_tree_lock *tree, RRDVAR *rv) {
40     RRDVAR *ret = (RRDVAR *)avl_remove_lock(tree, (avl *)(rv));
41     if(!ret)
42         error("Request to remove RRDVAR '%s' from index failed. Not Found.", rv->name);
43
44     return ret;
45 }
46
47 static inline RRDVAR *rrdvar_index_find(avl_tree_lock *tree, const char *name, uint32_t hash) {
48     RRDVAR tmp;
49     tmp.name = (char *)name;
50     tmp.hash = (hash)?hash:simple_hash(tmp.name);
51
52     return (RRDVAR *)avl_search_lock(tree, (avl *)&tmp);
53 }
54
55 static inline void rrdvar_free(RRDHOST *host, avl_tree_lock *tree, RRDVAR *rv) {
56     (void)host;
57
58     if(!rv) return;
59
60     if(tree) {
61         debug(D_VARIABLES, "Deleting variable '%s'", rv->name);
62         if(unlikely(!rrdvar_index_del(tree, rv)))
63             error("Attempted to delete variable '%s' from host '%s', but it is not found.", rv->name, host->hostname);
64     }
65
66     freez(rv->name);
67     freez(rv);
68 }
69
70 static inline RRDVAR *rrdvar_create_and_index(const char *scope, avl_tree_lock *tree, const char *name, int type, void *value) {
71     char *variable = strdupz(name);
72     rrdvar_fix_name(variable);
73     uint32_t hash = simple_hash(variable);
74
75     RRDVAR *rv = rrdvar_index_find(tree, variable, hash);
76     if(unlikely(!rv)) {
77         debug(D_VARIABLES, "Variable '%s' not found in scope '%s'. Creating a new one.", variable, scope);
78
79         rv = callocz(1, sizeof(RRDVAR));
80         rv->name = variable;
81         rv->hash = hash;
82         rv->type = type;
83         rv->value = value;
84
85         RRDVAR *ret = rrdvar_index_add(tree, rv);
86         if(unlikely(ret != rv)) {
87             debug(D_VARIABLES, "Variable '%s' in scope '%s' already exists", variable, scope);
88             rrdvar_free(NULL, NULL, rv);
89             rv = NULL;
90         }
91         else
92             debug(D_VARIABLES, "Variable '%s' created in scope '%s'", variable, scope);
93     }
94     else {
95         debug(D_VARIABLES, "Variable '%s' is already found in scope '%s'.", variable, scope);
96
97         // already exists
98         freez(variable);
99
100         // this is important
101         // it must return NULL - not the existing variable - or double-free will happen
102         rv = NULL;
103     }
104
105     return rv;
106 }
107
108 // ----------------------------------------------------------------------------
109 // CUSTOM VARIABLES
110
111 RRDVAR *rrdvar_custom_host_variable_create(RRDHOST *host, const char *name) {
112     calculated_number *v = callocz(1, sizeof(calculated_number));
113     *v = NAN;
114     RRDVAR *rv = rrdvar_create_and_index("host", &host->variables_root_index, name, RRDVAR_TYPE_CALCULATED_ALLOCATED, v);
115     if(unlikely(!rv)) {
116         free(v);
117         error("Requested variable '%s' already exists - possibly 2 plugins will be updating it at the same time", name);
118
119         char *variable = strdupz(name);
120         rrdvar_fix_name(variable);
121         uint32_t hash = simple_hash(variable);
122
123         rv = rrdvar_index_find(&host->variables_root_index, variable, hash);
124     }
125
126     return rv;
127 }
128
129 void rrdvar_custom_host_variable_destroy(RRDHOST *host, const char *name) {
130     char *variable = strdupz(name);
131     rrdvar_fix_name(variable);
132     uint32_t hash = simple_hash(variable);
133
134     RRDVAR *rv = rrdvar_index_find(&host->variables_root_index, variable, hash);
135     freez(variable);
136
137     if(!rv) {
138         error("Attempted to remove variable '%s' from host '%s', but it does not exist.", name, host->hostname);
139         return;
140     }
141
142     if(rv->type != RRDVAR_TYPE_CALCULATED_ALLOCATED) {
143         error("Attempted to remove variable '%s' from host '%s', but it does not a custom allocated variable.", name, host->hostname);
144         return;
145     }
146
147     if(!rrdvar_index_del(&host->variables_root_index, rv)) {
148         error("Attempted to remove variable '%s' from host '%s', but it cannot be found.", name, host->hostname);
149         return;
150     }
151
152     freez(rv->name);
153     freez(rv->value);
154     freez(rv);
155 }
156
157 void rrdvar_custom_host_variable_set(RRDVAR *rv, calculated_number value) {
158     if(rv->type != RRDVAR_TYPE_CALCULATED_ALLOCATED)
159         error("requested to set variable '%s' to value " CALCULATED_NUMBER_FORMAT " but the variable is not a custom one.", rv->name, value);
160     else {
161         calculated_number *v = rv->value;
162         *v = value;
163     }
164 }
165
166 // ----------------------------------------------------------------------------
167 // RRDVAR lookup
168
169 static calculated_number rrdvar2number(RRDVAR *rv) {
170     switch(rv->type) {
171         case RRDVAR_TYPE_CALCULATED_ALLOCATED:
172         case RRDVAR_TYPE_CALCULATED: {
173             calculated_number *n = (calculated_number *)rv->value;
174             return *n;
175         }
176
177         case RRDVAR_TYPE_TIME_T: {
178             time_t *n = (time_t *)rv->value;
179             return *n;
180         }
181
182         case RRDVAR_TYPE_COLLECTED: {
183             collected_number *n = (collected_number *)rv->value;
184             return *n;
185         }
186
187         case RRDVAR_TYPE_TOTAL: {
188             total_number *n = (total_number *)rv->value;
189             return *n;
190         }
191
192         case RRDVAR_TYPE_INT: {
193             int *n = (int *)rv->value;
194             return *n;
195         }
196
197         default:
198             error("I don't know how to convert RRDVAR type %d to calculated_number", rv->type);
199             return NAN;
200     }
201 }
202
203 int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, calculated_number *result) {
204     RRDSET *st = rc->rrdset;
205     RRDVAR *rv;
206
207     if(!st) return 0;
208
209     rv = rrdvar_index_find(&st->variables_root_index, variable, hash);
210     if(rv) {
211         *result = rrdvar2number(rv);
212         return 1;
213     }
214
215     rv = rrdvar_index_find(&st->rrdfamily->variables_root_index, variable, hash);
216     if(rv) {
217         *result = rrdvar2number(rv);
218         return 1;
219     }
220
221     rv = rrdvar_index_find(&st->rrdhost->variables_root_index, variable, hash);
222     if(rv) {
223         *result = rrdvar2number(rv);
224         return 1;
225     }
226
227     return 0;
228 }
229
230 // ----------------------------------------------------------------------------
231 // RRDVAR to JSON
232
233 struct variable2json_helper {
234     BUFFER *buf;
235     size_t counter;
236 };
237
238 static int single_variable2json(void *entry, void *data) {
239     struct variable2json_helper *helper = (struct variable2json_helper *)data;
240     RRDVAR *rv = (RRDVAR *)entry;
241     calculated_number value = rrdvar2number(rv);
242
243     if(unlikely(isnan(value) || isinf(value)))
244         buffer_sprintf(helper->buf, "%s\n\t\t\"%s\": null", helper->counter?",":"", rv->name);
245     else
246         buffer_sprintf(helper->buf, "%s\n\t\t\"%s\": %0.5Lf", helper->counter?",":"", rv->name, (long double)value);
247
248     helper->counter++;
249
250     return 0;
251 }
252
253 void health_api_v1_chart_variables2json(RRDSET *st, BUFFER *buf) {
254     struct variable2json_helper helper = {
255             .buf = buf,
256             .counter = 0
257     };
258
259     buffer_sprintf(buf, "{\n\t\"chart\": \"%s\",\n\t\"chart_name\": \"%s\",\n\t\"chart_context\": \"%s\",\n\t\"chart_variables\": {", st->id, st->name, st->context);
260     avl_traverse_lock(&st->variables_root_index, single_variable2json, (void *)&helper);
261     buffer_sprintf(buf, "\n\t},\n\t\"family\": \"%s\",\n\t\"family_variables\": {", st->family);
262     helper.counter = 0;
263     avl_traverse_lock(&st->rrdfamily->variables_root_index, single_variable2json, (void *)&helper);
264     buffer_sprintf(buf, "\n\t},\n\t\"host\": \"%s\",\n\t\"host_variables\": {", st->rrdhost->hostname);
265     helper.counter = 0;
266     avl_traverse_lock(&st->rrdhost->variables_root_index, single_variable2json, (void *)&helper);
267     buffer_strcat(buf, "\n\t}\n}\n");
268 }
269
270
271 // ----------------------------------------------------------------------------
272 // RRDDIMVAR management
273 // DIMENSION VARIABLES
274
275 #define RRDDIMVAR_ID_MAX 1024
276
277 static inline void rrddimvar_free_variables(RRDDIMVAR *rs) {
278     RRDDIM *rd = rs->rrddim;
279     RRDSET *st = rd->rrdset;
280
281     // CHART VARIABLES FOR THIS DIMENSION
282
283     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->var_local_id);
284     rs->var_local_id = NULL;
285
286     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->var_local_name);
287     rs->var_local_name = NULL;
288
289     // FAMILY VARIABLES FOR THIS DIMENSION
290
291     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_id);
292     rs->var_family_id = NULL;
293
294     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_name);
295     rs->var_family_name = NULL;
296
297     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_contextid);
298     rs->var_family_contextid = NULL;
299
300     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_contextname);
301     rs->var_family_contextname = NULL;
302
303     // HOST VARIABLES FOR THIS DIMENSION
304
305     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartidid);
306     rs->var_host_chartidid = NULL;
307
308     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartidname);
309     rs->var_host_chartidname = NULL;
310
311     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartnameid);
312     rs->var_host_chartnameid = NULL;
313
314     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_chartnamename);
315     rs->var_host_chartnamename = NULL;
316
317     // KEYS
318
319     freez(rs->key_id);
320     rs->key_id = NULL;
321
322     freez(rs->key_name);
323     rs->key_name = NULL;
324
325     freez(rs->key_fullidid);
326     rs->key_fullidid = NULL;
327
328     freez(rs->key_fullidname);
329     rs->key_fullidname = NULL;
330
331     freez(rs->key_contextid);
332     rs->key_contextid = NULL;
333
334     freez(rs->key_contextname);
335     rs->key_contextname = NULL;
336
337     freez(rs->key_fullnameid);
338     rs->key_fullnameid = NULL;
339
340     freez(rs->key_fullnamename);
341     rs->key_fullnamename = NULL;
342 }
343
344 static inline void rrddimvar_create_variables(RRDDIMVAR *rs) {
345     rrddimvar_free_variables(rs);
346
347     RRDDIM *rd = rs->rrddim;
348     RRDSET *st = rd->rrdset;
349
350     char buffer[RRDDIMVAR_ID_MAX + 1];
351
352     // KEYS
353
354     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->id, rs->suffix);
355     rs->key_id = strdupz(buffer);
356
357     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
358     rs->key_name = strdupz(buffer);
359
360     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->key_id);
361     rs->key_fullidid = strdupz(buffer);
362
363     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->key_name);
364     rs->key_fullidname = strdupz(buffer);
365
366     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->context, rs->key_id);
367     rs->key_contextid = strdupz(buffer);
368
369     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->context, rs->key_name);
370     rs->key_contextname = strdupz(buffer);
371
372     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->key_id);
373     rs->key_fullnameid = strdupz(buffer);
374
375     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->key_name);
376     rs->key_fullnamename = strdupz(buffer);
377
378     // CHART VARIABLES FOR THIS DIMENSION
379     // -----------------------------------
380     //
381     // dimensions are available as:
382     // - $id
383     // - $name
384
385     rs->var_local_id           = rrdvar_create_and_index("local", &st->variables_root_index, rs->key_id, rs->type, rs->value);
386     rs->var_local_name         = rrdvar_create_and_index("local", &st->variables_root_index, rs->key_name, rs->type, rs->value);
387
388     // FAMILY VARIABLES FOR THIS DIMENSION
389     // -----------------------------------
390     //
391     // dimensions are available as:
392     // - $id                 (only the first, when multiple overlap)
393     // - $name               (only the first, when multiple overlap)
394     // - $chart-context.id
395     // - $chart-context.name
396
397     rs->var_family_id          = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_id, rs->type, rs->value);
398     rs->var_family_name        = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_name, rs->type, rs->value);
399     rs->var_family_contextid   = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_contextid, rs->type, rs->value);
400     rs->var_family_contextname = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->key_contextname, rs->type, rs->value);
401
402     // HOST VARIABLES FOR THIS DIMENSION
403     // -----------------------------------
404     //
405     // dimensions are available as:
406     // - $chart-id.id
407     // - $chart-id.name
408     // - $chart-name.id
409     // - $chart-name.name
410
411     rs->var_host_chartidid      = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullidid, rs->type, rs->value);
412     rs->var_host_chartidname    = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullidname, rs->type, rs->value);
413     rs->var_host_chartnameid    = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullnameid, rs->type, rs->value);
414     rs->var_host_chartnamename  = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->key_fullnamename, rs->type, rs->value);
415 }
416
417 RRDDIMVAR *rrddimvar_create(RRDDIM *rd, int type, const char *prefix, const char *suffix, void *value, uint32_t options) {
418     RRDSET *st = rd->rrdset;
419
420     debug(D_VARIABLES, "RRDDIMSET create for chart id '%s' name '%s', dimension id '%s', name '%s%s%s'", st->id, st->name, rd->id, (prefix)?prefix:"", rd->name, (suffix)?suffix:"");
421
422     if(!prefix) prefix = "";
423     if(!suffix) suffix = "";
424
425     RRDDIMVAR *rs = (RRDDIMVAR *)callocz(1, sizeof(RRDDIMVAR));
426
427     rs->prefix = strdupz(prefix);
428     rs->suffix = strdupz(suffix);
429
430     rs->type = type;
431     rs->value = value;
432     rs->options = options;
433     rs->rrddim = rd;
434
435     rs->next = rd->variables;
436     rd->variables = rs;
437
438     rrddimvar_create_variables(rs);
439
440     return rs;
441 }
442
443 void rrddimvar_rename_all(RRDDIM *rd) {
444     RRDSET *st = rd->rrdset;
445     debug(D_VARIABLES, "RRDDIMSET rename for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
446
447     RRDDIMVAR *rs, *next = rd->variables;
448     while((rs = next)) {
449         next = rs->next;
450         rrddimvar_create_variables(rs);
451     }
452 }
453
454 void rrddimvar_free(RRDDIMVAR *rs) {
455     RRDDIM *rd = rs->rrddim;
456     RRDSET *st = rd->rrdset;
457     debug(D_VARIABLES, "RRDDIMSET free for chart id '%s' name '%s', dimension id '%s', name '%s', prefix='%s', suffix='%s'", st->id, st->name, rd->id, rd->name, rs->prefix, rs->suffix);
458
459     rrddimvar_free_variables(rs);
460
461     if(rd->variables == rs) {
462         debug(D_VARIABLES, "RRDDIMSET removing first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
463         rd->variables = rs->next;
464     }
465     else {
466         debug(D_VARIABLES, "RRDDIMSET removing non-first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
467         RRDDIMVAR *t;
468         for (t = rd->variables; t && t->next != rs; t = t->next) ;
469         if(!t) error("RRDDIMVAR '%s' not found in dimension '%s/%s' variables linked list", rs->key_name, st->id, rd->id);
470         else t->next = rs->next;
471     }
472
473     freez(rs->prefix);
474     freez(rs->suffix);
475     freez(rs);
476 }
477
478 // ----------------------------------------------------------------------------
479 // RRDSETVAR management
480 // CHART VARIABLES
481
482 static inline void rrdsetvar_free_variables(RRDSETVAR *rs) {
483     RRDSET *st = rs->rrdset;
484
485     // CHART
486
487     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->var_local);
488     rs->var_local = NULL;
489
490     // FAMILY
491
492     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family);
493     rs->var_family = NULL;
494
495     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host);
496     rs->var_host = NULL;
497
498     // HOST
499
500     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->var_family_name);
501     rs->var_family_name = NULL;
502
503     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->var_host_name);
504     rs->var_host_name = NULL;
505
506     // KEYS
507
508     freez(rs->key_fullid);
509     rs->key_fullid = NULL;
510
511     freez(rs->key_fullname);
512     rs->key_fullname = NULL;
513 }
514
515 static inline void rrdsetvar_create_variables(RRDSETVAR *rs) {
516     rrdsetvar_free_variables(rs);
517
518     RRDSET *st = rs->rrdset;
519
520     // KEYS
521
522     char buffer[RRDVAR_MAX_LENGTH + 1];
523     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->id, rs->variable);
524     rs->key_fullid = strdupz(buffer);
525
526     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rs->variable);
527     rs->key_fullname = strdupz(buffer);
528
529     // CHART
530
531     rs->var_local       = rrdvar_create_and_index("local",  &st->variables_root_index,               rs->variable, rs->type, rs->value);
532
533     // FAMILY
534
535     rs->var_family      = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index,    rs->key_fullid,   rs->type, rs->value);
536     rs->var_family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index,    rs->key_fullname, rs->type, rs->value);
537
538     // HOST
539
540     rs->var_host        = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index,      rs->key_fullid,   rs->type, rs->value);
541     rs->var_host_name   = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index,      rs->key_fullname, rs->type, rs->value);
542
543 }
544
545 RRDSETVAR *rrdsetvar_create(RRDSET *st, const char *variable, int type, void *value, uint32_t options) {
546     debug(D_VARIABLES, "RRDVARSET create for chart id '%s' name '%s' with variable name '%s'", st->id, st->name, variable);
547     RRDSETVAR *rs = (RRDSETVAR *)callocz(1, sizeof(RRDSETVAR));
548
549     rs->variable = strdupz(variable);
550     rs->type = type;
551     rs->value = value;
552     rs->options = options;
553     rs->rrdset = st;
554
555     rs->next = st->variables;
556     st->variables = rs;
557
558     rrdsetvar_create_variables(rs);
559
560     return rs;
561 }
562
563 void rrdsetvar_rename_all(RRDSET *st) {
564     debug(D_VARIABLES, "RRDSETVAR rename for chart id '%s' name '%s'", st->id, st->name);
565
566     RRDSETVAR *rs, *next = st->variables;
567     while((rs = next)) {
568         next = rs->next;
569         rrdsetvar_create_variables(rs);
570     }
571
572     rrdsetcalc_link_matching(st);
573 }
574
575 void rrdsetvar_free(RRDSETVAR *rs) {
576     RRDSET *st = rs->rrdset;
577     debug(D_VARIABLES, "RRDSETVAR free for chart id '%s' name '%s', variable '%s'", st->id, st->name, rs->variable);
578
579     if(st->variables == rs) {
580         st->variables = rs->next;
581     }
582     else {
583         RRDSETVAR *t;
584         for (t = st->variables; t && t->next != rs; t = t->next);
585         if(!t) error("RRDSETVAR '%s' not found in chart '%s' variables linked list", rs->key_fullname, st->id);
586         else t->next = rs->next;
587     }
588
589     rrdsetvar_free_variables(rs);
590
591     freez(rs->variable);
592     freez(rs);
593 }
594
595 // ----------------------------------------------------------------------------
596 // RRDCALC management
597
598 inline const char *rrdcalc_status2string(int status) {
599     switch(status) {
600         case RRDCALC_STATUS_REMOVED:
601             return "REMOVED";
602
603         case RRDCALC_STATUS_UNDEFINED:
604             return "UNDEFINED";
605
606         case RRDCALC_STATUS_UNINITIALIZED:
607             return "UNINITIALIZED";
608
609         case RRDCALC_STATUS_CLEAR:
610             return "CLEAR";
611
612         case RRDCALC_STATUS_RAISED:
613             return "RAISED";
614
615         case RRDCALC_STATUS_WARNING:
616             return "WARNING";
617
618         case RRDCALC_STATUS_CRITICAL:
619             return "CRITICAL";
620
621         default:
622             error("Unknown alarm status %d", status);
623             return "UNKNOWN";
624     }
625 }
626
627 static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
628     debug(D_HEALTH, "Health linking alarm '%s.%s' to chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, st->rrdhost->hostname);
629
630     rc->last_status_change = now_realtime_sec();
631     rc->rrdset = st;
632
633     rc->rrdset_next = st->alarms;
634     rc->rrdset_prev = NULL;
635     
636     if(rc->rrdset_next)
637         rc->rrdset_next->rrdset_prev = rc;
638
639     st->alarms = rc;
640
641     if(rc->update_every < rc->rrdset->update_every) {
642         error("Health alarm '%s.%s' has update every %d, less than chart update every %d. Setting alarm update frequency to %d.", rc->rrdset->id, rc->name, rc->update_every, rc->rrdset->update_every, rc->rrdset->update_every);
643         rc->update_every = rc->rrdset->update_every;
644     }
645
646     if(!isnan(rc->green) && isnan(st->green)) {
647         debug(D_HEALTH, "Health alarm '%s.%s' green threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->green, rc->green);
648         st->green = rc->green;
649     }
650
651     if(!isnan(rc->red) && isnan(st->red)) {
652         debug(D_HEALTH, "Health alarm '%s.%s' red threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->red, rc->red);
653         st->red = rc->red;
654     }
655
656     rc->local  = rrdvar_create_and_index("local",  &st->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
657     rc->family = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
658
659     char fullname[RRDVAR_MAX_LENGTH + 1];
660     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->id, rc->name);
661     rc->hostid   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
662
663     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rc->name);
664     rc->hostname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
665
666         if(!rc->units) rc->units = strdupz(st->units);
667
668     {
669         time_t now = now_realtime_sec();
670         health_alarm_log(
671                 st->rrdhost,
672                 rc->id,
673                 rc->next_event_id++,
674                 now,
675                 rc->name,
676                 rc->rrdset->id,
677                 rc->rrdset->family,
678                 rc->exec,
679                 rc->recipient,
680                 now - rc->last_status_change,
681                 rc->old_value,
682                 rc->value,
683                 rc->status,
684                 RRDCALC_STATUS_UNINITIALIZED,
685                 rc->source,
686                 rc->units,
687                 rc->info,
688                 0,
689                 0
690         );
691     }
692 }
693
694 static inline int rrdcalc_is_matching_this_rrdset(RRDCALC *rc, RRDSET *st) {
695     if(     (rc->hash_chart == st->hash      && !strcmp(rc->chart, st->id)) ||
696             (rc->hash_chart == st->hash_name && !strcmp(rc->chart, st->name)))
697         return 1;
698
699     return 0;
700 }
701
702 // this has to be called while the RRDHOST is locked
703 inline void rrdsetcalc_link_matching(RRDSET *st) {
704     // debug(D_HEALTH, "find matching alarms for chart '%s'", st->id);
705
706     RRDCALC *rc;
707     for(rc = st->rrdhost->alarms; rc ; rc = rc->next) {
708         if(unlikely(rc->rrdset))
709             continue;
710
711         if(unlikely(rrdcalc_is_matching_this_rrdset(rc, st)))
712             rrdsetcalc_link(st, rc);
713     }
714 }
715
716 // this has to be called while the RRDHOST is locked
717 inline void rrdsetcalc_unlink(RRDCALC *rc) {
718     RRDSET *st = rc->rrdset;
719
720     if(!st) {
721         debug(D_HEALTH, "Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
722         error("Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
723         return;
724     }
725
726     {
727         time_t now = now_realtime_sec();
728         health_alarm_log(
729                 st->rrdhost,
730                 rc->id,
731                 rc->next_event_id++,
732                 now,
733                 rc->name,
734                 rc->rrdset->id,
735                 rc->rrdset->family,
736                 rc->exec,
737                 rc->recipient,
738                 now - rc->last_status_change,
739                 rc->old_value,
740                 rc->value,
741                 rc->status,
742                 RRDCALC_STATUS_REMOVED,
743                 rc->source,
744                 rc->units,
745                 rc->info,
746                 0,
747                 0
748         );
749     }
750
751     RRDHOST *host = st->rrdhost;
752
753     debug(D_HEALTH, "Health unlinking alarm '%s.%s' from chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, host->hostname);
754
755     // unlink it
756     if(rc->rrdset_prev)
757         rc->rrdset_prev->rrdset_next = rc->rrdset_next;
758
759     if(rc->rrdset_next)
760         rc->rrdset_next->rrdset_prev = rc->rrdset_prev;
761
762     if(st->alarms == rc)
763         st->alarms = rc->rrdset_next;
764
765     rc->rrdset_prev = rc->rrdset_next = NULL;
766
767     rrdvar_free(st->rrdhost, &st->variables_root_index, rc->local);
768     rc->local = NULL;
769
770     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rc->family);
771     rc->family = NULL;
772
773     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostid);
774     rc->hostid = NULL;
775
776     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostname);
777     rc->hostname = NULL;
778
779     rc->rrdset = NULL;
780
781     // RRDCALC will remain in RRDHOST
782     // so that if the matching chart is found in the future
783     // it will be applied automatically
784 }
785
786 RRDCALC *rrdcalc_find(RRDSET *st, const char *name) {
787     RRDCALC *rc;
788     uint32_t hash = simple_hash(name);
789
790     for( rc = st->alarms; rc ; rc = rc->rrdset_next ) {
791         if(unlikely(rc->hash == hash && !strcmp(rc->name, name)))
792             return rc;
793     }
794
795     return NULL;
796 }
797
798 inline int rrdcalc_exists(RRDHOST *host, const char *chart, const char *name, uint32_t hash_chart, uint32_t hash_name) {
799     RRDCALC *rc;
800
801     if(unlikely(!chart)) {
802         error("attempt to find RRDCALC '%s' without giving a chart name", name);
803         return 1;
804     }
805
806     if(unlikely(!hash_chart)) hash_chart = simple_hash(chart);
807     if(unlikely(!hash_name))  hash_name  = simple_hash(name);
808
809     // make sure it does not already exist
810     for(rc = host->alarms; rc ; rc = rc->next) {
811         if (unlikely(rc->chart && rc->hash == hash_name && rc->hash_chart == hash_chart && !strcmp(name, rc->name) && !strcmp(chart, rc->chart))) {
812             debug(D_HEALTH, "Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
813             error("Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
814             return 1;
815         }
816     }
817
818     return 0;
819 }
820
821 inline uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const char *name, uint32_t *next_event_id) {
822     if(chart && name) {
823         uint32_t hash_chart = simple_hash(chart);
824         uint32_t hash_name = simple_hash(name);
825
826         // re-use old IDs, by looking them up in the alarm log
827         ALARM_ENTRY *ae;
828         for(ae = host->health_log.alarms; ae ;ae = ae->next) {
829             if(unlikely(ae->hash_name == hash_name && ae->hash_chart == hash_chart && !strcmp(name, ae->name) && !strcmp(chart, ae->chart))) {
830                 if(next_event_id) *next_event_id = ae->alarm_event_id + 1;
831                 return ae->alarm_id;
832             }
833         }
834     }
835
836     return host->health_log.next_alarm_id++;
837 }
838
839 inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) {
840     rrdhost_check_rdlock(host);
841
842     if(rc->calculation) {
843         rc->calculation->status = &rc->status;
844         rc->calculation->this = &rc->value;
845         rc->calculation->after = &rc->db_after;
846         rc->calculation->before = &rc->db_before;
847         rc->calculation->rrdcalc = rc;
848     }
849
850     if(rc->warning) {
851         rc->warning->status = &rc->status;
852         rc->warning->this = &rc->value;
853         rc->warning->after = &rc->db_after;
854         rc->warning->before = &rc->db_before;
855         rc->warning->rrdcalc = rc;
856     }
857
858     if(rc->critical) {
859         rc->critical->status = &rc->status;
860         rc->critical->this = &rc->value;
861         rc->critical->after = &rc->db_after;
862         rc->critical->before = &rc->db_before;
863         rc->critical->rrdcalc = rc;
864     }
865
866     // link it to the host
867     if(likely(host->alarms)) {
868         // append it
869         RRDCALC *t;
870         for(t = host->alarms; t && t->next ; t = t->next) ;
871         t->next = rc;
872     }
873     else {
874         host->alarms = rc;
875     }
876
877     // link it to its chart
878     RRDSET *st;
879     for(st = host->rrdset_root; st ; st = st->next) {
880         if(rrdcalc_is_matching_this_rrdset(rc, st)) {
881             rrdsetcalc_link(st, rc);
882             break;
883         }
884     }
885 }
886
887 static inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart) {
888
889     debug(D_HEALTH, "Health creating dynamic alarm (from template) '%s.%s'", chart, rt->name);
890
891     if(rrdcalc_exists(host, chart, rt->name, 0, 0))
892         return NULL;
893
894     RRDCALC *rc = callocz(1, sizeof(RRDCALC));
895     rc->next_event_id = 1;
896     rc->id = rrdcalc_get_unique_id(host, chart, rt->name, &rc->next_event_id);
897     rc->name = strdupz(rt->name);
898     rc->hash = simple_hash(rc->name);
899     rc->chart = strdupz(chart);
900     rc->hash_chart = simple_hash(rc->chart);
901
902     if(rt->dimensions) rc->dimensions = strdupz(rt->dimensions);
903
904     rc->green = rt->green;
905     rc->red = rt->red;
906     rc->value = NAN;
907     rc->old_value = NAN;
908
909     rc->delay_up_duration = rt->delay_up_duration;
910     rc->delay_down_duration = rt->delay_down_duration;
911     rc->delay_max_duration = rt->delay_max_duration;
912     rc->delay_multiplier = rt->delay_multiplier;
913
914     rc->group = rt->group;
915     rc->after = rt->after;
916     rc->before = rt->before;
917     rc->update_every = rt->update_every;
918     rc->options = rt->options;
919
920     if(rt->exec) rc->exec = strdupz(rt->exec);
921     if(rt->recipient) rc->recipient = strdupz(rt->recipient);
922     if(rt->source) rc->source = strdupz(rt->source);
923     if(rt->units) rc->units = strdupz(rt->units);
924     if(rt->info) rc->info = strdupz(rt->info);
925
926     if(rt->calculation) {
927         rc->calculation = expression_parse(rt->calculation->source, NULL, NULL);
928         if(!rc->calculation)
929             error("Health alarm '%s.%s': failed to parse calculation expression '%s'", chart, rt->name, rt->calculation->source);
930     }
931     if(rt->warning) {
932         rc->warning = expression_parse(rt->warning->source, NULL, NULL);
933         if(!rc->warning)
934             error("Health alarm '%s.%s': failed to re-parse warning expression '%s'", chart, rt->name, rt->warning->source);
935     }
936     if(rt->critical) {
937         rc->critical = expression_parse(rt->critical->source, NULL, NULL);
938         if(!rc->critical)
939             error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", chart, rt->name, rt->critical->source);
940     }
941
942     debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
943           (rc->chart)?rc->chart:"NOCHART",
944           rc->name,
945           (rc->exec)?rc->exec:"DEFAULT",
946           (rc->recipient)?rc->recipient:"DEFAULT",
947           rc->green,
948           rc->red,
949           rc->group,
950           rc->after,
951           rc->before,
952           rc->options,
953           (rc->dimensions)?rc->dimensions:"NONE",
954           rc->update_every,
955           (rc->calculation)?rc->calculation->parsed_as:"NONE",
956           (rc->warning)?rc->warning->parsed_as:"NONE",
957           (rc->critical)?rc->critical->parsed_as:"NONE",
958           rc->source,
959           rc->delay_up_duration,
960           rc->delay_down_duration,
961           rc->delay_max_duration,
962           rc->delay_multiplier
963     );
964
965     rrdcalc_create_part2(host, rc);
966     return rc;
967 }
968
969 void rrdcalc_free(RRDHOST *host, RRDCALC *rc) {
970     if(!rc) return;
971
972     debug(D_HEALTH, "Health removing alarm '%s.%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
973
974     // unlink it from RRDSET
975     if(rc->rrdset) rrdsetcalc_unlink(rc);
976
977     // unlink it from RRDHOST
978     if(unlikely(rc == host->alarms))
979         host->alarms = rc->next;
980
981     else if(likely(host->alarms)) {
982         RRDCALC *t, *last = host->alarms;
983         for(t = last->next; t && t != rc; last = t, t = t->next) ;
984         if(last->next == rc)
985             last->next = rc->next;
986         else
987             error("Cannot unlink alarm '%s.%s' from host '%s': not found", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
988     }
989     else
990         error("Cannot unlink unlink '%s.%s' from host '%s': This host does not have any calculations", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
991
992     expression_free(rc->calculation);
993     expression_free(rc->warning);
994     expression_free(rc->critical);
995
996     freez(rc->name);
997     freez(rc->chart);
998     freez(rc->family);
999     freez(rc->dimensions);
1000     freez(rc->exec);
1001     freez(rc->recipient);
1002     freez(rc->source);
1003     freez(rc->units);
1004     freez(rc->info);
1005     freez(rc);
1006 }
1007
1008 // ----------------------------------------------------------------------------
1009 // RRDCALCTEMPLATE management
1010
1011 void rrdcalctemplate_link_matching(RRDSET *st) {
1012     RRDCALCTEMPLATE *rt;
1013
1014     for(rt = st->rrdhost->templates; rt ; rt = rt->next) {
1015         if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context)
1016                 && (!rt->family_pattern || simple_pattern_matches(rt->family_pattern, st->family))) {
1017             RRDCALC *rc = rrdcalc_create(st->rrdhost, rt, st->id);
1018             if(unlikely(!rc))
1019                 error("Health tried to create alarm from template '%s', but it failed", rt->name);
1020
1021 #ifdef NETDATA_INTERNAL_CHECKS
1022             else if(rc->rrdset != st)
1023                 error("Health alarm '%s.%s' should be linked to chart '%s', but it is not", rc->chart?rc->chart:"NOCHART", rc->name, st->id);
1024 #endif
1025         }
1026     }
1027 }
1028
1029 inline void rrdcalctemplate_free(RRDHOST *host, RRDCALCTEMPLATE *rt) {
1030     debug(D_HEALTH, "Health removing template '%s' of host '%s'", rt->name, host->hostname);
1031
1032     if(host->templates) {
1033         if(host->templates == rt) {
1034             host->templates = rt->next;
1035         }
1036         else {
1037             RRDCALCTEMPLATE *t, *last = host->templates;
1038             for (t = last->next; t && t != rt; last = t, t = t->next ) ;
1039             if(last && last->next == rt) {
1040                 last->next = rt->next;
1041                 rt->next = NULL;
1042             }
1043             else
1044                 error("Cannot find RRDCALCTEMPLATE '%s' linked in host '%s'", rt->name, host->hostname);
1045         }
1046     }
1047
1048     expression_free(rt->calculation);
1049     expression_free(rt->warning);
1050     expression_free(rt->critical);
1051
1052     freez(rt->family_match);
1053     simple_pattern_free(rt->family_pattern);
1054
1055     freez(rt->name);
1056     freez(rt->exec);
1057     freez(rt->recipient);
1058     freez(rt->context);
1059     freez(rt->source);
1060     freez(rt->units);
1061     freez(rt->info);
1062     freez(rt->dimensions);
1063     freez(rt);
1064 }
1065
1066 // ----------------------------------------------------------------------------
1067 // health initialization
1068
1069 inline char *health_config_dir(void) {
1070     char buffer[FILENAME_MAX + 1];
1071     snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_config_dir);
1072     return config_get("health", "health configuration directory", buffer);
1073 }
1074
1075 void health_init(void) {
1076     debug(D_HEALTH, "Health configuration initializing");
1077
1078     if(!(default_localhost_health_enabled = config_get_boolean("health", "enabled", 1))) {
1079         debug(D_HEALTH, "Health is disabled.");
1080         return;
1081     }
1082
1083     char pathname[FILENAME_MAX + 1];
1084     snprintfz(pathname, FILENAME_MAX, "%s/health", netdata_configured_varlib_dir);
1085     if(mkdir(pathname, 0770) == -1 && errno != EEXIST)
1086         fatal("Cannot create directory '%s'.", pathname);
1087 }
1088
1089 // ----------------------------------------------------------------------------
1090 // re-load health configuration
1091
1092 inline void health_free_host_nolock(RRDHOST *host) {
1093     while(host->templates)
1094         rrdcalctemplate_free(host, host->templates);
1095
1096     while(host->alarms)
1097         rrdcalc_free(host, host->alarms);
1098 }
1099
1100 void health_reload_host(RRDHOST *host) {
1101     char *path = health_config_dir();
1102
1103     // free all running alarms
1104     rrdhost_wrlock(host);
1105     health_free_host_nolock(host);
1106     rrdhost_unlock(host);
1107
1108     // invalidate all previous entries in the alarm log
1109     ALARM_ENTRY *t;
1110     for(t = host->health_log.alarms ; t ; t = t->next) {
1111         if(t->new_status != RRDCALC_STATUS_REMOVED)
1112             t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
1113     }
1114
1115     // reset all thresholds to all charts
1116     RRDSET *st;
1117     for(st = host->rrdset_root; st ; st = st->next) {
1118         st->green = NAN;
1119         st->red = NAN;
1120     }
1121
1122     // load the new alarms
1123     rrdhost_wrlock(host);
1124     health_readdir(host, path);
1125     rrdhost_unlock(host);
1126
1127     // link the loaded alarms to their charts
1128     for(st = host->rrdset_root; st ; st = st->next) {
1129         rrdhost_wrlock(host);
1130
1131         rrdsetcalc_link_matching(st);
1132         rrdcalctemplate_link_matching(st);
1133
1134         rrdhost_unlock(host);
1135     }
1136 }
1137
1138 void health_reload(void) {
1139     RRDHOST *host;
1140
1141     for(host = localhost; host ; host = host->next)
1142         health_reload_host(host);
1143 }
1144
1145 // ----------------------------------------------------------------------------
1146 // health main thread and friends
1147
1148 static inline int rrdcalc_value2status(calculated_number n) {
1149     if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED;
1150     if(n) return RRDCALC_STATUS_RAISED;
1151     return RRDCALC_STATUS_CLEAR;
1152 }
1153
1154 #define ALARM_EXEC_COMMAND_LENGTH 8192
1155
1156 static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
1157     ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
1158
1159     if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
1160         // do not send notifications for internal statuses
1161         debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
1162         goto done;
1163     }
1164
1165     if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
1166         // do not send notifications for disabled statuses
1167         debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
1168         // mark it as run, so that we will send the same alarm if it happens again
1169         goto done;
1170     }
1171
1172     // find the previous notification for the same alarm
1173     // which we have run the exec script
1174     // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set
1175     if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
1176         uint32_t id = ae->alarm_id;
1177         ALARM_ENTRY *t;
1178         for(t = ae->next; t ; t = t->next) {
1179             if(t->alarm_id == id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
1180                 break;
1181         }
1182
1183         if(likely(t)) {
1184             // we have executed this alarm notification in the past
1185             if(t && t->new_status == ae->new_status) {
1186                 // don't send the notification for the same status again
1187                 debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name
1188                       , rrdcalc_status2string(ae->new_status));
1189                 goto done;
1190             }
1191         }
1192         else {
1193             // we have not executed this alarm notification in the past
1194             // so, don't send CLEAR notifications
1195             if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
1196                 debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
1197                       , ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
1198                 goto done;
1199             }
1200         }
1201     }
1202
1203     static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1];
1204     pid_t command_pid;
1205
1206     const char *exec      = (ae->exec)      ? ae->exec      : host->health_default_exec;
1207     const char *recipient = (ae->recipient) ? ae->recipient : host->health_default_recipient;
1208
1209     snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s' '%s' '%s'",
1210               exec,
1211               recipient,
1212               host->hostname,
1213               ae->unique_id,
1214               ae->alarm_id,
1215               ae->alarm_event_id,
1216               (unsigned long)ae->when,
1217               ae->name,
1218               ae->chart?ae->chart:"NOCAHRT",
1219               ae->family?ae->family:"NOFAMILY",
1220               rrdcalc_status2string(ae->new_status),
1221               rrdcalc_status2string(ae->old_status),
1222               ae->new_value,
1223               ae->old_value,
1224               ae->source?ae->source:"UNKNOWN",
1225               (uint32_t)ae->duration,
1226               (uint32_t)ae->non_clear_duration,
1227               ae->units?ae->units:"",
1228               ae->info?ae->info:"",
1229               ae->new_value_string,
1230               ae->old_value_string
1231     );
1232
1233     ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
1234     ae->exec_run_timestamp = now_realtime_sec();
1235
1236     debug(D_HEALTH, "executing command '%s'", command_to_run);
1237     FILE *fp = mypopen(command_to_run, &command_pid);
1238     if(!fp) {
1239         error("HEALTH: Cannot popen(\"%s\", \"r\").", command_to_run);
1240         goto done;
1241     }
1242     debug(D_HEALTH, "HEALTH reading from command");
1243     char *s = fgets(command_to_run, FILENAME_MAX, fp);
1244     (void)s;
1245     ae->exec_code = mypclose(fp, command_pid);
1246     debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
1247
1248     if(ae->exec_code != 0)
1249         ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
1250
1251 done:
1252     health_alarm_log_save(host, ae);
1253     return;
1254 }
1255
1256 static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
1257     debug(D_HEALTH, "Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
1258          ae->chart?ae->chart:"NOCHART", ae->name,
1259          ae->new_value,
1260          rrdcalc_status2string(ae->old_status),
1261          rrdcalc_status2string(ae->new_status)
1262     );
1263
1264     health_alarm_execute(host, ae);
1265 }
1266
1267 static inline void health_alarm_log_process(RRDHOST *host) {
1268     static uint32_t stop_at_id = 0;
1269     uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
1270     time_t now = now_realtime_sec();
1271
1272     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
1273
1274     ALARM_ENTRY *ae;
1275     for(ae = host->health_log.alarms; ae && ae->unique_id >= stop_at_id ; ae = ae->next) {
1276         if(unlikely(
1277             !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
1278             !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
1279             )) {
1280
1281             if(unlikely(ae->unique_id < first_waiting))
1282                 first_waiting = ae->unique_id;
1283
1284             if(likely(now >= ae->delay_up_to_timestamp))
1285                 health_process_notifications(host, ae);
1286         }
1287     }
1288
1289     // remember this for the next iteration
1290     stop_at_id = first_waiting;
1291
1292     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
1293
1294     if(host->health_log.count <= host->health_log.max)
1295         return;
1296
1297     // cleanup excess entries in the log
1298     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
1299
1300     ALARM_ENTRY *last = NULL;
1301     unsigned int count = host->health_log.max * 2 / 3;
1302     for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
1303
1304     if(ae && last && last->next == ae)
1305         last->next = NULL;
1306     else
1307         ae = NULL;
1308
1309     while(ae) {
1310         debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
1311
1312         ALARM_ENTRY *t = ae->next;
1313
1314         freez(ae->name);
1315         freez(ae->chart);
1316         freez(ae->family);
1317         freez(ae->exec);
1318         freez(ae->recipient);
1319         freez(ae->source);
1320         freez(ae->units);
1321         freez(ae->info);
1322         freez(ae->old_value_string);
1323         freez(ae->new_value_string);
1324         freez(ae);
1325
1326         ae = t;
1327         host->health_log.count--;
1328     }
1329
1330     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
1331 }
1332
1333 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
1334     if(unlikely(!rc->rrdset)) {
1335         debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
1336         return 0;
1337     }
1338
1339     if(unlikely(rc->next_update > now)) {
1340         if (unlikely(*next_run > rc->next_update)) {
1341             // update the next_run time of the main loop
1342             // to run this alarm precisely the time required
1343             *next_run = rc->next_update;
1344         }
1345
1346         debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
1347         return 0;
1348     }
1349
1350     if(unlikely(!rc->update_every)) {
1351         debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
1352         return 0;
1353     }
1354
1355     if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
1356         debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
1357         return 0;
1358     }
1359
1360     int update_every = rc->rrdset->update_every;
1361     time_t first = rrdset_first_entry_t(rc->rrdset);
1362     time_t last = rrdset_last_entry_t(rc->rrdset);
1363
1364     if(unlikely(now + update_every < first /* || now - update_every > last */)) {
1365         debug(D_HEALTH
1366               , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
1367               , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first
1368               , (unsigned long) last);
1369         return 0;
1370     }
1371
1372     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
1373         time_t needed = now + rc->before + rc->after;
1374
1375         if(needed + update_every < first || needed - update_every > last) {
1376             debug(D_HEALTH
1377                   , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
1378                   , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first
1379                   , (unsigned long) last);
1380             return 0;
1381         }
1382     }
1383
1384     return 1;
1385 }
1386
1387 void *health_main(void *ptr) {
1388     struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
1389
1390     info("HEALTH thread created with task id %d", gettid());
1391
1392     if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
1393         error("Cannot set pthread cancel type to DEFERRED.");
1394
1395     if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
1396         error("Cannot set pthread cancel state to ENABLE.");
1397
1398     int min_run_every = (int)config_get_number("health", "run at least every seconds", 10);
1399     if(min_run_every < 1) min_run_every = 1;
1400
1401     BUFFER *wb = buffer_create(100);
1402
1403     unsigned int loop = 0;
1404     while(!netdata_exit) {
1405         loop++;
1406         debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
1407
1408         int oldstate, runnable = 0;
1409         time_t now = now_realtime_sec();
1410         time_t next_run = now + min_run_every;
1411         RRDCALC *rc;
1412
1413         if(unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
1414             error("Cannot set pthread cancel state to DISABLE.");
1415
1416         RRDHOST *host;
1417         for(host = localhost; host ; host = host->next) {
1418             if(unlikely(!host->health_enabled)) continue;
1419
1420             rrdhost_rdlock(host);
1421
1422             // the first loop is to lookup values from the db
1423             for(rc = host->alarms; rc; rc = rc->next) {
1424                 if(unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
1425                     if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))
1426                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE;
1427                     continue;
1428                 }
1429
1430                 runnable++;
1431                 rc->old_value = rc->value;
1432                 rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE;
1433
1434                 // 1. if there is database lookup, do it
1435                 // 2. if there is calculation expression, run it
1436
1437                 if(unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
1438                     /* time_t old_db_timestamp = rc->db_before; */
1439                     int value_is_null = 0;
1440
1441                     int ret = rrd2value(rc->rrdset, wb, &rc->value, rc->dimensions, 1, rc->after, rc->before, rc->group, rc->options, &rc->db_after, &rc->db_before, &value_is_null);
1442
1443                     if(unlikely(ret != 200)) {
1444                         // database lookup failed
1445                         rc->value = NAN;
1446
1447                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret);
1448
1449                         if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) {
1450                             rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
1451                             error("Health on host '%s', alarm '%s.%s': database lookup returned error %d", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret);
1452                         }
1453                     }
1454                     else if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))
1455                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
1456
1457                     /* - RRDCALC_FLAG_DB_STALE not currently used
1458                     if (unlikely(old_db_timestamp == rc->db_before)) {
1459                         // database is stale
1460
1461                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
1462
1463                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
1464                             rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
1465                             error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
1466                         }
1467                     }
1468                     else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
1469                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
1470                     */
1471
1472                     if(unlikely(value_is_null)) {
1473                         // collected value is null
1474
1475                         rc->value = NAN;
1476
1477                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name);
1478
1479                         if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) {
1480                             rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
1481                             error("Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", host->hostname,  rc->chart ? rc->chart : "NOCHART", rc->name);
1482                         }
1483                     }
1484                     else if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))
1485                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
1486
1487                     debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " CALCULATED_NUMBER_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->value);
1488                 }
1489
1490                 if(unlikely(rc->calculation)) {
1491                     if(unlikely(!expression_evaluate(rc->calculation))) {
1492                         // calculation failed
1493
1494                         rc->value = NAN;
1495
1496                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
1497
1498                         if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) {
1499                             rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
1500                             error("Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", rc->chart ? rc->chart : "NOCHART", host->hostname,  rc->name, rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg));
1501                         }
1502                     }
1503                     else {
1504                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))
1505                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
1506
1507                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
1508                                 CALCULATED_NUMBER_FORMAT
1509                                 ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name
1510                               , rc->calculation->parsed_as, rc->calculation->result,
1511                                 buffer_tostring(rc->calculation->error_msg), rc->source
1512                         );
1513
1514                         rc->value = rc->calculation->result;
1515                     }
1516                 }
1517             }
1518             rrdhost_unlock(host);
1519
1520             if(unlikely(runnable && !netdata_exit)) {
1521                 rrdhost_rdlock(host);
1522
1523                 for(rc = host->alarms; rc; rc = rc->next) {
1524                     if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
1525                         continue;
1526
1527                     int warning_status = RRDCALC_STATUS_UNDEFINED;
1528                     int critical_status = RRDCALC_STATUS_UNDEFINED;
1529
1530                     if(likely(rc->warning)) {
1531                         if(unlikely(!expression_evaluate(rc->warning))) {
1532                             // calculation failed
1533
1534                             debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
1535
1536                             if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) {
1537                                 rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
1538                                 error("Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
1539                             }
1540                         }
1541                         else {
1542                             if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))
1543                                 rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
1544
1545                             debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source);
1546
1547                             warning_status = rrdcalc_value2status(rc->warning->result);
1548                         }
1549                     }
1550
1551                     if(likely(rc->critical)) {
1552                         if(unlikely(!expression_evaluate(rc->critical))) {
1553                             // calculation failed
1554
1555                             debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
1556
1557                             if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) {
1558                                 rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
1559                                 error("Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
1560                             }
1561                         }
1562                         else {
1563                             if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))
1564                                 rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
1565
1566                             debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name , rc->critical->result, buffer_tostring(rc->critical->error_msg), rc->source);
1567
1568                             critical_status = rrdcalc_value2status(rc->critical->result);
1569                         }
1570                     }
1571
1572                     int status = RRDCALC_STATUS_UNDEFINED;
1573
1574                     switch(warning_status) {
1575                         case RRDCALC_STATUS_CLEAR:
1576                             status = RRDCALC_STATUS_CLEAR;
1577                             break;
1578
1579                         case RRDCALC_STATUS_RAISED:
1580                             status = RRDCALC_STATUS_WARNING;
1581                             break;
1582
1583                         default:
1584                             break;
1585                     }
1586
1587                     switch(critical_status) {
1588                         case RRDCALC_STATUS_CLEAR:
1589                             if(status == RRDCALC_STATUS_UNDEFINED)
1590                                 status = RRDCALC_STATUS_CLEAR;
1591                             break;
1592
1593                         case RRDCALC_STATUS_RAISED:
1594                             status = RRDCALC_STATUS_CRITICAL;
1595                             break;
1596
1597                         default:
1598                             break;
1599                     }
1600
1601                     if(status != rc->status) {
1602                         int delay = 0;
1603
1604                         if(now > rc->delay_up_to_timestamp) {
1605                             rc->delay_up_current = rc->delay_up_duration;
1606                             rc->delay_down_current = rc->delay_down_duration;
1607                             rc->delay_last = 0;
1608                             rc->delay_up_to_timestamp = 0;
1609                         }
1610                         else {
1611                             rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier);
1612                             if(rc->delay_up_current > rc->delay_max_duration)
1613                                 rc->delay_up_current = rc->delay_max_duration;
1614
1615                             rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier);
1616                             if(rc->delay_down_current > rc->delay_max_duration)
1617                                 rc->delay_down_current = rc->delay_max_duration;
1618                         }
1619
1620                         if(status > rc->status)
1621                             delay = rc->delay_up_current;
1622                         else
1623                             delay = rc->delay_down_current;
1624
1625                         // COMMENTED: because we do need to send raising alarms
1626                         // if(now + delay < rc->delay_up_to_timestamp)
1627                         //    delay = (int)(rc->delay_up_to_timestamp - now);
1628
1629                         rc->delay_last = delay;
1630                         rc->delay_up_to_timestamp = now + delay;
1631                         health_alarm_log(
1632                                 host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id
1633                                 , rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change
1634                                 , rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info
1635                                 , rc->delay_last, (rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)
1636                                                   ? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0
1637                         );
1638                         rc->last_status_change = now;
1639                         rc->status = status;
1640                     }
1641
1642                     rc->last_updated = now;
1643                     rc->next_update = now + rc->update_every;
1644
1645                     if(next_run > rc->next_update)
1646                         next_run = rc->next_update;
1647                 }
1648
1649                 rrdhost_unlock(host);
1650             }
1651
1652             if(unlikely(netdata_exit))
1653                 break;
1654
1655             // execute notifications
1656             // and cleanup
1657             health_alarm_log_process(host);
1658
1659             if(unlikely(netdata_exit))
1660                 break;
1661
1662         } /* host loop */
1663
1664         if(unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
1665             error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
1666
1667         if(unlikely(netdata_exit))
1668             break;
1669
1670         now = now_realtime_sec();
1671         if(now < next_run) {
1672             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
1673             sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now));
1674         }
1675         else
1676             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
1677     }
1678
1679     buffer_free(wb);
1680
1681     info("HEALTH thread exiting");
1682
1683     static_thread->enabled = 0;
1684     pthread_exit(NULL);
1685     return NULL;
1686 }