]> arthur.barton.de Git - netdata.git/blob - src/health.c
Merge pull request #1040 from ktsaou/master
[netdata.git] / src / health.c
1 #include "common.h"
2
3 #define RRDVAR_MAX_LENGTH 1024
4
5 struct health_options {
6     const char *health_default_exec;
7     const char *health_default_recipient;
8     const char *log_filename;
9     FILE *log_fp;
10 };
11
12 static struct health_options health = {
13     .health_default_exec = PLUGINS_DIR "/alarm-notify.sh",
14     .health_default_recipient = "root",
15     .log_filename = VARLIB_DIR "/health/alarm_log.db",
16     .log_fp = NULL
17 };
18
19 int health_enabled = 1;
20
21 // ----------------------------------------------------------------------------
22 // health alarm log load/save
23 // no need for locking - only one thread is reading / writing the alarms log
24
25 static inline int health_alarm_log_open(void) {
26     if(health.log_fp)
27         fclose(health.log_fp);
28
29     health.log_fp = fopen(health.log_filename, "a");
30
31     if(health.log_fp) {
32         if (setvbuf(health.log_fp, NULL, _IOLBF, 0) != 0)
33             error("Cannot set line buffering on health log file.");
34         return 0;
35     }
36
37     error("Cannot open health log file '%s'. Health data will be lost in case of netdata or server crash.", health.log_filename);
38     return -1;
39 }
40
41 static inline void health_alarm_log_close(void) {
42     if(health.log_fp) {
43         fclose(health.log_fp);
44         health.log_fp = NULL;
45     }
46 }
47
48 static inline void health_log_recreate(void) {
49     if(health.log_fp != NULL) {
50         health_alarm_log_close();
51
52         // open it with truncate
53         health.log_fp = fopen(health.log_filename, "w");
54         if(health.log_fp) fclose(health.log_fp);
55         else error("Cannot truncate health log '%s'", health.log_filename);
56
57         health.log_fp = NULL;
58
59         health_alarm_log_open();
60     }
61 }
62
63 static inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
64     (void)host;
65     (void)ae;
66     
67 /*    if(likely(health.log_fp)) {
68         if(unlikely(fprintf(health.log_fp, "A\t%s\t%08x\t%08x\t%08x\t%08x\t%08x\t%08x\t%s\t%s\t%s\t%s\t%s\t%08x\n",
69             host->hostname,
70             ae->unique_id,
71             ae->alarm_id,
72             ae->alarm_event_id,
73             (uint32_t)ae->when,
74             (uint32_t)ae->duration,
75             (uint32_t)ae->non_clear_duration,
76             (uint32_t)ae->exec_run_timestamp,
77             ae->name,
78             ae->chart,
79             ae->family,
80             ae->exec,
81             ae->recipient
82             ) < 0))
83             error("Health: failed to save alarm log entry. Health data may be lost in case of abnormal restart.");
84     }
85 */
86 }
87
88 static inline void health_alarm_log_load(RRDHOST *host) {
89     (void)host;
90
91 }
92
93 // ----------------------------------------------------------------------------
94 // health alarm log management
95
96 static inline void health_alarm_log(RRDHOST *host,
97                 uint32_t alarm_id, uint32_t alarm_event_id,
98                 time_t when,
99                 const char *name, const char *chart, const char *family,
100                 const char *exec, const char *recipient, time_t duration,
101                 calculated_number old_value, calculated_number new_value,
102                 int old_status, int new_status,
103                 const char *source,
104                 const char *units,
105                 const char *info,
106                 int delay
107 ) {
108     debug(D_HEALTH, "Health adding alarm log entry with id: %u", host->health_log.next_log_id);
109
110     ALARM_ENTRY *ae = callocz(1, sizeof(ALARM_ENTRY));
111     ae->name = strdupz(name);
112     ae->hash_name = simple_hash(ae->name);
113
114     if(chart) {
115         ae->chart = strdupz(chart);
116         ae->hash_chart = simple_hash(ae->chart);
117     }
118
119     if(family)
120         ae->family = strdupz(family);
121
122     if(exec) ae->exec = strdupz(exec);
123     if(recipient) ae->recipient = strdupz(recipient);
124     if(source) ae->source = strdupz(source);
125     if(units) ae->units = strdupz(units);
126     if(info) ae->info = strdupz(info);
127
128     ae->unique_id = host->health_log.next_log_id++;
129     ae->alarm_id = alarm_id;
130     ae->alarm_event_id = alarm_event_id;
131     ae->when = when;
132     ae->old_value = old_value;
133     ae->new_value = new_value;
134     ae->old_status = old_status;
135     ae->new_status = new_status;
136     ae->duration = duration;
137     ae->delay = delay;
138     ae->delay_up_to_timestamp = when + delay;
139
140     if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL)
141         ae->non_clear_duration += ae->duration;
142
143     // link it
144     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
145     ae->next = host->health_log.alarms;
146     host->health_log.alarms = ae;
147     host->health_log.count++;
148     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
149
150     // match previous alarms
151     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
152     ALARM_ENTRY *t;
153     for(t = host->health_log.alarms ; t ; t = t->next) {
154         if(t != ae && t->alarm_id == ae->alarm_id) {
155             if(!(t->notifications & HEALTH_ENTRY_NOTIFICATIONS_UPDATED) && !t->updated_by_id) {
156                 t->notifications |= HEALTH_ENTRY_NOTIFICATIONS_UPDATED;
157                 t->updated_by_id = ae->unique_id;
158                 ae->updates_id = t->unique_id;
159
160                 if((t->new_status == RRDCALC_STATUS_WARNING || t->new_status == RRDCALC_STATUS_CRITICAL) &&
161                    (t->old_status == RRDCALC_STATUS_WARNING || t->old_status == RRDCALC_STATUS_CRITICAL))
162                     ae->non_clear_duration += t->non_clear_duration;
163
164                 health_alarm_log_save(host, t);
165             }
166             else {
167                 // no need to continue
168                 break;
169             }
170         }
171     }
172     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
173
174     health_alarm_log_save(host, ae);
175 }
176
177 // ----------------------------------------------------------------------------
178 // RRDVAR management
179
// Sanitizes a variable name in place.
//
// Every character that is not alphanumeric, '.' or '_' is replaced with '_'.
// Returns the number of characters replaced (0 when the name was already clean).
//
// Each character is converted to unsigned char before it is classified:
// passing a negative plain char to isalnum() is undefined behavior.
static inline int rrdvar_fix_name(char *variable) {
    int fixed = 0;

    for( ; *variable ; variable++) {
        unsigned char c = (unsigned char)*variable;

        if(!isalnum(c) && c != '.' && c != '_') {
            *variable = '_';
            fixed++;
        }
    }

    return fixed;
}
193
194 int rrdvar_compare(void* a, void* b) {
195     if(((RRDVAR *)a)->hash < ((RRDVAR *)b)->hash) return -1;
196     else if(((RRDVAR *)a)->hash > ((RRDVAR *)b)->hash) return 1;
197     else return strcmp(((RRDVAR *)a)->name, ((RRDVAR *)b)->name);
198 }
199
200 static inline RRDVAR *rrdvar_index_add(avl_tree_lock *tree, RRDVAR *rv) {
201     RRDVAR *ret = (RRDVAR *)avl_insert_lock(tree, (avl *)(rv));
202     if(ret != rv)
203         debug(D_VARIABLES, "Request to insert RRDVAR '%s' into index failed. Already exists.", rv->name);
204
205     return ret;
206 }
207
208 static inline RRDVAR *rrdvar_index_del(avl_tree_lock *tree, RRDVAR *rv) {
209     RRDVAR *ret = (RRDVAR *)avl_remove_lock(tree, (avl *)(rv));
210     if(!ret)
211         error("Request to remove RRDVAR '%s' from index failed. Not Found.", rv->name);
212
213     return ret;
214 }
215
216 static inline RRDVAR *rrdvar_index_find(avl_tree_lock *tree, const char *name, uint32_t hash) {
217     RRDVAR tmp;
218     tmp.name = (char *)name;
219     tmp.hash = (hash)?hash:simple_hash(tmp.name);
220
221     return (RRDVAR *)avl_search_lock(tree, (avl *)&tmp);
222 }
223
224 static inline void rrdvar_free(RRDHOST *host, avl_tree_lock *tree, RRDVAR *rv) {
225     (void)host;
226
227     if(!rv) return;
228
229     if(tree)
230         rrdvar_index_del(tree, rv);
231
232     freez(rv->name);
233     freez(rv);
234 }
235
236 static inline RRDVAR *rrdvar_create_and_index(const char *scope, avl_tree_lock *tree, const char *name, int type, calculated_number *value) {
237     char *variable = strdupz(name);
238     rrdvar_fix_name(variable);
239     uint32_t hash = simple_hash(variable);
240
241     RRDVAR *rv = rrdvar_index_find(tree, variable, hash);
242     if(unlikely(!rv)) {
243         debug(D_VARIABLES, "Variable '%s' not found in scope '%s'. Creating a new one.", variable, scope);
244
245         rv = callocz(1, sizeof(RRDVAR));
246         rv->name = variable;
247         rv->hash = hash;
248         rv->type = type;
249         rv->value = value;
250
251         RRDVAR *ret = rrdvar_index_add(tree, rv);
252         if(unlikely(ret != rv)) {
253             debug(D_VARIABLES, "Variable '%s' in scope '%s' already exists", variable, scope);
254             rrdvar_free(NULL, NULL, rv);
255             rv = NULL;
256         }
257         else
258             debug(D_VARIABLES, "Variable '%s' created in scope '%s'", variable, scope);
259     }
260     else {
261         // already exists
262         freez(variable);
263         rv = NULL;
264     }
265
266     return rv;
267 }
268
269 // ----------------------------------------------------------------------------
270 // RRDVAR lookup
271
272 calculated_number rrdvar2number(RRDVAR *rv) {
273     switch(rv->type) {
274         case RRDVAR_TYPE_CALCULATED: {
275             calculated_number *n = (calculated_number *)rv->value;
276             return *n;
277         }
278
279         case RRDVAR_TYPE_TIME_T: {
280             time_t *n = (time_t *)rv->value;
281             return *n;
282         }
283
284         case RRDVAR_TYPE_COLLECTED: {
285             collected_number *n = (collected_number *)rv->value;
286             return *n;
287         }
288
289         case RRDVAR_TYPE_TOTAL: {
290             total_number *n = (total_number *)rv->value;
291             return *n;
292         }
293
294         case RRDVAR_TYPE_INT: {
295             int *n = (int *)rv->value;
296             return *n;
297         }
298
299         default:
300             error("I don't know how to convert RRDVAR type %d to calculated_number", rv->type);
301             return NAN;
302     }
303 }
304
305 void dump_variable(void *data) {
306     RRDVAR *rv = (RRDVAR *)data;
307     debug(D_HEALTH, "%50s : %20.5Lf", rv->name, rrdvar2number(rv));
308 }
309
310 int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, calculated_number *result) {
311     RRDSET *st = rc->rrdset;
312     RRDVAR *rv;
313
314     if(!st) return 0;
315
316     rv = rrdvar_index_find(&st->variables_root_index, variable, hash);
317     if(rv) {
318         *result = rrdvar2number(rv);
319         return 1;
320     }
321
322     rv = rrdvar_index_find(&st->rrdfamily->variables_root_index, variable, hash);
323     if(rv) {
324         *result = rrdvar2number(rv);
325         return 1;
326     }
327
328     rv = rrdvar_index_find(&st->rrdhost->variables_root_index, variable, hash);
329     if(rv) {
330         *result = rrdvar2number(rv);
331         return 1;
332     }
333
334     debug(D_HEALTH, "Available local chart '%s' variables:", st->id);
335     avl_traverse_lock(&st->variables_root_index, dump_variable);
336
337     debug(D_HEALTH, "Available family '%s' variables:", st->rrdfamily->family);
338     avl_traverse_lock(&st->rrdfamily->variables_root_index, dump_variable);
339
340     debug(D_HEALTH, "Available host '%s' variables:", st->rrdhost->hostname);
341     avl_traverse_lock(&st->rrdhost->variables_root_index, dump_variable);
342
343     return 0;
344 }
345
346 // ----------------------------------------------------------------------------
347 // RRDSETVAR management
348
349 RRDSETVAR *rrdsetvar_create(RRDSET *st, const char *variable, int type, void *value, uint32_t options) {
350     debug(D_VARIABLES, "RRDVARSET create for chart id '%s' name '%s' with variable name '%s'", st->id, st->name, variable);
351     RRDSETVAR *rs = (RRDSETVAR *)callocz(1, sizeof(RRDSETVAR));
352
353     char buffer[RRDVAR_MAX_LENGTH + 1];
354     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->id, variable);
355     rs->fullid = strdupz(buffer);
356
357     snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, variable);
358     rs->fullname = strdupz(buffer);
359
360     rs->variable = strdupz(variable);
361
362     rs->type = type;
363     rs->value = value;
364     rs->options = options;
365     rs->rrdset = st;
366
367     rs->local       = rrdvar_create_and_index("local",  &st->variables_root_index, rs->variable, rs->type, rs->value);
368     rs->family      = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullid, rs->type, rs->value);
369     rs->host        = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullid, rs->type, rs->value);
370     rs->family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullname, rs->type, rs->value);
371     rs->host_name   = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullname, rs->type, rs->value);
372
373     rs->next = st->variables;
374     st->variables = rs;
375
376     return rs;
377 }
378
379 void rrdsetvar_rename_all(RRDSET *st) {
380     debug(D_VARIABLES, "RRDSETVAR rename for chart id '%s' name '%s'", st->id, st->name);
381
382     // only these 2 can change name
383     // rs->family_name
384     // rs->host_name
385
386     char buffer[RRDVAR_MAX_LENGTH + 1];
387     RRDSETVAR *rs, *next = st->variables;
388     while((rs = next)) {
389         next = rs->next;
390
391         snprintfz(buffer, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rs->variable);
392
393         if (strcmp(buffer, rs->fullname)) {
394             // name changed
395             rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
396             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_name);
397
398             freez(rs->fullname);
399             rs->fullname = strdupz(st->name);
400             rs->family_name = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->fullname, rs->type, rs->value);
401             rs->host_name   = rrdvar_create_and_index("host",   &st->rrdhost->variables_root_index, rs->fullname, rs->type, rs->value);
402         }
403     }
404
405     rrdsetcalc_link_matching(st);
406 }
407
408 void rrdsetvar_free(RRDSETVAR *rs) {
409     RRDSET *st = rs->rrdset;
410     debug(D_VARIABLES, "RRDSETVAR free for chart id '%s' name '%s', variable '%s'", st->id, st->name, rs->variable);
411
412     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local);
413     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family);
414     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host);
415     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
416     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_name);
417
418     if(st->variables == rs) {
419         st->variables = rs->next;
420     }
421     else {
422         RRDSETVAR *t;
423         for (t = st->variables; t && t->next != rs; t = t->next);
424         if(!t) error("RRDSETVAR '%s' not found in chart '%s' variables linked list", rs->fullname, st->id);
425         else t->next = rs->next;
426     }
427
428     freez(rs->fullid);
429     freez(rs->fullname);
430     freez(rs->variable);
431     freez(rs);
432 }
433
434 // ----------------------------------------------------------------------------
435 // RRDDIMVAR management
436
437 #define RRDDIMVAR_ID_MAX 1024
438
439 RRDDIMVAR *rrddimvar_create(RRDDIM *rd, int type, const char *prefix, const char *suffix, void *value, uint32_t options) {
440     RRDSET *st = rd->rrdset;
441
442     debug(D_VARIABLES, "RRDDIMSET create for chart id '%s' name '%s', dimension id '%s', name '%s%s%s'", st->id, st->name, rd->id, (prefix)?prefix:"", rd->name, (suffix)?suffix:"");
443
444     if(!prefix) prefix = "";
445     if(!suffix) suffix = "";
446
447     char buffer[RRDDIMVAR_ID_MAX + 1];
448     RRDDIMVAR *rs = (RRDDIMVAR *)callocz(1, sizeof(RRDDIMVAR));
449
450     rs->prefix = strdupz(prefix);
451     rs->suffix = strdupz(suffix);
452
453     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->id, rs->suffix);
454     rs->id = strdupz(buffer);
455
456     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
457     rs->name = strdupz(buffer);
458
459     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->id, rs->id);
460     rs->fullidid = strdupz(buffer);
461
462     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->id, rs->name);
463     rs->fullidname = strdupz(buffer);
464
465     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->name, rs->id);
466     rs->fullnameid = strdupz(buffer);
467
468     snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", rd->rrdset->name, rs->name);
469     rs->fullnamename = strdupz(buffer);
470
471     rs->type = type;
472     rs->value = value;
473     rs->options = options;
474     rs->rrddim = rd;
475
476     rs->local_id     = rrdvar_create_and_index("local", &st->variables_root_index, rs->id, rs->type, rs->value);
477     rs->local_name   = rrdvar_create_and_index("local", &st->variables_root_index, rs->name, rs->type, rs->value);
478
479     rs->family_id    = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->id, rs->type, rs->value);
480     rs->family_name  = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rs->name, rs->type, rs->value);
481
482     rs->host_fullidid     = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullidid, rs->type, rs->value);
483     rs->host_fullidname   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullidname, rs->type, rs->value);
484     rs->host_fullnameid   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullnameid, rs->type, rs->value);
485     rs->host_fullnamename = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, rs->fullnamename, rs->type, rs->value);
486
487     rs->next = rd->variables;
488     rd->variables = rs;
489
490     return rs;
491 }
492
493 void rrddimvar_rename_all(RRDDIM *rd) {
494     RRDSET *st = rd->rrdset;
495     debug(D_VARIABLES, "RRDDIMSET rename for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
496
497     RRDDIMVAR *rs, *next = rd->variables;
498     while((rs = next)) {
499         next = rs->next;
500
501         if (strcmp(rd->name, rs->name)) {
502             char buffer[RRDDIMVAR_ID_MAX + 1];
503             // name changed
504
505             // name
506             rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_name);
507             freez(rs->name);
508             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s%s%s", rs->prefix, rd->name, rs->suffix);
509             rs->name = strdupz(buffer);
510             rs->local_name = rrdvar_create_and_index("local", &st->variables_root_index, rs->name, rs->type, rs->value);
511
512             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidname);
513             freez(rs->fullidname);
514             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->id, rs->name);
515             rs->fullidname = strdupz(buffer);
516             rs->host_fullidname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index,
517                                                              rs->fullidname, rs->type, rs->value);
518
519             // fullnameid
520             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnameid);
521             freez(rs->fullnameid);
522             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->id);
523             rs->fullnameid = strdupz(buffer);
524             rs->host_fullnameid = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index,
525                                                           rs->fullnameid, rs->type, rs->value);
526
527             // fullnamename
528             rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnamename);
529             freez(rs->fullnamename);
530             snprintfz(buffer, RRDDIMVAR_ID_MAX, "%s.%s", st->name, rs->name);
531             rs->fullnamename = strdupz(buffer);
532             rs->host_fullnamename = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index,
533                                                           rs->fullnamename, rs->type, rs->value);
534         }
535     }
536 }
537
538 void rrddimvar_free(RRDDIMVAR *rs) {
539     RRDDIM *rd = rs->rrddim;
540     RRDSET *st = rd->rrdset;
541     debug(D_VARIABLES, "RRDDIMSET free for chart id '%s' name '%s', dimension id '%s', name '%s', prefix='%s', suffix='%s'", st->id, st->name, rd->id, rd->name, rs->prefix, rs->suffix);
542
543     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_id);
544     rrdvar_free(st->rrdhost, &st->variables_root_index, rs->local_name);
545
546     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_id);
547     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rs->family_name);
548
549     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidid);
550     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullidname);
551     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnameid);
552     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rs->host_fullnamename);
553
554     if(rd->variables == rs) {
555         debug(D_VARIABLES, "RRDDIMSET removing first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
556         rd->variables = rs->next;
557     }
558     else {
559         debug(D_VARIABLES, "RRDDIMSET removing non-first entry for chart id '%s' name '%s', dimension id '%s', name '%s'", st->id, st->name, rd->id, rd->name);
560         RRDDIMVAR *t;
561         for (t = rd->variables; t && t->next != rs; t = t->next) ;
562         if(!t) error("RRDDIMVAR '%s' not found in dimension '%s/%s' variables linked list", rs->name, st->id, rd->id);
563         else t->next = rs->next;
564     }
565
566     freez(rs->prefix);
567     freez(rs->suffix);
568     freez(rs->id);
569     freez(rs->name);
570     freez(rs->fullidid);
571     freez(rs->fullidname);
572     freez(rs->fullnameid);
573     freez(rs->fullnamename);
574     freez(rs);
575 }
576
577 // ----------------------------------------------------------------------------
578 // RRDCALC management
579
580 static inline const char *rrdcalc_status2string(int status) {
581     switch(status) {
582         case RRDCALC_STATUS_REMOVED:
583             return "REMOVED";
584
585         case RRDCALC_STATUS_UNDEFINED:
586             return "UNDEFINED";
587
588         case RRDCALC_STATUS_UNINITIALIZED:
589             return "UNINITIALIZED";
590
591         case RRDCALC_STATUS_CLEAR:
592             return "CLEAR";
593
594         case RRDCALC_STATUS_RAISED:
595             return "RAISED";
596
597         case RRDCALC_STATUS_WARNING:
598             return "WARNING";
599
600         case RRDCALC_STATUS_CRITICAL:
601             return "CRITICAL";
602
603         default:
604             error("Unknown alarm status %d", status);
605             return "UNKNOWN";
606     }
607 }
608
609 static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
610     debug(D_HEALTH, "Health linking alarm '%s.%s' to chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, st->rrdhost->hostname);
611
612     rc->last_status_change = time(NULL);
613     rc->rrdset = st;
614
615     rc->rrdset_next = st->alarms;
616     rc->rrdset_prev = NULL;
617     
618     if(rc->rrdset_next)
619         rc->rrdset_next->rrdset_prev = rc;
620
621     st->alarms = rc;
622
623     if(rc->update_every < rc->rrdset->update_every) {
624         error("Health alarm '%s.%s' has update every %d, less than chart update every %d. Setting alarm update frequency to %d.", rc->rrdset->id, rc->name, rc->update_every, rc->rrdset->update_every, rc->rrdset->update_every);
625         rc->update_every = rc->rrdset->update_every;
626     }
627
628     if(!isnan(rc->green) && isnan(st->green)) {
629         debug(D_HEALTH, "Health alarm '%s.%s' green threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->green, rc->green);
630         st->green = rc->green;
631     }
632
633     if(!isnan(rc->red) && isnan(st->red)) {
634         debug(D_HEALTH, "Health alarm '%s.%s' red threshold set from %Lf to %Lf.", rc->rrdset->id, rc->name, rc->rrdset->red, rc->red);
635         st->red = rc->red;
636     }
637
638     rc->local  = rrdvar_create_and_index("local",  &st->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
639     rc->family = rrdvar_create_and_index("family", &st->rrdfamily->variables_root_index, rc->name, RRDVAR_TYPE_CALCULATED, &rc->value);
640
641     char fullname[RRDVAR_MAX_LENGTH + 1];
642     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->id, rc->name);
643     rc->hostid   = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
644
645     snprintfz(fullname, RRDVAR_MAX_LENGTH, "%s.%s", st->name, rc->name);
646     rc->hostname = rrdvar_create_and_index("host", &st->rrdhost->variables_root_index, fullname, RRDVAR_TYPE_CALCULATED, &rc->value);
647
648         if(!rc->units) rc->units = strdupz(st->units);
649
650     {
651         time_t now = time(NULL);
652         health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_UNINITIALIZED, rc->source, rc->units, rc->info, 0);
653     }
654 }
655
656 static inline int rrdcalc_is_matching_this_rrdset(RRDCALC *rc, RRDSET *st) {
657     if(     (rc->hash_chart == st->hash      && !strcmp(rc->chart, st->id)) ||
658             (rc->hash_chart == st->hash_name && !strcmp(rc->chart, st->name)))
659         return 1;
660
661     return 0;
662 }
663
664 // this has to be called while the RRDHOST is locked
665 inline void rrdsetcalc_link_matching(RRDSET *st) {
666     // debug(D_HEALTH, "find matching alarms for chart '%s'", st->id);
667
668     RRDCALC *rc;
669     for(rc = st->rrdhost->alarms; rc ; rc = rc->next) {
670         if(unlikely(rc->rrdset))
671             continue;
672
673         if(unlikely(rrdcalc_is_matching_this_rrdset(rc, st)))
674             rrdsetcalc_link(st, rc);
675     }
676 }
677
678 // this has to be called while the RRDHOST is locked
679 inline void rrdsetcalc_unlink(RRDCALC *rc) {
680     RRDSET *st = rc->rrdset;
681
682     if(!st) {
683         debug(D_HEALTH, "Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
684         error("Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rc->chart?rc->chart:"NOCHART", rc->name);
685         return;
686     }
687
688     {
689         time_t now = time(NULL);
690         health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0);
691     }
692
693     RRDHOST *host = st->rrdhost;
694
695     debug(D_HEALTH, "Health unlinking alarm '%s.%s' from chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, host->hostname);
696
697     // unlink it
698     if(rc->rrdset_prev)
699         rc->rrdset_prev->rrdset_next = rc->rrdset_next;
700
701     if(rc->rrdset_next)
702         rc->rrdset_next->rrdset_prev = rc->rrdset_prev;
703
704     if(st->alarms == rc)
705         st->alarms = rc->rrdset_next;
706
707     rc->rrdset_prev = rc->rrdset_next = NULL;
708
709     rrdvar_free(st->rrdhost, &st->variables_root_index, rc->local);
710     rc->local = NULL;
711
712     rrdvar_free(st->rrdhost, &st->rrdfamily->variables_root_index, rc->family);
713     rc->family = NULL;
714
715     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostid);
716     rc->hostid = NULL;
717
718     rrdvar_free(st->rrdhost, &st->rrdhost->variables_root_index, rc->hostname);
719     rc->hostname = NULL;
720
721     rc->rrdset = NULL;
722
723     // RRDCALC will remain in RRDHOST
724     // so that if the matching chart is found in the future
725     // it will be applied automatically
726 }
727
728 RRDCALC *rrdcalc_find(RRDSET *st, const char *name) {
729     RRDCALC *rc;
730     uint32_t hash = simple_hash(name);
731
732     for( rc = st->alarms; rc ; rc = rc->rrdset_next ) {
733         if(unlikely(rc->hash == hash && !strcmp(rc->name, name)))
734             return rc;
735     }
736
737     return NULL;
738 }
739
740 static inline int rrdcalc_exists(RRDHOST *host, const char *chart, const char *name, uint32_t hash_chart, uint32_t hash_name) {
741     RRDCALC *rc;
742
743     if(unlikely(!chart)) {
744         error("attempt to find RRDCALC '%s' without giving a chart name", name);
745         return 1;
746     }
747
748     if(unlikely(!hash_chart)) hash_chart = simple_hash(chart);
749     if(unlikely(!hash_name))  hash_name  = simple_hash(name);
750
751     // make sure it does not already exist
752     for(rc = host->alarms; rc ; rc = rc->next) {
753         if (unlikely(rc->chart && rc->hash == hash_name && rc->hash_chart == hash_chart && !strcmp(name, rc->name) && !strcmp(chart, rc->chart))) {
754             debug(D_HEALTH, "Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
755             error("Health alarm '%s.%s' already exists in host '%s'.", chart, name, host->hostname);
756             return 1;
757         }
758     }
759
760     return 0;
761 }
762
763 static inline uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const char *name, uint32_t *next_event_id) {
764     if(chart && name) {
765         uint32_t hash_chart = simple_hash(chart);
766         uint32_t hash_name = simple_hash(name);
767
768         // re-use old IDs, by looking them up in the alarm log
769         ALARM_ENTRY *ae;
770         for(ae = host->health_log.alarms; ae ;ae = ae->next) {
771             if(unlikely(ae->hash_name == hash_name && ae->hash_chart == hash_chart && !strcmp(name, ae->name) && !strcmp(chart, ae->chart))) {
772                 if(next_event_id) *next_event_id = ae->alarm_event_id + 1;
773                 return ae->alarm_id;
774             }
775         }
776     }
777
778     return host->health_log.next_alarm_id++;
779 }
780
781 static inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) {
782     rrdhost_check_rdlock(host);
783
784     if(rc->calculation) {
785         rc->calculation->status = &rc->status;
786         rc->calculation->this = &rc->value;
787         rc->calculation->after = &rc->db_after;
788         rc->calculation->before = &rc->db_before;
789         rc->calculation->rrdcalc = rc;
790     }
791
792     if(rc->warning) {
793         rc->warning->status = &rc->status;
794         rc->warning->this = &rc->value;
795         rc->warning->after = &rc->db_after;
796         rc->warning->before = &rc->db_before;
797         rc->warning->rrdcalc = rc;
798     }
799
800     if(rc->critical) {
801         rc->critical->status = &rc->status;
802         rc->critical->this = &rc->value;
803         rc->critical->after = &rc->db_after;
804         rc->critical->before = &rc->db_before;
805         rc->critical->rrdcalc = rc;
806     }
807
808     // link it to the host
809     if(likely(host->alarms)) {
810         // append it
811         RRDCALC *t;
812         for(t = host->alarms; t && t->next ; t = t->next) ;
813         t->next = rc;
814     }
815     else {
816         host->alarms = rc;
817     }
818
819     // link it to its chart
820     RRDSET *st;
821     for(st = host->rrdset_root; st ; st = st->next) {
822         if(rrdcalc_is_matching_this_rrdset(rc, st)) {
823             rrdsetcalc_link(st, rc);
824             break;
825         }
826     }
827 }
828
829 static inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart) {
830
831     debug(D_HEALTH, "Health creating dynamic alarm (from template) '%s.%s'", chart, rt->name);
832
833     if(rrdcalc_exists(host, chart, rt->name, 0, 0))
834         return NULL;
835
836     RRDCALC *rc = callocz(1, sizeof(RRDCALC));
837     rc->next_event_id = 1;
838     rc->id = rrdcalc_get_unique_id(host, chart, rt->name, &rc->next_event_id);
839     rc->name = strdupz(rt->name);
840     rc->hash = simple_hash(rc->name);
841     rc->chart = strdupz(chart);
842     rc->hash_chart = simple_hash(rc->chart);
843
844     if(rt->dimensions) rc->dimensions = strdupz(rt->dimensions);
845
846     rc->green = rt->green;
847     rc->red = rt->red;
848     rc->value = NAN;
849     rc->old_value = NAN;
850
851     rc->delay_up_duration = rt->delay_up_duration;
852     rc->delay_down_duration = rt->delay_down_duration;
853     rc->delay_max_duration = rt->delay_max_duration;
854     rc->delay_multiplier = rt->delay_multiplier;
855
856     rc->group = rt->group;
857     rc->after = rt->after;
858     rc->before = rt->before;
859     rc->update_every = rt->update_every;
860     rc->options = rt->options;
861
862     if(rt->exec) rc->exec = strdupz(rt->exec);
863     if(rt->recipient) rc->recipient = strdupz(rt->recipient);
864     if(rt->source) rc->source = strdupz(rt->source);
865     if(rt->units) rc->units = strdupz(rt->units);
866     if(rt->info) rc->info = strdupz(rt->info);
867
868     if(rt->calculation) {
869         rc->calculation = expression_parse(rt->calculation->source, NULL, NULL);
870         if(!rc->calculation)
871             error("Health alarm '%s.%s': failed to parse calculation expression '%s'", chart, rt->name, rt->calculation->source);
872     }
873     if(rt->warning) {
874         rc->warning = expression_parse(rt->warning->source, NULL, NULL);
875         if(!rc->warning)
876             error("Health alarm '%s.%s': failed to re-parse warning expression '%s'", chart, rt->name, rt->warning->source);
877     }
878     if(rt->critical) {
879         rc->critical = expression_parse(rt->critical->source, NULL, NULL);
880         if(!rc->critical)
881             error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", chart, rt->name, rt->critical->source);
882     }
883
884     debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
885           (rc->chart)?rc->chart:"NOCHART",
886           rc->name,
887           (rc->exec)?rc->exec:"DEFAULT",
888           (rc->recipient)?rc->recipient:"DEFAULT",
889           rc->green,
890           rc->red,
891           rc->group,
892           rc->after,
893           rc->before,
894           rc->options,
895           (rc->dimensions)?rc->dimensions:"NONE",
896           rc->update_every,
897           (rc->calculation)?rc->calculation->parsed_as:"NONE",
898           (rc->warning)?rc->warning->parsed_as:"NONE",
899           (rc->critical)?rc->critical->parsed_as:"NONE",
900           rc->source,
901           rc->delay_up_duration,
902           rc->delay_down_duration,
903           rc->delay_max_duration,
904           rc->delay_multiplier
905     );
906
907     rrdcalc_create_part2(host, rc);
908     return rc;
909 }
910
911 void rrdcalc_free(RRDHOST *host, RRDCALC *rc) {
912     if(!rc) return;
913
914     debug(D_HEALTH, "Health removing alarm '%s.%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
915
916     // unlink it from RRDSET
917     if(rc->rrdset) rrdsetcalc_unlink(rc);
918
919     // unlink it from RRDHOST
920     if(unlikely(rc == host->alarms))
921         host->alarms = rc->next;
922
923     else if(likely(host->alarms)) {
924         RRDCALC *t, *last = host->alarms;
925         for(t = last->next; t && t != rc; last = t, t = t->next) ;
926         if(last->next == rc)
927             last->next = rc->next;
928         else
929             error("Cannot unlink alarm '%s.%s' from host '%s': not found", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
930     }
931     else
932         error("Cannot unlink unlink '%s.%s' from host '%s': This host does not have any calculations", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
933
934     expression_free(rc->calculation);
935     expression_free(rc->warning);
936     expression_free(rc->critical);
937
938     freez(rc->name);
939     freez(rc->chart);
940     freez(rc->family);
941     freez(rc->dimensions);
942     freez(rc->exec);
943     freez(rc->recipient);
944     freez(rc->source);
945     freez(rc->units);
946     freez(rc->info);
947     freez(rc);
948 }
949
950 // ----------------------------------------------------------------------------
951 // RRDCALCTEMPLATE management
952
953 void rrdcalctemplate_link_matching(RRDSET *st) {
954     RRDCALCTEMPLATE *rt;
955
956     for(rt = st->rrdhost->templates; rt ; rt = rt->next) {
957         if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context)) {
958             RRDCALC *rc = rrdcalc_create(st->rrdhost, rt, st->id);
959             if(unlikely(!rc))
960                 error("Health tried to create alarm from template '%s', but it failed", rt->name);
961
962 #ifdef NETDATA_INTERNAL_CHECKS
963             else if(rc->rrdset != st)
964                 error("Health alarm '%s.%s' should be linked to chart '%s', but it is not", rc->chart?rc->chart:"NOCHART", rc->name, st->id);
965 #endif
966         }
967     }
968 }
969
970 static inline void rrdcalctemplate_free(RRDHOST *host, RRDCALCTEMPLATE *rt) {
971     debug(D_HEALTH, "Health removing template '%s' of host '%s'", rt->name, host->hostname);
972
973     if(host->templates) {
974         if(host->templates == rt) {
975             host->templates = rt->next;
976         }
977         else {
978             RRDCALCTEMPLATE *t, *last = host->templates;
979             for (t = last->next; t && t != rt; last = t, t = t->next ) ;
980             if(last && last->next == rt) {
981                 last->next = rt->next;
982                 rt->next = NULL;
983             }
984             else
985                 error("Cannot find RRDCALCTEMPLATE '%s' linked in host '%s'", rt->name, host->hostname);
986         }
987     }
988
989     expression_free(rt->calculation);
990     expression_free(rt->warning);
991     expression_free(rt->critical);
992
993     freez(rt->name);
994     freez(rt->exec);
995     freez(rt->recipient);
996     freez(rt->context);
997     freez(rt->source);
998     freez(rt->units);
999     freez(rt->info);
1000     freez(rt->dimensions);
1001     freez(rt);
1002 }
1003
1004 // ----------------------------------------------------------------------------
1005 // load health configuration
1006
// size of the buffer health_readfile() uses to read a configuration line
// (lines continued with a trailing backslash are accumulated in the same buffer)
#define HEALTH_CONF_MAX_LINE 4096

// keywords of the health configuration files ('key: value' lines);
// health_readfile() compares them with strcasecmp(), i.e. ignoring case
#define HEALTH_ALARM_KEY "alarm"        // starts the definition of an alarm
#define HEALTH_TEMPLATE_KEY "template"  // starts the definition of a template
#define HEALTH_ON_KEY "on"              // the chart of an alarm
#define HEALTH_LOOKUP_KEY "lookup"      // database lookup, see health_parse_db_lookup()
#define HEALTH_CALC_KEY "calc"          // calculation expression
#define HEALTH_EVERY_KEY "every"        // update frequency, also accepted inside 'lookup'
#define HEALTH_GREEN_KEY "green"        // numeric 'green' value
#define HEALTH_RED_KEY "red"            // numeric 'red' value
#define HEALTH_WARN_KEY "warn"          // warning expression
#define HEALTH_CRIT_KEY "crit"          // critical expression
#define HEALTH_EXEC_KEY "exec"          // stored in the 'exec' field
#define HEALTH_RECIPIENT_KEY "to"       // stored in the 'recipient' field
#define HEALTH_UNITS_KEY "units"        // units text (quotes are stripped)
#define HEALTH_INFO_KEY "info"          // info text (quotes are stripped)
#define HEALTH_DELAY_KEY "delay"        // delay settings, see health_parse_delay()
1024
1025 static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
1026     if(!rc->chart) {
1027         error("Health configuration for alarm '%s' does not have a chart", rc->name);
1028         return 0;
1029     }
1030
1031     if(!rc->update_every) {
1032         error("Health configuration for alarm '%s.%s' has no frequency (parameter 'every'). Ignoring it.", rc->chart?rc->chart:"NOCHART", rc->name);
1033         return 0;
1034     }
1035
1036     if(!RRDCALC_HAS_DB_LOOKUP(rc) && !rc->warning && !rc->critical) {
1037         error("Health configuration for alarm '%s.%s' is useless (no calculation, no warning and no critical evaluation)", rc->chart?rc->chart:"NOCHART", rc->name);
1038         return 0;
1039     }
1040
1041     if (rrdcalc_exists(host, rc->chart, rc->name, rc->hash_chart, rc->hash))
1042         return 0;
1043
1044     rc->id = rrdcalc_get_unique_id(&localhost, rc->chart, rc->name, &rc->next_event_id);
1045
1046     debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1047           rc->chart?rc->chart:"NOCHART",
1048           rc->name,
1049           rc->id,
1050           (rc->exec)?rc->exec:"DEFAULT",
1051           (rc->recipient)?rc->recipient:"DEFAULT",
1052           rc->green,
1053           rc->red,
1054           rc->group,
1055           rc->after,
1056           rc->before,
1057           rc->options,
1058           (rc->dimensions)?rc->dimensions:"NONE",
1059           rc->update_every,
1060           (rc->calculation)?rc->calculation->parsed_as:"NONE",
1061           (rc->warning)?rc->warning->parsed_as:"NONE",
1062           (rc->critical)?rc->critical->parsed_as:"NONE",
1063           rc->source,
1064           rc->delay_up_duration,
1065           rc->delay_down_duration,
1066           rc->delay_max_duration,
1067           rc->delay_multiplier
1068     );
1069
1070     rrdcalc_create_part2(host, rc);
1071     return 1;
1072 }
1073
1074 static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCALCTEMPLATE *rt) {
1075     if(unlikely(!rt->context)) {
1076         error("Health configuration for template '%s' does not have a context", rt->name);
1077         return 0;
1078     }
1079
1080     if(unlikely(!rt->update_every)) {
1081         error("Health configuration for template '%s' has no frequency (parameter 'every'). Ignoring it.", rt->name);
1082         return 0;
1083     }
1084
1085     if(unlikely(!RRDCALCTEMPLATE_HAS_CALCULATION(rt) && !rt->warning && !rt->critical)) {
1086         error("Health configuration for template '%s' is useless (no calculation, no warning and no critical evaluation)", rt->name);
1087         return 0;
1088     }
1089
1090     RRDCALCTEMPLATE *t, *last = NULL;
1091     for (t = host->templates; t ; last = t, t = t->next) {
1092         if(unlikely(t->hash_name == rt->hash_name && !strcmp(t->name, rt->name))) {
1093             error("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname);
1094             return 0;
1095         }
1096     }
1097
1098     debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
1099           rt->name,
1100           (rt->context)?rt->context:"NONE",
1101           (rt->exec)?rt->exec:"DEFAULT",
1102           (rt->recipient)?rt->recipient:"DEFAULT",
1103           rt->green,
1104           rt->red,
1105           rt->group,
1106           rt->after,
1107           rt->before,
1108           rt->options,
1109           (rt->dimensions)?rt->dimensions:"NONE",
1110           rt->update_every,
1111           (rt->calculation)?rt->calculation->parsed_as:"NONE",
1112           (rt->warning)?rt->warning->parsed_as:"NONE",
1113           (rt->critical)?rt->critical->parsed_as:"NONE",
1114           rt->source,
1115           rt->delay_up_duration,
1116           rt->delay_down_duration,
1117           rt->delay_max_duration,
1118           rt->delay_multiplier
1119     );
1120
1121     if(likely(last)) {
1122         last->next = rt;
1123     }
1124     else {
1125         rt->next = host->templates;
1126         host->templates = rt;
1127     }
1128
1129     return 1;
1130 }
1131
// Parse a duration like '10', '-5m', '1.5h' or '2d' into seconds.
//
// The text must start with a digit or a sign. The number may be followed by a
// unit: Y (365 days), M (30 days), w, d, h, m or s. Only the first character
// after the number is examined; a missing or unknown unit means seconds.
//
// On success the number of seconds (truncated to int) is stored in *result and
// 1 is returned. On failure (no number at the beginning of the text, or a
// number that is not finite) *result is set to 0 and 0 is returned.
static inline int health_parse_duration(char *string, int *result) {
    // <ctype.h> functions must not be given a negative char
    unsigned char first = (unsigned char)*string;

    // make sure it is a number
    if(!first || !(isdigit(first) || first == '+' || first == '-')) {
        *result = 0;
        return 0;
    }

    char *e = NULL;
    long double n = strtold(string, &e);  // long double is what strtold() returns

    // a lonely sign (e.g. '+' or '-x') makes strtold() consume nothing, and
    // '+inf' / '-nan' cannot be converted to int: neither is a valid duration
    if(e == string || isnan(n) || isinf(n)) {
        *result = 0;
        return 0;
    }

    // when nothing follows the number, *e is '\0' and the default case applies
    switch (*e) {
        case 'Y':
            *result = (int) (n * 86400 * 365);
            break;

        case 'M':
            *result = (int) (n * 86400 * 30);
            break;

        case 'w':
            *result = (int) (n * 86400 * 7);
            break;

        case 'd':
            *result = (int) (n * 86400);
            break;

        case 'h':
            *result = (int) (n * 3600);
            break;

        case 'm':
            *result = (int) (n * 60);
            break;

        default:
        case 's':
            *result = (int) (n);
            break;
    }

    return 1;
}
1173
// Parse the value of a 'delay' configuration line.
//
// The value is a list of whitespace separated 'keyword value' pairs. The
// keywords (compared ignoring case) are:
//   up <duration>        stored in *delay_up_duration
//   down <duration>      stored in *delay_down_duration
//   max <duration>       stored in *delay_max_duration
//   multiplier <number>  stored in *delay_multiplier, must be finite and > 0
//
// 'line', 'path' and 'file' are only used in the error messages. 'string' is
// modified in place (it is split by writing NUL characters into it).
//
// Invalid values and unknown keywords are logged and ignored. Settings not
// (validly) given get defaults: up and down become 0 and the multiplier 1.0.
// When 'max' is not given, *delay_max_duration is raised, if needed, to cover
// both up * multiplier and down * multiplier.
//
// Always returns 1.
static inline int health_parse_delay(
        size_t line, const char *path, const char *file, char *string,
        int *delay_up_duration,
        int *delay_down_duration,
        int *delay_max_duration,
        float *delay_multiplier) {

    char given_up = 0;
    char given_down = 0;
    char given_max = 0;
    char given_multiplier = 0;

    char *s = string;
    while(*s) {
        // isolate the keyword
        // (the casts are needed: isspace() must not be given a negative char)
        char *key = s;

        while(*s && !isspace((unsigned char)*s)) s++;
        while(*s && isspace((unsigned char)*s)) *s++ = '\0';

        if(!*key) break;

        // isolate its value
        char *value = s;
        while(*s && !isspace((unsigned char)*s)) s++;
        while(*s && isspace((unsigned char)*s)) *s++ = '\0';

        if(!strcasecmp(key, "up")) {
            if (!health_parse_duration(value, delay_up_duration)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_up = 1;
        }
        else if(!strcasecmp(key, "down")) {
            if (!health_parse_duration(value, delay_down_duration)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_down = 1;
        }
        else if(!strcasecmp(key, "multiplier")) {
            *delay_multiplier = strtof(value, NULL);
            if(isnan(*delay_multiplier) || isinf(*delay_multiplier) || islessequal(*delay_multiplier, 0)) {
                // the invalid value is replaced by the default below
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_multiplier = 1;
        }
        else if(!strcasecmp(key, "max")) {
            if (!health_parse_duration(value, delay_max_duration)) {
                error("Health configuration at line %zu of file '%s/%s': invalid value '%s' for '%s' keyword",
                      line, path, file, value, key);
            }
            else given_max = 1;
        }
        else {
            error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
                  line, path, file, key);
        }
    }

    // defaults for everything that has not been (validly) given
    if(!given_up)
        *delay_up_duration = 0;

    if(!given_down)
        *delay_down_duration = 0;

    if(!given_multiplier)
        *delay_multiplier = 1.0;

    if(!given_max) {
        // make sure the maximum can accommodate at least one multiplied delay
        if((*delay_max_duration) < (*delay_up_duration) * (*delay_multiplier))
            *delay_max_duration = (int)((*delay_up_duration) * (*delay_multiplier));

        if((*delay_max_duration) < (*delay_down_duration) * (*delay_multiplier))
            *delay_max_duration = (int)((*delay_down_duration) * (*delay_multiplier));
    }

    return 1;
}
1253
1254 static inline int health_parse_db_lookup(
1255         size_t line, const char *path, const char *file, char *string,
1256         int *group_method, int *after, int *before, int *every,
1257         uint32_t *options, char **dimensions
1258 ) {
1259     debug(D_HEALTH, "Health configuration parsing database lookup %zu@%s/%s: %s", line, path, file, string);
1260
1261     if(*dimensions) freez(*dimensions);
1262     *dimensions = NULL;
1263     *after = 0;
1264     *before = 0;
1265     *every = 0;
1266     *options = 0;
1267
1268     char *s = string, *key;
1269
1270     // first is the group method
1271     key = s;
1272     while(*s && !isspace(*s)) s++;
1273     while(*s && isspace(*s)) *s++ = '\0';
1274     if(!*s) {
1275         error("Health configuration invalid chart calculation at line %zu of file '%s/%s': expected group method followed by the 'after' time, but got '%s'",
1276               line, path, file, key);
1277         return 0;
1278     }
1279
1280     if((*group_method = web_client_api_request_v1_data_group(key, -1)) == -1) {
1281         error("Health configuration at line %zu of file '%s/%s': invalid group method '%s'",
1282               line, path, file, key);
1283         return 0;
1284     }
1285
1286     // then is the 'after' time
1287     key = s;
1288     while(*s && !isspace(*s)) s++;
1289     while(*s && isspace(*s)) *s++ = '\0';
1290
1291     if(!health_parse_duration(key, after)) {
1292         error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' after group method",
1293               line, path, file, key);
1294         return 0;
1295     }
1296
1297     // sane defaults
1298     *every = abs(*after);
1299
1300     // now we may have optional parameters
1301     while(*s) {
1302         key = s;
1303         while(*s && !isspace(*s)) s++;
1304         while(*s && isspace(*s)) *s++ = '\0';
1305         if(!*key) break;
1306
1307         if(!strcasecmp(key, "at")) {
1308             char *value = s;
1309             while(*s && !isspace(*s)) s++;
1310             while(*s && isspace(*s)) *s++ = '\0';
1311
1312             if (!health_parse_duration(value, before)) {
1313                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1314                       line, path, file, value, key);
1315             }
1316         }
1317         else if(!strcasecmp(key, HEALTH_EVERY_KEY)) {
1318             char *value = s;
1319             while(*s && !isspace(*s)) s++;
1320             while(*s && isspace(*s)) *s++ = '\0';
1321
1322             if (!health_parse_duration(value, every)) {
1323                 error("Health configuration at line %zu of file '%s/%s': invalid duration '%s' for '%s' keyword",
1324                       line, path, file, value, key);
1325             }
1326         }
1327         else if(!strcasecmp(key, "absolute") || !strcasecmp(key, "abs") || !strcasecmp(key, "absolute_sum")) {
1328             *options |= RRDR_OPTION_ABSOLUTE;
1329         }
1330         else if(!strcasecmp(key, "min2max")) {
1331             *options |= RRDR_OPTION_MIN2MAX;
1332         }
1333         else if(!strcasecmp(key, "null2zero")) {
1334             *options |= RRDR_OPTION_NULL2ZERO;
1335         }
1336         else if(!strcasecmp(key, "percentage")) {
1337             *options |= RRDR_OPTION_PERCENTAGE;
1338         }
1339         else if(!strcasecmp(key, "unaligned")) {
1340             *options |= RRDR_OPTION_NOT_ALIGNED;
1341         }
1342         else if(!strcasecmp(key, "of")) {
1343             if(*s && strcasecmp(s, "all"))
1344                *dimensions = strdupz(s);
1345             break;
1346         }
1347         else {
1348             error("Health configuration at line %zu of file '%s/%s': unknown keyword '%s'",
1349                   line, path, file, key);
1350         }
1351     }
1352
1353     return 1;
1354 }
1355
// Build the text that identifies where a configuration entry comes from,
// formatted as '<line>@<path>/<filename>'.
// Returns a copy made with strdupz().
static inline char *health_source_file(size_t line, const char *path, const char *filename) {
    char source[FILENAME_MAX + 1];

    snprintfz(source, FILENAME_MAX, "%zu@%s/%s", line, path, filename);

    return strdupz(source);
}
1361
// Replace, in place, every single quote and every double quote of the
// NUL terminated string 's' with a space. The length of 's' does not change.
static inline void strip_quotes(char *s) {
    char *p;

    for(p = s; *p != '\0'; p++) {
        switch(*p) {
            case '\'':
            case '"':
                *p = ' ';
                break;

            default:
                break;
        }
    }
}
1368
1369 int health_readfile(const char *path, const char *filename) {
1370     debug(D_HEALTH, "Health configuration reading file '%s/%s'", path, filename);
1371
1372     static uint32_t hash_alarm = 0, hash_template = 0, hash_on = 0, hash_calc = 0, hash_green = 0, hash_red = 0, hash_warn = 0, hash_crit = 0, hash_exec = 0, hash_every = 0, hash_lookup = 0, hash_units = 0, hash_info = 0, hash_recipient = 0, hash_delay = 0;
1373     char buffer[HEALTH_CONF_MAX_LINE + 1];
1374
1375     if(unlikely(!hash_alarm)) {
1376         hash_alarm = simple_uhash(HEALTH_ALARM_KEY);
1377         hash_template = simple_uhash(HEALTH_TEMPLATE_KEY);
1378         hash_on = simple_uhash(HEALTH_ON_KEY);
1379         hash_calc = simple_uhash(HEALTH_CALC_KEY);
1380         hash_lookup = simple_uhash(HEALTH_LOOKUP_KEY);
1381         hash_green = simple_uhash(HEALTH_GREEN_KEY);
1382         hash_red = simple_uhash(HEALTH_RED_KEY);
1383         hash_warn = simple_uhash(HEALTH_WARN_KEY);
1384         hash_crit = simple_uhash(HEALTH_CRIT_KEY);
1385         hash_exec = simple_uhash(HEALTH_EXEC_KEY);
1386         hash_every = simple_uhash(HEALTH_EVERY_KEY);
1387         hash_units = simple_hash(HEALTH_UNITS_KEY);
1388         hash_info = simple_hash(HEALTH_INFO_KEY);
1389         hash_recipient = simple_hash(HEALTH_RECIPIENT_KEY);
1390         hash_delay = simple_uhash(HEALTH_DELAY_KEY);
1391     }
1392
1393     snprintfz(buffer, HEALTH_CONF_MAX_LINE, "%s/%s", path, filename);
1394     FILE *fp = fopen(buffer, "r");
1395     if(!fp) {
1396         error("Health configuration cannot read file '%s'.", buffer);
1397         return 0;
1398     }
1399
1400     RRDCALC *rc = NULL;
1401     RRDCALCTEMPLATE *rt = NULL;
1402
1403     size_t line = 0, append = 0;
1404     char *s;
1405     while((s = fgets(&buffer[append], (int)(HEALTH_CONF_MAX_LINE - append), fp)) || append) {
1406         int stop_appending = !s;
1407         line++;
1408         // info("Line %zu of file '%s/%s': '%s'", line, path, filename, s);
1409         s = trim(buffer);
1410         if(!s) continue;
1411         // info("Trimmed line %zu of file '%s/%s': '%s'", line, path, filename, s);
1412
1413         append = strlen(s);
1414         if(!stop_appending && s[append - 1] == '\\') {
1415             s[append - 1] = ' ';
1416             append = &s[append] - buffer;
1417             if(append < HEALTH_CONF_MAX_LINE)
1418                 continue;
1419             else {
1420                 error("Health configuration has too long muli-line at line %zu of file '%s/%s'.", line, path, filename);
1421             }
1422         }
1423         append = 0;
1424
1425         char *key = s;
1426         while(*s && *s != ':') s++;
1427         if(!*s) {
1428             error("Health configuration has invalid line %zu of file '%s/%s'. It does not contain a ':'. Ignoring it.", line, path, filename);
1429             continue;
1430         }
1431         *s = '\0';
1432         s++;
1433
1434         char *value = s;
1435         key = trim(key);
1436         value = trim(value);
1437
1438         if(!key) {
1439             error("Health configuration has invalid line %zu of file '%s/%s'. Keyword is empty. Ignoring it.", line, path, filename);
1440             continue;
1441         }
1442
1443         if(!value) {
1444             error("Health configuration has invalid line %zu of file '%s/%s'. value is empty. Ignoring it.", line, path, filename);
1445             continue;
1446         }
1447
1448         // info("Health file '%s/%s', key '%s', value '%s'", path, filename, key, value);
1449         uint32_t hash = simple_uhash(key);
1450
1451         if(hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) {
1452             if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
1453                 rrdcalc_free(&localhost, rc);
1454
1455             if(rt) {
1456                 if (!rrdcalctemplate_add_template_from_config(&localhost, rt))
1457                     rrdcalctemplate_free(&localhost, rt);
1458                 rt = NULL;
1459             }
1460
1461             rc = callocz(1, sizeof(RRDCALC));
1462             rc->next_event_id = 1;
1463             rc->name = strdupz(value);
1464             rc->hash = simple_hash(rc->name);
1465             rc->source = health_source_file(line, path, filename);
1466             rc->green = NAN;
1467             rc->red = NAN;
1468             rc->value = NAN;
1469             rc->old_value = NAN;
1470             rc->delay_multiplier = 1.0;
1471
1472             if(rrdvar_fix_name(rc->name))
1473                 error("Health configuration renamed alarm '%s' to '%s'", value, rc->name);
1474         }
1475         else if(hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY)) {
1476             if(rc) {
1477                 if(!rrdcalc_add_alarm_from_config(&localhost, rc))
1478                     rrdcalc_free(&localhost, rc);
1479                 rc = NULL;
1480             }
1481
1482             if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
1483                 rrdcalctemplate_free(&localhost, rt);
1484
1485             rt = callocz(1, sizeof(RRDCALCTEMPLATE));
1486             rt->name = strdupz(value);
1487             rt->hash_name = simple_hash(rt->name);
1488             rt->source = health_source_file(line, path, filename);
1489             rt->green = NAN;
1490             rt->red = NAN;
1491             rt->delay_multiplier = 1.0;
1492
1493             if(rrdvar_fix_name(rt->name))
1494                 error("Health configuration renamed template '%s' to '%s'", value, rt->name);
1495         }
1496         else if(rc) {
1497             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
1498                 if(rc->chart) {
1499                     if(strcmp(rc->chart, value))
1500                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1501                              line, path, filename, rc->name, key, rc->chart, value, value);
1502
1503                     freez(rc->chart);
1504                 }
1505                 rc->chart = strdupz(value);
1506                 rc->hash_chart = simple_hash(rc->chart);
1507             }
1508             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
1509                 health_parse_db_lookup(line, path, filename, value, &rc->group, &rc->after, &rc->before,
1510                                        &rc->update_every,
1511                                        &rc->options, &rc->dimensions);
1512             }
1513             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
1514                 if(!health_parse_duration(value, &rc->update_every))
1515                     info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.",
1516                          line, path, filename, rc->name, key, value);
1517             }
1518             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
1519                 char *e;
1520                 rc->green = strtold(value, &e);
1521                 if(e && *e) {
1522                     info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1523                          line, path, filename, rc->name, key, e);
1524                 }
1525             }
1526             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
1527                 char *e;
1528                 rc->red = strtold(value, &e);
1529                 if(e && *e) {
1530                     info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
1531                          line, path, filename, rc->name, key, e);
1532                 }
1533             }
1534             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
1535                 const char *failed_at = NULL;
1536                 int error = 0;
1537                 rc->calculation = expression_parse(value, &failed_at, &error);
1538                 if(!rc->calculation) {
1539                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1540                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1541                 }
1542             }
1543             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
1544                 const char *failed_at = NULL;
1545                 int error = 0;
1546                 rc->warning = expression_parse(value, &failed_at, &error);
1547                 if(!rc->warning) {
1548                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1549                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1550                 }
1551             }
1552             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
1553                 const char *failed_at = NULL;
1554                 int error = 0;
1555                 rc->critical = expression_parse(value, &failed_at, &error);
1556                 if(!rc->critical) {
1557                     error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1558                           line, path, filename, rc->name, key, value, expression_strerror(error), failed_at);
1559                 }
1560             }
1561             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
1562                 if(rc->exec) {
1563                     if(strcmp(rc->exec, value))
1564                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1565                              line, path, filename, rc->name, key, rc->exec, value, value);
1566
1567                     freez(rc->exec);
1568                 }
1569                 rc->exec = strdupz(value);
1570             }
1571             else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
1572                 if(rc->recipient) {
1573                     if(strcmp(rc->recipient, value))
1574                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1575                              line, path, filename, rc->name, key, rc->recipient, value, value);
1576
1577                     freez(rc->recipient);
1578                 }
1579                 rc->recipient = strdupz(value);
1580             }
1581             else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
1582                 if(rc->units) {
1583                     if(strcmp(rc->units, value))
1584                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1585                              line, path, filename, rc->name, key, rc->units, value, value);
1586
1587                     freez(rc->units);
1588                 }
1589                 rc->units = strdupz(value);
1590                 strip_quotes(rc->units);
1591             }
1592             else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
1593                 if(rc->info) {
1594                     if(strcmp(rc->info, value))
1595                         info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1596                              line, path, filename, rc->name, key, rc->info, value, value);
1597
1598                     freez(rc->info);
1599                 }
1600                 rc->info = strdupz(value);
1601                 strip_quotes(rc->info);
1602             }
1603             else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
1604                 health_parse_delay(line, path, filename, value, &rc->delay_up_duration, &rc->delay_down_duration, &rc->delay_max_duration, &rc->delay_multiplier);
1605             }
1606             else {
1607                 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has unknown key '%s'.",
1608                      line, path, filename, rc->name, key);
1609             }
1610         }
1611         else if(rt) {
1612             if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
1613                 if(rt->context) {
1614                     if(strcmp(rt->context, value))
1615                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1616                              line, path, filename, rt->name, key, rt->context, value, value);
1617
1618                     freez(rt->context);
1619                 }
1620                 rt->context = strdupz(value);
1621                 rt->hash_context = simple_hash(rt->context);
1622             }
1623             else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
1624                 health_parse_db_lookup(line, path, filename, value, &rt->group, &rt->after, &rt->before,
1625                                        &rt->update_every,
1626                                        &rt->options, &rt->dimensions);
1627             }
1628             else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
1629                 if(!health_parse_duration(value, &rt->update_every))
1630                     info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' cannot parse duration: '%s'.",
1631                          line, path, filename, rt->name, key, value);
1632             }
1633             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
1634                 char *e;
1635                 rt->green = strtold(value, &e);
1636                 if(e && *e) {
1637                     info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
1638                          line, path, filename, rt->name, key, e);
1639                 }
1640             }
1641             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
1642                 char *e;
1643                 rt->red = strtold(value, &e);
1644                 if(e && *e) {
1645                     info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
1646                          line, path, filename, rt->name, key, e);
1647                 }
1648             }
1649             else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
1650                 const char *failed_at = NULL;
1651                 int error = 0;
1652                 rt->calculation = expression_parse(value, &failed_at, &error);
1653                 if(!rt->calculation) {
1654                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1655                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1656                 }
1657             }
1658             else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
1659                 const char *failed_at = NULL;
1660                 int error = 0;
1661                 rt->warning = expression_parse(value, &failed_at, &error);
1662                 if(!rt->warning) {
1663                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1664                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1665                 }
1666             }
1667             else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
1668                 const char *failed_at = NULL;
1669                 int error = 0;
1670                 rt->critical = expression_parse(value, &failed_at, &error);
1671                 if(!rt->critical) {
1672                     error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
1673                           line, path, filename, rt->name, key, value, expression_strerror(error), failed_at);
1674                 }
1675             }
1676             else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
1677                 if(rt->exec) {
1678                     if(strcmp(rt->exec, value))
1679                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1680                              line, path, filename, rt->name, key, rt->exec, value, value);
1681
1682                     freez(rt->exec);
1683                 }
1684                 rt->exec = strdupz(value);
1685             }
1686             else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
1687                 if(rt->recipient) {
1688                     if(strcmp(rt->recipient, value))
1689                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1690                              line, path, filename, rt->name, key, rt->recipient, value, value);
1691
1692                     freez(rt->recipient);
1693                 }
1694                 rt->recipient = strdupz(value);
1695             }
1696             else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
1697                 if(rt->units) {
1698                     if(strcmp(rt->units, value))
1699                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1700                              line, path, filename, rt->name, key, rt->units, value, value);
1701
1702                     freez(rt->units);
1703                 }
1704                 rt->units = strdupz(value);
1705                 strip_quotes(rt->units);
1706             }
1707             else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
1708                 if(rt->info) {
1709                     if(strcmp(rt->info, value))
1710                         info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
1711                              line, path, filename, rt->name, key, rt->info, value, value);
1712
1713                     freez(rt->info);
1714                 }
1715                 rt->info = strdupz(value);
1716                 strip_quotes(rt->info);
1717             }
1718             else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
1719                 health_parse_delay(line, path, filename, value, &rt->delay_up_duration, &rt->delay_down_duration, &rt->delay_max_duration, &rt->delay_multiplier);
1720             }
1721             else {
1722                 error("Health configuration at line %zu of file '%s/%s' for template '%s' has unknown key '%s'.",
1723                       line, path, filename, rt->name, key);
1724             }
1725         }
1726         else {
1727             error("Health configuration at line %zu of file '%s/%s' has unknown key '%s'. Expected either '" HEALTH_ALARM_KEY "' or '" HEALTH_TEMPLATE_KEY "'.",
1728                   line, path, filename, key);
1729         }
1730     }
1731
1732     if(rc && !rrdcalc_add_alarm_from_config(&localhost, rc))
1733         rrdcalc_free(&localhost, rc);
1734
1735     if(rt && !rrdcalctemplate_add_template_from_config(&localhost, rt))
1736         rrdcalctemplate_free(&localhost, rt);
1737
1738     fclose(fp);
1739     return 1;
1740 }
1741
1742 void health_readdir(const char *path) {
1743     size_t pathlen = strlen(path);
1744
1745     debug(D_HEALTH, "Health configuration reading directory '%s'", path);
1746
1747     DIR *dir = opendir(path);
1748     if (!dir) {
1749         error("Health configuration cannot open directory '%s'.", path);
1750         return;
1751     }
1752
1753     struct dirent *de = NULL;
1754     while ((de = readdir(dir))) {
1755         size_t len = strlen(de->d_name);
1756
1757         if(de->d_type == DT_DIR
1758            && (
1759                    (de->d_name[0] == '.' && de->d_name[1] == '\0')
1760                    || (de->d_name[0] == '.' && de->d_name[1] == '.' && de->d_name[2] == '\0')
1761            )) {
1762             debug(D_HEALTH, "Ignoring directory '%s'", de->d_name);
1763             continue;
1764         }
1765
1766         else if(de->d_type == DT_DIR) {
1767             char *s = mallocz(pathlen + strlen(de->d_name) + 2);
1768             strcpy(s, path);
1769             strcat(s, "/");
1770             strcat(s, de->d_name);
1771             health_readdir(s);
1772             freez(s);
1773             continue;
1774         }
1775
1776         else if((de->d_type == DT_LNK || de->d_type == DT_REG || de->d_type == DT_UNKNOWN) &&
1777                 len > 5 && !strcmp(&de->d_name[len - 5], ".conf")) {
1778             health_readfile(path, de->d_name);
1779         }
1780
1781         else debug(D_HEALTH, "Ignoring file '%s'", de->d_name);
1782     }
1783
1784     closedir(dir);
1785 }
1786
1787 static inline char *health_config_dir(void) {
1788     char buffer[FILENAME_MAX + 1];
1789     snprintfz(buffer, FILENAME_MAX, "%s/health.d", config_get("global", "config directory", CONFIG_DIR));
1790     return config_get("health", "health configuration directory", buffer);
1791 }
1792
1793 void health_init(void) {
1794     debug(D_HEALTH, "Health configuration initializing");
1795
1796     if(!(health_enabled = config_get_boolean("health", "enabled", 1))) {
1797         debug(D_HEALTH, "Health is disabled.");
1798         return;
1799     }
1800
1801     health_alarm_log_load(&localhost);
1802
1803     char *path = health_config_dir();
1804
1805     {
1806         char buffer[FILENAME_MAX + 1];
1807         snprintfz(buffer, FILENAME_MAX, "%s/alarm-notify.sh", config_get("global", "plugins directory", PLUGINS_DIR));
1808         health.health_default_exec = config_get("health", "script to execute on alarm", buffer);
1809     }
1810
1811     long n = config_get_number("health", "in memory max health log entries", (long)localhost.health_log.max);
1812     if(n < 10) {
1813         error("Health configuration has invalid max log entries %ld. Using default %u", n, localhost.health_log.max);
1814         config_set_number("health", "in memory max health log entries", (long)localhost.health_log.max);
1815     }
1816     else localhost.health_log.max = (unsigned int)n;
1817
1818     rrdhost_rwlock(&localhost);
1819     health_readdir(path);
1820     rrdhost_unlock(&localhost);
1821 }
1822
1823 // ----------------------------------------------------------------------------
1824 // JSON generation
1825
1826 static inline void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) {
1827     if(value && *value)
1828         buffer_sprintf(wb, "%s\"%s\":\"%s\"%s", prefix, label, value, suffix);
1829     else
1830         buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix);
1831 }
1832
1833 static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) {
1834     buffer_sprintf(wb, "\n\t{\n"
1835                            "\t\t\"hostname\": \"%s\",\n"
1836                            "\t\t\"unique_id\": %u,\n"
1837                            "\t\t\"alarm_id\": %u,\n"
1838                            "\t\t\"alarm_event_id\": %u,\n"
1839                            "\t\t\"name\": \"%s\",\n"
1840                            "\t\t\"chart\": \"%s\",\n"
1841                            "\t\t\"family\": \"%s\",\n"
1842                            "\t\t\"processed\": %s,\n"
1843                            "\t\t\"updated\": %s,\n"
1844                            "\t\t\"exec_run\": %lu,\n"
1845                            "\t\t\"exec_failed\": %s,\n"
1846                            "\t\t\"exec\": \"%s\",\n"
1847                            "\t\t\"recipient\": \"%s\",\n"
1848                            "\t\t\"exec_code\": %d,\n"
1849                            "\t\t\"source\": \"%s\",\n"
1850                            "\t\t\"units\": \"%s\",\n"
1851                            "\t\t\"info\": \"%s\",\n"
1852                            "\t\t\"when\": %lu,\n"
1853                            "\t\t\"duration\": %lu,\n"
1854                            "\t\t\"non_clear_duration\": %lu,\n"
1855                            "\t\t\"status\": \"%s\",\n"
1856                            "\t\t\"old_status\": \"%s\",\n"
1857                            "\t\t\"delay\": %d,\n"
1858                            "\t\t\"delay_up_to_timestamp\": %lu,\n"
1859                            "\t\t\"updated_by_id\": %u,\n"
1860                            "\t\t\"updates_id\": %u,\n",
1861                    host->hostname,
1862                    ae->unique_id,
1863                    ae->alarm_id,
1864                    ae->alarm_event_id,
1865                    ae->name,
1866                    ae->chart,
1867                    ae->family,
1868                    (ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_PROCESSED)?"true":"false",
1869                    (ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_UPDATED)?"true":"false",
1870                    (unsigned long)ae->exec_run_timestamp,
1871                    (ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_EXEC_FAILED)?"true":"false",
1872                    ae->exec?ae->exec:health.health_default_exec,
1873                    ae->recipient?ae->recipient:health.health_default_recipient,
1874                    ae->exec_code,
1875                    ae->source,
1876                    ae->units?ae->units:"",
1877                    ae->info?ae->info:"",
1878                    (unsigned long)ae->when,
1879                    (unsigned long)ae->duration,
1880                    (unsigned long)ae->non_clear_duration,
1881                    rrdcalc_status2string(ae->new_status),
1882                    rrdcalc_status2string(ae->old_status),
1883                    ae->delay,
1884                    (unsigned long)ae->delay_up_to_timestamp,
1885                    ae->updated_by_id,
1886                    ae->updates_id
1887     );
1888
1889     buffer_strcat(wb, "\t\t\"value\":");
1890     buffer_rrd_value(wb, ae->new_value);
1891     buffer_strcat(wb, ",\n");
1892
1893     buffer_strcat(wb, "\t\t\"old_value\":");
1894     buffer_rrd_value(wb, ae->old_value);
1895     buffer_strcat(wb, "\n");
1896
1897     buffer_strcat(wb, "\t}");
1898 }
1899
1900 void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after) {
1901     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
1902
1903     buffer_strcat(wb, "[");
1904
1905     unsigned int max = host->health_log.max;
1906     unsigned int count = 0;
1907     ALARM_ENTRY *ae;
1908     for(ae = host->health_log.alarms; ae && count < max ; count++, ae = ae->next) {
1909         if(ae->unique_id > after) {
1910             if(likely(count)) buffer_strcat(wb, ",");
1911             health_alarm_entry2json_nolock(wb, ae, host);
1912         }
1913     }
1914
1915     buffer_strcat(wb, "\n]\n");
1916
1917     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
1918 }
1919
1920 static inline void health_rrdcalc2json_nolock(BUFFER *wb, RRDCALC *rc) {
1921     buffer_sprintf(wb,
1922            "\t\t\"%s.%s\": {\n"
1923                    "\t\t\t\"id\": %lu,\n"
1924                    "\t\t\t\"name\": \"%s\",\n"
1925                    "\t\t\t\"chart\": \"%s\",\n"
1926                    "\t\t\t\"family\": \"%s\",\n"
1927                    "\t\t\t\"active\": %s,\n"
1928                    "\t\t\t\"exec\": \"%s\",\n"
1929                    "\t\t\t\"recipient\": \"%s\",\n"
1930                    "\t\t\t\"source\": \"%s\",\n"
1931                    "\t\t\t\"units\": \"%s\",\n"
1932                    "\t\t\t\"info\": \"%s\",\n"
1933                                    "\t\t\t\"status\": \"%s\",\n"
1934                    "\t\t\t\"last_status_change\": %lu,\n"
1935                    "\t\t\t\"last_updated\": %lu,\n"
1936                    "\t\t\t\"next_update\": %lu,\n"
1937                    "\t\t\t\"update_every\": %d,\n"
1938                    "\t\t\t\"delay_up_duration\": %d,\n"
1939                    "\t\t\t\"delay_down_duration\": %d,\n"
1940                    "\t\t\t\"delay_max_duration\": %d,\n"
1941                    "\t\t\t\"delay_multiplier\": %f,\n"
1942                    "\t\t\t\"delay\": %d,\n"
1943                    "\t\t\t\"delay_up_to_timestamp\": %lu,\n"
1944             , rc->chart, rc->name
1945             , (unsigned long)rc->id
1946             , rc->name
1947             , rc->chart
1948             , (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:""
1949             , (rc->rrdset)?"true":"false"
1950             , rc->exec?rc->exec:health.health_default_exec
1951             , rc->recipient?rc->recipient:health.health_default_recipient
1952             , rc->source
1953             , rc->units?rc->units:""
1954             , rc->info?rc->info:""
1955             , rrdcalc_status2string(rc->status)
1956             , (unsigned long)rc->last_status_change
1957             , (unsigned long)rc->last_updated
1958             , (unsigned long)rc->next_update
1959             , rc->update_every
1960             , rc->delay_up_duration
1961             , rc->delay_down_duration
1962             , rc->delay_max_duration
1963             , rc->delay_multiplier
1964             , rc->delay_last
1965             , (unsigned long)rc->delay_up_to_timestamp
1966     );
1967
1968     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
1969         if(rc->dimensions && *rc->dimensions)
1970             health_string2json(wb, "\t\t\t", "lookup_dimensions", rc->dimensions, ",\n");
1971
1972         buffer_sprintf(wb,
1973                        "\t\t\t\"db_after\": %lu,\n"
1974                        "\t\t\t\"db_before\": %lu,\n"
1975                        "\t\t\t\"lookup_method\": \"%s\",\n"
1976                        "\t\t\t\"lookup_after\": %d,\n"
1977                        "\t\t\t\"lookup_before\": %d,\n"
1978                        "\t\t\t\"lookup_options\": \"",
1979                        (unsigned long) rc->db_after,
1980                        (unsigned long) rc->db_before,
1981                        group_method2string(rc->group),
1982                        rc->after,
1983                        rc->before
1984         );
1985         buffer_data_options2string(wb, rc->options);
1986         buffer_strcat(wb, "\",\n");
1987     }
1988
1989     if(rc->calculation) {
1990         health_string2json(wb, "\t\t\t", "calc", rc->calculation->source, ",\n");
1991         health_string2json(wb, "\t\t\t", "calc_parsed", rc->calculation->parsed_as, ",\n");
1992     }
1993
1994     if(rc->warning) {
1995         health_string2json(wb, "\t\t\t", "warn", rc->warning->source, ",\n");
1996         health_string2json(wb, "\t\t\t", "warn_parsed", rc->warning->parsed_as, ",\n");
1997     }
1998
1999     if(rc->critical) {
2000         health_string2json(wb, "\t\t\t", "crit", rc->critical->source, ",\n");
2001         health_string2json(wb, "\t\t\t", "crit_parsed", rc->critical->parsed_as, ",\n");
2002     }
2003
2004     buffer_strcat(wb, "\t\t\t\"green\":");
2005     buffer_rrd_value(wb, rc->green);
2006     buffer_strcat(wb, ",\n");
2007
2008     buffer_strcat(wb, "\t\t\t\"red\":");
2009     buffer_rrd_value(wb, rc->red);
2010     buffer_strcat(wb, ",\n");
2011
2012     buffer_strcat(wb, "\t\t\t\"value\":");
2013     buffer_rrd_value(wb, rc->value);
2014     buffer_strcat(wb, "\n");
2015
2016     buffer_strcat(wb, "\t\t}");
2017 }
2018
2019 //void health_rrdcalctemplate2json_nolock(BUFFER *wb, RRDCALCTEMPLATE *rt) {
2020 //
2021 //}
2022
2023 void health_alarms2json(RRDHOST *host, BUFFER *wb, int all) {
2024     int i;
2025
2026     rrdhost_rdlock(&localhost);
2027     buffer_sprintf(wb, "{\n\t\"hostname\": \"%s\","
2028                         "\n\t\"latest_alarm_log_unique_id\": %u,"
2029                         "\n\t\"status\": %s,"
2030                         "\n\t\"now\": %lu,"
2031                         "\n\t\"alarms\": {\n",
2032                         host->hostname,
2033                         (host->health_log.next_log_id > 0)?(host->health_log.next_log_id - 1):0,
2034                         health_enabled?"true":"false",
2035                         (unsigned long)time(NULL));
2036
2037     RRDCALC *rc;
2038     for(i = 0, rc = host->alarms; rc ; rc = rc->next) {
2039         if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
2040             continue;
2041
2042         if(likely(!all && !(rc->status == RRDCALC_STATUS_WARNING || rc->status == RRDCALC_STATUS_CRITICAL)))
2043             continue;
2044
2045         if(likely(i)) buffer_strcat(wb, ",\n");
2046         health_rrdcalc2json_nolock(wb, rc);
2047         i++;
2048     }
2049
2050 //    buffer_strcat(wb, "\n\t},\n\t\"templates\": {");
2051 //    RRDCALCTEMPLATE *rt;
2052 //    for(rt = host->templates; rt ; rt = rt->next)
2053 //        health_rrdcalctemplate2json_nolock(wb, rt);
2054
2055     buffer_strcat(wb, "\n\t}\n}\n");
2056     rrdhost_unlock(&localhost);
2057 }
2058
2059
2060 // ----------------------------------------------------------------------------
2061 // re-load health configuration
2062
2063 static inline void health_free_all_nolock(RRDHOST *host) {
2064     while(host->templates)
2065         rrdcalctemplate_free(host, host->templates);
2066
2067     while(host->alarms)
2068         rrdcalc_free(host, host->alarms);
2069 }
2070
2071 void health_reload(void) {
2072     if(!health_enabled) {
2073         error("Health reload is requested, but health is not enabled.");
2074         return;
2075     }
2076
2077     char *path = health_config_dir();
2078
2079     // free all running alarms
2080     rrdhost_rwlock(&localhost);
2081     health_free_all_nolock(&localhost);
2082     rrdhost_unlock(&localhost);
2083
2084     // invalidate all previous entries in the alarm log
2085     ALARM_ENTRY *t;
2086     for(t = localhost.health_log.alarms ; t ; t = t->next) {
2087         if(t->new_status != RRDCALC_STATUS_REMOVED)
2088             t->notifications |= HEALTH_ENTRY_NOTIFICATIONS_UPDATED;
2089     }
2090
2091     // reset all thresholds to all charts
2092     RRDSET *st;
2093     for(st = localhost.rrdset_root; st ; st = st->next) {
2094         st->green = NAN;
2095         st->red = NAN;
2096     }
2097
2098     // load the new alarms
2099     rrdhost_rwlock(&localhost);
2100     health_readdir(path);
2101     rrdhost_unlock(&localhost);
2102
2103     // link the loaded alarms to their charts
2104     for(st = localhost.rrdset_root; st ; st = st->next) {
2105         rrdhost_rwlock(&localhost);
2106
2107         rrdsetcalc_link_matching(st);
2108         rrdcalctemplate_link_matching(st);
2109
2110         rrdhost_unlock(&localhost);
2111     }
2112 }
2113
2114 // ----------------------------------------------------------------------------
2115 // health main thread and friends
2116
2117 static inline int rrdcalc_value2status(calculated_number n) {
2118     if(isnan(n)) return RRDCALC_STATUS_UNDEFINED;
2119     if(n) return RRDCALC_STATUS_RAISED;
2120     return RRDCALC_STATUS_CLEAR;
2121 }
2122
2123 static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
2124     ae->notifications |= HEALTH_ENTRY_NOTIFICATIONS_PROCESSED;
2125
2126     // find the previous notification for the same alarm
2127     ALARM_ENTRY *t;
2128     for(t = ae->next; t ;t = t->next) {
2129         if(t->alarm_id == ae->alarm_id && t->notifications & HEALTH_ENTRY_NOTIFICATIONS_EXEC_RUN)
2130             break;
2131     }
2132
2133     if(t && t->new_status == ae->new_status) {
2134         // don't send the same notification again
2135         info("Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
2136         goto done;
2137     }
2138
2139     if((ae->old_status == RRDCALC_STATUS_UNDEFINED && ae->new_status == RRDCALC_STATUS_UNINITIALIZED)
2140         || (ae->old_status == RRDCALC_STATUS_UNINITIALIZED && ae->new_status == RRDCALC_STATUS_CLEAR)) {
2141         info("Health not sending notification for first initialization of alarm '%s.%s' status %s", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
2142         goto done;
2143     }
2144
2145     char buffer[FILENAME_MAX + 1];
2146     pid_t command_pid;
2147
2148     const char *exec = ae->exec;
2149     if(!exec) exec = health.health_default_exec;
2150
2151     const char *recipient = ae->recipient;
2152     if(!recipient) recipient = health.health_default_recipient;
2153
2154     snprintfz(buffer, FILENAME_MAX, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s'",
2155               exec,
2156               recipient,
2157               host->hostname,
2158               ae->unique_id,
2159               ae->alarm_id,
2160               ae->alarm_event_id,
2161               (unsigned long)ae->when,
2162               ae->name,
2163               ae->chart?ae->chart:"NOCAHRT",
2164               ae->family?ae->family:"NOFAMILY",
2165               rrdcalc_status2string(ae->new_status),
2166               rrdcalc_status2string(ae->old_status),
2167               ae->new_value,
2168               ae->old_value,
2169               ae->source?ae->source:"UNKNOWN",
2170               (uint32_t)ae->duration,
2171               (uint32_t)ae->non_clear_duration,
2172               ae->units?ae->units:"",
2173               ae->info?ae->info:""
2174     );
2175
2176     ae->notifications |= HEALTH_ENTRY_NOTIFICATIONS_EXEC_RUN;
2177     ae->exec_run_timestamp = time(NULL);
2178
2179     debug(D_HEALTH, "executing command '%s'", buffer);
2180     FILE *fp = mypopen(buffer, &command_pid);
2181     if(!fp) {
2182         error("HEALTH: Cannot popen(\"%s\", \"r\").", buffer);
2183         goto done;
2184     }
2185     debug(D_HEALTH, "HEALTH reading from command");
2186     char *s = fgets(buffer, FILENAME_MAX, fp);
2187     (void)s;
2188     ae->exec_code = mypclose(fp, command_pid);
2189     debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
2190
2191     if(ae->exec_code != 0)
2192         ae->notifications |= HEALTH_ENTRY_NOTIFICATIONS_EXEC_FAILED;
2193
2194 done:
2195     health_alarm_log_save(host, ae);
2196     return;
2197 }
2198
2199 static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
2200     info("Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
2201          ae->chart?ae->chart:"NOCHART", ae->name,
2202          ae->new_value,
2203          rrdcalc_status2string(ae->old_status),
2204          rrdcalc_status2string(ae->new_status)
2205     );
2206
2207     health_alarm_execute(host, ae);
2208 }
2209
2210 static inline void health_alarm_log_process(RRDHOST *host) {
2211     static uint32_t stop_at_id = 0;
2212     uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
2213     time_t now = time(NULL);
2214
2215     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
2216
2217     ALARM_ENTRY *ae;
2218     for(ae = host->health_log.alarms; ae && ae->unique_id >= stop_at_id ; ae = ae->next) {
2219         if(unlikely(
2220             !(ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_PROCESSED) &&
2221             !(ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_UPDATED)
2222             )) {
2223
2224             if(unlikely(ae->unique_id < first_waiting))
2225                 first_waiting = ae->unique_id;
2226
2227             if(likely(now >= ae->delay_up_to_timestamp))
2228                 health_process_notifications(host, ae);
2229         }
2230     }
2231
2232     // remember this for the next iteration
2233     stop_at_id = first_waiting;
2234
2235     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2236
2237     if(host->health_log.count <= host->health_log.max)
2238         return;
2239
2240     // cleanup excess entries in the log
2241     pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
2242
2243     ALARM_ENTRY *last = NULL;
2244     unsigned int count = host->health_log.max * 2 / 3;
2245     for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
2246
2247     if(ae && last && last->next == ae)
2248         last->next = NULL;
2249     else
2250         ae = NULL;
2251
2252     while(ae) {
2253         debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
2254
2255         ALARM_ENTRY *t = ae->next;
2256
2257         freez(ae->name);
2258         freez(ae->chart);
2259         freez(ae->family);
2260         freez(ae->exec);
2261         freez(ae->recipient);
2262         freez(ae->source);
2263         freez(ae->units);
2264         freez(ae->info);
2265         freez(ae);
2266
2267         ae = t;
2268         host->health_log.count--;
2269     }
2270
2271     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
2272 }
2273
2274 static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
2275     if (unlikely(!rc->rrdset)) {
2276         debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
2277         return 0;
2278     }
2279
2280     if (unlikely(!rc->rrdset->last_collected_time.tv_sec)) {
2281         debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not yet collected.", rc->chart?rc->chart:"NOCHART", rc->name);
2282         return 0;
2283     }
2284
2285     if (unlikely(!rc->update_every)) {
2286         debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
2287         return 0;
2288     }
2289
2290     if (unlikely(rc->next_update > now)) {
2291         if (unlikely(*next_run > rc->next_update))
2292             *next_run = rc->next_update;
2293
2294         debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
2295         return 0;
2296     }
2297
2298     // FIXME
2299     // we should check that the DB lookup is possible
2300     // i.e.
2301     // - the duration of the chart includes the required timeframe
2302     // we SHOULD NOT check the dimensions - there might be alarms that refer non-existing dimensions (e.g. cpu steal)
2303
2304     return 1;
2305 }
2306
2307 void *health_main(void *ptr) {
2308     (void)ptr;
2309
2310     info("HEALTH thread created with task id %d", gettid());
2311
2312     if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
2313         error("Cannot set pthread cancel type to DEFERRED.");
2314
2315     if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
2316         error("Cannot set pthread cancel state to ENABLE.");
2317
2318     int min_run_every = (int)config_get_number("health", "run at least every seconds", 10);
2319     if(min_run_every < 1) min_run_every = 1;
2320
2321     BUFFER *wb = buffer_create(100);
2322
2323     unsigned int loop = 0;
2324     while(health_enabled && !netdata_exit) {
2325         loop++;
2326         debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
2327
2328         int oldstate, runnable = 0;
2329         time_t now = time(NULL);
2330         time_t next_run = now + min_run_every;
2331         RRDCALC *rc;
2332
2333         if (unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate) != 0))
2334             error("Cannot set pthread cancel state to DISABLE.");
2335
2336         rrdhost_rdlock(&localhost);
2337
2338         // the first loop is to lookup values from the db
2339         for (rc = localhost.alarms; rc; rc = rc->next) {
2340             if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run)))
2341                 continue;
2342
2343             runnable++;
2344             rc->old_value = rc->value;
2345
2346             // 1. if there is database lookup, do it
2347             // 2. if there is calculation expression, run it
2348
2349             if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
2350                 time_t old_db_timestamp = rc->db_before;
2351                 int value_is_null = 0;
2352
2353                 int ret = rrd2value(rc->rrdset, wb, &rc->value,
2354                                     rc->dimensions, 1, rc->after, rc->before, rc->group,
2355                                     rc->options, &rc->db_after, &rc->db_before, &value_is_null);
2356
2357                 if (unlikely(ret != 200)) {
2358                     // database lookup failed
2359                     rc->value = NAN;
2360
2361                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
2362
2363                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))) {
2364                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
2365                         error("Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
2366                     }
2367                 }
2368                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_ERROR))
2369                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
2370
2371                 if (unlikely(old_db_timestamp == rc->db_before)) {
2372                     // database is stale
2373
2374                     debug(D_HEALTH, "Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
2375
2376                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
2377                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
2378                         error("Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
2379                     }
2380                 }
2381                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
2382                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
2383
2384                 if (unlikely(value_is_null)) {
2385                     // collected value is null
2386
2387                     rc->value = NAN;
2388
2389                     debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
2390                           rc->chart?rc->chart:"NOCHART", rc->name);
2391
2392                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))) {
2393                         rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
2394                         error("Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
2395                               rc->chart?rc->chart:"NOCHART", rc->name);
2396                     }
2397                 }
2398                 else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_NAN))
2399                     rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
2400
2401                 debug(D_HEALTH, "Health alarm '%s.%s': database lookup gave value "
2402                         CALCULATED_NUMBER_FORMAT, rc->chart?rc->chart:"NOCHART", rc->name, rc->value);
2403             }
2404
2405             if(unlikely(rc->calculation)) {
2406                 if (unlikely(!expression_evaluate(rc->calculation))) {
2407                     // calculation failed
2408
2409                     rc->value = NAN;
2410
2411                     debug(D_HEALTH, "Health alarm '%s.%s': failed to evaluate calculation with error: %s",
2412                           rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->calculation->error_msg));
2413
2414                     if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))) {
2415                         rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
2416                         error("Health alarm '%s.%s': failed to evaluate calculation with error: %s",
2417                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->calculation->error_msg));
2418                     }
2419                 }
2420                 else {
2421                     if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CALC_ERROR))
2422                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
2423
2424                     debug(D_HEALTH, "Health alarm '%s.%s': calculation expression gave value "
2425                             CALCULATED_NUMBER_FORMAT
2426                             ": %s (source: %s)",
2427                           rc->chart?rc->chart:"NOCHART", rc->name,
2428                           rc->calculation->result,
2429                           buffer_tostring(rc->calculation->error_msg),
2430                           rc->source
2431                     );
2432
2433                     rc->value = rc->calculation->result;
2434                 }
2435             }
2436         }
2437         rrdhost_unlock(&localhost);
2438
2439         if (unlikely(runnable && !netdata_exit)) {
2440             rrdhost_rdlock(&localhost);
2441
2442             for (rc = localhost.alarms; rc; rc = rc->next) {
2443                 if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run)))
2444                     continue;
2445
2446                 int warning_status  = RRDCALC_STATUS_UNDEFINED;
2447                 int critical_status = RRDCALC_STATUS_UNDEFINED;
2448
2449                 if(likely(rc->warning)) {
2450                     if(unlikely(!expression_evaluate(rc->warning))) {
2451                         // calculation failed
2452
2453                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression failed with error: %s",
2454                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
2455
2456                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))) {
2457                             rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
2458                             error("Health alarm '%s.%s': warning expression failed with error: %s",
2459                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
2460                         }
2461                     }
2462                     else {
2463                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_WARN_ERROR))
2464                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
2465
2466                         debug(D_HEALTH, "Health alarm '%s.%s': warning expression gave value "
2467                                 CALCULATED_NUMBER_FORMAT
2468                                 ": %s (source: %s)",
2469                               rc->chart?rc->chart:"NOCHART", rc->name,
2470                               rc->warning->result,
2471                               buffer_tostring(rc->warning->error_msg),
2472                               rc->source
2473                         );
2474
2475                         warning_status = rrdcalc_value2status(rc->warning->result);
2476                     }
2477                 }
2478
2479                 if(likely(rc->critical)) {
2480                     if(unlikely(!expression_evaluate(rc->critical))) {
2481                         // calculation failed
2482
2483                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression failed with error: %s",
2484                               rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
2485
2486                         if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))) {
2487                             rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
2488                             error("Health alarm '%s.%s': critical expression failed with error: %s",
2489                                   rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
2490                         }
2491                     }
2492                     else {
2493                         if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_CRIT_ERROR))
2494                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
2495
2496                         debug(D_HEALTH, "Health alarm '%s.%s': critical expression gave value "
2497                                 CALCULATED_NUMBER_FORMAT
2498                                 ": %s (source: %s)",
2499                               rc->chart?rc->chart:"NOCHART", rc->name,
2500                               rc->critical->result,
2501                               buffer_tostring(rc->critical->error_msg),
2502                               rc->source
2503                         );
2504
2505                         critical_status = rrdcalc_value2status(rc->critical->result);
2506                     }
2507                 }
2508
2509                 int status = RRDCALC_STATUS_UNDEFINED;
2510
2511                 switch(warning_status) {
2512                     case RRDCALC_STATUS_CLEAR:
2513                         status = RRDCALC_STATUS_CLEAR;
2514                         break;
2515
2516                     case RRDCALC_STATUS_RAISED:
2517                         status = RRDCALC_STATUS_WARNING;
2518                         break;
2519
2520                     default:
2521                         break;
2522                 }
2523
2524                 switch(critical_status) {
2525                     case RRDCALC_STATUS_CLEAR:
2526                         if(status == RRDCALC_STATUS_UNDEFINED)
2527                             status = RRDCALC_STATUS_CLEAR;
2528                         break;
2529
2530                     case RRDCALC_STATUS_RAISED:
2531                         status = RRDCALC_STATUS_CRITICAL;
2532                         break;
2533
2534                     default:
2535                         break;
2536                 }
2537
2538                 if(status != rc->status) {
2539                     int delay = 0;
2540
2541                     if(now > rc->delay_up_to_timestamp) {
2542                         rc->delay_up_current = rc->delay_up_duration;
2543                         rc->delay_down_current = rc->delay_down_duration;
2544                         rc->delay_last = 0;
2545                         rc->delay_up_to_timestamp = 0;
2546                     }
2547                     else {
2548                         rc->delay_up_current = (int)(rc->delay_up_current * rc->delay_multiplier);
2549                         if(rc->delay_up_current > rc->delay_max_duration) rc->delay_up_current = rc->delay_max_duration;
2550
2551                         rc->delay_down_current = (int)(rc->delay_down_current * rc->delay_multiplier);
2552                         if(rc->delay_down_current > rc->delay_max_duration) rc->delay_down_current = rc->delay_max_duration;
2553                     }
2554
2555                     if(status > rc->status)
2556                         delay = rc->delay_up_current;
2557                     else
2558                         delay = rc->delay_down_current;
2559
2560                     // COMMENTED: because we do need to send raising alarms
2561                     // if(now + delay < rc->delay_up_to_timestamp)
2562                     //    delay = (int)(rc->delay_up_to_timestamp - now);
2563
2564                     rc->delay_last = delay;
2565                     rc->delay_up_to_timestamp = now + delay;
2566                     health_alarm_log(&localhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, rc->delay_last);
2567                     rc->last_status_change = now;
2568                     rc->status = status;
2569                 }
2570
2571                 rc->last_updated = now;
2572                 rc->next_update = now + rc->update_every;
2573
2574                 if (next_run > rc->next_update)
2575                     next_run = rc->next_update;
2576             }
2577
2578             rrdhost_unlock(&localhost);
2579         }
2580
2581         if (unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
2582             error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
2583
2584         if(unlikely(netdata_exit))
2585             break;
2586
2587         // execute notifications
2588         // and cleanup
2589         health_alarm_log_process(&localhost);
2590
2591         if(unlikely(netdata_exit))
2592             break;
2593         
2594         now = time(NULL);
2595         if(now < next_run) {
2596             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs",
2597                   loop, (int) (next_run - now));
2598             sleep_usec(1000000 * (unsigned long long) (next_run - now));
2599         }
2600         else {
2601             debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
2602         }
2603     }
2604
2605     buffer_free(wb);
2606
2607     info("HEALTH thread exiting");
2608     pthread_exit(NULL);
2609     return NULL;
2610 }