arthur.barton.de Git - netdata.git/commitdiff
added the ability to execute scripts on alarms
author Costa Tsaousis <costa@tsaousis.gr>
Tue, 16 Aug 2016 20:36:38 +0000 (23:36 +0300)
committer Costa Tsaousis <costa@tsaousis.gr>
Tue, 16 Aug 2016 20:36:38 +0000 (23:36 +0300)
conf.d/health.d/disks.conf
src/health.c
src/health.h
src/rrd.h

index b550de83a310302a945f3d4772409045dd884567..6b612bbc150810d1db8d56e4edb521416501b540 100644 (file)
@@ -5,11 +5,12 @@
 # raise an alarm if the disk is low on
 # available disk space
 
-template: low_disk_space
+template: disk_full_percentage
       on: disk.space
+    calc: $used * 100 / ($avail + $used)
    every: 1m
-    warn: $avail * 100 / ($avail + $used) < 80
-    crit: $avail * 100 / ($avail + $used) < 95
+    warn: $this > 80
+    crit: $this > 95
 
 
 # -----------------------------------------------------------------------------
index b05139b2aaf89a82804cf27e13fc78ea8804d3fa..6db51063df2f850d925a400ab34b6ffb674d0b64 100644 (file)
@@ -5,6 +5,13 @@
 static const char *health_default_exec = PLUGINS_DIR "/alarm.sh";
 int health_enabled = 1;
 
+// In-memory log of alarm status transitions; new entries are linked at the
+// head of .alarms. Ids start at 1 so that no entry can ever carry id 0, which
+// is the initial value of the "last processed id" marker kept by
+// health_alarm_log_process() - an entry with id 0 would never be notified.
+ALARM_LOG health_log = {
+        .nextid = 1,
+        .count = 0,
+        .max = 1000,
+        .alarms = NULL
+};
+
 // ----------------------------------------------------------------------------
 // RRDVAR management
 
@@ -568,6 +575,9 @@ static inline RRDCALC *rrdcalc_create(RRDHOST *host, const char *name, const cha
 
     if(dimensions) rc->dimensions = strdupz(dimensions);
 
+    rc->value = NAN;
+    rc->old_value = NAN;
+
     rc->group = group_method;
     rc->after = after;
     rc->before = before;
@@ -958,9 +968,9 @@ static inline int health_parse_db_lookup(
     return 1;
 }
 
-static inline char *health_source_file(int line, const char *path, const char *filename) {
+static inline char *health_source_file(size_t line, const char *path, const char *filename) { // formats "<line>@<path>/<filename>" and returns a strdupz() copy of it
     char buffer[FILENAME_MAX + 1];
-    snprintfz(buffer, FILENAME_MAX, "%d@%s/%s", line, path, filename);
+    snprintfz(buffer, FILENAME_MAX, "%zu@%s/%s", line, path, filename); // %zu matches the size_t line number
     return strdupz(buffer);
 }
 
@@ -1054,6 +1064,8 @@ int health_readfile(const char *path, const char *filename) {
             rc->name = strdupz(value);
             rc->hash = simple_hash(rc->name);
             rc->source = health_source_file(line, path, filename);
+            rc->value = NAN;
+            rc->old_value = NAN;
 
             if(rrdvar_fix_name(rc->name))
                 error("Health configuration renamed alarm '%s' to '%s'", value, rc->name);
@@ -1315,6 +1327,13 @@ void health_init(void) {
         health_default_exec = config_get("health", "script to execute on alarm", buffer);
     }
 
+    long n = config_get_number("health", "in memory max health log entries", (long)health_log.max);
+    if(n < 2) {
+        error("Health configuration has invalid max log entries %ld. Using default %u", n, health_log.max);
+        config_set_number("health", "in memory max health log entries", (long)health_log.max);
+    }
+    else health_log.max = (unsigned int)n;
+
     rrdhost_rwlock(&localhost);
     health_readdir(path);
     rrdhost_unlock(&localhost);
@@ -1409,38 +1428,151 @@ static inline const char *rrdcalc_status2string(int status) {
     }
 }
 
-void rrdcalc_check_critical_event(RRDCALC *rc) {
+// Map an ALARM_ENTRY_TYPE_* constant to the label handed to the alarm script.
+// Any other value yields "UNKNOWN".
+static inline const char *alarm_entry_type2string(int type) {
+    switch(type) {
+        case ALARM_ENTRY_TYPE_WARNING:
+            return "WARNING";
+
+        case ALARM_ENTRY_TYPE_CRITICAL:
+            return "CRITICAL";
+
+        default:
+            return "UNKNOWN";
+    }
+}
+
+// Run the notification script for one alarm log entry.
+// The script is ae->exec, or health_default_exec when the entry has none.
+// It receives, in order: type, name, chart, new status, old status,
+// new value, old value, source.
+// A transition from UNINITIALIZED to OFF is not notified.
+static inline void health_alarm_execute(ALARM_ENTRY *ae) {
+    if(ae->old_status == RRDCALC_STATUS_UNINITIALIZED && ae->new_status == RRDCALC_STATUS_OFF)
+        return;
+
+    char buffer[FILENAME_MAX + 1];
+    pid_t command_pid;
+
+    const char *exec = ae->exec;
+    if(!exec) exec = health_default_exec;
+
+    // NOTE(review): the arguments are wrapped in single quotes but not escaped;
+    // a quote inside name/chart/source would break the command line - confirm
+    // these strings are sanitized before they reach this point.
+    snprintfz(buffer, FILENAME_MAX, "exec %s '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s'",
+              exec,
+              alarm_entry_type2string(ae->type),
+              ae->name,
+              ae->chart?ae->chart:"NOCHART",
+              rrdcalc_status2string(ae->new_status),
+              rrdcalc_status2string(ae->old_status),
+              ae->new_value,
+              ae->old_value,
+              ae->source?ae->source:"UNKNOWN"
+    );
+
+    debug(D_HEALTH, "executing command '%s'", buffer);
+    FILE *fp = mypopen(buffer, &command_pid);
+    if(!fp) {
+        error("HEALTH: Cannot popen(\"%s\", \"r\").", buffer);
+        return;
+    }
+    debug(D_HEALTH, "HEALTH reading from command");
+    // read at most one line of the command's output; the content is not used
+    char *s = fgets(buffer, FILENAME_MAX, fp);
+    (void)s;
+    debug(D_HEALTH, "HEALTH closing command");
+    mypclose(fp, command_pid);
+    debug(D_HEALTH, "closed command");
+}
+
+// Report the status transition of an alarm log entry with info(), then hand
+// the entry to health_alarm_execute() to run its notification script.
+static inline void health_process_notifications(ALARM_ENTRY *ae) {
+    const char *chart_label = ae->chart ? ae->chart : "NOCHART";
+    const char *type_label = (ae->type == ALARM_ENTRY_TYPE_WARNING) ? "WARNING" : "CRITICAL";
+    const char *from_status = rrdcalc_status2string(ae->old_status);
+    const char *to_status = rrdcalc_status2string(ae->new_status);
+
+    info("Health alarm '%s.%s' = %0.2Lf - %s changed status from %s to %s",
+         chart_label, ae->name, ae->new_value, type_label, from_status, to_status);
+
+    health_alarm_execute(ae);
+}
+
+// Append one status transition to the in-memory alarm log.
+// name is always copied; chart, exec and source are copied only when given.
+// The new entry takes the next id and becomes the head of health_log.alarms.
+static inline void health_alarm_log(time_t when, int type,
+                const char *name, const char *chart, const char *exec,
+                calculated_number old_value, calculated_number new_value,
+                int old_status, int new_status,
+                const char *source
+) {
+    ALARM_ENTRY *entry = callocz(1, sizeof(ALARM_ENTRY));
+
+    // private copies of the strings; absent optional ones are left untouched
+    entry->name = strdupz(name);
+    if(chart)
+        entry->chart = strdupz(chart);
+    if(exec)
+        entry->exec = strdupz(exec);
+    if(source)
+        entry->source = strdupz(source);
+
+    // identity of the entry
+    entry->id = health_log.nextid++;
+    entry->when = when;
+    entry->type = type;
+
+    // the transition being recorded
+    entry->old_status = old_status;
+    entry->new_status = new_status;
+    entry->old_value = old_value;
+    entry->new_value = new_value;
+
+    // push at the head of the list
+    entry->next = health_log.alarms;
+    health_log.alarms = entry;
+    health_log.count++;
+}
+
+// Run the pending notifications of the alarm log, then trim the log so that
+// only the first health_log.max entries of the list remain.
+static inline void health_alarm_log_process(void) {
+    // id of the list head at the end of the previous call; it is only
+    // trusted once last_processed_valid is set, so that an entry whose id
+    // equals the initial value (0) is not mistaken for an already handled one
+    static uint32_t last_processed = 0;
+    static int last_processed_valid = 0;
+    ALARM_ENTRY *ae;
+
+    // new entries are linked at the head, so stop at the first entry that
+    // was already covered by a previous call
+    for(ae = health_log.alarms; ae ;ae = ae->next) {
+        if(last_processed_valid && last_processed >= ae->id) break;
+
+        if(!(ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_PROCESSED)) {
+            ae->notifications |= HEALTH_ENTRY_NOTIFICATIONS_PROCESSED;
+            health_process_notifications(ae);
+        }
+    }
+
+    if(health_log.alarms) {
+        last_processed = health_log.alarms->id;
+        last_processed_valid = 1;
+    }
+
+    if(health_log.count <= health_log.max)
+        return;
+
+    // cleanup excess entries in the log:
+    // walk health_log.max entries from the head; 'last' ends on the final
+    // entry to keep and 'ae' on the first entry to drop
+    ALARM_ENTRY *last = NULL;
+    unsigned int count = health_log.max;
+    for(ae = health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
+    if(!ae || !last || last->next != ae) return;
+    last->next = NULL;
+
+    while(ae) {
+        ALARM_ENTRY *t = ae->next;
+
+        // release every string health_alarm_log() duplicated, including source
+        freez(ae->chart);
+        freez(ae->name);
+        freez(ae->exec);
+        freez(ae->source);
+        freez(ae);
+
+        // keep the counter in step with the entries actually linked
+        health_log.count--;
+
+        ae = t;
+    }
+}
+
+// Turn the result of the critical expression into a status and, when the
+// status differs from the stored one, log the transition as a CRITICAL entry
+// and remember the new status.
+static inline void rrdcalc_check_critical_event(RRDCALC *rc) {
     calculated_number n = rc->critical->result;
 
     int old_status = rc->critical_status;
     int new_status = rrdcalc_value2status(n);
 
     if(new_status != old_status) {
-        info("Health alarm '%s.%s' = %0.2Lf - CRITICAL condition changed status from %s to %s",
-             rc->chart?rc->chart:"NOCHART", rc->name,
-             rc->value,
-             rrdcalc_status2string(old_status),
-             rrdcalc_status2string(new_status)
-        );
-
+        // NOTE(review): rc->rrdset is dereferenced unconditionally - confirm callers only pass alarms linked to a chart
+        health_alarm_log(time(NULL), ALARM_ENTRY_TYPE_CRITICAL, rc->name, rc->rrdset->id, rc->exec, rc->old_value, rc->value, old_status, new_status, rc->source);
         rc->critical_status = new_status;
     }
 }
 
-void rrdcalc_check_warning_event(RRDCALC *rc) {
+// Turn the result of the warning expression into a status and, when the
+// status differs from the stored one, log the transition as a WARNING entry
+// and remember the new status.
+static inline void rrdcalc_check_warning_event(RRDCALC *rc) {
     calculated_number n = rc->warning->result;
 
     int old_status = rc->warning_status;
     int new_status = rrdcalc_value2status(n);
 
     if(new_status != old_status) {
-        info("Health alarm '%s.%s' = %0.2Lf - WARNING condition changed status from %s to %s",
-             rc->chart?rc->chart:"NOCHART", rc->name,
-             rc->value,
-             rrdcalc_status2string(old_status),
-             rrdcalc_status2string(new_status)
-        );
-
+        // NOTE(review): rc->rrdset is dereferenced unconditionally - confirm callers only pass alarms linked to a chart
+        health_alarm_log(time(NULL), ALARM_ENTRY_TYPE_WARNING, rc->name, rc->rrdset->id, rc->exec, rc->old_value, rc->value, old_status, new_status, rc->source);
         rc->warning_status = new_status;
     }
 }
@@ -1482,6 +1614,7 @@ void *health_main(void *ptr) {
                 continue;
 
             runnable++;
+            rc->old_value = rc->value;
 
             // 1. if there is database lookup, do it
             // 2. if there is calculation expression, run it
@@ -1498,6 +1631,8 @@ void *health_main(void *ptr) {
                     // database lookup failed
                     rc->value = NAN;
 
+                    debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
+
                     if (unlikely(!(rc->rrdcalc_options & RRDCALC_OPTION_DB_ERROR))) {
                         rc->rrdcalc_options |= RRDCALC_OPTION_DB_ERROR;
                         error("Health alarm '%s.%s': database lookup returned error %d", rc->chart?rc->chart:"NOCHART", rc->name, ret);
@@ -1509,6 +1644,8 @@ void *health_main(void *ptr) {
                 if (unlikely(old_db_timestamp == rc->db_timestamp)) {
                     // database is stale
 
+                    debug(D_HEALTH, "Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
+
                     if (unlikely(!(rc->rrdcalc_options & RRDCALC_OPTION_DB_STALE))) {
                         rc->rrdcalc_options |= RRDCALC_OPTION_DB_STALE;
                         error("Health alarm '%s.%s': database is stale", rc->chart?rc->chart:"NOCHART", rc->name);
@@ -1522,6 +1659,9 @@ void *health_main(void *ptr) {
 
                     rc->value = NAN;
 
+                    debug(D_HEALTH, "Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
+                          rc->chart?rc->chart:"NOCHART", rc->name);
+
                     if (unlikely(!(rc->rrdcalc_options & RRDCALC_OPTION_DB_NAN))) {
                         rc->rrdcalc_options |= RRDCALC_OPTION_DB_NAN;
                         error("Health alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
@@ -1541,6 +1681,9 @@ void *health_main(void *ptr) {
 
                     rc->value = NAN;
 
+                    debug(D_HEALTH, "Health alarm '%s.%s': failed to evaluate calculation with error: %s",
+                          rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->calculation->error_msg));
+
                     if (unlikely(!(rc->rrdcalc_options & RRDCALC_OPTION_CALC_ERROR))) {
                         rc->rrdcalc_options |= RRDCALC_OPTION_CALC_ERROR;
                         error("Health alarm '%s.%s': failed to evaluate calculation with error: %s",
@@ -1576,6 +1719,10 @@ void *health_main(void *ptr) {
                 if(unlikely(rc->warning)) {
                     if(unlikely(!expression_evaluate(rc->warning))) {
                         // calculation failed
+
+                        debug(D_HEALTH, "Health alarm '%s.%s': warning expression failed with error: %s",
+                              rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->warning->error_msg));
+
                         if (unlikely(!(rc->rrdcalc_options & RRDCALC_OPTION_WARN_ERROR))) {
                             rc->rrdcalc_options |= RRDCALC_OPTION_WARN_ERROR;
                             error("Health alarm '%s.%s': warning expression failed with error: %s",
@@ -1602,6 +1749,10 @@ void *health_main(void *ptr) {
                 if(unlikely(rc->critical)) {
                     if(unlikely(!expression_evaluate(rc->critical))) {
                         // calculation failed
+
+                        debug(D_HEALTH, "Health alarm '%s.%s': critical expression failed with error: %s",
+                              rc->chart?rc->chart:"NOCHART", rc->name, buffer_tostring(rc->critical->error_msg));
+
                         if (unlikely(!(rc->rrdcalc_options & RRDCALC_OPTION_CRIT_ERROR))) {
                             rc->rrdcalc_options |= RRDCALC_OPTION_CRIT_ERROR;
                             error("Health alarm '%s.%s': critical expression failed with error: %s",
@@ -1635,14 +1786,22 @@ void *health_main(void *ptr) {
             rrdhost_unlock(&localhost);
         }
 
-
         if (unlikely(pthread_setcancelstate(oldstate, NULL) != 0))
             error("Cannot set pthread cancel state to RESTORE (%d).", oldstate);
 
-        debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs",
-              loop, (int) (next_run - now));
+        // execute notifications
+        // and cleanup
+        health_alarm_log_process();
 
-        sleep_usec(1000000 * (unsigned long long) (next_run - now));
+        now = time(NULL);
+        if(now < next_run) {
+            debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs",
+                  loop, (int) (next_run - now));
+            sleep_usec(1000000 * (unsigned long long) (next_run - now));
+        }
+        else {
+            debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
+        }
     }
 
     buffer_free(wb);
index a75ce44e96f212216a427671b0f994e78f8674ec..2c30914457ded025b41a1a8f05fa7723d6e98282 100644 (file)
@@ -149,6 +149,7 @@ typedef struct rrdcalc {
     time_t db_timestamp;
 
     calculated_number value;
+    calculated_number old_value;
 
     calculated_number green;
     calculated_number red;
@@ -201,6 +202,33 @@ typedef struct rrdcalctemplate {
 
 #define RRDCALCTEMPLATE_HAS_CALCULATION(rt) ((rt)->after)
 
+#define ALARM_ENTRY_TYPE_WARNING  1
+#define ALARM_ENTRY_TYPE_CRITICAL 2
+
+#define HEALTH_ENTRY_NOTIFICATIONS_PROCESSED 0x00000001
+
+typedef struct alarm_entry { // one logged alarm status transition
+    uint32_t id; // taken from health_log.nextid when the entry is logged
+    time_t when; // timestamp supplied by the code that logged the entry
+    int type; // ALARM_ENTRY_TYPE_WARNING or ALARM_ENTRY_TYPE_CRITICAL
+    char *name; // private copy of the alarm name
+    char *chart; // private copy of the chart id, or NULL
+    char *exec; // private copy of the script to run, or NULL to use the default script
+    char *source; // private copy of the configuration source, or NULL
+    calculated_number old_value; // alarm value before the transition
+    calculated_number new_value; // alarm value at the transition
+    int old_status; // status before the transition
+    int new_status; // status after the transition
+    uint32_t notifications; // HEALTH_ENTRY_NOTIFICATIONS_* flags
+    struct alarm_entry *next; // next entry in the log list
+} ALARM_ENTRY;
+
+typedef struct alarm_log { // in-memory list of ALARM_ENTRY records
+    uint32_t nextid; // id handed to the next entry that is logged
+    unsigned int count; // entry counter, compared against max when the log is trimmed
+    unsigned int max; // limit of entries kept; configurable in health_init()
+    ALARM_ENTRY *alarms; // head of the list; new entries are linked here
+} ALARM_LOG;
 
 #include "rrd.h"
 
index abc8b4c0cec117fe529b217ab5fd2d0697881c33..0fd2d4567d1fac1da6e2129ae03c7700aa41c70f 100644 (file)
--- a/src/rrd.h
+++ b/src/rrd.h
@@ -13,7 +13,7 @@ extern int rrd_default_history_entries;
 // set to zero to disable this feature
 extern int rrd_delete_unupdated_dimensions;
 
-#define RRD_ID_LENGTH_MAX 1024
+#define RRD_ID_LENGTH_MAX 400
 
 #define RRDSET_MAGIC        "NETDATA RRD SET FILE V018"
 #define RRDDIMENSION_MAGIC  "NETDATA RRD DIMENSION FILE V018"