arthur.barton.de Git - netdata.git/commitdiff
API for getting alarm log ready
author Costa Tsaousis <costa@tsaousis.gr>
Thu, 25 Aug 2016 16:39:08 +0000 (19:39 +0300)
committer Costa Tsaousis <costa@tsaousis.gr>
Thu, 25 Aug 2016 16:39:08 +0000 (19:39 +0300)
src/health.c
src/health.h
src/rrd.c
src/rrd.h
src/web_client.c

index 3cf9ae50a5d8ff92dd7a29b2cf63dc46e5de273d..827fd3de0b8a6015c0baee7c148afef70ac12d10 100644 (file)
@@ -5,13 +5,6 @@
 static const char *health_default_exec = PLUGINS_DIR "/alarm-email.sh";
 int health_enabled = 1;
 
-ALARM_LOG health_log = {
-        .nextid = 0,
-        .count = 0,
-        .max = 1000,
-        .alarms = NULL
-};
-
 // ----------------------------------------------------------------------------
 // RRDVAR management
 
@@ -443,6 +436,7 @@ static inline const char *rrdcalc_status2string(int status) {
 static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
     debug(D_HEALTH, "Health linking alarm '%s.%s' to chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, st->rrdhost->hostname);
 
+    rc->last_status_change = time(NULL);
     rc->rrdset = st;
 
     rc->rrdset_next = st->alarms;
@@ -1399,12 +1393,12 @@ void health_init(void) {
         health_default_exec = config_get("health", "script to execute on alarm", buffer);
     }
 
-    long n = config_get_number("health", "in memory max health log entries", (long)health_log.max);
+    long n = config_get_number("health", "in memory max health log entries", (long)localhost.health_log.max);
     if(n < 2) {
-        error("Health configuration has invalid max log entries %ld. Using default %u", n, health_log.max);
-        config_set_number("health", "in memory max health log entries", (long)health_log.max);
+        error("Health configuration has invalid max log entries %ld. Using default %u", n, localhost.health_log.max);
+        config_set_number("health", "in memory max health log entries", (long)localhost.health_log.max);
     }
-    else health_log.max = (unsigned int)n;
+    else localhost.health_log.max = (unsigned int)n;
 
     rrdhost_rwlock(&localhost);
     health_readdir(path);
@@ -1421,12 +1415,78 @@ static inline void health_string2json(BUFFER *wb, const char *prefix, const char
         buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix);
 }
 
-static inline void health_rrdcalc2json_nolock(BUFFER *wb, RRDCALC *rc) {
+static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae) { // appends one alarm log entry to wb as a JSON object; takes no lock itself (health_alarm_log2json() calls it with the log read lock held)
+    buffer_sprintf(wb, "\n\t{\n"
+                           "\t\t\"id\":%u,\n"
+                           "\t\t\"name\":\"%s\",\n"
+                           "\t\t\"chart\":\"%s\",\n"
+                           "\t\t\"family\":\"%s\",\n"
+                           "\t\t\"processed\":%s,\n"
+                           "\t\t\"updated\":%s,\n"
+                           "\t\t\"exec_run\":%s,\n"
+                           "\t\t\"exec_failed\":%s,\n"
+                           "\t\t\"exec\":\"%s\",\n"
+                           "\t\t\"exec_code\":%d,\n"
+                           "\t\t\"source\":\"%s\",\n"
+                           "\t\t\"when\":%lu,\n"
+                           "\t\t\"duration\":%lu,\n"
+                           "\t\t\"non_clear_duration\":%lu,\n"
+                           "\t\t\"status\":\"%s\",\n"
+                           "\t\t\"old_status\":\"%s\",\n",
+                   ae->id,
+                   ae->name, // NOTE(review): name/chart/family are passed unguarded and unescaped - confirm they can never be NULL or contain '"'
+                   ae->chart,
+                   ae->family,
+                   (ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_PROCESSED)?"true":"false",
+                   (ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_UPDATED)?"true":"false",
+                   (ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_EXEC_RUN)?"true":"false",
+                   (ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_EXEC_FAILED)?"true":"false",
+                   ae->exec?ae->exec:health_default_exec, // entries without their own exec report the default script
+                   ae->exec_code,
+                   ae->source?ae->source:"", // health_alarm_log() only sets source when it is non-NULL; never hand NULL to %s
+                   (unsigned long)ae->when,
+                   (unsigned long)ae->duration,
+                   (unsigned long)ae->non_clear_duration,
+                   rrdcalc_status2string(ae->new_status),
+                   rrdcalc_status2string(ae->old_status)
+    );
+
+    buffer_strcat(wb, "\t\t\"value\":");
+    buffer_rrd_value(wb, ae->new_value);
+    buffer_strcat(wb, ",\n");
+
+    buffer_strcat(wb, "\t\t\"old_value\":");
+    buffer_rrd_value(wb, ae->old_value);
+    buffer_strcat(wb, "\n"); // last member of the object: no trailing comma
+
+    buffer_strcat(wb, "\t}");
+}
+
+void health_alarm_log2json(RRDHOST *host, BUFFER *wb) { // writes the host's in-memory alarm log to wb as a JSON array
+    ALARM_LOG *log = &host->health_log;
+    pthread_rwlock_rdlock(&log->alarm_log_rwlock); // read lock is held for the whole list walk
+
+    buffer_strcat(wb, "[");
+    unsigned int limit = log->max; // emit at most this many entries, starting from the list head
+    unsigned int emitted = 0;
+    ALARM_ENTRY *entry = log->alarms;
+    for( ; entry && emitted < limit ; emitted++, entry = entry->next) {
+        if(likely(emitted)) buffer_strcat(wb, ","); // separator before every entry except the first
+        health_alarm_entry2json_nolock(wb, entry);
+    }
 
+    buffer_strcat(wb, "\n]\n");
+
+    pthread_rwlock_unlock(&log->alarm_log_rwlock);
+}
+
+static inline void health_rrdcalc2json_nolock(BUFFER *wb, RRDCALC *rc) {
     buffer_sprintf(wb,
            "\t\t\"%s.%s\": {\n"
                    "\t\t\t\"name\": \"%s\",\n"
                    "\t\t\t\"chart\": \"%s\",\n"
+                   "\t\t\t\"family\": \"%s\",\n"
+                   "\t\t\t\"active\": %s,\n"
                    "\t\t\t\"exec\": \"%s\",\n"
                    "\t\t\t\"source\": \"%s\",\n"
                    "\t\t\t\"status\": \"%s\",\n"
@@ -1437,6 +1497,8 @@ static inline void health_rrdcalc2json_nolock(BUFFER *wb, RRDCALC *rc) {
             , rc->chart, rc->name
             , rc->name
             , rc->chart
+            , (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:""
+            , (rc->rrdset)?"true":"false"
             , rc->exec?rc->exec:health_default_exec
             , rc->source
             , rrdcalc_status2string(rc->status)
@@ -1508,6 +1570,8 @@ void health_alarms2json(RRDHOST *host, BUFFER *wb) {
     buffer_strcat(wb, "{\n\t\"alarms\": {\n");
     RRDCALC *rc;
     for(i = 0, rc = host->alarms; rc ; rc = rc->next, i++) {
+        if(!rc->rrdset) continue;
+
         if(likely(i)) buffer_strcat(wb, ",\n");
         health_rrdcalc2json_nolock(wb, rc);
     }
@@ -1624,6 +1688,8 @@ static inline void health_alarm_execute(ALARM_ENTRY *ae) {
               (uint32_t)ae->non_clear_duration
     );
 
+    ae->notifications |= HEALTH_ENTRY_NOTIFICATIONS_EXEC_RUN;
+
     debug(D_HEALTH, "executing command '%s'", buffer);
     FILE *fp = mypopen(buffer, &command_pid);
     if(!fp) {
@@ -1634,8 +1700,11 @@ static inline void health_alarm_execute(ALARM_ENTRY *ae) {
     char *s = fgets(buffer, FILENAME_MAX, fp);
     (void)s;
     debug(D_HEALTH, "HEALTH closing command");
-    mypclose(fp, command_pid);
-    debug(D_HEALTH, "closed command");
+    ae->exec_code = mypclose(fp, command_pid);
+    debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
+
+    if(ae->exec_code != 0)
+        ae->notifications |= HEALTH_ENTRY_NOTIFICATIONS_EXEC_FAILED;
 }
 
 static inline void health_process_notifications(ALARM_ENTRY *ae) {
@@ -1649,7 +1718,7 @@ static inline void health_process_notifications(ALARM_ENTRY *ae) {
     health_alarm_execute(ae);
 }
 
-static inline void health_alarm_log(time_t when,
+static inline void health_alarm_log(RRDHOST *host, time_t when,
                 const char *name, const char *chart, const char *family,
                 const char *exec, time_t duration,
                 calculated_number old_value, calculated_number new_value,
@@ -1671,7 +1740,7 @@ static inline void health_alarm_log(time_t when,
     if(exec) ae->exec = strdupz(exec);
     if(source) ae->source = strdupz(source);
 
-    ae->id = health_log.nextid++;
+    ae->id = host->health_log.nextid++;
     ae->when = when;
     ae->old_value = old_value;
     ae->new_value = new_value;
@@ -1683,13 +1752,16 @@ static inline void health_alarm_log(time_t when,
         ae->non_clear_duration += ae->duration;
 
     // link it
-    ae->next = health_log.alarms;
-    health_log.alarms = ae;
-    health_log.count++;
+    pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
+    ae->next = host->health_log.alarms;
+    host->health_log.alarms = ae;
+    host->health_log.count++;
+    pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
 
     // match previous alarms
+    pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
     ALARM_ENTRY *t;
-    for(t = health_log.alarms ; t ; t = t->next) {
+    for(t = host->health_log.alarms ; t ; t = t->next) {
         if(t != ae &&
                 t->hash_name == ae->hash_name &&
                 t->hash_chart == ae->hash_chart &&
@@ -1710,13 +1782,16 @@ static inline void health_alarm_log(time_t when,
             }
         }
     }
+    pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
 }
 
-static inline void health_alarm_log_process(void) {
+static inline void health_alarm_log_process(RRDHOST *host) {
     static uint32_t last_processed = 0;
     ALARM_ENTRY *ae;
 
-    for(ae = health_log.alarms; ae ;ae = ae->next) {
+    pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
+
+    for(ae = host->health_log.alarms; ae ;ae = ae->next) {
         if(last_processed >= ae->id) break;
 
         if(!(ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_PROCESSED) &&
@@ -1726,18 +1801,25 @@ static inline void health_alarm_log_process(void) {
         }
     }
 
-    if(health_log.alarms)
-        last_processed = health_log.alarms->id;
+    if(host->health_log.alarms)
+        last_processed = host->health_log.alarms->id;
+
+    pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
 
-    if(health_log.count <= health_log.max)
+    if(host->health_log.count <= host->health_log.max)
         return;
 
     // cleanup excess entries in the log
+    pthread_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
+
     ALARM_ENTRY *last = NULL;
-    unsigned int count = health_log.max;
-    for(ae = health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
-    if(!ae || !last || last->next != ae) return;
-    last->next = NULL;
+    unsigned int count = host->health_log.max;
+    for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
+
+    if(ae && last && last->next == ae)
+        last->next = NULL;
+    else
+        ae = NULL;
 
     while(ae) {
         ALARM_ENTRY *t = ae->next;
@@ -1750,6 +1832,8 @@ static inline void health_alarm_log_process(void) {
 
         ae = t;
     }
+
+    pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
 }
 
 void *health_main(void *ptr) {
@@ -1984,7 +2068,7 @@ void *health_main(void *ptr) {
                 }
 
                 if(status != rc->status) {
-                    health_alarm_log(time(NULL), rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source);
+                    health_alarm_log(&localhost, time(NULL), rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source);
                     rc->last_status_change = now;
                     rc->status = status;
                 }
@@ -2004,7 +2088,7 @@ void *health_main(void *ptr) {
 
         // execute notifications
         // and cleanup
-        health_alarm_log_process();
+        health_alarm_log_process(&localhost);
 
         now = time(NULL);
         if(now < next_run) {
index 5cc9dcbdc92e602ff5c6251cecfe722c9b9bad7d..b6b890911859d809587f2ce828ba522193affbc8 100644 (file)
@@ -206,8 +206,10 @@ typedef struct rrdcalctemplate {
 
 #define RRDCALCTEMPLATE_HAS_CALCULATION(rt) ((rt)->after)
 
-#define HEALTH_ENTRY_NOTIFICATIONS_PROCESSED 0x00000001
-#define HEALTH_ENTRY_NOTIFICATIONS_UPDATED   0x00000002
+#define HEALTH_ENTRY_NOTIFICATIONS_PROCESSED    0x00000001 // bit in ALARM_ENTRY.notifications; exported as "processed" in the alarm log JSON
+#define HEALTH_ENTRY_NOTIFICATIONS_UPDATED      0x00000002 // bit in ALARM_ENTRY.notifications; exported as "updated" in the alarm log JSON
+#define HEALTH_ENTRY_NOTIFICATIONS_EXEC_RUN     0x00000004 // set by health_alarm_execute() right before it launches the notification command
+#define HEALTH_ENTRY_NOTIFICATIONS_EXEC_FAILED  0x00000008 // set by health_alarm_execute() when mypclose() returns a non-zero code for the command
 
 typedef struct alarm_entry {
     uint32_t id;
@@ -225,6 +227,8 @@ typedef struct alarm_entry {
     char *family;
 
     char *exec;
+    int exec_code;
+
     char *source;
     calculated_number old_value;
     calculated_number new_value;
@@ -242,6 +246,7 @@ typedef struct alarm_log {
     unsigned int count;
     unsigned int max;
     ALARM_ENTRY *alarms;
+    pthread_rwlock_t alarm_log_rwlock;
 } ALARM_LOG;
 
 #include "rrd.h"
@@ -266,5 +271,6 @@ extern void health_reload(void);
 
 extern int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, calculated_number *result);
 extern void health_alarms2json(RRDHOST *host, BUFFER *wb);
+extern void health_alarm_log2json(RRDHOST *host, BUFFER *wb);
 
 #endif //NETDATA_HEALTH_H
index 948e8dedf660c63746b0e0321c90c428f9b38ef2..61f99eda8eba99674c20033f97a231d6fdc216ab 100644 (file)
--- a/src/rrd.c
+++ b/src/rrd.c
@@ -40,6 +40,13 @@ RRDHOST localhost = {
         .variables_root_index = {
             { NULL, rrdvar_compare },
             AVL_LOCK_INITIALIZER
+        },
+        .health_log = {
+            .nextid = 1,
+            .count = 0,
+            .max = 1000,
+            .alarms = NULL,
+            .alarm_log_rwlock = PTHREAD_RWLOCK_INITIALIZER
         }
 };
 
index 4c894344820f45178cfdcffbc4b87c2540e3f922..37eb121e7e495448178e37133c99d90d85f0185e 100644 (file)
--- a/src/rrd.h
+++ b/src/rrd.h
@@ -304,6 +304,7 @@ struct rrdhost {
     // RRDCALCs may be linked to charts at any point
     // (charts may or may not exist when these are loaded)
     RRDCALC *alarms;
+    ALARM_LOG health_log;
 
     RRDCALCTEMPLATE *templates;
 };
index 7f74ac5a292baae569f48174c42edb48bf43ab4a..29d633b2fb39be6f03881c4f4086c0d37fcfe4c0 100644 (file)
@@ -670,6 +670,16 @@ int web_client_api_request_v1_alarms(struct web_client *w, char *url)
     return 200;
 }
 
+int web_client_api_request_v1_alarm_log(struct web_client *w, char *url)
+{
+    (void)url; // url is not used by this handler
+    BUFFER *out = w->response.data;
+    buffer_flush(out); // drop anything already in the response buffer
+    out->contenttype = CT_APPLICATION_JSON;
+    health_alarm_log2json(&localhost, out);
+    return 200;
+}
+
 int web_client_api_request_v1_charts(struct web_client *w, char *url)
 {
     if(url) { ; }
@@ -1324,7 +1334,7 @@ int web_client_api_request_v1_registry(struct web_client *w, char *url)
 }
 
 int web_client_api_request_v1(struct web_client *w, char *url) {
-    static uint32_t hash_data = 0, hash_chart = 0, hash_charts = 0, hash_registry = 0, hash_badge = 0, hash_alarms = 0;
+    static uint32_t hash_data = 0, hash_chart = 0, hash_charts = 0, hash_registry = 0, hash_badge = 0, hash_alarms = 0, hash_alarm_log = 0;
 
     if(unlikely(hash_data == 0)) {
         hash_data = simple_hash("data");
@@ -1333,6 +1343,7 @@ int web_client_api_request_v1(struct web_client *w, char *url) {
         hash_registry = simple_hash("registry");
         hash_badge = simple_hash("badge.svg");
         hash_alarms = simple_hash("alarms");
+        hash_alarm_log = simple_hash("alarm_log");
     }
 
     // get the command
@@ -1359,6 +1370,9 @@ int web_client_api_request_v1(struct web_client *w, char *url) {
         else if(hash == hash_alarms && !strcmp(tok, "alarms"))
             return web_client_api_request_v1_alarms(w, url);
 
+        else if(hash == hash_alarm_log && !strcmp(tok, "alarm_log"))
+            return web_client_api_request_v1_alarm_log(w, url);
+
         else {
             buffer_flush(w->response.data);
             buffer_sprintf(w->response.data, "Unsupported v1 API command: %s", tok);