arthur.barton.de Git - netdata.git/commitdiff
each alarm now has one status with the following possible values: UNINITIALIZED,...
author Costa Tsaousis <costa@tsaousis.gr>
Wed, 17 Aug 2016 22:50:04 +0000 (01:50 +0300)
committer Costa Tsaousis <costa@tsaousis.gr>
Wed, 17 Aug 2016 22:50:04 +0000 (01:50 +0300)
plugins.d/alarm.sh
src/health.c
src/health.h

index a4bade19a7013bfbaba67acd7106ac51b7675b6f..50a7d9008d096d04298bb2328d2f41c679b683ef 100755 (executable)
@@ -1,5 +1,7 @@
 #!/usr/bin/env bash
 
+me="${0}"
+
 sendmail="$(which sendmail 2>/dev/null || command -v sendmail 2>/dev/null)"
 if [ -z "${sendmail}" ]
 then
@@ -8,20 +10,25 @@ fi
 
 sendmail_from_pipe() {
     "${sendmail}" -t
-}
 
-type="${1}"       # WARNING or CRITICAL
-name="${2}"       # the name of the alarm, as given in netdata health.d entries
-chart="${3}"      # the name of the chart (type.id)
-status="${4}"     # the current status
-old_status="${5}" # the previous status
-value="${6}"      # the current value
-old_value="${7}"  # the previous value
-src="${8}"        # the line number and file the alarm has been configured
-duration="${9}"   # the duration in seconds the previous state took
+    if [ $? -eq 0 ]
+    then
+        echo >&2 "${me}: Sent notification email for ${status} on '${chart}.${name}'"
+        return 0
+    else
+        echo >&2 "${me}: FAILED to send notification email for ${status} on '${chart}.${name}'"
+        return 1
+    fi
+}
 
-# don't do anything if this is not RAISED or OFF
-[ "${status}" != "RAISED" -a "${status}" != "OFF" ] && exit 0
+name="${1}"       # the name of the alarm, as given in netdata health.d entries
+chart="${2}"      # the name of the chart (type.id)
+status="${3}"     # the current status : UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
+old_status="${4}" # the previous status: UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
+value="${5}"      # the current value
+old_value="${6}"  # the previous value
+src="${7}"        # the line number and file where the alarm has been configured
+duration="${8}"   # the duration in seconds the previous state took
 
 # get the system hostname
 hostname="$(hostname)"
@@ -76,54 +83,64 @@ duration4human() {
     fi
 }
 
-if [ "${old_status}" = "RAISED" -a "${status}" = "OFF" ]
-then
-    # an alarm that is now OFF
-    severity="${type} recovered"
-    raised_for="<br/>(was in ${type,,} state for $(duration4human ${duration}))"
-else
-    # an alarm that is now RAISED
-    severity="${type}"
-    raised_for=""
-fi
+severity="${status}"
+raised_for="<br/>(was in ${old_status,,} for $(duration4human ${duration}))"
+status_message="status unknown"
+color="grey"
+alarm="${name} = ${value}"
 
-# prepare the title
+# prepare the title based on status
 case "${status}" in
-       RAISED)
-               if [ "${type}" = "CRITICAL" ]
-               then
-                       # CRITICAL - red
-                       status_message="is critical"
-                       color="#ca414b"
-               else
-                       # WARNING - yellow
-                       status_message="needs attention"
-                       color="#caca4b"
-               fi
+       CRITICAL)
+        status_message="is critical"
+        color="#ca414b"
+        ;;
+
+    WARNING)
+        status_message="needs attention"
+        color="#caca4b"
                ;;
 
-       OFF)
-               if [ "${type}" = "CRITICAL" ]
-               then
-                       # CRITICAL
-                       status_message="recovered"
-               else
-                       # WARNING
-                       status_message="back to normal"
-               fi
+       CLEAR)
+       status_message="recovered"
                color="#77ca6d"
-               ;;
 
-       *)
-       status_message="status unknown"
-               color="grey"
+               # don't show the value when the status is CLEAR
+               # for certain alarms, this value might not have any meaning
+               alarm="${name}"
                ;;
 esac
 
+if [ "${status}" != "WARNING" -a "${status}" != "CRITICAL" -a "${status}" != "CLEAR" ]
+then
+    # don't do anything if this is not WARNING, CRITICAL or CLEAR
+    echo >&2 "${me}: not sending notification email for ${status} on '${chart}.${name}'"
+    exit 0
+elif [ "${old_status}" != "WARNING" -a "${old_status}" != "CRITICAL" -a "${status}" = "CLEAR" ]
+then
+    # don't do anything if this is CLEAR, but it was not WARNING or CRITICAL
+    echo >&2 "${me}: not sending notification email for ${status} on '${chart}.${name}' (last status was ${old_status})"
+    exit 0
+elif [ "${status}" = "CLEAR" ]
+then
+    severity="Recovered from ${old_status}"
+
+elif [ "${old_status}" = "WARNING" -a "${status}" = "CRITICAL" ]
+then
+    severity="Escalated to ${status}"
+
+elif [ "${old_status}" = "CRITICAL" -a "${status}" = "WARNING" ]
+then
+    severity="Demoted to ${status}"
+
+else
+    raised_for=
+fi
+
 # send the email
 cat <<EOF | sendmail_from_pipe
 To: root
-Subject: ${type} ${hostname} ${status_message} - ${chart}.${name}
+Subject: ${hostname} ${status_message} - ${chart}.${name}
 Content-Type: text/html
 
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
@@ -165,7 +182,7 @@ Content-Type: text/html
                                     <tr style="margin:0;padding:0">
                                         <td style="font-size:18px;vertical-align:top;margin:0;padding:0 0 20px"
                                             align="left" valign="top">
-                                            <span>${name} = ${value}</span>
+                                            <span>${alarm}</span>
                                             <span style="display:block;color:#666666;font-size:12px;font-weight:300;line-height:1;text-transform:uppercase">Alarm</span>
                                         </td>
                                     </tr>
index 984816179a97e25d017e4a2cf7dd7ddc2aae28dc..cee818e7fd34c40af458fab548b46d02da530bcb 100644 (file)
@@ -1406,7 +1406,7 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run)
 static inline int rrdcalc_value2status(calculated_number n) {
     if(isnan(n)) return RRDCALC_STATUS_UNDEFINED;
     if(n) return RRDCALC_STATUS_RAISED;
-    return RRDCALC_STATUS_OFF;
+    return RRDCALC_STATUS_CLEAR;
 }
 
 static inline const char *rrdcalc_status2string(int status) {
@@ -1417,23 +1417,16 @@ static inline const char *rrdcalc_status2string(int status) {
         case RRDCALC_STATUS_UNDEFINED:
             return "UNDEFINED";
 
+        case RRDCALC_STATUS_CLEAR:
+            return "CLEAR";
+
         case RRDCALC_STATUS_RAISED:
             return "RAISED";
 
-        case RRDCALC_STATUS_OFF:
-            return "OFF";
-
-        default:
-            return "UNKNOWN";
-    }
-}
-
-static inline const char *alarm_entry_type2string(int type) {
-    switch(type) {
-        case ALARM_ENTRY_TYPE_WARNING:
+        case RRDCALC_STATUS_WARNING:
             return "WARNING";
 
-        case ALARM_ENTRY_TYPE_CRITICAL:
+        case RRDCALC_STATUS_CRITICAL:
             return "CRITICAL";
 
         default:
@@ -1442,7 +1435,7 @@ static inline const char *alarm_entry_type2string(int type) {
 }
 
 static inline void health_alarm_execute(ALARM_ENTRY *ae) {
-    if(ae->old_status == RRDCALC_STATUS_UNINITIALIZED && ae->new_status == RRDCALC_STATUS_OFF)
+    if(ae->old_status == RRDCALC_STATUS_UNINITIALIZED && ae->new_status == RRDCALC_STATUS_CLEAR)
         return;
 
     char buffer[FILENAME_MAX + 1];
@@ -1451,9 +1444,8 @@ static inline void health_alarm_execute(ALARM_ENTRY *ae) {
     const char *exec = ae->exec;
     if(!exec) exec = health_default_exec;
 
-    snprintfz(buffer, FILENAME_MAX, "exec %s '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u'",
+    snprintfz(buffer, FILENAME_MAX, "exec %s '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u'",
               exec,
-              alarm_entry_type2string(ae->type),
               ae->name,
               ae->chart?ae->chart:"NOCAHRT",
               rrdcalc_status2string(ae->new_status),
@@ -1479,10 +1471,9 @@ static inline void health_alarm_execute(ALARM_ENTRY *ae) {
 }
 
 static inline void health_process_notifications(ALARM_ENTRY *ae) {
-    info("Health alarm '%s.%s' = %0.2Lf - %s changed status from %s to %s",
+    info("Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
          ae->chart?ae->chart:"NOCHART", ae->name,
          ae->new_value,
-         alarm_entry_type2string(ae->type),
          rrdcalc_status2string(ae->old_status),
          rrdcalc_status2string(ae->new_status)
     );
@@ -1490,7 +1481,7 @@ static inline void health_process_notifications(ALARM_ENTRY *ae) {
     health_alarm_execute(ae);
 }
 
-static inline void health_alarm_log(time_t when, int type,
+static inline void health_alarm_log(time_t when,
                 const char *name, const char *chart, const char *exec,
                 time_t duration,
                 calculated_number old_value, calculated_number new_value,
@@ -1499,22 +1490,41 @@ static inline void health_alarm_log(time_t when, int type,
 ) {
     ALARM_ENTRY *ae = callocz(1, sizeof(ALARM_ENTRY));
     ae->name = strdupz(name);
-    if(chart) ae->chart = strdupz(chart);
+    ae->hash_name = simple_hash(ae->name);
+
+    if(chart) {
+        ae->chart = strdupz(chart);
+        ae->hash_chart = simple_hash(ae->chart);
+    }
+
     if(exec) ae->exec = strdupz(exec);
     if(source) ae->source = strdupz(source);
 
     ae->id = health_log.nextid++;
     ae->when = when;
-    ae->type = type;
     ae->old_value = old_value;
     ae->new_value = new_value;
     ae->old_status = old_status;
     ae->new_status = new_status;
     ae->duration = duration;
+
     // link it
     ae->next = health_log.alarms;
     health_log.alarms = ae;
     health_log.count++;
+
+    // match previous alarms
+    ALARM_ENTRY *t;
+    for(t = health_log.alarms ; t ; t = t->next) {
+        if(t != ae &&
+                t->hash_name == ae->hash_name &&
+                t->hash_chart == ae->hash_chart &&
+                !strcmp(t->name, ae->name) &&
+                t->chart && ae->chart && !strcmp(t->chart, ae->chart)) {
+            t->notifications |= HEALTH_ENTRY_NOTIFICATIONS_UPDATED;
+            t->updated_by = ae;
+        }
+    }
 }
 
 static inline void health_alarm_log_process(void) {
@@ -1524,7 +1534,8 @@ static inline void health_alarm_log_process(void) {
     for(ae = health_log.alarms; ae ;ae = ae->next) {
         if(last_processed >= ae->id) break;
 
-        if(!(ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_PROCESSED)) {
+        if(!(ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_PROCESSED) &&
+                !(ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_UPDATED)) {
             ae->notifications |= HEALTH_ENTRY_NOTIFICATIONS_PROCESSED;
             health_process_notifications(ae);
         }
@@ -1555,34 +1566,6 @@ static inline void health_alarm_log_process(void) {
     }
 }
 
-static inline void rrdcalc_check_warning_event(RRDCALC *rc) {
-    calculated_number n = rc->warning->result;
-
-    int old_status = rc->warning_status;
-    int new_status = rrdcalc_value2status(n);
-
-    if(new_status != old_status) {
-        time_t now = time(NULL);
-        health_alarm_log(time(NULL), ALARM_ENTRY_TYPE_WARNING, rc->name, rc->rrdset->id, rc->exec, now - rc->last_status_change, rc->old_value, rc->value, old_status, new_status, rc->source);
-        rc->last_status_change = now;
-        rc->warning_status = new_status;
-    }
-}
-
-static inline void rrdcalc_check_critical_event(RRDCALC *rc) {
-    calculated_number n = rc->critical->result;
-
-    int old_status = rc->critical_status;
-    int new_status = rrdcalc_value2status(n);
-
-    if(new_status != old_status) {
-        time_t now = time(NULL);
-        health_alarm_log(time(NULL), ALARM_ENTRY_TYPE_CRITICAL, rc->name, rc->rrdset->id, rc->exec, now - rc->last_status_change, rc->old_value, rc->value, old_status, new_status, rc->source);
-        rc->last_status_change = now;
-        rc->critical_status = new_status;
-    }
-}
-
 void *health_main(void *ptr) {
     (void)ptr;
 
@@ -1722,6 +1705,9 @@ void *health_main(void *ptr) {
                 if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run)))
                     continue;
 
+                int warning_status  = RRDCALC_STATUS_UNDEFINED;
+                int critical_status = RRDCALC_STATUS_UNDEFINED;
+
                 if(unlikely(rc->warning)) {
                     if(unlikely(!expression_evaluate(rc->warning))) {
                         // calculation failed
@@ -1747,9 +1733,8 @@ void *health_main(void *ptr) {
                               buffer_tostring(rc->warning->error_msg),
                               rc->source
                         );
+                        warning_status = rrdcalc_value2status(rc->warning->result);
                     }
-
-                    rrdcalc_check_warning_event(rc);
                 }
 
                 if(unlikely(rc->critical)) {
@@ -1777,9 +1762,23 @@ void *health_main(void *ptr) {
                               buffer_tostring(rc->critical->error_msg),
                               rc->source
                         );
+
+                        critical_status = rrdcalc_value2status(rc->critical->result);
                     }
+                }
+
+                int status = RRDCALC_STATUS_UNDEFINED;
+
+                if(warning_status == RRDCALC_STATUS_RAISED)
+                    status = RRDCALC_STATUS_WARNING;
+
+                if(critical_status == RRDCALC_STATUS_RAISED)
+                    status = RRDCALC_STATUS_CRITICAL;
 
-                    rrdcalc_check_critical_event(rc);
+                if(status != rc->status) {
+                    health_alarm_log(time(NULL), rc->name, rc->rrdset->id, rc->exec, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source);
+                    rc->last_status_change = now;
+                    rc->status = status;
                 }
 
                 rc->last_updated = now;
index 0cd250973c2a4c3e149aa08c039ffe6e498df0aa..3af3faff466761bf101bbe99e04102b76e7fdb53 100644 (file)
@@ -106,8 +106,10 @@ typedef struct rrddimvar {
 
 #define RRDCALC_STATUS_UNINITIALIZED  0
 #define RRDCALC_STATUS_UNDEFINED     -1
-#define RRDCALC_STATUS_OFF            1
+#define RRDCALC_STATUS_CLEAR          1
 #define RRDCALC_STATUS_RAISED         2
+#define RRDCALC_STATUS_WARNING        3
+#define RRDCALC_STATUS_CRITICAL       4
 
 #define RRDCALC_OPTION_DB_ERROR      0x00000001
 #define RRDCALC_OPTION_DB_NAN        0x00000002
@@ -143,6 +145,7 @@ typedef struct rrdcalc {
     EVAL_EXPRESSION *critical;
 
     uint32_t rrdcalc_options;
+    int status;
     int warning_status;
     int critical_status;
 
@@ -203,25 +206,31 @@ typedef struct rrdcalctemplate {
 
 #define RRDCALCTEMPLATE_HAS_CALCULATION(rt) ((rt)->after)
 
-#define ALARM_ENTRY_TYPE_WARNING  1
-#define ALARM_ENTRY_TYPE_CRITICAL 2
-
 #define HEALTH_ENTRY_NOTIFICATIONS_PROCESSED 0x00000001
+#define HEALTH_ENTRY_NOTIFICATIONS_UPDATED   0x00000002
 
 typedef struct alarm_entry {
     uint32_t id;
+
     time_t when;
     time_t duration;
-    int type;
+
     char *name;
+    uint32_t hash_name;
+
     char *chart;
+    uint32_t hash_chart;
+
     char *exec;
     char *source;
     calculated_number old_value;
     calculated_number new_value;
     int old_status;
     int new_status;
+
     uint32_t notifications;
+
+    struct alarm_entry *updated_by;
     struct alarm_entry *next;
 } ALARM_ENTRY;