#!/usr/bin/env bash
+me="${0}"
+
sendmail="$(which sendmail 2>/dev/null || command -v sendmail 2>/dev/null)"
if [ -z "${sendmail}" ]
then
sendmail_from_pipe() {
"${sendmail}" -t
-}
-type="${1}" # WARNING or CRITICAL
-name="${2}" # the name of the alarm, as given in netdata health.d entries
-chart="${3}" # the name of the chart (type.id)
-status="${4}" # the current status
-old_status="${5}" # the previous status
-value="${6}" # the current value
-old_value="${7}" # the previous value
-src="${8}" # the line number and file the alarm has been configured
-duration="${9}" # the duration in seconds the previous state took
+ if [ $? -eq 0 ]
+ then
+ echo >&2 "${me}: Sent notification email for ${status} on '${chart}.${name}'"
+ return 0
+ else
+ echo >&2 "${me}: FAILED to send notification email for ${status} on '${chart}.${name}'"
+ return 1
+ fi
+}
-# don't do anything if this is not RAISED or OFF
-[ "${status}" != "RAISED" -a "${status}" != "OFF" ] && exit 0
+name="${1}" # the name of the alarm, as given in netdata health.d entries
+chart="${2}" # the name of the chart (type.id)
+status="${3}" # the current status : UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
+old_status="${4}" # the previous status: UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
+value="${5}" # the current value
+old_value="${6}" # the previous value
+src="${7}" # the line number and file where the alarm has been configured
+duration="${8}" # the duration in seconds the previous state lasted
# get the system hostname
hostname="$(hostname)"
fi
}
-if [ "${old_status}" = "RAISED" -a "${status}" = "OFF" ]
-then
- # an alarm that is now OFF
- severity="${type} recovered"
- raised_for="<br/>(was in ${type,,} state for $(duration4human ${duration}))"
-else
- # an alarm that is now RAISED
- severity="${type}"
- raised_for=""
-fi
+severity="${status}"
+raised_for="<br/>(was in ${old_status,,} for $(duration4human ${duration}))"
+status_message="status unknown"
+color="grey"
+alarm="${name} = ${value}"
-# prepare the title
+# prepare the title based on status
case "${status}" in
- RAISED)
- if [ "${type}" = "CRITICAL" ]
- then
- # CRITICAL - red
- status_message="is critical"
- color="#ca414b"
- else
- # WARNING - yellow
- status_message="needs attention"
- color="#caca4b"
- fi
+ CRITICAL)
+ status_message="is critical"
+ color="#ca414b"
+ ;;
+
+ WARNING)
+ status_message="needs attention"
+ color="#caca4b"
;;
- OFF)
- if [ "${type}" = "CRITICAL" ]
- then
- # CRITICAL
- status_message="recovered"
- else
- # WARNING
- status_message="back to normal"
- fi
+ CLEAR)
+ status_message="recovered"
color="#77ca6d"
- ;;
- *)
- status_message="status unknown"
- color="grey"
+ # don't show the value when the status is CLEAR
+ # for certain alarms, this value might not have any meaning
+ alarm="${name}"
;;
esac
+if [ "${status}" != "WARNING" -a "${status}" != "CRITICAL" -a "${status}" != "CLEAR" ]
+then
+ # don't do anything if this is not WARNING, CRITICAL or CLEAR
+ echo >&2 "${me}: not sending notification email for ${status} on '${chart}.${name}'"
+ exit 0
+elif [ "${old_status}" != "WARNING" -a "${old_status}" != "CRITICAL" -a "${status}" = "CLEAR" ]
+then
+ # don't do anything if this is CLEAR, but it was not WARNING or CRITICAL
+ echo >&2 "${me}: not sending notification email for ${status} on '${chart}.${name}' (last status was ${old_status})"
+ exit 0
+elif [ "${status}" = "CLEAR" ]
+then
+ severity="Recovered from ${old_status}"
+
+elif [ "${old_status}" = "WARNING" -a "${status}" = "CRITICAL" ]
+then
+ severity="Escalated to ${status}"
+
+elif [ "${old_status}" = "CRITICAL" -a "${status}" = "WARNING" ]
+then
+ severity="Demoted to ${status}"
+
+else
+ raised_for=
+fi
+
# send the email
cat <<EOF | sendmail_from_pipe
To: root
-Subject: ${type} ${hostname} ${status_message} - ${chart}.${name}
+Subject: ${hostname} ${status_message} - ${chart}.${name}
Content-Type: text/html
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<tr style="margin:0;padding:0">
<td style="font-size:18px;vertical-align:top;margin:0;padding:0 0 20px"
align="left" valign="top">
- <span>${name} = ${value}</span>
+ <span>${alarm}</span>
<span style="display:block;color:#666666;font-size:12px;font-weight:300;line-height:1;text-transform:uppercase">Alarm</span>
</td>
</tr>
// Map the numeric result of an evaluated expression to a raw status:
//   NaN            -> RRDCALC_STATUS_UNDEFINED
//   non-zero value -> RRDCALC_STATUS_RAISED
//   zero           -> RRDCALC_STATUS_CLEAR (RRDCALC_STATUS_OFF in the removed line)
static inline int rrdcalc_value2status(calculated_number n) {
// NaN must be tested first: a NaN is also "true" for the if(n) test below
if(isnan(n)) return RRDCALC_STATUS_UNDEFINED;
if(n) return RRDCALC_STATUS_RAISED;
- return RRDCALC_STATUS_OFF;
+ return RRDCALC_STATUS_CLEAR;
}
static inline const char *rrdcalc_status2string(int status) {
case RRDCALC_STATUS_UNDEFINED:
return "UNDEFINED";
+ case RRDCALC_STATUS_CLEAR:
+ return "CLEAR";
+
case RRDCALC_STATUS_RAISED:
return "RAISED";
- case RRDCALC_STATUS_OFF:
- return "OFF";
-
- default:
- return "UNKNOWN";
- }
-}
-
-static inline const char *alarm_entry_type2string(int type) {
- switch(type) {
- case ALARM_ENTRY_TYPE_WARNING:
+ case RRDCALC_STATUS_WARNING:
return "WARNING";
- case ALARM_ENTRY_TYPE_CRITICAL:
+ case RRDCALC_STATUS_CRITICAL:
return "CRITICAL";
default:
}
// Build the command line of the notification script for an alarm log entry.
// The script is ae->exec, or health_default_exec when the entry has none.
// Arguments, in order (new format): name, chart, new status, ... - the
// remaining arguments are not visible in this excerpt.
static inline void health_alarm_execute(ALARM_ENTRY *ae) {
// the very first transition of an alarm, from "uninitialized" to the
// non-raised state, is not worth a notification
- if(ae->old_status == RRDCALC_STATUS_UNINITIALIZED && ae->new_status == RRDCALC_STATUS_OFF)
+ if(ae->old_status == RRDCALC_STATUS_UNINITIALIZED && ae->new_status == RRDCALC_STATUS_CLEAR)
return;
char buffer[FILENAME_MAX + 1];
const char *exec = ae->exec;
if(!exec) exec = health_default_exec;
- snprintfz(buffer, FILENAME_MAX, "exec %s '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u'",
+ snprintfz(buffer, FILENAME_MAX, "exec %s '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u'",
exec,
- alarm_entry_type2string(ae->type),
ae->name,
// placeholder for alarms without a chart; spelled like the one logged by
// health_process_notifications() so both outputs agree
ae->chart?ae->chart:"NOCHART",
rrdcalc_status2string(ae->new_status),
}
// Report the status transition of an alarm log entry through info() and then
// pass the entry to health_alarm_execute().
// The message carries: chart (or "NOCHART" when the entry has no chart), alarm
// name, new value, old status and new status.
static inline void health_process_notifications(ALARM_ENTRY *ae) {
- info("Health alarm '%s.%s' = %0.2Lf - %s changed status from %s to %s",
+ info("Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
ae->chart?ae->chart:"NOCHART", ae->name,
ae->new_value,
- alarm_entry_type2string(ae->type),
rrdcalc_status2string(ae->old_status),
rrdcalc_status2string(ae->new_status)
);
health_alarm_execute(ae);
}
-static inline void health_alarm_log(time_t when, int type,
+static inline void health_alarm_log(time_t when,
const char *name, const char *chart, const char *exec,
time_t duration,
calculated_number old_value, calculated_number new_value,
) {
ALARM_ENTRY *ae = callocz(1, sizeof(ALARM_ENTRY));
ae->name = strdupz(name);
- if(chart) ae->chart = strdupz(chart);
+ ae->hash_name = simple_hash(ae->name);
+
+ if(chart) {
+ ae->chart = strdupz(chart);
+ ae->hash_chart = simple_hash(ae->chart);
+ }
+
if(exec) ae->exec = strdupz(exec);
if(source) ae->source = strdupz(source);
ae->id = health_log.nextid++;
ae->when = when;
- ae->type = type;
ae->old_value = old_value;
ae->new_value = new_value;
ae->old_status = old_status;
ae->new_status = new_status;
ae->duration = duration;
+
// link it
ae->next = health_log.alarms;
health_log.alarms = ae;
health_log.count++;
+
+ // match previous alarms
+ ALARM_ENTRY *t;
+ for(t = health_log.alarms ; t ; t = t->next) {
+ if(t != ae &&
+ t->hash_name == ae->hash_name &&
+ t->hash_chart == ae->hash_chart &&
+ !strcmp(t->name, ae->name) &&
+ t->chart && ae->chart && !strcmp(t->chart, ae->chart)) {
+ t->notifications |= HEALTH_ENTRY_NOTIFICATIONS_UPDATED;
+ t->updated_by = ae;
+ }
+ }
}
static inline void health_alarm_log_process(void) {
for(ae = health_log.alarms; ae ;ae = ae->next) {
if(last_processed >= ae->id) break;
- if(!(ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_PROCESSED)) {
+ if(!(ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_PROCESSED) &&
+ !(ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_UPDATED)) {
ae->notifications |= HEALTH_ENTRY_NOTIFICATIONS_PROCESSED;
health_process_notifications(ae);
}
}
}
-static inline void rrdcalc_check_warning_event(RRDCALC *rc) {
- calculated_number n = rc->warning->result;
-
- int old_status = rc->warning_status;
- int new_status = rrdcalc_value2status(n);
-
- if(new_status != old_status) {
- time_t now = time(NULL);
- health_alarm_log(time(NULL), ALARM_ENTRY_TYPE_WARNING, rc->name, rc->rrdset->id, rc->exec, now - rc->last_status_change, rc->old_value, rc->value, old_status, new_status, rc->source);
- rc->last_status_change = now;
- rc->warning_status = new_status;
- }
-}
-
-static inline void rrdcalc_check_critical_event(RRDCALC *rc) {
- calculated_number n = rc->critical->result;
-
- int old_status = rc->critical_status;
- int new_status = rrdcalc_value2status(n);
-
- if(new_status != old_status) {
- time_t now = time(NULL);
- health_alarm_log(time(NULL), ALARM_ENTRY_TYPE_CRITICAL, rc->name, rc->rrdset->id, rc->exec, now - rc->last_status_change, rc->old_value, rc->value, old_status, new_status, rc->source);
- rc->last_status_change = now;
- rc->critical_status = new_status;
- }
-}
-
void *health_main(void *ptr) {
(void)ptr;
if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run)))
continue;
+ int warning_status = RRDCALC_STATUS_UNDEFINED;
+ int critical_status = RRDCALC_STATUS_UNDEFINED;
+
if(unlikely(rc->warning)) {
if(unlikely(!expression_evaluate(rc->warning))) {
// calculation failed
buffer_tostring(rc->warning->error_msg),
rc->source
);
+ warning_status = rrdcalc_value2status(rc->warning->result);
}
-
- rrdcalc_check_warning_event(rc);
}
if(unlikely(rc->critical)) {
buffer_tostring(rc->critical->error_msg),
rc->source
);
+
+ critical_status = rrdcalc_value2status(rc->critical->result);
}
+ }
+
+ // Combine the warning and critical results into a single alarm status.
+ // Precedence, lowest to highest: UNDEFINED, CLEAR, WARNING, CRITICAL.
+ int status = RRDCALC_STATUS_UNDEFINED;
+
+ // an expression that evaluated to zero means the alarm is clear; without
+ // this the status could never become CLEAR and recoveries were never notified
+ if(warning_status == RRDCALC_STATUS_CLEAR || critical_status == RRDCALC_STATUS_CLEAR)
+ status = RRDCALC_STATUS_CLEAR;
+
+ if(warning_status == RRDCALC_STATUS_RAISED)
+ status = RRDCALC_STATUS_WARNING;
+
+ // critical is tested last, so it overrides warning
+ if(critical_status == RRDCALC_STATUS_RAISED)
+ status = RRDCALC_STATUS_CRITICAL;
- rrdcalc_check_critical_event(rc);
+ if(status != rc->status) {
+ health_alarm_log(time(NULL), rc->name, rc->rrdset->id, rc->exec, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source);
+ rc->last_status_change = now;
+ rc->status = status;
}
rc->last_updated = now;