echo >&2 "I cannot send emails - there is no sendmail command available."
fi
+default_recipient_for_all_roles="root"
+declare -A recipients=()
+if [ -f "${NETDATA_CONFIG_DIR}/health_email_recipients.conf" ]
+ then
+ source "${NETDATA_CONFIG_DIR}/health_email_recipients.conf"
+fi
+
sendmail_from_pipe() {
"${sendmail}" -t
fi
}
-name="${1}" # the name of the alarm, as given in netdata health.d entries
-chart="${2}" # the name of the chart (type.id)
-family="${3}" # the family of the chart
-status="${4}" # the current status : UNITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
-old_status="${5}" # the previous status: UNITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
-value="${6}" # the current value
-old_value="${7}" # the previous value
-src="${8}" # the line number and file the alarm has been configured
-duration="${9}" # the duration in seconds the previous state took
-non_clear_duration="${10}" # the total duration in seconds this is non-clear
-units="${11}" # the units of the value
-info="${12}" # a short description of the alarm
+recipient="${1}" # the recipient of the email
+hostname="${2}" # the hostname this event refers to
+unique_id="${3}" # the unique id of this event
+alarm_id="${4}" # the unique id of the alarm that generated this event
+event_id="${5}" # the incremental id of the event, for this alarm
+when="${6}" # the timestamp this event occurred
+name="${7}" # the name of the alarm, as given in netdata health.d entries
+chart="${8}" # the name of the chart (type.id)
+family="${9}" # the family of the chart
+status="${10}" # the current status : UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
+old_status="${11}" # the previous status: UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
+value="${12}" # the current value
+old_value="${13}" # the previous value
+src="${14}" # the line number and file the alarm has been configured
+duration="${15}" # the duration in seconds the previous state took
+non_clear_duration="${16}" # the total duration in seconds this is non-clear
+units="${17}" # the units of the value
+info="${18}" # a short description of the alarm
+
+to="${recipients[${recipient}]}"
+[ -z "${to}" ] && to="${default_recipient_for_all_roles}"
+[ -z "${to}" ] && to="root"
[ ! -z "${info}" ] && info=" <small><br/>${info}</small>"
# get the system hostname
-hostname="${NETDATA_HOSTNAME}"
+[ -z "${hostname}" ] && hostname="${NETDATA_HOSTNAME}"
[ -z "${hostname}" ] && hostname="${NETDATA_REGISTRY_HOSTNAME}"
-[ -z "${hostname}" ] && hostname="$(hostname)"
+[ -z "${hostname}" ] && hostname="$(hostname 2>/dev/null)"
goto_url="${NETDATA_REGISTRY_URL}/goto-host-from-alarm.html?machine_guid=${NETDATA_REGISTRY_UNIQUE_ID}&chart=${chart}&family=${family}"
-# get the current date
-date="$(date)"
+date="$(date --date=@${when} 2>/dev/null)"
+[ -z "${date}" ] && date="$(date 2>/dev/null)"
+# convert a duration in seconds to a human-readable duration
+# using DAYS, HOURS, MINUTES, SECONDS
duration4human() {
local s="${1}" d=0 h=0 m=0 ds="day" hs="hour" ms="minute" ss="second"
d=$(( s / 86400 ))
}
severity="${status}"
-raised_for="<br/>(was ${old_status,,} for $(duration4human ${duration}))"
+raised_for="<br/><small>(was ${old_status,,} for $(duration4human ${duration}))</small>"
status_message="status unknown"
color="grey"
alarm="${name} = ${value} ${units}"
severity="Recovered from ${old_status}"
if [ $non_clear_duration -gt $duration ]
then
- raised_for="<br/>(had issues for $(duration4human ${non_clear_duration}))"
+ raised_for="<br/><small>(had issues for $(duration4human ${non_clear_duration}))</small>"
fi
elif [ "${old_status}" = "WARNING" -a "${status}" = "CRITICAL" ]
severity="Escalated to ${status}"
if [ $non_clear_duration -gt $duration ]
then
- raised_for="<br/>(has issues for $(duration4human ${non_clear_duration}))"
+ raised_for="<br/><small>(has issues for $(duration4human ${non_clear_duration}))</small>"
fi
elif [ "${old_status}" = "CRITICAL" -a "${status}" = "WARNING" ]
severity="Demoted to ${status}"
if [ $non_clear_duration -gt $duration ]
then
- raised_for="<br/>(has issues for $(duration4human ${non_clear_duration}))"
+ raised_for="<br/><small>(has issues for $(duration4human ${non_clear_duration}))</small>"
fi
else
# send the email
cat <<EOF | sendmail_from_pipe
-To: root
+To: ${to}
Subject: ${hostname} ${status_message} - ${chart}.${name}
Content-Type: text/html
#define RRDVAR_MAX_LENGTH 1024
static const char *health_default_exec = PLUGINS_DIR "/alarm-email.sh";
+static const char *health_default_recipient = "root";
int health_enabled = 1;
// ----------------------------------------------------------------------------
const char *units, const char *info,
int group_method, int after, int before, int update_every, uint32_t options,
calculated_number green, calculated_number red,
- const char *exec, const char *source,
+ const char *exec, const char *recipient, const char *source,
const char *calc, const char *warn, const char *crit) {
char fullname[RRDVAR_MAX_LENGTH + 1];
return NULL;
RRDCALC *rc = callocz(1, sizeof(RRDCALC));
-
+ rc->id = host->health_log.next_alarm_id++;
+ rc->next_event_id = 1;
rc->name = strdupz(name);
rc->hash = simple_hash(rc->name);
-
rc->chart = strdupz(chart);
rc->hash_chart = simple_hash(rc->chart);
rc->options = options;
if(exec) rc->exec = strdupz(exec);
+ if(recipient) rc->recipient = strdupz(recipient);
if(source) rc->source = strdupz(source);
if(units) rc->units = strdupz(units);
if(info) rc->info = strdupz(info);
error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", chart, name, crit);
}
- debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s",
+ debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s",
(rc->chart)?rc->chart:"NOCHART",
rc->name,
(rc->exec)?rc->exec:"DEFAULT",
+ (rc->recipient)?rc->recipient:"DEFAULT",
rc->green,
rc->red,
rc->group,
freez(rc->family);
freez(rc->dimensions);
freez(rc->exec);
+ freez(rc->recipient);
freez(rc->source);
freez(rc->units);
freez(rc->info);
if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context)) {
RRDCALC *rc = rrdcalc_create(st->rrdhost, rt->name, st->id,
rt->dimensions, rt->units, rt->info, rt->group, rt->after, rt->before, rt->update_every, rt->options,
- rt->green, rt->red, rt->exec, rt->source,
+ rt->green, rt->red,
+ rt->exec, rt->recipient, rt->source,
(rt->calculation)?rt->calculation->source:NULL,
(rt->warning)?rt->warning->source:NULL,
(rt->critical)?rt->critical->source:NULL);
freez(rt->name);
freez(rt->exec);
+ freez(rt->recipient);
freez(rt->context);
freez(rt->source);
freez(rt->units);
#define HEALTH_WARN_KEY "warn"
#define HEALTH_CRIT_KEY "crit"
#define HEALTH_EXEC_KEY "exec"
+#define HEALTH_RECIPIENT_KEY "to"
#define HEALTH_UNITS_KEY "units"
#define HEALTH_INFO_KEY "info"
return 0;
}
- debug(D_HEALTH, "Health configuration adding alarm '%s.%s': exec '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s",
+ debug(D_HEALTH, "Health configuration adding alarm '%s.%s': exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s",
rc->chart?rc->chart:"NOCHART",
rc->name,
(rc->exec)?rc->exec:"DEFAULT",
+ (rc->recipient)?rc->recipient:"DEFAULT",
rc->green,
rc->red,
rc->group,
}
}
- debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s'",
+ debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s'",
rt->name,
(rt->context)?rt->context:"NONE",
(rt->exec)?rt->exec:"DEFAULT",
+ (rt->recipient)?rt->recipient:"DEFAULT",
rt->green,
rt->red,
rt->group,
int health_readfile(const char *path, const char *filename) {
debug(D_HEALTH, "Health configuration reading file '%s/%s'", path, filename);
- static uint32_t hash_alarm = 0, hash_template = 0, hash_on = 0, hash_calc = 0, hash_green = 0, hash_red = 0, hash_warn = 0, hash_crit = 0, hash_exec = 0, hash_every = 0, hash_lookup = 0, hash_units = 0, hash_info = 0;
+ static uint32_t hash_alarm = 0, hash_template = 0, hash_on = 0, hash_calc = 0, hash_green = 0, hash_red = 0, hash_warn = 0, hash_crit = 0, hash_exec = 0, hash_every = 0, hash_lookup = 0, hash_units = 0, hash_info = 0, hash_recipient = 0;
char buffer[HEALTH_CONF_MAX_LINE + 1];
if(unlikely(!hash_alarm)) {
hash_every = simple_uhash(HEALTH_EVERY_KEY);
hash_units = simple_hash(HEALTH_UNITS_KEY);
hash_info = simple_hash(HEALTH_INFO_KEY);
+ hash_recipient = simple_hash(HEALTH_RECIPIENT_KEY);
}
snprintfz(buffer, HEALTH_CONF_MAX_LINE, "%s/%s", path, filename);
}
rc = callocz(1, sizeof(RRDCALC));
+ rc->id = localhost.health_log.next_alarm_id++;
+ rc->next_event_id = 1;
rc->name = strdupz(value);
rc->hash = simple_hash(rc->name);
rc->source = health_source_file(line, path, filename);
}
rc->exec = strdupz(value);
}
+ else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
+ if(rc->recipient) {
+ if(strcmp(rc->recipient, value))
+ info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
+ line, path, filename, rc->name, key, rc->recipient, value, value);
+
+ freez(rc->recipient);
+ }
+ rc->recipient = strdupz(value);
+ }
else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
if(rc->units) {
if(strcmp(rc->units, value))
}
rt->exec = strdupz(value);
}
+ else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
+ if(rt->recipient) {
+ if(strcmp(rt->recipient, value))
+ info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
+ line, path, filename, rt->name, key, rt->recipient, value, value);
+
+ freez(rt->recipient);
+ }
+ rt->recipient = strdupz(value);
+ }
else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
if(rt->units) {
if(strcmp(rt->units, value))
static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae) {
buffer_sprintf(wb, "\n\t{\n"
- "\t\t\"id\":%u,\n"
+ "\t\t\"unique_id\":%u,\n"
+ "\t\t\"alarm_id\":%u,\n"
+ "\t\t\"alarm_event_id\":%u,\n"
"\t\t\"name\":\"%s\",\n"
"\t\t\"chart\":\"%s\",\n"
"\t\t\"family\":\"%s\",\n"
"\t\t\"exec_run\":%s,\n"
"\t\t\"exec_failed\":%s,\n"
"\t\t\"exec\":\"%s\",\n"
+ "\t\t\"recipient\":\"%s\",\n"
"\t\t\"exec_code\":%d,\n"
"\t\t\"source\":\"%s\",\n"
"\t\t\"units\":\"%s\",\n"
"\t\t\"non_clear_duration\":%lu,\n"
"\t\t\"status\":\"%s\",\n"
"\t\t\"old_status\":\"%s\",\n",
- ae->id,
+ ae->unique_id,
+ ae->alarm_id,
+ ae->alarm_event_id,
ae->name,
ae->chart,
ae->family,
(ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_EXEC_RUN)?"true":"false",
(ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_EXEC_FAILED)?"true":"false",
ae->exec?ae->exec:health_default_exec,
+ ae->recipient?ae->recipient:health_default_recipient,
ae->exec_code,
ae->source,
ae->units?ae->units:"",
"\t\t\t\"family\": \"%s\",\n"
"\t\t\t\"active\": %s,\n"
"\t\t\t\"exec\": \"%s\",\n"
+ "\t\t\t\"recipient\": \"%s\",\n"
"\t\t\t\"source\": \"%s\",\n"
"\t\t\t\"units\": \"%s\",\n"
"\t\t\t\"info\": \"%s\",\n"
, (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:""
, (rc->rrdset)?"true":"false"
, rc->exec?rc->exec:health_default_exec
+ , rc->recipient?rc->recipient:health_default_recipient
, rc->source
, rc->units?rc->units:""
, rc->info?rc->info:""
return RRDCALC_STATUS_CLEAR;
}
-static inline void health_alarm_execute(ALARM_ENTRY *ae) {
+static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
if(ae->old_status == RRDCALC_STATUS_UNINITIALIZED && ae->new_status == RRDCALC_STATUS_CLEAR)
return;
const char *exec = ae->exec;
if(!exec) exec = health_default_exec;
- snprintfz(buffer, FILENAME_MAX, "exec %s '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s'",
+ const char *recipient = ae->recipient;
+ if(!recipient) recipient = health_default_recipient;
+
+ snprintfz(buffer, FILENAME_MAX, "exec %s '%s' '%s' '%u' '%u' '%u' '%zu' '%s' '%s' '%s' '%s' '%s' '%0.1Lf' '%0.1Lf' '%s' '%u' '%u' '%s' '%s'",
exec,
+ recipient,
+ host->hostname,
+ ae->unique_id,
+ ae->alarm_id,
+ ae->alarm_event_id,
+ ae->when,
ae->name,
ae->chart?ae->chart:"NOCAHRT",
ae->family?ae->family:"NOFAMILY",
ae->notifications |= HEALTH_ENTRY_NOTIFICATIONS_EXEC_FAILED;
}
-static inline void health_process_notifications(ALARM_ENTRY *ae) {
+static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
info("Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
ae->chart?ae->chart:"NOCHART", ae->name,
ae->new_value,
rrdcalc_status2string(ae->new_status)
);
- health_alarm_execute(ae);
+ health_alarm_execute(host, ae);
}
-static inline void health_alarm_log(RRDHOST *host, time_t when,
+static inline void health_alarm_log(RRDHOST *host,
+ uint32_t alarm_id, uint32_t alarm_event_id,
+ time_t when,
const char *name, const char *chart, const char *family,
- const char *exec, time_t duration,
+ const char *exec, const char *recipient, time_t duration,
calculated_number old_value, calculated_number new_value,
int old_status, int new_status,
const char *source,
ae->family = strdupz(family);
if(exec) ae->exec = strdupz(exec);
+ if(recipient) ae->recipient = strdupz(recipient);
if(source) ae->source = strdupz(source);
if(units) ae->units = strdupz(units);
if(info) ae->info = strdupz(info);
- ae->id = host->health_log.nextid++;
+ ae->unique_id = host->health_log.next_log_id++;
+ ae->alarm_id = alarm_id;
+ ae->alarm_event_id = alarm_event_id;
ae->when = when;
ae->old_value = old_value;
ae->new_value = new_value;
pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
for(ae = host->health_log.alarms; ae ;ae = ae->next) {
- if(last_processed >= ae->id) break;
+ if(last_processed >= ae->unique_id) break;
if(!(ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_PROCESSED) &&
!(ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_UPDATED)) {
ae->notifications |= HEALTH_ENTRY_NOTIFICATIONS_PROCESSED;
- health_process_notifications(ae);
+ health_process_notifications(host, ae);
}
}
if(host->health_log.alarms)
- last_processed = host->health_log.alarms->id;
+ last_processed = host->health_log.alarms->unique_id;
pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
freez(ae->chart);
freez(ae->family);
freez(ae->exec);
+ freez(ae->recipient);
freez(ae->source);
freez(ae->units);
freez(ae->info);
}
if(status != rc->status) {
- health_alarm_log(&localhost, time(NULL), rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info);
+ health_alarm_log(&localhost, rc->id, rc->next_event_id++, time(NULL), rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info);
rc->last_status_change = now;
rc->status = status;
}