]> arthur.barton.de Git - netdata.git/commitdiff
added roles to receive email alarms
authorCosta Tsaousis (ktsaou) <costa@tsaousis.gr>
Sat, 3 Sep 2016 01:20:43 +0000 (04:20 +0300)
committerCosta Tsaousis (ktsaou) <costa@tsaousis.gr>
Sat, 3 Sep 2016 01:20:43 +0000 (04:20 +0300)
22 files changed:
conf.d/health.d/apache.conf
conf.d/health.d/cpu.conf
conf.d/health.d/disks.conf
conf.d/health.d/entropy.conf
conf.d/health.d/memcached.conf
conf.d/health.d/named.conf
conf.d/health.d/net.conf
conf.d/health.d/nginx.conf
conf.d/health.d/qos.conf
conf.d/health.d/ram.conf
conf.d/health.d/redis.conf
conf.d/health.d/squid.conf
conf.d/health.d/swap.conf
conf.d/health_email_recipients.conf [new file with mode: 0755]
plugins.d/alarm-email.sh
src/health.c
src/health.h
src/main.c
src/registry.c
src/rrd.c
src/rrd.h
src/rrd2json.c

index 1fddbc99f2fea924fcb8f084f5cf96613938545d..89e811018ee3083a5358e50d3f0cfe00b91cee2b 100644 (file)
@@ -9,5 +9,5 @@ template: apache_last_collected_secs
     crit: $this > (10 * $update_every)
    units: seconds ago
     info: number of seconds since the last successful data collection
-
+      to: webmaster
 
index 9332e508a3d7009cfab585dcb64a0859a0075eca..8caee3259cc61e4f38f97ad55362e9860da44894 100644 (file)
@@ -6,6 +6,7 @@ template: 5min_cpu_pcent
     warn: $this > 90
    units: %
     info: average cpu utilization for the last 5 minutes
+      to: sysadmin
 
 template: 5min_iowait_cpu_pcent
       on: system.cpu
@@ -14,6 +15,7 @@ template: 5min_iowait_cpu_pcent
     warn: $this > 10
    units: %
     info: average wait I/O for the last 5 minutes
+      to: sysadmin
 
 template: 20min_steal_cpu_pcent
       on: system.cpu
@@ -22,3 +24,4 @@ template: 20min_steal_cpu_pcent
     warn: $this > 10
    units: %
     info: average stolen CPU time for the last 20 minutes
+      to: sysadmin
index 2398e2c6429755636dbe3f3125a58100572e9470..7e98511feb4a7a5e14f9e3da62a15376873b0ee3 100644 (file)
@@ -13,6 +13,7 @@ template: disk_full_percent
     crit: $this > 95
    units: %
     info: current disk space usage
+      to: sysadmin
 
 
 # -----------------------------------------------------------------------------
@@ -47,6 +48,7 @@ template: disk_full_after_hours
     crit: $this > 0 and $this < 24
    units: hours
     info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last 30 minutes
+      to: sysadmin
 
 
 # -----------------------------------------------------------------------------
@@ -66,6 +68,7 @@ template: 10min_disk_utilization
     crit: $this > $red
    units: %
     info: the percentage of time the disk was busy, during the last 10 minutes
+      to: sysadmin
 
 
 # raise an alarm if the disk backlog
@@ -83,3 +86,4 @@ template: 10min_disk_backlog
     crit: $this > $red
    units: ms
     info: average of the kernel estimated disk backlog, for the last 10 minutes
+      to: sysadmin
index 6f8b6e85182fd4a162f6d19659d75e12d7056d18..fcbfb4cb54a93325dea342e5c0e760556e94cb31 100644 (file)
@@ -11,3 +11,4 @@
     crit: $this < 100
    units: entries
     info: minimum entries in the random numbers pool (entropy), for the last 30 minutes
+      to: sysadmin
index 05ff14711832ebd469ece4052a9259d712c53f57..152db9dea4745dfea70ed6abbac1b3bd3e2e455d 100644 (file)
@@ -9,6 +9,7 @@ template: memcached_last_collected_secs
     crit: $this > (10 * $update_every)
    units: seconds ago
     info: number of seconds since the last successful data collection
+      to: dba
 
 
 # detect if memcached cache is full
@@ -21,6 +22,7 @@ template: cache_full_pcent
     crit: $this > 90
    units: %
     info: current cache memory usage
+      to: dba
 
 
 # find the rate memcached cache is filling
@@ -44,3 +46,4 @@ template: cache_full_after_hours
     crit: $this > 0 and $this < 24
    units: hours
     info: estimated time the cache will run out of space, if the system continues to add data with the rate of the last 30 minutes
+      to: dba
index e46d1d33005e8da54cf73ad3adf196db82732613..997c6b94fa3bf0951fd0109ef5de14cf9c1c366e 100644 (file)
@@ -6,7 +6,8 @@ template: named_last_collected_secs
     calc: $now - $last_collected_t
    every: 10s
     warn: $this > ( 5 * $update_every)
-    crit: $this > (10 * $update_every)
+    crit: $this > (60 * $update_every)
    units: seconds ago
     info: number of seconds since the last successful data collection
+      to: domainadmin
 
index f65bc4fcb0783ac8a9d4a99c75e69f1b5c04a03a..209dbda6abca238ce6a487e853b8eb4828263915 100644 (file)
@@ -10,6 +10,7 @@ template: 30min_packet_drops
     crit: $this > 0
    units: packets
     info: dropped packets in the last 30 minutes
+      to: sysadmin
 
 
 # check if an interface is having FIFO
@@ -24,4 +25,5 @@ template: 30min_fifo_errors
     crit: $this > 0
    units: errors
     info: network interface fifo errors in the last 30 minutes
+      to: sysadmin
 
index da13008e396d92010a1caf17771700a002a54373..88073a1d148dc5e0201aabde1d8198c0344be456 100644 (file)
@@ -9,4 +9,5 @@ template: nginx_last_collected_secs
     crit: $this > (10 * $update_every)
    units: seconds ago
     info: number of seconds since the last successful data collection
+      to: webmaster
 
index ac3bf8ff4153fed9f91c3b73b1310b7b3287aa0f..bd438aa0b9673b5a274aa1bbbff2a001bbe48ace 100644 (file)
@@ -10,3 +10,4 @@
 #    warn: $this > 0
 #   units: packets
 #    info: dropped packets in the last 30 minutes
+#      to: sysadmin
index 1d368112838464466a7231f54c412649282a6092..8585dfa69b6ff84f2d8632ff5e73f359bf0df366 100644 (file)
@@ -7,3 +7,4 @@
     crit: $this > 90
    units: %
     info: system RAM usage
+      to: sysadmin
index 3750176c5f5864d055d25a0b34783090d0a57ce1..cdeae4a5bd25b4b98b99f90baa33f19499cd9676 100644 (file)
@@ -6,7 +6,8 @@ template: redis_last_collected_secs
     calc: $now - $last_collected_t
    every: 10s
     warn: $this > ( 5 * $update_every)
-    crit: $this > (10 * $update_every)
+    crit: $this > (60 * $update_every)
    units: seconds ago
     info: number of seconds since the last successful data collection
+      to: dba
 
index cc5ce1c3a45fbefe0786ca5035e14fc1a25252d6..8fecce2312324ed301078a0effc06277e98c39c4 100644 (file)
@@ -6,7 +6,8 @@ template: squid_last_collected_secs
     calc: $now - $last_collected_t
    every: 10s
     warn: $this > ( 5 * $update_every)
-    crit: $this > (10 * $update_every)
+    crit: $this > (60 * $update_every)
    units: seconds ago
     info: number of seconds since the last successful data collection
+      to: proxyadmin
 
index 1420565bddfc3010b99477e4cf8697053ea4c71c..f762b03079371a135dfc4db953d4a10cfc8f8119 100644 (file)
@@ -9,6 +9,7 @@
     crit: $this > 20
    units: % of RAM
     info: the amount of memory swapped in the last 30 minutes, as a percentage of the system RAM
+      to: sysadmin
 
    alarm: pcent_of_ram_in_swap
       on: system.swap
@@ -18,3 +19,4 @@
     crit: $this > 50
    units: % of RAM
     info: the swap memory used, as a percentage of the system RAM
+      to: sysadmin
diff --git a/conf.d/health_email_recipients.conf b/conf.d/health_email_recipients.conf
new file mode 100755 (executable)
index 0000000..e485591
--- /dev/null
@@ -0,0 +1,53 @@
+# Configuration for alarms recipients
+
+# netdata alarms have been categorized to allow different roles to receive
+# alarms related to their work.
+
+# this file defines the email addresses for each role. if a role is not
+# defined, the email is sent to default_recipient_for_all_roles (root, unless changed).
+
+# you can set multiple addresses for each role, like this:
+#
+# recipients[sysadmin]="admin1@example.com, admin2@example.com"
+#
+# it is important to add the comma between email addresses.
+
+# This configuration file is a BASH script itself. The 'recipients' variable is
+# an associative array. So you can use other variables too, like this one:
+
+default_recipient_for_all_roles="root"
+
+
+# -----------------------------------------------------------------------------
+# generic system alarms
+# CPU, disks, entropy, etc
+
+recipients[sysadmin]="${default_recipient_for_all_roles}"
+
+
+# -----------------------------------------------------------------------------
+# DNS related alarms
+
+recipients[domainadmin]="${default_recipient_for_all_roles}"
+
+
+# -----------------------------------------------------------------------------
+# database servers alarms
+# mysql, redis, memcached, etc
+
+recipients[dba]="${default_recipient_for_all_roles}"
+
+
+# -----------------------------------------------------------------------------
+# web servers alarms
+# apache, nginx, etc
+
+recipients[webmaster]="${default_recipient_for_all_roles}"
+
+
+# -----------------------------------------------------------------------------
+# proxy servers alarms
+# squid, etc
+
+recipients[proxyadmin]="${default_recipient_for_all_roles}"
+
index 78c79ccdb72359088918f56ba0fa10018bea93ea..3f1176b156f5a2475044f198f05c022f50ee920a 100755 (executable)
@@ -8,6 +8,13 @@ then
     echo >&2 "I cannot send emails - there is no sendmail command available."
 fi
 
+default_recipient_for_all_roles="root"
+declare -A recipients=()
+if [ -f "${NETDATA_CONFIG_DIR}/health_email_recipients.conf" ]
+    then
+    source "${NETDATA_CONFIG_DIR}/health_email_recipients.conf"
+fi
+
 sendmail_from_pipe() {
     "${sendmail}" -t
 
@@ -21,31 +28,43 @@ sendmail_from_pipe() {
     fi
 }
 
-name="${1}"       # the name of the alarm, as given in netdata health.d entries
-chart="${2}"      # the name of the chart (type.id)
-family="${3}"     # the family of the chart
-status="${4}"     # the current status : UNITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
-old_status="${5}" # the previous status: UNITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
-value="${6}"      # the current value
-old_value="${7}"  # the previous value
-src="${8}"        # the line number and file the alarm has been configured
-duration="${9}"   # the duration in seconds the previous state took
-non_clear_duration="${10}" # the total duration in seconds this is non-clear
-units="${11}"     # the units of the value
-info="${12}"      # a short description of the alarm
+recipient="${1}"   # the recipient of the email (a role name, looked up in recipients[])
+hostname="${2}"    # the hostname this event refers to
+unique_id="${3}"   # the unique id of this event
+alarm_id="${4}"    # the unique id of the alarm that generated this event
+event_id="${5}"    # the incremental id of the event, for this alarm
+when="${6}"        # the timestamp this event occurred
+name="${7}"        # the name of the alarm, as given in netdata health.d entries
+chart="${8}"       # the name of the chart (type.id)
+family="${9}"      # the family of the chart
+status="${10}"     # the current status : UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
+old_status="${11}" # the previous status: UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
+value="${12}"      # the current value
+old_value="${13}"  # the previous value
+src="${14}"        # the line number and file the alarm has been configured
+duration="${15}"   # the duration in seconds the previous state took
+non_clear_duration="${16}" # the total duration in seconds this is non-clear
+units="${17}"      # the units of the value
+info="${18}"       # a short description of the alarm
+
+to="${recipients[${recipient}]}"
+[ -z "${to}" ] && to="${default_recipient_for_all_roles}"
+[ -z "${to}" ] && to="root"
 
 [ ! -z "${info}" ] && info=" <small><br/>${info}</small>"
 
 # get the system hostname
-hostname="${NETDATA_HOSTNAME}"
+[ -z "${hostname}" ] && hostname="${NETDATA_HOSTNAME}"
 [ -z "${hostname}" ] && hostname="${NETDATA_REGISTRY_HOSTNAME}"
-[ -z "${hostname}" ] && hostname="$(hostname)"
+[ -z "${hostname}" ] && hostname="$(hostname 2>/dev/null)"
 
 goto_url="${NETDATA_REGISTRY_URL}/goto-host-from-alarm.html?machine_guid=${NETDATA_REGISTRY_UNIQUE_ID}&chart=${chart}&family=${family}"
 
-# get the current date
-date="$(date)"
+date="$(date --date=@${when} 2>/dev/null)"
+[ -z "${date}" ] && date="$(date 2>/dev/null)"
 
+# convert a duration in seconds, to a human readable duration
+# using DAYS, HOURS, MINUTES, SECONDS
 duration4human() {
     local s="${1}" d=0 h=0 m=0 ds="day" hs="hour" ms="minute" ss="second"
     d=$(( s / 86400 ))
@@ -94,7 +113,7 @@ duration4human() {
 }
 
 severity="${status}"
-raised_for="<br/>(was ${old_status,,} for $(duration4human ${duration}))"
+raised_for="<br/><small>(was ${old_status,,} for $(duration4human ${duration}))</small>"
 status_message="status unknown"
 color="grey"
 alarm="${name} = ${value} ${units}"
@@ -136,7 +155,7 @@ then
     severity="Recovered from ${old_status}"
     if [ $non_clear_duration -gt $duration ]
     then
-        raised_for="<br/>(had issues for $(duration4human ${non_clear_duration}))"
+        raised_for="<br/><small>(had issues for $(duration4human ${non_clear_duration}))</small>"
     fi
 
 elif [ "${old_status}" = "WARNING" -a "${status}" = "CRITICAL" ]
@@ -144,7 +163,7 @@ then
     severity="Escalated to ${status}"
     if [ $non_clear_duration -gt $duration ]
     then
-        raised_for="<br/>(has issues for $(duration4human ${non_clear_duration}))"
+        raised_for="<br/><small>(has issues for $(duration4human ${non_clear_duration}))</small>"
     fi
 
 elif [ "${old_status}" = "CRITICAL" -a "${status}" = "WARNING" ]
@@ -152,7 +171,7 @@ then
     severity="Demoted to ${status}"
     if [ $non_clear_duration -gt $duration ]
     then
-        raised_for="<br/>(has issues for $(duration4human ${non_clear_duration}))"
+        raised_for="<br/><small>(has issues for $(duration4human ${non_clear_duration}))</small>"
     fi
 
 else
@@ -161,7 +180,7 @@ fi
 
 # send the email
 cat <<EOF | sendmail_from_pipe
-To: root
+To: ${to}
 Subject: ${hostname} ${status_message} - ${chart}.${name}
 Content-Type: text/html
 
index 4adac154477238eb9fe1bef5b034a606c9dbb38b..2b2c720b76c10722910a085c0f5b6cfe1ec4bb12 100644 (file)
@@ -3,6 +3,7 @@
 #define RRDVAR_MAX_LENGTH 1024
 
 static const char *health_default_exec = PLUGINS_DIR "/alarm-email.sh";
+static const char *health_default_recipient = "root";
 int health_enabled = 1;
 
 // ----------------------------------------------------------------------------
@@ -618,7 +619,7 @@ static inline RRDCALC *rrdcalc_create(RRDHOST *host, const char *name, const cha
                         const char *units, const char *info,
                         int group_method, int after, int before, int update_every, uint32_t options,
                         calculated_number green, calculated_number red,
-                        const char *exec, const char *source,
+                        const char *exec, const char *recipient, const char *source,
                         const char *calc, const char *warn, const char *crit) {
 
     char fullname[RRDVAR_MAX_LENGTH + 1];
@@ -628,10 +629,10 @@ static inline RRDCALC *rrdcalc_create(RRDHOST *host, const char *name, const cha
         return NULL;
 
     RRDCALC *rc = callocz(1, sizeof(RRDCALC));
-
+    rc->id = host->health_log.next_alarm_id++;
+    rc->next_event_id = 1;
     rc->name = strdupz(name);
     rc->hash = simple_hash(rc->name);
-
     rc->chart = strdupz(chart);
     rc->hash_chart = simple_hash(rc->chart);
 
@@ -649,6 +650,7 @@ static inline RRDCALC *rrdcalc_create(RRDHOST *host, const char *name, const cha
     rc->options = options;
 
     if(exec) rc->exec = strdupz(exec);
+    if(recipient) rc->recipient = strdupz(recipient);
     if(source) rc->source = strdupz(source);
     if(units) rc->units = strdupz(units);
     if(info) rc->info = strdupz(info);
@@ -669,10 +671,11 @@ static inline RRDCALC *rrdcalc_create(RRDHOST *host, const char *name, const cha
             error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", chart, name, crit);
     }
 
-    debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s",
+    debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s",
           (rc->chart)?rc->chart:"NOCHART",
           rc->name,
           (rc->exec)?rc->exec:"DEFAULT",
+          (rc->recipient)?rc->recipient:"DEFAULT",
           rc->green,
           rc->red,
           rc->group,
@@ -724,6 +727,7 @@ void rrdcalc_free(RRDHOST *host, RRDCALC *rc) {
     freez(rc->family);
     freez(rc->dimensions);
     freez(rc->exec);
+    freez(rc->recipient);
     freez(rc->source);
     freez(rc->units);
     freez(rc->info);
@@ -740,7 +744,8 @@ void rrdcalctemplate_link_matching(RRDSET *st) {
         if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context)) {
             RRDCALC *rc = rrdcalc_create(st->rrdhost, rt->name, st->id,
                            rt->dimensions, rt->units, rt->info, rt->group, rt->after, rt->before, rt->update_every, rt->options,
-                           rt->green, rt->red, rt->exec, rt->source,
+                           rt->green, rt->red,
+                           rt->exec, rt->recipient, rt->source,
                            (rt->calculation)?rt->calculation->source:NULL,
                            (rt->warning)?rt->warning->source:NULL,
                            (rt->critical)?rt->critical->source:NULL);
@@ -781,6 +786,7 @@ static inline void rrdcalctemplate_free(RRDHOST *host, RRDCALCTEMPLATE *rt) {
 
     freez(rt->name);
     freez(rt->exec);
+    freez(rt->recipient);
     freez(rt->context);
     freez(rt->source);
     freez(rt->units);
@@ -805,6 +811,7 @@ static inline void rrdcalctemplate_free(RRDHOST *host, RRDCALCTEMPLATE *rt) {
 #define HEALTH_WARN_KEY "warn"
 #define HEALTH_CRIT_KEY "crit"
 #define HEALTH_EXEC_KEY "exec"
+#define HEALTH_RECIPIENT_KEY "to"
 #define HEALTH_UNITS_KEY "units"
 #define HEALTH_INFO_KEY "info"
 
@@ -832,10 +839,11 @@ static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
         return 0;
     }
 
-    debug(D_HEALTH, "Health configuration adding alarm '%s.%s': exec '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s",
+    debug(D_HEALTH, "Health configuration adding alarm '%s.%s': exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s",
           rc->chart?rc->chart:"NOCHART",
           rc->name,
           (rc->exec)?rc->exec:"DEFAULT",
+          (rc->recipient)?rc->recipient:"DEFAULT",
           rc->green,
           rc->red,
           rc->group,
@@ -878,10 +886,11 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL
         }
     }
 
-    debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s'",
+    debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green %Lf, red %Lf, lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s'",
           rt->name,
           (rt->context)?rt->context:"NONE",
           (rt->exec)?rt->exec:"DEFAULT",
+          (rt->recipient)?rt->recipient:"DEFAULT",
           rt->green,
           rt->red,
           rt->group,
@@ -1067,7 +1076,7 @@ static inline void strip_quotes(char *s) {
 int health_readfile(const char *path, const char *filename) {
     debug(D_HEALTH, "Health configuration reading file '%s/%s'", path, filename);
 
-    static uint32_t hash_alarm = 0, hash_template = 0, hash_on = 0, hash_calc = 0, hash_green = 0, hash_red = 0, hash_warn = 0, hash_crit = 0, hash_exec = 0, hash_every = 0, hash_lookup = 0, hash_units = 0, hash_info = 0;
+    static uint32_t hash_alarm = 0, hash_template = 0, hash_on = 0, hash_calc = 0, hash_green = 0, hash_red = 0, hash_warn = 0, hash_crit = 0, hash_exec = 0, hash_every = 0, hash_lookup = 0, hash_units = 0, hash_info = 0, hash_recipient = 0;
     char buffer[HEALTH_CONF_MAX_LINE + 1];
 
     if(unlikely(!hash_alarm)) {
@@ -1084,6 +1093,7 @@ int health_readfile(const char *path, const char *filename) {
         hash_every = simple_uhash(HEALTH_EVERY_KEY);
         hash_units = simple_hash(HEALTH_UNITS_KEY);
         hash_info = simple_hash(HEALTH_INFO_KEY);
+        hash_recipient = simple_hash(HEALTH_RECIPIENT_KEY);
     }
 
     snprintfz(buffer, HEALTH_CONF_MAX_LINE, "%s/%s", path, filename);
@@ -1155,6 +1165,8 @@ int health_readfile(const char *path, const char *filename) {
             }
 
             rc = callocz(1, sizeof(RRDCALC));
+            rc->id = localhost.health_log.next_alarm_id++;
+            rc->next_event_id = 1;
             rc->name = strdupz(value);
             rc->hash = simple_hash(rc->name);
             rc->source = health_source_file(line, path, filename);
@@ -1261,6 +1273,16 @@ int health_readfile(const char *path, const char *filename) {
                 }
                 rc->exec = strdupz(value);
             }
+            else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
+                if(rc->recipient) {
+                    if(strcmp(rc->recipient, value))
+                        info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
+                             line, path, filename, rc->name, key, rc->recipient, value, value);
+
+                    freez(rc->recipient);
+                }
+                rc->recipient = strdupz(value);
+            }
             else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
                 if(rc->units) {
                     if(strcmp(rc->units, value))
@@ -1363,6 +1385,16 @@ int health_readfile(const char *path, const char *filename) {
                 }
                 rt->exec = strdupz(value);
             }
+            else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
+                if(rt->recipient) {
+                    if(strcmp(rt->recipient, value))
+                        info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
+                             line, path, filename, rt->name, key, rt->recipient, value, value);
+
+                    freez(rt->recipient);
+                }
+                rt->recipient = strdupz(value);
+            }
             else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
                 if(rt->units) {
                     if(strcmp(rt->units, value))
@@ -1493,7 +1525,9 @@ static inline void health_string2json(BUFFER *wb, const char *prefix, const char
 
 static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae) {
     buffer_sprintf(wb, "\n\t{\n"
-                           "\t\t\"id\":%u,\n"
+                           "\t\t\"unique_id\":%u,\n"
+                           "\t\t\"alarm_id\":%u,\n"
+                           "\t\t\"alarm_event_id\":%u,\n"
                            "\t\t\"name\":\"%s\",\n"
                            "\t\t\"chart\":\"%s\",\n"
                            "\t\t\"family\":\"%s\",\n"
@@ -1502,6 +1536,7 @@ static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae) {
                            "\t\t\"exec_run\":%s,\n"
                            "\t\t\"exec_failed\":%s,\n"
                            "\t\t\"exec\":\"%s\",\n"
+                           "\t\t\"recipient\":\"%s\",\n"
                            "\t\t\"exec_code\":%d,\n"
                            "\t\t\"source\":\"%s\",\n"
                            "\t\t\"units\":\"%s\",\n"
@@ -1511,7 +1546,9 @@ static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae) {
                            "\t\t\"non_clear_duration\":%lu,\n"
                            "\t\t\"status\":\"%s\",\n"
                            "\t\t\"old_status\":\"%s\",\n",
-                   ae->id,
+                   ae->unique_id,
+                   ae->alarm_id,
+                   ae->alarm_event_id,
                    ae->name,
                    ae->chart,
                    ae->family,
@@ -1520,6 +1557,7 @@ static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae) {
                    (ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_EXEC_RUN)?"true":"false",
                    (ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_EXEC_FAILED)?"true":"false",
                    ae->exec?ae->exec:health_default_exec,
+                   ae->recipient?ae->recipient:health_default_recipient,
                    ae->exec_code,
                    ae->source,
                    ae->units?ae->units:"",
@@ -1568,6 +1606,7 @@ static inline void health_rrdcalc2json_nolock(BUFFER *wb, RRDCALC *rc) {
                    "\t\t\t\"family\": \"%s\",\n"
                    "\t\t\t\"active\": %s,\n"
                    "\t\t\t\"exec\": \"%s\",\n"
+                   "\t\t\t\"recipient\": \"%s\",\n"
                    "\t\t\t\"source\": \"%s\",\n"
                    "\t\t\t\"units\": \"%s\",\n"
                    "\t\t\t\"info\": \"%s\",\n"
@@ -1582,6 +1621,7 @@ static inline void health_rrdcalc2json_nolock(BUFFER *wb, RRDCALC *rc) {
             , (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:""
             , (rc->rrdset)?"true":"false"
             , rc->exec?rc->exec:health_default_exec
+            , rc->recipient?rc->recipient:health_default_recipient
             , rc->source
             , rc->units?rc->units:""
             , rc->info?rc->info:""
@@ -1729,7 +1769,7 @@ static inline int rrdcalc_value2status(calculated_number n) {
     return RRDCALC_STATUS_CLEAR;
 }
 
-static inline void health_alarm_execute(ALARM_ENTRY *ae) {
+static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
     if(ae->old_status == RRDCALC_STATUS_UNINITIALIZED && ae->new_status == RRDCALC_STATUS_CLEAR)
         return;
 
@@ -1739,8 +1779,17 @@ static inline void health_alarm_execute(ALARM_ENTRY *ae) {
     const char *exec = ae->exec;
     if(!exec) exec = health_default_exec;
 
-    snprintfz(buffer, FILENAME_MAX, "exec %s '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s'",
+    const char *recipient = ae->recipient;
+    if(!recipient) recipient = health_default_recipient;
+
+    snprintfz(buffer, FILENAME_MAX, "exec %s '%s' '%s' '%u' '%u' '%u' '%zu' '%s' '%s' '%s' '%s' '%s' '%0.1Lf' '%0.1Lf' '%s' '%u' '%u' '%s' '%s'",
               exec,
+              recipient,
+              host->hostname,
+              ae->unique_id,
+              ae->alarm_id,
+              ae->alarm_event_id,
+              ae->when,
               ae->name,
               ae->chart?ae->chart:"NOCAHRT",
               ae->family?ae->family:"NOFAMILY",
@@ -1774,7 +1823,7 @@ static inline void health_alarm_execute(ALARM_ENTRY *ae) {
         ae->notifications |= HEALTH_ENTRY_NOTIFICATIONS_EXEC_FAILED;
 }
 
-static inline void health_process_notifications(ALARM_ENTRY *ae) {
+static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
     info("Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s",
          ae->chart?ae->chart:"NOCHART", ae->name,
          ae->new_value,
@@ -1782,12 +1831,14 @@ static inline void health_process_notifications(ALARM_ENTRY *ae) {
          rrdcalc_status2string(ae->new_status)
     );
 
-    health_alarm_execute(ae);
+    health_alarm_execute(host, ae);
 }
 
-static inline void health_alarm_log(RRDHOST *host, time_t when,
+static inline void health_alarm_log(RRDHOST *host,
+                uint32_t alarm_id, uint32_t alarm_event_id,
+                time_t when,
                 const char *name, const char *chart, const char *family,
-                const char *exec, time_t duration,
+                const char *exec, const char *recipient, time_t duration,
                 calculated_number old_value, calculated_number new_value,
                 int old_status, int new_status,
                 const char *source,
@@ -1807,11 +1858,14 @@ static inline void health_alarm_log(RRDHOST *host, time_t when,
         ae->family = strdupz(family);
 
     if(exec) ae->exec = strdupz(exec);
+    if(recipient) ae->recipient = strdupz(recipient);
     if(source) ae->source = strdupz(source);
     if(units) ae->units = strdupz(units);
     if(info) ae->info = strdupz(info);
 
-    ae->id = host->health_log.nextid++;
+    ae->unique_id = host->health_log.next_log_id++;
+    ae->alarm_id = alarm_id;
+    ae->alarm_event_id = alarm_event_id;
     ae->when = when;
     ae->old_value = old_value;
     ae->new_value = new_value;
@@ -1863,17 +1917,17 @@ static inline void health_alarm_log_process(RRDHOST *host) {
     pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
 
     for(ae = host->health_log.alarms; ae ;ae = ae->next) {
-        if(last_processed >= ae->id) break;
+        if(last_processed >= ae->unique_id) break;
 
         if(!(ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_PROCESSED) &&
                 !(ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_UPDATED)) {
             ae->notifications |= HEALTH_ENTRY_NOTIFICATIONS_PROCESSED;
-            health_process_notifications(ae);
+            health_process_notifications(host, ae);
         }
     }
 
     if(host->health_log.alarms)
-        last_processed = host->health_log.alarms->id;
+        last_processed = host->health_log.alarms->unique_id;
 
     pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock);
 
@@ -1899,6 +1953,7 @@ static inline void health_alarm_log_process(RRDHOST *host) {
         freez(ae->chart);
         freez(ae->family);
         freez(ae->exec);
+        freez(ae->recipient);
         freez(ae->source);
         freez(ae->units);
         freez(ae->info);
@@ -2175,7 +2230,7 @@ void *health_main(void *ptr) {
                 }
 
                 if(status != rc->status) {
-                    health_alarm_log(&localhost, time(NULL), rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info);
+                    health_alarm_log(&localhost, rc->id, rc->next_event_id++, time(NULL), rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info);
                     rc->last_status_change = now;
                     rc->status = status;
                 }
index ef1158a29812bdead936cd69de4028ee88318232..567e1153974b233c4ff3e001c0fb3bf5bb63f68d 100644 (file)
@@ -120,10 +120,14 @@ typedef struct rrddimvar {
 #define RRDCALC_FLAG_CRIT_ERROR    0x00000020
 
 typedef struct rrdcalc {
+    uint32_t id;
+    uint32_t next_event_id;
+
     char *name;
     uint32_t hash;
 
     char *exec;
+    char *recipient;
 
     char *chart;        // the chart id this should be linked to
     uint32_t hash_chart;
@@ -182,6 +186,7 @@ typedef struct rrdcalctemplate {
     uint32_t hash_name;
 
     char *exec;
+    char *recipient;
 
     char *context;
     uint32_t hash_context;
@@ -216,7 +221,9 @@ typedef struct rrdcalctemplate {
 #define HEALTH_ENTRY_NOTIFICATIONS_EXEC_FAILED  0x00000008
 
 typedef struct alarm_entry {
-    uint32_t id;
+    uint32_t unique_id;
+    uint32_t alarm_id;
+    uint32_t alarm_event_id;
 
     time_t when;
     time_t duration;
@@ -231,6 +238,7 @@ typedef struct alarm_entry {
     char *family;
 
     char *exec;
+    char *recipient;
     int exec_code;
 
     char *source;
@@ -249,7 +257,8 @@ typedef struct alarm_entry {
 } ALARM_ENTRY;
 
 typedef struct alarm_log {
-    uint32_t nextid;
+    uint32_t next_log_id;
+    uint32_t next_alarm_id;
     unsigned int count;
     unsigned int max;
     ALARM_ENTRY *alarms;
index a751e291d68c3a502861a616277a1a36c29c4295..3db28e4f81e90e5d0783fca789ca525076dcb5e7 100644 (file)
@@ -276,6 +276,7 @@ static const char *verify_required_directory(const char *dir) {
 
 int main(int argc, char **argv)
 {
+    char *hostname = "localhost";
     int i, check_config = 0;
     int config_loaded = 0;
     int dont_fork = 0;
@@ -646,6 +647,11 @@ int main(int argc, char **argv)
             info("Successfully set pthread stacksize to %zu bytes", wanted_stacksize);
     }
 
+    // ------------------------------------------------------------------------
+    // initialize rrd host
+
+    rrdhost_init(hostname);
+
     // ------------------------------------------------------------------------
     // initialize the registry
 
index d64a9437ed331227643681df2e2d38e3f1a038cd..1b8a71442d6633075f766aecc6910b89ea299542 100644 (file)
@@ -1652,7 +1652,7 @@ int registry_init(void) {
     registry.persons_expiration = config_get_number("registry", "registry expire idle persons days", 365) * 86400;
     registry.registry_domain = config_get("registry", "registry domain", "");
     registry.registry_to_announce = config_get("registry", "registry to announce", "https://registry.my-netdata.io");
-    registry.hostname = config_get("registry", "registry hostname", config_get("global", "hostname", hostname));
+    registry.hostname = config_get("registry", "registry hostname", config_get("global", "hostname", localhost.hostname));
     registry.verify_cookies_redirects = config_get_boolean("registry", "verify browser cookies support", 1);
 
     setenv("NETDATA_REGISTRY_HOSTNAME", registry.hostname, 1);
index 61f99eda8eba99674c20033f97a231d6fdc216ab..6be65c7fc8f115960da4a708907660da1923e48f 100644 (file)
--- a/src/rrd.c
+++ b/src/rrd.c
@@ -42,7 +42,8 @@ RRDHOST localhost = {
             AVL_LOCK_INITIALIZER
         },
         .health_log = {
-            .nextid = 1,
+            .next_log_id = 1,
+            .next_alarm_id = 1,
             .count = 0,
             .max = 1000,
             .alarms = NULL,
@@ -50,6 +51,12 @@ RRDHOST localhost = {
         }
 };
 
+void rrdhost_init(char *hostname) { // set localhost's hostname and seed its health-log id counters
+    localhost.hostname = hostname; // stores the caller's pointer as-is; no copy is made here
+    localhost.health_log.next_log_id =
+        localhost.health_log.next_alarm_id = time(NULL); // both id counters start from the current time
+}
+
 void rrdhost_rwlock(RRDHOST *host) {
     pthread_rwlock_wrlock(&host->rrdset_root_rwlock);
 }
index 108df0ce71d649e1626c7f33665bb51125909d56..92a65fe8a7beb77a54032480f12cf54390e4b541 100644 (file)
--- a/src/rrd.h
+++ b/src/rrd.h
@@ -310,6 +310,7 @@ struct rrdhost {
 };
 typedef struct rrdhost RRDHOST;
 extern RRDHOST localhost;
+extern void rrdhost_init(char *hostname);
 
 #ifdef NETDATA_INTERNAL_CHECKS
 #define rrdhost_check_wrlock(host) rrdhost_check_wrlock_int(host, __FILE__, __FUNCTION__, __LINE__)
index 9009a8b1dac04e9688e793834fa7c31886d6f5f4..62ce949734557120d66c2ce1376b6fa7d7d92d50 100644 (file)
@@ -1,8 +1,5 @@
 #include "common.h"
 
-#define HOSTNAME_MAX 1024
-char *hostname = "unknown";
-
 void rrd_stats_api_v1_chart(RRDSET *st, BUFFER *wb)
 {
     pthread_rwlock_rdlock(&st->rwlock);
@@ -84,7 +81,7 @@ void rrd_stats_api_v1_charts(BUFFER *wb)
         ",\n\t\"update_every\": %d"
         ",\n\t\"history\": %d"
         ",\n\t\"charts\": {"
-        , hostname
+        , localhost.hostname
         , rrd_update_every
         , rrd_default_history_entries
         );
@@ -246,7 +243,7 @@ void rrd_stats_all_json(BUFFER *wb)
         "\t\"history\": %d,\n"
         "\t\"memory\": %lu\n"
         "}\n"
-        , hostname
+        , localhost.hostname
         , rrd_update_every
         , rrd_default_history_entries
         , memory