arthur.barton.de Git - netdata.git/commitdiff
detect ECC memory correctable and uncorrectable errors; fixes #1508
author Costa Tsaousis (ktsaou) <costa@tsaousis.gr>
Wed, 11 Jan 2017 21:24:07 +0000 (23:24 +0200)
committer Costa Tsaousis (ktsaou) <costa@tsaousis.gr>
Wed, 11 Jan 2017 21:24:07 +0000 (23:24 +0200)
CMakeLists.txt
conf.d/health.d/memory.conf [new file with mode: 0644]
src/Makefile.am
src/plugin_proc.c
src/plugin_proc.h
src/sys_devices_system_edac_mc.c [new file with mode: 0644]

index f4eb2872ff3bd047def6205bf1103143781714c7..49ae6b0c88d6711250c27b134a32871ad3a7645a 100755 (executable)
@@ -101,7 +101,7 @@ set(NETDATA_SOURCE_FILES
         src/registry_person.c
         src/registry_person.h
         src/registry_machine.c
-        src/registry_machine.h src/registry_internals.c src/registry_init.c src/registry_db.c src/registry_log.c src/proc_uptime.c)
+        src/registry_machine.h src/registry_internals.c src/registry_init.c src/registry_db.c src/registry_log.c src/proc_uptime.c src/sys_devices_system_edac_mc.c)
 
 set(APPS_PLUGIN_SOURCE_FILES
         src/appconfig.c
diff --git a/conf.d/health.d/memory.conf b/conf.d/health.d/memory.conf
new file mode 100644 (file)
index 0000000..3c904f6
--- /dev/null
@@ -0,0 +1,30 @@
+
+   alarm: 1hour_ecc_memory_correctable
+      on: mem.ecc_ce
+  lookup: sum -10m unaligned
+   units: errors
+   every: 1m
+    warn: $this > 0
+   delay: down 1h multiplier 1.5 max 1h
+    info: number of ECC correctable errors during the last 10 minutes
+      to: sysadmin
+
+   alarm: 1hour_ecc_memory_uncorrectable
+      on: mem.ecc_ue
+  lookup: sum -10m unaligned
+   units: errors
+   every: 1m
+    crit: $this > 0
+   delay: down 1h multiplier 1.5 max 1h
+    info: number of ECC uncorrectable errors during the last 10 minutes
+      to: sysadmin
+
+   alarm: 1hour_memory_hw_corrupted
+      on: mem.hwcorrupt
+    calc: $HardwareCorrupted
+   units: MB
+   every: 10s
+    warn: $this > 0
+   delay: down 1h multiplier 1.5 max 1h
+    info: amount of memory corrupted due to a hardware failure
+      to: sysadmin
index 12f296c1f8503d9a47f19c31d4df95636e973bf5..5533c95167cfb72f5947c90785f54a7630a621c1 100644 (file)
@@ -47,6 +47,7 @@ netdata_SOURCES = \
        popen.c popen.h \
        socket.c socket.h \
        sys_fs_cgroup.c \
+       sys_devices_system_edac_mc.c \
        procfile.c procfile.h \
        proc_self_mountinfo.c proc_self_mountinfo.h \
        registry.c registry.h \
index ed483d6754da25e5f009b16afd65957d26794316..5e80889b0073301165dcc1793d0d5a5654061e16 100644 (file)
@@ -30,7 +30,8 @@ void *proc_main(void *ptr)
     int vdo_proc_vmstat             = !config_get_boolean("plugin:proc", "/proc/vmstat", 1);
     int vdo_proc_net_rpc_nfs        = !config_get_boolean("plugin:proc", "/proc/net/rpc/nfs", 1);
     int vdo_proc_net_rpc_nfsd       = !config_get_boolean("plugin:proc", "/proc/net/rpc/nfsd", 1);
-    int vdo_proc_sys_kernel_random_entropy_avail    = !config_get_boolean("plugin:proc", "/proc/sys/kernel/random/entropy_avail", 1);
+    int vdo_proc_sys_kernel_random_entropy_avail = !config_get_boolean("plugin:proc", "/proc/sys/kernel/random/entropy_avail", 1);
+    int vdo_proc_sys_devices_system_edac_mc      = !config_get_boolean("plugin:proc", "/sys/devices/system/edac/mc", 1);
     int vdo_proc_interrupts         = !config_get_boolean("plugin:proc", "/proc/interrupts", 1);
     int vdo_proc_softirqs           = !config_get_boolean("plugin:proc", "/proc/softirqs", 1);
     int vdo_proc_net_softnet_stat   = !config_get_boolean("plugin:proc", "/proc/net/softnet_stat", 1);
@@ -55,6 +56,7 @@ void *proc_main(void *ptr)
     usec_t sutime_proc_net_rpc_nfs = 0ULL;
     usec_t sutime_proc_net_rpc_nfsd = 0ULL;
     usec_t sutime_proc_sys_kernel_random_entropy_avail = 0ULL;
+    usec_t sutime_proc_sys_devices_system_edac_mc = 0ULL;
     usec_t sutime_proc_interrupts = 0ULL;
     usec_t sutime_proc_softirqs = 0ULL;
     usec_t sutime_proc_net_softnet_stat = 0ULL;
@@ -142,6 +144,14 @@ void *proc_main(void *ptr)
         }
         if(unlikely(netdata_exit)) break;
 
+        if(!vdo_proc_sys_devices_system_edac_mc) {
+            debug(D_PROCNETDEV_LOOP, "PROCNETDEV: calling do_proc_sys_devices_system_edac_mc().");
+            now = now_realtime_usec();
+            vdo_proc_sys_devices_system_edac_mc = do_proc_sys_devices_system_edac_mc(rrd_update_every, (sutime_proc_sys_devices_system_edac_mc > 0)?now - sutime_proc_sys_devices_system_edac_mc:0ULL);
+            sutime_proc_sys_devices_system_edac_mc = now;
+        }
+        if(unlikely(netdata_exit)) break;
+
         if(!vdo_proc_net_dev) {
             debug(D_PROCNETDEV_LOOP, "PROCNETDEV: calling do_proc_net_dev().");
             now = now_realtime_usec();
index 0f52e4e451ddd5e2ed8b4a302ce4ad7f986695ac..2ee2b6b2fcfdb70a0e51b312bfae538c84de0ab6 100644 (file)
@@ -23,5 +23,6 @@ extern int do_proc_loadavg(int update_every, usec_t dt);
 extern int do_proc_net_stat_synproxy(int update_every, usec_t dt);
 extern int do_proc_net_softnet_stat(int update_every, usec_t dt);
 extern int do_proc_uptime(int update_every, usec_t dt);
+extern int do_proc_sys_devices_system_edac_mc(int update_every, usec_t dt);
 
 #endif /* NETDATA_PLUGIN_PROC_H */
diff --git a/src/sys_devices_system_edac_mc.c b/src/sys_devices_system_edac_mc.c
new file mode 100644 (file)
index 0000000..1e1aa9c
--- /dev/null
@@ -0,0 +1,184 @@
+#include "common.h"
+
+// One memory controller directory ("mcN") found by find_all_mc(), together
+// with the per-counter state used to read and chart its error counts.
+// Instances are linked into the singly linked list headed by mc_root.
+struct mc {
+    struct mc *next;                // next controller in the list, NULL at the tail
+
+    char *name;                     // directory entry name; also used as the dimension id
+
+    // ---- correctable errors (ce_count) ----
+    char *ce_count_filename;        // set only when stat() succeeded on the file
+    procfile *ce_ff;                // handle, opened on first read
+    collected_number ce_count;      // last value parsed from the file
+    char ce_updated;                // 1 when ce_count was refreshed in the current iteration
+    RRDDIM *ce_rd;                  // dimension on the mem.ecc_ce chart
+
+    // ---- uncorrectable errors (ue_count) ----
+    char *ue_count_filename;        // set only when stat() succeeded on the file
+    procfile *ue_ff;                // handle, opened on first read
+    collected_number ue_count;      // last value parsed from the file
+    char ue_updated;                // 1 when ue_count was refreshed in the current iteration
+    RRDDIM *ue_rd;                  // dimension on the mem.ecc_ue chart
+};
+
+// Head of the list of discovered controllers; NULL until discovery finds one.
+static struct mc *mc_root = NULL;
+
+// Discover the memory controllers exposed in the EDAC directory.
+//
+// The directory defaults to <global_host_prefix>/sys/devices/system/edac/mc
+// and can be overridden with the "directory to monitor" option of the
+// [plugin:proc:/sys/devices/system/edac/mc] configuration section.
+//
+// Every entry named "mc<digit>..." that contains a ce_count and/or a ue_count
+// file is prepended to the mc_root list. If the directory cannot be opened,
+// an error is logged and mc_root is left unchanged.
+static void find_all_mc(void) {
+    char name[FILENAME_MAX + 1];
+    snprintfz(name, FILENAME_MAX, "%s%s", global_host_prefix, "/sys/devices/system/edac/mc");
+    char *dirname = config_get("plugin:proc:/sys/devices/system/edac/mc", "directory to monitor", name);
+
+    DIR *dir = opendir(dirname);
+    if(!dir) {
+        error("Cannot read ECC memory errors directory '%s'", dirname);
+        return;
+    }
+
+    struct dirent *de = NULL;
+    while((de = readdir(dir))) {
+        // Not every filesystem fills in d_type (DT_UNKNOWN) and the entry may
+        // be a symlink to a directory. Only reject types that can never hold
+        // the counter files; the stat() calls below validate the rest.
+        if(de->d_type != DT_DIR && de->d_type != DT_LNK && de->d_type != DT_UNKNOWN)
+            continue;
+
+        // isdigit() requires a value representable as unsigned char
+        if(de->d_name[0] != 'm' || de->d_name[1] != 'c' || !isdigit((unsigned char)de->d_name[2]))
+            continue;
+
+        struct stat st;
+        char *ce_filename = NULL;
+        char *ue_filename = NULL;
+
+        snprintfz(name, FILENAME_MAX, "%s/%s/ce_count", dirname, de->d_name);
+        if(stat(name, &st) != -1)
+            ce_filename = strdupz(name);
+
+        snprintfz(name, FILENAME_MAX, "%s/%s/ue_count", dirname, de->d_name);
+        if(stat(name, &st) != -1)
+            ue_filename = strdupz(name);
+
+        // a controller without any counter file is of no use - do not track it
+        if(!ce_filename && !ue_filename)
+            continue;
+
+        struct mc *m = callocz(1, sizeof(struct mc));
+        m->name = strdupz(de->d_name);
+        m->ce_count_filename = ce_filename;
+        m->ue_count_filename = ue_filename;
+
+        m->next = mc_root;
+        mc_root = m;
+    }
+
+    closedir(dir);
+}
+
+// Read one counter file: the first word of its first line, parsed as an
+// unsigned decimal number.
+//
+// *ff caches the procfile handle between calls. It is opened on first use and
+// then replaced by whatever procfile_readall() returns (possibly NULL, in
+// which case the next call opens the file again).
+//
+// Returns 1 and stores the value in *count on success.
+// Returns 0 and leaves *count untouched when the file cannot be opened, read,
+// or has no content.
+static int mc_read_counter(const char *filename, procfile **ff, collected_number *count) {
+    procfile *f = *ff;
+
+    if(unlikely(!f)) {
+        f = procfile_open(filename, " \t", PROCFILE_FLAG_DEFAULT);
+        if(unlikely(!f))
+            return 0;
+    }
+
+    f = procfile_readall(f);
+    *ff = f;
+
+    if(unlikely(!f || procfile_lines(f) < 1 || procfile_linewords(f, 0) < 1))
+        return 0;
+
+    // base 10: with base 0 a value with a leading zero would be parsed as octal
+    *count = strtoull(procfile_lineword(f, 0, 0), NULL, 10);
+    return 1;
+}
+
+// Collect the correctable (ce_count) and uncorrectable (ue_count) error
+// counters of all discovered memory controllers and update the mem.ecc_ce and
+// mem.ecc_ue charts, using one incremental dimension per controller.
+//
+// update_every is passed to rrdset_create() when a chart is created.
+// dt is unused.
+//
+// Each chart is controlled by an on-demand option in the
+// [plugin:proc:/sys/devices/system/edac/mc] section. While an option is
+// CONFIG_ONDEMAND_ONDEMAND, its chart is updated only once the sum of the
+// counters becomes non-zero; from then on the option is latched to
+// CONFIG_ONDEMAND_YES.
+//
+// Returns 1 when no memory controller could be found, 0 otherwise.
+int do_proc_sys_devices_system_edac_mc(int update_every, usec_t dt) {
+    (void)dt;
+
+    if(unlikely(mc_root == NULL)) {
+        find_all_mc();
+        if(unlikely(mc_root == NULL))
+            return 1;
+    }
+
+    static int do_ce = -1, do_ue = -1;
+    calculated_number ce_sum = 0, ue_sum = 0;
+    struct mc *m;
+
+    if(unlikely(do_ce == -1)) {
+        do_ce = config_get_boolean_ondemand("plugin:proc:/sys/devices/system/edac/mc", "enable ECC memory correctable errors", CONFIG_ONDEMAND_ONDEMAND);
+        do_ue = config_get_boolean_ondemand("plugin:proc:/sys/devices/system/edac/mc", "enable ECC memory uncorrectable errors", CONFIG_ONDEMAND_ONDEMAND);
+    }
+
+    // --------------------------------------------------------------------
+    // read the counters
+
+    if(do_ce != CONFIG_ONDEMAND_NO) {
+        for(m = mc_root; m; m = m->next) {
+            if(!m->ce_count_filename)
+                continue;
+
+            m->ce_updated = (char)mc_read_counter(m->ce_count_filename, &m->ce_ff, &m->ce_count);
+            if(m->ce_updated)
+                ce_sum += m->ce_count;
+        }
+    }
+
+    if(do_ue != CONFIG_ONDEMAND_NO) {
+        for(m = mc_root; m; m = m->next) {
+            if(!m->ue_count_filename)
+                continue;
+
+            m->ue_updated = (char)mc_read_counter(m->ue_count_filename, &m->ue_ff, &m->ue_count);
+            if(m->ue_updated)
+                ue_sum += m->ue_count;
+        }
+    }
+
+    // --------------------------------------------------------------------
+    // correctable errors chart
+
+    if(do_ce == CONFIG_ONDEMAND_YES || (do_ce == CONFIG_ONDEMAND_ONDEMAND && ce_sum > 0)) {
+        do_ce = CONFIG_ONDEMAND_YES;
+
+        static RRDSET *ce_st = NULL;
+
+        if(unlikely(!ce_st))
+            ce_st = rrdset_find("mem.ecc_ce");
+
+        if(unlikely(!ce_st)) {
+            ce_st = rrdset_create("mem", "ecc_ce", NULL, "ecc", NULL, "ECC Memory Correctable Errors", "errors", 6600
+                                  , update_every, RRDSET_TYPE_LINE);
+
+            for(m = mc_root; m; m = m->next)
+                if(m->ce_count_filename)
+                    m->ce_rd = rrddim_add(ce_st, m->name, NULL, 1, 1, RRDDIM_INCREMENTAL);
+        }
+        else
+            rrdset_next(ce_st);
+
+        // ce_rd is assigned only when the chart is created above; never hand
+        // a NULL dimension to rrddim_set_by_pointer()
+        for(m = mc_root; m; m = m->next)
+            if(m->ce_count_filename && m->ce_updated && m->ce_rd)
+                rrddim_set_by_pointer(ce_st, m->ce_rd, m->ce_count);
+
+        rrdset_done(ce_st);
+    }
+
+    // --------------------------------------------------------------------
+    // uncorrectable errors chart
+
+    if(do_ue == CONFIG_ONDEMAND_YES || (do_ue == CONFIG_ONDEMAND_ONDEMAND && ue_sum > 0)) {
+        do_ue = CONFIG_ONDEMAND_YES;
+
+        static RRDSET *ue_st = NULL;
+
+        if(unlikely(!ue_st))
+            ue_st = rrdset_find("mem.ecc_ue");
+
+        if(unlikely(!ue_st)) {
+            ue_st = rrdset_create("mem", "ecc_ue", NULL, "ecc", NULL, "ECC Memory Uncorrectable Errors", "errors", 6610
+                                  , update_every, RRDSET_TYPE_LINE);
+
+            for(m = mc_root; m; m = m->next)
+                if(m->ue_count_filename)
+                    m->ue_rd = rrddim_add(ue_st, m->name, NULL, 1, 1, RRDDIM_INCREMENTAL);
+        }
+        else
+            rrdset_next(ue_st);
+
+        // ue_rd is assigned only when the chart is created above; never hand
+        // a NULL dimension to rrddim_set_by_pointer()
+        for(m = mc_root; m; m = m->next)
+            if(m->ue_count_filename && m->ue_updated && m->ue_rd)
+                rrddim_set_by_pointer(ue_st, m->ue_rd, m->ue_count);
+
+        rrdset_done(ue_st);
+    }
+
+    return 0;
+}