From: Costa Tsaousis (ktsaou) Date: Wed, 11 Jan 2017 21:24:07 +0000 (+0200) Subject: detect ECC memory correctable and uncorrectable errors; fixes #1508 X-Git-Tag: v1.5.0~31^2~3 X-Git-Url: https://arthur.barton.de/gitweb/?p=netdata.git;a=commitdiff_plain;h=2ecf423c40b6a774781711d2e6843c44aad05f1c;hp=1887a9aba01139d60fa1f5d313618e79ab268154 detect ECC memory correctable and uncorrectable errors; fixes #1508 --- diff --git a/CMakeLists.txt b/CMakeLists.txt index f4eb2872..49ae6b0c 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -101,7 +101,7 @@ set(NETDATA_SOURCE_FILES src/registry_person.c src/registry_person.h src/registry_machine.c - src/registry_machine.h src/registry_internals.c src/registry_init.c src/registry_db.c src/registry_log.c src/proc_uptime.c) + src/registry_machine.h src/registry_internals.c src/registry_init.c src/registry_db.c src/registry_log.c src/proc_uptime.c src/sys_devices_system_edac_mc.c) set(APPS_PLUGIN_SOURCE_FILES src/appconfig.c diff --git a/conf.d/health.d/memory.conf b/conf.d/health.d/memory.conf new file mode 100644 index 00000000..3c904f6b --- /dev/null +++ b/conf.d/health.d/memory.conf @@ -0,0 +1,30 @@ + + alarm: 1hour_ecc_memory_correctable + on: mem.ecc_ce + lookup: sum -10m unaligned + units: errors + every: 1m + warn: $this > 0 + delay: down 1h multiplier 1.5 max 1h + info: number of ECC correctable errors during the last hour + to: sysadmin + + alarm: 1hour_ecc_memory_uncorrectable + on: mem.ecc_ue + lookup: sum -10m unaligned + units: errors + every: 1m + crit: $this > 0 + delay: down 1h multiplier 1.5 max 1h + info: number of ECC uncorrectable errors during the last hour + to: sysadmin + + alarm: 1hour_memory_hw_corrupted + on: mem.hwcorrupt + calc: $HardwareCorrupted + units: MB + every: 10s + warn: $this > 0 + delay: down 1h multiplier 1.5 max 1h + info: amount of memory corrupted due to a hardware failure + to: sysadmin diff --git a/src/Makefile.am b/src/Makefile.am index 12f296c1..5533c951 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -47,6 +47,7 @@ netdata_SOURCES = \ popen.c popen.h \ socket.c socket.h \ sys_fs_cgroup.c \ + sys_devices_system_edac_mc.c \ procfile.c procfile.h \ proc_self_mountinfo.c proc_self_mountinfo.h \ registry.c registry.h \ diff --git a/src/plugin_proc.c b/src/plugin_proc.c index ed483d67..5e80889b 100644 --- a/src/plugin_proc.c +++ b/src/plugin_proc.c @@ -30,7 +30,8 @@ void *proc_main(void *ptr) int vdo_proc_vmstat = !config_get_boolean("plugin:proc", "/proc/vmstat", 1); int vdo_proc_net_rpc_nfs = !config_get_boolean("plugin:proc", "/proc/net/rpc/nfs", 1); int vdo_proc_net_rpc_nfsd = !config_get_boolean("plugin:proc", "/proc/net/rpc/nfsd", 1); - int vdo_proc_sys_kernel_random_entropy_avail = !config_get_boolean("plugin:proc", "/proc/sys/kernel/random/entropy_avail", 1); + int vdo_proc_sys_kernel_random_entropy_avail = !config_get_boolean("plugin:proc", "/proc/sys/kernel/random/entropy_avail", 1); + int vdo_proc_sys_devices_system_edac_mc = !config_get_boolean("plugin:proc", "/sys/devices/system/edac/mc", 1); int vdo_proc_interrupts = !config_get_boolean("plugin:proc", "/proc/interrupts", 1); int vdo_proc_softirqs = !config_get_boolean("plugin:proc", "/proc/softirqs", 1); int vdo_proc_net_softnet_stat = !config_get_boolean("plugin:proc", "/proc/net/softnet_stat", 1); @@ -55,6 +56,7 @@ void *proc_main(void *ptr) usec_t sutime_proc_net_rpc_nfs = 0ULL; usec_t sutime_proc_net_rpc_nfsd = 0ULL; usec_t sutime_proc_sys_kernel_random_entropy_avail = 0ULL; + usec_t sutime_proc_sys_devices_system_edac_mc = 0ULL; usec_t sutime_proc_interrupts = 0ULL; usec_t sutime_proc_softirqs = 0ULL; usec_t sutime_proc_net_softnet_stat = 0ULL; @@ -142,6 +144,14 @@ void *proc_main(void *ptr) } if(unlikely(netdata_exit)) break; + if(!vdo_proc_sys_devices_system_edac_mc) { + debug(D_PROCNETDEV_LOOP, "PROCNETDEV: calling do_proc_sys_devices_system_edac_mc()."); + now = now_realtime_usec(); + vdo_proc_sys_devices_system_edac_mc = do_proc_sys_devices_system_edac_mc(rrd_update_every, (sutime_proc_sys_devices_system_edac_mc > 0)?now - sutime_proc_sys_devices_system_edac_mc:0ULL); + sutime_proc_sys_devices_system_edac_mc = now; + } + if(unlikely(netdata_exit)) break; + if(!vdo_proc_net_dev) { debug(D_PROCNETDEV_LOOP, "PROCNETDEV: calling do_proc_net_dev()."); now = now_realtime_usec(); diff --git a/src/plugin_proc.h b/src/plugin_proc.h index 0f52e4e4..2ee2b6b2 100644 --- a/src/plugin_proc.h +++ b/src/plugin_proc.h @@ -23,5 +23,6 @@ extern int do_proc_loadavg(int update_every, usec_t dt); extern int do_proc_net_stat_synproxy(int update_every, usec_t dt); extern int do_proc_net_softnet_stat(int update_every, usec_t dt); extern int do_proc_uptime(int update_every, usec_t dt); +extern int do_proc_sys_devices_system_edac_mc(int update_every, usec_t dt); #endif /* NETDATA_PLUGIN_PROC_H */ diff --git a/src/sys_devices_system_edac_mc.c b/src/sys_devices_system_edac_mc.c new file mode 100644 index 00000000..1e1aa9c7 --- /dev/null +++ b/src/sys_devices_system_edac_mc.c @@ -0,0 +1,184 @@ +#include "common.h" + +struct mc { + char *name; + char ce_updated; + char ue_updated; + + char *ce_count_filename; + char *ue_count_filename; + + procfile *ce_ff; + procfile *ue_ff; + + collected_number ce_count; + collected_number ue_count; + + RRDDIM *ce_rd; + RRDDIM *ue_rd; + + struct mc *next; +}; +static struct mc *mc_root = NULL; + +static void find_all_mc() { + char name[FILENAME_MAX + 1]; + snprintfz(name, FILENAME_MAX, "%s%s", global_host_prefix, "/sys/devices/system/edac/mc"); + char *dirname = config_get("plugin:proc:/sys/devices/system/edac/mc", "directory to monitor", name); + + DIR *dir = opendir(dirname); + if(!dir) { + error("Cannot read ECC memory errors directory '%s'", dirname); + return; + } + + struct dirent *de = NULL; + while((de = readdir(dir))) { + if(de->d_type == DT_DIR && de->d_name[0] == 'm' && de->d_name[1] == 'c' && isdigit(de->d_name[2])) { + struct mc *m = callocz(1, sizeof(struct mc)); + m->name = strdupz(de->d_name); + + struct stat st; + + snprintfz(name, FILENAME_MAX, "%s/%s/ce_count", dirname, de->d_name); + if(stat(name, &st) != -1) + m->ce_count_filename = strdupz(name); + + snprintfz(name, FILENAME_MAX, "%s/%s/ue_count", dirname, de->d_name); + if(stat(name, &st) != -1) + m->ue_count_filename = strdupz(name); + + if(!m->ce_count_filename && !m->ue_count_filename) { + freez(m->name); + freez(m); + } + else { + m->next = mc_root; + mc_root = m; + } + } + } + + closedir(dir); +} + +int do_proc_sys_devices_system_edac_mc(int update_every, usec_t dt) { + (void)dt; + + if(unlikely(mc_root == NULL)) { + find_all_mc(); + if(unlikely(mc_root == NULL)) + return 1; + } + + static int do_ce = -1, do_ue = -1; + calculated_number ce_sum = 0, ue_sum = 0; + struct mc *m; + + if(unlikely(do_ce == -1)) { + do_ce = config_get_boolean_ondemand("plugin:proc:/sys/devices/system/edac/mc", "enable ECC memory correctable errors", CONFIG_ONDEMAND_ONDEMAND); + do_ue = config_get_boolean_ondemand("plugin:proc:/sys/devices/system/edac/mc", "enable ECC memory uncorrectable errors", CONFIG_ONDEMAND_ONDEMAND); + } + + if(do_ce != CONFIG_ONDEMAND_NO) { + for(m = mc_root; m; m = m->next) { + if(m->ce_count_filename) { + m->ce_updated = 0; + + if(unlikely(!m->ce_ff)) { + m->ce_ff = procfile_open(m->ce_count_filename, " \t", PROCFILE_FLAG_DEFAULT); + if(unlikely(!m->ce_ff)) + continue; + } + + m->ce_ff = procfile_readall(m->ce_ff); + if(unlikely(!m->ce_ff || procfile_lines(m->ce_ff) < 1 || procfile_linewords(m->ce_ff, 0) < 1)) + continue; + + m->ce_count = strtoull(procfile_lineword(m->ce_ff, 0, 0), NULL, 0); + ce_sum += m->ce_count; + m->ce_updated = 1; + } + } + } + + if(do_ue != CONFIG_ONDEMAND_NO) { + for(m = mc_root; m; m = m->next) { + if(m->ue_count_filename) { + m->ue_updated = 0; + + if(unlikely(!m->ue_ff)) { + m->ue_ff = procfile_open(m->ue_count_filename, " \t", PROCFILE_FLAG_DEFAULT); + if(unlikely(!m->ue_ff)) + continue; + } + + m->ue_ff = procfile_readall(m->ue_ff); + if(unlikely(!m->ue_ff || procfile_lines(m->ue_ff) < 1 || procfile_linewords(m->ue_ff, 0) < 1)) + continue; + + m->ue_count = strtoull(procfile_lineword(m->ue_ff, 0, 0), NULL, 0); + ue_sum += m->ue_count; + m->ue_updated = 1; + } + } + } + + // -------------------------------------------------------------------- + + if(do_ce == CONFIG_ONDEMAND_YES || (do_ce == CONFIG_ONDEMAND_ONDEMAND && ce_sum > 0)) { + do_ce = CONFIG_ONDEMAND_YES; + + static RRDSET *ce_st = NULL; + + if(unlikely(!ce_st)) + ce_st = rrdset_find("mem.ecc_ce"); + + if(unlikely(!ce_st)) { + ce_st = rrdset_create("mem", "ecc_ce", NULL, "ecc", NULL, "ECC Memory Correctable Errors", "errors", 6600 + , update_every, RRDSET_TYPE_LINE); + + for(m = mc_root; m; m = m->next) + if(m->ce_count_filename) + m->ce_rd = rrddim_add(ce_st, m->name, NULL, 1, 1, RRDDIM_INCREMENTAL); + } + else + rrdset_next(ce_st); + + for(m = mc_root; m; m = m->next) + if(m->ce_count_filename && m->ce_updated) + rrddim_set_by_pointer(ce_st, m->ce_rd, m->ce_count); + + rrdset_done(ce_st); + } + + // -------------------------------------------------------------------- + + if(do_ue == CONFIG_ONDEMAND_YES || (do_ue == CONFIG_ONDEMAND_ONDEMAND && ue_sum > 0)) { + do_ue = CONFIG_ONDEMAND_YES; + + static RRDSET *ue_st = NULL; + + if(unlikely(!ue_st)) + ue_st = rrdset_find("mem.ecc_ue"); + + if(unlikely(!ue_st)) { + ue_st = rrdset_create("mem", "ecc_ue", NULL, "ecc", NULL, "ECC Memory Uncorrectable Errors", "errors", 6610 + , update_every, RRDSET_TYPE_LINE); + + for(m = mc_root; m; m = m->next) + if(m->ue_count_filename) + m->ue_rd = rrddim_add(ue_st, m->name, NULL, 1, 1, RRDDIM_INCREMENTAL); + } + else + rrdset_next(ue_st); + + for(m = mc_root; m; m = m->next) + if(m->ue_count_filename && m->ue_updated) + rrddim_set_by_pointer(ue_st, m->ue_rd, m->ue_count); + + rrdset_done(ue_st); + } + + return 0; +}