arthur.barton.de Git - netdata.git/commitdiff
Merge pull request #960 from ktsaou/master
author Costa Tsaousis <costa@tsaousis.gr>
Thu, 15 Sep 2016 22:12:32 +0000 (01:12 +0300)
committer GitHub <noreply@github.com>
Thu, 15 Sep 2016 22:12:32 +0000 (01:12 +0300)
dropped packets now get an alarm with the solution

12 files changed:
conf.d/Makefile.am
conf.d/health.d/softnet.conf [new file with mode: 0644]
configs.signatures
plugins.d/alarm-notify.sh
src/Makefile.am
src/health.c
src/plugin_proc.c
src/plugin_proc.h
src/proc_interrupts.c
src/proc_net_softnet_stat.c [new file with mode: 0644]
src/proc_softirqs.c
web/index.html

index c398899c6a183a6246e2e6df3c91ffaebd5fdc8e..7fccd0eedfbd7a465b519a7d4169fc66ac6f11ea 100644 (file)
@@ -57,6 +57,7 @@ dist_healthconfig_DATA = \
        health.d/ram.conf \
        health.d/redis.conf \
        health.d/retroshare.conf \
+       health.d/softnet.conf \
        health.d/swap.conf \
        health.d/squid.conf \
        $(NULL)
diff --git a/conf.d/health.d/softnet.conf b/conf.d/health.d/softnet.conf
new file mode 100644 (file)
index 0000000..4810eee
--- /dev/null
@@ -0,0 +1,21 @@
+# check for common /proc/net/softnet_stat errors
+
+   alarm: 1hour_netdev_backlog_exceeded
+      on: system.softnet_stat
+  lookup: sum -1h unaligned absolute of dropped
+   units: packets
+   every: 1m
+    warn: $this > 0
+   delay: down 30m multiplier 1.5 max 1h
+    info: number of packets dropped because sysctl net.core.netdev_max_backlog was exceeded
+      to: sysadmin
+
+   alarm: 1hour_netdev_budget_ran_outs
+      on: system.softnet_stat
+  lookup: sum -1h unaligned absolute of squeezed
+   units: events
+   every: 1m
+    warn: $this > 0
+   delay: down 30m multiplier 1.5 max 1h
+    info: number of times ksoftirq ran out of sysctl net.core.netdev_budget or time slice, with work remaining
+      to: sysadmin
index ed99dcd8168a37aaa21b59659b8a0d8c9dae7a2d..4c2237584e1fd2b714817e71f65b49965a31d812 100644 (file)
@@ -128,6 +128,9 @@ declare -A configs_signatures=(
   ['39f9422b0f0c3eec11a31aff79d89514']='health.d/retroshare.conf'
   ['46ef6c1b638e40a7dfd62defdc5f99a3']='health.d/retroshare.conf'
   ['6608c6546b3c6bde084fc1d34b1163c1']='health.d/retroshare.conf'
+  ['312b4b8e2805e19cf9be554b319567d6']='health.d/softnet.conf'
+  ['565f11c38ae6bd5cc9d3c2adb542bc1b']='health.d/softnet.conf'
+  ['a305b400378d6492efd15f9940c2779b']='health.d/softnet.conf'
   ['23ae815aefa221b1929f96752a1f7556']='health.d/squid.conf'
   ['3cc6255457d4cba881ae0554ae5d9190']='health.d/squid.conf'
   ['845023f9b4a526aa0e6493756dbe6034']='health.d/squid.conf'
index 735b27a523a80d08dcbd50b6353e23f5f303bcb0..0b25a6a49411b23e98bf9187825eb13ad38b1a15 100755 (executable)
@@ -297,8 +297,14 @@ send_pushover() {
     if [ "${SEND_PUSHOVER}" = "YES" -a ! -z "${apptoken}" -a ! -z "${usertokens}" -a ! -z "${title}" -a ! -z "${message}" ]
         then
 
-        priority=0
-        [ "${status}" = "CRITICAL" ] && priority=1
+        # https://pushover.net/api
+        priority=-2
+        case "${status}" in
+            CLEAR) priority=-1;;   # low priority: no sound or vibration
+            WARNING) priority=0;;  # normal priority: respect quiet hours
+            CRITICAL) priority=1;; # high priority: bypass quiet hours
+            *) priority=-2;;       # lowest priority: no notification at all
+        esac
 
         for user in ${usertokens}
         do
index 8fa6d5bdf71a4d9b870a1a60323e686cbebcf92f..0ecd5533fa7e23a04a40a48d8fc7db2df9d219b3 100644 (file)
@@ -55,6 +55,7 @@ netdata_SOURCES = \
        proc_net_rpc_nfsd.c \
        proc_net_snmp.c \
        proc_net_snmp6.c \
+       proc_net_softnet_stat.c \
        proc_net_stat_conntrack.c \
        proc_net_stat_synproxy.c \
        proc_stat.c \
index 672ef539015a9e1280404b9dc4f0678a376b3846..49dc392041eac47f3c8e1ee87b36eeb4df6a02b0 100644 (file)
@@ -834,7 +834,7 @@ void rrdcalc_free(RRDHOST *host, RRDCALC *rc) {
     else if(likely(host->alarms)) {
         RRDCALC *t, *last = host->alarms;
         for(t = last->next; t && t != rc; last = t, t = t->next) ;
-        if(last && last->next == rc)
+        if(last->next == rc)
             last->next = rc->next;
         else
             error("Cannot unlink alarm '%s.%s' from host '%s': not found", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
@@ -1825,6 +1825,7 @@ void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after) {
 static inline void health_rrdcalc2json_nolock(BUFFER *wb, RRDCALC *rc) {
     buffer_sprintf(wb,
            "\t\t\"%s.%s\": {\n"
+                   "\t\t\t\"id\": %lu,\n"
                    "\t\t\t\"name\": \"%s\",\n"
                    "\t\t\t\"chart\": \"%s\",\n"
                    "\t\t\t\"family\": \"%s\",\n"
@@ -1846,6 +1847,7 @@ static inline void health_rrdcalc2json_nolock(BUFFER *wb, RRDCALC *rc) {
                    "\t\t\t\"delay\": %d,\n"
                    "\t\t\t\"delay_up_to_timestamp\": %lu,\n"
             , rc->chart, rc->name
+            , (unsigned long)rc->id
             , rc->name
             , rc->chart
             , (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:""
index a1bf314de7c34bf4a44081763773e4a750cb7e86..badbd32732311c10c74f738030c4fc779e4450ca 100644 (file)
@@ -32,6 +32,7 @@ void *proc_main(void *ptr)
     int vdo_proc_sys_kernel_random_entropy_avail    = !config_get_boolean("plugin:proc", "/proc/sys/kernel/random/entropy_avail", 1);
     int vdo_proc_interrupts         = !config_get_boolean("plugin:proc", "/proc/interrupts", 1);
     int vdo_proc_softirqs           = !config_get_boolean("plugin:proc", "/proc/softirqs", 1);
+    int vdo_proc_net_softnet_stat   = !config_get_boolean("plugin:proc", "/proc/net/softnet_stat", 1);
     int vdo_proc_loadavg            = !config_get_boolean("plugin:proc", "/proc/loadavg", 1);
     int vdo_sys_kernel_mm_ksm       = !config_get_boolean("plugin:proc", "/sys/kernel/mm/ksm", 1);
     int vdo_cpu_netdata             = !config_get_boolean("plugin:proc", "netdata server resources", 1);
@@ -52,6 +53,7 @@ void *proc_main(void *ptr)
     unsigned long long sutime_proc_sys_kernel_random_entropy_avail = 0ULL;
     unsigned long long sutime_proc_interrupts = 0ULL;
     unsigned long long sutime_proc_softirqs = 0ULL;
+    unsigned long long sutime_proc_net_softnet_stat = 0ULL;
     unsigned long long sutime_proc_loadavg = 0ULL;
     unsigned long long sutime_sys_kernel_mm_ksm = 0ULL;
 
@@ -107,6 +109,14 @@ void *proc_main(void *ptr)
         }
         if(unlikely(netdata_exit)) break;
 
+        if(!vdo_proc_net_softnet_stat) {
+            debug(D_PROCNETDEV_LOOP, "PROCNETDEV: calling do_proc_net_softnet_stat().");
+            sunow = time_usec();
+            vdo_proc_net_softnet_stat = do_proc_net_softnet_stat(rrd_update_every, (sutime_proc_net_softnet_stat > 0)?sunow - sutime_proc_net_softnet_stat:0ULL);
+            sutime_proc_net_softnet_stat = sunow;
+        }
+        if(unlikely(netdata_exit)) break;
+
         if(!vdo_proc_sys_kernel_random_entropy_avail) {
             debug(D_PROCNETDEV_LOOP, "PROCNETDEV: calling do_proc_sys_kernel_random_entropy_avail().");
             sunow = time_usec();
index a512e1cdf9d44c58a5008436e85d98a3ee8035cd..565cd0a26e35d88fca4aa561b8363f65eba362c3 100644 (file)
@@ -20,5 +20,6 @@ extern int do_proc_softirqs(int update_every, unsigned long long dt);
 extern int do_sys_kernel_mm_ksm(int update_every, unsigned long long dt);
 extern int do_proc_loadavg(int update_every, unsigned long long dt);
 extern int do_proc_net_stat_synproxy(int update_every, unsigned long long dt);
+extern int do_proc_net_softnet_stat(int update_every, unsigned long long dt);
 
 #endif /* NETDATA_PLUGIN_PROC_H */
index 53c9344e45ec2d18df3dd49634ed031d1b962e75..be3792f275db36f2c3c2339fd2288ce5b510e72f 100644 (file)
@@ -143,15 +143,14 @@ int do_proc_interrupts(int update_every, unsigned long long dt) {
         int c;
 
         for(c = 0; c < cpus ; c++) {
-            char id[256+1];
-            snprintfz(id, 256, "cpu%d_interrupts", c);
+            char id[50+1];
+            snprintfz(id, 50, "cpu%d_interrupts", c);
 
             st = rrdset_find_bytype("cpu", id);
             if(!st) {
-                char name[256+1], title[256+1];
-                snprintfz(name, 256, "cpu%d_interrupts", c);
-                snprintfz(title, 256, "CPU%d Interrupts", c);
-                st = rrdset_create("cpu", id, name, "interrupts", "cpu.interrupts", title, "interrupts/s", 1100 + c, update_every, RRDSET_TYPE_STACKED);
+                char title[100+1];
+                snprintfz(title, 100, "CPU%d Interrupts", c);
+                st = rrdset_create("cpu", id, NULL, "interrupts", "cpu.interrupts", title, "interrupts/s", 1100 + c, update_every, RRDSET_TYPE_STACKED);
 
                 for(l = 0; l < lines ;l++) {
                     struct interrupt *irr = irrindex(irrs, l, cpus);
diff --git a/src/proc_net_softnet_stat.c b/src/proc_net_softnet_stat.c
new file mode 100644 (file)
index 0000000..4fb710d
--- /dev/null
@@ -0,0 +1,119 @@
+#include "common.h"
+
+static inline char *softnet_column_name(uint32_t column) {
+    switch(column) {
+        // https://github.com/torvalds/linux/blob/a7fd20d1c476af4563e66865213474a2f9f473a4/net/core/net-procfs.c#L161-L166
+        case 0: return "processed";
+        case 1: return "dropped";
+        case 2: return "squeezed";
+        case 9: return "received_rps";
+        case 10: return "flow_limit_count";
+        default: return NULL;
+    }
+}
+
+int do_proc_net_softnet_stat(int update_every, unsigned long long dt) {
+    (void)dt;
+
+    static procfile *ff = NULL;
+    static int do_per_core = -1;
+    static uint32_t allocated_lines = 0, allocated_columns = 0, *data = NULL;
+
+    if(do_per_core == -1) do_per_core = config_get_boolean("plugin:proc:/proc/net/softnet_stat", "softnet_stat per core", 1);
+
+    if(!ff) {
+        char filename[FILENAME_MAX + 1];
+        snprintfz(filename, FILENAME_MAX, "%s%s", global_host_prefix, "/proc/net/softnet_stat");
+        ff = procfile_open(config_get("plugin:proc:/proc/net/softnet_stat", "filename to monitor", filename), " \t", PROCFILE_FLAG_DEFAULT);
+    }
+    if(!ff) return 1;
+
+    ff = procfile_readall(ff);
+    if(!ff) return 0; // we return 0, so that we will retry to open it next time
+
+    uint32_t lines = procfile_lines(ff), l;
+    uint32_t words = procfile_linewords(ff, 0), w;
+
+    if(!lines || !words) {
+        error("Cannot read /proc/net/softnet_stat, %u lines and %u columns reported.", lines, words);
+        return 1;
+    }
+
+    if(lines > 200) lines = 200;
+    if(words > 50) words = 50;
+    
+    if(unlikely(!data || lines > allocated_lines || words > allocated_columns)) {
+        freez(data);
+        allocated_lines = lines;
+        allocated_columns = words;
+        data = mallocz((allocated_lines + 1) * allocated_columns * sizeof(uint32_t));
+    }
+    
+    // initialize to zero
+    bzero(data, (allocated_lines + 1) * allocated_columns * sizeof(uint32_t));
+
+    // parse the values
+    for(l = 0; l < lines ;l++) {
+        words = procfile_linewords(ff, l);
+        if(!words) continue;
+
+        if(words > allocated_columns) words = allocated_columns;
+
+        for(w = 0; w < words ; w++) {
+            if(unlikely(softnet_column_name(w))) {
+                uint32_t t = strtoul(procfile_lineword(ff, l, w), NULL, 16);
+                data[w] += t;
+                data[((l + 1) * allocated_columns) + w] = t;
+            }
+        }
+    }
+
+    if(data[(lines * allocated_columns)] == 0)
+        lines--;
+
+    RRDSET *st;
+
+    // --------------------------------------------------------------------
+
+    st = rrdset_find_bytype("system", "softnet_stat");
+    if(!st) {
+        st = rrdset_create("system", "softnet_stat", NULL, "softnet_stat", NULL, "System softnet_stat", "events/s", 955, update_every, RRDSET_TYPE_LINE);
+        for(w = 0; w < allocated_columns ;w++)
+            if(unlikely(softnet_column_name(w)))
+                rrddim_add(st, softnet_column_name(w), NULL, 1, 1, RRDDIM_INCREMENTAL);
+    }
+    else rrdset_next(st);
+
+    for(w = 0; w < allocated_columns ;w++)
+        if(unlikely(softnet_column_name(w)))
+            rrddim_set(st, softnet_column_name(w), data[w]);
+
+    rrdset_done(st);
+
+    if(do_per_core) {
+        for(l = 0; l < lines ;l++) {
+            char id[50+1];
+            snprintfz(id, 50, "cpu%d_softnet_stat", l);
+
+            st = rrdset_find_bytype("cpu", id);
+            if(!st) {
+                char title[100+1];
+                snprintfz(title, 100, "CPU%d softnet_stat", l);
+
+                st = rrdset_create("cpu", id, NULL, "softnet_stat", NULL, title, "events/s", 4101 + l, update_every, RRDSET_TYPE_LINE);
+                for(w = 0; w < allocated_columns ;w++)
+                    if(unlikely(softnet_column_name(w)))
+                        rrddim_add(st, softnet_column_name(w), NULL, 1, 1, RRDDIM_INCREMENTAL);
+            }
+            else rrdset_next(st);
+
+            for(w = 0; w < allocated_columns ;w++)
+                if(unlikely(softnet_column_name(w)))
+                    rrddim_set(st, softnet_column_name(w), data[((l + 1) * allocated_columns) + w]);
+
+            rrdset_done(st);
+        }
+    }
+
+    return 0;
+}
index a5165040acecc9da79ba349c001c7a2729800bbc..701153e66d316caa3d83bb0be494387e5059e53a 100644 (file)
@@ -134,8 +134,8 @@ int do_proc_softirqs(int update_every, unsigned long long dt) {
         int c;
 
         for(c = 0; c < cpus ; c++) {
-            char id[256+1];
-            snprintfz(id, 256, "cpu%d_softirqs", c);
+            char id[50+1];
+            snprintfz(id, 50, "cpu%d_softirqs", c);
 
             st = rrdset_find_bytype("cpu", id);
             if(!st) {
@@ -148,10 +148,9 @@ int do_proc_softirqs(int update_every, unsigned long long dt) {
                 }
                 if(core_sum == 0) continue; // try next core
 
-                char name[256+1], title[256+1];
-                snprintfz(name, 256, "cpu%d_softirqs", c);
-                snprintfz(title, 256, "CPU%d softirqs", c);
-                st = rrdset_create("cpu", id, name, "softirqs", "cpu.softirqs", title, "softirqs/s", 3000 + c, update_every, RRDSET_TYPE_STACKED);
+                char title[100+1];
+                snprintfz(title, 100, "CPU%d softirqs", c);
+                st = rrdset_create("cpu", id, NULL, "softirqs", "cpu.softirqs", title, "softirqs/s", 3000 + c, update_every, RRDSET_TYPE_STACKED);
 
                 for(l = 0; l < lines ;l++) {
                     struct interrupt *irr = irrindex(irrs, l, cpus);
index fb1b915f1265c43a16ce1effa913331169aed732..32c890e75736faca835d3bb22e791cb11e4db268 100644 (file)
@@ -2791,15 +2791,13 @@ function alarmsUpdateModal() {
             if(chart.priority < families[family].priority)
                 families[family].priority = chart.priority;
 
-            families[family].arr.push(alarm);
+            families[family].arr.unshift(alarm);
         }
 
         // sort the families, like the dashboard menu does
         var families_sorted = families_sort.sort(function (a, b) {
             if (a.priority > b.priority) return -1;
             if (a.priority < b.priority) return 1;
-            if (a.id > b.id) return 1;
-            if (a.id < b.id) return -1;
             return 0;
         });