arthur.barton.de Git - netdata.git/commitdiff
added log flood protection #240 to the netdata daemon and apps.plugin
author: Costa Tsaousis (ktsaou) <costa@tsaousis.gr>
Sun, 17 Apr 2016 14:34:51 +0000 (17:34 +0300)
committer: Costa Tsaousis (ktsaou) <costa@tsaousis.gr>
Sun, 17 Apr 2016 14:34:51 +0000 (17:34 +0300)
src/apps_plugin.c [changed mode: 0644->0755]
src/log.c [changed mode: 0644->0755]
src/log.h [changed mode: 0644->0755]
src/main.c [changed mode: 0644->0755]
src/plugin_tc.c [changed mode: 0644->0755]
src/plugins_d.c [changed mode: 0644->0755]
src/rrd.c [changed mode: 0644->0755]

old mode 100644 (file)
new mode 100755 (executable)
index 6be562f..6c7206b
@@ -1299,8 +1299,6 @@ int read_pid_file_descriptors(struct pid_stat *p) {
 
 int collect_data_for_all_processes_from_proc(void)
 {
-       static long count_errors = 0;
-
        char dirname[FILENAME_MAX + 1];
 
        snprintf(dirname, FILENAME_MAX, "%s/proc", host_prefix);
@@ -1354,7 +1352,6 @@ int collect_data_for_all_processes_from_proc(void)
                // /proc/<pid>/stat
 
                if(unlikely(read_proc_pid_stat(p))) {
-                       if(!count_errors++ || debug || (p->target && p->target->debug))
                                error("Cannot process %s/proc/%d/stat", host_prefix, pid);
 
                        // there is no reason to proceed if we cannot get its status
@@ -1363,7 +1360,6 @@ int collect_data_for_all_processes_from_proc(void)
 
                // check its parent pid
                if(unlikely(p->ppid < 0 || p->ppid > pid_max)) {
-                       if(unlikely(!count_errors++ || debug || (p->target && p->target->debug)))
                                error("Pid %d states invalid parent pid %d. Using 0.", pid, p->ppid);
 
                        p->ppid = 0;
@@ -1374,7 +1370,6 @@ int collect_data_for_all_processes_from_proc(void)
 
                if(proc_pid_cmdline_is_needed) {
                        if(unlikely(read_proc_pid_cmdline(p))) {
-                               if(!count_errors++ || debug || (p->target && p->target->debug))
                                        error("Cannot process %s/proc/%d/cmdline", host_prefix, pid);
                        }
                }
@@ -1383,7 +1378,6 @@ int collect_data_for_all_processes_from_proc(void)
                // /proc/<pid>/statm
 
                if(unlikely(read_proc_pid_statm(p))) {
-                       if(unlikely(!count_errors++ || debug || (p->target && p->target->debug)))
                                error("Cannot process %s/proc/%d/statm", host_prefix, pid);
 
                        // there is no reason to proceed if we cannot get its memory status
@@ -1395,7 +1389,6 @@ int collect_data_for_all_processes_from_proc(void)
                // /proc/<pid>/io
 
                if(unlikely(read_proc_pid_io(p))) {
-                       if(unlikely(!count_errors++ || debug || (p->target && p->target->debug)))
                                error("Cannot process %s/proc/%d/io", host_prefix, pid);
 
                        // on systems without /proc/X/io
@@ -1407,7 +1400,6 @@ int collect_data_for_all_processes_from_proc(void)
                // <pid> ownership
 
                if(unlikely(read_proc_pid_ownership(p))) {
-                       if(unlikely(!count_errors++ || debug || (p->target && p->target->debug)))
                                error("Cannot stat %s/proc/%d", host_prefix, pid);
                }
 
@@ -1448,7 +1440,6 @@ int collect_data_for_all_processes_from_proc(void)
                // /proc/<pid>/fd
 
                if(unlikely(read_pid_file_descriptors(p))) {
-                       if(unlikely(!count_errors++ || debug || (p->target && p->target->debug)))
                                error("Cannot process entries in %s/proc/%d/fd", host_prefix, pid);
                }
 
@@ -1459,11 +1450,6 @@ int collect_data_for_all_processes_from_proc(void)
                p->updated = 1;
        }
 
-       if(unlikely(count_errors > 1000)) {
-               error("%ld more errors encountered\n", count_errors - 1);
-               count_errors = 0;
-       }
-
        closedir(dir);
 
        return 1;
old mode 100644 (file)
new mode 100755 (executable)
index 02cdae2..e5e0ad2
--- a/src/log.c
+++ b/src/log.c
@@ -27,6 +27,85 @@ int access_log_syslog = 1;
 int error_log_syslog = 1;
 int output_log_syslog = 1;     // debug log
 
+time_t error_log_throttle_period = 1200;
+unsigned long error_log_errors_per_period = 200;
+
+int error_log_limit(int reset) {
+       static time_t start = 0;
+       static unsigned long counter = 0, prevented = 0;
+
+       // do not throttle if the period is 0
+       if(error_log_throttle_period == 0)
+               return 0;
+
+       // prevent all logs if the errors per period is 0
+       if(error_log_errors_per_period == 0)
+               return 1;
+
+       time_t now = time(NULL);
+       if(!start) start = now;
+
+       if(reset) {
+               if(prevented) {
+                       log_date(stderr);
+                       fprintf(stderr, "%s: Resetting logging for process '%s' (prevented %lu logs in the last %ld seconds).\n"
+                                       , program_name
+                               , program_name
+                                       , prevented
+                                       , now - start
+                       );
+               }
+
+               start = now;
+               counter = 0;
+               prevented = 0;
+       }
+
+       // detect if we log too much
+       counter++;
+
+       if(now - start > error_log_throttle_period) {
+               if(prevented) {
+                       log_date(stderr);
+                       fprintf(stderr, "%s: Resuming logging from process '%s' (prevented %lu logs in the last %ld seconds).\n"
+                                       , program_name
+                               , program_name
+                                       , prevented
+                                       , error_log_throttle_period
+                       );
+               }
+
+               // restart the period accounting
+               start = now;
+               counter = 1;
+               prevented = 0;
+
+               // log this error
+               return 0;
+       }
+
+       if(counter > error_log_errors_per_period) {
+               if(!prevented) {
+                       log_date(stderr);
+                       fprintf(stderr, "%s: Too many logs (%lu logs in %ld seconds, threshold is set to %lu logs in %ld seconds). Preventing more logs from process '%s' for %ld seconds.\n"
+                                       , program_name
+                               , counter
+                               , now - start
+                               , error_log_errors_per_period
+                               , error_log_throttle_period
+                               , program_name
+                                       , start + error_log_throttle_period - now);
+               }
+
+               prevented++;
+
+               // prevent logging this error
+               return 1;
+       }
+
+       return 0;
+}
+
 void log_date(FILE *out)
 {
                char outstr[200];
@@ -64,6 +143,9 @@ void info_int( const char *file, const char *function, const unsigned long line,
 {
        va_list args;
 
+       // prevent logging too much
+       if(error_log_limit(0)) return;
+
        log_date(stderr);
 
        va_start( args, fmt );
@@ -85,6 +167,9 @@ void error_int( const char *prefix, const char *file, const char *function, cons
 {
        va_list args;
 
+       // prevent logging too much
+       if(error_log_limit(0)) return;
+
        log_date(stderr);
 
        va_start( args, fmt );
old mode 100644 (file)
new mode 100755 (executable)
index 08f3c4f..e882af3
--- a/src/log.h
+++ b/src/log.h
@@ -42,6 +42,12 @@ extern int access_log_syslog;
 extern int error_log_syslog;
 extern int output_log_syslog;
 
+extern time_t error_log_throttle_period;
+extern unsigned long error_log_errors_per_period;
+extern int error_log_limit(int reset);
+
+#define error_log_limit_reset() do { error_log_limit(1); } while(0)
+
 #define debug(type, args...) do { if(unlikely(!silent && (debug_flags & type))) debug_int(__FILE__, __FUNCTION__, __LINE__, ##args); } while(0)
 #define info(args...)    info_int(__FILE__, __FUNCTION__, __LINE__, ##args)
 #define infoerr(args...) error_int("INFO", __FILE__, __FUNCTION__, __LINE__, ##args)
old mode 100644 (file)
new mode 100755 (executable)
index c670d9c..89ec90f
@@ -36,6 +36,7 @@
 #include "plugin_nfacct.h"
 
 #include "main.h"
+#include "../config.h"
 
 int netdata_exit = 0;
 
@@ -45,6 +46,9 @@ void netdata_cleanup_and_exit(int ret)
        rrdset_save_all();
        // kill_childs();
 
+       // let it log a few more error messages
+       error_log_limit_reset();
+
        if(pidfd != -1) {
                if(ftruncate(pidfd, 0) != 0)
                        error("Cannot truncate pidfile '%s'.", pidfile);
@@ -344,6 +348,12 @@ int main(int argc, char **argv)
                }
                else error_log_syslog = 0;
 
+               error_log_throttle_period = config_get_number("global", "errors throttle period", error_log_throttle_period);
+               setenv("NETDATA_ERRORS_THROTTLE_PERIOD", config_get("global", "errors throttle period"    , ""), 1);
+
+               error_log_errors_per_period = config_get_number("global", "errors per throttle period", error_log_errors_per_period);
+               setenv("NETDATA_ERRORS_PER_PERIOD"     , config_get("global", "errors per throttle period", ""), 1);
+
                // --------------------------------------------------------------------
 
                access_log_file = config_get("global", "access log", LOG_DIR "/access.log");
old mode 100644 (file)
new mode 100755 (executable)
index 4a5d3e4..2c7a55c
@@ -15,6 +15,7 @@
 #include "popen.h"
 #include "plugin_tc.h"
 #include "main.h"
+#include "../config.h"
 
 #define RRD_TYPE_TC                                    "tc"
 #define RRD_TYPE_TC_LEN                                strlen(RRD_TYPE_TC)
@@ -717,7 +718,8 @@ void *tc_main(void *ptr)
                        //      debug(D_TC_LOOP, "IGNORED line");
                        //}
                }
-               mypclose(fp, tc_child_pid);
+               // fgets() failed or loop broke
+               int code = mypclose(fp, tc_child_pid);
                tc_child_pid = 0;
 
                if(device) {
@@ -732,10 +734,19 @@ void *tc_main(void *ptr)
                        return NULL;
                }
 
+               if(code == 1 || code == 127) {
+                       // 1 = DISABLE
+                       // 127 = cannot even run it
+                       error("TC: tc-qos-helper.sh exited with code %d. Disabling it.", code);
+
+                       tc_device_free_all();
+                       pthread_exit(NULL);
+                       return NULL;
+               }
+
                sleep((unsigned int) rrd_update_every);
        }
 
        pthread_exit(NULL);
        return NULL;
 }
-
old mode 100644 (file)
new mode 100755 (executable)
index 93dd29d..b8524d9
@@ -16,6 +16,7 @@
 #include "rrd.h"
 #include "popen.h"
 #include "plugins_d.h"
+#include "../config.h"
 
 struct plugind *pluginsd_root = NULL;
 
old mode 100644 (file)
new mode 100755 (executable)
index 2dce02e..86ff396
--- a/src/rrd.c
+++ b/src/rrd.c
@@ -682,6 +682,9 @@ void rrdset_save_all(void)
 {
        debug(D_RRD_CALLS, "rrdset_save_all()");
 
+       // let it log a few error messages
+       error_log_limit_reset();
+
        RRDSET *st;
        RRDDIM *rd;