arthur.barton.de Git - netdata.git/commitdiff
properly handle the exit status of plugins to avoid infinite restart attempts; plugin...
author Costa Tsaousis (ktsaou) <costa@tsaousis.gr>
Thu, 9 Jun 2016 19:49:44 +0000 (22:49 +0300)
committer Costa Tsaousis (ktsaou) <costa@tsaousis.gr>
Thu, 9 Jun 2016 19:49:44 +0000 (22:49 +0300)
charts.d/apache.chart.sh
charts.d/example.chart.sh
plugins.d/charts.d.plugin
src/plugins_d.c
src/plugins_d.h
web/demo2.html

index f4f7d0581b0729b8234bd241a4f49ce4514245fc..a3e064094c36a357582ceb8c6a3004966fc89bfc 100755 (executable)
@@ -67,19 +67,14 @@ apache_detect() {
 
        # we will not check of the Conns*
        # keys, since these are apache 2.4 specific
-       if [ -z "${apache_key_accesses}" \
-               -o -z "${apache_key_kbytes}" \
-               -o -z "${apache_key_reqpersec}" \
-               -o -z "${apache_key_bytespersec}" \
-               -o -z "${apache_key_bytesperreq}" \
-               -o -z "${apache_key_busyworkers}" \
-               -o -z "${apache_key_idleworkers}" \
-               -o -z "${apache_key_scoreboard}" \
-               ]
-               then
-               echo >&2 "apache: Invalid response or missing keys from apache server: ${*}"
-               return 1
-       fi
+       [ -z "${apache_key_accesses}"    ] && echo >&2 "apache: missing 'Total Accesses' from apache server: ${*}" && return 1
+       [ -z "${apache_key_kbytes}"      ] && echo >&2 "apache: missing 'Total kBytes' from apache server: ${*}" && return 1
+       [ -z "${apache_key_reqpersec}"   ] && echo >&2 "apache: missing 'ReqPerSec' from apache server: ${*}" && return 1
+       [ -z "${apache_key_bytespersec}" ] && echo >&2 "apache: missing 'BytesPerSec' from apache server: ${*}" && return 1
+       [ -z "${apache_key_bytesperreq}" ] && echo >&2 "apache: missing 'BytesPerReq' from apache server: ${*}" && return 1
+       [ -z "${apache_key_busyworkers}" ] && echo >&2 "apache: missing 'BusyWorkers' from apache server: ${*}" && return 1
+       [ -z "${apache_key_idleworkers}" ] && echo >&2 "apache: missing 'IdleWorkers' from apache server: ${*}" && return 1
+       [ -z "${apache_key_scoreboard}"  ] && echo >&2 "apache: missing 'Scoreboard' from apache server: ${*}" && return 1
 
        if [ ! -z "${apache_key_connstotal}" \
                -a ! -z "${apache_key_connsasyncwriting}" \
@@ -88,6 +83,8 @@ apache_detect() {
                ]
                then
                apache_has_conns=1
+       else
+               apache_has_conns=0
        fi
 
        return 0
index ad205046219dfb6994fc2c11c6b90418ba6c20b9..b20740e862ef67180f1e798d9103e7a95196bfa2 100755 (executable)
@@ -7,14 +7,21 @@
 # between the calls of the _update() function
 example_update_every=
 
+# the priority is used to sort the charts on the dashboard
+# 1 = the first chart
 example_priority=150000
 
+# to enable this chart, you have to set this to 12345
+# (just a demonstration for something that needs to be checked)
+example_magic_number=
+
 # _check is called once, to find out if this chart should be enabled or not
 example_check() {
        # this should return:
        #  - 0 to enable the chart
        #  - 1 to disable the chart
 
+       [ "${example_magic_number}" != "12345" ] && return 1
        return 0
 }
 
index 2824fa3c6f01ba44deb77511af6b6f654c88496c..543695dd7d8d48eb3766473814014585c4ee3fb3 100755 (executable)
@@ -504,7 +504,7 @@ if [ -z "$run_charts" ]
        exit 1
 fi
 
-declare -A charts_last_update=() charts_update_every=() charts_next_update=() charts_run_counter=()
+declare -A charts_last_update=() charts_update_every=() charts_next_update=() charts_run_counter=() charts_serial_failures=()
 global_update() {
        local exit_at \
                c=0 dt ret last_ms exec_start_ms exec_end_ms \
@@ -522,13 +522,14 @@ global_update() {
                charts_last_update[$chart]=$((now_ms - (now_ms % (charts_update_every[$chart] * 1000) ) ))
                charts_next_update[$chart]=$(( charts_last_update[$chart] + (charts_update_every[$chart] * 1000) ))
                charts_run_counter[$chart]=0
+               charts_serial_failures[$chart]=0
 
                echo "CHART netdata.plugin_chartsd_$chart '' 'Execution time for $chart plugin' 'milliseconds / run' charts.d netdata.plugin_charts area 145000 ${charts_update_every[$chart]}"
                echo "DIMENSION run_time 'run time' absolute 1 1"
        done
 
        # the main loop
-       while [ 1 ]
+       while [ "${#next_charts[@]}" -gt 0 ]
        do
                c=$((c + 1))
                now_charts=("${next_charts[@]}")
@@ -570,15 +571,24 @@ global_update() {
                                current_time_ms; exec_end_ms=$now_ms
 
                                echo "BEGIN netdata.plugin_chartsd_$chart $dt"
+                               echo "SET run_time = $(( exec_end_ms - exec_start_ms ))"
+                               echo "END"
+
                                if [ $ret -eq 0 ]
                                then
-                                       echo "SET run_time = $(( exec_end_ms - exec_start_ms ))"
+                                       charts_serial_failures[$chart]=0
                                        next_charts+=($chart)
                                else
-                                       echo "SET run_time = $(( (exec_end_ms - exec_start_ms) * -1 ))"
-                                       echo >&2 "$PROGRAM_NAME: chart '$chart' update() function reports failure. Disabling it."
+                                       charts_serial_failures[$chart]=$(( charts_serial_failures[$chart] + 1 ))
+
+                                       if [ charts_serial_failures[$chart] -gt 10 ]
+                                               then
+                                               echo >&2 "$PROGRAM_NAME: chart '$chart' update() function reported failure ${charts_serial_failures[$chart]} times. Disabling it."
+                                       else
+                                               echo >&2 "$PROGRAM_NAME: chart '$chart' update() function reports failure. Will keep trying for a while."
+                                               next_charts+=($chart)
+                                       fi
                                fi
-                               echo "END"
                        else
                                next_charts+=($chart)
                        fi
@@ -601,6 +611,9 @@ global_update() {
 
                test ${now_ms} -ge ${exit_at} && exit 0
        done
+
+       echo >&2 "$PROGRAM_NAME: Nothing left to do. Disabling charts.d.plugin."
+       echo "DISABLE"
 }
 
 global_update
index 0ccbd36e458210d67c81c6d9819a942dfa23f88d..d7815433e4db7769c8d725d4571dd8557654f770 100644 (file)
@@ -125,6 +125,8 @@ void *pluginsd_worker_thread(void *arg)
        uint32_t STOPPING_WAKE_ME_UP_PLEASE_HASH = simple_hash("STOPPING_WAKE_ME_UP_PLEASE");
 #endif
 
+       size_t count = 0;
+
        while(likely(1)) {
                if(unlikely(netdata_exit)) break;
 
@@ -137,7 +139,6 @@ void *pluginsd_worker_thread(void *arg)
                info("PLUGINSD: '%s' running on pid %d", cd->fullfilename, cd->pid);
 
                RRDSET *st = NULL;
-               unsigned long long count = 0;
                char *s;
                uint32_t hash;
 
@@ -182,8 +183,6 @@ void *pluginsd_worker_thread(void *arg)
                                if(unlikely(st->debug)) debug(D_PLUGINSD, "PLUGINSD: '%s' is setting dimension %s/%s to %s", cd->fullfilename, st->id, dimension, value?value:"<nothing>");
 
                                if(value) rrddim_set(st, dimension, strtoll(value, NULL, 0));
-
-                               count++;
                        }
                        else if(likely(hash == BEGIN_HASH && !strcmp(s, "BEGIN"))) {
                                char *id = words[1];
@@ -223,6 +222,8 @@ void *pluginsd_worker_thread(void *arg)
 
                                rrdset_done(st);
                                st = NULL;
+
+                               count++;
                        }
                        else if(likely(hash == FLUSH_HASH && !strcmp(s, "FLUSH"))) {
                                debug(D_PLUGINSD, "PLUGINSD: '%s' is requesting a FLUSH", cd->fullfilename);
@@ -386,17 +387,17 @@ void *pluginsd_worker_thread(void *arg)
                                break;
                        }
                }
+               if(likely(count)) {
+                       cd->successful_collections += count;
+                       cd->serial_failures = 0;
+               }
+               else
+                       cd->serial_failures++;
 
-               info("PLUGINSD: '%s' on pid %d stopped.", cd->fullfilename, cd->pid);
+               info("PLUGINSD: '%s' on pid %d stopped after %zu successful data collections (ENDs).", cd->fullfilename, cd->pid, count);
 
-               // fgets() failed or loop broke
+               // get the return code
                int code = mypclose(fp, cd->pid);
-               if(code == 1 || code == 127) {
-                       // 1 = DISABLE
-                       // 127 = cannot even run it
-                       error("PLUGINSD: '%s' (pid %d) exited with code %d. Disabling it.", cd->fullfilename, cd->pid, code);
-                       cd->enabled = 0;
-               }
 
                if(netdata_exit) {
                        cd->pid = 0;
@@ -406,14 +407,49 @@ void *pluginsd_worker_thread(void *arg)
                        return NULL;
                }
 
-               if(unlikely(!count && cd->enabled)) {
-                       error("PLUGINSD: '%s' (pid %d) does not generate useful output. Waiting a bit before starting it again.", cd->fullfilename, cd->pid);
-                       sleep((unsigned int) (cd->update_every * 10));
+               if(code != 0) {
+                       // the plugin reports failure
+
+                       if(likely(!cd->successful_collections)) {
+                               // nothing collected - disable it
+                               error("PLUGINSD: '%s' exited with error code %d. Disabling it.", cd->fullfilename, code);
+                               cd->enabled = 0;
+                       }
+                       else {
+                               // we have collected something
+
+                               if(likely(cd->serial_failures <= 10)) {
+                                       error("PLUGINSD: '%s' exited with error code %d, but has given useful output in the past (%zu times). Waiting a bit before starting it again.", cd->fullfilename, code, cd->successful_collections);
+                                       sleep((unsigned int) (cd->update_every * 10));
+                               }
+                               else {
+                                       error("PLUGINSD: '%s' exited with error code %d, but has given useful output in the past (%zu times). We tried %d times to restart it, but it failed to generate data. Disabling it.", cd->fullfilename, code, cd->successful_collections, cd->serial_failures);
+                                       cd->enabled = 0;
+                               }
+                       }
                }
+               else {
+                       // the plugin reports success
 
+                       if(unlikely(!cd->successful_collections)) {
+                               // we have collected nothing so far
+
+                               if(likely(cd->serial_failures <= 10)) {
+                                       error("PLUGINSD: '%s' (pid %d) does not generate useful output but it reports success (exits with 0). Waiting a bit before starting it again.", cd->fullfilename, cd->pid);
+                                       sleep((unsigned int) (cd->update_every * 10));
+                               }
+                               else {
+                                       error("PLUGINSD: '%s' (pid %d) does not generate useful output, although it reports success (exits with 0), but we have tried %d times to collect something. Disabling it.", cd->fullfilename, cd->pid, cd->serial_failures);
+                                       cd->enabled = 0;
+                               }
+                       }
+                       else
+                               sleep((unsigned int) cd->update_every);
+               }
                cd->pid = 0;
-               if(likely(cd->enabled)) sleep((unsigned int) cd->update_every);
-               else break;
+
+               if(unlikely(!cd->enabled))
+                       break;
        }
 
        cd->obsolete = 1;
index 11e89e0ac3e32cbd18040f8b11d4408a23395313..1a10dc0dc507dfce60dc4cfebf002e1aad8607b9 100644 (file)
@@ -20,9 +20,15 @@ struct plugind {
        pid_t pid;
        pthread_t thread;
 
-       int update_every;
-       int obsolete;
-       int enabled;
+       size_t successful_collections;          // the number of times we have seen
+                                                                               // values collected from this plugin
+
+       size_t serial_failures;                         // the number of times the plugin started
+                                                                               // without collecting values
+
+       int update_every;                                       // the plugin default data collection frequency
+       int obsolete;                                           // do not touch this structure after setting this to 1
+       int enabled;                                            // if this is enabled or not
 
        time_t started_t;
 
index 8fe28d45bae7d783c35bb1a1c689006267f4cf32..294d7bd7c55ab793ab5e7c52ed03ccc91fa6b921 100644 (file)
@@ -20,7 +20,7 @@
        <meta property="og:description" content="Stunning real-time dashboards, blazingly fast and extremely interactive. Zero configuration, zero dependencies, zero maintenance." />
 
        <script>var netdataTheme = 'slate';</script>
-       <script type="text/javascript" src="dashboard.js?v39"></script>
+       <script type="text/javascript" src="http://my-netdata.io/dashboard.js?v39"></script>
 </head>
 <body>
 
@@ -63,8 +63,8 @@
                                                data-gauge-max-value="32767"
                                                data-width="45%"
                                                data-after="-600"
-                                               data-points="40"
-                                               data-title="Updates Every 15&nbsp;Sec"
+                                               data-points="60"
+                                               data-title="Updates Every 10&nbsp;Sec"
                                                data-units="important metric"
                                                data-colors="#C55"
                                                ></div>
                                                data-easypiechart-max-value="32767"
                                                data-width="45%"
                                                data-after="-600"
-                                               data-points="40"
-                                               data-title="Updates Every 15&nbsp;Sec&nbsp;(<a href='https://github.com/OpenTSDB/opentsdb.net/blob/gh-pages/docs/source/user_guide/utilities/tcollector.rst#collecting-lots-of-metrics-with-tcollector' target='_blank'>OpenTSDB</a>)"
+                                               data-points="60"
+                                               data-title="Updates Every 10&nbsp;Sec"
                                                data-units="important metric"
                                                data-colors="#C55"
                                                ></div>
                                                data-width="45%"
                                                data-after="-600"
                                                data-points="2"
-                                               data-title="Updates Every 5&nbsp;Mins&nbsp;(your&nbsp;NMS)"
+                                               data-title="Updates Every 5&nbsp;Mins"
                                                data-units="important metric"
                                                data-colors="#C55"
                                                ></div>