From: Costa Tsaousis (ktsaou) Date: Thu, 9 Jun 2016 19:49:44 +0000 (+0300) Subject: properly handle the exit status of plugins to avoid infinite restart attempts; plugin... X-Git-Tag: v1.3.0~147^2 X-Git-Url: https://arthur.barton.de/gitweb/?p=netdata.git;a=commitdiff_plain;h=bb8cdaf9dd25372bbbd09d916e9afe7f907ec1d6 properly handle the exit status of plugins to avoid infinite restart attempts; plugins that report failure are now restarted up to 10 times if they have collected values in the past; apache fixed to work with latest apache; example plugin is now disabled by default to prevent starting bash in systems that dont have any other bash plugin used --- diff --git a/charts.d/apache.chart.sh b/charts.d/apache.chart.sh index f4f7d058..a3e06409 100755 --- a/charts.d/apache.chart.sh +++ b/charts.d/apache.chart.sh @@ -67,19 +67,14 @@ apache_detect() { # we will not check of the Conns* # keys, since these are apache 2.4 specific - if [ -z "${apache_key_accesses}" \ - -o -z "${apache_key_kbytes}" \ - -o -z "${apache_key_reqpersec}" \ - -o -z "${apache_key_bytespersec}" \ - -o -z "${apache_key_bytesperreq}" \ - -o -z "${apache_key_busyworkers}" \ - -o -z "${apache_key_idleworkers}" \ - -o -z "${apache_key_scoreboard}" \ - ] - then - echo >&2 "apache: Invalid response or missing keys from apache server: ${*}" - return 1 - fi + [ -z "${apache_key_accesses}" ] && echo >&2 "apache: missing 'Total Accesses' from apache server: ${*}" && return 1 + [ -z "${apache_key_kbytes}" ] && echo >&2 "apache: missing 'Total kBytes' from apache server: ${*}" && return 1 + [ -z "${apache_key_reqpersec}" ] && echo >&2 "apache: missing 'ReqPerSec' from apache server: ${*}" && return 1 + [ -z "${apache_key_bytespersec}" ] && echo >&2 "apache: missing 'BytesPerSec' from apache server: ${*}" && return 1 + [ -z "${apache_key_bytesperreq}" ] && echo >&2 "apache: missing 'BytesPerReq' from apache server: ${*}" && return 1 + [ -z "${apache_key_busyworkers}" ] && echo >&2 "apache: missing 'BusyWorkers' from apache server: ${*}" && return 1 + [ -z "${apache_key_idleworkers}" ] && echo >&2 "apache: missing 'IdleWorkers' from apache server: ${*}" && return 1 + [ -z "${apache_key_scoreboard}" ] && echo >&2 "apache: missing 'Scoreboard' from apache server: ${*}" && return 1 if [ ! -z "${apache_key_connstotal}" \ -a ! -z "${apache_key_connsasyncwriting}" \ @@ -88,6 +83,8 @@ apache_detect() { ] then apache_has_conns=1 + else + apache_has_conns=0 fi return 0 diff --git a/charts.d/example.chart.sh b/charts.d/example.chart.sh index ad205046..b20740e8 100755 --- a/charts.d/example.chart.sh +++ b/charts.d/example.chart.sh @@ -7,14 +7,21 @@ # between the calls of the _update() function example_update_every= +# the priority is used to sort the charts on the dashboard +# 1 = the first chart example_priority=150000 +# to enable this chart, you have to set this to 12345 +# (just a demonstration for something that needs to be checked) +example_magic_number= + # _check is called once, to find out if this chart should be enabled or not example_check() { # this should return: # - 0 to enable the chart # - 1 to disable the chart + [ "${example_magic_number}" != "12345" ] && return 1 return 0 } diff --git a/plugins.d/charts.d.plugin b/plugins.d/charts.d.plugin index 2824fa3c..543695dd 100755 --- a/plugins.d/charts.d.plugin +++ b/plugins.d/charts.d.plugin @@ -504,7 +504,7 @@ if [ -z "$run_charts" ] exit 1 fi -declare -A charts_last_update=() charts_update_every=() charts_next_update=() charts_run_counter=() +declare -A charts_last_update=() charts_update_every=() charts_next_update=() charts_run_counter=() charts_serial_failures=() global_update() { local exit_at \ c=0 dt ret last_ms exec_start_ms exec_end_ms \ @@ -522,13 +522,14 @@ global_update() { charts_last_update[$chart]=$((now_ms - (now_ms % (charts_update_every[$chart] * 1000) ) )) charts_next_update[$chart]=$(( charts_last_update[$chart] + (charts_update_every[$chart] * 1000) )) charts_run_counter[$chart]=0 + charts_serial_failures[$chart]=0 echo "CHART netdata.plugin_chartsd_$chart '' 'Execution time for $chart plugin' 'milliseconds / run' charts.d netdata.plugin_charts area 145000 ${charts_update_every[$chart]}" echo "DIMENSION run_time 'run time' absolute 1 1" done # the main loop - while [ 1 ] + while [ "${#next_charts[@]}" -gt 0 ] do c=$((c + 1)) now_charts=("${next_charts[@]}") @@ -570,15 +571,24 @@ global_update() { current_time_ms; exec_end_ms=$now_ms echo "BEGIN netdata.plugin_chartsd_$chart $dt" + echo "SET run_time = $(( exec_end_ms - exec_start_ms ))" + echo "END" + if [ $ret -eq 0 ] then - echo "SET run_time = $(( exec_end_ms - exec_start_ms ))" + charts_serial_failures[$chart]=0 next_charts+=($chart) else - echo "SET run_time = $(( (exec_end_ms - exec_start_ms) * -1 ))" - echo >&2 "$PROGRAM_NAME: chart '$chart' update() function reports failure. Disabling it." + charts_serial_failures[$chart]=$(( charts_serial_failures[$chart] + 1 )) + + if [ charts_serial_failures[$chart] -gt 10 ] + then + echo >&2 "$PROGRAM_NAME: chart '$chart' update() function reported failure ${charts_serial_failures[$chart]} times. Disabling it." + else + echo >&2 "$PROGRAM_NAME: chart '$chart' update() function reports failure. Will keep trying for a while." + next_charts+=($chart) + fi fi - echo "END" else next_charts+=($chart) fi @@ -601,6 +611,9 @@ global_update() { test ${now_ms} -ge ${exit_at} && exit 0 done + + echo >&2 "$PROGRAM_NAME: Nothing left to do. Disabling charts.d.plugin." + echo "DISABLE" } global_update diff --git a/src/plugins_d.c b/src/plugins_d.c index 0ccbd36e..d7815433 100644 --- a/src/plugins_d.c +++ b/src/plugins_d.c @@ -125,6 +125,8 @@ void *pluginsd_worker_thread(void *arg) uint32_t STOPPING_WAKE_ME_UP_PLEASE_HASH = simple_hash("STOPPING_WAKE_ME_UP_PLEASE"); #endif + size_t count = 0; + while(likely(1)) { if(unlikely(netdata_exit)) break; @@ -137,7 +139,6 @@ void *pluginsd_worker_thread(void *arg) info("PLUGINSD: '%s' running on pid %d", cd->fullfilename, cd->pid); RRDSET *st = NULL; - unsigned long long count = 0; char *s; uint32_t hash; @@ -182,8 +183,6 @@ void *pluginsd_worker_thread(void *arg) if(unlikely(st->debug)) debug(D_PLUGINSD, "PLUGINSD: '%s' is setting dimension %s/%s to %s", cd->fullfilename, st->id, dimension, value?value:""); if(value) rrddim_set(st, dimension, strtoll(value, NULL, 0)); - - count++; } else if(likely(hash == BEGIN_HASH && !strcmp(s, "BEGIN"))) { char *id = words[1]; @@ -223,6 +222,8 @@ void *pluginsd_worker_thread(void *arg) rrdset_done(st); st = NULL; + + count++; } else if(likely(hash == FLUSH_HASH && !strcmp(s, "FLUSH"))) { debug(D_PLUGINSD, "PLUGINSD: '%s' is requesting a FLUSH", cd->fullfilename); @@ -386,17 +387,17 @@ void *pluginsd_worker_thread(void *arg) break; } } + if(likely(count)) { + cd->successful_collections += count; + cd->serial_failures = 0; + } + else + cd->serial_failures++; - info("PLUGINSD: '%s' on pid %d stopped.", cd->fullfilename, cd->pid); + info("PLUGINSD: '%s' on pid %d stopped after %zu successful data collections (ENDs).", cd->fullfilename, cd->pid, count); - // fgets() failed or loop broke + // get the return code int code = mypclose(fp, cd->pid); - if(code == 1 || code == 127) { - // 1 = DISABLE - // 127 = cannot even run it - error("PLUGINSD: '%s' (pid %d) exited with code %d. Disabling it.", cd->fullfilename, cd->pid, code); - cd->enabled = 0; - } if(netdata_exit) { cd->pid = 0; @@ -406,14 +407,49 @@ void *pluginsd_worker_thread(void *arg) return NULL; } - if(unlikely(!count && cd->enabled)) { - error("PLUGINSD: '%s' (pid %d) does not generate useful output. Waiting a bit before starting it again.", cd->fullfilename, cd->pid); - sleep((unsigned int) (cd->update_every * 10)); + if(code != 0) { + // the plugin reports failure + + if(likely(!cd->successful_collections)) { + // nothing collected - disable it + error("PLUGINSD: '%s' exited with error code %d. Disabling it.", cd->fullfilename, code); + cd->enabled = 0; + } + else { + // we have collected something + + if(likely(cd->serial_failures <= 10)) { + error("PLUGINSD: '%s' exited with error code %d, but has given useful output in the past (%zu times). Waiting a bit before starting it again.", cd->fullfilename, code, cd->successful_collections); + sleep((unsigned int) (cd->update_every * 10)); + } + else { + error("PLUGINSD: '%s' exited with error code %d, but has given useful output in the past (%zu times). We tried %d times to restart it, but it failed to generate data. Disabling it.", cd->fullfilename, code, cd->successful_collections, cd->serial_failures); + cd->enabled = 0; + } + } } + else { + // the plugin reports success + if(unlikely(!cd->successful_collections)) { + // we have collected nothing so far + + if(likely(cd->serial_failures <= 10)) { + error("PLUGINSD: '%s' (pid %d) does not generate useful output but it reports success (exits with 0). Waiting a bit before starting it again.", cd->fullfilename, cd->pid); + sleep((unsigned int) (cd->update_every * 10)); + } + else { + error("PLUGINSD: '%s' (pid %d) does not generate useful output, although it reports success (exits with 0), but we have tried %d times to collect something. Disabling it.", cd->fullfilename, cd->pid, cd->serial_failures); + cd->enabled = 0; + } + } + else + sleep((unsigned int) cd->update_every); + } cd->pid = 0; - if(likely(cd->enabled)) sleep((unsigned int) cd->update_every); - else break; + + if(unlikely(!cd->enabled)) + break; } cd->obsolete = 1; diff --git a/src/plugins_d.h b/src/plugins_d.h index 11e89e0a..1a10dc0d 100644 --- a/src/plugins_d.h +++ b/src/plugins_d.h @@ -20,9 +20,15 @@ struct plugind { pid_t pid; pthread_t thread; - int update_every; - int obsolete; - int enabled; + size_t successful_collections; // the number of times we have seen + // values collected from this plugin + + size_t serial_failures; // the number of times the plugin started + // without collecting values + + int update_every; // the plugin default data collection frequency + int obsolete; // do not touch this structure after setting this to 1 + int enabled; // if this is enabled or not time_t started_t; diff --git a/web/demo2.html b/web/demo2.html index 8fe28d45..294d7bd7 100644 --- a/web/demo2.html +++ b/web/demo2.html @@ -20,7 +20,7 @@ - + @@ -63,8 +63,8 @@ data-gauge-max-value="32767" data-width="45%" data-after="-600" - data-points="40" - data-title="Updates Every 15 Sec" + data-points="60" + data-title="Updates Every 10 Sec" data-units="important metric" data-colors="#C55" > @@ -106,8 +106,8 @@ data-easypiechart-max-value="32767" data-width="45%" data-after="-600" - data-points="40" - data-title="Updates Every 15 Sec (OpenTSDB)" + data-points="60" + data-title="Updates Every 10 Sec" data-units="important metric" data-colors="#C55" > @@ -118,7 +118,7 @@ data-width="45%" data-after="-600" data-points="2" - data-title="Updates Every 5 Mins (your NMS)" + data-title="Updates Every 5 Mins" data-units="important metric" data-colors="#C55" >