From: Costa Tsaousis Date: Tue, 18 Oct 2016 20:56:38 +0000 (+0300) Subject: Merge pull request #1123 from ktsaou/master X-Git-Tag: v1.5.0~228 X-Git-Url: https://arthur.barton.de/gitweb/?a=commitdiff_plain;h=cc629d07bbf8986a7630d810cacf94fca58d566b;hp=a8f637780fb4debe91931b8635ec519ffb18a057;p=netdata.git Merge pull request #1123 from ktsaou/master alarm log is saved and loaded back; added fping.plugin --- diff --git a/.gitignore b/.gitignore index c281ea32..30962eaa 100644 --- a/.gitignore +++ b/.gitignore @@ -89,3 +89,10 @@ profile/benchmark-dictionary profile/benchmark-registry *.pyc + +diagrams/*.png +diagrams/*.svg +diagrams/*.atxt +diagrams/plantuml.jar + +netdata.cppcheck diff --git a/CMakeLists.txt b/CMakeLists.txt index e973f6eb..9c4aa760 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,8 +20,12 @@ set(NETDATA_SOURCE_FILES src/daemon.h src/dictionary.c src/dictionary.h + src/eval.c + src/eval.h src/global_statistics.c src/global_statistics.h + src/health.c + src/health.h src/log.c src/log.h src/main.c @@ -53,9 +57,9 @@ set(NETDATA_SOURCE_FILES src/proc_net_rpc_nfsd.c src/proc_net_snmp6.c src/proc_net_snmp.c + src/proc_net_softnet_stat.c src/proc_net_stat_conntrack.c src/proc_net_stat_synproxy.c - src/proc_net_softnet_stat.c src/proc_self_mountinfo.c src/proc_self_mountinfo.h src/proc_softirqs.c @@ -70,8 +74,8 @@ set(NETDATA_SOURCE_FILES src/rrd.h src/storage_number.c src/storage_number.h - src/sys_kernel_mm_ksm.c src/sys_fs_cgroup.c + src/sys_kernel_mm_ksm.c src/unit_test.c src/unit_test.h src/url.c @@ -84,7 +88,7 @@ set(NETDATA_SOURCE_FILES src/web_client.h src/web_server.c src/web_server.h - config.h src/health.h src/health.c src/eval.h src/eval.c) +) set(APPS_PLUGIN_SOURCE_FILES src/appconfig.c diff --git a/Makefile.am b/Makefile.am index ea1ac256..8b4bc837 100644 --- a/Makefile.am +++ b/Makefile.am @@ -39,6 +39,8 @@ SUBDIRS = \ $(NULL) dist_noinst_DATA= \ + diagrams/config.puml \ + diagrams/registry.puml \ configs.signatures \ Dockerfile \ netdata.spec \ @@ -47,7 +49,7 @@ dist_noinst_DATA= \ # until integrated within build # should be proper init.d/openrc/systemd usable dist_noinst_SCRIPTS= \ - ansible/netdata.yml \ + diagrams/build.sh \ coverity-scan.sh \ docker-build.sh \ netdata-installer.sh \ diff --git a/ansible/netdata.yml b/ansible/netdata.yml deleted file mode 100644 index 4c7e7368..00000000 --- a/ansible/netdata.yml +++ /dev/null @@ -1,62 +0,0 @@ ---- -- name: "Install pre-requisites" - apt: - name: "{{ item }}" - state: present - with_items: - - autoconf - - autoconf-archive - - autogen - - automake - - gcc - - git - - libmnl-dev - - make - - pkg-config - - uuid-dev - - zlib1g-dev - when: ansible_os_family == "Debian" - -- name: "Install pre-requisites" - yum: - name: "{{ item }}" - state: present - with_items: - - autoconf - - autoconf-archive - - autogen - - automake - - curl - - gcc - - git - - jq - - libmnl-devel - - libuuid-devel - - make - - pkgconfig - - zlib-devel - when: ansible_os_family == "RedHat" - -- name: "Clone repo" - git: - clone: yes - repo: https://github.com/firehol/netdata.git - dest: /tmp/netdata - -- name: "Installation" - shell: cd /tmp/netdata/ && ./netdata-installer.sh --dont-wait --libs-are-really-here - -- name: "Clean /tmp" - file: - path: /tmp/netdata - state: absent - -- name: "KillAll" - shell: killall netdata - -- name: "Daemon config" - systemd: - daemon_reload: yes - name: netdata - enabled: yes - state: started diff --git a/conf.d/Makefile.am b/conf.d/Makefile.am index 4b24ac05..c2ef258c 100644 --- a/conf.d/Makefile.am +++ b/conf.d/Makefile.am @@ -6,6 +6,7 @@ MAINTAINERCLEANFILES= $(srcdir)/Makefile.in dist_config_DATA = \ apps_groups.conf \ charts.d.conf \ + fping.conf \ node.d.conf \ python.d.conf \ health_alarm_notify.conf \ diff --git a/conf.d/fping.conf b/conf.d/fping.conf new file mode 100644 index 00000000..3bf80477 --- /dev/null +++ b/conf.d/fping.conf @@ -0,0 +1,32 @@ +# This plugin requires a special version of fping. +# Get it from https://github.com/ktsaou/fping +# and build it, like this: +# +# cd /usr/src +# git clone https://github.com/ktsaou/fping.git fping-netdata.git +# cd fping-netdata.git +# ./autogen.sh +# ./configure --prefix=/usr/local +# make +# cp src/fping /usr/local/bin/ +# chown root:root /usr/local/bin/fping +# chmod 4755 /usr/local/bin/fping +# +# ----------------------------------------------------------------------------- +# configuration options +# can be overwritten at /etc/netdata/fping.conf + +# the fping binary to use +# we need one that can output netdata friendly info +fping="$(which fping || command -v fping)" + +# a space separated list of hosts to fping +# it is best to put hostnames here +hosts="" + +# the time in milliseconds (1 sec = 1000 ms) +# to ping the hosts - by default 2 pings per iteration +ping_every="$((update_every * 1000 / 2))" + +# how many retries to make if a host does not respond +retries=1 diff --git a/configs.signatures b/configs.signatures index be8d1258..d4361555 100644 --- a/configs.signatures +++ b/configs.signatures @@ -29,6 +29,20 @@ declare -A configs_signatures=( ['7cf6402b51e5070f2be3ad6fe059ff89']='charts.d.conf' ['a02d14124b19c635c1426cee2e98bac5']='charts.d.conf' ['ca026d7c779f0a7cb7787713c5be5c47']='charts.d.conf' + ['1f5545b3ff52b3eb75ee05401f67a9bc']='fping.conf' + ['2fa8fb929fd597f2ab97b6efc540a043']='health_alarm_notify.conf' + ['42ad0e70b1365b6e7244cc305dbaa529']='health_alarm_notify.conf' + ['707a63f53f4b32e01d134ae90ba94aad']='health_alarm_notify.conf' + ['8fd472a854b0996327e8ed3562161182']='health_alarm_notify.conf' + ['a55133f1b0be0a4255057849dd451b09']='health_alarm_notify.conf' + ['a89c516a1144435a88decf25509318ac']='health_alarm_notify.conf' + ['b68706bb8101ef85192db92f865a5d80']='health_alarm_notify.conf' + ['c080e006f544c949baca33cc24a9c126']='health_alarm_notify.conf' + ['ce2e8768964a936f58c4c2144aee8a01']='health_alarm_notify.conf' + ['e3023092e3b2bbb5351e0fe6682f4fe9']='health_alarm_notify.conf' + ['f8dade4484f1b6a48655388502df7d5a']='health_alarm_notify.conf' + ['fd3164e6e8cb6726706267eae49aa082']='health_alarm_notify.conf' + ['ff1b3d8ae8b2149c711d8da9b7a9c4bd']='health_alarm_notify.conf' ['074df527cc70b5f38c0714f08f20e57c']='health.d/apache.conf' ['174c21a6ce5de97bda83d502aa47a9f8']='health.d/apache.conf' ['3848172053221b95279ba9bf789cd4e0']='health.d/apache.conf' @@ -152,22 +166,14 @@ declare -A configs_signatures=( ['ca08a9b18d38ae0a0f5081a7cdc96863']='health.d/swap.conf' ['da29d2ab1ab7b8fda189960c840e5144']='health.d/swap.conf' ['b3fc4749b132e55ac0d3a0f92859237e']='health.d/tcp_resets.conf' - ['2fa8fb929fd597f2ab97b6efc540a043']='health_alarm_notify.conf' - ['42ad0e70b1365b6e7244cc305dbaa529']='health_alarm_notify.conf' - ['707a63f53f4b32e01d134ae90ba94aad']='health_alarm_notify.conf' - ['8fd472a854b0996327e8ed3562161182']='health_alarm_notify.conf' - ['a55133f1b0be0a4255057849dd451b09']='health_alarm_notify.conf' - ['a89c516a1144435a88decf25509318ac']='health_alarm_notify.conf' - ['b68706bb8101ef85192db92f865a5d80']='health_alarm_notify.conf' - ['c080e006f544c949baca33cc24a9c126']='health_alarm_notify.conf' - ['ce2e8768964a936f58c4c2144aee8a01']='health_alarm_notify.conf' - ['e3023092e3b2bbb5351e0fe6682f4fe9']='health_alarm_notify.conf' - ['f8dade4484f1b6a48655388502df7d5a']='health_alarm_notify.conf' - ['fd3164e6e8cb6726706267eae49aa082']='health_alarm_notify.conf' - ['ff1b3d8ae8b2149c711d8da9b7a9c4bd']='health_alarm_notify.conf' ['707a63f53f4b32e01d134ae90ba94aad']='health_email_recipients.conf' ['a752e51d923e15add4a11fa8f3be935a']='health_email_recipients.conf' ['73a8e10dfe4183aca751e9e2a80dabe3']='node.d.conf' + ['6b917300747e7e8314844237e2462261']='python.d/apache_cache.conf' + ['e0e96cc47ed61d6492416be5236cd4d3']='python.d/apache_cache.conf' + ['5278ebbae19c60db600f0a119cb3664e']='python.d/apache.conf' + ['5829812db29598db5857c9f433e96fef']='python.d/apache.conf' + ['6bf0de6e3b251b765b10a71d8c5c319d']='python.d/apache.conf' ['13141998a5d71308d9c119834c27bfd3']='python.d.conf' ['38d1bf04fe9901481dd6febcc0404a86']='python.d.conf' ['4b775fb31342f1478b3773d041a72911']='python.d.conf' @@ -183,11 +189,6 @@ declare -A configs_signatures=( ['bba2f3886587f137ea08a6e63dd3d376']='python.d.conf' ['d55be5bb5e108da1e7645da007c53cd4']='python.d.conf' ['f82924563e41d99cdae5431f0af69155']='python.d.conf' - ['5278ebbae19c60db600f0a119cb3664e']='python.d/apache.conf' - ['5829812db29598db5857c9f433e96fef']='python.d/apache.conf' - ['6bf0de6e3b251b765b10a71d8c5c319d']='python.d/apache.conf' - ['6b917300747e7e8314844237e2462261']='python.d/apache_cache.conf' - ['e0e96cc47ed61d6492416be5236cd4d3']='python.d/apache_cache.conf' ['7830066c46a7e5f9682b8d3f4566b4e5']='python.d/cpufreq.conf' ['b5b5a8d6d991fb1cef8d80afa23ba114']='python.d/cpufreq.conf' ['dc0d2b96378f290eec3fcf98b89ad824']='python.d/cpufreq.conf' diff --git a/diagrams/build.sh b/diagrams/build.sh new file mode 100755 index 00000000..53f0ea7a --- /dev/null +++ b/diagrams/build.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash + +path=$(dirname "$0") +cd "${path}" || exit 1 + +if [ ! -f "plantuml.jar" ] +then + echo >&2 "Please download 'plantuml.jar' from http://plantuml.com/ and put it the same folder with me." + exit 1 +fi + +for x in *.puml +do + [ "${x}" = "config.puml" ] && continue + + echo >&2 "Working on ${x}..." + java -jar plantuml.jar -tpng "${x}" + java -jar plantuml.jar -tsvg "${x}" + # java -jar plantuml.jar -ttxt "${x}" +done diff --git a/diagrams/config.puml b/diagrams/config.puml new file mode 100644 index 00000000..0ce0932f --- /dev/null +++ b/diagrams/config.puml @@ -0,0 +1,46 @@ + +skinparam handwritten true +skinparam monochrome true +skinparam roundcorner 15 + +skinparam sequence { + ArrowThickness 3 + + DividerFontColor Black + DividerFontName Comic Sans MS + DividerFontSize 15 + DividerFontStyle Italic + + DelayFontColor Black + DelayFontName Comic Sans MS + DelayFontSize 15 + DelayFontStyle Italic + + TitleFontColor Black + TitleFontName Comic Sans MS + TitleFontStyle Italic + TitleFontSize 25 + + ArrowColor DeepSkyBlue + ArrowFontColor Black + ArrowFontName Comic Sans MS + ArrowFontStyle Regular + ArrowFontSize 19 + + ActorBorderColor DeepSkyBlue + + LifeLineBorderColor blue + LifeLineBackgroundColor #A9DCDF + + ParticipantBorderColor DeepSkyBlue + ParticipantBackgroundColor LightBlue + ParticipantFontName Comic Sans MS + ParticipantFontSize 20 + ParticipantFontColor Black + + ActorBackgroundColor aqua + ActorFontColor Black + ActorFontSize 20 + ActorFontName Comic Sans MS +} + diff --git a/diagrams/registry.puml b/diagrams/registry.puml new file mode 100644 index 00000000..51a337fa --- /dev/null +++ b/diagrams/registry.puml @@ -0,0 +1,40 @@ +@startuml +!include config.puml + +title netdata registry operation +actor "web browser" as user +participant "netdata 1" as n1 +participant "registry 1" as r1 +autonumber "0." + +== standard dashboard communication == + +user ->n1 : \ + hi, give me the dashboard + +n1 --> user : \ + welcome, here it is... + +... a few seconds later ... + +== registry related communication == + +user -> n1 : \ + now give me registry information + +n1 --> user: \ + here it is, talk to registry 1 + +note left of r1 #eee: \ + only your web browser \n\ + talks to the registry + +user -> r1 : \ + Hey registry 1, \ +I am accessing netdata 1... + +r1 --> user : \ + nice!, here are other netdata servers \ +you have accessed in the past + +@enduml diff --git a/netdata-installer.sh b/netdata-installer.sh index c3b97243..930243c4 100755 --- a/netdata-installer.sh +++ b/netdata-installer.sh @@ -15,6 +15,12 @@ fi LC_ALL=C umask 022 +# Be nice on production environments +renice 19 $$ >/dev/null 2>/dev/null + +processors=$(cat /proc/cpuinfo | grep ^processor | wc -l) +[ $(( processors )) -lt 1 ] && processors=1 + # you can set CFLAGS before running installer CFLAGS="${CFLAGS--O3}" @@ -133,7 +139,7 @@ get_git_config_signatures() { echo >configs.signatures.tmp - for x in $(find conf.d -name \*.conf) + for x in $(find conf.d -name \*.conf | sort) do x="${x/conf.d\//}" echo "${x}" @@ -427,7 +433,7 @@ if [ -f src/netdata ] fi echo >&2 "Compiling netdata ..." -run make || exit 1 +run make -j${processors} || exit 1 if [ "${BASH_VERSINFO[0]}" -ge "4" ] then @@ -713,35 +719,35 @@ isnetdata() { } stop_netdata_on_pid() { - local pid="$1" ret=0 count=0 + local pid="${1}" ret=0 count=0 - isnetdata $pid || return 0 + isnetdata ${pid} || return 0 - printf >&2 "Stopping netdata on pid $pid ..." - while [ ! -z "$pid" -a $ret -eq 0 ] + printf >&2 "Stopping netdata on pid ${pid} ..." + while [ ! -z "$pid" -a ${ret} -eq 0 ] do - if [ $count -gt 45 ] + if [ ${count} -gt 45 ] then - echo >&2 "Cannot stop the running netdata on pid $pid." + echo >&2 "Cannot stop the running netdata on pid ${pid}." return 1 fi count=$(( count + 1 )) - run kill $pid 2>/dev/null + run kill ${pid} 2>/dev/null ret=$? - test $ret -eq 0 && printf >&2 "." && sleep 2 + test ${ret} -eq 0 && printf >&2 "." && sleep 2 done echo >&2 - if [ $ret -eq 0 ] + if [ ${ret} -eq 0 ] then - echo >&2 "SORRY! CANNOT STOP netdata ON PID $pid !" + echo >&2 "SORRY! CANNOT STOP netdata ON PID ${pid} !" return 1 fi - echo >&2 "netdata on pid $pid stopped." + echo >&2 "netdata on pid ${pid} stopped." return 0 } @@ -755,7 +761,7 @@ stop_all_netdata() { $(cat /var/run/netdata/netdata.pid 2>/dev/null) \ $(pidof netdata 2>/dev/null) do - stop_netdata_on_pid $p + stop_netdata_on_pid ${p} done } @@ -854,13 +860,13 @@ if [ ! -s "${NETDATA_PREFIX}/etc/netdata/netdata.conf" ] ret=$? # try curl - if [ $ret -ne 0 -o ! -s "${NETDATA_PREFIX}/etc/netdata/netdata.conf.new" ] + if [ ${ret} -ne 0 -o ! -s "${NETDATA_PREFIX}/etc/netdata/netdata.conf.new" ] then curl -s -o "${NETDATA_PREFIX}/etc/netdata/netdata.conf.new" "http://localhost:${NETDATA_PORT}/netdata.conf" ret=$? fi - if [ $ret -eq 0 -a -s "${NETDATA_PREFIX}/etc/netdata/netdata.conf.new" ] + if [ ${ret} -eq 0 -a -s "${NETDATA_PREFIX}/etc/netdata/netdata.conf.new" ] then mv "${NETDATA_PREFIX}/etc/netdata/netdata.conf.new" "${NETDATA_PREFIX}/etc/netdata/netdata.conf" echo >&2 "New configuration saved for you to edit at ${NETDATA_PREFIX}/etc/netdata/netdata.conf" diff --git a/plugins.d/Makefile.am b/plugins.d/Makefile.am index 4bc0dc44..326192f8 100644 --- a/plugins.d/Makefile.am +++ b/plugins.d/Makefile.am @@ -13,6 +13,7 @@ dist_plugins_SCRIPTS = \ cgroup-name.sh \ charts.d.dryrun-helper.sh \ charts.d.plugin \ + fping.plugin \ node.d.plugin \ python.d.plugin \ tc-qos-helper.sh \ diff --git a/plugins.d/charts.d.plugin b/plugins.d/charts.d.plugin index df9998ec..a21f3cac 100755 --- a/plugins.d/charts.d.plugin +++ b/plugins.d/charts.d.plugin @@ -397,16 +397,16 @@ do [ $debug -eq 1 ] && echo >&2 "$PROGRAM_NAME: loading chart: '$chartsd/$chart.chart.sh'" . "$chartsd/$chart.chart.sh" - if [ -f "$confd/charts.d/$chart.conf" ] + if [ -f "$confd/$PROGRAM_NAME/$chart.conf" ] then - [ $debug -eq 1 ] && echo >&2 "$PROGRAM_NAME: loading chart options: '$confd/charts.d/$chart.conf'" - . "$confd/charts.d/$chart.conf" + [ $debug -eq 1 ] && echo >&2 "$PROGRAM_NAME: loading chart options: '$confd/$PROGRAM_NAME/$chart.conf'" + . "$confd/$PROGRAM_NAME/$chart.conf" elif [ -f "$confd/$chart.conf" ] then [ $debug -eq 1 ] && echo >&2 "$PROGRAM_NAME: loading chart options: '$confd/$chart.conf'" . "$confd/$chart.conf" else - echo >&2 "$PROGRAM_NAME: $chart: configuration file '$confd/charts.d/$chart.conf' not found. Using defaults." + echo >&2 "$PROGRAM_NAME: $chart: configuration file '$confd/$PROGRAM_NAME/$chart.conf' not found. Using defaults." fi eval "dt=\$$chart$suffix_update_every" diff --git a/plugins.d/fping.plugin b/plugins.d/fping.plugin new file mode 100755 index 00000000..fa3b62f5 --- /dev/null +++ b/plugins.d/fping.plugin @@ -0,0 +1,85 @@ +#!/usr/bin/env bash + +me="${0}" + +# the frequency to send info to netdata +# passed by netdata as the first parameter +update_every="${1-1}" + +# the netdata configuration directory +# passed by netdata as an environment variable +NETDATA_CONFIG_DIR="${NETDATA_CONFIG_DIR-/etc/netdata}" + +# ----------------------------------------------------------------------------- + +# This plugin requires a special version of fping. +# Get it from https://github.com/ktsaou/fping +# and build it, like this: +# +# cd /usr/src +# git clone https://github.com/ktsaou/fping.git fping-netdata.git +# cd fping-netdata.git +# ./autogen.sh +# ./configure --prefix=/usr/local +# make +# cp src/fping /usr/local/bin/ +# chown root:root /usr/local/bin/fping +# chmod 4755 /usr/local/bin/fping +# +# Then, create /etc/netdata/fping.conf +# and set the configuration options given below + +# ----------------------------------------------------------------------------- +# configuration options +# can be overwritten at /etc/netdata/fping.conf + +# the fping binary to use +# we need one that can output netdata friendly info +fping="$(which fping || command -v fping)" + +# a space separated list of hosts to fping +hosts="" + +# the time in milliseconds (1 sec = 1000 ms) +# to ping the hosts - by default 2 pings per iteration +ping_every="$((update_every * 1000 / 2))" + +# how many retries to make if a host does not respond +retries=1 + +# ----------------------------------------------------------------------------- + +# load the configuration file +if [ ! -f "${NETDATA_CONFIG_DIR}/fping.conf" ] +then + echo >&2 "${me}: configuration file '${NETDATA_CONFIG_DIR}/fping.conf' not found - nothing to do." + echo "DISABLE" + exit 1 +fi + +source "${NETDATA_CONFIG_DIR}/fping.conf" + +if [ -z "${hosts}" ] +then + echo >&2 "${me}: no hosts configued in '${NETDATA_CONFIG_DIR}/fping.conf' - nothing to do." + echo "DISABLE" + exit 1 +fi + +if [ -z "${fping}" -o ! -x "${fping}" ] +then + echo >&2 "${me}: command '${fping}' is not executable - cannot proceed." + echo "DISABLE" + exit 1 +fi + +# the fping options we will use +options=( -N -l -R -Q ${update_every} -p ${ping_every} -r ${retries} ${hosts} ) + +# execute fping +exec "${fping}" "${options[@]}" + +# if we cannot execute fping, stop +echo >&2 "${me}: command '${fping} ${options[@]}' failed to be executed." +echo "DISABLE" +exit 1 diff --git a/src/common.c b/src/common.c index e1925ff5..ce348d79 100644 --- a/src/common.c +++ b/src/common.c @@ -224,7 +224,7 @@ int sleep_usec(unsigned long long usec) { while (nanosleep(&req, &rem) == -1) { if (likely(errno == EINTR)) { - info("nanosleep() interrupted (while sleeping for %llu microseconds).", usec); + debug(D_SYSTEM, "nanosleep() interrupted (while sleeping for %llu microseconds).", usec); req.tv_sec = rem.tv_sec; req.tv_nsec = rem.tv_nsec; } else { @@ -1121,7 +1121,7 @@ long get_system_cpus(void) { procfile_close(ff); - info("System has %d processors.", processors); + debug(D_SYSTEM, "System has %d processors.", processors); return processors; } @@ -1152,7 +1152,7 @@ pid_t get_system_pid_max(void) { } procfile_close(ff); - info("System supports %d pids.", pid_max); + debug(D_SYSTEM, "System supports %d pids.", pid_max); return pid_max; } diff --git a/src/daemon.c b/src/daemon.c index 1c34405d..5bf6a1e0 100644 --- a/src/daemon.c +++ b/src/daemon.c @@ -138,7 +138,7 @@ void oom_score_adj(int score) { if(!done) error("Cannot adjust my Out-Of-Memory score to %d.", score); else - info("Adjusted my Out-Of-Memory score to %d.", score); + debug(D_SYSTEM, "Adjusted my Out-Of-Memory score to %d.", score); } int sched_setscheduler_idle(void) { @@ -151,7 +151,7 @@ int sched_setscheduler_idle(void) { if(i != 0) error("Cannot adjust my scheduling priority to IDLE."); else - info("Adjusted my scheduling priority to IDLE."); + debug(D_SYSTEM, "Adjusted my scheduling priority to IDLE."); return i; #else @@ -214,14 +214,14 @@ int become_daemon(int dont_fork, const char *user) // never become a problem if(sched_setscheduler_idle() != 0) { if(nice(19) == -1) error("Cannot lower my CPU priority."); - else info("Set my nice value to 19."); + else debug(D_SYSTEM, "Set my nice value to 19."); } if(user && *user) { if(become_user(user, pidfd) != 0) { error("Cannot become user '%s'. Continuing as we are.", user); } - else info("Successfully became user '%s'.", user); + else debug(D_SYSTEM, "Successfully became user '%s'.", user); } if(pidfd != -1) { diff --git a/src/health.c b/src/health.c index 596b143a..53fbcdca 100644 --- a/src/health.c +++ b/src/health.c @@ -6,6 +6,7 @@ struct health_options { const char *health_default_exec; const char *health_default_recipient; const char *log_filename; + size_t log_entries_written; FILE *log_fp; }; @@ -13,6 +14,7 @@ static struct health_options health = { .health_default_exec = PLUGINS_DIR "/alarm-notify.sh", .health_default_recipient = "root", .log_filename = VARLIB_DIR "/health/alarm_log.db", + .log_entries_written = 0, .log_fp = NULL }; @@ -30,11 +32,11 @@ static inline int health_alarm_log_open(void) { if(health.log_fp) { if (setvbuf(health.log_fp, NULL, _IOLBF, 0) != 0) - error("Cannot set line buffering on health log file."); + error("Health: cannot set line buffering on health log file."); return 0; } - error("Cannot open health log file '%s'. Health data will be lost in case of netdata or server crash.", health.log_filename); + error("Health: cannot open health log file '%s'. Health data will be lost in case of netdata or server crash.", health.log_filename); return -1; } @@ -45,51 +47,310 @@ static inline void health_alarm_log_close(void) { } } -static inline void health_log_recreate(void) { - if(health.log_fp != NULL) { +static inline void health_log_rotate(void) { + static size_t rotate_every = 0; + + if(unlikely(rotate_every == 0)) { + rotate_every = (size_t)config_get_number("health", "rotate log every lines", 2000); + if(rotate_every < 100) rotate_every = 100; + } + + if(unlikely(health.log_entries_written > rotate_every)) { health_alarm_log_close(); + char old_filename[FILENAME_MAX + 1]; + snprintfz(old_filename, FILENAME_MAX, "%s.old", health.log_filename); + + if(unlink(old_filename) == -1 && errno != ENOENT) + error("Health: cannot remove old alarms log file '%s'", old_filename); + + if(link(health.log_filename, old_filename) == -1 && errno != ENOENT) + error("Health: cannot move file '%s' to '%s'.", health.log_filename, old_filename); + + if(unlink(health.log_filename) == -1 && errno != ENOENT) + error("Health: cannot remove old alarms log file '%s'", health.log_filename); + // open it with truncate health.log_fp = fopen(health.log_filename, "w"); - if(health.log_fp) fclose(health.log_fp); - else error("Cannot truncate health log '%s'", health.log_filename); + + if(health.log_fp) + fclose(health.log_fp); + else + error("Health: cannot truncate health log '%s'", health.log_filename); health.log_fp = NULL; + health.log_entries_written = 0; health_alarm_log_open(); } } static inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) { - (void)host; - (void)ae; - -/* if(likely(health.log_fp)) { - if(unlikely(fprintf(health.log_fp, "A\t%s\t%08x\t%08x\t%08x\t%08x\t%08x\t%08x\t%s\t%s\t%s\t%s\t%s\t%08x\n", - host->hostname, - ae->unique_id, - ae->alarm_id, - ae->alarm_event_id, - (uint32_t)ae->when, - (uint32_t)ae->duration, - (uint32_t)ae->non_clear_duration, - (uint32_t)ae->exec_run_timestamp, - ae->name, - ae->chart, - ae->family, - ae->exec, - ae->recipient - ) < 0)) + health_log_rotate(); + + if(likely(health.log_fp)) { + if(unlikely(fprintf(health.log_fp + , "%c\t%s" + "\t%08x\t%08x\t%08x\t%08x\t%08x" + "\t%08x\t%08x\t%08x" + "\t%08x\t%08x\t%08x" + "\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" + "\t%d\t%d\t%d\t%d" + "\t%Lf\t%Lf" + "\n" + , (ae->flags & HEALTH_ENTRY_FLAG_SAVED)?'U':'A' + , host->hostname + + , ae->unique_id + , ae->alarm_id + , ae->alarm_event_id + , ae->updated_by_id + , ae->updates_id + + , (uint32_t)ae->when + , (uint32_t)ae->duration + , (uint32_t)ae->non_clear_duration + , (uint32_t)ae->flags + , (uint32_t)ae->exec_run_timestamp + , (uint32_t)ae->delay_up_to_timestamp + + , (ae->name)?ae->name:"" + , (ae->chart)?ae->chart:"" + , (ae->family)?ae->family:"" + , (ae->exec)?ae->exec:"" + , (ae->recipient)?ae->recipient:"" + , (ae->source)?ae->source:"" + , (ae->units)?ae->units:"" + , (ae->info)?ae->info:"" + + , ae->exec_code + , ae->new_status + , ae->old_status + , ae->delay + + , (long double)ae->new_value + , (long double)ae->old_value + ) < 0)) error("Health: failed to save alarm log entry. Health data may be lost in case of abnormal restart."); + else { + ae->flags |= HEALTH_ENTRY_FLAG_SAVED; + health.log_entries_written++; + } } -*/ +} + +static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filename) { + static uint32_t max_unique_id = 0, max_alarm_id = 0; + ssize_t loaded = -1, updated = -1, errored = -1, duplicate = -1; + + errno = 0; + + char *s, *buf = mallocz(65536 + 1); + size_t line = 0, len = 0; + loaded = updated = errored = duplicate = 0; + + pthread_rwlock_rdlock(&host->health_log.alarm_log_rwlock); + + while((s = fgets_trim_len(buf, 65536, fp, &len))) { + health.log_entries_written++; + line++; + + int max_entries = 30, entries = 0; + char *pointers[max_entries]; + + pointers[entries++] = s++; + while(*s) { + if(unlikely(*s == '\t')) { + *s = '\0'; + pointers[entries++] = ++s; + if(entries >= max_entries) { + error("Health: line %zu of file '%s' has more than %d entries. Ignoring excessive entries.", line, filename, max_entries); + break; + } + } + else s++; + } + + if(likely(*pointers[0] == 'U' || *pointers[0] == 'A')) { + ALARM_ENTRY *ae = NULL; + + if(entries < 26) { + error("Health: line %zu of file '%s' should have at least 26 entries, but it has %d. Ignoring it.", line, filename, entries); + errored++; + continue; + } + + // check that we have valid ids + uint32_t unique_id = (uint32_t)strtoul(pointers[2], NULL, 16); + if(!unique_id) { + error("Health: line %zu of file '%s' states alarm entry with invalid unique id %u (%s). Ignoring it.", line, filename, unique_id, pointers[2]); + errored++; + continue; + } + + uint32_t alarm_id = (uint32_t)strtoul(pointers[3], NULL, 16); + if(!alarm_id) { + error("Health: line %zu of file '%s' states alarm entry for invalid alarm id %u (%s). Ignoring it.", line, filename, alarm_id, pointers[3]); + errored++; + continue; + } + + if(unlikely(*pointers[0] == 'A')) { + // make sure it is properly numbered + if(unlikely(host->health_log.alarms && unique_id < host->health_log.alarms->unique_id)) { + error("Health: line %zu of file '%s' has alarm log entry with %u in wrong order. Ignoring it.", line, filename, unique_id); + errored++; + continue; + } + + ae = callocz(1, sizeof(ALARM_ENTRY)); + } + else if(unlikely(*pointers[0] == 'U')) { + // find the original + for(ae = host->health_log.alarms; ae; ae = ae->next) { + if(unlikely(unique_id == ae->unique_id)) { + if(unlikely(*pointers[0] == 'A')) { + error("Health: line %zu of file '%s' adds duplicate alarm log entry with unique id %u. Using the later." + , line, filename, unique_id); + *pointers[0] = 'U'; + duplicate++; + } + break; + } + else if(unlikely(unique_id > ae->unique_id)) { + // no need to continue + // the linked list is sorted + ae = NULL; + break; + } + } + + // if not found, skip this line + if(!ae) { + // error("Health: line %zu of file '%s' updates alarm log entry with unique id %u, but it is not found.", line, filename, unique_id); + continue; + } + } + + // check for a possible host missmatch + //if(strcmp(pointers[1], host->hostname)) + // error("Health: line %zu of file '%s' provides an alarm for host '%s' but this is named '%s'.", line, filename, pointers[1], host->hostname); + + ae->unique_id = unique_id; + ae->alarm_id = alarm_id; + ae->alarm_event_id = (uint32_t)strtoul(pointers[4], NULL, 16); + ae->updated_by_id = (uint32_t)strtoul(pointers[5], NULL, 16); + ae->updates_id = (uint32_t)strtoul(pointers[6], NULL, 16); + + ae->when = (uint32_t)strtoul(pointers[7], NULL, 16); + ae->duration = (uint32_t)strtoul(pointers[8], NULL, 16); + ae->non_clear_duration = (uint32_t)strtoul(pointers[9], NULL, 16); + + ae->flags = (uint32_t)strtoul(pointers[10], NULL, 16); + ae->flags |= HEALTH_ENTRY_FLAG_SAVED; + + ae->exec_run_timestamp = (uint32_t)strtoul(pointers[11], NULL, 16); + ae->delay_up_to_timestamp = (uint32_t)strtoul(pointers[12], NULL, 16); + + if(unlikely(ae->name)) freez(ae->name); + ae->name = strdupz(pointers[13]); + ae->hash_name = simple_hash(ae->name); + + if(unlikely(ae->chart)) freez(ae->chart); + ae->chart = strdupz(pointers[14]); + ae->hash_chart = simple_hash(ae->chart); + + if(unlikely(ae->family)) freez(ae->family); + ae->family = strdupz(pointers[15]); + + if(unlikely(ae->exec)) freez(ae->exec); + ae->exec = strdupz(pointers[16]); + if(!*ae->exec) { freez(ae->exec); ae->exec = NULL; } + + if(unlikely(ae->recipient)) freez(ae->recipient); + ae->recipient = strdupz(pointers[17]); + if(!*ae->recipient) { freez(ae->recipient); ae->recipient = NULL; } + + if(unlikely(ae->source)) freez(ae->source); + ae->source = strdupz(pointers[18]); + if(!*ae->source) { freez(ae->source); ae->source = NULL; } + + if(unlikely(ae->units)) freez(ae->units); + ae->units = strdupz(pointers[19]); + if(!*ae->units) { freez(ae->units); ae->units = NULL; } + + if(unlikely(ae->info)) freez(ae->info); + ae->info = strdupz(pointers[20]); + if(!*ae->info) { freez(ae->info); ae->info = NULL; } + + ae->exec_code = atoi(pointers[21]); + ae->new_status = atoi(pointers[22]); + ae->old_status = atoi(pointers[23]); + ae->delay = atoi(pointers[24]); + + ae->new_value = strtold(pointers[25], NULL); + ae->old_value = strtold(pointers[26], NULL); + + // add it to host if not already there + if(unlikely(*pointers[0] == 'A')) { + ae->next = host->health_log.alarms; + host->health_log.alarms = ae; + loaded++; + } + else updated++; + + if(unlikely(ae->unique_id > max_unique_id)) + max_unique_id = ae->unique_id; + + if(unlikely(ae->alarm_id >= max_alarm_id)) + max_alarm_id = ae->alarm_id; + } + else { + error("Health: line %zu of file '%s' is invalid (unrecognized entry type '%s').", line, filename, pointers[0]); + errored++; + } + } + + pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock); + + freez(buf); + + if(!max_unique_id) max_unique_id = (uint32_t)time(NULL); + if(!max_alarm_id) max_alarm_id = (uint32_t)time(NULL); + + host->health_log.next_log_id = max_unique_id + 1; + host->health_log.next_alarm_id = max_alarm_id + 1; + + debug(D_HEALTH, "Health: loaded file '%s' with %zd new alarm entries, updated %zd alarms, errors %zd entries, duplicate %zd", filename, loaded, updated, errored, duplicate); + return loaded; } static inline void health_alarm_log_load(RRDHOST *host) { - (void)host; + health_alarm_log_close(); + char filename[FILENAME_MAX + 1]; + snprintfz(filename, FILENAME_MAX, "%s.old", health.log_filename); + FILE *fp = fopen(filename, "r"); + if(!fp) + error("Health: cannot open health file: %s", filename); + else { + health_alarm_log_read(host, fp, filename); + fclose(fp); + } + + health.log_entries_written = 0; + fp = fopen(health.log_filename, "r"); + if(!fp) + error("Health: cannot open health file: %s", health.log_filename); + else { + health_alarm_log_read(host, fp, health.log_filename); + fclose(fp); + } + + health_alarm_log_open(); } + // ---------------------------------------------------------------------------- // health alarm log management @@ -152,8 +413,8 @@ static inline void health_alarm_log(RRDHOST *host, ALARM_ENTRY *t; for(t = host->health_log.alarms ; t ; t = t->next) { if(t != ae && t->alarm_id == ae->alarm_id) { - if(!(t->notifications & HEALTH_ENTRY_NOTIFICATIONS_UPDATED) && !t->updated_by_id) { - t->notifications |= HEALTH_ENTRY_NOTIFICATIONS_UPDATED; + if(!(t->flags & HEALTH_ENTRY_FLAG_UPDATED) && !t->updated_by_id) { + t->flags |= HEALTH_ENTRY_FLAG_UPDATED; t->updated_by_id = ae->unique_id; ae->updates_id = t->unique_id; @@ -163,10 +424,9 @@ static inline void health_alarm_log(RRDHOST *host, health_alarm_log_save(host, t); } - else { - // no need to continue - break; - } + + // no need to continue + break; } } pthread_rwlock_unlock(&host->health_log.alarm_log_rwlock); @@ -1353,6 +1613,16 @@ static inline int health_parse_db_lookup( return 1; } +static inline char *tabs2spaces(char *s) { + char *t = s; + while(*t) { + if(unlikely(*t == '\t')) *t = ' '; + t++; + } + + return s; +} + static inline char *health_source_file(size_t line, const char *path, const char *filename) { char buffer[FILENAME_MAX + 1]; snprintfz(buffer, FILENAME_MAX, "%zu@%s/%s", line, path, filename); @@ -1405,10 +1675,8 @@ int health_readfile(const char *path, const char *filename) { while((s = fgets(&buffer[append], (int)(HEALTH_CONF_MAX_LINE - append), fp)) || append) { int stop_appending = !s; line++; - // info("Line %zu of file '%s/%s': '%s'", line, path, filename, s); s = trim(buffer); if(!s) continue; - // info("Trimmed line %zu of file '%s/%s': '%s'", line, path, filename, s); append = strlen(s); if(!stop_appending && s[append - 1] == '\\') { @@ -1445,7 +1713,6 @@ int health_readfile(const char *path, const char *filename) { continue; } - // info("Health file '%s/%s', key '%s', value '%s'", path, filename, key, value); uint32_t hash = simple_uhash(key); if(hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) { @@ -1460,7 +1727,7 @@ int health_readfile(const char *path, const char *filename) { rc = callocz(1, sizeof(RRDCALC)); rc->next_event_id = 1; - rc->name = strdupz(value); + rc->name = tabs2spaces(strdupz(value)); rc->hash = simple_hash(rc->name); rc->source = health_source_file(line, path, filename); rc->green = NAN; @@ -1483,7 +1750,7 @@ int health_readfile(const char *path, const char *filename) { rrdcalctemplate_free(&localhost, rt); rt = callocz(1, sizeof(RRDCALCTEMPLATE)); - rt->name = strdupz(value); + rt->name = tabs2spaces(strdupz(value)); rt->hash_name = simple_hash(rt->name); rt->source = health_source_file(line, path, filename); rt->green = NAN; @@ -1497,12 +1764,12 @@ int health_readfile(const char *path, const char *filename) { if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) { if(rc->chart) { if(strcmp(rc->chart, value)) - info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", + error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", line, path, filename, rc->name, key, rc->chart, value, value); freez(rc->chart); } - rc->chart = strdupz(value); + rc->chart = tabs2spaces(strdupz(value)); rc->hash_chart = simple_hash(rc->chart); } else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) { @@ -1512,14 +1779,14 @@ int health_readfile(const char *path, const char *filename) { } else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) { if(!health_parse_duration(value, &rc->update_every)) - info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.", + error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.", line, path, filename, rc->name, key, value); } else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) { char *e; rc->green = strtold(value, &e); if(e && *e) { - info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.", + error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.", line, path, filename, rc->name, key, e); } } @@ -1527,7 +1794,7 @@ int health_readfile(const char *path, const char *filename) { char *e; rc->red = strtold(value, &e); if(e && *e) { - info("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.", + error("Health configuration at line %zu of file '%s/%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.", line, path, filename, rc->name, key, e); } } @@ -1561,43 +1828,43 @@ int health_readfile(const char *path, const char *filename) { else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) { if(rc->exec) { if(strcmp(rc->exec, value)) - info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", + error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", line, path, filename, rc->name, key, rc->exec, value, value); freez(rc->exec); } - rc->exec = strdupz(value); + rc->exec = tabs2spaces(strdupz(value)); } else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) { if(rc->recipient) { if(strcmp(rc->recipient, value)) - info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", + error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", line, path, filename, rc->name, key, rc->recipient, value, value); freez(rc->recipient); } - rc->recipient = strdupz(value); + rc->recipient = tabs2spaces(strdupz(value)); } else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) { if(rc->units) { if(strcmp(rc->units, value)) - info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", + error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", line, path, filename, rc->name, key, rc->units, value, value); freez(rc->units); } - rc->units = strdupz(value); + rc->units = tabs2spaces(strdupz(value)); strip_quotes(rc->units); } else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) { if(rc->info) { if(strcmp(rc->info, value)) - info("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", + error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", line, path, filename, rc->name, key, rc->info, value, value); freez(rc->info); } - rc->info = strdupz(value); + rc->info = tabs2spaces(strdupz(value)); strip_quotes(rc->info); } else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) { @@ -1612,12 +1879,12 @@ int health_readfile(const char *path, const char *filename) { if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) { if(rt->context) { if(strcmp(rt->context, value)) - info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", + error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", line, path, filename, rt->name, key, rt->context, value, value); freez(rt->context); } - rt->context = strdupz(value); + rt->context = tabs2spaces(strdupz(value)); rt->hash_context = simple_hash(rt->context); } else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) { @@ -1627,14 +1894,14 @@ int health_readfile(const char *path, const char *filename) { } else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) { if(!health_parse_duration(value, &rt->update_every)) - info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' cannot parse duration: '%s'.", + error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' cannot parse duration: '%s'.", line, path, filename, rt->name, key, value); } else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) { char *e; rt->green = strtold(value, &e); if(e && *e) { - info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.", + error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.", line, path, filename, rt->name, key, e); } } @@ -1642,7 +1909,7 @@ int health_readfile(const char *path, const char *filename) { char *e; rt->red = strtold(value, &e); if(e && *e) { - info("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.", + error("Health configuration at line %zu of file '%s/%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.", line, path, filename, rt->name, key, e); } } @@ -1676,43 +1943,43 @@ int health_readfile(const char *path, const char *filename) { else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) { if(rt->exec) { if(strcmp(rt->exec, value)) - info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", + error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", line, path, filename, rt->name, key, rt->exec, value, value); freez(rt->exec); } - rt->exec = strdupz(value); + rt->exec = tabs2spaces(strdupz(value)); } else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) { if(rt->recipient) { if(strcmp(rt->recipient, value)) - info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", + error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", line, path, filename, rt->name, key, rt->recipient, value, value); freez(rt->recipient); } - rt->recipient = strdupz(value); + rt->recipient = tabs2spaces(strdupz(value)); } else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) { if(rt->units) { if(strcmp(rt->units, value)) - info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", + error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", line, path, filename, rt->name, key, rt->units, value, value); freez(rt->units); } - rt->units = strdupz(value); + rt->units = tabs2spaces(strdupz(value)); strip_quotes(rt->units); } else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) { if(rt->info) { if(strcmp(rt->info, value)) - info("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", + error("Health configuration at line %zu of file '%s/%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", line, path, filename, rt->name, key, rt->info, value, value); freez(rt->info); } - rt->info = strdupz(value); + rt->info = tabs2spaces(strdupz(value)); strip_quotes(rt->info); } else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) { @@ -1798,7 +2065,16 @@ void health_init(void) { return; } + char *pathname = config_get("health", "health db directory", VARLIB_DIR "/health"); + if(mkdir(pathname, 0770) == -1 && errno != EEXIST) + fatal("Cannot create directory '%s'.", pathname); + + char filename[FILENAME_MAX + 1]; + snprintfz(filename, FILENAME_MAX, "%s/health-log.db", pathname); + health.log_filename = config_get("health", "health db file", filename); + health_alarm_log_load(&localhost); + health_alarm_log_open(); char *path = health_config_dir(); @@ -1865,10 +2141,10 @@ static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, R ae->name, ae->chart, ae->family, - (ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_PROCESSED)?"true":"false", - (ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_UPDATED)?"true":"false", + (ae->flags & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false", + (ae->flags & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false", (unsigned long)ae->exec_run_timestamp, - (ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_EXEC_FAILED)?"true":"false", + (ae->flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false", ae->exec?ae->exec:health.health_default_exec, ae->recipient?ae->recipient:health.health_default_recipient, ae->exec_code, @@ -2085,7 +2361,7 @@ void health_reload(void) { ALARM_ENTRY *t; for(t = localhost.health_log.alarms ; t ; t = t->next) { if(t->new_status != RRDCALC_STATUS_REMOVED) - t->notifications |= HEALTH_ENTRY_NOTIFICATIONS_UPDATED; + t->flags |= HEALTH_ENTRY_FLAG_UPDATED; } // reset all thresholds to all charts @@ -2121,25 +2397,36 @@ static inline int rrdcalc_value2status(calculated_number n) { } static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { - ae->notifications |= HEALTH_ENTRY_NOTIFICATIONS_PROCESSED; + ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED; + + if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) { + // do not send notifications for internal statuses + goto done; + } // find the previous notification for the same alarm + // which we have run the exec script ALARM_ENTRY *t; for(t = ae->next; t ;t = t->next) { - if(t->alarm_id == ae->alarm_id && t->notifications & HEALTH_ENTRY_NOTIFICATIONS_EXEC_RUN) + if(t->alarm_id == ae->alarm_id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN) break; } - if(t && t->new_status == ae->new_status) { - // don't send the same notification again - info("Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name, rrdcalc_status2string(ae->new_status)); - goto done; + if(likely(t)) { + // we have executed this alarm notification in the past + if (t && t->new_status == ae->new_status) { + // don't send the same notification again + debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name, + rrdcalc_status2string(ae->new_status)); + goto done; + } } - - if((ae->old_status == RRDCALC_STATUS_UNDEFINED && ae->new_status == RRDCALC_STATUS_UNINITIALIZED) - || (ae->old_status == RRDCALC_STATUS_UNINITIALIZED && ae->new_status == RRDCALC_STATUS_CLEAR)) { - info("Health not sending notification for first initialization of alarm '%s.%s' status %s", ae->chart, ae->name, rrdcalc_status2string(ae->new_status)); - goto done; + else { + // we have not executed this alarm notification in the past + if(unlikely(ae->old_status == RRDCALC_STATUS_UNINITIALIZED && ae->new_status == RRDCALC_STATUS_CLEAR)) { + debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s", ae->chart, ae->name, rrdcalc_status2string(ae->new_status)); + goto done; + } } char buffer[FILENAME_MAX + 1]; @@ -2173,7 +2460,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { ae->info?ae->info:"" ); - ae->notifications |= HEALTH_ENTRY_NOTIFICATIONS_EXEC_RUN; + ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN; ae->exec_run_timestamp = time(NULL); debug(D_HEALTH, "executing command '%s'", buffer); @@ -2189,7 +2476,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code); if(ae->exec_code != 0) - ae->notifications |= HEALTH_ENTRY_NOTIFICATIONS_EXEC_FAILED; + ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED; done: health_alarm_log_save(host, ae); @@ -2197,7 +2484,7 @@ done: } static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) { - info("Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s", + debug(D_HEALTH, "Health alarm '%s.%s' = %0.2Lf - changed status from %s to %s", ae->chart?ae->chart:"NOCHART", ae->name, ae->new_value, rrdcalc_status2string(ae->old_status), @@ -2217,8 +2504,8 @@ static inline void health_alarm_log_process(RRDHOST *host) { ALARM_ENTRY *ae; for(ae = host->health_log.alarms; ae && ae->unique_id >= stop_at_id ; ae = ae->next) { if(unlikely( - !(ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_PROCESSED) && - !(ae->notifications & HEALTH_ENTRY_NOTIFICATIONS_UPDATED) + !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) && + !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED) )) { if(unlikely(ae->unique_id < first_waiting)) diff --git a/src/health.h b/src/health.h index 9d5834fc..cebea49b 100644 --- a/src/health.h +++ b/src/health.h @@ -264,10 +264,11 @@ typedef struct rrdcalctemplate { #define RRDCALCTEMPLATE_HAS_CALCULATION(rt) ((rt)->after) -#define HEALTH_ENTRY_NOTIFICATIONS_PROCESSED 0x00000001 -#define HEALTH_ENTRY_NOTIFICATIONS_UPDATED 0x00000002 -#define HEALTH_ENTRY_NOTIFICATIONS_EXEC_RUN 0x00000004 -#define HEALTH_ENTRY_NOTIFICATIONS_EXEC_FAILED 0x00000008 +#define HEALTH_ENTRY_FLAG_PROCESSED 0x00000001 +#define HEALTH_ENTRY_FLAG_UPDATED 0x00000002 +#define HEALTH_ENTRY_FLAG_EXEC_RUN 0x00000004 +#define HEALTH_ENTRY_FLAG_EXEC_FAILED 0x00000008 +#define HEALTH_ENTRY_FLAG_SAVED 0x10000000 typedef struct alarm_entry { uint32_t unique_id; @@ -300,7 +301,7 @@ typedef struct alarm_entry { int old_status; int new_status; - uint32_t notifications; + uint32_t flags; int delay; time_t delay_up_to_timestamp; diff --git a/src/log.h b/src/log.h index 3a022fc9..b7323fcf 100644 --- a/src/log.h +++ b/src/log.h @@ -25,6 +25,7 @@ #define D_REGISTRY 0x00200000 #define D_VARIABLES 0x00400000 #define D_HEALTH 0x00800000 +#define D_SYSTEM 0x80000000 //#define DEBUG (D_WEB_CLIENT_ACCESS|D_LISTENER|D_RRD_STATS) //#define DEBUG 0xffffffff diff --git a/src/main.c b/src/main.c index 3e6aa504..1a819938 100644 --- a/src/main.c +++ b/src/main.c @@ -7,7 +7,7 @@ void netdata_cleanup_and_exit(int ret) { error_log_limit_unlimited(); - info("Called: netdata_cleanup_and_exit()"); + debug(D_EXIT, "Called: netdata_cleanup_and_exit()"); #ifdef NETDATA_INTERNAL_CHECKS rrdset_free_all(); #else @@ -169,7 +169,7 @@ void kill_childs() } if(tc_child_pid) { - info("Killing tc-qos-helper procees"); + debug(D_EXIT, "Killing tc-qos-helper procees"); if(killpid(tc_child_pid, SIGTERM) != -1) waitid(P_PID, (id_t) tc_child_pid, &info, WEXITED); } @@ -460,7 +460,8 @@ int main(int argc, char **argv) if(debug_flags != 0) { struct rlimit rl = { RLIM_INFINITY, RLIM_INFINITY }; if(setrlimit(RLIMIT_CORE, &rl) != 0) - info("Cannot request unlimited core dumps for debugging... Proceeding anyway..."); + error("Cannot request unlimited core dumps for debugging... Proceeding anyway..."); + prctl(PR_SET_DUMPABLE, 1, 0, 0, 0); } @@ -520,7 +521,7 @@ int main(int argc, char **argv) rrd_default_history_entries = (int) config_get_number("global", "history", RRD_DEFAULT_HISTORY_ENTRIES); if(rrd_default_history_entries < 5 || rrd_default_history_entries > RRD_HISTORY_ENTRIES_MAX) { - info("Invalid save lines %d given. Defaulting to %d.", rrd_default_history_entries, RRD_DEFAULT_HISTORY_ENTRIES); + error("Invalid history entries %d given. Defaulting to %d.", rrd_default_history_entries, RRD_DEFAULT_HISTORY_ENTRIES); rrd_default_history_entries = RRD_DEFAULT_HISTORY_ENTRIES; } else { @@ -531,7 +532,7 @@ int main(int argc, char **argv) rrd_update_every = (int) config_get_number("global", "update every", UPDATE_EVERY); if(rrd_update_every < 1 || rrd_update_every > 600) { - info("Invalid update timer %d given. Defaulting to %d.", rrd_update_every, UPDATE_EVERY_MAX); + error("Invalid data collection frequency (update every) %d given. Defaulting to %d.", rrd_update_every, UPDATE_EVERY_MAX); rrd_update_every = UPDATE_EVERY; } else debug(D_OPTIONS, "update timer set to %d.", rrd_update_every); @@ -635,7 +636,7 @@ int main(int argc, char **argv) if(debug_flags != 0) { struct rlimit rl = { RLIM_INFINITY, RLIM_INFINITY }; if(setrlimit(RLIMIT_CORE, &rl) != 0) - info("Cannot request unlimited core dumps for debugging... Proceeding anyway..."); + error("Cannot request unlimited core dumps for debugging... Proceeding anyway..."); prctl(PR_SET_DUMPABLE, 1, 0, 0, 0); } #endif /* NETDATA_INTERNAL_CHECKS */ @@ -655,7 +656,7 @@ int main(int argc, char **argv) if(i != 0) fatal("pthread_attr_setstacksize() to %zu bytes, failed with code %d.", wanted_stacksize, i); else - info("Successfully set pthread stacksize to %zu bytes", wanted_stacksize); + debug(D_SYSTEM, "Successfully set pthread stacksize to %zu bytes", wanted_stacksize); } // ------------------------------------------------------------------------ @@ -692,7 +693,7 @@ int main(int argc, char **argv) if(st->enabled) { st->thread = mallocz(sizeof(pthread_t)); - info("Starting thread %s.", st->name); + debug(D_SYSTEM, "Starting thread %s.", st->name); if(pthread_create(st->thread, &attr, st->start_routine, NULL)) error("failed to create new thread for %s.", st->name); @@ -700,7 +701,7 @@ int main(int argc, char **argv) else if(pthread_detach(*st->thread)) error("Cannot request detach of newly created %s thread.", st->name); } - else info("Not starting thread %s.", st->name); + else debug(D_SYSTEM, "Not starting thread %s.", st->name); } // ------------------------------------------------------------------------ @@ -716,7 +717,7 @@ int main(int argc, char **argv) while(1) { pause(); if(netdata_exit) { - info("Exit main loop of netdata."); + debug(D_EXIT, "Exit main loop of netdata."); netdata_cleanup_and_exit(0); exit(0); } diff --git a/src/popen.c b/src/popen.c index ad8d7596..8448b731 100644 --- a/src/popen.c +++ b/src/popen.c @@ -138,7 +138,7 @@ FILE *mypopen(const char *command, pid_t *pidptr) error("pre-execution of command '%s' on pid %d: failed to set default signal handler for SIGUSR2.", command, getpid()); } - info("executing command: '%s' on pid %d.", command, getpid()); + debug(D_CHILDS, "executing command: '%s' on pid %d.", command, getpid()); execl("/bin/sh", "sh", "-c", command, NULL); exit(1); } diff --git a/src/registry.c b/src/registry.c index a0fb629a..bce7c348 100644 --- a/src/registry.c +++ b/src/registry.c @@ -743,8 +743,7 @@ static inline void registry_log_recreate_nolock(void) { } int registry_log_load(void) { - char *s, buf[4096 + 1]; - size_t line = -1; + ssize_t line = -1; // closing the log is required here // otherwise we will append to it the values we read @@ -755,8 +754,10 @@ int registry_log_load(void) { if(!fp) error("Registry: cannot open registry file: %s", registry.log_filename); else { + char *s, buf[4096 + 1]; line = 0; size_t len = 0; + while ((s = fgets_trim_len(buf, 4096, fp, &len))) { line++; @@ -766,7 +767,7 @@ int registry_log_load(void) { // verify it is valid if (unlikely(len < 85 || s[1] != '\t' || s[10] != '\t' || s[47] != '\t' || s[84] != '\t')) { - error("Registry: log line %zu is wrong (len = %zu).", line, len); + error("Registry: log line %zd is wrong (len = %zu).", line, len); continue; } s[1] = s[10] = s[47] = s[84] = '\0'; @@ -781,7 +782,7 @@ int registry_log_load(void) { char *url = name; while(*url && *url != '\t') url++; if(!*url) { - error("Registry: log line %zu does not have a url.", line); + error("Registry: log line %zd does not have a url.", line); continue; } *url++ = '\0'; @@ -800,7 +801,7 @@ int registry_log_load(void) { break; default: - error("Registry: ignoring line %zu of filename '%s': %s.", line, registry.log_filename, s); + error("Registry: ignoring line %zd of filename '%s': %s.", line, registry.log_filename, s); break; } } @@ -1471,7 +1472,7 @@ int registry_save(void) { // rename the db to .old debug(D_REGISTRY, "Registry: Link current db '%s' to .old: '%s'", registry.db_filename, old_filename); if(link(registry.db_filename, old_filename) == -1 && errno != ENOENT) - error("Registry: cannot move file '%s' to '%s'. Saving registry DB failed!", tmp_filename, registry.db_filename); + error("Registry: cannot move file '%s' to '%s'. Saving registry DB failed!", registry.db_filename, old_filename); else { // remove the database (it is saved in .old) diff --git a/src/rrd2json.c b/src/rrd2json.c index 474b5915..b2c1a055 100644 --- a/src/rrd2json.c +++ b/src/rrd2json.c @@ -1158,7 +1158,7 @@ inline static void rrdr_free(RRDR *r) freez(r); } -inline void rrdr_done(RRDR *r) +static inline void rrdr_done(RRDR *r) { r->rows = r->c + 1; r->c = 0; diff --git a/src/web_client.c b/src/web_client.c index 2485f205..c16acfa0 100644 --- a/src/web_client.c +++ b/src/web_client.c @@ -14,7 +14,7 @@ int web_enable_gzip = 1, web_gzip_level = 3, web_gzip_strategy = Z_DEFAULT_STRAT struct web_client *web_clients = NULL; unsigned long long web_clients_count = 0; -inline int web_client_crock_socket(struct web_client *w) { +static inline int web_client_crock_socket(struct web_client *w) { #ifdef TCP_CORK if(likely(!w->tcp_cork && w->ofd != -1)) { w->tcp_cork = 1; @@ -29,7 +29,7 @@ inline int web_client_crock_socket(struct web_client *w) { return 0; } -inline int web_client_uncrock_socket(struct web_client *w) { +static inline int web_client_uncrock_socket(struct web_client *w) { #ifdef TCP_CORK if(likely(w->tcp_cork && w->ofd != -1)) { w->tcp_cork = 0; diff --git a/web/dashboard_info.js b/web/dashboard_info.js index 68df1eb9..49459322 100644 --- a/web/dashboard_info.js +++ b/web/dashboard_info.js @@ -697,8 +697,9 @@ netdataDashboard.context = { // ------------------------------------------------------------------------ // RETROSHARE + 'retroshare.bandwidth': { - info: 'Shows inbound and outbound traffic.', + info: 'RetroShare inbound and outbound traffic.', mainheads: [ netdataDashboard.gaugeChart('Received', '12%', 'bandwidth_down_kb'), netdataDashboard.gaugeChart('Sent', '12%', 'bandwidth_up_kb') @@ -706,7 +707,7 @@ netdataDashboard.context = { }, 'retroshare.peers': { - info: 'Shows the number of (connected) friends.', + info: 'Number of (connected) RetroShare friends.', mainheads: [ function(id) { return '