]> arthur.barton.de Git - netdata.git/commitdiff
Merge pull request #1615 from lowfive/discord_notify
authorCosta Tsaousis <costa@tsaousis.gr>
Fri, 27 Jan 2017 19:00:34 +0000 (21:00 +0200)
committerGitHub <noreply@github.com>
Fri, 27 Jan 2017 19:00:34 +0000 (21:00 +0200)
Initial implementation of Discord notifications

34 files changed:
ChangeLog
README.md
conf.d/Makefile.am
conf.d/health.d/disks.conf
conf.d/health.d/fping.conf [new file with mode: 0644]
conf.d/health.d/memcached.conf
conf.d/health.d/mysql.conf
conf.d/health.d/net.conf
conf.d/health.d/tcp_resets.conf
conf.d/node.d/snmp.conf.md
conf.d/python.d/elasticsearch.conf
configs.signatures
node.d/snmp.node.js
plugins.d/alarm-notify.sh
python.d/README.md
python.d/postgres.chart.py
python.d/tomcat.chart.py
python.d/varnish.chart.py
src/apps_plugin.c
src/common.c
src/health.c
src/health.h
src/inlined.h
src/main.c
src/procfile.c
src/procfile.h
src/rrd2json.c
src/sys_fs_cgroup.c
src/web_buffer_svg.c
src/web_buffer_svg.h
src/web_client.c
web/dashboard.js
web/dashboard_info.js
web/index.html

index c0950dec11f62afc12ebd94cb04fb3b7f5d53406..a8dba2edbb8e052dd6dbcf8c93f09a7971ec7526 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -27,7 +27,7 @@ netdata (1.5.0) - 2016-01-22
    Ilya Mashchenko (@l2isbad) has created most of the python
    data collection plugins in this release !
 
-    - Systemd Services (using cgroups!)
+    - systemd Services (using cgroups!)
     - FPing (yes, network latency in netdata!)
     - postgres databases            @facetoe, @moumoul
     - Vanish disk cache (v3 and v4) @l2isbad
index 5165078d43abbecaa5446f6b8bd2d103207593e8..808c70071740c93a28b9acdfc817f6d8ee14fba5 100644 (file)
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@ Netdata is featured at <b><a href="https://octoverse.github.com/" target="_blank
 
  - netdata now runs on **FreeBSD** and **MacOS**
  - netdata now supports **Graphite**, **OpenTSDB**, **Prometheus** and compatible backends
- - netdata now monitors **SystemD Services**
+ - netdata now monitors **systemd Services**
  - new plugins: fping, postgres, varnish, elasticsearch, haproxy, freeradius, mdstat, ISC dhcpd, fail2ban, openvpn, NUMA memory, CPU Idle States, gunicorn, ECC memory errors, IPC semaphores, uptime
  - improved plugins: netfilter conntrack, mysql/mariadb, ipfs, cpufreq, hddtemp, sensors, nginx, nginx_log, phpfpm, redis, dovecot, containers and cgroups, disk space, apps.plugin, tc (QoS) and almost all internal plugins (memory, IPv4 and IPv6, network interfaces, QoS, etc)
  - dozens of new and improved alarms (including performance monitoring alarms for mysql)
index b725e249e8fe7c67e27496a7ac871bdf295f3fd0..e17d8fa929f547cddcb790218cd7472127c346b5 100644 (file)
@@ -64,6 +64,7 @@ dist_healthconfig_DATA = \
     health.d/disks.conf \
     health.d/elasticsearch.conf \
     health.d/entropy.conf \
+    health.d/fping.conf \
     health.d/haproxy.conf \
     health.d/ipc.conf \
     health.d/ipfs.conf \
index 0549bac268b51ea0034c1b8b6edc025d5d2a05d2..ff2d6a605ad3bdae4b9fa3e519356295bc7cb9f5 100644 (file)
@@ -88,7 +88,7 @@ families: *
 template: out_of_disk_space_time
       on: disk.space
 families: *
-    calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (0)
+    calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf)
    units: hours
    every: 10s
     warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
diff --git a/conf.d/health.d/fping.conf b/conf.d/health.d/fping.conf
new file mode 100644 (file)
index 0000000..69251b1
--- /dev/null
@@ -0,0 +1,53 @@
+
+template: fping_last_collected_secs
+families: *
+      on: fping.latency
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: sysadmin
+
+template: host_reachable
+families: *
+      on: fping.latency
+    calc: $average != nan
+   units: up/down
+   every: 10s
+    crit: $this == 0
+    info: states if the remote host is reachable
+   delay: down 30m multiplier 1.5 max 2h
+      to: sysadmin
+
+template: host_latency
+families: *
+      on: fping.latency
+  lookup: average -10s unaligned of average
+   units: ms
+   every: 10s
+   green: 300
+     red: 1000
+    warn: $this > $green OR $max > $red
+    crit: $this > $red
+    info: average round trip delay during the last 10 seconds
+   delay: down 30m multiplier 1.5 max 2h
+      to: sysadmin
+
+template: packet_loss
+families: *
+      on: fping.quality
+  lookup: average -10m unaligned of returned
+    calc: 100 - $this
+   green: 1
+     red: 10
+   units: %
+   every: 10s
+    warn: $this > $green
+    crit: $this > $red
+    info: packet loss percentage
+   delay: down 30m multiplier 1.5 max 2h
+      to: sysadmin
+
index 7917e36afb61f031c91e1e52cc39ccccc4aa68f2..d248ef57a7204966a41ae9d2cf73f3dc917dddd3 100644 (file)
@@ -42,7 +42,7 @@ template: cache_fill_rate
 
 template: out_of_cache_space_time
       on: memcached.cache
-    calc: ($cache_fill_rate > 0) ? ($available / $cache_fill_rate) : (0)
+    calc: ($cache_fill_rate > 0) ? ($available / $cache_fill_rate) : (inf)
    units: hours
    every: 10s
     warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
index 78773e5b5b0b141489d334cbdbaf0b5bbe535b11..1eeb993f039a59742d9bfc9c551513d5b2d97878 100644 (file)
@@ -49,7 +49,7 @@ template: mysql_10s_table_locks_waited
 
 template: mysql_10s_waited_locks_ratio
       on: mysql.table_locks
-    calc: ($mysql_10s_table_locks_waited * 100) / ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate)
+    calc: ( ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate) > 0 ) ? (($mysql_10s_table_locks_waited * 100) / ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate)) : 0
    units: %
    every: 10s
     warn: $this > (($status >= $WARNING)  ? (10) : (25))
@@ -65,7 +65,7 @@ template: mysql_10s_waited_locks_ratio
 template: mysql_replication
       on: mysql.slave_status
     calc: ($sql_running == -1 OR $io_running == -1)?0:1
-   units: status
+   units: ok/failed
    every: 10s
     crit: $this == 0
    delay: down 5m multiplier 1.5 max 1h
index 924acccc3d1b3d7f7e5fff3812854574b1379fba..cac0bbbfbe941741e495389c32a813194420bad2 100644 (file)
@@ -116,6 +116,7 @@ families: *
    units: %
    warn: $this > (($status >= $WARNING)?(200):(1000))
    crit: $this > (($status >= $WARNING)?(1000):(2000))
+options: no-clear-notification
    info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute
-     to: silent
+     to: sysadmin
 
index daf24a1cd6806866a86a5f3107876ed49727a6f9..49fb1b924c43eec838cc8b944619bbe91965f82b 100644 (file)
@@ -28,8 +28,9 @@
    every: 10s
     warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING)  ? (1) : (4)))
    delay: up 0 down 60m multiplier 1.2 max 2h
+options: no-clear-notification
     info: average TCP RESETS this host is sending, over the last 10 seconds (this can be an indication that a port scan is made, or that a service running on this host has crashed)
-      to: silent
+      to: sysadmin
 
 # -----------------------------------------------------------------------------
 # tcp resets this host receives
@@ -48,5 +49,6 @@
    every: 10s
     warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING)  ? (1) : (4)))
    delay: up 0 down 60m multiplier 1.2 max 2h
+options: no-clear-notification
     info: average TCP RESETS this host is receiving, over the last 10 seconds (this can be an indication that a service this host needs, has crashed)
-      to: silent
+      to: sysadmin
index bae5bf2078a3104ab4b9793d01ef1ce364c416d6..6b496f7a87eae47b55c1d0e55b60e94193ca19b6 100644 (file)
-# SNMP Data Collector\r
-\r
-Using this collector, netdata can collect data from any SNMP device.\r
-\r
-This collector supports:\r
-\r
-- any number of SNMP devices\r
-- each SNMP device can be used to collect data for any number of charts\r
-- each chart may have any number of dimensions\r
-- each SNMP device may have a different update frequency\r
-- each SNMP device will accept one or more batches to report values (you can set `max_request_size` per SNMP server, to control the size of batches).\r
-\r
-The source code of the plugin is [here](https://github.com/firehol/netdata/blob/master/node.d/snmp.node.js).\r
-\r
-## Configuration\r
-\r
-You will need to create the file `/etc/netdata/node.d/snmp.conf` with data like the following.\r
-\r
-In this example:\r
-\r
- - the SNMP device is `10.11.12.8`.\r
- - the SNMP community is `public`.\r
- - we will update the values every 10 seconds (`update_every: 10` under the server `10.11.12.8`).\r
- - we define 2 charts `snmp_switch.bandwidth_port1` and `snmp_switch.bandwidth_port2`, each having 2 dimensions: `in` and `out`.\r
-\r
-```js\r
-{\r
-    "enable_autodetect": false,\r
-    "update_every": 5,\r
-    "max_request_size": 100,\r
-    "servers": [\r
-        {\r
-            "hostname": "10.11.12.8",\r
-            "community": "public",\r
-            "update_every": 10,\r
-            "max_request_size": 50,\r
-            "options": { "timeout": 10000 },\r
-            "charts": {\r
-                "snmp_switch.bandwidth_port1": {\r
-                    "title": "Switch Bandwidth for port 1",\r
-                    "units": "kilobits/s",\r
-                    "type": "area",\r
-                    "priority": 1,\r
-                    "family": "ports",\r
-                    "dimensions": {\r
-                        "in": {\r
-                            "oid": "1.3.6.1.2.1.2.2.1.10.1",\r
-                            "algorithm": "incremental",\r
-                            "multiplier": 8,\r
-                            "divisor": 1024\r
-                        },\r
-                        "out": {\r
-                            "oid": "1.3.6.1.2.1.2.2.1.16.1",\r
-                            "algorithm": "incremental",\r
-                            "multiplier": -8,\r
-                            "divisor": 1024\r
-                        }\r
-                    }\r
-                },\r
-                "snmp_switch.bandwidth_port2": {\r
-                    "title": "Switch Bandwidth for port 2",\r
-                    "units": "kilobits/s",\r
-                    "type": "area",\r
-                    "priority": 1,\r
-                    "family": "ports",\r
-                    "dimensions": {\r
-                        "in": {\r
-                            "oid": "1.3.6.1.2.1.2.2.1.10.2",\r
-                            "algorithm": "incremental",\r
-                            "multiplier": 8,\r
-                            "divisor": 1024\r
-                        },\r
-                        "out": {\r
-                            "oid": "1.3.6.1.2.1.2.2.1.16.2",\r
-                            "algorithm": "incremental",\r
-                            "multiplier": -8,\r
-                            "divisor": 1024\r
-                        }\r
-                    }\r
-                }\r
-            }\r
-        }\r
-    ]\r
-}\r
-```\r
-\r
-`update_every` is the update frequency for each server, in seconds.\r
-\r
-`max_request_size` limits the maximum number of OIDs that will be requested in a single call. The default is 50. Lower this number of you get `TooBig` errors in netdata error.log.\r
-\r
-`family` sets the name of the submenu of the dashboard each chart will appear under.\r
-\r
-If you need to define many charts using incremental OIDs, you can use something like this:\r
-\r
-This is like the previous, but the option `multiply_range` given, will multiply the current chart from `1` to `24` inclusive, producing 24 charts in total for the 24 ports of the switch `10.11.12.8`.\r
-\r
-Each of the 24 new charts will have its id (1-24) appended at:\r
-\r
-1. its chart unique id, i.e. `snmp_switch.bandwidth_port1` to `snmp_switch.bandwidth_port24`\r
-2. its `title`, i.e. `Switch Bandwidth for port 1` to `Switch Bandwidth for port 24`\r
-3. its `oid` (for all dimensions), i.e. dimension `in` will be `1.3.6.1.2.1.2.2.1.10.1` to `1.3.6.1.2.1.2.2.1.10.24`\r
-3. its priority (which will be incremented for each chart so that the charts will appear on the dashboard in this order)\r
-\r
-```js\r
-{\r
-    "enable_autodetect": false,\r
-    "update_every": 10,\r
-    "servers": [\r
-        {\r
-            "hostname": "10.11.12.8",\r
-            "community": "public",\r
-            "update_every": 10,\r
-            "options": { "timeout": 20000 },\r
-            "charts": {\r
-                "snmp_switch.bandwidth_port": {\r
-                    "title": "Switch Bandwidth for port ",\r
-                    "units": "kilobits/s",\r
-                    "type": "area",\r
-                    "priority": 1,\r
-                    "family": "ports",\r
-                    "multiply_range": [ 1, 24 ],\r
-                    "dimensions": {\r
-                        "in": {\r
-                            "oid": "1.3.6.1.2.1.2.2.1.10.",\r
-                            "algorithm": "incremental",\r
-                            "multiplier": 8,\r
-                            "divisor": 1024\r
-                        },\r
-                        "out": {\r
-                            "oid": "1.3.6.1.2.1.2.2.1.16.",\r
-                            "algorithm": "incremental",\r
-                            "multiplier": -8,\r
-                            "divisor": 1024\r
-                        }\r
-                    }\r
-                }\r
-            }\r
-        }\r
-    ]\r
-}\r
-```\r
-\r
-The `options` given for each server, are:\r
-\r
- - `timeout`, the time to wait for the SNMP device to respond. The default is 5000 ms.\r
- - `version`, the SNMP version to use. `0` is Version 1, `1` is Version 2c. The default is Version 1 (`0`).\r
- - `transport`, the default is `udp4`.\r
- - `port`, the port of the SNMP device to connect to. The default is `161`.\r
- - `retries`, the number of attempts to make to fetch the data. The default is `1`.\r
-\r
-## Retreiving names from snmp\r
-\r
-You can append a value retrieved from SNMP to the title, by adding `titleoid` to the chart.\r
-\r
-You can set a dimension name to a value retrieved from SNMP, by adding `oidname` to the dimension.\r
-\r
-Both of the above will participate in `multiply_range`.\r
-\r
-\r
-## Testing the configuration\r
-\r
-To test it, you can run:\r
-\r
-```sh\r
-/usr/libexec/netdata/plugins.d/node.d.plugin 1 snmp\r
-```\r
-\r
-The above will run it on your console and you will be able to see what netdata sees, but also errors. You can get a very detailed output by appending `debug` to the command line.\r
-\r
-If it works, restart netdata to activate the snmp collector and refresh the dashboard (if your SNMP device responds with a delay, you may need to refresh the dashboard in a few seconds).\r
-\r
-## Data collection speed\r
-\r
-Keep in mind that many SNMP switches are routers are very slow. They may not be able to report values per second. If you run `node.d.plugin` in `debug` mode, it will report the time it took for the SNMP device to respond. My switch, for example, needs 7-8 seconds to respond for the traffic on 24 ports (48 OIDs, in/out).\r
-\r
-Also, if you use many SNMP clients on the same SNMP device at the same time, values may be skipped. This is a problem of the SNMP device, not this collector.\r
-\r
-## Finding OIDs\r
-\r
-Use `snmpwalk`, like this:\r
-\r
-```sh\r
-snmpwalk -t 20 -v 1 -O fn -c public 10.11.12.8\r
-```\r
-\r
-- `-t 20` is the timeout in seconds\r
-- `-v 1` is the SNMP version\r
-- `-O fn` will display full OIDs in numeric format (you may want to run it also without this option to see human readable output of OIDs)\r
-- `-c public` is the SNMP community\r
-- `10.11.12.8` is the SNMP device\r
-\r
-Keep in mind that `snmpwalk` outputs the OIDs with a dot in front them. You should remove this dot when adding OIDs to the configuration file of this collector.\r
-\r
-## Example: Linksys SRW2024P\r
-\r
-This is what I use for my Linksys SRW2024P. It creates:\r
-\r
-1. A chart for power consumption (it is a PoE switch)\r
-2. Two charts for packets received (total packets received and packets received with errors)\r
-3. One chart for packets output\r
-4. 24 charts, one for each port of the switch. It also appends the port names, as defined at the switch, to the chart titles.\r
-\r
-This switch also reports various other metrics, like snmp, packets per port, etc. Unfortunately it does not report CPU utilization or backplane utilization.\r
-\r
-This switch has a very slow SNMP processors. To respond, it needs about 8 seconds, so I have set the refresh frequency (`update_every`) to 15 seconds.\r
-\r
-```js\r
-{\r
-        "enable_autodetect": false,\r
-        "update_every": 5,\r
-        "servers": [\r
-                {\r
-                        "hostname": "10.11.12.8",\r
-                        "community": "public",\r
-                        "update_every": 15,\r
-                        "options": { "timeout": 20000, "version": 1 },\r
-                        "charts": {\r
-                                "snmp_switch.power": {\r
-                                        "title": "Switch Power Supply",\r
-                                        "units": "watts",\r
-                                        "type": "line",\r
-                                        "priority": 10,\r
-                                        "family": "power",\r
-                                        "dimensions": {\r
-                                                "supply": {\r
-                                                        "oid": ".1.3.6.1.2.1.105.1.3.1.1.2.1",\r
-                                                        "algorithm": "absolute",\r
-                                                        "multiplier": 1,\r
-                                                        "divisor": 1\r
-                                                },\r
-                                                "used": {\r
-                                                        "oid": ".1.3.6.1.2.1.105.1.3.1.1.4.1",\r
-                                                        "algorithm": "absolute",\r
-                                                        "multiplier": 1,\r
-                                                        "divisor": 1\r
-                                                }\r
-                                        }\r
-                                }\r
-                                , "snmp_switch.input": {\r
-                                        "title": "Switch Packets Input",\r
-                                        "units": "packets/s",\r
-                                        "type": "area",\r
-                                        "priority": 20,\r
-                                        "family": "IP",\r
-                                        "dimensions": {\r
-                                                "receives": {\r
-                                                        "oid": ".1.3.6.1.2.1.4.3.0",\r
-                                                        "algorithm": "incremental",\r
-                                                        "multiplier": 1,\r
-                                                        "divisor": 1\r
-                                                }\r
-                                                , "discards": {\r
-                                                        "oid": ".1.3.6.1.2.1.4.8.0",\r
-                                                        "algorithm": "incremental",\r
-                                                        "multiplier": 1,\r
-                                                        "divisor": 1\r
-                                                }\r
-                                        }\r
-                                }\r
-                                , "snmp_switch.input_errors": {\r
-                                        "title": "Switch Received Packets with Errors",\r
-                                        "units": "packets/s",\r
-                                        "type": "line",\r
-                                        "priority": 30,\r
-                                        "family": "IP",\r
-                                        "dimensions": {\r
-                                                "bad_header": {\r
-                                                        "oid": ".1.3.6.1.2.1.4.4.0",\r
-                                                        "algorithm": "incremental",\r
-                                                        "multiplier": 1,\r
-                                                        "divisor": 1\r
-                                                }\r
-                                                , "bad_address": {\r
-                                                        "oid": ".1.3.6.1.2.1.4.5.0",\r
-                                                        "algorithm": "incremental",\r
-                                                        "multiplier": 1,\r
-                                                        "divisor": 1\r
-                                                }\r
-                                                , "unknown_protocol": {\r
-                                                        "oid": ".1.3.6.1.2.1.4.7.0",\r
-                                                        "algorithm": "incremental",\r
-                                                        "multiplier": 1,\r
-                                                        "divisor": 1\r
-                                                }\r
-                                        }\r
-                                }\r
-                                , "snmp_switch.output": {\r
-                                        "title": "Switch Output Packets",\r
-                                        "units": "packets/s",\r
-                                        "type": "line",\r
-                                        "priority": 40,\r
-                                        "family": "IP",\r
-                                        "dimensions": {\r
-                                                "requests": {\r
-                                                        "oid": ".1.3.6.1.2.1.4.10.0",\r
-                                                        "algorithm": "incremental",\r
-                                                        "multiplier": 1,\r
-                                                        "divisor": 1\r
-                                                }\r
-                                                , "discards": {\r
-                                                        "oid": ".1.3.6.1.2.1.4.11.0",\r
-                                                        "algorithm": "incremental",\r
-                                                        "multiplier": -1,\r
-                                                        "divisor": 1\r
-                                                }\r
-                                                , "no_route": {\r
-                                                        "oid": ".1.3.6.1.2.1.4.12.0",\r
-                                                        "algorithm": "incremental",\r
-                                                        "multiplier": -1,\r
-                                                        "divisor": 1\r
-                                                }\r
-                                        }\r
-                                }\r
-                                , "snmp_switch.bandwidth_port": {\r
-                                        "title": "Switch Bandwidth for port ",\r
-                                        "titleoid": ".1.3.6.1.2.1.31.1.1.1.18.",\r
-                                        "units": "kilobits/s",\r
-                                        "type": "area",\r
-                                        "priority": 100,\r
-                                        "family": "ports",\r
-                                        "multiply_range": [ 1, 24 ],\r
-                                        "dimensions": {\r
-                                                "in": {\r
-                                                        "oid": ".1.3.6.1.2.1.2.2.1.10.",\r
-                                                        "algorithm": "incremental",\r
-                                                        "multiplier": 8,\r
-                                                        "divisor": 1024\r
-                                                }\r
-                                                , "out": {\r
-                                                        "oid": ".1.3.6.1.2.1.2.2.1.16.",\r
-                                                        "algorithm": "incremental",\r
-                                                        "multiplier": -8,\r
-                                                        "divisor": 1024\r
-                                                }\r
-                                        }\r
-                                }\r
-                        }\r
-                }\r
-        ]\r
-}\r
-```\r
+# SNMP Data Collector
+
+Using this collector, netdata can collect data from any SNMP device.
+
+This collector supports:
+
+- any number of SNMP devices
+- each SNMP device can be used to collect data for any number of charts
+- each chart may have any number of dimensions
+- each SNMP device may have a different update frequency
+- each SNMP device will accept one or more batches to report values (you can set `max_request_size` per SNMP server, to control the size of batches).
+
+The source code of the plugin is [here](https://github.com/firehol/netdata/blob/master/node.d/snmp.node.js).
+
+## Configuration
+
+You will need to create the file `/etc/netdata/node.d/snmp.conf` with data like the following.
+
+In this example:
+
+ - the SNMP device is `10.11.12.8`.
+ - the SNMP community is `public`.
+ - we will update the values every 10 seconds (`update_every: 10` under the server `10.11.12.8`).
+ - we define 2 charts `snmp_switch.bandwidth_port1` and `snmp_switch.bandwidth_port2`, each having 2 dimensions: `in` and `out`.
+
+```js
+{
+    "enable_autodetect": false,
+    "update_every": 5,
+    "max_request_size": 100,
+    "servers": [
+        {
+            "hostname": "10.11.12.8",
+            "community": "public",
+            "update_every": 10,
+            "max_request_size": 50,
+            "options": { "timeout": 10000 },
+            "charts": {
+                "snmp_switch.bandwidth_port1": {
+                    "title": "Switch Bandwidth for port 1",
+                    "units": "kilobits/s",
+                    "type": "area",
+                    "priority": 1,
+                    "family": "ports",
+                    "dimensions": {
+                        "in": {
+                            "oid": "1.3.6.1.2.1.2.2.1.10.1",
+                            "algorithm": "incremental",
+                            "multiplier": 8,
+                            "divisor": 1024,
+                            "offset": 0
+                        },
+                        "out": {
+                            "oid": "1.3.6.1.2.1.2.2.1.16.1",
+                            "algorithm": "incremental",
+                            "multiplier": -8,
+                            "divisor": 1024,
+                            "offset": 0
+                        }
+                    }
+                },
+                "snmp_switch.bandwidth_port2": {
+                    "title": "Switch Bandwidth for port 2",
+                    "units": "kilobits/s",
+                    "type": "area",
+                    "priority": 1,
+                    "family": "ports",
+                    "dimensions": {
+                        "in": {
+                            "oid": "1.3.6.1.2.1.2.2.1.10.2",
+                            "algorithm": "incremental",
+                            "multiplier": 8,
+                            "divisor": 1024,
+                            "offset": 0
+                        },
+                        "out": {
+                            "oid": "1.3.6.1.2.1.2.2.1.16.2",
+                            "algorithm": "incremental",
+                            "multiplier": -8,
+                            "divisor": 1024,
+                            "offset": 0
+                        }
+                    }
+                }
+            }
+        }
+    ]
+}
+```
+
+`update_every` is the update frequency for each server, in seconds.
+
+`max_request_size` limits the maximum number of OIDs that will be requested in a single call. The default is 50. Lower this number if you get `TooBig` errors in netdata error.log.
+
+`family` sets the name of the submenu of the dashboard each chart will appear under.
+
+If you need to define many charts using incremental OIDs, you can use something like this:
+
+This is like the previous, but the option `multiply_range` given, will multiply the current chart from `1` to `24` inclusive, producing 24 charts in total for the 24 ports of the switch `10.11.12.8`.
+
+Each of the 24 new charts will have its id (1-24) appended at:
+
+1. its chart unique id, i.e. `snmp_switch.bandwidth_port1` to `snmp_switch.bandwidth_port24`
+2. its `title`, i.e. `Switch Bandwidth for port 1` to `Switch Bandwidth for port 24`
+3. its `oid` (for all dimensions), i.e. dimension `in` will be `1.3.6.1.2.1.2.2.1.10.1` to `1.3.6.1.2.1.2.2.1.10.24`
+4. its priority (which will be incremented for each chart so that the charts will appear on the dashboard in this order)
+
+```js
+{
+    "enable_autodetect": false,
+    "update_every": 10,
+    "servers": [
+        {
+            "hostname": "10.11.12.8",
+            "community": "public",
+            "update_every": 10,
+            "options": { "timeout": 20000 },
+            "charts": {
+                "snmp_switch.bandwidth_port": {
+                    "title": "Switch Bandwidth for port ",
+                    "units": "kilobits/s",
+                    "type": "area",
+                    "priority": 1,
+                    "family": "ports",
+                    "multiply_range": [ 1, 24 ],
+                    "dimensions": {
+                        "in": {
+                            "oid": "1.3.6.1.2.1.2.2.1.10.",
+                            "algorithm": "incremental",
+                            "multiplier": 8,
+                            "divisor": 1024,
+                            "offset": 0
+                        },
+                        "out": {
+                            "oid": "1.3.6.1.2.1.2.2.1.16.",
+                            "algorithm": "incremental",
+                            "multiplier": -8,
+                            "divisor": 1024,
+                            "offset": 0
+                        }
+                    }
+                }
+            }
+        }
+    ]
+}
+```
+
+The `options` given for each server, are:
+
+ - `timeout`, the time to wait for the SNMP device to respond. The default is 5000 ms.
+ - `version`, the SNMP version to use. `0` is Version 1, `1` is Version 2c. The default is Version 1 (`0`).
+ - `transport`, the default is `udp4`.
+ - `port`, the port of the SNMP device to connect to. The default is `161`.
+ - `retries`, the number of attempts to make to fetch the data. The default is `1`.
+
+## Retrieving names from SNMP
+
+You can append a value retrieved from SNMP to the title, by adding `titleoid` to the chart.
+
+You can set a dimension name to a value retrieved from SNMP, by adding `oidname` to the dimension.
+
+Both of the above will participate in `multiply_range`.
+
+
+## Testing the configuration
+
+To test it, you can run:
+
+```sh
+/usr/libexec/netdata/plugins.d/node.d.plugin 1 snmp
+```
+
+The above will run it on your console and you will be able to see what netdata sees, but also errors. You can get a very detailed output by appending `debug` to the command line.
+
+If it works, restart netdata to activate the snmp collector and refresh the dashboard (if your SNMP device responds with a delay, you may need to refresh the dashboard in a few seconds).
+
+## Data collection speed
+
+Keep in mind that many SNMP switches and routers are very slow. They may not be able to report values per second. If you run `node.d.plugin` in `debug` mode, it will report the time it took for the SNMP device to respond. My switch, for example, needs 7-8 seconds to respond for the traffic on 24 ports (48 OIDs, in/out).
+
+Also, if you use many SNMP clients on the same SNMP device at the same time, values may be skipped. This is a problem of the SNMP device, not this collector.
+
+## Finding OIDs
+
+Use `snmpwalk`, like this:
+
+```sh
+snmpwalk -t 20 -v 1 -O fn -c public 10.11.12.8
+```
+
+- `-t 20` is the timeout in seconds
+- `-v 1` is the SNMP version
+- `-O fn` will display full OIDs in numeric format (you may want to run it also without this option to see human readable output of OIDs)
+- `-c public` is the SNMP community
+- `10.11.12.8` is the SNMP device
+
+Keep in mind that `snmpwalk` outputs the OIDs with a dot in front of them. You should remove this dot when adding OIDs to the configuration file of this collector.
+
+## Example: Linksys SRW2024P
+
+This is what I use for my Linksys SRW2024P. It creates:
+
+1. A chart for power consumption (it is a PoE switch)
+2. Two charts for packets received (total packets received and packets received with errors)
+3. One chart for packets output
+4. 24 charts, one for each port of the switch. It also appends the port names, as defined at the switch, to the chart titles.
+
+This switch also reports various other metrics, like snmp, packets per port, etc. Unfortunately it does not report CPU utilization or backplane utilization.
+
+This switch has a very slow SNMP processor. To respond, it needs about 8 seconds, so I have set the refresh frequency (`update_every`) to 15 seconds.
+
+```js
+{
+    "enable_autodetect": false,
+    "update_every": 5,
+    "servers": [
+    {
+        "hostname": "10.11.12.8",
+        "community": "public",
+        "update_every": 15,
+        "options": { "timeout": 20000, "version": 1 },
+        "charts": {
+            "snmp_switch.power": {
+                "title": "Switch Power Supply",
+                "units": "watts",
+                "type": "line",
+                "priority": 10,
+                "family": "power",
+                "dimensions": {
+                    "supply": {
+                        "oid": ".1.3.6.1.2.1.105.1.3.1.1.2.1",
+                        "algorithm": "absolute",
+                        "multiplier": 1,
+                        "divisor": 1,
+                        "offset": 0
+                    },
+                    "used": {
+                        "oid": ".1.3.6.1.2.1.105.1.3.1.1.4.1",
+                        "algorithm": "absolute",
+                        "multiplier": 1,
+                        "divisor": 1,
+                        "offset": 0
+                    }
+                }
+            }
+            , "snmp_switch.input": {
+                "title": "Switch Packets Input",
+                "units": "packets/s",
+                "type": "area",
+                "priority": 20,
+                "family": "IP",
+                "dimensions": {
+                    "receives": {
+                        "oid": ".1.3.6.1.2.1.4.3.0",
+                        "algorithm": "incremental",
+                        "multiplier": 1,
+                        "divisor": 1,
+                        "offset": 0
+                    }
+                    , "discards": {
+                        "oid": ".1.3.6.1.2.1.4.8.0",
+                        "algorithm": "incremental",
+                        "multiplier": 1,
+                        "divisor": 1,
+                        "offset": 0
+                    }
+                }
+            }
+            , "snmp_switch.input_errors": {
+                "title": "Switch Received Packets with Errors",
+                "units": "packets/s",
+                "type": "line",
+                "priority": 30,
+                "family": "IP",
+                "dimensions": {
+                    "bad_header": {
+                        "oid": ".1.3.6.1.2.1.4.4.0",
+                        "algorithm": "incremental",
+                        "multiplier": 1,
+                        "divisor": 1,
+                        "offset": 0
+                    }
+                    , "bad_address": {
+                        "oid": ".1.3.6.1.2.1.4.5.0",
+                        "algorithm": "incremental",
+                        "multiplier": 1,
+                        "divisor": 1,
+                        "offset": 0
+                    }
+                    , "unknown_protocol": {
+                        "oid": ".1.3.6.1.2.1.4.7.0",
+                        "algorithm": "incremental",
+                        "multiplier": 1,
+                        "divisor": 1,
+                        "offset": 0
+                    }
+                }
+            }
+            , "snmp_switch.output": {
+                "title": "Switch Output Packets",
+                "units": "packets/s",
+                "type": "line",
+                "priority": 40,
+                "family": "IP",
+                "dimensions": {
+                    "requests": {
+                        "oid": ".1.3.6.1.2.1.4.10.0",
+                        "algorithm": "incremental",
+                        "multiplier": 1,
+                        "divisor": 1,
+                        "offset": 0
+                    }
+                    , "discards": {
+                        "oid": ".1.3.6.1.2.1.4.11.0",
+                        "algorithm": "incremental",
+                        "multiplier": -1,
+                        "divisor": 1,
+                        "offset": 0
+                    }
+                    , "no_route": {
+                        "oid": ".1.3.6.1.2.1.4.12.0",
+                        "algorithm": "incremental",
+                        "multiplier": -1,
+                        "divisor": 1,
+                        "offset": 0
+                    }
+                }
+            }
+            , "snmp_switch.bandwidth_port": {
+                "title": "Switch Bandwidth for port ",
+                "titleoid": ".1.3.6.1.2.1.31.1.1.1.18.",
+                "units": "kilobits/s",
+                "type": "area",
+                "priority": 100,
+                "family": "ports",
+                "multiply_range": [ 1, 24 ],
+                "dimensions": {
+                    "in": {
+                        "oid": ".1.3.6.1.2.1.2.2.1.10.",
+                        "algorithm": "incremental",
+                        "multiplier": 8,
+                        "divisor": 1024,
+                        "offset": 0
+                    }
+                    , "out": {
+                        "oid": ".1.3.6.1.2.1.2.2.1.16.",
+                        "algorithm": "incremental",
+                        "multiplier": -8,
+                        "divisor": 1024,
+                        "offset": 0
+                    }
+                }
+            }
+        }
+    }
+    ]
+}
+```
index 1faee85820b6a180584108665aef6e69ccaa52d6..f98aaeced27ad83886ec99ee7346506b91ec6427 100644 (file)
 #     cluster_stats: False/True        # Calls to cluster stats elasticsearch API. Enabled by default.
 #
 # ----------------------------------------------------------------------
+# IMPORTANT Information
+#
+# Module uses python `requests` package
+#
+# You need to install it manually. (python-requests or python3-requests depending on the version of python).
+#
+#
 # AUTO-DETECTION JOBS
 # only one of them will run (they have the same name)
 #
index fbae919c49f8de0481e31a799c8d87f679c92440..817c38902f86dc6cb0112e0aa8ee39199ca5055d 100644 (file)
@@ -37,6 +37,7 @@ declare -A configs_signatures=(
   ['18ee1c6197a4381b1c1631ef6129824f']='apps_groups.conf'
   ['1972e48345e6c3f0d65f94a03317622b']='health_alarm_notify.conf'
   ['1c12b678ab65f271a96da1bbd0a1ab1c']='health.d/softnet.conf'
+  ['1c3168c95b53e999df3d45162b3f50b8']='health.d/fping.conf'
   ['1ea8e8ef1fa8a3a0fcdfba236f4cb195']='python.d/mysql.conf'
   ['1ef0fd38e7969c023bc3fa6d89eaf6d6']='python.d/mdstat.conf'
   ['1f5545b3ff52b3eb75ee05401f67a9bc']='fping.conf'
@@ -67,8 +68,10 @@ declare -A configs_signatures=(
   ['312b4b8e2805e19cf9be554b319567d6']='health.d/softnet.conf'
   ['318bb45755726a25120bb33413d4b582']='health.d/net.conf'
   ['325617412a628e3bc776e3fbb777a2a6']='health.d/redis.conf'
+  ['326e1477131e0f73304711135f70a2a5']='health.d/memcached.conf'
   ['32fde0057c790964f2c743cb3c9aad29']='health.d/nginx.conf'
   ['33b135e28aeaef2b8224ba69a0fde245']='health.d/cpu.conf'
+  ['343bc919a2fbc93f687f9d1339ec5f79']='health.d/net.conf'
   ['3634d5eddc46fb0d50cf47f370670c2c']='health.d/redis.conf'
   ['364b6e0081b116c9ec073b4d329a6dcc']='health_alarm_notify.conf'
   ['367d1463e520eb9dc89223bab161c6d1']='python.d/postgres.conf'
@@ -111,6 +114,7 @@ declare -A configs_signatures=(
   ['4b775fb31342f1478b3773d041a72911']='python.d.conf'
   ['4ccb06fff1ce06dc5bc80e0a9f568f6e']='charts.d.conf'
   ['4d13684cadfa90e73ab465409bf7263b']='health.d/mysql.conf'
+  ['4d91ee6fe4c887ea3865ef36ac63da3c']='health.d/mysql.conf'
   ['4e995acb0d6fd77403a2a9dca984b55b']='charts.d.conf'
   ['4f6a5b47a13f5912cc89e9286701dd08']='health.d/redis.conf'
   ['4f6f4d39c19d7d954f769d3f9d3b4da5']='health.d/memcached.conf'
@@ -218,6 +222,7 @@ declare -A configs_signatures=(
   ['8c1d41e2c88aeca78bc319ed74c8748c']='python.d/phpfpm.conf'
   ['8d0552371a7c9725a04196fa560813d1']='health.d/cpu.conf'
   ['8dc0bd0a70b5117454bd5f5b98f91c2c']='health.d/disks.conf'
+  ['8f4f925c1e97dd164007495ec5135ffc']='health.d/fping.conf'
   ['8fd472a854b0996327e8ed3562161182']='health_alarm_notify.conf'
   ['919911d13901d60a7580f5dfd7fc87bb']='health.d/ram.conf'
   ['91c757ef6be3abdb86906d9dbb9c217a']='fping.conf'
@@ -231,10 +236,12 @@ declare -A configs_signatures=(
   ['99c1617448abbdc493976ab9bda5ce02']='apps_groups.conf'
   ['9a8a459a3841b78d4c6ef07428ad2fe1']='health.d/entropy.conf'
   ['9c0185ceff15415bc59b2ce2c1f04367']='apps_groups.conf'
+  ['9c8ddfa810d83ae58c8614ee5229e66b']='health.d/disks.conf'
   ['9c981c75bdf4b1637f7113e7e45eb2bf']='health.d/memcached.conf'
   ['9e0553ebdc21b64295873fc104cfa79d']='python.d.conf'
   ['9eb3326ae2ee9badeaad31d8dd2eaa2b']='python.d/isc_dhcpd.conf'
   ['a02d14124b19c635c1426cee2e98bac5']='charts.d.conf'
+  ['a03f3e38378385bf87d4c0f81eb1f108']='health.d/tcp_resets.conf'
   ['a09714b5942cf25a89ec3da1dbc18063']='health.d/ram.conf'
   ['a0b3a12389c9c56dfe35964b20b59836']='health.d/bind_rndc.conf'
   ['a0ee8f351f213c0e8af9eb7a4a09cb95']='apps_groups.conf'
@@ -334,6 +341,7 @@ declare -A configs_signatures=(
   ['de02f899a61f21b86adb646940f0bcae']='health.d/net.conf'
   ['def883f35986c9d25de63b1a8e7d0f46']='health.d/entropy.conf'
   ['df381f3a7ca9fb2b4b43ae7cb7a4c492']='python.d/mysql.conf'
+  ['df7e8044902b5e155fad8430c2ddcfa8']='health.d/fping.conf'
   ['dfd5431b11cf2f3852a40d390c1d5a92']='python.d/varnish.conf'
   ['e0242003fd2e3f9ac1b9314e802ada79']='python.d/hddtemp.conf'
   ['e0e96cc47ed61d6492416be5236cd4d3']='python.d/apache_cache.conf'
index 5a478937e99e56e8516cd4e80033cc20aac2abcc..c0974bda6df38c68518f24320dc924446e6fd397 100644 (file)
@@ -1,6 +1,7 @@
 'use strict';
-
+// netdata snmp module
 // This program will connect to one or more SNMP Agents
+//
 
 // example configuration in /etc/netdata/node.d/snmp.conf
 /*
                             "oid": ".1.3.6.1.2.1.2.2.1.10.1",
                             "algorithm": "incremental",
                             "multiplier": 8,
-                            "divisor": 1024
+                            "divisor": 1024,
+                            "offset": 0
                         },
                         "out": {
                             "oid": ".1.3.6.1.2.1.2.2.1.16.1",
                             "algorithm": "incremental",
                             "multiplier": -8,
-                            "divisor": 1024
+                            "divisor": 1024,
+                            "offset": 0
                         }
                     }
                 },
                             "oid": ".1.3.6.1.2.1.2.2.1.10.2",
                             "algorithm": "incremental",
                             "multiplier": 8,
-                            "divisor": 1024
+                            "divisor": 1024,
+                            "offset": 0
                         },
                         "out": {
                             "oid": ".1.3.6.1.2.1.2.2.1.16.2",
                             "algorithm": "incremental",
                             "multiplier": -8,
-                            "divisor": 1024
+                            "divisor": 1024,
+                            "offset": 0
                         }
                     }
                 }
                             "oid": ".1.3.6.1.2.1.2.2.1.10.",
                             "algorithm": "incremental",
                             "multiplier": 8,
-                            "divisor": 1024
+                            "divisor": 1024,
+                            "offset": 0
                         },
                         "out": {
                             "oid": ".1.3.6.1.2.1.2.2.1.16.",
                             "algorithm": "incremental",
                             "multiplier": -8,
-                            "divisor": 1024
+                            "divisor": 1024,
+                            "offset": 0
                         }
                     }
                 }
@@ -360,8 +367,12 @@ var snmp = {
                 for(var j = 0; j < dim_keys_len ; j++) {
                     var d = dim_keys[j];
 
-                    if (dimensions[d].value !== null)
-                        service.set(d, dimensions[d].value);
+                    if (dimensions[d].value !== null) {
+                        if(typeof dimensions[d].offset === 'number')
+                            service.set(d, dimensions[d].value + dimensions[d].offset);
+                        else
+                            service.set(d, dimensions[d].value);
+                    }
                 }
 
                 service.end();
index 654dac433df71d0b7bd199bd8bcea2baf4d57073..c70acc724df93cdd21a427b6a89ada93f0783430 100755 (executable)
@@ -47,7 +47,7 @@ then
         echo >&2
         echo >&2 "# SENDING TEST ${x} ALARM TO ROLE: ${recipient}"
 
-        "${0}" "${recipient}" "$(hostname)" 1 1 "${id}" "$(date +%s)" "test_alarm" "test.chart" "test.family" "${x}" "${last}" 100 90 "${0}" 1 $((0 + id)) "units" "this is a test alarm to verify notifications work"
+        "${0}" "${recipient}" "$(hostname)" 1 1 "${id}" "$(date +%s)" "test_alarm" "test.chart" "test.family" "${x}" "${last}" 100 90 "${0}" 1 $((0 + id)) "units" "this is a test alarm to verify notifications work" "new value" "old value"
         if [ $? -ne 0 ]
         then
             echo >&2 "# FAILED"
@@ -139,6 +139,8 @@ duration="${15}"   # the duration in seconds of the previous alarm state
 non_clear_duration="${16}" # the total duration in seconds this is/was non-clear
 units="${17}"      # the units of the value
 info="${18}"       # a short description of the alarm
+value_string="${19}"        # friendly value (with units)
+old_value_string="${20}"    # friendly old value (with units)
 
 # -----------------------------------------------------------------------------
 # screen statuses we don't need to send a notification
@@ -773,13 +775,13 @@ send_pd() {
         then
         for PD_SERVICE_KEY in ${recipients}
         do
-            d="${status} ${name}=${value} ${units} - ${host}, ${family}"
+            d="${status} ${name} = ${value_string} - ${host}, ${family}"
             ${pd_send} -k ${PD_SERVICE_KEY} \
                        -t ${t} \
                        -d "${d}" \
                        -i ${alarm_id} \
                        -f 'info'="${info}" \
-                       -f 'value_w_units'="${value} ${units}" \
+                       -f 'value_w_units'="${value_string}" \
                        -f 'when'="${when}" \
                        -f 'duration'="${duration}" \
                        -f 'roles'="${roles}" \
@@ -1120,7 +1122,7 @@ status_message="status unknown"
 color="grey"
 
 # the alarm value
-alarm="${name//_/ } = ${value} ${units}"
+alarm="${name//_/ } = ${value_string}"
 
 # the image of the alarm
 image="${images_base_url}/images/seo-performance-128.png"
index 75f5614a7b20bb0c9f0e06078086553152ad4295..5fccf30b8d48007131104de627d82e3764ecb322 100644 (file)
@@ -307,6 +307,12 @@ If no configuration is given, module will attempt to connect to dovecot using un
 
 Module monitor elasticsearch performance and health metrics
 
+**Requirements:**
+ * python `requests` package.
+
+You need to install it manually. (python-requests or python3-requests depending on the version of python).
+
+
 It produces:
 
 1. **Search performance** charts:
index 919b6f8eebef60a57fa002f56fdaa2d626679158..09815409cf7fdc5db89a1abb93271e41a96a25ef 100644 (file)
@@ -8,6 +8,7 @@ from copy import deepcopy
 import psycopg2
 from psycopg2 import extensions
 from psycopg2.extras import DictCursor
+from psycopg2 import OperationalError
 
 from base import SimpleService
 
@@ -193,7 +194,7 @@ class Service(SimpleService):
         self.table_stats = configuration.pop('table_stats', True)
         self.index_stats = configuration.pop('index_stats', True)
         self.configuration = configuration
-        self.connection = None
+        self.connection = False
         self.is_superuser = False
         self.data = {}
         self.databases = set()
@@ -207,9 +208,13 @@ class Service(SimpleService):
         params.update(self.configuration)
 
         if not self.connection:
-            self.connection = psycopg2.connect(**params)
-            self.connection.set_isolation_level(extensions.ISOLATION_LEVEL_AUTOCOMMIT)
-            self.connection.set_session(readonly=True)
+            try:
+                self.connection = psycopg2.connect(**params)
+                self.connection.set_isolation_level(extensions.ISOLATION_LEVEL_AUTOCOMMIT)
+                self.connection.set_session(readonly=True)
+            except OperationalError:
+                return False
+        return True
 
     def check(self):
         try:
@@ -291,13 +296,20 @@ class Service(SimpleService):
                 self.definitions[chart_name]['lines'].append([lock_id, label, 'absolute'])
 
     def _get_data(self):
-        self._connect()
-
-        cursor = self.connection.cursor(cursor_factory=DictCursor)
-        self.add_stats(cursor)
-
-        cursor.close()
-        return self.data
+        if self._connect():
+            cursor = self.connection.cursor(cursor_factory=DictCursor)
+            try:
+                self.add_stats(cursor)
+            except OperationalError:
+                if self.connection.closed == 2:
+                    self.connection = False
+                cursor.close()
+                return None
+            else:
+                cursor.close()
+                return self.data
+        else:
+            return None
 
     def add_stats(self, cursor):
         self.add_database_stats(cursor)
index 31f6ab2483b4f0660ad250fc1f47be394352f64f..154e4e4ba69ca8ea766c861eabc9a1491ce3a025 100644 (file)
@@ -2,11 +2,13 @@
 # Description: tomcat netdata python.d module
 # Author: Pawel Krupa (paulfantom)
 
-# Python version higher than 2.7 is needed to run this module.
-
 from base import UrlService
-import xml.etree.ElementTree as ET  # phone home...
-#from xml.parsers.expat import errors
+from re import compile
+
+try:
+    from urlparse import urlparse
+except ImportError:
+    from urllib.parse import urlparse
 
 # default module values (can be overridden per job in `config`)
 # update_every = 2
@@ -20,23 +22,23 @@ CHARTS = {
     'accesses': {
         'options': [None, "Requests", "requests/s", "statistics", "tomcat.accesses", "area"],
         'lines': [
-            ["accesses", None, 'incremental']
+            ["requestCount", 'accesses', 'incremental']
         ]},
     'volume': {
         'options': [None, "Volume", "KB/s", "volume", "tomcat.volume", "area"],
         'lines': [
-            ["volume", None, 'incremental', 1, 1024]
+            ["bytesSent", 'volume', 'incremental', 1, 1024]
         ]},
     'threads': {
         'options': [None, "Threads", "current threads", "statistics", "tomcat.threads", "line"],
         'lines': [
-            ["current", None, "absolute"],
-            ["busy", None, "absolute"]
+            ["currentThreadCount", 'current', "absolute"],
+            ["currentThreadsBusy", 'busy', "absolute"]
         ]},
     'jvm': {
         'options': [None, "JVM Free Memory", "MB", "statistics", "tomcat.jvm", "area"],
         'lines': [
-            ["jvm", None, "absolute", 1, 1048576]
+            ["free", None, "absolute", 1, 1048576]
         ]}
 }
 
@@ -44,68 +46,35 @@ CHARTS = {
 class Service(UrlService):
     def __init__(self, configuration=None, name=None):
         UrlService.__init__(self, configuration=configuration, name=name)
-        if len(self.url) == 0:
-            self.url = "http://localhost:8080/manager/status?XML=true"
+        self.url = self.configuration.get('url', "http://127.0.0.1:8080/manager/status?XML=true")
         self.order = ORDER
         self.definitions = CHARTS
-        self.port = 8080
 
     def check(self):
-        if UrlService.check(self):
-            return True
-
-        # get port from url
-        self.port = 0
-        for i in self.url.split('/'):
-            try:
-                int(i[-1])
-                self.port = i.split(':')[-1]
-                break
-            except:
-                pass
-        if self.port == 0:
-            self.port = 80
-
-        test = self._get_data()
-        if test is None or len(test) == 0:
+        if not self.url.endswith('manager/status?XML=true'):
+            self.error('Bad url(%s). Must be http://<ip.address>:<port>/manager/status?XML=true' % self.url)
             return False
-        else:
-            return True
+
+        netloc = urlparse(self.url).netloc.rpartition(':')
+        if netloc[1] == ':': port = netloc[2]
+        else: port = 80
+        
+        self.regex_jvm = compile(r'<jvm>.*?</jvm>')
+        self.regex_connector = compile(r'[a-z-]+%s.*?/connector' % port)
+        self.regex = compile(r'([\w]+)=\\?[\'\"](\d+)\\?[\'\"]')
+        
+        return UrlService.check(self)
 
     def _get_data(self):
         """
         Format data received from http request
         :return: dict
         """
-        try:
-            raw = self._get_raw_data()
-            try:
-                data = ET.fromstring(raw)
-            except ET.ParseError as e:
-                # if e.code == errors.codes[errors.XML_ERROR_JUNK_AFTER_DOC_ELEMENT]:
-                if e.code == 9:
-                    end = raw.find('</status>')
-                    end += 9
-                    raw = raw[:end]
-                    self.debug(raw)
-                    data = ET.fromstring(raw)
-                else:
-                    raise Exception(e)
-
-            memory = data.find('./jvm/memory')
-            threads = data.find("./connector[@name='\"http-bio-" + str(self.port) + "\"']/threadInfo")
-            requests = data.find("./connector[@name='\"http-bio-" + str(self.port) + "\"']/requestInfo")
+        data = self._get_raw_data()
+        if data:
+            jvm = self.regex_jvm.findall(data) or ['']
+            connector = self.regex_connector.findall(data) or ['']
+            data = dict(self.regex.findall(''.join([jvm[0], connector[0]])))
+        
+        return data or None
 
-            return {'accesses': requests.attrib['requestCount'],
-                    'volume': requests.attrib['bytesSent'],
-                    'current': threads.attrib['currentThreadCount'],
-                    'busy': threads.attrib['currentThreadsBusy'],
-                    'jvm': memory.attrib['free']}
-        except (ValueError, AttributeError) as e:
-            self.debug(str(e))
-            return None
-        except SyntaxError as e:
-            self.error("Tomcat module needs python 2.7 at least. Stopping")
-            self.debug(str(e))
-        except Exception as e:
-            self.debug(str(e))
index 2b1512f4e58b516c16ac7e6fad836ad95572236d..4b4daddd24d32d77d4a184b5085fe7f854646672 100644 (file)
@@ -145,7 +145,7 @@ class Service(SimpleService):
         if not raw_data:
             return None
 
-        return raw_data
+        return raw_data.decode()
 
     def _get_data(self):
         """
index 0a72190aaff7ccbc3cf1e794a997bd7c6416f7e4..1a43d29ca5f307e849495fc90b628cb2c04707bc 100644 (file)
+
+/*
+ * netdata apps.plugin
+ * (C) Copyright 2016-2017 Costa Tsaousis <costa@tsaousis.gr>
+ * Released under GPL v3+
+ */
+
 #include "common.h"
 
+
+// ----------------------------------------------------------------------------
+// string lengths
+
 #define MAX_COMPARE_NAME 100
 #define MAX_NAME 100
 #define MAX_CMDLINE 1024
 
-// the rates we are going to send to netdata
-// will have this detail
-// a value of:
-// 1 will send just integer parts to netdata
-// 100 will send 2 decimal points
-// 1000 will send 3 decimal points
+
+// ----------------------------------------------------------------------------
+// the rates we are going to send to netdata will have this detail; a value of:
+//  - 1 will send just integer parts to netdata
+//  - 100 will send 2 decimal points
+//  - 1000 will send 3 decimal points
 // etc.
 #define RATES_DETAIL 10000ULL
 
+
+// ----------------------------------------------------------------------------
+// to avoid reallocating too frequently, we can increase the number of spare
+// file descriptors used by processes.
+// IMPORTANT:
+// having a lot of spares, increases the CPU utilization of the plugin.
 #define MAX_SPARE_FDS 1
 
-int debug = 0;
 
-int update_every = 1;
-unsigned long long global_iterations_counter = 1;
-unsigned long long file_counter = 0;
-int proc_pid_cmdline_is_needed = 0;
-int include_exited_childs = 1;
-char *config_dir = CONFIG_DIR;
+// ----------------------------------------------------------------------------
+// command line options
 
-pid_t *all_pids_sortlist = NULL;
+static int
+        debug = 0,
+        update_every = 1,
+        enable_guest_charts = 0,
+        enable_file_charts = 1,
+        enable_users_charts = 1,
+        enable_groups_charts = 1,
+        include_exited_childs = 1;
 
-// will be automatically set to 1, if guest values are collected
-int show_guest_time = 0;
-int show_guest_time_old = 0;
 
-int enable_guest_charts = 0;
-int enable_file_charts = 1;
-int enable_users_charts = 1;
-int enable_groups_charts = 1;
+// will be changed to getenv(NETDATA_CONFIG_DIR) if it exists
+static char *config_dir = CONFIG_DIR;
 
 // ----------------------------------------------------------------------------
+// internal flags
+// handled in code (automatically set)
 
-void netdata_cleanup_and_exit(int ret) {
-    exit(ret);
-}
+static int
+        show_guest_time = 0,            // 1 when guest values are collected
+        show_guest_time_old = 0,
+        proc_pid_cmdline_is_needed = 0; // 1 when we need to read /proc/cmdline
+
+
+// ----------------------------------------------------------------------------
+// internal counters
+
+static size_t
+        global_iterations_counter = 1,
+        file_counter = 0;
+
+
+// ----------------------------------------------------------------------------
+// Normalization
+//
+// With normalization we lower the collected metrics by a factor to make them
+// match the total utilization of the system.
+// The discrepancy exists because apps.plugin needs some time to collect all
+// the metrics. This results in utilization that exceeds the total utilization
+// of the system.
+//
+// With normalization we align the per-process utilization, to the total of
+// the system. We first consume the exited children utilization and if the
+// collected values are above the total, we proportionally scale each reported
+// metric.
+
+// the total system time, as reported by /proc/stat
+static kernel_uint_t
+        global_utime = 0,
+        global_stime = 0,
+        global_gtime = 0;
+
+
+// the normalization ratios, as calculated by normalize_utilization()
+double  utime_fix_ratio = 1.0,
+        stime_fix_ratio = 1.0,
+        gtime_fix_ratio = 1.0,
+        minflt_fix_ratio = 1.0,
+        majflt_fix_ratio = 1.0,
+        cutime_fix_ratio = 1.0,
+        cstime_fix_ratio = 1.0,
+        cgtime_fix_ratio = 1.0,
+        cminflt_fix_ratio = 1.0,
+        cmajflt_fix_ratio = 1.0;
 
 
 // ----------------------------------------------------------------------------
 // target
-// target is the structure that process data are aggregated
+//
+// target is the structure into which processes are aggregated, to be
+// reported to netdata.
+//
+// - Each entry in /etc/apps_groups.conf creates a target.
+// - Each user and group used by a process in the system, creates a target.
 
 struct target {
     char compare[MAX_COMPARE_NAME + 1];
@@ -59,74 +123,280 @@ struct target {
     uid_t uid;
     gid_t gid;
 
-    unsigned long long minflt;
-    unsigned long long cminflt;
-    unsigned long long majflt;
-    unsigned long long cmajflt;
-    unsigned long long utime;
-    unsigned long long stime;
-    unsigned long long gtime;
-    unsigned long long cutime;
-    unsigned long long cstime;
-    unsigned long long cgtime;
-    unsigned long long num_threads;
-    // unsigned long long rss;
-
-    unsigned long long statm_size;
-    unsigned long long statm_resident;
-    unsigned long long statm_share;
-    // unsigned long long statm_text;
-    // unsigned long long statm_lib;
-    // unsigned long long statm_data;
-    // unsigned long long statm_dirty;
-
-    unsigned long long io_logical_bytes_read;
-    unsigned long long io_logical_bytes_written;
-    // unsigned long long io_read_calls;
-    // unsigned long long io_write_calls;
-    unsigned long long io_storage_bytes_read;
-    unsigned long long io_storage_bytes_written;
-    // unsigned long long io_cancelled_write_bytes;
+    kernel_uint_t minflt;
+    kernel_uint_t cminflt;
+    kernel_uint_t majflt;
+    kernel_uint_t cmajflt;
+    kernel_uint_t utime;
+    kernel_uint_t stime;
+    kernel_uint_t gtime;
+    kernel_uint_t cutime;
+    kernel_uint_t cstime;
+    kernel_uint_t cgtime;
+    kernel_uint_t num_threads;
+    // kernel_uint_t rss;
+
+    kernel_uint_t statm_size;
+    kernel_uint_t statm_resident;
+    kernel_uint_t statm_share;
+    // kernel_uint_t statm_text;
+    // kernel_uint_t statm_lib;
+    // kernel_uint_t statm_data;
+    // kernel_uint_t statm_dirty;
+
+    kernel_uint_t io_logical_bytes_read;
+    kernel_uint_t io_logical_bytes_written;
+    // kernel_uint_t io_read_calls;
+    // kernel_uint_t io_write_calls;
+    kernel_uint_t io_storage_bytes_read;
+    kernel_uint_t io_storage_bytes_written;
+    // kernel_uint_t io_cancelled_write_bytes;
 
     int *target_fds;
     int target_fds_size;
 
-    unsigned long long openfiles;
-    unsigned long long openpipes;
-    unsigned long long opensockets;
-    unsigned long long openinotifies;
-    unsigned long long openeventfds;
-    unsigned long long opentimerfds;
-    unsigned long long opensignalfds;
-    unsigned long long openeventpolls;
-    unsigned long long openother;
-
-    unsigned long processes;    // how many processes have been merged to this
-    int exposed;                // if set, we have sent this to netdata
-    int hidden;                 // if set, we set the hidden flag on the dimension
+    kernel_uint_t openfiles;
+    kernel_uint_t openpipes;
+    kernel_uint_t opensockets;
+    kernel_uint_t openinotifies;
+    kernel_uint_t openeventfds;
+    kernel_uint_t opentimerfds;
+    kernel_uint_t opensignalfds;
+    kernel_uint_t openeventpolls;
+    kernel_uint_t openother;
+
+    unsigned int processes; // how many processes have been merged to this
+    int exposed;            // if set, we have sent this to netdata
+    int hidden;             // if set, we set the hidden flag on the dimension
     int debug;
     int ends_with;
-    int starts_with;            // if set, the compare string matches only the
-                                // beginning of the command
+    int starts_with;        // if set, the compare string matches only the
+                            // beginning of the command
 
-    struct target *target;      // the one that will be reported to netdata
+    struct target *target;  // the one that will be reported to netdata
     struct target *next;
 };
 
+struct target
+        *apps_groups_default_target = NULL, // the default target
+        *apps_groups_root_target = NULL,    // apps_groups.conf defined
+        *users_root_target = NULL,          // users
+        *groups_root_target = NULL;         // user groups
+
+size_t
+        apps_groups_targets_count = 0;       // # of apps_groups.conf targets
+
 
 // ----------------------------------------------------------------------------
-// apps_groups.conf
-// aggregate all processes in groups, to have a limited number of dimensions
+// pid_stat
+//
+// structure to store data for each process running
+// see: man proc for the description of the fields
 
-struct target *apps_groups_root_target = NULL;
-struct target *apps_groups_default_target = NULL;
-long apps_groups_targets = 0;
+struct pid_stat {
+    int32_t pid;
+    char comm[MAX_COMPARE_NAME + 1];
+    char cmdline[MAX_CMDLINE + 1];
 
-struct target *users_root_target = NULL;
-struct target *groups_root_target = NULL;
+    uint32_t log_thrown;
 
-static struct target *get_users_target(uid_t uid)
-{
+    // char state;
+    int32_t ppid;
+    // int32_t pgrp;
+    // int32_t session;
+    // int32_t tty_nr;
+    // int32_t tpgid;
+    // uint64_t flags;
+
+    // these are raw values collected
+    kernel_uint_t minflt_raw;
+    kernel_uint_t cminflt_raw;
+    kernel_uint_t majflt_raw;
+    kernel_uint_t cmajflt_raw;
+    kernel_uint_t utime_raw;
+    kernel_uint_t stime_raw;
+    kernel_uint_t gtime_raw; // guest_time
+    kernel_uint_t cutime_raw;
+    kernel_uint_t cstime_raw;
+    kernel_uint_t cgtime_raw; // cguest_time
+
+    // these are rates
+    kernel_uint_t minflt;
+    kernel_uint_t cminflt;
+    kernel_uint_t majflt;
+    kernel_uint_t cmajflt;
+    kernel_uint_t utime;
+    kernel_uint_t stime;
+    kernel_uint_t gtime;
+    kernel_uint_t cutime;
+    kernel_uint_t cstime;
+    kernel_uint_t cgtime;
+
+    // int64_t priority;
+    // int64_t nice;
+    int32_t num_threads;
+    // int64_t itrealvalue;
+    // kernel_uint_t starttime;
+    // kernel_uint_t vsize;
+    // kernel_uint_t rss;
+    // kernel_uint_t rsslim;
+    // kernel_uint_t starcode;
+    // kernel_uint_t endcode;
+    // kernel_uint_t startstack;
+    // kernel_uint_t kstkesp;
+    // kernel_uint_t kstkeip;
+    // uint64_t signal;
+    // uint64_t blocked;
+    // uint64_t sigignore;
+    // uint64_t sigcatch;
+    // uint64_t wchan;
+    // uint64_t nswap;
+    // uint64_t cnswap;
+    // int32_t exit_signal;
+    // int32_t processor;
+    // uint32_t rt_priority;
+    // uint32_t policy;
+    // kernel_uint_t delayacct_blkio_ticks;
+
+    uid_t uid;
+    gid_t gid;
+
+    kernel_uint_t statm_size;
+    kernel_uint_t statm_resident;
+    kernel_uint_t statm_share;
+    // kernel_uint_t statm_text;
+    // kernel_uint_t statm_lib;
+    // kernel_uint_t statm_data;
+    // kernel_uint_t statm_dirty;
+
+    kernel_uint_t io_logical_bytes_read_raw;
+    kernel_uint_t io_logical_bytes_written_raw;
+    // kernel_uint_t io_read_calls_raw;
+    // kernel_uint_t io_write_calls_raw;
+    kernel_uint_t io_storage_bytes_read_raw;
+    kernel_uint_t io_storage_bytes_written_raw;
+    // kernel_uint_t io_cancelled_write_bytes_raw;
+
+    kernel_uint_t io_logical_bytes_read;
+    kernel_uint_t io_logical_bytes_written;
+    // kernel_uint_t io_read_calls;
+    // kernel_uint_t io_write_calls;
+    kernel_uint_t io_storage_bytes_read;
+    kernel_uint_t io_storage_bytes_written;
+    // kernel_uint_t io_cancelled_write_bytes;
+
+    int *fds;                       // array of fds it uses
+    int fds_size;                   // the size of the fds array
+
+    int children_count;             // number of processes directly referencing this
+    char keep:1;                    // 1 when we need to keep this process in memory even after it exited
+    int keeploops;                  // increases by 1 every time keep is 1 and updated 0
+    char updated:1;                 // 1 when the process is currently running
+    char merged:1;                  // 1 when it has been merged to its parent
+    char new_entry:1;               // 1 when this is a new process, just saw for the first time
+    char read:1;                    // 1 when we have already read this process for this iteration
+
+    int sortlist;                   // higher numbers = top on the process tree
+                                    // each process gets a unique number
+
+    struct target *target;          // app_groups.conf targets
+    struct target *user_target;     // uid based targets
+    struct target *group_target;    // gid based targets
+
+    usec_t stat_collected_usec;
+    usec_t last_stat_collected_usec;
+
+    usec_t io_collected_usec;
+    usec_t last_io_collected_usec;
+
+    char *fds_dirname;              // the full directory name in /proc/PID/fd
+
+    char *stat_filename;
+    char *statm_filename;
+    char *io_filename;
+    char *cmdline_filename;
+
+    struct pid_stat *parent;
+    struct pid_stat *prev;
+    struct pid_stat *next;
+};
+
+// log each problem once per process
+// log flood protection flags (log_thrown)
+#define PID_LOG_IO      0x00000001
+#define PID_LOG_STATM   0x00000002
+#define PID_LOG_CMDLINE 0x00000004
+#define PID_LOG_FDS     0x00000008
+#define PID_LOG_STAT    0x00000010
+
+static struct pid_stat
+        *root_of_pids = NULL,   // global list of all processes running
+        **all_pids = NULL;      // to avoid allocations, we pre-allocate the
+                                // entire pid space.
+
+static size_t
+        all_pids_count = 0;     // the number of processes running
+
+// Another pre-allocated list of all possible pids.
+// We need it to sort pids and assign them a unique sortlist id, so that we
+// read parents before children. This is needed to prevent a situation where
+// a child is found running, but until we read its parent, it has exited and
+// its parent has accumulated its resources.
+static pid_t
+        *all_pids_sortlist = NULL;
+
+
+// ----------------------------------------------------------------------------
+// file descriptor
+//
+// this is used to keep a global list of all open files of the system.
+// it is needed in order to calculate the unique files processes have open.
+
+#define FILE_DESCRIPTORS_INCREASE_STEP 100
+
+struct file_descriptor {
+    avl avl;
+
+#ifdef NETDATA_INTERNAL_CHECKS
+    uint32_t magic;
+#endif /* NETDATA_INTERNAL_CHECKS */
+
+    const char *name;
+    uint32_t hash;
+
+    char type;
+    int count;
+    int pos;
+} *all_files = NULL;
+
+static int
+        all_files_len = 0,
+        all_files_size = 0;
+
+// types for struct file_descriptor->type
+#define FILETYPE_OTHER      0
+#define FILETYPE_FILE       1
+#define FILETYPE_PIPE       2
+#define FILETYPE_SOCKET     3
+#define FILETYPE_INOTIFY    4
+#define FILETYPE_EVENTFD    5
+#define FILETYPE_EVENTPOLL  6
+#define FILETYPE_TIMERFD    7
+#define FILETYPE_SIGNALFD   8
+
+
+// ----------------------------------------------------------------------------
+// callback required by fatal()
+
+void netdata_cleanup_and_exit(int ret) {
+    exit(ret);
+}
+
+// ----------------------------------------------------------------------------
+// apps_groups.conf
+// aggregate all processes in groups, to have a limited number of dimensions
+
+static struct target *get_users_target(uid_t uid) {
     struct target *w;
     for(w = users_root_target ; w ; w = w->next)
         if(w->uid == uid) return w;
@@ -221,10 +491,12 @@ static struct target *get_apps_groups_target(const char *id, struct target *targ
             if(*name == '-') thidden = 1;
             name++;
         }
-        for(target = apps_groups_root_target ; target ; target = target->next) {
+
+        for(target = apps_groups_root_target ; target != NULL ; target = target->next) {
             if(!target->target && strcmp(name, target->name) == 0)
                 break;
         }
+
         if(unlikely(debug)) {
             if(unlikely(target))
                 fprintf(stderr, "apps.plugin: REUSING TARGET NAME '%s' on ID '%s'\n", target->name, target->id);
@@ -302,10 +574,10 @@ static int read_apps_groups_conf(const char *file)
     if(!ff)
         return 1;
 
-    unsigned long line, lines = procfile_lines(ff);
+    size_t line, lines = procfile_lines(ff);
 
     for(line = 0; line < lines ;line++) {
-        unsigned long word, words = procfile_linewords(ff, line);
+        size_t word, words = procfile_linewords(ff, line);
         if(!words) continue;
 
         char *name = procfile_lineword(ff, line, 0);
@@ -351,144 +623,10 @@ static int read_apps_groups_conf(const char *file)
 
 
 // ----------------------------------------------------------------------------
-// data to store for each pid
-// see: man proc
-
-#define PID_LOG_IO      0x00000001
-#define PID_LOG_STATM   0x00000002
-#define PID_LOG_CMDLINE 0x00000004
-#define PID_LOG_FDS     0x00000008
-#define PID_LOG_STAT    0x00000010
-
-struct pid_stat {
-    int32_t pid;
-    char comm[MAX_COMPARE_NAME + 1];
-    char cmdline[MAX_CMDLINE + 1];
-
-    uint32_t log_thrown;
-
-    // char state;
-    int32_t ppid;
-    // int32_t pgrp;
-    // int32_t session;
-    // int32_t tty_nr;
-    // int32_t tpgid;
-    // uint64_t flags;
-
-    // these are raw values collected
-    unsigned long long minflt_raw;
-    unsigned long long cminflt_raw;
-    unsigned long long majflt_raw;
-    unsigned long long cmajflt_raw;
-    unsigned long long utime_raw;
-    unsigned long long stime_raw;
-    unsigned long long gtime_raw; // guest_time
-    unsigned long long cutime_raw;
-    unsigned long long cstime_raw;
-    unsigned long long cgtime_raw; // cguest_time
-
-    // these are rates
-    unsigned long long minflt;
-    unsigned long long cminflt;
-    unsigned long long majflt;
-    unsigned long long cmajflt;
-    unsigned long long utime;
-    unsigned long long stime;
-    unsigned long long gtime;
-    unsigned long long cutime;
-    unsigned long long cstime;
-    unsigned long long cgtime;
-
-    // int64_t priority;
-    // int64_t nice;
-    int32_t num_threads;
-    // int64_t itrealvalue;
-    // unsigned long long starttime;
-    // unsigned long long vsize;
-    // unsigned long long rss;
-    // unsigned long long rsslim;
-    // unsigned long long starcode;
-    // unsigned long long endcode;
-    // unsigned long long startstack;
-    // unsigned long long kstkesp;
-    // unsigned long long kstkeip;
-    // uint64_t signal;
-    // uint64_t blocked;
-    // uint64_t sigignore;
-    // uint64_t sigcatch;
-    // uint64_t wchan;
-    // uint64_t nswap;
-    // uint64_t cnswap;
-    // int32_t exit_signal;
-    // int32_t processor;
-    // uint32_t rt_priority;
-    // uint32_t policy;
-    // unsigned long long delayacct_blkio_ticks;
-
-    uid_t uid;
-    gid_t gid;
-
-    unsigned long long statm_size;
-    unsigned long long statm_resident;
-    unsigned long long statm_share;
-    // unsigned long long statm_text;
-    // unsigned long long statm_lib;
-    // unsigned long long statm_data;
-    // unsigned long long statm_dirty;
-
-    unsigned long long io_logical_bytes_read_raw;
-    unsigned long long io_logical_bytes_written_raw;
-    // unsigned long long io_read_calls_raw;
-    // unsigned long long io_write_calls_raw;
-    unsigned long long io_storage_bytes_read_raw;
-    unsigned long long io_storage_bytes_written_raw;
-    // unsigned long long io_cancelled_write_bytes_raw;
-
-    unsigned long long io_logical_bytes_read;
-    unsigned long long io_logical_bytes_written;
-    // unsigned long long io_read_calls;
-    // unsigned long long io_write_calls;
-    unsigned long long io_storage_bytes_read;
-    unsigned long long io_storage_bytes_written;
-    // unsigned long long io_cancelled_write_bytes;
-
-    int *fds;                       // array of fds it uses
-    int fds_size;                   // the size of the fds array
-
-    int children_count;             // number of processes directly referencing this
-    int keep;                       // 1 when we need to keep this process in memory even after it exited
-    int keeploops;                  // increases by 1 every time keep is 1 and updated 0
-    int updated;                    // 1 when the process is currently running
-    int merged;                     // 1 when it has been merged to its parent
-    int new_entry;                  // 1 when this is a new process, just saw for the first time
-    int read;                       // 1 when we have already read this process for this iteration
-    int sortlist;                   // higher numbers = top on the process tree
-                                    // each process gets a unique number
-
-    struct target *target;          // app_groups.conf targets
-    struct target *user_target;     // uid based targets
-    struct target *group_target;    // gid based targets
-
-    unsigned long long stat_collected_usec;
-    unsigned long long last_stat_collected_usec;
-
-    unsigned long long io_collected_usec;
-    unsigned long long last_io_collected_usec;
-
-    char *stat_filename;
-    char *statm_filename;
-    char *io_filename;
-    char *cmdline_filename;
-
-    struct pid_stat *parent;
-    struct pid_stat *prev;
-    struct pid_stat *next;
-} *root_of_pids = NULL, **all_pids;
-
-long all_pids_count = 0;
+// struct pid_stat management
 
 static inline struct pid_stat *get_pid_entry(pid_t pid) {
-    if(all_pids[pid]) {
+    if(unlikely(all_pids[pid])) {
         all_pids[pid]->new_entry = 0;
         return all_pids[pid];
     }
@@ -497,7 +635,9 @@ static inline struct pid_stat *get_pid_entry(pid_t pid) {
     all_pids[pid]->fds = callocz(sizeof(int), MAX_SPARE_FDS);
     all_pids[pid]->fds_size = MAX_SPARE_FDS;
 
-    if(root_of_pids) root_of_pids->prev = all_pids[pid];
+    if(likely(root_of_pids))
+        root_of_pids->prev = all_pids[pid];
+
     all_pids[pid]->next = root_of_pids;
     root_of_pids = all_pids[pid];
 
@@ -510,7 +650,7 @@ static inline struct pid_stat *get_pid_entry(pid_t pid) {
 }
 
 static inline void del_pid_entry(pid_t pid) {
-    if(!all_pids[pid]) {
+    if(unlikely(!all_pids[pid])) {
         error("attempted to free pid %d that is not allocated.", pid);
         return;
     }
@@ -518,15 +658,18 @@ static inline void del_pid_entry(pid_t pid) {
     if(unlikely(debug))
         fprintf(stderr, "apps.plugin: process %d %s exited, deleting it.\n", pid, all_pids[pid]->comm);
 
-    if(root_of_pids == all_pids[pid]) root_of_pids = all_pids[pid]->next;
+    if(root_of_pids == all_pids[pid])
+        root_of_pids = all_pids[pid]->next;
+
     if(all_pids[pid]->next) all_pids[pid]->next->prev = all_pids[pid]->prev;
     if(all_pids[pid]->prev) all_pids[pid]->prev->next = all_pids[pid]->next;
 
-    if(all_pids[pid]->fds) freez(all_pids[pid]->fds);
-    if(all_pids[pid]->stat_filename) freez(all_pids[pid]->stat_filename);
-    if(all_pids[pid]->statm_filename) freez(all_pids[pid]->statm_filename);
-    if(all_pids[pid]->io_filename) freez(all_pids[pid]->io_filename);
-    if(all_pids[pid]->cmdline_filename) freez(all_pids[pid]->cmdline_filename);
+    freez(all_pids[pid]->fds);
+    freez(all_pids[pid]->fds_dirname);
+    freez(all_pids[pid]->stat_filename);
+    freez(all_pids[pid]->statm_filename);
+    freez(all_pids[pid]->io_filename);
+    freez(all_pids[pid]->cmdline_filename);
     freez(all_pids[pid]);
 
     all_pids[pid] = NULL;
@@ -604,7 +747,8 @@ static inline int read_proc_pid_stat(struct pid_stat *p) {
     if(unlikely(!ff)) goto cleanup;
 
     // if(set_quotes) procfile_set_quotes(ff, "()");
-    if(set_quotes) procfile_set_open_close(ff, "(", ")");
+    if(unlikely(set_quotes))
+        procfile_set_open_close(ff, "(", ")");
 
     ff = procfile_readall(ff);
     if(unlikely(!ff)) goto cleanup;
@@ -613,85 +757,86 @@ static inline int read_proc_pid_stat(struct pid_stat *p) {
     p->stat_collected_usec = now_realtime_usec();
     file_counter++;
 
-    // p->pid           = str2ul(procfile_lineword(ff, 0, 0+i));
+    // p->pid           = str2pid_t(procfile_lineword(ff, 0, 0+i));
 
-    strncpyz(p->comm, procfile_lineword(ff, 0, 1), MAX_COMPARE_NAME);
+    if(unlikely(!p->comm[0]))
+        strncpyz(p->comm, procfile_lineword(ff, 0, 1), MAX_COMPARE_NAME);
 
     // p->state         = *(procfile_lineword(ff, 0, 2));
-    p->ppid             = (int32_t)str2ul(procfile_lineword(ff, 0, 3));
+    p->ppid             = (int32_t)str2pid_t(procfile_lineword(ff, 0, 3));
     // p->pgrp          = str2ul(procfile_lineword(ff, 0, 4));
     // p->session       = str2ul(procfile_lineword(ff, 0, 5));
     // p->tty_nr        = str2ul(procfile_lineword(ff, 0, 6));
     // p->tpgid         = str2ul(procfile_lineword(ff, 0, 7));
     // p->flags         = str2ull(procfile_lineword(ff, 0, 8));
 
-    unsigned long long last;
+    kernel_uint_t last;
 
     last = p->minflt_raw;
-    p->minflt_raw       = str2ull(procfile_lineword(ff, 0, 9));
+    p->minflt_raw       = str2kernel_unit_t(procfile_lineword(ff, 0, 9));
     p->minflt = (p->minflt_raw - last) * (USEC_PER_SEC * RATES_DETAIL) / (p->stat_collected_usec - p->last_stat_collected_usec);
 
     last = p->cminflt_raw;
-    p->cminflt_raw      = str2ull(procfile_lineword(ff, 0, 10));
+    p->cminflt_raw      = str2kernel_unit_t(procfile_lineword(ff, 0, 10));
     p->cminflt = (p->cminflt_raw - last) * (USEC_PER_SEC * RATES_DETAIL) / (p->stat_collected_usec - p->last_stat_collected_usec);
 
     last = p->majflt_raw;
-    p->majflt_raw       = str2ull(procfile_lineword(ff, 0, 11));
+    p->majflt_raw       = str2kernel_unit_t(procfile_lineword(ff, 0, 11));
     p->majflt = (p->majflt_raw - last) * (USEC_PER_SEC * RATES_DETAIL) / (p->stat_collected_usec - p->last_stat_collected_usec);
 
     last = p->cmajflt_raw;
-    p->cmajflt_raw      = str2ull(procfile_lineword(ff, 0, 12));
+    p->cmajflt_raw      = str2kernel_unit_t(procfile_lineword(ff, 0, 12));
     p->cmajflt = (p->cmajflt_raw - last) * (USEC_PER_SEC * RATES_DETAIL) / (p->stat_collected_usec - p->last_stat_collected_usec);
 
     last = p->utime_raw;
-    p->utime_raw        = str2ull(procfile_lineword(ff, 0, 13));
+    p->utime_raw        = str2kernel_unit_t(procfile_lineword(ff, 0, 13));
     p->utime = (p->utime_raw - last) * (USEC_PER_SEC * RATES_DETAIL) / (p->stat_collected_usec - p->last_stat_collected_usec);
 
     last = p->stime_raw;
-    p->stime_raw        = str2ull(procfile_lineword(ff, 0, 14));
+    p->stime_raw        = str2kernel_unit_t(procfile_lineword(ff, 0, 14));
     p->stime = (p->stime_raw - last) * (USEC_PER_SEC * RATES_DETAIL) / (p->stat_collected_usec - p->last_stat_collected_usec);
 
     last = p->cutime_raw;
-    p->cutime_raw       = str2ull(procfile_lineword(ff, 0, 15));
+    p->cutime_raw       = str2kernel_unit_t(procfile_lineword(ff, 0, 15));
     p->cutime = (p->cutime_raw - last) * (USEC_PER_SEC * RATES_DETAIL) / (p->stat_collected_usec - p->last_stat_collected_usec);
 
     last = p->cstime_raw;
-    p->cstime_raw       = str2ull(procfile_lineword(ff, 0, 16));
+    p->cstime_raw       = str2kernel_unit_t(procfile_lineword(ff, 0, 16));
     p->cstime = (p->cstime_raw - last) * (USEC_PER_SEC * RATES_DETAIL) / (p->stat_collected_usec - p->last_stat_collected_usec);
 
-    // p->priority      = str2ull(procfile_lineword(ff, 0, 17));
-    // p->nice          = str2ull(procfile_lineword(ff, 0, 18));
-    p->num_threads      = (int32_t)str2ul(procfile_lineword(ff, 0, 19));
-    // p->itrealvalue   = str2ull(procfile_lineword(ff, 0, 20));
-    // p->starttime     = str2ull(procfile_lineword(ff, 0, 21));
-    // p->vsize         = str2ull(procfile_lineword(ff, 0, 22));
-    // p->rss           = str2ull(procfile_lineword(ff, 0, 23));
-    // p->rsslim        = str2ull(procfile_lineword(ff, 0, 24));
-    // p->starcode      = str2ull(procfile_lineword(ff, 0, 25));
-    // p->endcode       = str2ull(procfile_lineword(ff, 0, 26));
-    // p->startstack    = str2ull(procfile_lineword(ff, 0, 27));
-    // p->kstkesp       = str2ull(procfile_lineword(ff, 0, 28));
-    // p->kstkeip       = str2ull(procfile_lineword(ff, 0, 29));
-    // p->signal        = str2ull(procfile_lineword(ff, 0, 30));
-    // p->blocked       = str2ull(procfile_lineword(ff, 0, 31));
-    // p->sigignore     = str2ull(procfile_lineword(ff, 0, 32));
-    // p->sigcatch      = str2ull(procfile_lineword(ff, 0, 33));
-    // p->wchan         = str2ull(procfile_lineword(ff, 0, 34));
-    // p->nswap         = str2ull(procfile_lineword(ff, 0, 35));
-    // p->cnswap        = str2ull(procfile_lineword(ff, 0, 36));
-    // p->exit_signal   = str2ul(procfile_lineword(ff, 0, 37));
-    // p->processor     = str2ul(procfile_lineword(ff, 0, 38));
-    // p->rt_priority   = str2ul(procfile_lineword(ff, 0, 39));
-    // p->policy        = str2ul(procfile_lineword(ff, 0, 40));
-    // p->delayacct_blkio_ticks = str2ull(procfile_lineword(ff, 0, 41));
+    // p->priority      = str2kernel_unit_t(procfile_lineword(ff, 0, 17));
+    // p->nice          = str2kernel_unit_t(procfile_lineword(ff, 0, 18));
+    p->num_threads      = (int32_t)str2uint32_t(procfile_lineword(ff, 0, 19));
+    // p->itrealvalue   = str2kernel_unit_t(procfile_lineword(ff, 0, 20));
+    // p->starttime     = str2kernel_unit_t(procfile_lineword(ff, 0, 21));
+    // p->vsize         = str2kernel_unit_t(procfile_lineword(ff, 0, 22));
+    // p->rss           = str2kernel_unit_t(procfile_lineword(ff, 0, 23));
+    // p->rsslim        = str2kernel_unit_t(procfile_lineword(ff, 0, 24));
+    // p->starcode      = str2kernel_unit_t(procfile_lineword(ff, 0, 25));
+    // p->endcode       = str2kernel_unit_t(procfile_lineword(ff, 0, 26));
+    // p->startstack    = str2kernel_unit_t(procfile_lineword(ff, 0, 27));
+    // p->kstkesp       = str2kernel_unit_t(procfile_lineword(ff, 0, 28));
+    // p->kstkeip       = str2kernel_unit_t(procfile_lineword(ff, 0, 29));
+    // p->signal        = str2kernel_unit_t(procfile_lineword(ff, 0, 30));
+    // p->blocked       = str2kernel_unit_t(procfile_lineword(ff, 0, 31));
+    // p->sigignore     = str2kernel_unit_t(procfile_lineword(ff, 0, 32));
+    // p->sigcatch      = str2kernel_unit_t(procfile_lineword(ff, 0, 33));
+    // p->wchan         = str2kernel_unit_t(procfile_lineword(ff, 0, 34));
+    // p->nswap         = str2kernel_unit_t(procfile_lineword(ff, 0, 35));
+    // p->cnswap        = str2kernel_unit_t(procfile_lineword(ff, 0, 36));
+    // p->exit_signal   = str2kernel_unit_t(procfile_lineword(ff, 0, 37));
+    // p->processor     = str2kernel_unit_t(procfile_lineword(ff, 0, 38));
+    // p->rt_priority   = str2kernel_unit_t(procfile_lineword(ff, 0, 39));
+    // p->policy        = str2kernel_unit_t(procfile_lineword(ff, 0, 40));
+    // p->delayacct_blkio_ticks = str2kernel_unit_t(procfile_lineword(ff, 0, 41));
 
     if(enable_guest_charts) {
         last = p->gtime_raw;
-        p->gtime_raw        = str2ull(procfile_lineword(ff, 0, 42));
+        p->gtime_raw        = str2kernel_unit_t(procfile_lineword(ff, 0, 42));
         p->gtime = (p->gtime_raw - last) * (USEC_PER_SEC * RATES_DETAIL) / (p->stat_collected_usec - p->last_stat_collected_usec);
 
         last = p->cgtime_raw;
-        p->cgtime_raw       = str2ull(procfile_lineword(ff, 0, 43));
+        p->cgtime_raw       = str2kernel_unit_t(procfile_lineword(ff, 0, 43));
         p->cgtime = (p->cgtime_raw - last) * (USEC_PER_SEC * RATES_DETAIL) / (p->stat_collected_usec - p->last_stat_collected_usec);
 
         if (show_guest_time || p->gtime || p->cgtime) {
@@ -702,7 +847,7 @@ static inline int read_proc_pid_stat(struct pid_stat *p) {
     }
 
     if(unlikely(debug || (p->target && p->target->debug)))
-        fprintf(stderr, "apps.plugin: READ PROC/PID/STAT: %s/proc/%d/stat, process: '%s' on target '%s' (dt=%llu) VALUES: utime=%llu, stime=%llu, cutime=%llu, cstime=%llu, minflt=%llu, majflt=%llu, cminflt=%llu, cmajflt=%llu, threads=%d\n", global_host_prefix, p->pid, p->comm, (p->target)?p->target->name:"UNSET", p->stat_collected_usec - p->last_stat_collected_usec, p->utime, p->stime, p->cutime, p->cstime, p->minflt, p->majflt, p->cminflt, p->cmajflt, p->num_threads);
+        fprintf(stderr, "apps.plugin: READ PROC/PID/STAT: %s/proc/%d/stat, process: '%s' on target '%s' (dt=%llu) VALUES: utime=" KERNEL_UINT_FORMAT ", stime=" KERNEL_UINT_FORMAT ", cutime=" KERNEL_UINT_FORMAT ", cstime=" KERNEL_UINT_FORMAT ", minflt=" KERNEL_UINT_FORMAT ", majflt=" KERNEL_UINT_FORMAT ", cminflt=" KERNEL_UINT_FORMAT ", cmajflt=" KERNEL_UINT_FORMAT ", threads=%d\n", global_host_prefix, p->pid, p->comm, (p->target)?p->target->name:"UNSET", p->stat_collected_usec - p->last_stat_collected_usec, p->utime, p->stime, p->cutime, p->cstime, p->minflt, p->majflt, p->cminflt, p->cmajflt, p->num_threads);
 
     if(unlikely(global_iterations_counter == 1)) {
         p->minflt           = 0;
@@ -794,7 +939,7 @@ static inline int read_proc_pid_io(struct pid_stat *p) {
     p->last_io_collected_usec = p->io_collected_usec;
     p->io_collected_usec = now_realtime_usec();
 
-    unsigned long long last;
+    kernel_uint_t last;
 
     last = p->io_logical_bytes_read_raw;
     p->io_logical_bytes_read_raw = str2ull(procfile_lineword(ff, 0, 1));
@@ -847,14 +992,10 @@ cleanup:
     return 0;
 }
 
-unsigned long long global_utime = 0;
-unsigned long long global_stime = 0;
-unsigned long long global_gtime = 0;
-
 static inline int read_proc_stat() {
     static char filename[FILENAME_MAX + 1] = "";
     static procfile *ff = NULL;
-    static unsigned long long utime_raw = 0, stime_raw = 0, gtime_raw = 0, gntime_raw = 0, ntime_raw = 0;
+    static kernel_uint_t utime_raw = 0, stime_raw = 0, gtime_raw = 0, gntime_raw = 0, ntime_raw = 0;
     static usec_t collected_usec = 0, last_collected_usec = 0;
 
     if(unlikely(!ff)) {
@@ -871,7 +1012,7 @@ static inline int read_proc_stat() {
 
     file_counter++;
 
-    unsigned long long last;
+    kernel_uint_t last;
 
     last = utime_raw;
     utime_raw = str2ull(procfile_lineword(ff, 0, 1));
@@ -917,26 +1058,6 @@ cleanup:
 
 
 // ----------------------------------------------------------------------------
-// file descriptor
-// this is used to keep a global list of all open files of the system
-// it is needed in order to calculate the unique files processes have open
-
-#define FILE_DESCRIPTORS_INCREASE_STEP 100
-
-struct file_descriptor {
-    avl avl;
-#ifdef NETDATA_INTERNAL_CHECKS
-    uint32_t magic;
-#endif /* NETDATA_INTERNAL_CHECKS */
-    uint32_t hash;
-    const char *name;
-    int type;
-    int count;
-    int pos;
-} *all_files = NULL;
-
-int all_files_len = 0;
-int all_files_size = 0;
 
 int file_descriptor_compare(void* a, void* b) {
 #ifdef NETDATA_INTERNAL_CHECKS
@@ -977,15 +1098,7 @@ static struct file_descriptor *file_descriptor_find(const char *name, uint32_t h
 #define file_descriptor_add(fd) avl_insert(&all_files_index, (avl *)(fd))
 #define file_descriptor_remove(fd) avl_remove(&all_files_index, (avl *)(fd))
 
-#define FILETYPE_OTHER 0
-#define FILETYPE_FILE 1
-#define FILETYPE_PIPE 2
-#define FILETYPE_SOCKET 3
-#define FILETYPE_INOTIFY 4
-#define FILETYPE_EVENTFD 5
-#define FILETYPE_EVENTPOLL 6
-#define FILETYPE_TIMERFD 7
-#define FILETYPE_SIGNALFD 8
+// ----------------------------------------------------------------------------
 
 static inline void file_descriptor_not_used(int id)
 {
@@ -1172,81 +1285,113 @@ static inline int file_descriptor_find_or_add(const char *name)
     return file_descriptor_set_on_empty_slot(name, hash, type);
 }
 
+static inline void make_all_pid_fds_negative(struct pid_stat *p) {
+    int *fd = p->fds, *end = &p->fds[p->fds_size];
+    while(fd < end) {
+        *fd = -(*fd);
+        fd++;
+    }
+}
+
+static inline void cleanup_negative_pid_fds(struct pid_stat *p) {
+    int *fd = p->fds, *end = &p->fds[p->fds_size];
+    while(fd < end) {
+        if(unlikely(*fd < 0)) {
+            file_descriptor_not_used(-(*fd));
+            *fd++ = 0;
+        }
+        else
+            fd++;
+    }
+}
+
+static inline void zero_pid_fds(struct pid_stat *p, int first, int size) {
+    int *fd = &p->fds[first], *end = &p->fds[first + size];
+    while(fd < end) *fd++ = 0;
+}
+
 static inline int read_pid_file_descriptors(struct pid_stat *p) {
-    char dirname[FILENAME_MAX+1];
-
-    snprintfz(dirname, FILENAME_MAX, "%s/proc/%d/fd", global_host_prefix, p->pid);
-    DIR *fds = opendir(dirname);
-    if(fds) {
-        int c;
-        struct dirent *de;
-        char fdname[FILENAME_MAX + 1];
-        char linkname[FILENAME_MAX + 1];
-
-        // make the array negative
-        for(c = 0 ; c < p->fds_size ; c++)
-            p->fds[c] = -p->fds[c];
-
-        while((de = readdir(fds))) {
-            if(strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
-                continue;
+    if(unlikely(!p->fds_dirname)) {
+        char dirname[FILENAME_MAX+1];
+        snprintfz(dirname, FILENAME_MAX, "%s/proc/%d/fd", global_host_prefix, p->pid);
+        p->fds_dirname = strdupz(dirname);
+    }
 
-            // check if the fds array is small
-            int fdid = (int)str2l(de->d_name);
-            if(fdid < 0) continue;
-            if(fdid >= p->fds_size) {
-                // it is small, extend it
-                if(unlikely(debug))
-                    fprintf(stderr, "apps.plugin: extending fd memory slots for %s from %d to %d\n", p->comm, p->fds_size, fdid + MAX_SPARE_FDS);
+    DIR *fds = opendir(p->fds_dirname);
+    if(unlikely(!fds)) return 0;
 
-                p->fds = reallocz(p->fds, (fdid + MAX_SPARE_FDS) * sizeof(int));
+    struct dirent *de;
+    char fdname[FILENAME_MAX + 1];
+    char linkname[FILENAME_MAX + 1];
 
-                // and initialize it
-                for(c = p->fds_size ; c < (fdid + MAX_SPARE_FDS) ; c++) p->fds[c] = 0;
-                p->fds_size = fdid + MAX_SPARE_FDS;
-            }
+    // we make all pid fds negative, so that
+    // we can detect unused file descriptors
+    // at the end, to free them
+    make_all_pid_fds_negative(p);
 
-            if(p->fds[fdid] == 0) {
-                // we don't know this fd, get it
-
-                sprintf(fdname, "%s/proc/%d/fd/%s", global_host_prefix, p->pid, de->d_name);
-                ssize_t l = readlink(fdname, linkname, FILENAME_MAX);
-                if(l == -1) {
-                    if(debug || (p->target && p->target->debug)) {
-                        if(debug || (p->target && p->target->debug))
-                            error("Cannot read link %s", fdname);
-                    }
-                    continue;
+    while((de = readdir(fds))) {
+        // we need only files with numeric names
+
+        if(unlikely(de->d_name[0] < '0' || de->d_name[0] > '9'))
+            continue;
+
+        // get its number
+        int fdid = (int)str2l(de->d_name);
+        if(unlikely(fdid < 0)) continue;
+
+        // check if the fds array is small
+        if(unlikely(fdid >= p->fds_size)) {
+            // it is small, extend it
+
+            if(unlikely(debug))
+                fprintf(stderr, "apps.plugin: extending fd memory slots for %s from %d to %d\n", p->comm, p->fds_size, fdid + MAX_SPARE_FDS);
+
+            p->fds = reallocz(p->fds, (fdid + MAX_SPARE_FDS) * sizeof(int));
+
+            // and initialize it
+            zero_pid_fds(p, p->fds_size, (fdid + MAX_SPARE_FDS) - p->fds_size);
+            p->fds_size = fdid + MAX_SPARE_FDS;
+        }
+
+        if(unlikely(p->fds[fdid] == 0)) {
+            // we don't know this fd, get it
+
+            sprintf(fdname, "%s/proc/%d/fd/%s", global_host_prefix, p->pid, de->d_name);
+            ssize_t l = readlink(fdname, linkname, FILENAME_MAX);
+            if(unlikely(l == -1)) {
+                if(debug || (p->target && p->target->debug)) {
+                    if(debug || (p->target && p->target->debug))
+                        error("Cannot read link %s", fdname);
                 }
+                continue;
+            }
+            else
                 linkname[l] = '\0';
-                file_counter++;
 
-                // if another process already has this, we will get
-                // the same id
-                p->fds[fdid] = file_descriptor_find_or_add(linkname);
-            }
+            file_counter++;
 
-            // else make it positive again, we need it
-            // of course, the actual file may have changed, but we don't care so much
-            // FIXME: we could compare the inode as returned by readdir dirent structure
-            else p->fds[fdid] = -p->fds[fdid];
+            // if another process already has this, we will get
+            // the same id
+            p->fds[fdid] = file_descriptor_find_or_add(linkname);
         }
-        closedir(fds);
 
-        // remove all the negative file descriptors
-        for(c = 0 ; c < p->fds_size ; c++) if(p->fds[c] < 0) {
-            file_descriptor_not_used(-p->fds[c]);
-            p->fds[c] = 0;
-        }
+        // else make it positive again, we need it
+        // of course, the actual file may have changed, but we don't care so much
+        // FIXME: we could compare the inode as returned by readdir dirent structure
+
+        else
+            p->fds[fdid] = -p->fds[fdid];
     }
-    else return 0;
+
+    closedir(fds);
+    cleanup_negative_pid_fds(p);
 
     return 1;
 }
 
 // ----------------------------------------------------------------------------
 
-static inline int print_process_and_parents(struct pid_stat *p, unsigned long long time) {
+static inline int print_process_and_parents(struct pid_stat *p, usec_t time) {
     char *prefix = "\\_ ";
     int indent = 0;
 
@@ -1261,25 +1406,25 @@ static inline int print_process_and_parents(struct pid_stat *p, unsigned long lo
     for(i = 0; i < indent ;i++) buffer[i] = ' ';
     buffer[i] = '\0';
 
-    fprintf(stderr, "  %s %s%s (%d %s %lld"
+    fprintf(stderr, "  %s %s%s (%d %s %llu"
         , buffer
         , prefix
         , p->comm
         , p->pid
         , p->updated?"running":"exited"
-        , (long long)p->stat_collected_usec - (long long)time
+        , p->stat_collected_usec - time
         );
 
-    if(p->utime)   fprintf(stderr, " utime=%llu",   p->utime);
-    if(p->stime)   fprintf(stderr, " stime=%llu",   p->stime);
-    if(p->gtime)   fprintf(stderr, " gtime=%llu",   p->gtime);
-    if(p->cutime)  fprintf(stderr, " cutime=%llu",  p->cutime);
-    if(p->cstime)  fprintf(stderr, " cstime=%llu",  p->cstime);
-    if(p->cgtime)  fprintf(stderr, " cgtime=%llu",  p->cgtime);
-    if(p->minflt)  fprintf(stderr, " minflt=%llu",  p->minflt);
-    if(p->cminflt) fprintf(stderr, " cminflt=%llu", p->cminflt);
-    if(p->majflt)  fprintf(stderr, " majflt=%llu",  p->majflt);
-    if(p->cmajflt) fprintf(stderr, " cmajflt=%llu", p->cmajflt);
+    if(p->utime)   fprintf(stderr, " utime=" KERNEL_UINT_FORMAT,   p->utime);
+    if(p->stime)   fprintf(stderr, " stime=" KERNEL_UINT_FORMAT,   p->stime);
+    if(p->gtime)   fprintf(stderr, " gtime=" KERNEL_UINT_FORMAT,   p->gtime);
+    if(p->cutime)  fprintf(stderr, " cutime=" KERNEL_UINT_FORMAT,  p->cutime);
+    if(p->cstime)  fprintf(stderr, " cstime=" KERNEL_UINT_FORMAT,  p->cstime);
+    if(p->cgtime)  fprintf(stderr, " cgtime=" KERNEL_UINT_FORMAT,  p->cgtime);
+    if(p->minflt)  fprintf(stderr, " minflt=" KERNEL_UINT_FORMAT,  p->minflt);
+    if(p->cminflt) fprintf(stderr, " cminflt=" KERNEL_UINT_FORMAT, p->cminflt);
+    if(p->majflt)  fprintf(stderr, " majflt=" KERNEL_UINT_FORMAT,  p->majflt);
+    if(p->cmajflt) fprintf(stderr, " cmajflt=" KERNEL_UINT_FORMAT, p->cmajflt);
     fprintf(stderr, ")\n");
 
     return indent + 1;
@@ -1291,7 +1436,7 @@ static inline void print_process_tree(struct pid_stat *p, char *msg) {
     print_process_and_parents(p, p->stat_collected_usec);
 }
 
-static inline void find_lost_child_debug(struct pid_stat *pe, unsigned long long lost, int type) {
+static inline void find_lost_child_debug(struct pid_stat *pe, kernel_uint_t lost, int type) {
     int found = 0;
     struct pid_stat *p = NULL;
 
@@ -1301,35 +1446,35 @@ static inline void find_lost_child_debug(struct pid_stat *pe, unsigned long long
         switch(type) {
             case 1:
                 if(p->cminflt > lost) {
-                    fprintf(stderr, " > process %d (%s) could use the lost exited child minflt %llu of process %d (%s)\n", p->pid, p->comm, lost, pe->pid, pe->comm);
+                    fprintf(stderr, " > process %d (%s) could use the lost exited child minflt " KERNEL_UINT_FORMAT " of process %d (%s)\n", p->pid, p->comm, lost, pe->pid, pe->comm);
                     found++;
                 }
                 break;
 
             case 2:
                 if(p->cmajflt > lost) {
-                    fprintf(stderr, " > process %d (%s) could use the lost exited child majflt %llu of process %d (%s)\n", p->pid, p->comm, lost, pe->pid, pe->comm);
+                    fprintf(stderr, " > process %d (%s) could use the lost exited child majflt " KERNEL_UINT_FORMAT " of process %d (%s)\n", p->pid, p->comm, lost, pe->pid, pe->comm);
                     found++;
                 }
                 break;
 
             case 3:
                 if(p->cutime > lost) {
-                    fprintf(stderr, " > process %d (%s) could use the lost exited child utime %llu of process %d (%s)\n", p->pid, p->comm, lost, pe->pid, pe->comm);
+                    fprintf(stderr, " > process %d (%s) could use the lost exited child utime " KERNEL_UINT_FORMAT " of process %d (%s)\n", p->pid, p->comm, lost, pe->pid, pe->comm);
                     found++;
                 }
                 break;
 
             case 4:
                 if(p->cstime > lost) {
-                    fprintf(stderr, " > process %d (%s) could use the lost exited child stime %llu of process %d (%s)\n", p->pid, p->comm, lost, pe->pid, pe->comm);
+                    fprintf(stderr, " > process %d (%s) could use the lost exited child stime " KERNEL_UINT_FORMAT " of process %d (%s)\n", p->pid, p->comm, lost, pe->pid, pe->comm);
                     found++;
                 }
                 break;
 
             case 5:
                 if(p->cgtime > lost) {
-                    fprintf(stderr, " > process %d (%s) could use the lost exited child gtime %llu of process %d (%s)\n", p->pid, p->comm, lost, pe->pid, pe->comm);
+                    fprintf(stderr, " > process %d (%s) could use the lost exited child gtime " KERNEL_UINT_FORMAT " of process %d (%s)\n", p->pid, p->comm, lost, pe->pid, pe->comm);
                     found++;
                 }
                 break;
@@ -1339,30 +1484,30 @@ static inline void find_lost_child_debug(struct pid_stat *pe, unsigned long long
     if(!found) {
         switch(type) {
             case 1:
-                fprintf(stderr, " > cannot find any process to use the lost exited child minflt %llu of process %d (%s)\n", lost, pe->pid, pe->comm);
+                fprintf(stderr, " > cannot find any process to use the lost exited child minflt " KERNEL_UINT_FORMAT " of process %d (%s)\n", lost, pe->pid, pe->comm);
                 break;
 
             case 2:
-                fprintf(stderr, " > cannot find any process to use the lost exited child majflt %llu of process %d (%s)\n", lost, pe->pid, pe->comm);
+                fprintf(stderr, " > cannot find any process to use the lost exited child majflt " KERNEL_UINT_FORMAT " of process %d (%s)\n", lost, pe->pid, pe->comm);
                 break;
 
             case 3:
-                fprintf(stderr, " > cannot find any process to use the lost exited child utime %llu of process %d (%s)\n", lost, pe->pid, pe->comm);
+                fprintf(stderr, " > cannot find any process to use the lost exited child utime " KERNEL_UINT_FORMAT " of process %d (%s)\n", lost, pe->pid, pe->comm);
                 break;
 
             case 4:
-                fprintf(stderr, " > cannot find any process to use the lost exited child stime %llu of process %d (%s)\n", lost, pe->pid, pe->comm);
+                fprintf(stderr, " > cannot find any process to use the lost exited child stime " KERNEL_UINT_FORMAT " of process %d (%s)\n", lost, pe->pid, pe->comm);
                 break;
 
             case 5:
-                fprintf(stderr, " > cannot find any process to use the lost exited child gtime %llu of process %d (%s)\n", lost, pe->pid, pe->comm);
+                fprintf(stderr, " > cannot find any process to use the lost exited child gtime " KERNEL_UINT_FORMAT " of process %d (%s)\n", lost, pe->pid, pe->comm);
                 break;
         }
     }
 }
 
-static inline unsigned long long remove_exited_child_from_parent(unsigned long long *field, unsigned long long *pfield) {
-    unsigned long long absorbed = 0;
+static inline kernel_uint_t remove_exited_child_from_parent(kernel_uint_t *field, kernel_uint_t *pfield) {
+    kernel_uint_t absorbed = 0;
 
     if(*field > *pfield) {
         absorbed += *pfield;
@@ -1385,20 +1530,18 @@ static inline void process_exited_processes() {
         if(p->updated || !p->stat_collected_usec)
             continue;
 
-        struct pid_stat *pp = p->parent;
-
-        unsigned long long utime  = (p->utime_raw + p->cutime_raw)   * (1000000ULL * RATES_DETAIL) / (p->stat_collected_usec - p->last_stat_collected_usec);
-        unsigned long long stime  = (p->stime_raw + p->cstime_raw)   * (1000000ULL * RATES_DETAIL) / (p->stat_collected_usec - p->last_stat_collected_usec);
-        unsigned long long gtime  = (p->gtime_raw + p->cgtime_raw)   * (1000000ULL * RATES_DETAIL) / (p->stat_collected_usec - p->last_stat_collected_usec);
-        unsigned long long minflt = (p->minflt_raw + p->cminflt_raw) * (1000000ULL * RATES_DETAIL) / (p->stat_collected_usec - p->last_stat_collected_usec);
-        unsigned long long majflt = (p->majflt_raw + p->cmajflt_raw) * (1000000ULL * RATES_DETAIL) / (p->stat_collected_usec - p->last_stat_collected_usec);
+        kernel_uint_t utime  = (p->utime_raw + p->cutime_raw)   * (1000000ULL * RATES_DETAIL) / (p->stat_collected_usec - p->last_stat_collected_usec);
+        kernel_uint_t stime  = (p->stime_raw + p->cstime_raw)   * (1000000ULL * RATES_DETAIL) / (p->stat_collected_usec - p->last_stat_collected_usec);
+        kernel_uint_t gtime  = (p->gtime_raw + p->cgtime_raw)   * (1000000ULL * RATES_DETAIL) / (p->stat_collected_usec - p->last_stat_collected_usec);
+        kernel_uint_t minflt = (p->minflt_raw + p->cminflt_raw) * (1000000ULL * RATES_DETAIL) / (p->stat_collected_usec - p->last_stat_collected_usec);
+        kernel_uint_t majflt = (p->majflt_raw + p->cmajflt_raw) * (1000000ULL * RATES_DETAIL) / (p->stat_collected_usec - p->last_stat_collected_usec);
 
         if(utime + stime + gtime + minflt + majflt == 0)
             continue;
 
         if(unlikely(debug)) {
             log_date(stderr);
-            fprintf(stderr, "Absorb %s (%d %s total resources: utime=%llu stime=%llu gtime=%llu minflt=%llu majflt=%llu)\n"
+            fprintf(stderr, "Absorb %s (%d %s total resources: utime=" KERNEL_UINT_FORMAT " stime=" KERNEL_UINT_FORMAT " gtime=" KERNEL_UINT_FORMAT " minflt=" KERNEL_UINT_FORMAT " majflt=" KERNEL_UINT_FORMAT ")\n"
                 , p->comm
                 , p->pid
                 , p->updated?"running":"exited"
@@ -1411,29 +1554,30 @@ static inline void process_exited_processes() {
             print_process_tree(p, "Searching parents");
         }
 
+        struct pid_stat *pp;
         for(pp = p->parent; pp ; pp = pp->parent) {
             if(!pp->updated) continue;
 
-            unsigned long long absorbed;
+            kernel_uint_t absorbed;
             absorbed = remove_exited_child_from_parent(&utime,  &pp->cutime);
             if(unlikely(debug && absorbed))
-                fprintf(stderr, " > process %s (%d %s) absorbed %llu utime (remaining: %llu)\n", pp->comm, pp->pid, pp->updated?"running":"exited", absorbed, utime);
+                fprintf(stderr, " > process %s (%d %s) absorbed " KERNEL_UINT_FORMAT " utime (remaining: " KERNEL_UINT_FORMAT ")\n", pp->comm, pp->pid, pp->updated?"running":"exited", absorbed, utime);
 
             absorbed = remove_exited_child_from_parent(&stime,  &pp->cstime);
             if(unlikely(debug && absorbed))
-                fprintf(stderr, " > process %s (%d %s) absorbed %llu stime (remaining: %llu)\n", pp->comm, pp->pid, pp->updated?"running":"exited", absorbed, stime);
+                fprintf(stderr, " > process %s (%d %s) absorbed " KERNEL_UINT_FORMAT " stime (remaining: " KERNEL_UINT_FORMAT ")\n", pp->comm, pp->pid, pp->updated?"running":"exited", absorbed, stime);
 
             absorbed = remove_exited_child_from_parent(&gtime,  &pp->cgtime);
             if(unlikely(debug && absorbed))
-                fprintf(stderr, " > process %s (%d %s) absorbed %llu gtime (remaining: %llu)\n", pp->comm, pp->pid, pp->updated?"running":"exited", absorbed, gtime);
+                fprintf(stderr, " > process %s (%d %s) absorbed " KERNEL_UINT_FORMAT " gtime (remaining: " KERNEL_UINT_FORMAT ")\n", pp->comm, pp->pid, pp->updated?"running":"exited", absorbed, gtime);
 
             absorbed = remove_exited_child_from_parent(&minflt, &pp->cminflt);
             if(unlikely(debug && absorbed))
-                fprintf(stderr, " > process %s (%d %s) absorbed %llu minflt (remaining: %llu)\n", pp->comm, pp->pid, pp->updated?"running":"exited", absorbed, minflt);
+                fprintf(stderr, " > process %s (%d %s) absorbed " KERNEL_UINT_FORMAT " minflt (remaining: " KERNEL_UINT_FORMAT ")\n", pp->comm, pp->pid, pp->updated?"running":"exited", absorbed, minflt);
 
             absorbed = remove_exited_child_from_parent(&majflt, &pp->cmajflt);
             if(unlikely(debug && absorbed))
-                fprintf(stderr, " > process %s (%d %s) absorbed %llu majflt (remaining: %llu)\n", pp->comm, pp->pid, pp->updated?"running":"exited", absorbed, majflt);
+                fprintf(stderr, " > process %s (%d %s) absorbed " KERNEL_UINT_FORMAT " majflt (remaining: " KERNEL_UINT_FORMAT ")\n", pp->comm, pp->pid, pp->updated?"running":"exited", absorbed, majflt);
         }
 
         if(unlikely(utime + stime + gtime + minflt + majflt > 0)) {
@@ -1448,7 +1592,7 @@ static inline void process_exited_processes() {
             p->keep = 1;
 
             if(unlikely(debug))
-                fprintf(stderr, " > remaining resources - KEEP - for another loop: %s (%d %s total resources: utime=%llu stime=%llu gtime=%llu minflt=%llu majflt=%llu)\n"
+                fprintf(stderr, " > remaining resources - KEEP - for another loop: %s (%d %s total resources: utime=" KERNEL_UINT_FORMAT " stime=" KERNEL_UINT_FORMAT " gtime=" KERNEL_UINT_FORMAT " minflt=" KERNEL_UINT_FORMAT " majflt=" KERNEL_UINT_FORMAT ")\n"
                     , p->comm
                     , p->pid
                     , p->updated?"running":"exited"
@@ -1513,7 +1657,7 @@ static inline void link_all_processes_to_their_parents(void) {
             pp->children_count++;
 
             if(unlikely(debug || (p->target && p->target->debug)))
-                fprintf(stderr, "apps.plugin: \tchild %d (%s, %s) on target '%s' has parent %d (%s, %s). Parent: utime=%llu, stime=%llu, gtime=%llu, minflt=%llu, majflt=%llu, cutime=%llu, cstime=%llu, cgtime=%llu, cminflt=%llu, cmajflt=%llu\n", p->pid, p->comm, p->updated?"running":"exited", (p->target)?p->target->name:"UNSET", pp->pid, pp->comm, pp->updated?"running":"exited", pp->utime, pp->stime, pp->gtime, pp->minflt, pp->majflt, pp->cutime, pp->cstime, pp->cgtime, pp->cminflt, pp->cmajflt);
+                fprintf(stderr, "apps.plugin: \tchild %d (%s, %s) on target '%s' has parent %d (%s, %s). Parent: utime=" KERNEL_UINT_FORMAT ", stime=" KERNEL_UINT_FORMAT ", gtime=" KERNEL_UINT_FORMAT ", minflt=" KERNEL_UINT_FORMAT ", majflt=" KERNEL_UINT_FORMAT ", cutime=" KERNEL_UINT_FORMAT ", cstime=" KERNEL_UINT_FORMAT ", cgtime=" KERNEL_UINT_FORMAT ", cminflt=" KERNEL_UINT_FORMAT ", cmajflt=" KERNEL_UINT_FORMAT "\n", p->pid, p->comm, p->updated?"running":"exited", (p->target)?p->target->name:"UNSET", pp->pid, pp->comm, pp->updated?"running":"exited", pp->utime, pp->stime, pp->gtime, pp->minflt, pp->majflt, pp->cutime, pp->cstime, pp->cgtime, pp->cminflt, pp->cmajflt);
         }
         else {
             p->parent = NULL;
@@ -1594,6 +1738,37 @@ static inline int managed_log(struct pid_stat *p, uint32_t log, int status) {
     return status;
 }
 
+static inline void assign_target_to_pid(struct pid_stat *p) {
+    uint32_t hash = simple_hash(p->comm);
+    size_t pclen  = strlen(p->comm);
+
+    struct target *w;
+    for(w = apps_groups_root_target; w ; w = w->next) {
+        // if(debug || (p->target && p->target->debug)) fprintf(stderr, "apps.plugin: \t\tcomparing '%s' with '%s'\n", w->compare, p->comm);
+
+        // find it - 4 cases:
+        // 1. the target is not a pattern
+        // 2. the target has the prefix
+        // 3. the target has the suffix
+        // 4. the target is something inside cmdline
+
+        if(unlikely(( (!w->starts_with && !w->ends_with && w->comparehash == hash && !strcmp(w->compare, p->comm))
+            || (w->starts_with && !w->ends_with && !strncmp(w->compare, p->comm, w->comparelen))
+            || (!w->starts_with && w->ends_with && pclen >= w->comparelen && !strcmp(w->compare, &p->comm[pclen - w->comparelen]))
+            || (proc_pid_cmdline_is_needed && w->starts_with && w->ends_with && strstr(p->cmdline, w->compare))
+                ))) {
+
+            if(w->target) p->target = w->target;
+            else p->target = w;
+
+            if(debug || (p->target && p->target->debug))
+                fprintf(stderr, "apps.plugin: \t\t%s linked to target %s\n", p->comm, p->target->name);
+
+            break;
+        }
+    }
+}
+
 static inline int collect_data_for_pid(pid_t pid) {
     if(unlikely(pid <= 0 || pid > pid_max)) {
         error("Invalid pid %d read (expected 1 to %d). Ignoring process.", pid, pid_max);
@@ -1646,32 +1821,7 @@ static inline int collect_data_for_pid(pid_t pid) {
         if(unlikely(debug))
             fprintf(stderr, "apps.plugin: \tJust added %d (%s)\n", pid, p->comm);
 
-        uint32_t hash = simple_hash(p->comm);
-        size_t pclen  = strlen(p->comm);
-
-        struct target *w;
-        for(w = apps_groups_root_target; w ; w = w->next) {
-            // if(debug || (p->target && p->target->debug)) fprintf(stderr, "apps.plugin: \t\tcomparing '%s' with '%s'\n", w->compare, p->comm);
-
-            // find it - 4 cases:
-            // 1. the target is not a pattern
-            // 2. the target has the prefix
-            // 3. the target has the suffix
-            // 4. the target is something inside cmdline
-            if( (!w->starts_with && !w->ends_with && w->comparehash == hash && !strcmp(w->compare, p->comm))
-                   || (w->starts_with && !w->ends_with && !strncmp(w->compare, p->comm, w->comparelen))
-                   || (!w->starts_with && w->ends_with && pclen >= w->comparelen && !strcmp(w->compare, &p->comm[pclen - w->comparelen]))
-                   || (proc_pid_cmdline_is_needed && w->starts_with && w->ends_with && strstr(p->cmdline, w->compare))
-                    ) {
-                if(w->target) p->target = w->target;
-                else p->target = w;
-
-                if(debug || (p->target && p->target->debug))
-                    fprintf(stderr, "apps.plugin: \t\t%s linked to target %s\n", p->comm, p->target->name);
-
-                break;
-            }
-        }
+        assign_target_to_pid(p);
     }
 
     // --------------------------------------------------------------------
@@ -1694,19 +1844,13 @@ static inline int collect_data_for_pid(pid_t pid) {
     return 1;
 }
 
-static int collect_data_for_all_processes_from_proc(void) {
+static int collect_data_for_all_processes(void) {
     struct pid_stat *p = NULL;
 
     if(all_pids_count) {
-        // read parents before childs
-        // this is needed to prevent a situation where
-        // a child is found running, but until we read
-        // its parent, it has exited and its parent
-        // has accumulated its resources
-
-        long slc = 0;
+        size_t slc = 0;
         for(p = root_of_pids; p ; p = p->next) {
-            p->read             = 0;
+            p->read             = 0; // mark it as not read, so that collect_data_for_pid() will read it
             p->updated          = 0;
             p->new_entry        = 0;
             p->merged           = 0;
@@ -1717,12 +1861,22 @@ static int collect_data_for_all_processes_from_proc(void) {
         }
 
         if(unlikely(slc != all_pids_count)) {
-            error("Internal error: I was thinking I had %ld processes in my arrays, but it seems there are more.", all_pids_count);
+            error("Internal error: I was thinking I had %zu processes in my arrays, but it seems there are more.", all_pids_count);
             all_pids_count = slc;
         }
 
         if(include_exited_childs) {
+            // Read parents before children.
+            // This is needed to prevent a situation where
+            // a child is found running, but by the time we read
+            // its parent, it has exited and its parent
+            // has accumulated its resources.
+
             qsort((void *)all_pids_sortlist, (size_t)all_pids_count, sizeof(pid_t), compar_pid);
+
+            // we forward read all running processes;
+            // collect_data_for_pid() is smart enough
+            // not to read the same pid twice per iteration
             for(slc = 0; slc < all_pids_count; slc++)
                 collect_data_for_pid(all_pids_sortlist[slc]);
         }
@@ -1734,14 +1888,18 @@ static int collect_data_for_all_processes_from_proc(void) {
     DIR *dir = opendir(dirname);
     if(!dir) return 0;
 
-    struct dirent *file = NULL;
+    struct dirent *de = NULL;
+
+    while((de = readdir(dir))) {
+        char *endptr = de->d_name;
+
+        if(unlikely(de->d_type != DT_DIR || de->d_name[0] < '0' || de->d_name[0] > '9'))
+            continue;
 
-    while((file = readdir(dir))) {
-        char *endptr = file->d_name;
-        pid_t pid = (pid_t) strtoul(file->d_name, &endptr, 10);
+        pid_t pid = (pid_t) strtoul(de->d_name, &endptr, 10);
 
         // make sure we read a valid number
-        if(unlikely(endptr == file->d_name || *endptr != '\0'))
+        if(unlikely(endptr == de->d_name || *endptr != '\0'))
             continue;
 
         collect_data_for_pid(pid);
@@ -1751,14 +1909,17 @@ static int collect_data_for_all_processes_from_proc(void) {
     if(!all_pids_count)
         return 0;
 
+    // we need /proc/stat to normalize the cpu consumption of the exited children
+    read_proc_stat();
+
+    // build the process tree
+    link_all_processes_to_their_parents();
+
     // normally this is done
     // however we may have processes exited while we collected values
     // so let's find the exited ones
     // we do this by collecting the ownership of process
     // if we manage to get the ownership, the process still runs
-
-    read_proc_stat();
-    link_all_processes_to_their_parents();
     process_exited_processes();
 
     return 1;
@@ -1786,8 +1947,6 @@ static void cleanup_exited_pids(void) {
 
     for(p = root_of_pids; p ;) {
         if(!p->updated && (!p->keep || p->keeploops > 0)) {
-//          fprintf(stderr, "\tEXITED %d %s [parent %d %s, target %s] utime=%llu, stime=%llu, gtime=%llu, cutime=%llu, cstime=%llu, cgtime=%llu, minflt=%llu, majflt=%llu, cminflt=%llu, cmajflt=%llu\n", p->pid, p->comm, p->parent->pid, p->parent->comm, p->target->name,  p->utime, p->stime, p->gtime, p->cutime, p->cstime, p->cgtime, p->minflt, p->majflt, p->cminflt, p->cmajflt);
-
             if(unlikely(debug && (p->keep || p->keeploops)))
                 fprintf(stderr, " > CLEANUP cannot keep exited process %d (%s) anymore - removing it.\n", p->pid, p->comm);
 
@@ -1918,9 +2077,9 @@ static void apply_apps_groups_targets_inheritance(void) {
         fprintf(stderr, "apps.plugin: apply_apps_groups_targets_inheritance() made %d loops on the process tree\n", loops);
 }
 
-static long zero_all_targets(struct target *root) {
+static size_t zero_all_targets(struct target *root) {
     struct target *w;
-    long count = 0;
+    size_t count = 0;
 
     for (w = root; w ; w = w->next) {
         count++;
@@ -2111,7 +2270,7 @@ static inline void aggregate_pid_on_target(struct target *w, struct pid_stat *p,
     w->num_threads += p->num_threads;
 
     if(unlikely(debug || w->debug))
-        fprintf(stderr, "apps.plugin: \taggregating '%s' pid %d on target '%s' utime=%llu, stime=%llu, gtime=%llu, cutime=%llu, cstime=%llu, cgtime=%llu, minflt=%llu, majflt=%llu, cminflt=%llu, cmajflt=%llu\n", p->comm, p->pid, w->name, p->utime, p->stime, p->gtime, p->cutime, p->cstime, p->cgtime, p->minflt, p->majflt, p->cminflt, p->cmajflt);
+        fprintf(stderr, "apps.plugin: \taggregating '%s' pid %d on target '%s' utime=" KERNEL_UINT_FORMAT ", stime=" KERNEL_UINT_FORMAT ", gtime=" KERNEL_UINT_FORMAT ", cutime=" KERNEL_UINT_FORMAT ", cstime=" KERNEL_UINT_FORMAT ", cgtime=" KERNEL_UINT_FORMAT ", minflt=" KERNEL_UINT_FORMAT ", majflt=" KERNEL_UINT_FORMAT ", cminflt=" KERNEL_UINT_FORMAT ", cmajflt=" KERNEL_UINT_FORMAT "\n", p->comm, p->pid, w->name, p->utime, p->stime, p->gtime, p->cutime, p->cstime, p->cgtime, p->minflt, p->majflt, p->cminflt, p->cmajflt);
 }
 
 static void calculate_netdata_statistics(void) {
@@ -2120,7 +2279,7 @@ static void calculate_netdata_statistics(void) {
 
     zero_all_targets(users_root_target);
     zero_all_targets(groups_root_target);
-    apps_groups_targets = zero_all_targets(apps_groups_root_target);
+    apps_groups_targets_count = zero_all_targets(apps_groups_root_target);
 
     // this has to be done, before the cleanup
     struct pid_stat *p = NULL;
@@ -2182,21 +2341,18 @@ static void calculate_netdata_statistics(void) {
 
 int print_calculated_number(char *str, calculated_number value) { (void)str; (void)value; return 0; }
 
-static inline void send_BEGIN(const char *type, const char *id, unsigned long long usec) {
+static inline void send_BEGIN(const char *type, const char *id, usec_t usec) {
     fprintf(stdout, "BEGIN %s.%s %llu\n", type, id, usec);
 }
 
-static inline void send_SET(const char *name, unsigned long long value) {
-    fprintf(stdout, "SET %s = %llu\n", name, value);
+static inline void send_SET(const char *name, kernel_uint_t value) {
+    fprintf(stdout, "SET %s = " KERNEL_UINT_FORMAT "\n", name, value);
 }
 
 static inline void send_END(void) {
     fprintf(stdout, "END\n");
 }
 
-double utime_fix_ratio = 1.0, stime_fix_ratio = 1.0, gtime_fix_ratio = 1.0, cutime_fix_ratio = 1.0, cstime_fix_ratio = 1.0, cgtime_fix_ratio = 1.0;
-double minflt_fix_ratio = 1.0, majflt_fix_ratio = 1.0, cminflt_fix_ratio = 1.0, cmajflt_fix_ratio = 1.0;
-
 static usec_t send_resource_usage_to_netdata() {
     static struct timeval last = { 0, 0 };
     static struct rusage me_last;
@@ -2237,17 +2393,17 @@ static usec_t send_resource_usage_to_netdata() {
         "SET system = %llu\n"
         "END\n"
         "BEGIN netdata.apps_files %llu\n"
-        "SET files = %llu\n"
-        "SET pids = %ld\n"
+        "SET files = %zu\n"
+        "SET pids = %zu\n"
         "SET fds = %d\n"
-        "SET targets = %ld\n"
+        "SET targets = %zu\n"
         "END\n"
         "BEGIN netdata.apps_fix %llu\n"
-        "SET utime = %llu\n"
-        "SET stime = %llu\n"
-        "SET gtime = %llu\n"
-        "SET minflt = %llu\n"
-        "SET majflt = %llu\n"
+        "SET utime = %u\n"
+        "SET stime = %u\n"
+        "SET gtime = %u\n"
+        "SET minflt = %u\n"
+        "SET majflt = %u\n"
         "END\n"
         , usec
         , cpuuser
@@ -2256,48 +2412,48 @@ static usec_t send_resource_usage_to_netdata() {
         , file_counter
         , all_pids_count
         , all_files_len
-        , apps_groups_targets
+        , apps_groups_targets_count
         , usec
-        , (unsigned long long)(utime_fix_ratio   * 100 * RATES_DETAIL)
-        , (unsigned long long)(stime_fix_ratio   * 100 * RATES_DETAIL)
-        , (unsigned long long)(gtime_fix_ratio   * 100 * RATES_DETAIL)
-        , (unsigned long long)(minflt_fix_ratio  * 100 * RATES_DETAIL)
-        , (unsigned long long)(majflt_fix_ratio  * 100 * RATES_DETAIL)
+        , (unsigned int)(utime_fix_ratio   * 100 * RATES_DETAIL)
+        , (unsigned int)(stime_fix_ratio   * 100 * RATES_DETAIL)
+        , (unsigned int)(gtime_fix_ratio   * 100 * RATES_DETAIL)
+        , (unsigned int)(minflt_fix_ratio  * 100 * RATES_DETAIL)
+        , (unsigned int)(majflt_fix_ratio  * 100 * RATES_DETAIL)
         );
 
     if(include_exited_childs)
         fprintf(stdout,
             "BEGIN netdata.apps_children_fix %llu\n"
-            "SET cutime = %llu\n"
-            "SET cstime = %llu\n"
-            "SET cgtime = %llu\n"
-            "SET cminflt = %llu\n"
-            "SET cmajflt = %llu\n"
+            "SET cutime = %u\n"
+            "SET cstime = %u\n"
+            "SET cgtime = %u\n"
+            "SET cminflt = %u\n"
+            "SET cmajflt = %u\n"
             "END\n"
             , usec
-            , (unsigned long long)(cutime_fix_ratio  * 100 * RATES_DETAIL)
-            , (unsigned long long)(cstime_fix_ratio  * 100 * RATES_DETAIL)
-            , (unsigned long long)(cgtime_fix_ratio  * 100 * RATES_DETAIL)
-            , (unsigned long long)(cminflt_fix_ratio * 100 * RATES_DETAIL)
-            , (unsigned long long)(cmajflt_fix_ratio * 100 * RATES_DETAIL)
+            , (unsigned int)(cutime_fix_ratio  * 100 * RATES_DETAIL)
+            , (unsigned int)(cstime_fix_ratio  * 100 * RATES_DETAIL)
+            , (unsigned int)(cgtime_fix_ratio  * 100 * RATES_DETAIL)
+            , (unsigned int)(cminflt_fix_ratio * 100 * RATES_DETAIL)
+            , (unsigned int)(cmajflt_fix_ratio * 100 * RATES_DETAIL)
             );
 
     return usec;
 }
 
-static void normalize_data(struct target *root) {
+static void normalize_utilization(struct target *root) {
     struct target *w;
 
     // childs processing introduces spikes
     // here we try to eliminate them by disabling childs processing either for specific dimensions
     // or entirely. Of course, either way, we disable it just a single iteration.
 
-    unsigned long long max = processors * hz * RATES_DETAIL;
-    unsigned long long utime = 0, cutime = 0, stime = 0, cstime = 0, gtime = 0, cgtime = 0, minflt = 0, cminflt = 0, majflt = 0, cmajflt = 0;
+    kernel_uint_t max_time = processors * hz * RATES_DETAIL;
+    kernel_uint_t utime = 0, cutime = 0, stime = 0, cstime = 0, gtime = 0, cgtime = 0, minflt = 0, cminflt = 0, majflt = 0, cmajflt = 0;
 
-    if(global_utime > max) global_utime = max;
-    if(global_stime > max) global_stime = max;
-    if(global_gtime > max) global_gtime = max;
+    if(global_utime > max_time) global_utime = max_time;
+    if(global_stime > max_time) global_stime = max_time;
+    if(global_gtime > max_time) global_gtime = max_time;
 
     for(w = root; w ; w = w->next) {
         if(w->target || (!w->processes && !w->exposed)) continue;
@@ -2397,11 +2553,11 @@ static void normalize_data(struct target *root) {
 
     if(unlikely(debug)) {
         fprintf(stderr,
-            "SYSTEM: u=%llu s=%llu g=%llu "
-            "COLLECTED: u=%llu s=%llu g=%llu cu=%llu cs=%llu cg=%llu "
-            "DELTA: u=%lld s=%lld g=%lld "
+            "SYSTEM: u=" KERNEL_UINT_FORMAT " s=" KERNEL_UINT_FORMAT " g=" KERNEL_UINT_FORMAT " "
+            "COLLECTED: u=" KERNEL_UINT_FORMAT " s=" KERNEL_UINT_FORMAT " g=" KERNEL_UINT_FORMAT " cu=" KERNEL_UINT_FORMAT " cs=" KERNEL_UINT_FORMAT " cg=" KERNEL_UINT_FORMAT " "
+            "DELTA: u=" KERNEL_UINT_FORMAT " s=" KERNEL_UINT_FORMAT " g=" KERNEL_UINT_FORMAT " "
             "FIX: u=%0.2f s=%0.2f g=%0.2f cu=%0.2f cs=%0.2f cg=%0.2f "
-            "FINALLY: u=%llu s=%llu g=%llu cu=%llu cs=%llu cg=%llu "
+            "FINALLY: u=" KERNEL_UINT_FORMAT " s=" KERNEL_UINT_FORMAT " g=" KERNEL_UINT_FORMAT " cu=" KERNEL_UINT_FORMAT " cs=" KERNEL_UINT_FORMAT " cg=" KERNEL_UINT_FORMAT " "
             "\n"
             , global_utime
             , global_stime
@@ -2412,21 +2568,21 @@ static void normalize_data(struct target *root) {
             , cutime
             , cstime
             , cgtime
-            , (long long)utime + (long long)cutime - (long long)global_utime
-            , (long long)stime + (long long)cstime - (long long)global_stime
-            , (long long)gtime + (long long)cgtime - (long long)global_gtime
+            , utime + cutime - global_utime
+            , stime + cstime - global_stime
+            , gtime + cgtime - global_gtime
             , utime_fix_ratio
             , stime_fix_ratio
             , gtime_fix_ratio
             , cutime_fix_ratio
             , cstime_fix_ratio
             , cgtime_fix_ratio
-            , (unsigned long long)(utime * utime_fix_ratio)
-            , (unsigned long long)(stime * stime_fix_ratio)
-            , (unsigned long long)(gtime * gtime_fix_ratio)
-            , (unsigned long long)(cutime * cutime_fix_ratio)
-            , (unsigned long long)(cstime * cstime_fix_ratio)
-            , (unsigned long long)(cgtime * cgtime_fix_ratio)
+            , (kernel_uint_t)(utime * utime_fix_ratio)
+            , (kernel_uint_t)(stime * stime_fix_ratio)
+            , (kernel_uint_t)(gtime * gtime_fix_ratio)
+            , (kernel_uint_t)(cutime * cutime_fix_ratio)
+            , (kernel_uint_t)(cstime * cstime_fix_ratio)
+            , (kernel_uint_t)(cgtime * cgtime_fix_ratio)
             );
     }
 }
@@ -2437,21 +2593,21 @@ static void send_collected_data_to_netdata(struct target *root, const char *type
     send_BEGIN(type, "cpu", usec);
     for (w = root; w ; w = w->next) {
         if(unlikely(w->exposed))
-            send_SET(w->name, (unsigned long long)(w->utime * utime_fix_ratio) + (unsigned long long)(w->stime * stime_fix_ratio) + (unsigned long long)(w->gtime * gtime_fix_ratio) + (include_exited_childs?((unsigned long long)(w->cutime * cutime_fix_ratio) + (unsigned long long)(w->cstime * cstime_fix_ratio) + (unsigned long long)(w->cgtime * cgtime_fix_ratio)):0ULL));
+            send_SET(w->name, (kernel_uint_t)(w->utime * utime_fix_ratio) + (kernel_uint_t)(w->stime * stime_fix_ratio) + (kernel_uint_t)(w->gtime * gtime_fix_ratio) + (include_exited_childs?((kernel_uint_t)(w->cutime * cutime_fix_ratio) + (kernel_uint_t)(w->cstime * cstime_fix_ratio) + (kernel_uint_t)(w->cgtime * cgtime_fix_ratio)):0ULL));
     }
     send_END();
 
     send_BEGIN(type, "cpu_user", usec);
     for (w = root; w ; w = w->next) {
         if(unlikely(w->exposed))
-            send_SET(w->name, (unsigned long long)(w->utime * utime_fix_ratio) + (include_exited_childs?((unsigned long long)(w->cutime * cutime_fix_ratio)):0ULL));
+            send_SET(w->name, (kernel_uint_t)(w->utime * utime_fix_ratio) + (include_exited_childs?((kernel_uint_t)(w->cutime * cutime_fix_ratio)):0ULL));
     }
     send_END();
 
     send_BEGIN(type, "cpu_system", usec);
     for (w = root; w ; w = w->next) {
         if(unlikely(w->exposed))
-            send_SET(w->name, (unsigned long long)(w->stime * stime_fix_ratio) + (include_exited_childs?((unsigned long long)(w->cstime * cstime_fix_ratio)):0ULL));
+            send_SET(w->name, (kernel_uint_t)(w->stime * stime_fix_ratio) + (include_exited_childs?((kernel_uint_t)(w->cstime * cstime_fix_ratio)):0ULL));
     }
     send_END();
 
@@ -2459,7 +2615,7 @@ static void send_collected_data_to_netdata(struct target *root, const char *type
         send_BEGIN(type, "cpu_guest", usec);
         for (w = root; w ; w = w->next) {
             if(unlikely(w->exposed))
-                send_SET(w->name, (unsigned long long)(w->gtime * gtime_fix_ratio) + (include_exited_childs?((unsigned long long)(w->cgtime * cgtime_fix_ratio)):0ULL));
+                send_SET(w->name, (kernel_uint_t)(w->gtime * gtime_fix_ratio) + (include_exited_childs?((kernel_uint_t)(w->cgtime * cgtime_fix_ratio)):0ULL));
         }
         send_END();
     }
@@ -2495,14 +2651,14 @@ static void send_collected_data_to_netdata(struct target *root, const char *type
     send_BEGIN(type, "minor_faults", usec);
     for (w = root; w ; w = w->next) {
         if(unlikely(w->exposed))
-            send_SET(w->name, (unsigned long long)(w->minflt * minflt_fix_ratio) + (include_exited_childs?((unsigned long long)(w->cminflt * cminflt_fix_ratio)):0ULL));
+            send_SET(w->name, (kernel_uint_t)(w->minflt * minflt_fix_ratio) + (include_exited_childs?((kernel_uint_t)(w->cminflt * cminflt_fix_ratio)):0ULL));
     }
     send_END();
 
     send_BEGIN(type, "major_faults", usec);
     for (w = root; w ; w = w->next) {
         if(unlikely(w->exposed))
-            send_SET(w->name, (unsigned long long)(w->majflt * majflt_fix_ratio) + (include_exited_childs?((unsigned long long)(w->cmajflt * cmajflt_fix_ratio)):0ULL));
+            send_SET(w->name, (kernel_uint_t)(w->majflt * majflt_fix_ratio) + (include_exited_childs?((kernel_uint_t)(w->cmajflt * cmajflt_fix_ratio)):0ULL));
     }
     send_END();
 
@@ -2868,7 +3024,7 @@ int main(int argc, char **argv)
     parse_args(argc, argv);
 
     all_pids_sortlist = callocz(sizeof(pid_t), (size_t)pid_max);
-    all_pids = callocz(sizeof(struct pid_stat *), (size_t) pid_max);
+    all_pids          = callocz(sizeof(struct pid_stat *), (size_t) pid_max);
 
     fprintf(stdout,
         "CHART netdata.apps_cpu '' 'Apps Plugin CPU' 'milliseconds/s' apps.plugin netdata.apps_cpu stacked 140000 %1$d\n"
@@ -2919,14 +3075,14 @@ int main(int argc, char **argv)
         }
 #endif
 
-        if(!collect_data_for_all_processes_from_proc()) {
+        if(!collect_data_for_all_processes()) {
             error("Cannot collect /proc data for running processes. Disabling apps.plugin...");
             printf("DISABLE\n");
             exit(1);
         }
 
         calculate_netdata_statistics();
-        normalize_data(apps_groups_root_target);
+        normalize_utilization(apps_groups_root_target);
 
         usec_t dt = send_resource_usage_to_netdata();
 
@@ -2952,7 +3108,7 @@ int main(int argc, char **argv)
         show_guest_time_old = show_guest_time;
 
         if(unlikely(debug))
-            fprintf(stderr, "apps.plugin: done Loop No %llu\n", global_iterations_counter);
+            fprintf(stderr, "apps.plugin: done Loop No %zu\n", global_iterations_counter);
 
         time_t current_t = now_realtime_sec();
 
index 42f3d8d154212cf8785d61bbe72dafc67ed4c6bf..5b6c52505ea20f8bb7983c2620c98385f7316356 100644 (file)
@@ -1155,30 +1155,25 @@ pid_t get_system_pid_max(void) {
         return pid_max;
     #else
 
+    static char read = 0;
+    if(unlikely(read)) return pid_max;
+    read = 1;
+
     char filename[FILENAME_MAX + 1];
     snprintfz(filename, FILENAME_MAX, "%s/proc/sys/kernel/pid_max", global_host_prefix);
-    procfile *ff = procfile_open(filename, NULL, PROCFILE_FLAG_DEFAULT);
-    if(!ff) {
-        error("Cannot open file '%s'. Assuming system supports %d pids.", filename, pid_max);
-        return pid_max;
-    }
 
-    ff = procfile_readall(ff);
-    if(!ff) {
-        error("Cannot read file '%s'. Assuming system supports %d pids.", filename, pid_max);
+    unsigned long long max = 0;
+    if(read_single_number_file(filename, &max) != 0) {
+        error("Cannot open file '%s'. Assuming system supports %d pids.", filename, pid_max);
         return pid_max;
     }
 
-    pid_max = (pid_t)str2i(procfile_lineword(ff, 0, 0));
-    if(!pid_max) {
-        procfile_close(ff);
-        pid_max = 32768;
+    if(!max) {
         error("Cannot parse file '%s'. Assuming system supports %d pids.", filename, pid_max);
         return pid_max;
     }
 
-    procfile_close(ff);
-    debug(D_SYSTEM, "System supports %d pids.", pid_max);
+    pid_max = (pid_t) max;
     return pid_max;
 
     #endif /* __APPLE__ */
index 193312eec3af855c2587e15a6dc93d485138b24f..7f9be80b18002ae3690d2e17ba01fea622df6c5e 100755 (executable)
@@ -251,34 +251,34 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char
             ae->exec_run_timestamp      = (uint32_t)strtoul(pointers[11], NULL, 16);
             ae->delay_up_to_timestamp   = (uint32_t)strtoul(pointers[12], NULL, 16);
 
-            if(unlikely(ae->name)) freez(ae->name);
+            freez(ae->name);
             ae->name = strdupz(pointers[13]);
             ae->hash_name = simple_hash(ae->name);
 
-            if(unlikely(ae->chart)) freez(ae->chart);
+            freez(ae->chart);
             ae->chart = strdupz(pointers[14]);
             ae->hash_chart = simple_hash(ae->chart);
 
-            if(unlikely(ae->family)) freez(ae->family);
+            freez(ae->family);
             ae->family = strdupz(pointers[15]);
 
-            if(unlikely(ae->exec)) freez(ae->exec);
+            freez(ae->exec);
             ae->exec = strdupz(pointers[16]);
             if(!*ae->exec) { freez(ae->exec); ae->exec = NULL; }
 
-            if(unlikely(ae->recipient)) freez(ae->recipient);
+            freez(ae->recipient);
             ae->recipient = strdupz(pointers[17]);
             if(!*ae->recipient) { freez(ae->recipient); ae->recipient = NULL; }
 
-            if(unlikely(ae->source)) freez(ae->source);
+            freez(ae->source);
             ae->source = strdupz(pointers[18]);
             if(!*ae->source) { freez(ae->source); ae->source = NULL; }
 
-            if(unlikely(ae->units)) freez(ae->units);
+            freez(ae->units);
             ae->units = strdupz(pointers[19]);
             if(!*ae->units) { freez(ae->units); ae->units = NULL; }
 
-            if(unlikely(ae->info)) freez(ae->info);
+            freez(ae->info);
             ae->info = strdupz(pointers[20]);
             if(!*ae->info) { freez(ae->info); ae->info = NULL; }
 
@@ -290,6 +290,12 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char
             ae->new_value   = str2l(pointers[25]);
             ae->old_value   = str2l(pointers[26]);
 
+            static char value_string[100 + 1];
+            freez(ae->old_value_string);
+            freez(ae->new_value_string);
+            ae->old_value_string = strdupz(format_value_and_unit(value_string, 100, ae->old_value, ae->units, -1));
+            ae->new_value_string = strdupz(format_value_and_unit(value_string, 100, ae->new_value, ae->units, -1));
+
             // add it to host if not already there
             if(unlikely(*pointers[0] == 'A')) {
                 ae->next = host->health_log.alarms;
@@ -353,17 +359,26 @@ static inline void health_alarm_log_load(RRDHOST *host) {
 // ----------------------------------------------------------------------------
 // health alarm log management
 
-static inline void health_alarm_log(RRDHOST *host,
-                uint32_t alarm_id, uint32_t alarm_event_id,
-                time_t when,
-                const char *name, const char *chart, const char *family,
-                const char *exec, const char *recipient, time_t duration,
-                calculated_number old_value, calculated_number new_value,
-                int old_status, int new_status,
-                const char *source,
-                const char *units,
-                const char *info,
-                int delay
+static inline void health_alarm_log(
+        RRDHOST *host,
+        uint32_t alarm_id,
+        uint32_t alarm_event_id,
+        time_t when,
+        const char *name,
+        const char *chart,
+        const char *family,
+        const char *exec,
+        const char *recipient,
+        time_t duration,
+        calculated_number old_value,
+        calculated_number new_value,
+        int old_status,
+        int new_status,
+        const char *source,
+        const char *units,
+        const char *info,
+        int delay,
+        uint32_t flags
 ) {
     debug(D_HEALTH, "Health adding alarm log entry with id: %u", host->health_log.next_log_id);
 
@@ -391,12 +406,19 @@ static inline void health_alarm_log(RRDHOST *host,
     ae->when = when;
     ae->old_value = old_value;
     ae->new_value = new_value;
+
+    static char value_string[100 + 1];
+    ae->old_value_string = strdupz(format_value_and_unit(value_string, 100, ae->old_value, ae->units, -1));
+    ae->new_value_string = strdupz(format_value_and_unit(value_string, 100, ae->new_value, ae->units, -1));
+
     ae->old_status = old_status;
     ae->new_status = new_status;
     ae->duration = duration;
     ae->delay = delay;
     ae->delay_up_to_timestamp = when + delay;
 
+    ae->flags |= flags;
+
     if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL)
         ae->non_clear_duration += ae->duration;
 
@@ -1095,7 +1117,27 @@ static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
 
     {
         time_t now = now_realtime_sec();
-        health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_UNINITIALIZED, rc->source, rc->units, rc->info, 0);
+        health_alarm_log(
+                st->rrdhost,
+                rc->id,
+                rc->next_event_id++,
+                now,
+                rc->name,
+                rc->rrdset->id,
+                rc->rrdset->family,
+                rc->exec,
+                rc->recipient,
+                now - rc->last_status_change,
+                rc->old_value,
+                rc->value,
+                rc->status,
+                RRDCALC_STATUS_UNINITIALIZED,
+                rc->source,
+                rc->units,
+                rc->info,
+                0,
+                0
+        );
     }
 }
 
@@ -1133,7 +1175,27 @@ inline void rrdsetcalc_unlink(RRDCALC *rc) {
 
     {
         time_t now = now_realtime_sec();
-        health_alarm_log(st->rrdhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0);
+        health_alarm_log(
+                st->rrdhost,
+                rc->id,
+                rc->next_event_id++,
+                now,
+                rc->name,
+                rc->rrdset->id,
+                rc->rrdset->family,
+                rc->exec,
+                rc->recipient,
+                now - rc->last_status_change,
+                rc->old_value,
+                rc->value,
+                rc->status,
+                RRDCALC_STATUS_REMOVED,
+                rc->source,
+                rc->units,
+                rc->info,
+                0,
+                0
+        );
     }
 
     RRDHOST *host = st->rrdhost;
@@ -1472,6 +1534,7 @@ static inline void rrdcalctemplate_free(RRDHOST *host, RRDCALCTEMPLATE *rt) {
 #define HEALTH_UNITS_KEY "units"
 #define HEALTH_INFO_KEY "info"
 #define HEALTH_DELAY_KEY "delay"
+#define HEALTH_OPTIONS_KEY "options"
 
 static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
     if(!rc->chart) {
@@ -1702,6 +1765,35 @@ static inline int health_parse_delay(
     return 1;
 }
 
+static inline uint32_t health_parse_options(const char *s) {
+    uint32_t options = 0;
+    char buf[100+1] = "";
+
+    while(*s) {
+        buf[0] = '\0';
+
+        // skip spaces
+        while(*s && isspace(*s))
+            s++;
+
+        // find the next space
+        size_t count = 0;
+        while(*s && count < 100 && !isspace(*s))
+            buf[count++] = *s++;
+
+        if(buf[0]) {
+            buf[count] = '\0';
+
+            if(!strcasecmp(buf, "no-clear-notification") || !strcasecmp(buf, "no-clear"))
+                options |= RRDCALC_FLAG_NO_CLEAR_NOTIFICATION;
+            else
+                error("Ignoring unknown alarm option '%s'", buf);
+        }
+    }
+
+    return options;
+}
+
 static inline int health_parse_db_lookup(
         size_t line, const char *path, const char *file, char *string,
         int *group_method, int *after, int *before, int *every,
@@ -1830,7 +1922,25 @@ static inline void strip_quotes(char *s) {
 int health_readfile(const char *path, const char *filename) {
     debug(D_HEALTH, "Health configuration reading file '%s/%s'", path, filename);
 
-    static uint32_t hash_alarm = 0, hash_template = 0, hash_on = 0, hash_families = 0, hash_calc = 0, hash_green = 0, hash_red = 0, hash_warn = 0, hash_crit = 0, hash_exec = 0, hash_every = 0, hash_lookup = 0, hash_units = 0, hash_info = 0, hash_recipient = 0, hash_delay = 0;
+    static uint32_t
+            hash_alarm = 0,
+            hash_template = 0,
+            hash_on = 0,
+            hash_families = 0,
+            hash_calc = 0,
+            hash_green = 0,
+            hash_red = 0,
+            hash_warn = 0,
+            hash_crit = 0,
+            hash_exec = 0,
+            hash_every = 0,
+            hash_lookup = 0,
+            hash_units = 0,
+            hash_info = 0,
+            hash_recipient = 0,
+            hash_delay = 0,
+            hash_options = 0;
+
     char buffer[HEALTH_CONF_MAX_LINE + 1];
 
     if(unlikely(!hash_alarm)) {
@@ -1850,6 +1960,7 @@ int health_readfile(const char *path, const char *filename) {
         hash_info = simple_hash(HEALTH_INFO_KEY);
         hash_recipient = simple_hash(HEALTH_RECIPIENT_KEY);
         hash_delay = simple_uhash(HEALTH_DELAY_KEY);
+        hash_options = simple_uhash(HEALTH_OPTIONS_KEY);
     }
 
     snprintfz(buffer, HEALTH_CONF_MAX_LINE, "%s/%s", path, filename);
@@ -2062,6 +2173,9 @@ int health_readfile(const char *path, const char *filename) {
             else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
                 health_parse_delay(line, path, filename, value, &rc->delay_up_duration, &rc->delay_down_duration, &rc->delay_max_duration, &rc->delay_multiplier);
             }
+            else if(hash == hash_options && !strcasecmp(key, HEALTH_OPTIONS_KEY)) {
+                rc->options |= health_parse_options(value);
+            }
             else {
                 error("Health configuration at line %zu of file '%s/%s' for alarm '%s' has unknown key '%s'.",
                      line, path, filename, rc->name, key);
@@ -2183,6 +2297,9 @@ int health_readfile(const char *path, const char *filename) {
             else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
                 health_parse_delay(line, path, filename, value, &rt->delay_up_duration, &rt->delay_down_duration, &rt->delay_max_duration, &rt->delay_multiplier);
             }
+            else if(hash == hash_options && !strcasecmp(key, HEALTH_OPTIONS_KEY)) {
+                rt->options |= health_parse_options(value);
+            }
             else {
                 error("Health configuration at line %zu of file '%s/%s' for template '%s' has unknown key '%s'.",
                       line, path, filename, rt->name, key);
@@ -2305,61 +2422,70 @@ static inline void health_string2json(BUFFER *wb, const char *prefix, const char
 }
 
 static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) {
-    buffer_sprintf(wb, "\n\t{\n"
-                           "\t\t\"hostname\": \"%s\",\n"
-                           "\t\t\"unique_id\": %u,\n"
-                           "\t\t\"alarm_id\": %u,\n"
-                           "\t\t\"alarm_event_id\": %u,\n"
-                           "\t\t\"name\": \"%s\",\n"
-                           "\t\t\"chart\": \"%s\",\n"
-                           "\t\t\"family\": \"%s\",\n"
-                           "\t\t\"processed\": %s,\n"
-                           "\t\t\"updated\": %s,\n"
-                           "\t\t\"exec_run\": %lu,\n"
-                           "\t\t\"exec_failed\": %s,\n"
-                           "\t\t\"exec\": \"%s\",\n"
-                           "\t\t\"recipient\": \"%s\",\n"
-                           "\t\t\"exec_code\": %d,\n"
-                           "\t\t\"source\": \"%s\",\n"
-                           "\t\t\"units\": \"%s\",\n"
-                           "\t\t\"info\": \"%s\",\n"
-                           "\t\t\"when\": %lu,\n"
-                           "\t\t\"duration\": %lu,\n"
-                           "\t\t\"non_clear_duration\": %lu,\n"
-                           "\t\t\"status\": \"%s\",\n"
-                           "\t\t\"old_status\": \"%s\",\n"
-                           "\t\t\"delay\": %d,\n"
-                           "\t\t\"delay_up_to_timestamp\": %lu,\n"
-                           "\t\t\"updated_by_id\": %u,\n"
-                           "\t\t\"updates_id\": %u,\n",
-                   host->hostname,
-                   ae->unique_id,
-                   ae->alarm_id,
-                   ae->alarm_event_id,
-                   ae->name,
-                   ae->chart,
-                   ae->family,
-                   (ae->flags & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false",
-                   (ae->flags & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false",
-                   (unsigned long)ae->exec_run_timestamp,
-                   (ae->flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false",
-                   ae->exec?ae->exec:health.health_default_exec,
-                   ae->recipient?ae->recipient:health.health_default_recipient,
-                   ae->exec_code,
-                   ae->source,
-                   ae->units?ae->units:"",
-                   ae->info?ae->info:"",
-                   (unsigned long)ae->when,
-                   (unsigned long)ae->duration,
-                   (unsigned long)ae->non_clear_duration,
-                   rrdcalc_status2string(ae->new_status),
-                   rrdcalc_status2string(ae->old_status),
-                   ae->delay,
-                   (unsigned long)ae->delay_up_to_timestamp,
-                   ae->updated_by_id,
-                   ae->updates_id
+    buffer_sprintf(wb,
+            "\n\t{\n"
+                    "\t\t\"hostname\": \"%s\",\n"
+                    "\t\t\"unique_id\": %u,\n"
+                    "\t\t\"alarm_id\": %u,\n"
+                    "\t\t\"alarm_event_id\": %u,\n"
+                    "\t\t\"name\": \"%s\",\n"
+                    "\t\t\"chart\": \"%s\",\n"
+                    "\t\t\"family\": \"%s\",\n"
+                    "\t\t\"processed\": %s,\n"
+                    "\t\t\"updated\": %s,\n"
+                    "\t\t\"exec_run\": %lu,\n"
+                    "\t\t\"exec_failed\": %s,\n"
+                    "\t\t\"exec\": \"%s\",\n"
+                    "\t\t\"recipient\": \"%s\",\n"
+                    "\t\t\"exec_code\": %d,\n"
+                    "\t\t\"source\": \"%s\",\n"
+                    "\t\t\"units\": \"%s\",\n"
+                    "\t\t\"info\": \"%s\",\n"
+                    "\t\t\"when\": %lu,\n"
+                    "\t\t\"duration\": %lu,\n"
+                    "\t\t\"non_clear_duration\": %lu,\n"
+                    "\t\t\"status\": \"%s\",\n"
+                    "\t\t\"old_status\": \"%s\",\n"
+                    "\t\t\"delay\": %d,\n"
+                    "\t\t\"delay_up_to_timestamp\": %lu,\n"
+                    "\t\t\"updated_by_id\": %u,\n"
+                    "\t\t\"updates_id\": %u,\n"
+                    "\t\t\"value_string\": \"%s\",\n"
+                    "\t\t\"old_value_string\": \"%s\",\n"
+            , host->hostname
+            , ae->unique_id
+            , ae->alarm_id
+            , ae->alarm_event_id
+            , ae->name
+            , ae->chart
+            , ae->family
+            , (ae->flags & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false"
+            , (ae->flags & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false"
+            , (unsigned long)ae->exec_run_timestamp
+            , (ae->flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false"
+            , ae->exec?ae->exec:health.health_default_exec
+            , ae->recipient?ae->recipient:health.health_default_recipient
+            , ae->exec_code
+            , ae->source
+            , ae->units?ae->units:""
+            , ae->info?ae->info:""
+            , (unsigned long)ae->when
+            , (unsigned long)ae->duration
+            , (unsigned long)ae->non_clear_duration
+            , rrdcalc_status2string(ae->new_status)
+            , rrdcalc_status2string(ae->old_status)
+            , ae->delay
+            , (unsigned long)ae->delay_up_to_timestamp
+            , ae->updated_by_id
+            , ae->updates_id
+            , ae->new_value_string
+            , ae->old_value_string
     );
 
+    if(unlikely(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION)) {
+        buffer_strcat(wb, "\t\t\"no_clear_notification\": true,\n");
+    }
+
     buffer_strcat(wb, "\t\t\"value\":");
     buffer_rrd_value(wb, ae->new_value);
     buffer_strcat(wb, ",\n");
@@ -2415,30 +2541,34 @@ static inline void health_rrdcalc2json_nolock(BUFFER *wb, RRDCALC *rc) {
                    "\t\t\t\"delay_multiplier\": %f,\n"
                    "\t\t\t\"delay\": %d,\n"
                    "\t\t\t\"delay_up_to_timestamp\": %lu,\n"
-            , rc->chart, rc->name
-            , (unsigned long)rc->id
-            , rc->name
-            , rc->chart
-            , (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:""
-            , (rc->rrdset)?"true":"false"
-            , rc->exec?rc->exec:health.health_default_exec
-            , rc->recipient?rc->recipient:health.health_default_recipient
-            , rc->source
-            , rc->units?rc->units:""
-            , rc->info?rc->info:""
-            , rrdcalc_status2string(rc->status)
-            , (unsigned long)rc->last_status_change
-            , (unsigned long)rc->last_updated
-            , (unsigned long)rc->next_update
-            , rc->update_every
-            , rc->delay_up_duration
-            , rc->delay_down_duration
-            , rc->delay_max_duration
-            , rc->delay_multiplier
-            , rc->delay_last
-            , (unsigned long)rc->delay_up_to_timestamp
+           , rc->chart, rc->name
+           , (unsigned long)rc->id
+           , rc->name
+           , rc->chart
+           , (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:""
+           , (rc->rrdset)?"true":"false"
+           , rc->exec?rc->exec:health.health_default_exec
+           , rc->recipient?rc->recipient:health.health_default_recipient
+           , rc->source
+           , rc->units?rc->units:""
+           , rc->info?rc->info:""
+           , rrdcalc_status2string(rc->status)
+           , (unsigned long)rc->last_status_change
+           , (unsigned long)rc->last_updated
+           , (unsigned long)rc->next_update
+           , rc->update_every
+           , rc->delay_up_duration
+           , rc->delay_down_duration
+           , rc->delay_max_duration
+           , rc->delay_multiplier
+           , rc->delay_last
+           , (unsigned long)rc->delay_up_to_timestamp
     );
 
+    if(unlikely(rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)) {
+        buffer_strcat(wb, "\t\t\t\"no_clear_notification\": true,\n");
+    }
+
     if(RRDCALC_HAS_DB_LOOKUP(rc)) {
         if(rc->dimensions && *rc->dimensions)
             health_string2json(wb, "\t\t\t", "lookup_dimensions", rc->dimensions, ",\n");
@@ -2601,12 +2731,21 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
 
     if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
         // do not send notifications for internal statuses
+        debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
+        goto done;
+    }
+
+    if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
+        // do not send notifications for disabled statuses
+        debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
+        // mark it as run, so that we will send the same alarm if it happens again
         goto done;
     }
 
     // find the previous notification for the same alarm
     // which we have run the exec script
-    {
+    // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set
+    if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
         uint32_t id = ae->alarm_id;
         ALARM_ENTRY *t;
         for(t = ae->next; t ; t = t->next) {
@@ -2643,7 +2782,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
     const char *recipient = ae->recipient;
     if(!recipient) recipient = health.health_default_recipient;
 
-    snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s'",
+    snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '%0.0Lf' '%0.0Lf' '%s' '%u' '%u' '%s' '%s' '%s' '%s'",
               exec,
               recipient,
               host->hostname,
@@ -2662,7 +2801,9 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
               (uint32_t)ae->duration,
               (uint32_t)ae->non_clear_duration,
               ae->units?ae->units:"",
-              ae->info?ae->info:""
+              ae->info?ae->info:"",
+              ae->new_value_string,
+              ae->old_value_string
     );
 
     ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
@@ -2754,6 +2895,8 @@ static inline void health_alarm_log_process(RRDHOST *host) {
         freez(ae->source);
         freez(ae->units);
         freez(ae->info);
+        freez(ae->old_value_string);
+        freez(ae->new_value_string);
         freez(ae);
 
         ae = t;
@@ -3083,7 +3226,27 @@ void *health_main(void *ptr) {
 
                     rc->delay_last = delay;
                     rc->delay_up_to_timestamp = now + delay;
-                    health_alarm_log(&localhost, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, rc->delay_last);
+                    health_alarm_log(
+                            &localhost,
+                            rc->id,
+                            rc->next_event_id++,
+                            now,
+                            rc->name,
+                            rc->rrdset->id,
+                            rc->rrdset->family,
+                            rc->exec,
+                            rc->recipient,
+                            now - rc->last_status_change,
+                            rc->old_value,
+                            rc->value,
+                            rc->status,
+                            status,
+                            rc->source,
+                            rc->units,
+                            rc->info,
+                            rc->delay_last,
+                            (rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)?HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION:0
+                    );
                     rc->last_status_change = now;
                     rc->status = status;
                 }
index 79831d4fc563257a24ce26cd7e9f42bad1036649..87f8a1a18e547e2665cd5b007e3483dfcb361a06 100644 (file)
@@ -119,13 +119,14 @@ typedef struct rrddimvar {
 #define RRDCALC_STATUS_WARNING        3
 #define RRDCALC_STATUS_CRITICAL       4
 
-#define RRDCALC_FLAG_DB_ERROR      0x00000001
-#define RRDCALC_FLAG_DB_NAN        0x00000002
-/* #define RRDCALC_FLAG_DB_STALE      0x00000004 */
-#define RRDCALC_FLAG_CALC_ERROR    0x00000008
-#define RRDCALC_FLAG_WARN_ERROR    0x00000010
-#define RRDCALC_FLAG_CRIT_ERROR    0x00000020
-#define RRDCALC_FLAG_RUNNABLE      0x00000040
+#define RRDCALC_FLAG_DB_ERROR              0x00000001
+#define RRDCALC_FLAG_DB_NAN                0x00000002
+/* #define RRDCALC_FLAG_DB_STALE           0x00000004 */
+#define RRDCALC_FLAG_CALC_ERROR            0x00000008
+#define RRDCALC_FLAG_WARN_ERROR            0x00000010
+#define RRDCALC_FLAG_CRIT_ERROR            0x00000020
+#define RRDCALC_FLAG_RUNNABLE              0x00000040
+#define RRDCALC_FLAG_NO_CLEAR_NOTIFICATION 0x80000000
 
 typedef struct rrdcalc {
     uint32_t id;                    // the unique id of this alarm
@@ -274,11 +275,12 @@ typedef struct rrdcalctemplate {
 
 #define RRDCALCTEMPLATE_HAS_CALCULATION(rt) ((rt)->after)
 
-#define HEALTH_ENTRY_FLAG_PROCESSED    0x00000001
-#define HEALTH_ENTRY_FLAG_UPDATED      0x00000002
-#define HEALTH_ENTRY_FLAG_EXEC_RUN     0x00000004
-#define HEALTH_ENTRY_FLAG_EXEC_FAILED  0x00000008
-#define HEALTH_ENTRY_FLAG_SAVED        0x10000000
+#define HEALTH_ENTRY_FLAG_PROCESSED             0x00000001
+#define HEALTH_ENTRY_FLAG_UPDATED               0x00000002
+#define HEALTH_ENTRY_FLAG_EXEC_RUN              0x00000004
+#define HEALTH_ENTRY_FLAG_EXEC_FAILED           0x00000008
+#define HEALTH_ENTRY_FLAG_SAVED                 0x10000000
+#define HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION 0x80000000
 
 typedef struct alarm_entry {
     uint32_t unique_id;
@@ -308,6 +310,10 @@ typedef struct alarm_entry {
 
     calculated_number old_value;
     calculated_number new_value;
+
+    char *old_value_string;
+    char *new_value_string;
+
     int old_status;
     int new_status;
 
index e776f830e489e7577ac01b1c376779f8d436c17d..e9d6e87945d74c641d64866af0eba8f9716ea302 100644 (file)
@@ -3,6 +3,19 @@
 
 #include "common.h"
 
+#ifdef KERNEL_32BIT
+typedef uint32_t kernel_uint_t;
+#define str2kernel_unit_t(string) str2uint32_t(string)
+#define KERNEL_UINT_FORMAT "%u"
+#else
+typedef uint64_t kernel_uint_t;
+#define str2kernel_unit_t(string) str2uint64_t(string)
+#define KERNEL_UINT_FORMAT "%lu"
+#endif
+
+#define str2pid_t(string) str2uint32_t(string)
+
+
 // for faster execution, allow the compiler to inline
 // these functions that are called thousands of times per second
 
@@ -70,6 +83,26 @@ static inline long str2l(const char *s) {
     return n;
 }
 
+static inline uint32_t str2uint32_t(const char *s) {
+    uint32_t n = 0;
+    char c;
+    for(c = *s; c >= '0' && c <= '9' ; c = *(++s)) {
+        n *= 10;
+        n += c - '0';
+    }
+    return n;
+}
+
+static inline uint64_t str2uint64_t(const char *s) {
+    uint64_t n = 0;
+    char c;
+    for(c = *s; c >= '0' && c <= '9' ; c = *(++s)) {
+        n *= 10;
+        n += c - '0';
+    }
+    return n;
+}
+
 static inline unsigned long str2ul(const char *s) {
     unsigned long n = 0;
     char c;
index ca134fcb05fddd60bbe331768237d6e21bca4abd..fb547e440526a1489b60365203416323c134ccb0 100644 (file)
@@ -46,8 +46,8 @@ struct netdata_static_thread static_threads[] = {
 #else
     {"proc",               "plugins",   "proc",       1, NULL, NULL, proc_main},
     {"diskspace",          "plugins",   "diskspace",  1, NULL, NULL, proc_diskspace_main},
-#endif /* __FreeBSD__, __APPLE__*/
     {"cgroups",            "plugins",   "cgroups",    1, NULL, NULL, cgroups_main},
+#endif /* __FreeBSD__, __APPLE__*/
     {"check",              "plugins",   "checks",     0, NULL, NULL, checks_main},
     {"backends",            NULL,       NULL,         1, NULL, NULL, backends_main},
     {"health",              NULL,       NULL,         1, NULL, NULL, health_main},
@@ -552,6 +552,10 @@ int main(int argc, char **argv)
         if(!p) p = "/bin:/usr/bin";
         snprintfz(path, 1024, "%s:%s", p, "/sbin:/usr/sbin:/usr/local/bin:/usr/local/sbin");
         setenv("PATH", config_get("plugins", "PATH environment variable", path), 1);
+
+        p = getenv("PYTHONPATH");
+        if(!p) p = "";
+        setenv("PYTHONPATH", config_get("plugins", "PYTHONPATH environment variable", p), 1);
     }
 
     char *user = NULL;
index 6f52bf465fb28dba7f4aaba00db84d101c03f014..736a35de95b1d294b86547044f37419f8cf143d8 100644 (file)
@@ -15,6 +15,27 @@ size_t procfile_max_lines = PFLINES_INCREASE_STEP;
 size_t procfile_max_words = PFWORDS_INCREASE_STEP;
 size_t procfile_max_allocation = PROCFILE_INCREMENT_BUFFER;
 
+
+// ----------------------------------------------------------------------------
+
+char *procfile_filename(procfile *ff) {
+    if(ff->filename[0]) return ff->filename;
+
+    char buffer[FILENAME_MAX + 1];
+    snprintfz(buffer, FILENAME_MAX, "/proc/self/fd/%d", ff->fd);
+
+    ssize_t l = readlink(buffer, ff->filename, FILENAME_MAX);
+    if(unlikely(l == -1))
+        snprintfz(ff->filename, FILENAME_MAX, "unknown filename for fd %d", ff->fd);
+    else
+        ff->filename[l] = '\0';
+
+    // on non-linux systems, something like this will be needed
+    // fcntl(ff->fd, F_GETPATH, ff->filename)
+
+    return ff->filename;
+}
+
 // ----------------------------------------------------------------------------
 // An array of words
 
@@ -114,7 +135,7 @@ static inline void pflines_free(pflines *fl) {
 #define PF_CHAR_IS_CLOSE        'C'
 
 void procfile_close(procfile *ff) {
-    debug(D_PROCFILE, PF_PREFIX ": Closing file '%s'", ff->filename);
+    debug(D_PROCFILE, PF_PREFIX ": Closing file '%s'", procfile_filename(ff));
 
     if(likely(ff->lines)) pflines_free(ff->lines);
     if(likely(ff->words)) pfwords_free(ff->words);
@@ -250,26 +271,24 @@ static inline void procfile_parser(procfile *ff) {
 }
 
 procfile *procfile_readall(procfile *ff) {
-    debug(D_PROCFILE, PF_PREFIX ": Reading file '%s'.", ff->filename);
-
-    ssize_t r = 1;
-    ff->len = 0;
+    // debug(D_PROCFILE, PF_PREFIX ": Reading file '%s'.", ff->filename);
 
-    while(likely(r > 0)) {
+    ff->len = 0;    // zero the used size
+    ssize_t r = 1;  // read at least once
+    while(r > 0) {
         ssize_t s = ff->len;
         ssize_t x = ff->size - s;
 
         if(unlikely(!x)) {
-            debug(D_PROCFILE, PF_PREFIX ": Expanding data buffer for file '%s'.", ff->filename);
-
+            debug(D_PROCFILE, PF_PREFIX ": Expanding data buffer for file '%s'.", procfile_filename(ff));
             ff = reallocz(ff, sizeof(procfile) + ff->size + PROCFILE_INCREMENT_BUFFER);
             ff->size += PROCFILE_INCREMENT_BUFFER;
         }
 
-        debug(D_PROCFILE, "Reading file '%s', from position %ld with length %lu", ff->filename, s, ff->size - s);
+        debug(D_PROCFILE, "Reading file '%s', from position %ld with length %lu", procfile_filename(ff), s, ff->size - s);
         r = read(ff->fd, &ff->data[s], ff->size - s);
         if(unlikely(r == -1)) {
-            if(unlikely(!(ff->flags & PROCFILE_FLAG_NO_ERROR_ON_FILE_IO))) error(PF_PREFIX ": Cannot read from file '%s'", ff->filename);
+            if(unlikely(!(ff->flags & PROCFILE_FLAG_NO_ERROR_ON_FILE_IO))) error(PF_PREFIX ": Cannot read from file '%s'", procfile_filename(ff));
             procfile_close(ff);
             return NULL;
         }
@@ -277,9 +296,9 @@ procfile *procfile_readall(procfile *ff) {
         ff->len += r;
     }
 
-    debug(D_PROCFILE, "Rewinding file '%s'", ff->filename);
+    // debug(D_PROCFILE, "Rewinding file '%s'", ff->filename);
     if(unlikely(lseek(ff->fd, 0, SEEK_SET) == -1)) {
-        if(unlikely(!(ff->flags & PROCFILE_FLAG_NO_ERROR_ON_FILE_IO))) error(PF_PREFIX ": Cannot rewind on file '%s'.", ff->filename);
+        if(unlikely(!(ff->flags & PROCFILE_FLAG_NO_ERROR_ON_FILE_IO))) error(PF_PREFIX ": Cannot rewind on file '%s'.", procfile_filename(ff));
         procfile_close(ff);
         return NULL;
     }
@@ -294,7 +313,7 @@ procfile *procfile_readall(procfile *ff) {
         if(unlikely(ff->words->len > procfile_max_words)) procfile_max_words = ff->words->len;
     }
 
-    debug(D_PROCFILE, "File '%s' updated.", ff->filename);
+    // debug(D_PROCFILE, "File '%s' updated.", ff->filename);
     return ff;
 }
 
@@ -379,7 +398,9 @@ procfile *procfile_open(const char *filename, const char *separators, uint32_t f
 
     size_t size = (unlikely(procfile_adaptive_initial_allocation)) ? procfile_max_allocation : PROCFILE_INCREMENT_BUFFER;
     procfile *ff = mallocz(sizeof(procfile) + size);
-    strncpyz(ff->filename, filename, FILENAME_MAX);
+
+    //strncpyz(ff->filename, filename, FILENAME_MAX);
+    ff->filename[0] = '\0';
 
     ff->fd = fd;
     ff->size = size;
@@ -406,7 +427,8 @@ procfile *procfile_reopen(procfile *ff, const char *filename, const char *separa
         return NULL;
     }
 
-    strncpyz(ff->filename, filename, FILENAME_MAX);
+    //strncpyz(ff->filename, filename, FILENAME_MAX);
+    ff->filename[0] = '\0';
 
     ff->flags = flags;
 
@@ -423,7 +445,7 @@ void procfile_print(procfile *ff) {
     size_t lines = procfile_lines(ff), l;
     char *s;
 
-    debug(D_PROCFILE, "File '%s' with %zu lines and %zu words", ff->filename, ff->lines->len, ff->words->len);
+    debug(D_PROCFILE, "File '%s' with %zu lines and %zu words", procfile_filename(ff), ff->lines->len, ff->words->len);
 
     for(l = 0; likely(l < lines) ;l++) {
         size_t words = procfile_linewords(ff, l);
index a586ba48d461ef617ebc4e4cc0012371a9c1f644..dae5a0fc272092deaea95dd565392ae77a9ee5b0 100644 (file)
@@ -59,7 +59,8 @@ typedef struct {
 #define PROCFILE_FLAG_NO_ERROR_ON_FILE_IO 0x00000001
 
 typedef struct {
-    char filename[FILENAME_MAX + 1];
+    char filename[FILENAME_MAX + 1]; // not populated until procfile_filename() is called
+
     uint32_t flags;
     int fd;               // the file desriptor
     size_t len;           // the bytes we have placed into data
@@ -89,6 +90,8 @@ extern void procfile_print(procfile *ff);
 extern void procfile_set_quotes(procfile *ff, const char *quotes);
 extern void procfile_set_open_close(procfile *ff, const char *open, const char *close);
 
+extern char *procfile_filename(procfile *ff);
+
 // ----------------------------------------------------------------------------
 
 // set this to 1, to have procfile adapt its initial buffer allocation to the max allocation used so far
index 067475006d03675171db9f37a37120db4cd274ac..568d04e8d9275e20291d2532d77c96973d88cdde 100644 (file)
@@ -541,7 +541,7 @@ static void rrdr_dump(RRDR *r)
 }
 */
 
-void rrdr_disable_not_selected_dimensions(RRDR *r, const char *dims)
+void rrdr_disable_not_selected_dimensions(RRDR *r, uint32_t options, const char *dims)
 {
     char b[strlen(dims) + 1];
     char *o = b, *tok;
@@ -567,7 +567,8 @@ void rrdr_disable_not_selected_dimensions(RRDR *r, const char *dims)
                 // since the user needs this dimension
                 // make it appear as NONZERO, to return it
                 // even if the dimension has only zeros
-                r->od[c] |= RRDR_NONZERO;
+                // unless option non_zero is set
+                if (!(options & RRDR_OPTION_NONZERO)) r->od[c] |= RRDR_NONZERO;
             }
         }
     }
@@ -1778,7 +1779,7 @@ int rrd2value(RRDSET *st, BUFFER *wb, calculated_number *n, const char *dimensio
     options = rrdr_check_options(r, options, dimensions);
 
     if(dimensions)
-        rrdr_disable_not_selected_dimensions(r, dimensions);
+        rrdr_disable_not_selected_dimensions(r, options, dimensions);
 
     if(db_after)  *db_after  = r->after;
     if(db_before) *db_before = r->before;
@@ -1806,7 +1807,7 @@ int rrd2format(RRDSET *st, BUFFER *wb, BUFFER *dimensions, uint32_t format, long
     options = rrdr_check_options(r, options, (dimensions)?buffer_tostring(dimensions):NULL);
 
     if(dimensions)
-        rrdr_disable_not_selected_dimensions(r, buffer_tostring(dimensions));
+        rrdr_disable_not_selected_dimensions(r, options, buffer_tostring(dimensions));
 
     if(latest_timestamp && rrdr_rows(r) > 0)
         *latest_timestamp = r->before;
index 2b7254f47aa3b34607328b0d9d8143db42d19462..4914b2ed99a839e595d01be6586b1213d5323c85 100644 (file)
@@ -145,6 +145,7 @@ void read_cgroup_plugin_configuration() {
 
     enabled_cgroup_patterns = simple_pattern_create(
             config_get("plugin:cgroups", "enable by default cgroups matching",
+                    " /system.slice/docker-*.scope "
                     " !*.mount "
                     " !*.partition "
                     " !*.scope "
@@ -180,6 +181,8 @@ void read_cgroup_plugin_configuration() {
 
     enabled_cgroup_renames = simple_pattern_create(
             config_get("plugin:cgroups", "run script to rename cgroups matching",
+                    " *docker* "
+                    " *lxc* "
                     " !/ "
                     " !*.mount "
                     " !*.partition "
index cac365ab14e719d4f26d175b3ab8e0eb8fa0a07b..bdc1608ed2889087d90f214d41e417d12bf4c6c3 100644 (file)
@@ -368,10 +368,226 @@ cleanup:
     return len - i;
 }
 
-static inline const char *fix_units(const char *units) {
-    if(!units || !*units || !strcmp(units, "empty") || !strcmp(units, "null")) return "";
-    if(!strcmp(units, "percentage") || !strcmp(units, "percent") || !strcmp(units, "pcent")) return "%";
-    return units;
+static inline char *format_value_with_precision_and_unit(char *value_string, size_t value_string_len, calculated_number value, const char *units, int precision) {
+    if(unlikely(isnan(value) || isinf(value)))
+        value = 0.0;
+
+    char *separator = "";
+    if(unlikely(isalnum(*units)))
+        separator = " ";
+
+    if(precision < 0) {
+        int len, lstop = 0, trim_zeros = 1;
+
+        calculated_number abs = value;
+        if(isless(value, 0)) {
+            lstop = 1;
+            abs = -value;
+        }
+
+        if(isgreaterequal(abs, 1000)) {
+            len = snprintfz(value_string, value_string_len, "%0.0Lf", (long double) value);
+            trim_zeros = 0;
+        }
+        else if(isgreaterequal(abs, 100)) len = snprintfz(value_string, value_string_len, "%0.1Lf", (long double) value);
+        else if(isgreaterequal(abs, 1))   len = snprintfz(value_string, value_string_len, "%0.2Lf", (long double) value);
+        else if(isgreaterequal(abs, 0.1)) len = snprintfz(value_string, value_string_len, "%0.3Lf", (long double) value);
+        else                              len = snprintfz(value_string, value_string_len, "%0.4Lf", (long double) value);
+
+        if(unlikely(trim_zeros)) {
+            int l;
+            // remove trailing zeros from the decimal part
+            for(l = len - 1; l > lstop; l--) {
+                if(likely(value_string[l] == '0')) {
+                    value_string[l] = '\0';
+                    len--;
+                }
+
+                else if(unlikely(value_string[l] == '.')) {
+                    value_string[l] = '\0';
+                    len--;
+                    break;
+                }
+
+                else
+                    break;
+            }
+        }
+
+        if(unlikely(len <= 0)) len = 1;
+        snprintfz(&value_string[len], value_string_len - len, "%s%s", separator, units);
+    }
+    else {
+        if(precision > 50) precision = 50;
+        snprintfz(value_string, value_string_len, "%0.*Lf%s%s", precision, (long double) value, separator, units);
+    }
+
+    return value_string;
+}
+
+inline char *format_value_and_unit(char *value_string, size_t value_string_len, calculated_number value, const char *units, int precision) {
+    static uint32_t
+            hash_seconds = 0,
+            hash_seconds_ago = 0,
+            hash_minutes = 0,
+            hash_minutes_ago = 0,
+            hash_hours = 0,
+            hash_hours_ago = 0,
+            hash_onoff = 0,
+            hash_updown = 0,
+            hash_okerror = 0,
+            hash_okfailed = 0,
+            hash_empty = 0,
+            hash_null = 0,
+            hash_percentage = 0,
+            hash_percent = 0,
+            hash_pcent = 0;
+
+    if(unlikely(!hash_seconds)) {
+        hash_seconds     = simple_hash("seconds");
+        hash_seconds_ago = simple_hash("seconds ago");
+        hash_minutes     = simple_hash("minutes");
+        hash_minutes_ago = simple_hash("minutes ago");
+        hash_hours       = simple_hash("hours");
+        hash_hours_ago   = simple_hash("hours ago");
+        hash_onoff       = simple_hash("on/off");
+        hash_updown      = simple_hash("up/down");
+        hash_okerror     = simple_hash("ok/error");
+        hash_okfailed    = simple_hash("ok/failed");
+        hash_empty       = simple_hash("empty");
+        hash_null        = simple_hash("null");
+        hash_percentage  = simple_hash("percentage");
+        hash_percent     = simple_hash("percent");
+        hash_pcent       = simple_hash("pcent");
+    }
+
+    if(unlikely(!units)) units = "";
+
+    uint32_t hash_units = simple_hash(units);
+
+    if(unlikely((hash_units == hash_seconds && !strcmp(units, "seconds")) || (hash_units == hash_seconds_ago && !strcmp(units, "seconds ago")))) {
+        if(value == 0.0) {
+            snprintfz(value_string, value_string_len, "%s", "now");
+            return value_string;
+        }
+        else if(isnan(value) || isinf(value)) {
+            snprintfz(value_string, value_string_len, "%s", "never");
+            return value_string;
+        }
+
+        const char *suffix = (hash_units == hash_seconds_ago)?" ago":"";
+
+        size_t s = (size_t)value;
+        size_t d = s / 86400;
+        s = s % 86400;
+
+        size_t h = s / 3600;
+        s = s % 3600;
+
+        size_t m = s / 60;
+        s = s % 60;
+
+        if(d)
+            snprintfz(value_string, value_string_len, "%zu %s %02zu:%02zu:%02zu%s", d, (d == 1)?"day":"days", h, m, s, suffix);
+        else
+            snprintfz(value_string, value_string_len, "%02zu:%02zu:%02zu%s", h, m, s, suffix);
+
+        return value_string;
+    }
+
+    else if(unlikely((hash_units == hash_minutes && !strcmp(units, "minutes")) || (hash_units == hash_minutes_ago && !strcmp(units, "minutes ago")))) {
+        if(value == 0.0) {
+            snprintfz(value_string, value_string_len, "%s", "now");
+            return value_string;
+        }
+        else if(isnan(value) || isinf(value)) {
+            snprintfz(value_string, value_string_len, "%s", "never");
+            return value_string;
+        }
+
+        const char *suffix = (hash_units == hash_minutes_ago)?" ago":"";
+
+        size_t m = (size_t)value;
+        size_t d = m / (60 * 24);
+        m = m % (60 * 24);
+
+        size_t h = m / 60;
+        m = m % 60;
+
+        if(d)
+            snprintfz(value_string, value_string_len, "%zud %02zuh %02zum%s", d, h, m, suffix);
+        else
+            snprintfz(value_string, value_string_len, "%zuh %zum%s", h, m, suffix);
+
+        return value_string;
+    }
+
+    else if(unlikely((hash_units == hash_hours && !strcmp(units, "hours")) || (hash_units == hash_hours_ago && !strcmp(units, "hours ago")))) {
+        if(value == 0.0) {
+            snprintfz(value_string, value_string_len, "%s", "now");
+            return value_string;
+        }
+        else if(isnan(value) || isinf(value)) {
+            snprintfz(value_string, value_string_len, "%s", "never");
+            return value_string;
+        }
+
+        const char *suffix = (hash_units == hash_hours_ago)?" ago":"";
+
+        size_t h = (size_t)value;
+        size_t d = h / 24;
+        h = h % 24;
+
+        if(d)
+            snprintfz(value_string, value_string_len, "%zud %zuh%s", d, h, suffix);
+        else
+            snprintfz(value_string, value_string_len, "%zuh%s", h, suffix);
+
+        return value_string;
+    }
+
+    else if(unlikely(hash_units == hash_onoff && !strcmp(units, "on/off"))) {
+        snprintfz(value_string, value_string_len, "%s", (value != 0.0)?"on":"off");
+        return value_string;
+    }
+
+    else if(unlikely(hash_units == hash_updown && !strcmp(units, "up/down"))) {
+        snprintfz(value_string, value_string_len, "%s", (value != 0.0)?"up":"down");
+        return value_string;
+    }
+
+    else if(unlikely(hash_units == hash_okerror && !strcmp(units, "ok/error"))) {
+        snprintfz(value_string, value_string_len, "%s", (value != 0.0)?"ok":"error");
+        return value_string;
+    }
+
+    else if(unlikely(hash_units == hash_okfailed && !strcmp(units, "ok/failed"))) {
+        snprintfz(value_string, value_string_len, "%s", (value != 0.0)?"ok":"failed");
+        return value_string;
+    }
+
+    else if(unlikely(hash_units == hash_empty && !strcmp(units, "empty")))
+        units = "";
+
+    else if(unlikely(hash_units == hash_null && !strcmp(units, "null")))
+        units = "";
+
+    else if(unlikely(hash_units == hash_percentage && !strcmp(units, "percentage")))
+        units = "%";
+
+    else if(unlikely(hash_units == hash_percent && !strcmp(units, "percent")))
+        units = "%";
+
+    else if(unlikely(hash_units == hash_pcent && !strcmp(units, "pcent")))
+        units = "%";
+
+
+    if(unlikely(isnan(value) || isinf(value))) {
+        strcpy(value_string, "-");
+        return value_string;
+    }
+
+    return format_value_with_precision_and_unit(value_string, value_string_len, value, units, precision);
 }
 
 static inline const char *color_map(const char *color) {
@@ -391,7 +607,13 @@ static inline const char *color_map(const char *color) {
     return color;
 }
 
-static inline void calc_colorz(const char *color, char *final, size_t len, calculated_number value, int value_is_null) {
+static inline void calc_colorz(const char *color, char *final, size_t len, calculated_number value) {
+    int value_is_null = 0;
+    if(isnan(value) || isinf(value)) {
+        value = 0.0;
+        value_is_null = 1;
+    }
+
     char color_buffer[256 + 1] = "";
     char value_buffer[256 + 1] = "";
     char comparison = '>';
@@ -501,7 +723,7 @@ static inline void calc_colorz(const char *color, char *final, size_t len, calcu
 // colors
 #define COLOR_STRING_SIZE 100
 
-void buffer_svg(BUFFER *wb, const char *label, calculated_number value, const char *units, const char *label_color, const char *value_color, int value_is_null, int precision) {
+void buffer_svg(BUFFER *wb, const char *label, calculated_number value, const char *units, const char *label_color, const char *value_color, int precision) {
     char      label_buffer[LABEL_STRING_SIZE + 1]
             , value_color_buffer[COLOR_STRING_SIZE + 1]
             , value_string[VALUE_STRING_SIZE + 1]
@@ -516,102 +738,10 @@ void buffer_svg(BUFFER *wb, const char *label, calculated_number value, const ch
         label_color = "#555";
 
     if(unlikely(!value_color || !*value_color))
-        value_color = (value_is_null)?"#999":"#4c1";
-
-    units = fix_units(units);
-    calc_colorz(value_color, value_color_buffer, COLOR_STRING_SIZE, value, value_is_null);
-
-    char *separator = "";
-    if(unlikely(isalnum(*units)))
-        separator = " ";
-
-    if(unlikely(!strcmp(units, "seconds"))) {
-        size_t s = (size_t)value;
-        size_t d = s / 86400;
-        s = s % 86400;
-
-        size_t h = s / 3600;
-        s = s % 3600;
-
-        size_t m = s / 60;
-        s = s % 60;
-
-        if(d)
-            snprintfz(value_string, VALUE_STRING_SIZE, "%zu %s %02zu:%02zu:%02zu", d, (d == 1)?"day":"days", h, m, s);
-        else
-            snprintfz(value_string, VALUE_STRING_SIZE, "%02zu:%02zu:%02zu", h, m, s);
-    }
-
-    else if(unlikely(!strcmp(units, "minutes"))) {
-        size_t m = (size_t)value;
-        size_t d = m / (60 * 24);
-        m = m % (60 * 24);
-
-        size_t h = m / 60;
-        m = m % 60;
+        value_color = (isnan(value) || isinf(value))?"#999":"#4c1";
 
-        if(d)
-            snprintfz(value_string, VALUE_STRING_SIZE, "%zud %02zuh %02zum", d, h, m);
-        else
-            snprintfz(value_string, VALUE_STRING_SIZE, "%zuh %zum", h, m);
-    }
-
-    else if(unlikely(!strcmp(units, "hours"))) {
-        size_t h = (size_t)value;
-        size_t d = h / 24;
-        h = h % 24;
-
-        if(d)
-            snprintfz(value_string, VALUE_STRING_SIZE, "%zud %zuh", d, h);
-        else
-            snprintfz(value_string, VALUE_STRING_SIZE, "%zuh", h);
-    }
-
-    else if(unlikely(value_is_null))
-        strcpy(value_string, "-");
-
-    else if(precision < 0) {
-        int len, lstop = 0, trim_zeros = 1;
-
-        calculated_number abs = value;
-        if(isless(value, 0)) {
-            lstop = 1;
-            abs = -value;
-        }
-
-        if(isgreaterequal(abs, 1000))     { len = snprintfz(value_string, VALUE_STRING_SIZE, "%0.0Lf", (long double)value); trim_zeros = 0; }
-        else if(isgreaterequal(abs, 100))   len = snprintfz(value_string, VALUE_STRING_SIZE, "%0.1Lf", (long double)value);
-        else if(isgreaterequal(abs, 1))     len = snprintfz(value_string, VALUE_STRING_SIZE, "%0.2Lf", (long double)value);
-        else if(isgreaterequal(abs, 0.1))   len = snprintfz(value_string, VALUE_STRING_SIZE, "%0.3Lf", (long double)value);
-        else                                len = snprintfz(value_string, VALUE_STRING_SIZE, "%0.4Lf", (long double)value);
-
-        if(unlikely(trim_zeros)) {
-            int l;
-            // remove trailing zeros from the decimal part
-            for(l = len - 1; l > lstop ; l--) {
-                if(likely(value_string[l] == '0')) {
-                    value_string[l] = '\0';
-                    len--;
-                }
-
-                else if(unlikely(value_string[l] == '.')) {
-                    value_string[l] = '\0';
-                    len--;
-                    break;
-                }
-
-                else
-                    break;
-            }
-        }
-
-        if(len >= 0)
-            snprintfz(&value_string[len], VALUE_STRING_SIZE - len, "%s%s", separator, units);
-    }
-    else {
-        if(precision > 50) precision = 50;
-        snprintfz(value_string, VALUE_STRING_SIZE, "%0.*Lf%s%s", precision, (long double)value, separator, units);
-    }
+    calc_colorz(value_color, value_color_buffer, COLOR_STRING_SIZE, value);
+    format_value_and_unit(value_string, VALUE_STRING_SIZE, value, units, precision);
 
     // we need to copy the label, since verdana11_width may write to it
     strncpyz(label_buffer, label, LABEL_STRING_SIZE);
index 1281847eb49c4721c7d00cfe3cd61257f8981215..49f73e445c0c2b1210a57ce830e4e44dc574b791 100644 (file)
@@ -1,6 +1,7 @@
 #ifndef NETDATA_WEB_BUFFER_SVG_H
 #define NETDATA_WEB_BUFFER_SVG_H 1
 
-extern void buffer_svg(BUFFER *wb, const char *label, calculated_number value, const char *units, const char *label_color, const char *value_color, int value_is_null, int precision);
+extern void buffer_svg(BUFFER *wb, const char *label, calculated_number value, const char *units, const char *label_color, const char *value_color, int precision);
+extern char *format_value_and_unit(char *value_string, size_t value_string_len, calculated_number value, const char *units, int precision);
 
 #endif /* NETDATA_WEB_BUFFER_SVG_H */
index 4b6ccf6469e7d1c9070be807028dd949c6294941..5acb44d5b0537e150e821981601bc71ea0c7c5ae 100644 (file)
@@ -896,7 +896,7 @@ int web_client_api_request_v1_badge(struct web_client *w, char *url) {
     if(!st) st = rrdset_find_byname(chart);
     if(!st) {
         buffer_no_cacheable(w->response.data);
-        buffer_svg(w->response.data, "chart not found", 0, "", NULL, NULL, 1, -1);
+        buffer_svg(w->response.data, "chart not found", NAN, "", NULL, NULL, -1);
         ret = 200;
         goto cleanup;
     }
@@ -906,7 +906,7 @@ int web_client_api_request_v1_badge(struct web_client *w, char *url) {
         rc = rrdcalc_find(st, alarm);
         if (!rc) {
             buffer_no_cacheable(w->response.data);
-            buffer_svg(w->response.data, "alarm not found", 0, "", NULL, NULL, 1, -1);
+            buffer_svg(w->response.data, "alarm not found", NAN, "", NULL, NULL, -1);
             ret = 200;
             goto cleanup;
         }
@@ -982,9 +982,6 @@ int web_client_api_request_v1_badge(struct web_client *w, char *url) {
             );
 
     if(rc) {
-        calculated_number n = rc->value;
-        if(isnan(n) || isinf(n)) n = 0;
-
         if (refresh > 0) {
             buffer_sprintf(w->response.header, "Refresh: %d\r\n", refresh);
             w->response.data->expires = now_realtime_sec() + refresh;
@@ -1020,19 +1017,18 @@ int web_client_api_request_v1_badge(struct web_client *w, char *url) {
         }
 
         buffer_svg(w->response.data,
-                   label,
-                   rc->value * multiply / divide,
-                   units,
-                   label_color,
-                   value_color,
-                   0,
-                   precision);
+                label,
+                (isnan(rc->value)||isinf(rc->value)) ? rc->value : rc->value * multiply / divide,
+                units,
+                label_color,
+                value_color,
+                precision);
         ret = 200;
     }
     else {
         time_t latest_timestamp = 0;
         int value_is_null = 1;
-        calculated_number n = 0;
+        calculated_number n = NAN;
         ret = 500;
 
         // if the collected value is too old, don't calculate its value
@@ -1065,13 +1061,12 @@ int web_client_api_request_v1_badge(struct web_client *w, char *url) {
 
         // render the badge
         buffer_svg(w->response.data,
-                   label,
-                   n * multiply / divide,
-                   units,
-                   label_color,
-                   value_color,
-                   value_is_null,
-                   precision);
+                label,
+                (value_is_null)?NAN:(n * multiply / divide),
+                units,
+                label_color,
+                value_color,
+                precision);
     }
 
 cleanup:
index 6fc294204249e0f3cd331a3b1c8f350b1f8cfeb0..346a8e9c10d0e3b5e42eb7364c6146c461a7d4df 100644 (file)
@@ -6075,7 +6075,7 @@ var NETDATA = window.NETDATA || {};
 
             var name = entry.name.replace(/_/g, ' ');
             var status = entry.status.toLowerCase();
-            var title = name + ' = ' + ((value === null)?'NaN':Math.floor(value)).toString() + ' ' + entry.units;
+            var title = name + ' = ' + entry.value_string.toString();
             var tag = entry.alarm_id;
             var icon = 'images/seo-performance-128.png';
             var interaction = false;
@@ -6104,7 +6104,11 @@ var NETDATA = window.NETDATA || {};
                         // console.log('alarm' + entry.unique_id + ' switch to CLEAR from ' + entry.old_status);
                         return;
                     }
-                    title = name + ' back to normal';
+                    if(entry.no_clear_notification === true) {
+                        // console.log('alarm' + entry.unique_id + ' is CLEAR but has no_clear_notification flag');
+                        return;
+                    }
+                    title = name + ' back to normal (' + entry.value_string.toString() + ')';
                     icon = 'images/check-mark-2-128-green.png'
                     interaction = false;
                     break;
index 24a579cfe8813fb7cc7a2b36f13b79925b4db779..7d7f726e3b3783d15ec857d5676737cd5abea66a 100644 (file)
@@ -12,7 +12,7 @@ netdataDashboard.menu = {
     },
 
     'services': {
-        title: 'Systemd Services',
+        title: 'systemd Services',
         icon: '<i class="fa fa-cogs" aria-hidden="true"></i>',
         info: 'Resources utilization of systemd services.'
     },
index d8e1282344daca504ae02de53fc4991bfa4988d2..ea51cc13245e9364b687d4e99b14b6dd9798c9a2 100644 (file)
 
                 function alarm_to_html(alarm, full) {
                     var chart = options.data.charts[alarm.chart];
+                    if(typeof(chart) === 'undefined') {
+                        // this means the charts loaded are incomplete
+                        // probably netdata was restarted and more charts
+                        // are now available.
+                        return '';
+                    }
+
                     var has_alarm = ((typeof alarm.warn !== 'undefined' || typeof alarm.crit !== 'undefined')?true:false);
 
                     var role_href = ((has_alarm === true)?('<br/>&nbsp;<br/>role: <b>' + alarm.recipient + '</b><br/>&nbsp;<br/><b><i class="fa fa-line-chart" aria-hidden="true"></i></b><small>&nbsp;&nbsp;<a href="#" onClick="NETDATA.alarms.scrollToChart(\'' + alarm.chart + '\'); $(\'#alarmsModal\').modal(\'hide\'); return false;">jump to chart</a></small>'):('&nbsp;'));
                         + ((typeof alarm.crit !== 'undefined')?('<tr><td width="10%" style="text-align:right">critical&nbsp;when</td><td><span style="font-family: monospace; color: #e05d44; font-weight: bold;">' + alarm.crit + '</span></td></tr>'):'');
 
                     if(full === true) {
-                            html += ((typeof alarm.lookup_after !== 'undefined')?('<tr><td width="10%" style="text-align:right">db&nbsp;lookup</td><td>' + alarm_lookup_explain(alarm, chart) + '</td></tr>'):'')
+                        var units = chart.units;
+                        if(units === '%') units = '&#37;';
+
+                        html += ((typeof alarm.lookup_after !== 'undefined')?('<tr><td width="10%" style="text-align:right">db&nbsp;lookup</td><td>' + alarm_lookup_explain(alarm, chart) + '</td></tr>'):'')
                             + ((typeof alarm.calc !== 'undefined')?('<tr><td width="10%" style="text-align:right">calculation</td><td><span style="font-family: monospace;">' + alarm.calc + '</span></td></tr>'):'')
-                            + ((chart.green !== null)?('<tr><td width="10%" style="text-align:right">green&nbsp;threshold</td><td><code>' + chart.green + ' ' + chart.units + '</code></td></tr>'):'')
-                            + ((chart.red !== null)?('<tr><td width="10%" style="text-align:right">red&nbsp;threshold</td><td><code>' + chart.red + ' ' + chart.units + '</code></td></tr>'):'');
+                            + ((chart.green !== null)?('<tr><td width="10%" style="text-align:right">green&nbsp;threshold</td><td><code>' + chart.green + ' ' + units + '</code></td></tr>'):'')
+                            + ((chart.red !== null)?('<tr><td width="10%" style="text-align:right">red&nbsp;threshold</td><td><code>' + chart.red + ' ' + units + '</code></td></tr>'):'');
                     }
 
                     var delay = '';
                                 switchable: false,
                                 sortable: true
                             },
+                            {
+                                field: 'value_string',
+                                title: 'Friendly Value',
+                                titleTooltip: 'The value of the alarm, that triggered this event',
+                                align: 'right',
+                                valign: 'middle',
+                                sortable: true
+                            },
+                            {
+                                field: 'old_value_string',
+                                title: 'Friendly Old Value',
+                                titleTooltip: 'The value of the alarm, just before this event',
+                                align: 'right',
+                                valign: 'middle',
+                                visible: false,
+                                sortable: true
+                            },
                             {
                                 field: 'old_value',
                                 title: 'Old Value',
                                 },
                                 align: 'right',
                                 valign: 'middle',
+                                visible: false,
                                 sortable: true
                             },
                             {
                                 titleTooltip: 'The units of the value of the alarm',
                                 align: 'left',
                                 valign: 'middle',
+                                visible: false,
                                 sortable: true
                             },
                             {
     </div>
 </body>
 </html>
-<script type="text/javascript" src="dashboard.js?v20170118-11"></script>
+<script type="text/javascript" src="dashboard.js?v20170127-1"></script>