From: Costa Tsaousis Date: Sun, 22 Jan 2017 04:57:07 +0000 (+0200) Subject: Merge pull request #1594 from ktsaou/master X-Git-Tag: v1.5.0~7 X-Git-Url: https://arthur.barton.de/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=053f5a74caedbdf45658348825ddc9762814c6a4;hp=bb3d2c36083556b24c5311fefcd0d531a613e2d6;p=netdata.git Merge pull request #1594 from ktsaou/master data collection optimizations --- diff --git a/conf.d/Makefile.am b/conf.d/Makefile.am index 27777144..a7e8224c 100644 --- a/conf.d/Makefile.am +++ b/conf.d/Makefile.am @@ -28,6 +28,7 @@ dist_pythonconfig_DATA = \ python.d/bind_rndc.conf \ python.d/cpufreq.conf \ python.d/dovecot.conf \ + python.d/elasticsearch.conf \ python.d/example.conf \ python.d/exim.conf \ python.d/fail2ban.conf \ @@ -60,6 +61,7 @@ dist_healthconfig_DATA = \ health.d/bind_rndc.conf \ health.d/cpu.conf \ health.d/disks.conf \ + health.d/elasticsearch.conf \ health.d/entropy.conf \ health.d/haproxy.conf \ health.d/ipc.conf \ diff --git a/conf.d/health.d/elasticsearch.conf b/conf.d/health.d/elasticsearch.conf new file mode 100644 index 00000000..dffd4096 --- /dev/null +++ b/conf.d/health.d/elasticsearch.conf @@ -0,0 +1,9 @@ + alarm: elasticsearch_last_collected + on: elasticsearch_local.cluster_health_status + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + info: number of seconds since the last successful data collection + to: sysadmin diff --git a/conf.d/python.d/elasticsearch.conf b/conf.d/python.d/elasticsearch.conf new file mode 100644 index 00000000..1faee858 --- /dev/null +++ b/conf.d/python.d/elasticsearch.conf @@ -0,0 +1,72 @@ +# netdata python.d.plugin configuration for elasticsearch stats +# +# This file is in YaML format. 
Generally the format is: +# +# name: value +# +# There are 2 sections: +# - global variables +# - one or more JOBS +# +# JOBS allow you to collect values from multiple sources. +# Each source will have its own set of charts. +# +# JOB parameters have to be indented (using spaces only, example below). + +# ---------------------------------------------------------------------- +# Global Variables +# These variables set the defaults for all JOBs, however each JOB +# may define its own, overriding the defaults. + +# update_every sets the default data collection frequency. +# If unset, the python.d.plugin default is used. +# update_every: 1 + +# priority controls the order of charts at the netdata dashboard. +# Lower numbers move the charts towards the top of the page. +# If unset, the default for python.d.plugin is used. +# priority: 60000 + +# retries sets the number of retries to be made in case of failures. +# If unset, the default for python.d.plugin is used. +# Attempts to restore the service are made once every update_every +# and only if the module has collected values in the past. +# retries: 5 + +# ---------------------------------------------------------------------- +# JOBS (data collection sources) +# +# The default JOBS share the same *name*. JOBS with the same name +# are mutually exclusive. Only one of them will be allowed running at +# any time. This allows autodetection to try several alternatives and +# pick the one that works. +# +# Any number of jobs is supported. +# +# All python.d.plugin JOBS (for all its modules) support a set of +# predefined parameters. 
These are: +# +# job_name: +# name: myname # the JOB's name as it will appear at the +# # dashboard (by default is the job_name) +# # JOBs sharing a name are mutually exclusive +# update_every: 1 # the JOB's data collection frequency +# priority: 60000 # the JOB's order on the dashboard +# retries: 5 # the JOB's number of restoration attempts +# +# In addition to the above, the elasticsearch plugin also supports the following: +# +# host: 'ipaddress' # Server IP address or hostname. +# port: 'port' # Port on which elasticsearch listens. +# cluster_health: False/True # Calls to cluster health elasticsearch API. Enabled by default. +# cluster_stats: False/True # Calls to cluster stats elasticsearch API. Enabled by default. +# +# ---------------------------------------------------------------------- +# AUTO-DETECTION JOBS +# only one of them will run (they have the same name) +# +#local: +# host: '127.0.0.1' +# port: '9200' +# cluster_health: True +# cluster_stats: True diff --git a/netdata-installer.sh b/netdata-installer.sh index f6b4013c..fa69de19 100755 --- a/netdata-installer.sh +++ b/netdata-installer.sh @@ -694,6 +694,7 @@ run find ./system/ -type f -a \! -name \*.in -a \! -name Makefile\* -a \! -name NETDATA_ADDED_TO_DOCKER=0 NETDATA_ADDED_TO_NGINX=0 NETDATA_ADDED_TO_VARNISH=0 +NETDATA_ADDED_TO_HAPROXY=0 if [ ${UID} -eq 0 ] then portable_add_group netdata @@ -701,6 +702,7 @@ if [ ${UID} -eq 0 ] portable_add_user_to_group docker netdata && NETDATA_ADDED_TO_DOCKER=1 portable_add_user_to_group nginx netdata && NETDATA_ADDED_TO_NGINX=1 portable_add_user_to_group varnish netdata && NETDATA_ADDED_TO_VARNISH=1 + portable_add_user_to_group haproxy netdata && NETDATA_ADDED_TO_HAPROXY=1 if [ -d /etc/logrotate.d -a ! -f /etc/logrotate.d/netdata ] then @@ -1355,6 +1357,16 @@ if [ $? -eq 0 -a "${NETDATA_ADDED_TO_VARNISH}" = "1" ] echo " gpasswd -d netdata varnish" fi +getent group haproxy > /dev/null +if [ $?
-eq 0 -a "${NETDATA_ADDED_TO_HAPROXY}" = "1" ] + then + echo + echo "You may also want to remove the netdata user from the haproxy group" + echo "by running:" + echo " gpasswd -d netdata haproxy" +fi + + UNINSTALL chmod 750 netdata-uninstaller.sh diff --git a/python.d/Makefile.am b/python.d/Makefile.am index 436bceea..7706acd6 100644 --- a/python.d/Makefile.am +++ b/python.d/Makefile.am @@ -14,6 +14,7 @@ dist_python_SCRIPTS = \ cpufreq.chart.py \ cpuidle.chart.py \ dovecot.chart.py \ + elasticsearch.chart.py \ example.chart.py \ exim.chart.py \ fail2ban.chart.py \ diff --git a/python.d/elasticsearch.chart.py b/python.d/elasticsearch.chart.py new file mode 100644 index 00000000..ff841f17 --- /dev/null +++ b/python.d/elasticsearch.chart.py @@ -0,0 +1,402 @@ +# -*- coding: utf-8 -*- +# Description: elastic search node stats netdata python.d module +# Author: l2isbad + +from base import UrlService +from requests import get +from socket import gethostbyname +try: + from queue import Queue +except ImportError: + from Queue import Queue +from threading import Thread + +# default module values (can be overridden per job in `config`) +# update_every = 2 +update_every = 5 +priority = 60000 +retries = 60 + +# charts order (can be overridden if you want less charts, or different order) +ORDER = ['search_perf_total', 'search_perf_time', 'search_latency', 'index_perf_total', 'index_perf_time', + 'index_latency', 'jvm_mem_heap', 'jvm_gc_count', 'jvm_gc_time', 'host_metrics_file_descriptors', + 'host_metrics_http', 'host_metrics_transport', 'thread_pool_qr', 'fdata_cache', 'fdata_ev_tr', + 'cluster_health_status', 'cluster_health_nodes', 'cluster_health_shards', 'cluster_stats_nodes', + 'cluster_stats_query_cache', 'cluster_stats_docs', 'cluster_stats_store', 'cluster_stats_indices_shards'] + +CHARTS = { + 'search_perf_total': { + 'options': [None, 'Number of queries, fetches', 'queries', 'Search performance', 'es.search_query', 'stacked'], + 'lines': [ + ['query_total', 
'search_total', 'incremental'], + ['fetch_total', 'fetch_total', 'incremental'], + ['query_current', 'search_current', 'absolute'], + ['fetch_current', 'fetch_current', 'absolute'] + ]}, + 'search_perf_time': { + 'options': [None, 'Time spent on queries, fetches', 'seconds', 'Search performance', 'es.search_time', 'stacked'], + 'lines': [ + ['query_time_in_millis', 'query', 'incremental', 1, 1000], + ['fetch_time_in_millis', 'fetch', 'incremental', 1, 1000] + ]}, + 'search_latency': { + 'options': [None, 'Query and fetch latency', 'ms', 'Search performance', 'es.search_latency', 'stacked'], + 'lines': [ + ['query_latency', 'query', 'absolute', 1, 1000], + ['fetch_latency', 'fetch', 'absolute', 1, 1000] + ]}, + 'index_perf_total': { + 'options': [None, 'Number of documents indexed, index refreshes, flushes', 'documents/indexes', + 'Indexing performance', 'es.index_doc', 'stacked'], + 'lines': [ + ['indexing_index_total', 'indexed', 'incremental'], + ['refresh_total', 'refreshes', 'incremental'], + ['flush_total', 'flushes', 'incremental'], + ['indexing_index_current', 'indexed_current', 'absolute'], + ]}, + 'index_perf_time': { + # context must differ from search_perf_time's 'es.search_time' + 'options': [None, 'Time spent on indexing, refreshing, flushing', 'seconds', 'Indexing performance', + 'es.index_time', 'stacked'], + 'lines': [ + ['indexing_index_time_in_millis', 'indexing', 'incremental', 1, 1000], + ['refresh_total_time_in_millis', 'refreshing', 'incremental', 1, 1000], + ['flush_total_time_in_millis', 'flushing', 'incremental', 1, 1000] + ]}, + 'index_latency': { + 'options': [None, 'Indexing and flushing latency', 'ms', 'Indexing performance', + 'es.index_latency', 'stacked'], + 'lines': [ + ['indexing_latency', 'indexing', 'absolute', 1, 1000], + ['flushing_latency', 'flushing', 'absolute', 1, 1000] + ]}, + 'jvm_mem_heap': { + 'options': [None, 'JVM heap currently in use/committed', 'percent/MB', 'Memory usage and gc', + 'es.jvm_heap', 'area'], + 'lines': [ + ['jvm_heap_percent', 'inuse', 'absolute'], +
['jvm_heap_commit', 'commit', 'absolute', -1, 1048576] + ]}, + 'jvm_gc_count': { + 'options': [None, 'Count of garbage collections', 'counts', 'Memory usage and gc', 'es.gc_count', 'stacked'], + 'lines': [ + ['young_collection_count', 'young', 'incremental'], + ['old_collection_count', 'old', 'incremental'] + ]}, + 'jvm_gc_time': { + 'options': [None, 'Time spent on garbage collections', 'ms', 'Memory usage and gc', 'es.gc_time', 'stacked'], + 'lines': [ + ['young_collection_time_in_millis', 'young', 'incremental'], + ['old_collection_time_in_millis', 'old', 'incremental'] + ]}, + 'thread_pool_qr': { + 'options': [None, 'Number of queued/rejected threads in thread pool', 'threads', 'Queues and rejections', + 'es.qr', 'stacked'], + 'lines': [ + ['bulk_queue', 'bulk_queue', 'absolute'], + ['index_queue', 'index_queue', 'absolute'], + ['search_queue', 'search_queue', 'absolute'], + ['merge_queue', 'merge_queue', 'absolute'], + ['bulk_rejected', 'bulk_rej', 'absolute'], + ['index_rejected', 'index_rej', 'absolute'], + ['search_rejected', 'search_rej', 'absolute'], + ['merge_rejected', 'merge_rej', 'absolute'] + ]}, + 'fdata_cache': { + 'options': [None, 'Fielddata cache size', 'MB', 'Fielddata cache', 'es.fdata_cache', 'line'], + 'lines': [ + ['index_fdata_mem', 'mem_size', 'absolute', 1, 1048576] + ]}, + 'fdata_ev_tr': { + 'options': [None, 'Fielddata evictions and circuit breaker tripped count', 'number of events', + 'Fielddata cache', 'es.fdata_ev_tr', 'line'], + 'lines': [ + ['index_fdata_evic', 'evictions', 'incremental'], + ['breakers_fdata_trip', 'tripped', 'incremental'] + ]}, + 'cluster_health_nodes': { + 'options': [None, 'Nodes and tasks statistics', 'units', 'Cluster health API', + 'es.cluster_health', 'stacked'], + 'lines': [ + ['health_number_of_nodes', 'nodes', 'absolute'], + ['health_number_of_data_nodes', 'data_nodes', 'absolute'], + ['health_number_of_pending_tasks', 'pending_tasks', 'absolute'], + ['health_number_of_in_flight_fetch', 
'inflight_fetch', 'absolute'] + ]}, + 'cluster_health_status': { + 'options': [None, 'Cluster status', 'status', 'Cluster health API', + 'es.cluster_health_status', 'area'], + 'lines': [ + ['status_green', 'green', 'absolute'], + ['status_red', 'red', 'absolute'], + ['status_foo1', None, 'absolute'], + ['status_foo2', None, 'absolute'], + ['status_foo3', None, 'absolute'], + ['status_yellow', 'yellow', 'absolute'] + ]}, + 'cluster_health_shards': { + 'options': [None, 'Shards statistics', 'shards', 'Cluster health API', + 'es.cluster_health_sharts', 'stacked'], + 'lines': [ + ['health_active_shards', 'active_shards', 'absolute'], + ['health_relocating_shards', 'relocating_shards', 'absolute'], + ['health_unassigned_shards', 'unassigned', 'absolute'], + ['health_delayed_unassigned_shards', 'delayed_unassigned', 'absolute'], + ['health_initializing_shards', 'initializing', 'absolute'], + ['health_active_shards_percent_as_number', 'active_percent', 'absolute'] + ]}, + 'cluster_stats_nodes': { + 'options': [None, 'Nodes statistics', 'nodes', 'Cluster stats API', + 'es.cluster_stats_nodes', 'stacked'], + 'lines': [ + ['count_data_only', 'data_only', 'absolute'], + ['count_master_data', 'master_data', 'absolute'], + ['count_total', 'total', 'absolute'], + ['count_master_only', 'master_only', 'absolute'], + ['count_client', 'client', 'absolute'] + ]}, + 'cluster_stats_query_cache': { + 'options': [None, 'Query cache statistics', 'queries', 'Cluster stats API', + 'es.cluster_stats_query_cache', 'stacked'], + 'lines': [ + ['query_cache_hit_count', 'hit', 'incremental'], + ['query_cache_miss_count', 'miss', 'incremental'] + ]}, + 'cluster_stats_docs': { + 'options': [None, 'Docs statistics', 'count', 'Cluster stats API', + 'es.cluster_stats_docs', 'line'], + 'lines': [ + ['docs_count', 'docs', 'absolute'] + ]}, + 'cluster_stats_store': { + 'options': [None, 'Store statistics', 'MB', 'Cluster stats API', + 'es.cluster_stats_store', 'line'], + 'lines': [ + 
['store_size_in_bytes', 'size', 'absolute', 1, 1048576] + ]}, + 'cluster_stats_indices_shards': { + 'options': [None, 'Indices and shards statistics', 'count', 'Cluster stats API', + 'es.cluster_stats_ind_sha', 'stacked'], + 'lines': [ + ['indices_count', 'indices', 'absolute'], + ['shards_total', 'shards', 'absolute'] + ]}, + 'host_metrics_transport': { + 'options': [None, 'Cluster communication transport metrics', 'kbit/s', 'Host metrics', + 'es.host_metrics_transport', 'area'], + 'lines': [ + ['transport_rx_size_in_bytes', 'in', 'incremental', 8, 1000], + ['transport_tx_size_in_bytes', 'out', 'incremental', -8, 1000] + ]}, + 'host_metrics_file_descriptors': { + 'options': [None, 'Available file descriptors in percent', 'percent', 'Host metrics', + 'es.host_metrics_descriptors', 'area'], + 'lines': [ + ['file_descriptors_used', 'used', 'absolute', 1, 10] + ]}, + 'host_metrics_http': { + 'options': [None, 'Opened HTTP connections', 'connections', 'Host metrics', + 'es.host_metrics_http', 'line'], + 'lines': [ + ['http_current_open', 'opened', 'absolute', 1, 1] + ]} +} + + +class Service(UrlService): + def __init__(self, configuration=None, name=None): + UrlService.__init__(self, configuration=configuration, name=name) + self.order = ORDER + self.definitions = CHARTS + self.host = self.configuration.get('host') + self.port = self.configuration.get('port') + self.user = self.configuration.get('user') + self.password = self.configuration.get('pass') + self.latency = dict() + + def check(self): + # We can't start if host AND port are not both specified + if not all([self.host, self.port]): + return False + + # It is a bad idea to use a hostname. + # Hostname -> ipaddress + try: + self.host = gethostbyname(self.host) + except Exception as e: + self.error(str(e)) + return False + + # HTTP Auth?
NOT TESTED + self.auth = self.user and self.password + + # Create URL for every Elasticsearch API + url_node_stats = 'http://%s:%s/_nodes/_local/stats' % (self.host, self.port) + url_cluster_health = 'http://%s:%s/_cluster/health' % (self.host, self.port) + url_cluster_stats = 'http://%s:%s/_cluster/stats' % (self.host, self.port) + + # Create list of enabled API calls + user_choice = [bool(self.configuration.get('node_stats', True)), + bool(self.configuration.get('cluster_health', True)), + bool(self.configuration.get('cluster_stats', True))] + + avail_methods = [(self._get_node_stats, url_node_stats), + (self._get_cluster_health, url_cluster_health), + (self._get_cluster_stats, url_cluster_stats)] + + # Remove disabled API calls from 'avail methods' + self.methods = [avail_methods[_] for _ in range(len(avail_methods)) if user_choice[_]] + + # Run _get_raw_data for ALL active API calls. + api_result = {} + for method in self.methods: + api_result[method[1]] = (bool(self._get_raw_data(method[1]))) + + # We can start ONLY if every active API call returned a truthy response + if not all(api_result.values()): + self.error('Plugin could not get data from all APIs') + self.error('%s' % api_result) + return False + else: + self.info('%s' % api_result) + self.info('Plugin was started successfully') + + return True + + def _get_raw_data(self, url): + try: + if not self.auth: + raw_data = get(url) + else: + raw_data = get(url, auth=(self.user, self.password)) + except Exception: + return None + + return raw_data + + def _get_data(self): + threads = list() + queue = Queue() + result = dict() + + for method in self.methods: + th = Thread(target=method[0], args=(queue, method[1])) + th.start() + threads.append(th) + + for thread in threads: + thread.join() + + # A worker that raised before queue.put() leaves nothing behind, so drain + # only what was queued instead of blocking forever in queue.get(). + while not queue.empty(): + result.update(queue.get_nowait()) + + return result or None + + def _get_cluster_health(self, queue, url): + """ + Format data received from http request + :return: dict + """ + + data = self._get_raw_data(url) + + if not data: + queue.put({})
+ else: + data = data.json() + + to_netdata = dict() + to_netdata.update(update_key('health', data)) + to_netdata.update({'status_green': 0, 'status_red': 0, 'status_yellow': 0, + 'status_foo1': 0, 'status_foo2': 0, 'status_foo3': 0}) + to_netdata[''.join(['status_', to_netdata.get('health_status', '')])] = 1 + + queue.put(to_netdata) + + def _get_cluster_stats(self, queue, url): + """ + Format data received from http request + :return: dict + """ + + data = self._get_raw_data(url) + + if not data: + queue.put({}) + else: + data = data.json() + + to_netdata = dict() + to_netdata.update(update_key('count', data['nodes']['count'])) + to_netdata.update(update_key('query_cache', data['indices']['query_cache'])) + to_netdata.update(update_key('docs', data['indices']['docs'])) + to_netdata.update(update_key('store', data['indices']['store'])) + to_netdata['indices_count'] = data['indices']['count'] + to_netdata['shards_total'] = data['indices']['shards']['total'] + + queue.put(to_netdata) + + def _get_node_stats(self, queue, url): + """ + Format data received from http request + :return: dict + """ + + data = self._get_raw_data(url) + + if not data: + queue.put({}) + else: + data = data.json() + node = list(data['nodes'].keys())[0] + to_netdata = dict() + # Search performance metrics + to_netdata.update(data['nodes'][node]['indices']['search']) + to_netdata['query_latency'] = self.find_avg(to_netdata['query_total'], + to_netdata['query_time_in_millis'], 'query_latency') + to_netdata['fetch_latency'] = self.find_avg(to_netdata['fetch_total'], + to_netdata['fetch_time_in_millis'], 'fetch_latency') + + # Indexing performance metrics + for key in ['indexing', 'refresh', 'flush']: + to_netdata.update(update_key(key, data['nodes'][node]['indices'].get(key, {}))) + to_netdata['indexing_latency'] = self.find_avg(to_netdata['indexing_index_total'], + to_netdata['indexing_index_time_in_millis'], 'index_latency') + to_netdata['flushing_latency'] = 
self.find_avg(to_netdata['flush_total'], + to_netdata['flush_total_time_in_millis'], 'flush_latency') + # Memory usage and garbage collection + to_netdata.update(update_key('young', data['nodes'][node]['jvm']['gc']['collectors']['young'])) + to_netdata.update(update_key('old', data['nodes'][node]['jvm']['gc']['collectors']['old'])) + to_netdata['jvm_heap_percent'] = data['nodes'][node]['jvm']['mem']['heap_used_percent'] + to_netdata['jvm_heap_commit'] = data['nodes'][node]['jvm']['mem']['heap_committed_in_bytes'] + + # Thread pool queues and rejections + for key in ['bulk', 'index', 'search', 'merge']: + to_netdata.update(update_key(key, data['nodes'][node]['thread_pool'].get(key, {}))) + + # Fielddata cache + to_netdata['index_fdata_mem'] = data['nodes'][node]['indices']['fielddata']['memory_size_in_bytes'] + to_netdata['index_fdata_evic'] = data['nodes'][node]['indices']['fielddata']['evictions'] + to_netdata['breakers_fdata_trip'] = data['nodes'][node]['breakers']['fielddata']['tripped'] + + # Host metrics + to_netdata.update(update_key('http', data['nodes'][node]['http'])) + to_netdata.update(update_key('transport', data['nodes'][node]['transport'])) + to_netdata['file_descriptors_used'] = round(float(data['nodes'][node]['process']['open_file_descriptors']) + / data['nodes'][node]['process']['max_file_descriptors'] * 1000) + + queue.put(to_netdata) + + def find_avg(self, value1, value2, key): + if key not in self.latency: + self.latency.update({key: [value1, value2]}) + return 0 + else: + if not self.latency[key][0] == value1: + latency = round(float(value2 - self.latency[key][1]) / float(value1 - self.latency[key][0]) * 1000) + self.latency.update({key: [value1, value2]}) + return latency + else: + self.latency.update({key: [value1, value2]}) + return 0 + + +def update_key(string, dictionary): + return {'_'.join([string, k]): v for k, v in dictionary.items()} diff --git a/python.d/freeradius.chart.py b/python.d/freeradius.chart.py index 335127fd..2ac280f0 
100644 --- a/python.d/freeradius.chart.py +++ b/python.d/freeradius.chart.py @@ -101,8 +101,8 @@ class Service(SimpleService): :return: str """ try: - process_echo = Popen(self.sub_echo, stdout=PIPE, shell=False) - process_rad = Popen(self.sub_radclient, stdin=process_echo.stdout, stdout=PIPE, shell=False) + process_echo = Popen(self.sub_echo, stdout=PIPE, stderr=PIPE, shell=False) + process_rad = Popen(self.sub_radclient, stdin=process_echo.stdout, stdout=PIPE, stderr=PIPE, shell=False) process_echo.stdout.close() raw_result = process_rad.communicate()[0] except Exception: diff --git a/python.d/varnish.chart.py b/python.d/varnish.chart.py index 9d2c780e..9b37c93d 100644 --- a/python.d/varnish.chart.py +++ b/python.d/varnish.chart.py @@ -60,9 +60,11 @@ CHARTS = {'backend_health': ['s0.g_bytes', 'allocated', 'absolute', -1, 1048576]], 'options': [None, 'Memory usage', 'megabytes', 'Memory usage', 'varnish.memory_usage', 'stacked']}, 'session': - {'lines': [['sess_conn', 'conn', 'incremental', 1, 1], - ['client_req', 'requests', 'incremental', 1, 1], - ['sess_dropped', 'dropped', 'incremental', 1, 1]], + {'lines': [['sess_conn', 'sess_conn', 'incremental', 1, 1], + ['client_req', 'client_requests', 'incremental', 1, 1], + ['client_conn', 'client_conn', 'incremental', 1, 1], + ['client_drop', 'client_drop', 'incremental', 1, 1], + ['sess_dropped', 'sess_dropped', 'incremental', 1, 1]], 'options': [None, 'Sessions', 'units', 'Client metrics', 'varnish.session', 'line']}, 'threads': {'lines': [['threads', None, 'absolute', 1, 1], @@ -88,7 +90,7 @@ class Service(SimpleService): if is_executable(''.join([directory, 'varnishstat']), X_OK)][0] except IndexError: self.varnish = False - self.rgx_all = compile(r'([A-Z]+\.)([\d\w_.]+)\s+(\d+)') + self.rgx_all = compile(r'([A-Z]+\.)?([\d\w_.]+)\s+(\d+)') # Could be # VBE.boot.super_backend.pipe_hdrbyte (new) # or @@ -114,7 +116,7 @@ class Service(SimpleService): # 2. 
Output is parsable (list is not empty after regex findall) is_parsable = self.rgx_all.findall(reply) if not is_parsable: - self.error('Cant parse output (only varnish version 4+ supported)') + self.error('Cant parse output...') return False # We need to find the right regex for backend parse @@ -123,7 +125,7 @@ class Service(SimpleService): self.rgx_bck = self.rgx_bck[0] else: self.backend_list = self.rgx_bck[1].findall(reply)[::2] - self.rgx_bck = self.rgx_back[1] + self.rgx_bck = self.rgx_bck[1] # We are about to start! self.create_charts() @@ -187,7 +189,8 @@ class Service(SimpleService): # 3.3 Problems summary chart for elem in ['backend_busy', 'backend_unhealthy', 'esi_errors', 'esi_warnings', 'losthdr', 'sess_drop', 'sess_fail', 'sess_pipe_overflow', 'threads_destroyed', 'threads_failed', 'threads_limited', 'thread_queue_len']: - to_netdata[''.join([elem, '_b'])] = to_netdata.get(elem, 0) + if to_netdata.get(elem) is not None: + to_netdata[''.join([elem, '_b'])] = to_netdata.get(elem) # Ready steady go! 
return to_netdata @@ -207,7 +210,8 @@ class Service(SimpleService): #self.order.extend(extra_charts) # Create static charts - self.definitions = {chart: values for chart, values in CHARTS.items() if chart in self.order} + #self.definitions = {chart: values for chart, values in CHARTS.items() if chart in self.order} + self.definitions = CHARTS # Create dynamic backend charts if self.backend_list: diff --git a/src/freebsd_sysctl.c b/src/freebsd_sysctl.c index 7edaf4f0..9400089d 100644 --- a/src/freebsd_sysctl.c +++ b/src/freebsd_sysctl.c @@ -163,6 +163,7 @@ int do_freebsd_sysctl(int update_every, usec_t dt) { int numdevs; static void *devstat_data = NULL; struct devstat *dstat; + char disk[DEVSTAT_NAME_LEN + 10 + 1]; // 10 - maximum number of digits for int struct cur_dstat { collected_number duration_read_ms; collected_number duration_write_ms; @@ -609,34 +610,35 @@ int do_freebsd_sysctl(int update_every, usec_t dt) { error("DISABLED: disk.io"); } else { dstat = devstat_data + sizeof(long); // skip generation number - collected_number total_disk_reads = 0; - collected_number total_disk_writes = 0; + collected_number total_disk_kbytes_read = 0; + collected_number total_disk_kbytes_write = 0; for (i = 0; i < numdevs; i++) { if (((dstat[i].device_type & DEVSTAT_TYPE_MASK) == DEVSTAT_TYPE_DIRECT) || ((dstat[i].device_type & DEVSTAT_TYPE_MASK) == DEVSTAT_TYPE_STORARRAY)) { + // bounded write: disk[] leaves no room for a sign on unit_number + snprintf(disk, sizeof(disk), "%s%d", dstat[i].device_name, dstat[i].unit_number); // -------------------------------------------------------------------- - st = rrdset_find_bytype(RRD_TYPE_DISK, dstat[i].device_name); + st = rrdset_find_bytype(RRD_TYPE_DISK, disk); if (unlikely(!st)) { - st = rrdset_create(RRD_TYPE_DISK, dstat[i].device_name, NULL, dstat[i].device_name, "disk.io", "Disk I/O Bandwidth", "kilobytes/s", 2000, update_every, RRDSET_TYPE_AREA); + st = rrdset_create(RRD_TYPE_DISK, disk, NULL, disk, "disk.io", "Disk I/O Bandwidth", "kilobytes/s", 2000, update_every, RRDSET_TYPE_AREA); rrddim_add(st,
"reads", NULL, 1, 1024, RRDDIM_INCREMENTAL); rrddim_add(st, "writes", NULL, -1, 1024, RRDDIM_INCREMENTAL); } else rrdset_next(st); - total_disk_reads += dstat[i].bytes[DEVSTAT_READ]; - total_disk_writes += dstat[i].bytes[DEVSTAT_WRITE]; + total_disk_kbytes_read += dstat[i].bytes[DEVSTAT_READ]/KILO_FACTOR; + total_disk_kbytes_write += dstat[i].bytes[DEVSTAT_WRITE]/KILO_FACTOR; prev_dstat.bytes_read = rrddim_set(st, "reads", dstat[i].bytes[DEVSTAT_READ]); prev_dstat.bytes_write = rrddim_set(st, "writes", dstat[i].bytes[DEVSTAT_WRITE]); rrdset_done(st); // -------------------------------------------------------------------- - st = rrdset_find_bytype("disk_ops", dstat[i].device_name); + st = rrdset_find_bytype("disk_ops", disk); if (unlikely(!st)) { - st = rrdset_create("disk_ops", dstat[i].device_name, NULL, dstat[i].device_name, "disk.ops", "Disk Completed I/O Operations", "operations/s", 2001, update_every, RRDSET_TYPE_LINE); + st = rrdset_create("disk_ops", disk, NULL, disk, "disk.ops", "Disk Completed I/O Operations", "operations/s", 2001, update_every, RRDSET_TYPE_LINE); st->isdetail = 1; rrddim_add(st, "reads", NULL, 1, 1, RRDDIM_INCREMENTAL); @@ -650,9 +652,9 @@ int do_freebsd_sysctl(int update_every, usec_t dt) { // -------------------------------------------------------------------- - st = rrdset_find_bytype("disk_qops", dstat[i].device_name); + st = rrdset_find_bytype("disk_qops", disk); if (unlikely(!st)) { - st = rrdset_create("disk_qops", dstat[i].device_name, NULL, dstat[i].device_name, "disk.qops", "Disk Current I/O Operations", "operations", 2002, update_every, RRDSET_TYPE_LINE); + st = rrdset_create("disk_qops", disk, NULL, disk, "disk.qops", "Disk Current I/O Operations", "operations", 2002, update_every, RRDSET_TYPE_LINE); st->isdetail = 1; rrddim_add(st, "operations", NULL, 1, 1, RRDDIM_ABSOLUTE); @@ -664,9 +666,9 @@ int do_freebsd_sysctl(int update_every, usec_t dt) { // -------------------------------------------------------------------- - st = 
rrdset_find_bytype("disk_util", dstat[i].device_name); + st = rrdset_find_bytype("disk_util", disk); if (unlikely(!st)) { - st = rrdset_create("disk_util", dstat[i].device_name, NULL, dstat[i].device_name, "disk.util", "Disk Utilization Time", "% of time working", 2004, update_every, RRDSET_TYPE_AREA); + st = rrdset_create("disk_util", disk, NULL, disk, "disk.util", "Disk Utilization Time", "% of time working", 2004, update_every, RRDSET_TYPE_AREA); st->isdetail = 1; rrddim_add(st, "utilization", NULL, 1, 10, RRDDIM_INCREMENTAL); @@ -679,9 +681,9 @@ int do_freebsd_sysctl(int update_every, usec_t dt) { // -------------------------------------------------------------------- - st = rrdset_find_bytype("disk_iotime", dstat[i].device_name); + st = rrdset_find_bytype("disk_iotime", disk); if (unlikely(!st)) { - st = rrdset_create("disk_iotime", dstat[i].device_name, NULL, dstat[i].device_name, "disk.iotime", "Disk Total I/O Time", "milliseconds/s", 2022, update_every, RRDSET_TYPE_LINE); + st = rrdset_create("disk_iotime", disk, NULL, disk, "disk.iotime", "Disk Total I/O Time", "milliseconds/s", 2022, update_every, RRDSET_TYPE_LINE); st->isdetail = 1; rrddim_add(st, "reads", NULL, 1, 1, RRDDIM_INCREMENTAL); @@ -703,9 +705,9 @@ int do_freebsd_sysctl(int update_every, usec_t dt) { // -------------------------------------------------------------------- - st = rrdset_find_bytype("disk_await", dstat[i].device_name); + st = rrdset_find_bytype("disk_await", disk); if (unlikely(!st)) { - st = rrdset_create("disk_await", dstat[i].device_name, NULL, dstat[i].device_name, "disk.await", "Average Completed I/O Operation Time", "ms per operation", 2005, update_every, RRDSET_TYPE_LINE); + st = rrdset_create("disk_await", disk, NULL, disk, "disk.await", "Average Completed I/O Operation Time", "ms per operation", 2005, update_every, RRDSET_TYPE_LINE); st->isdetail = 1; rrddim_add(st, "reads", NULL, 1, 1, RRDDIM_ABSOLUTE); @@ -721,9 +723,9 @@ int do_freebsd_sysctl(int update_every, usec_t 
dt) { // -------------------------------------------------------------------- - st = rrdset_find_bytype("disk_avgsz", dstat[i].device_name); + st = rrdset_find_bytype("disk_avgsz", disk); if (unlikely(!st)) { - st = rrdset_create("disk_avgsz", dstat[i].device_name, NULL, dstat[i].device_name, "disk.avgsz", "Average Completed I/O Operation Bandwidth", "kilobytes per operation", 2006, update_every, RRDSET_TYPE_AREA); + st = rrdset_create("disk_avgsz", disk, NULL, disk, "disk.avgsz", "Average Completed I/O Operation Bandwidth", "kilobytes per operation", 2006, update_every, RRDSET_TYPE_AREA); st->isdetail = 1; rrddim_add(st, "reads", NULL, 1, 1024, RRDDIM_ABSOLUTE); @@ -739,9 +741,9 @@ int do_freebsd_sysctl(int update_every, usec_t dt) { // -------------------------------------------------------------------- - st = rrdset_find_bytype("disk_svctm", dstat[i].device_name); + st = rrdset_find_bytype("disk_svctm", disk); if (unlikely(!st)) { - st = rrdset_create("disk_svctm", dstat[i].device_name, NULL, dstat[i].device_name, "disk.svctm", "Average Service Time", "ms per operation", 2007, update_every, RRDSET_TYPE_LINE); + st = rrdset_create("disk_svctm", disk, NULL, disk, "disk.svctm", "Average Service Time", "ms per operation", 2007, update_every, RRDSET_TYPE_LINE); st->isdetail = 1; rrddim_add(st, "svctm", NULL, 1, 1, RRDDIM_ABSOLUTE); @@ -753,21 +755,21 @@ int do_freebsd_sysctl(int update_every, usec_t dt) { rrdset_done(st); } } + } - // -------------------------------------------------------------------- - - st = rrdset_find_bytype("system", "io"); - if (unlikely(!st)) { - st = rrdset_create("system", "io", NULL, "disk", NULL, "Disk I/O", "kilobytes/s", 150, update_every, RRDSET_TYPE_AREA); - rrddim_add(st, "in", NULL, 1, 1024, RRDDIM_INCREMENTAL); - rrddim_add(st, "out", NULL, -1, 1024, RRDDIM_INCREMENTAL); - } - else rrdset_next(st); + // -------------------------------------------------------------------- - rrddim_set(st, "in", total_disk_reads); - rrddim_set(st, 
"out", total_disk_writes); - rrdset_done(st); + st = rrdset_find_bytype("system", "io"); + if (unlikely(!st)) { + st = rrdset_create("system", "io", NULL, "disk", NULL, "Disk I/O", "kilobytes/s", 150, update_every, RRDSET_TYPE_AREA); + rrddim_add(st, "in", NULL, 1, 1, RRDDIM_INCREMENTAL); + rrddim_add(st, "out", NULL, -1, 1, RRDDIM_INCREMENTAL); } + else rrdset_next(st); + + rrddim_set(st, "in", total_disk_kbytes_read); + rrddim_set(st, "out", total_disk_kbytes_write); + rrdset_done(st); } } }