1 # -*- coding: utf-8 -*-
2 # Description: web log netdata python.d module
5 from base import LogService
8 from os import access, R_OK
9 from os.path import getsize
10 from collections import namedtuple
11 from copy import deepcopy
13 from itertools import zip_longest
15 from itertools import izip_longest as zip_longest
# Order in which the charts are presented on the netdata dashboard
ORDER = ['response_codes', 'bandwidth', 'response_time', 'requests_per_url', 'http_method', 'requests_per_ipproto',
         'clients', 'clients_all']
# NOTE(review): fragment of the CHARTS definition dict — the 'CHARTS = {'
# opener, most per-chart keys ('response_codes': {, 'lines': [, ...) and the
# closing braces are elided from this view; confirm against the full file.
# response_codes: per-class HTTP status counters
        'options': [None, 'Response Codes', 'requests/s', 'responses', 'web_log.response_codes', 'stacked'],
            ['2xx', '2xx', 'incremental'],
            ['5xx', '5xx', 'incremental'],
            ['3xx', '3xx', 'incremental'],
            ['4xx', '4xx', 'incremental'],
            ['1xx', '1xx', 'incremental'],
            ['0xx', 'other', 'incremental'],
            ['unmatched', 'unmatched', 'incremental']
# bandwidth: received/sent, scaled bytes -> KB (divisor 1024)
        'options': [None, 'Bandwidth', 'KB/s', 'bandwidth', 'web_log.bandwidth', 'area'],
            ['resp_length', 'received', 'incremental', 1, 1024],
            ['bytes_sent', 'sent', 'incremental', -1, 1024]
# response_time: min/avg/max processing time, divisor 1000
        'options': [None, 'Processing Time', 'milliseconds', 'timings', 'web_log.response_time', 'area'],
            ['resp_time_min', 'min', 'incremental', 1, 1000],
            ['resp_time_max', 'max', 'incremental', 1, 1000],
            ['resp_time_avg', 'avg', 'incremental', 1, 1000]
# clients: unique client IPs seen in the current poll
        'options': [None, 'Current Poll Unique Client IPs', 'unique ips', 'clients', 'web_log.clients', 'stacked'],
            ['unique_cur_ipv4', 'ipv4', 'incremental', 1, 1],
            ['unique_cur_ipv6', 'ipv6', 'incremental', 1, 1]
# clients_all: unique client IPs seen since the plugin started
        'options': [None, 'All Time Unique Client IPs', 'unique ips', 'clients', 'web_log.clients_all', 'stacked'],
            ['unique_tot_ipv4', 'ipv4', 'absolute', 1, 1],
            ['unique_tot_ipv6', 'ipv6', 'absolute', 1, 1]
# http_method: dimensions are added dynamically at runtime
        'options': [None, 'Requests Per HTTP Method', 'requests/s', 'http methods', 'web_log.http_method', 'stacked'],
    'requests_per_ipproto': {
        'options': [None, 'Requests Per IP Protocol', 'requests/s', 'ip protocols', 'web_log.requests_per_ipproto',
            ['req_ipv4', 'ipv4', 'incremental', 1, 1],
            ['req_ipv6', 'ipv6', 'incremental', 1, 1]
72 NAMED_URL_PATTERN = namedtuple('URL_PATTERN', ['description', 'pattern'])
75 class Service(LogService):
76 def __init__(self, configuration=None, name=None):
77 LogService.__init__(self, configuration=configuration, name=name)
78 # Variables from module configuration file
79 self.log_path = self.configuration.get('path')
80 self.detailed_response_codes = self.configuration.get('detailed_response_codes', True)
81 self.all_time = self.configuration.get('all_time', True)
82 self.url_pattern = self.configuration.get('categories') # dict
83 self.regex = None # will be assigned in 'find_regex' method
84 self.resp_time_func = None # will be assigned in 'find_regex' method
85 self._get_data = None # will be assigned in 'check' method.
86 self.order = None # will be assigned in 'create_*_method' method.
87 self.definitions = None # will be assigned in 'create_*_method' method.
88 self.detailed_chart = None # will be assigned in 'create_*_method' method.
89 self.http_method_chart = None # will be assigned in 'create_*_method' method.
90 # sorted list of unique IPs
91 self.unique_all_time = list()
92 # if there is no new logs this dict returned to netdata
93 self.data = {'bytes_sent': 0, 'resp_length': 0, 'resp_time_min': 0,
94 'resp_time_max': 0, 'resp_time_avg': 0, 'unique_cur_ipv4': 0,
95 'unique_cur_ipv6': 0, '2xx': 0, '5xx': 0, '3xx': 0, '4xx': 0,
96 '1xx': 0, '0xx': 0, 'unmatched': 0, 'req_ipv4': 0, 'req_ipv6': 0,
97 'unique_tot_ipv4': 0, 'unique_tot_ipv6': 0}
        # NOTE(review): fragment of the plugin's check() method (per the
        # __init__ comment, _get_data "will be assigned in 'check' method").
        # The 'def' line, the early 'return False' statements and the
        # backwards-seek logic inside the while loop are elided from this
        # view; confirm control flow against the complete file.
        if not self.log_path:
            self.error('log path is not specified')
        # log_path must be readable
        if not access(self.log_path, R_OK):
            self.error('%s not readable or not exist' % self.log_path)
        # log_path file should not be empty
        if not getsize(self.log_path):
            self.error('%s is empty' % self.log_path)
        # Read last line (or first if there is only one line)
        with open(self.log_path, 'rb') as logs:
            # scan backwards one byte at a time until a newline is hit
            # (loop body elided in this view)
            while logs.read(1) != b'\n':
            last_line = logs.readline().decode(encoding='utf-8')
        # Pick a parsing regex matching the observed log format
        regex_name = self.find_regex(last_line)
            self.error('Can\'t parse %s' % self.log_path)
        if regex_name.startswith('access_'):
            self.create_access_charts(regex_name)
            if regex_name == 'access_default':
                self.info('Not all data collected. You need to modify LogFormat.')
            # from now on every poll is served by the access-log parser
            self._get_data = self._get_access_data
            self.info('Used regex: %s' % regex_name)
        # If it's not access_logs.. Not used at the moment
    def find_regex(self, last_line):
        """
        Pick the regex (and response-time conversion) matching the log format.
        :param last_line: str: literally last line from log file
        It's sad but different web servers has different logs formats
        We need to find appropriate regex for current log file
        All logic is do a regex search through the string for all patterns
        until we find something or fail.
        """
        # REGEX: 1.IPv4 address 2.HTTP method 3. URL 4. Response code
        # 5. Bytes sent 6. Response length 7. Response process time
        # NOTE(review): the continuation lines of the three compiled patterns
        # are elided from this view — each re.compile() below is truncated to
        # its first source line.
        access_default = re.compile(r'([\da-f.:]+)'
        access_apache_ext = re.compile(r'([\da-f.:]+)'
        access_nginx_ext = re.compile(r'([\da-f.:]+)'
        # Try the extended (most detailed) formats first; the paired lambda
        # normalizes the parsed response time (nginx values are scaled by
        # 1e6 — presumably seconds -> microseconds; TODO confirm).
        regex_function = zip([access_apache_ext, access_nginx_ext, access_default],
                             [lambda x: x, lambda x: x * 1000000, lambda x: x],
                             ['access_apache_ext', 'access_nginx_ext', 'access_default'])
        for regex, function, name in regex_function:
            if regex.search(last_line):
                self.resp_time_func = function
                # NOTE(review): the assignment of self.regex and the return
                # of the matched name are elided from this view.
    def create_access_charts(self, regex_name):
        """
        Build self.order/self.definitions for access-log charts.
        :param regex_name: str: regex name from 'find_regex' method. Ex.: 'apache_extended', 'nginx_extended'
        Create additional charts depending on the 'find_regex' result (parsed_line) and configuration file
        1. 'time_response' chart is removed if there is no 'time_response' in logs.
        2. Other stuff is just remove/add chart depending on yes/no in conf
        """
        def find_job_name(override_name, name):
            """
            Build the chart-family prefix used by the dynamic chart strings.
            :param override_name: str: 'name' var from configuration file
            :param name: str: 'job_name' from configuration file
            :return: str: new job name
            We need this for dynamic charts. Actually same logic as in python.d.plugin.
            """
            add_to_name = override_name or name
            # NOTE(review): '\s+' is a non-raw string — it works because \s is
            # not a recognized string escape, but should be r'\s+'. The
            # enclosing 'if add_to_name:' guard appears elided from this view.
                return '_'.join(['web_log', re.sub('\s+', '_', add_to_name)])
        # Start from the static chart templates
        self.order = ORDER[:]
        self.definitions = deepcopy(CHARTS)
        # Family prefix for the dynamically created chart definition strings
        job_name = find_job_name(self.override_name, self.name)
        self.detailed_chart = 'CHART %s.detailed_response_codes ""' \
                              ' "Detailed Response Codes" requests/s responses' \
                              ' web_log.detailed_response_codes stacked 1 %s\n' % (job_name, self.update_every)
        self.http_method_chart = 'CHART %s.http_method' \
                                 ' "" "Requests Per HTTP Method" requests/s "http methods"' \
                                 ' web_log.http_method stacked 2 %s\n' % (job_name, self.update_every)
        # Remove 'request_time' chart from ORDER if request_time not in logs
        if regex_name == 'access_default':
            self.order.remove('response_time')
        # Remove 'clients_all' chart from ORDER if specified in the configuration
        if not self.all_time:
            self.order.remove('clients_all')
        # Add 'detailed_response_codes' chart if specified in the configuration
        if self.detailed_response_codes:
            self.order.append('detailed_response_codes')
            # NOTE(review): the tail of this dict literal ('stacked'] and the
            # 'lines' entry) is elided from this view.
            self.definitions['detailed_response_codes'] = {'options': [None, 'Detailed Response Codes', 'requests/s',
                                                                       'responses', 'web_log.detailed_response_codes',
        # Add 'requests_per_url' chart if specified in the configuration
        # NOTE(review): the 'if self.url_pattern:' guard and the trailing
        # 'else:' before the final remove() appear elided from this view.
            self.url_pattern = [NAMED_URL_PATTERN(description=k, pattern=re.compile(v)) for k, v
                                in self.url_pattern.items()]
            self.definitions['requests_per_url'] = {'options': [None, 'Requests Per Url', 'requests/s',
                                                                'urls', 'web_log.requests_per_url', 'stacked'],
                                                    'lines': [['other_url', 'other', 'incremental']]}
            # one dimension (plus a zeroed counter) per user-defined category
            for elem in self.url_pattern:
                self.definitions['requests_per_url']['lines'].append([elem.description, elem.description,
                self.data.update({elem.description: 0})
            self.data.update({'other_url': 0})
            self.order.remove('requests_per_url')
246 def add_new_dimension(self, dimension, line_list, chart_string, key):
248 :param dimension: str: response status code. Ex.: '202', '499'
249 :param line_list: list: Ex.: ['202', '202', 'incremental']
250 :param chart_string: Current string we need to pass to netdata to rebuild the chart
251 :param key: str: CHARTS dict key (chart name). Ex.: 'response_time'
252 :return: str: new chart string = previous + new dimensions
254 self.data.update({dimension: 0})
255 # SET method check if dim in _dimensions
256 self._dimensions.append(dimension)
257 # UPDATE method do SET only if dim in definitions
258 self.definitions[key]['lines'].append(line_list)
260 chart += "%s %s\n" % ('DIMENSION', ' '.join(line_list))
    def _get_access_data(self):
        """
        Parse newly appended log lines and update the counters.
        :return: dict OR None
        None if _get_raw_data method fails.
        In all other cases - dict.
        """
        # NOTE(review): structural lines of this method (the loop header over
        # the raw lines, the 'if match:' branch, the guard around the timing
        # summary and the final return) appear elided from this view; the
        # indentation below follows the remaining fragment.
        raw = self._get_raw_data()
        # per-poll accumulators
        request_time, unique_current = list(), list()
        request_counter = {'count': 0, 'sum': 0}
        ip_address_counter = {'unique_cur_ip': 0}
            match = self.regex.search(line)
                match_dict = dict(zip_longest('address method url code sent resp_length resp_time'.split(),
                    # bucket the status code by its first digit: '2xx', '4xx', ...
                    code = ''.join([match_dict['code'][0], 'xx'])
                    self.data['0xx'] += 1
                # detailed response code
                if self.detailed_response_codes:
                    self._get_data_detailed_response_codes(match_dict['code'])
                    self._get_data_per_url(match_dict['url'])
                # requests per http method
                self._get_data_http_method(match_dict['method'])
                self.data['bytes_sent'] += int(match_dict['sent'])
                # request processing time and bandwidth received
                if match_dict['resp_length'] and match_dict['resp_time']:
                    self.data['resp_length'] += int(match_dict['resp_length'])
                    resp_time = self.resp_time_func(float(match_dict['resp_time']))
                    # keep the times sorted so min/max are the list ends
                    bisect.insort_left(request_time, resp_time)
                    request_counter['count'] += 1
                    request_counter['sum'] += resp_time
                # requests per ip proto
                proto = 'ipv4' if '.' in match_dict['address'] else 'ipv6'
                self.data['req_' + proto] += 1
                # all-time and current-poll unique client IP bookkeeping
                if address_not_in_pool(self.unique_all_time, match_dict['address'],
                                       self.data['unique_tot_ipv4'] + self.data['unique_tot_ipv6']):
                    self.data['unique_tot_' + proto] += 1
                if address_not_in_pool(unique_current, match_dict['address'], ip_address_counter['unique_cur_ip']):
                    self.data['unique_cur_' + proto] += 1
                    ip_address_counter['unique_cur_ip'] += 1
                self.data['unmatched'] += 1
            # timing summary for this poll
            self.data['resp_time_min'] += int(request_time[0])
            self.data['resp_time_avg'] += int(round(float(request_counter['sum']) / request_counter['count']))
            self.data['resp_time_max'] += int(request_time[-1])
324 def _get_data_detailed_response_codes(self, code):
326 :param code: str: CODE from parsed line. Ex.: '202, '499'
328 Calls add_new_dimension method If the value is found for the first time
330 if code not in self.data:
331 chart_string_copy = self.detailed_chart
332 self.detailed_chart = self.add_new_dimension(code, [code, code, 'incremental'],
333 chart_string_copy, 'detailed_response_codes')
336 def _get_data_http_method(self, method):
338 :param method: str: METHOD from parsed line. Ex.: 'GET', 'POST'
340 Calls add_new_dimension method If the value is found for the first time
342 if method not in self.data:
343 chart_string_copy = self.http_method_chart
344 self.http_method_chart = self.add_new_dimension(method, [method, method, 'incremental'],
345 chart_string_copy, 'http_method')
346 self.data[method] += 1
348 def _get_data_per_url(self, url):
350 :param url: str: URL from parsed line
352 Scan through string looking for the first location where patterns produce a match for all user
356 for elem in self.url_pattern:
357 if elem.pattern.search(url):
358 self.data[elem.description] += 1
362 self.data['other_url'] += 1
def address_not_in_pool(pool, address, pool_size):
    """
    Check whether an IP address is already in the sorted pool; insert it
    when absent.
    :param pool: sorted list of ip addresses
    :param address: ip address
    :param pool_size: current size of pool (number of valid entries)
    :return: True if address not in pool (it is inserted). False if address in pool
    """
    index = bisect.bisect_left(pool, address)
    # Fix: the visible code had no return statements and inserted the
    # address on both branches; report membership and insert at most once.
    if index < pool_size and pool[index] == address:
        return False
    bisect.insort_left(pool, address)
    return True