1 # -*- coding: utf-8 -*-
2 # Description: web log netdata python.d module
5 from base import LogService
8 from os import access, R_OK
9 from os.path import getsize
10 from collections import namedtuple
11 from copy import deepcopy
# Chart display order on the netdata dashboard (left-to-right, top-to-bottom).
ORDER = ['response_statuses', 'response_codes', 'bandwidth', 'response_time', 'requests_per_url', 'http_method',
         'requests_per_ipproto', 'clients', 'clients_all']

# Static chart definitions in python.d format:
#   'options': [name override, title, units, family, context, chart type]
#   'lines':   [dimension id, dimension name, algorithm, multiplier, divisor]
# NOTE(review): the dict skeleton below was reconstructed around the surviving
# option/line entries of a garbled extraction -- confirm against upstream.
CHARTS = {
    'response_codes': {
        'options': [None, 'Response Codes', 'requests/s', 'responses', 'web_log.response_codes', 'stacked'],
        'lines': [
            ['2xx', '2xx', 'incremental'],
            ['5xx', '5xx', 'incremental'],
            ['3xx', '3xx', 'incremental'],
            ['4xx', '4xx', 'incremental'],
            ['1xx', '1xx', 'incremental'],
            ['0xx', 'other', 'incremental'],
            ['unmatched', 'unmatched', 'incremental']
        ]},
    'bandwidth': {
        'options': [None, 'Bandwidth', 'KB/s', 'bandwidth', 'web_log.bandwidth', 'area'],
        'lines': [
            ['resp_length', 'received', 'incremental', 1, 1024],
            # negative multiplier draws 'sent' below the axis
            ['bytes_sent', 'sent', 'incremental', -1, 1024]
        ]},
    'response_time': {
        'options': [None, 'Processing Time', 'milliseconds', 'timings', 'web_log.response_time', 'area'],
        'lines': [
            ['resp_time_min', 'min', 'incremental', 1, 1000],
            ['resp_time_max', 'max', 'incremental', 1, 1000],
            ['resp_time_avg', 'avg', 'incremental', 1, 1000]
        ]},
    'clients': {
        'options': [None, 'Current Poll Unique Client IPs', 'unique ips', 'clients', 'web_log.clients', 'stacked'],
        'lines': [
            ['unique_cur_ipv4', 'ipv4', 'incremental', 1, 1],
            ['unique_cur_ipv6', 'ipv6', 'incremental', 1, 1]
        ]},
    'clients_all': {
        'options': [None, 'All Time Unique Client IPs', 'unique ips', 'clients', 'web_log.clients_all', 'stacked'],
        'lines': [
            ['unique_tot_ipv4', 'ipv4', 'absolute', 1, 1],
            ['unique_tot_ipv6', 'ipv6', 'absolute', 1, 1]
        ]},
    'http_method': {
        'options': [None, 'Requests Per HTTP Method', 'requests/s', 'http methods', 'web_log.http_method', 'stacked'],
        # dimensions are added at runtime as new HTTP methods are seen
        'lines': [
        ]},
    'requests_per_ipproto': {
        'options': [None, 'Requests Per IP Protocol', 'requests/s', 'ip protocols', 'web_log.requests_per_ipproto',
                    'stacked'],
        'lines': [
            ['req_ipv4', 'ipv4', 'incremental', 1, 1],
            ['req_ipv6', 'ipv6', 'incremental', 1, 1]
        ]},
    'response_statuses': {
        'options': [None, 'Response Statuses', 'requests/s', 'responses', 'web_log.response_statuses',
                    'stacked'],
        'lines': [
            ['successful_requests', 'success', 'incremental', 1, 1],
            ['server_errors', 'error', 'incremental', 1, 1],
            ['redirects', 'redirect', 'incremental', 1, 1],
            ['bad_requests', 'bad', 'incremental', 1, 1],
            ['other_requests', 'other', 'incremental', 1, 1]
        ]}
}

# (description, compiled pattern) pair used by the per-URL request counter.
NAMED_URL_PATTERN = namedtuple('URL_PATTERN', ['description', 'pattern'])
class Service(LogService):
    def __init__(self, configuration=None, name=None):
        """
        Tail-parse a web-server access log and feed per-poll metrics to netdata.

        :param configuration: dict: job configuration from the module conf file
        :param name: str: job name
        """
        # self._get_data          -- assigned in 'check' method.
        # self.order              -- assigned in 'create_*_method' method.
        # self.definitions        -- assigned in 'create_*_method' method.
        # self.detailed_chart     -- assigned in 'create_*_method' method.
        # self.http_method_chart  -- assigned in 'create_*_method' method.
        LogService.__init__(self, configuration=configuration, name=name)
        # Variables from module configuration file
        self.log_path = self.configuration.get('path')
        self.detailed_response_codes = self.configuration.get('detailed_response_codes', True)
        self.all_time = self.configuration.get('all_time', True)
        self.url_pattern = self.configuration.get('categories')  # dict
        self.custom_log_format = self.configuration.get('custom_log_format')  # dict
        # Instance variables
        self.unique_all_time = list()  # sorted list of unique IPs
        self.regex = None  # will be assigned in 'find_regex' or 'find_regex_custom' method
        self.resp_time_func = None  # will be assigned in 'find_regex' or 'find_regex_custom' method
        # Accumulators reported to netdata. Most dimensions are 'incremental',
        # so these counters only ever grow; netdata derives per-second rates.
        self.data = {'bytes_sent': 0, 'resp_length': 0, 'resp_time_min': 0, 'resp_time_max': 0,
                     'resp_time_avg': 0, 'unique_cur_ipv4': 0, 'unique_cur_ipv6': 0, '2xx': 0,
                     '5xx': 0, '3xx': 0, '4xx': 0, '1xx': 0, '0xx': 0, 'unmatched': 0, 'req_ipv4': 0,
                     'req_ipv6': 0, 'unique_tot_ipv4': 0, 'unique_tot_ipv6': 0, 'successful_requests': 0,
                     'redirects': 0, 'bad_requests': 0, 'server_errors': 0, 'other_requests': 0}
113 We need to make sure:
114 1. "log_path" is specified in the module configuration file
115 2. "log_path" must be readable by netdata user and must exist
116 3. "log_path' must not be empty. We need at least 1 line to find appropriate pattern to parse
117 4. Plugin can work using predefined patterns (OK for nginx, apache default log format) or user defined
118 pattern. So we need to check if we can parse last line from log file with user pattern OR module patterns.
119 5. All patterns for per_url_request_counter feature are valid regex expressions
121 if not self.log_path:
122 self.error('log path is not specified')
125 if not access(self.log_path, R_OK):
126 self.error('%s not readable or not exist' % self.log_path)
129 if not getsize(self.log_path):
130 self.error('%s is empty' % self.log_path)
133 # Read last line (or first if there is only one line)
134 with open(self.log_path, 'rb') as logs:
136 while logs.read(1) != b'\n':
140 last_line = logs.readline()
143 last_line = last_line.decode()
144 except UnicodeDecodeError:
146 last_line = last_line.decode(encoding='utf-8')
147 except (TypeError, UnicodeDecodeError) as error:
148 self.error(str(error))
151 # Custom_log_format or predefined log format.
152 if self.custom_log_format:
153 match_dict, log_name, error = self.find_regex_custom(last_line)
155 match_dict, log_name, error = self.find_regex(last_line)
157 # "match_dict" is None if there are any problems
158 if match_dict is None:
159 self.error(str(error))
162 # self.url_pattern check
164 self.url_pattern = check_req_per_url_pattern(self.url_pattern)
167 if not (self.regex and self.resp_time_func):
168 self.error('That can not happen, but it happened. "regex" or "resp_time_func" is None')
170 # All is ok. We are about to start.
171 if log_name == 'web_access':
172 self.create_access_charts(match_dict) # Create charts
173 self._get_data = self._get_access_data
174 self.info('Collected data: %s' % list(match_dict.keys()))
177 # If it's not access_logs.. Not used at the moment
180 def find_regex_custom(self, last_line):
182 :param last_line: str: literally last line from log file
183 :return: tuple where:
184 [0]: dict or None: match_dict or None
185 [1]: str or None: log_name or None
186 [2]: str: error description
188 We are here only if "custom_log_format" is in logs. We need to make sure:
189 1. "custom_log_format" is a dict
190 2. "pattern" in "custom_log_format" and pattern is <str> instance
191 3. if "time_multiplier" is in "custom_log_format" it must be <int> instance
193 If all parameters is ok we need to make sure:
194 1. Pattern search is success
195 2. Pattern search contains named subgroups (?P<subgroup_name>) (= "match_dict")
197 If pattern search is success we need to make sure:
198 1. All mandatory keys ['address', 'code', 'bytes_sent', 'method', 'url'] are in "match_dict"
200 If this is True we need to make sure:
201 1. All mandatory key values from "match_dict" have the correct format
202 ("code" is integer, "method" is uppercase word, etc)
204 If non mandatory keys in "match_dict" we need to make sure:
205 1. All non mandatory key values from match_dict ['resp_length', 'resp_time'] have the correct format
206 ("resp_length" is integer or "-", "resp_time" is integer or float)
209 if not is_dict(self.custom_log_format):
210 return find_regex_return(msg='Custom log: "custom_log_format" is not a <dict>')
212 pattern = self.custom_log_format.get('pattern')
213 if not (pattern and isinstance(pattern, str)):
214 return find_regex_return(msg='Custom log: "pattern" option is not specified or type is not <str>')
216 resp_time_func = self.custom_log_format.get('time_multiplier') or 0
218 if not isinstance(resp_time_func, int):
219 return find_regex_return(msg='Custom log: "time_multiplier" is not an integer')
222 regex = re.compile(pattern)
223 except re.error as error:
224 return find_regex_return(msg='Pattern compile error: %s' % str(error))
226 match = regex.search(last_line)
228 match_dict = match.groupdict() or None
230 return find_regex_return(msg='Custom log: pattern search FAILED')
232 if match_dict is None:
233 find_regex_return(msg='Custom log: search OK but contains no named subgroups'
234 ' (you need to use ?P<subgroup_name>)')
236 basic_values = {'address', 'method', 'url', 'code', 'bytes_sent'} - set(match_dict)
239 return find_regex_return(msg='Custom log: search OK but some mandatory keys (%s) are missing'
240 % list(basic_values))
242 if not re.search(r'[\da-f.:]+', match_dict['address']):
243 return find_regex_return(msg='Custom log: can\'t parse "address": %s'
244 % match_dict['address'])
245 if not re.search(r'[1-9]\d{2}', match_dict['code']):
246 return find_regex_return(msg='Custom log: can\'t parse "code": %s'
247 % match_dict['code'])
248 if not re.search(r'[A-Z]+', match_dict['method']):
249 return find_regex_return(msg='Custom log: can\'t parse "method": %s'
250 % match_dict['method'])
251 if not re.search(r'\d+|-', match_dict['bytes_sent']):
252 return find_regex_return(msg='Custom log: can\'t parse "bytes_sent": %s'
253 % match_dict['bytes_sent'])
255 if 'resp_length' in match_dict:
256 if not re.search(r'\d+', match_dict['resp_length']):
257 return find_regex_return(msg='Custom log: can\'t parse "resp_length": %s'
258 % match_dict['resp_length'])
260 if 'resp_time' in match_dict:
261 if not re.search(r'[\d.]+', match_dict['resp_length']):
262 return find_regex_return(msg='Custom log: can\'t parse "resp_time": %s'
263 % match_dict['resp_time'])
265 if '.' in match_dict['resp_time']:
266 self.resp_time_func = lambda time: time * (resp_time_func or 1000000)
268 self.resp_time_func = lambda time: time * (resp_time_func or 1)
271 return find_regex_return(match_dict=match_dict,
272 log_name='web_access',
275 def find_regex(self, last_line):
277 :param last_line: str: literally last line from log file
278 :return: tuple where:
279 [0]: dict or None: match_dict or None
280 [1]: str or None: log_name or None
281 [2]: str: error description
282 We need to find appropriate pattern for current log file
283 All logic is do a regex search through the string for all predefined patterns
284 until we find something or fail.
286 # REGEX: 1.IPv4 address 2.HTTP method 3. URL 4. Response code
287 # 5. Bytes sent 6. Response length 7. Response process time
288 acs_default = re.compile(r'(?P<address>[\da-f.:]+)'
289 r' -.*?"(?P<method>[A-Z]+)'
291 r' (?P<code>[1-9]\d{2})'
292 r' (?P<bytes_sent>\d+|-)')
294 acs_apache_ext_insert = re.compile(r'(?P<address>[\da-f.:]+)'
295 r' -.*?"(?P<method>[A-Z]+)'
297 r' (?P<code>[1-9]\d{2})'
298 r' (?P<bytes_sent>\d+|-)'
299 r' (?P<resp_length>\d+)'
300 r' (?P<resp_time>\d+) ')
302 acs_apache_ext_append = re.compile(r'(?P<address>[\da-f.:]+)'
303 r' -.*?"(?P<method>[A-Z]+)'
305 r' (?P<code>[1-9]\d{2})'
306 r' (?P<bytes_sent>\d+|-)'
308 r' (?P<resp_length>\d+)'
309 r' (?P<resp_time>\d+)'
312 acs_nginx_ext_insert = re.compile(r'(?P<address>[\da-f.:]+)'
313 r' -.*?"(?P<method>[A-Z]+)'
315 r' (?P<code>[1-9]\d{2})'
316 r' (?P<bytes_sent>\d+)'
317 r' (?P<resp_length>\d+)'
318 r' (?P<resp_time>\d\.\d+) ')
320 acs_nginx_ext_append = re.compile(r'(?P<address>[\da-f.:]+)'
321 r' -.*?"(?P<method>[A-Z]+)'
323 r' (?P<code>[1-9]\d{2})'
324 r' (?P<bytes_sent>\d+)'
326 r' (?P<resp_length>\d+)'
327 r' (?P<resp_time>\d\.\d+)')
333 return time * 1000000
335 r_regex = [acs_apache_ext_insert, acs_apache_ext_append, acs_nginx_ext_insert,
336 acs_nginx_ext_append, acs_default]
337 r_function = [func_usec, func_usec, func_sec, func_sec, func_usec]
338 regex_function = zip(r_regex, r_function)
341 for regex, function in regex_function:
342 match = regex.search(last_line)
345 self.resp_time_func = function
346 match_dict = match.groupdict()
349 return find_regex_return(match_dict=match_dict or None,
350 log_name='web_access',
351 msg='Unknown log format. You need to use "custom_log_format" feature.')
353 def create_access_charts(self, match_dict):
355 :param match_dict: dict: regex.search.groupdict(). Ex. {'address': '127.0.0.1', 'code': '200', 'method': 'GET'}
357 Create additional charts depending on the 'match_dict' keys and configuration file options
358 1. 'time_response' chart is removed if there is no 'resp_time' in match_dict.
359 2. Other stuff is just remove/add chart depending on yes/no in conf
361 def find_job_name(override_name, name):
363 :param override_name: str: 'name' var from configuration file
364 :param name: str: 'job_name' from configuration file
365 :return: str: new job name
366 We need this for dynamic charts. Actually same logic as in python.d.plugin.
368 add_to_name = override_name or name
370 return '_'.join(['web_log', re.sub('\s+', '_', add_to_name)])
374 self.order = ORDER[:]
375 self.definitions = deepcopy(CHARTS)
377 job_name = find_job_name(self.override_name, self.name)
378 self.detailed_chart = 'CHART %s.detailed_response_codes ""' \
379 ' "Detailed Response Codes" requests/s responses' \
380 ' web_log.detailed_response_codes stacked 1 %s\n' % (job_name, self.update_every)
381 self.http_method_chart = 'CHART %s.http_method' \
382 ' "" "Requests Per HTTP Method" requests/s "http methods"' \
383 ' web_log.http_method stacked 2 %s\n' % (job_name, self.update_every)
385 # Remove 'request_time' chart from ORDER if resp_time not in match_dict
386 if 'resp_time' not in match_dict:
387 self.order.remove('response_time')
388 # Remove 'clients_all' chart from ORDER if specified in the configuration
389 if not self.all_time:
390 self.order.remove('clients_all')
391 # Add 'detailed_response_codes' chart if specified in the configuration
392 if self.detailed_response_codes:
393 self.order.append('detailed_response_codes')
394 self.definitions['detailed_response_codes'] = {'options': [None, 'Detailed Response Codes', 'requests/s',
395 'responses', 'web_log.detailed_response_codes',
399 # Add 'requests_per_url' chart if specified in the configuration
401 self.definitions['requests_per_url'] = {'options': [None, 'Requests Per Url', 'requests/s',
402 'urls', 'web_log.requests_per_url', 'stacked'],
403 'lines': [['other_url', 'other', 'incremental']]}
404 for elem in self.url_pattern:
405 self.definitions['requests_per_url']['lines'].append([elem.description, elem.description,
407 self.data.update({elem.description: 0})
408 self.data.update({'other_url': 0})
410 self.order.remove('requests_per_url')
412 def add_new_dimension(self, dimension, line_list, chart_string, key):
414 :param dimension: str: response status code. Ex.: '202', '499'
415 :param line_list: list: Ex.: ['202', '202', 'incremental']
416 :param chart_string: Current string we need to pass to netdata to rebuild the chart
417 :param key: str: CHARTS dict key (chart name). Ex.: 'response_time'
418 :return: str: new chart string = previous + new dimensions
420 self.data.update({dimension: 0})
421 # SET method check if dim in _dimensions
422 self._dimensions.append(dimension)
423 # UPDATE method do SET only if dim in definitions
424 self.definitions[key]['lines'].append(line_list)
426 chart += "%s %s\n" % ('DIMENSION', ' '.join(line_list))
430 def _get_access_data(self):
433 :return: dict OR None
434 None if _get_raw_data method fails.
435 In all other cases - dict.
437 raw = self._get_raw_data()
441 request_time, unique_current = list(), list()
442 request_counter = {'count': 0, 'sum': 0}
443 ip_address_counter = {'unique_cur_ip': 0}
445 match = self.regex.search(line)
447 match_dict = match.groupdict()
449 code = ''.join([match_dict['code'][0], 'xx'])
452 self.data['0xx'] += 1
453 # detailed response code
454 if self.detailed_response_codes:
455 self._get_data_detailed_response_codes(match_dict['code'])
457 self._get_data_statuses(match_dict['code'])
460 self._get_data_per_url(match_dict['url'])
461 # requests per http method
462 self._get_data_http_method(match_dict['method'])
464 bytes_sent = match_dict['bytes_sent'] if '-' not in match_dict['bytes_sent'] else 0
465 self.data['bytes_sent'] += int(bytes_sent)
466 # request processing time and bandwidth received
467 if 'resp_length' in match_dict:
468 self.data['resp_length'] += int(match_dict['resp_length'])
469 if 'resp_time' in match_dict:
470 resp_time = self.resp_time_func(float(match_dict['resp_time']))
471 bisect.insort_left(request_time, resp_time)
472 request_counter['count'] += 1
473 request_counter['sum'] += resp_time
474 # requests per ip proto
475 proto = 'ipv4' if '.' in match_dict['address'] else 'ipv6'
476 self.data['req_' + proto] += 1
478 if address_not_in_pool(self.unique_all_time, match_dict['address'],
479 self.data['unique_tot_ipv4'] + self.data['unique_tot_ipv6']):
480 self.data['unique_tot_' + proto] += 1
481 if address_not_in_pool(unique_current, match_dict['address'], ip_address_counter['unique_cur_ip']):
482 self.data['unique_cur_' + proto] += 1
483 ip_address_counter['unique_cur_ip'] += 1
485 self.data['unmatched'] += 1
489 self.data['resp_time_min'] += int(request_time[0])
490 self.data['resp_time_avg'] += int(round(float(request_counter['sum']) / request_counter['count']))
491 self.data['resp_time_max'] += int(request_time[-1])
494 def _get_data_detailed_response_codes(self, code):
496 :param code: str: CODE from parsed line. Ex.: '202, '499'
498 Calls add_new_dimension method If the value is found for the first time
500 if code not in self.data:
501 chart_string_copy = self.detailed_chart
502 self.detailed_chart = self.add_new_dimension(code, [code, code, 'incremental'],
503 chart_string_copy, 'detailed_response_codes')
506 def _get_data_http_method(self, method):
508 :param method: str: METHOD from parsed line. Ex.: 'GET', 'POST'
510 Calls add_new_dimension method If the value is found for the first time
512 if method not in self.data:
513 chart_string_copy = self.http_method_chart
514 self.http_method_chart = self.add_new_dimension(method, [method, method, 'incremental'],
515 chart_string_copy, 'http_method')
516 self.data[method] += 1
518 def _get_data_per_url(self, url):
520 :param url: str: URL from parsed line
522 Scan through string looking for the first location where patterns produce a match for all user
526 for elem in self.url_pattern:
527 if elem.pattern.search(url):
528 self.data[elem.description] += 1
532 self.data['other_url'] += 1
534 def _get_data_statuses(self, code):
536 :param code: str: response status code. Ex.: '202', '499'
540 if code_class == '2' or code == '304' or code_class == '1':
541 self.data['successful_requests'] += 1
542 elif code_class == '3':
543 self.data['redirects'] += 1
544 elif code_class == '4':
545 self.data['bad_requests'] += 1
546 elif code_class == '5':
547 self.data['server_errors'] += 1
549 self.data['other_requests'] += 1
def address_not_in_pool(pool, address, pool_size):
    """
    Check membership in a sorted pool and insert the address when it is new.

    :param pool: list of ip addresses (kept sorted)
    :param address: ip address
    :param pool_size: current pool size
    :return: True if address not in pool (address is inserted). False if address in pool.
    """
    index = bisect.bisect_left(pool, address)
    if index < pool_size:
        if pool[index] == address:
            return False
        bisect.insort_left(pool, address)
        return True
    bisect.insort_left(pool, address)
    return True
def find_regex_return(match_dict=None, log_name=None, msg='Generic error message'):
    """
    Uniform return value for the find_regex / find_regex_custom methods.

    :param match_dict: dict: re.search.groupdict() or None
    :param log_name: str: log name
    :param msg: str: error description
    :return: tuple: (match_dict, log_name, msg)
    """
    return match_dict, log_name, msg
def check_req_per_url_pattern(url_pattern):
    """
    Compile the user's per-URL category patterns.

    :param url_pattern: dict: ex. {'dim1': '<pattern1>', 'dim2': '<pattern2>'}
    :return: list of named tuples or None:
    We need to make sure all patterns are valid regular expressions
    """
    if not is_dict(url_pattern):
        return None

    result = list()

    def is_valid_pattern(pattern):
        """
        :param pattern: str
        :return: re.compile(pattern) or False
        """
        if not isinstance(pattern, str):
            return False
        try:
            compile_pattern = re.compile(pattern)
        except re.error:
            return False
        return compile_pattern

    # keep only (str key, valid regex) pairs; invalid entries are dropped
    for dimension, regex in url_pattern.items():
        valid_pattern = is_valid_pattern(regex)
        if isinstance(dimension, str) and valid_pattern:
            result.append(NAMED_URL_PATTERN(description=dimension, pattern=valid_pattern))

    # None (not an empty list) when nothing survived validation
    return result or None
618 :return: True or False
619 obj can be <dict> or <OrderedDict>
623 except AttributeError: