1 # -*- coding: utf-8 -*-
2 # Description: web log netdata python.d module
5 from base import LogService
8 from os import access, R_OK
9 from os.path import getsize
10 from collections import namedtuple
11 from copy import deepcopy
# Order in which charts are rendered on the netdata dashboard.
ORDER = [
    'response_statuses',
    'response_codes',
    'bandwidth',
    'response_time',
    'requests_per_url',
    'http_method',
    'requests_per_ipproto',
    'clients',
    'clients_all',
]
20 'options': [None, 'Response Codes', 'requests/s', 'responses', 'web_log.response_codes', 'stacked'],
22 ['2xx', '2xx', 'incremental'],
23 ['5xx', '5xx', 'incremental'],
24 ['3xx', '3xx', 'incremental'],
25 ['4xx', '4xx', 'incremental'],
26 ['1xx', '1xx', 'incremental'],
27 ['0xx', 'other', 'incremental'],
28 ['unmatched', 'unmatched', 'incremental']
31 'options': [None, 'Bandwidth', 'KB/s', 'bandwidth', 'web_log.bandwidth', 'area'],
33 ['resp_length', 'received', 'incremental', 1, 1024],
34 ['bytes_sent', 'sent', 'incremental', -1, 1024]
37 'options': [None, 'Processing Time', 'milliseconds', 'timings', 'web_log.response_time', 'area'],
39 ['resp_time_min', 'min', 'incremental', 1, 1000],
40 ['resp_time_max', 'max', 'incremental', 1, 1000],
41 ['resp_time_avg', 'avg', 'incremental', 1, 1000]
44 'options': [None, 'Current Poll Unique Client IPs', 'unique ips', 'clients', 'web_log.clients', 'stacked'],
46 ['unique_cur_ipv4', 'ipv4', 'incremental', 1, 1],
47 ['unique_cur_ipv6', 'ipv6', 'incremental', 1, 1]
50 'options': [None, 'All Time Unique Client IPs', 'unique ips', 'clients', 'web_log.clients_all', 'stacked'],
52 ['unique_tot_ipv4', 'ipv4', 'absolute', 1, 1],
53 ['unique_tot_ipv6', 'ipv6', 'absolute', 1, 1]
56 'options': [None, 'Requests Per HTTP Method', 'requests/s', 'http methods', 'web_log.http_method', 'stacked'],
59 'requests_per_ipproto': {
60 'options': [None, 'Requests Per IP Protocol', 'requests/s', 'ip protocols', 'web_log.requests_per_ipproto',
63 ['req_ipv4', 'ipv4', 'incremental', 1, 1],
64 ['req_ipv6', 'ipv6', 'incremental', 1, 1]
66 'response_statuses': {
67 'options': [None, 'Response Statuses', 'requests/s', 'responses', 'web_log.response_statuses',
70 ['successful_requests', 'success', 'incremental', 1, 1],
71 ['server_errors', 'error', 'incremental', 1, 1],
72 ['redirects', 'redirect', 'incremental', 1, 1],
73 ['bad_requests', 'bad', 'incremental', 1, 1],
74 ['other_requests', 'other', 'incremental', 1, 1]
78 NAMED_URL_PATTERN = namedtuple('URL_PATTERN', ['description', 'pattern'])
class Service(LogService):
    """Web access-log collector for the netdata python.d plugin.

    Tails a web server access log (via the LogService base class),
    classifies each new line with a compiled regex and accumulates
    per-interval counters: response codes/statuses, bandwidth, response
    timings, per-URL / per-method / per-IP-proto requests, unique clients.

    NOTE(review): this copy of the file appears to have lines dropped
    (missing try/else/return statements, some 'def' headers and docstring
    delimiters). Every suspected gap is flagged inline with a
    NOTE(review) comment — confirm each against the upstream module.
    """
    def __init__(self, configuration=None, name=None):
        # self._get_data = None # will be assigned in 'check' method.
        # self.order = None # will be assigned in 'create_*_method' method.
        # self.definitions = None # will be assigned in 'create_*_method' method.
        # self.detailed_chart = None # will be assigned in 'create_*_method' method.
        # self.http_method_chart = None # will be assigned in 'create_*_method' method.
        LogService.__init__(self, configuration=configuration, name=name)
        # Variables from module configuration file
        self.log_path = self.configuration.get('path')
        self.detailed_response_codes = self.configuration.get('detailed_response_codes', True)
        self.all_time = self.configuration.get('all_time', True)
        self.url_pattern = self.configuration.get('categories')  # dict
        self.custom_log_format = self.configuration.get('custom_log_format')  # dict
        self.unique_all_time = list()  # sorted list of unique IPs
        self.regex = None  # will be assigned in 'find_regex' or 'find_regex_custom' method
        self.resp_time_func = None  # will be assigned in 'find_regex' or 'find_regex_custom' method
        # Running totals for every chart dimension; keys match the 'lines'
        # entries of the chart definitions.
        self.data = {'bytes_sent': 0, 'resp_length': 0, 'resp_time_min': 0, 'resp_time_max': 0,
                     'resp_time_avg': 0, 'unique_cur_ipv4': 0, 'unique_cur_ipv6': 0, '2xx': 0,
                     '5xx': 0, '3xx': 0, '4xx': 0, '1xx': 0, '0xx': 0, 'unmatched': 0, 'req_ipv4': 0,
                     'req_ipv6': 0, 'unique_tot_ipv4': 0, 'unique_tot_ipv6': 0, 'successful_requests': 0,
                     'redirects': 0, 'bad_requests': 0, 'server_errors': 0, 'other_requests': 0}

    # NOTE(review): a 'def check(self):' header (the standard python.d check()
    # hook) appears to be missing above this docstring fragment.
        """
        We need to make sure:
        1. "log_path" is specified in the module configuration file
        2. "log_path" must be readable by netdata user and must exist
        3. "log_path' must not be empty. We need at least 1 line to find appropriate pattern to parse
        4. Plugin can work using predefined patterns (OK for nginx, apache default log format) or user defined
        pattern. So we need to check if we can parse last line from log file with user pattern OR module patterns.
        5. All patterns for per_url_request_counter feature are valid regex expressions
        """
        if not self.log_path:
            self.error('log path is not specified')
            # NOTE(review): 'return False' appears to be missing here
        if not access(self.log_path, R_OK):
            self.error('%s not readable or not exist' % self.log_path)
            # NOTE(review): 'return False' appears to be missing here
        if not getsize(self.log_path):
            self.error('%s is empty' % self.log_path)
            # NOTE(review): 'return False' appears to be missing here
        # Read last line (or first if there is only one line)
        with open(self.log_path, 'rb') as logs:
            # NOTE(review): the initial seek towards EOF and the loop body
            # (seek back / tell()==0 break) appear to be missing here.
            while logs.read(1) != b'\n':
            last_line = logs.readline()
        # NOTE(review): a 'try:' wrapper for the decode below appears missing,
        # along with an inner 'try:' and trailing 'return False'.
            last_line = last_line.decode()
        except UnicodeDecodeError:
                last_line = last_line.decode(encoding='utf-8')
            except (TypeError, UnicodeDecodeError) as error:
                self.error(str(error))
        # Custom_log_format or predefined log format.
        if self.custom_log_format:
            match_dict, log_name, error = self.find_regex_custom(last_line)
        # NOTE(review): an 'else:' line appears to be missing here
            match_dict, log_name, error = self.find_regex(last_line)
        # "match_dict" is None if there are any problems
        if match_dict is None:
            self.error(str(error))
            # NOTE(review): 'return False' appears to be missing here
        # self.url_pattern check
        # NOTE(review): an 'if self.url_pattern:' guard presumably wrapped this
        self.url_pattern = check_req_per_url_pattern(self.url_pattern)
        if not (self.regex and self.resp_time_func):
            self.error('That can not happen, but it happened. "regex" or "resp_time_func" is None')
            # NOTE(review): 'return False' appears to be missing here
        # All is ok. We are about to start.
        if log_name == 'web_access':
            self.create_access_charts(match_dict)  # Create charts
            self._get_data = self._get_access_data
            self.info('Collected data: %s' % list(match_dict.keys()))
            # NOTE(review): 'return True' appears to be missing here
        # If it's not access_logs.. Not used at the moment

    def find_regex_custom(self, last_line):
        """
        :param last_line: str: literally last line from log file
        :return: tuple where:
        [0]: dict or None: match_dict or None
        [1]: str or None: log_name or None
        [2]: str: error description

        We are here only if "custom_log_format" is in logs. We need to make sure:
        1. "custom_log_format" is a dict
        2. "pattern" in "custom_log_format" and pattern is <str> instance
        3. if "time_multiplier" is in "custom_log_format" it must be <int> instance

        If all parameters is ok we need to make sure:
        1. Pattern search is success
        2. Pattern search contains named subgroups (?P<subgroup_name>) (= "match_dict")

        If pattern search is success we need to make sure:
        1. All mandatory keys ['address', 'code', 'bytes_sent', 'method', 'url'] are in "match_dict"

        If this is True we need to make sure:
        1. All mandatory key values from "match_dict" have the correct format
        ("code" is integer, "method" is uppercase word, etc)

        If non mandatory keys in "match_dict" we need to make sure:
        1. All non mandatory key values from match_dict ['resp_length', 'resp_time'] have the correct format
        ("resp_length" is integer or "-", "resp_time" is integer or float)
        """
        if not is_dict(self.custom_log_format):
            return find_regex_return(msg='Custom log: "custom_log_format" is not a <dict>')
        pattern = self.custom_log_format.get('pattern')
        if not (pattern and isinstance(pattern, str)):
            return find_regex_return(msg='Custom log: "pattern" option is not specified or type is not <str>')
        # 0 means "not configured"; the dot_in_time branch below substitutes a default.
        resp_time_func = self.custom_log_format.get('time_multiplier') or 0
        if not isinstance(resp_time_func, int):
            return find_regex_return(msg='Custom log: "time_multiplier" is not an integer')
        # NOTE(review): a 'try:' line appears to be missing here
            regex = re.compile(pattern)
        except re.error as error:
            return find_regex_return(msg='Pattern compile error: %s' % str(error))
        match = regex.search(last_line)
        # NOTE(review): an 'if match:' guard (and matching 'else:') appears to
        # be missing around the two lines below
            match_dict = match.groupdict() or None
            return find_regex_return(msg='Custom log: pattern search FAILED')
        if match_dict is None:
            # NOTE(review): the 'return' keyword appears to be missing — as
            # written, the error tuple would be built and silently discarded.
            find_regex_return(msg='Custom log: search OK but contains no named subgroups'
                              ' (you need to use ?P<subgroup_name>)')
        # NOTE(review): the docstring lists 'method' and 'url' as mandatory,
        # but only three keys are validated here — entries may be missing.
        mandatory_dict = {'address': r'[\da-f.:]+',
                          'code': r'[1-9]\d{2}',
                          'bytes_sent': r'\d+|-'}
        optional_dict = {'resp_length': r'\d+',
                         'resp_time': r'[\d.]+'}

        mandatory_values = set(mandatory_dict) - set(match_dict)
        # NOTE(review): an 'if mandatory_values:' guard appears to be missing
            return find_regex_return(msg='Custom log: search OK but some mandatory keys (%s) are missing'
                                     % list(mandatory_values))
        for key in mandatory_dict:
            if not re.search(mandatory_dict[key], match_dict[key]):
                return find_regex_return(msg='Custom log: can\'t parse "%s": %s'
                                         % (key, match_dict[key]))

        optional_values = set(optional_dict) & set(match_dict)
        for key in optional_values:
            if not re.search(optional_dict[key], match_dict[key]):
                return find_regex_return(msg='Custom log: can\'t parse "%s": %s'
                                         % (key, match_dict[key]))

        # A '.' in resp_time suggests seconds as float (multiplier defaults to
        # 1000000 -> microseconds); otherwise the unit is taken as-is.
        dot_in_time = '.' in match_dict.get('resp_time', '')
        # NOTE(review): the 'if dot_in_time:' / 'else:' lines appear missing
            self.resp_time_func = lambda time: time * (resp_time_func or 1000000)
            self.resp_time_func = lambda time: time * (resp_time_func or 1)
        # NOTE(review): a 'self.regex = regex' assignment appears to be missing
        return find_regex_return(match_dict=match_dict,
                                 log_name='web_access')

    def find_regex(self, last_line):
        """
        :param last_line: str: literally last line from log file
        :return: tuple where:
        [0]: dict or None: match_dict or None
        [1]: str or None: log_name or None
        [2]: str: error description

        We need to find appropriate pattern for current log file
        All logic is do a regex search through the string for all predefined patterns
        until we find something or fail.
        """
        # REGEX: 1.IPv4 address 2.HTTP method 3. URL 4. Response code
        # 5. Bytes sent 6. Response length 7. Response process time
        # NOTE(review): the URL capture-group line appears to be missing from
        # each of the compiled patterns below — confirm against upstream.
        acs_default = re.compile(r'(?P<address>[\da-f.:]+)'
                                 r' -.*?"(?P<method>[A-Z]+)'
                                 r' (?P<code>[1-9]\d{2})'
                                 r' (?P<bytes_sent>\d+|-)')

        acs_apache_ext_insert = re.compile(r'(?P<address>[\da-f.:]+)'
                                           r' -.*?"(?P<method>[A-Z]+)'
                                           r' (?P<code>[1-9]\d{2})'
                                           r' (?P<bytes_sent>\d+|-)'
                                           r' (?P<resp_length>\d+)'
                                           r' (?P<resp_time>\d+) ')

        acs_apache_ext_append = re.compile(r'(?P<address>[\da-f.:]+)'
                                           r' -.*?"(?P<method>[A-Z]+)'
                                           r' (?P<code>[1-9]\d{2})'
                                           r' (?P<bytes_sent>\d+|-)'
                                           r' (?P<resp_length>\d+)'
                                           r' (?P<resp_time>\d+)'

        acs_nginx_ext_insert = re.compile(r'(?P<address>[\da-f.:]+)'
                                          r' -.*?"(?P<method>[A-Z]+)'
                                          r' (?P<code>[1-9]\d{2})'
                                          r' (?P<bytes_sent>\d+)'
                                          r' (?P<resp_length>\d+)'
                                          r' (?P<resp_time>\d\.\d+) ')

        acs_nginx_ext_append = re.compile(r'(?P<address>[\da-f.:]+)'
                                          r' -.*?"(?P<method>[A-Z]+)'
                                          r' (?P<code>[1-9]\d{2})'
                                          r' (?P<bytes_sent>\d+)'
                                          r' (?P<resp_length>\d+)'
                                          r' (?P<resp_time>\d\.\d+)')

        # NOTE(review): 'def func_usec(time):' and 'def func_sec(time):'
        # helper headers appear to be missing (func_sec is referenced below).
            return time * 1000000

        r_regex = [acs_apache_ext_insert, acs_apache_ext_append, acs_nginx_ext_insert,
                   acs_nginx_ext_append, acs_default]
        # Parallel list: time-unit converter matching each pattern above
        # (apache logs response time in usec, nginx in float seconds).
        r_function = [func_usec, func_usec, func_sec, func_sec, func_usec]
        regex_function = zip(r_regex, r_function)

        # First predefined pattern that matches the sample line wins.
        for regex, function in regex_function:
            match = regex.search(last_line)
            # NOTE(review): an 'if match:' guard (with 'self.regex = regex'
            # and a 'break') appears to be missing here
                self.resp_time_func = function
                match_dict = match.groupdict()

        return find_regex_return(match_dict=match_dict or None,
                                 log_name='web_access',
                                 msg='Unknown log format. You need to use "custom_log_format" feature.')

    def create_access_charts(self, match_dict):
        """
        :param match_dict: dict: regex.search.groupdict(). Ex. {'address': '127.0.0.1', 'code': '200', 'method': 'GET'}
        :return:
        Create additional charts depending on the 'match_dict' keys and configuration file options
        1. 'time_response' chart is removed if there is no 'resp_time' in match_dict.
        2. Other stuff is just remove/add chart depending on yes/no in conf
        """
        def find_job_name(override_name, name):
            """
            :param override_name: str: 'name' var from configuration file
            :param name: str: 'job_name' from configuration file
            :return: str: new job name
            We need this for dynamic charts. Actually same logic as in python.d.plugin.
            """
            add_to_name = override_name or name
            # NOTE(review): an 'if add_to_name:' guard (and its else branch)
            # appears to be missing around the return below
                return '_'.join(['web_log', re.sub('\s+', '_', add_to_name)])

        self.order = ORDER[:]
        self.definitions = deepcopy(CHARTS)

        job_name = find_job_name(self.override_name, self.name)
        # Raw CHART command strings for the two dynamically-dimensioned charts;
        # add_new_dimension() appends DIMENSION lines to these at runtime.
        self.detailed_chart = 'CHART %s.detailed_response_codes ""' \
                              ' "Detailed Response Codes" requests/s responses' \
                              ' web_log.detailed_response_codes stacked 1 %s\n' % (job_name, self.update_every)
        self.http_method_chart = 'CHART %s.http_method' \
                                 ' "" "Requests Per HTTP Method" requests/s "http methods"' \
                                 ' web_log.http_method stacked 2 %s\n' % (job_name, self.update_every)

        # Remove 'request_time' chart from ORDER if resp_time not in match_dict
        if 'resp_time' not in match_dict:
            self.order.remove('response_time')
        # Remove 'clients_all' chart from ORDER if specified in the configuration
        if not self.all_time:
            self.order.remove('clients_all')
        # Add 'detailed_response_codes' chart if specified in the configuration
        if self.detailed_response_codes:
            self.order.append('detailed_response_codes')
            self.definitions['detailed_response_codes'] = {'options': [None, 'Detailed Response Codes', 'requests/s',
                                                                       'responses', 'web_log.detailed_response_codes',
            # NOTE(review): the tail of this chart definition (chart type and
            # the 'lines' list) appears to be missing here

        # Add 'requests_per_url' chart if specified in the configuration
        # NOTE(review): an 'if self.url_pattern:' guard appears to be missing
            self.definitions['requests_per_url'] = {'options': [None, 'Requests Per Url', 'requests/s',
                                                                'urls', 'web_log.requests_per_url', 'stacked'],
                                                    'lines': [['pur_other', 'other', 'incremental']]}
            for elem in self.url_pattern:
                # elem.description is 'pur_<dim>'; [4:] strips the prefix for display.
                self.definitions['requests_per_url']['lines'].append([elem.description, elem.description[4:],
                # NOTE(review): the closing of this append call appears missing
                self.data.update({elem.description: 0})
            self.data.update({'pur_other': 0})
        # NOTE(review): an 'else:' line appears missing before the removal below
            self.order.remove('requests_per_url')

    def add_new_dimension(self, dimension, line_list, chart_string, key):
        """
        :param dimension: str: response status code. Ex.: '202', '499'
        :param line_list: list: Ex.: ['202', '202', 'incremental']
        :param chart_string: Current string we need to pass to netdata to rebuild the chart
        :param key: str: CHARTS dict key (chart name). Ex.: 'response_time'
        :return: str: new chart string = previous + new dimensions
        """
        self.data.update({dimension: 0})
        # SET method check if dim in _dimensions
        self._dimensions.append(dimension)
        # UPDATE method do SET only if dim in definitions
        self.definitions[key]['lines'].append(line_list)
        # NOTE(review): 'chart = chart_string' appears to be missing before the
        # '+=' below, and the trailing emit/'return chart' lines are absent.
        chart += "%s %s\n" % ('DIMENSION', ' '.join(line_list))

    def _get_access_data(self):
        """
        Parse all new log lines and update self.data counters.
        :return: dict OR None
        None if _get_raw_data method fails.
        In all other cases - dict.
        """
        raw = self._get_raw_data()
        # NOTE(review): an 'if raw is None: return None' guard appears missing.

        request_time, unique_current = list(), list()
        request_counter = {'count': 0, 'sum': 0}
        ip_address_counter = {'unique_cur_ip': 0}
        # NOTE(review): a 'for line in raw:' loop header appears to be missing.
            match = self.regex.search(line)
            # NOTE(review): an 'if match:' guard appears to be missing here
                match_dict = match.groupdict()
                # NOTE(review): a 'try:' line (with 'self.data[code] += 1'
                # before the except) appears to be missing here
                    code = ''.join([match_dict['code'][0], 'xx'])
                    self.data['0xx'] += 1
                # detailed response code
                if self.detailed_response_codes:
                    self._get_data_detailed_response_codes(match_dict['code'])
                # response statuses (success/redirect/bad/error buckets)
                self._get_data_statuses(match_dict['code'])
                # NOTE(review): an 'if self.url_pattern:' guard appears missing
                    self._get_data_per_url(match_dict['url'])
                # requests per http method
                self._get_data_http_method(match_dict['method'])
                # '-' means "no bytes sent" in common log format
                bytes_sent = match_dict['bytes_sent'] if '-' not in match_dict['bytes_sent'] else 0
                self.data['bytes_sent'] += int(bytes_sent)
                # request processing time and bandwidth received
                if 'resp_length' in match_dict:
                    self.data['resp_length'] += int(match_dict['resp_length'])
                if 'resp_time' in match_dict:
                    resp_time = self.resp_time_func(float(match_dict['resp_time']))
                    # keep request_time sorted so min/max are its ends
                    bisect.insort_left(request_time, resp_time)
                    request_counter['count'] += 1
                    request_counter['sum'] += resp_time
                # requests per ip proto
                proto = 'ipv4' if '.' in match_dict['address'] else 'ipv6'
                self.data['req_' + proto] += 1
                # unique clients ips
                # NOTE(review): an 'if self.all_time:' guard appears missing
                if address_not_in_pool(self.unique_all_time, match_dict['address'],
                                       self.data['unique_tot_ipv4'] + self.data['unique_tot_ipv6']):
                    self.data['unique_tot_' + proto] += 1
                if address_not_in_pool(unique_current, match_dict['address'], ip_address_counter['unique_cur_ip']):
                    self.data['unique_cur_' + proto] += 1
                    ip_address_counter['unique_cur_ip'] += 1
            # NOTE(review): an 'else:' line appears missing before this bump
                self.data['unmatched'] += 1

        # NOTE(review): an 'if request_time:' guard appears missing around the
        # timing aggregation, and a final 'return self.data' is absent.
            self.data['resp_time_min'] += int(request_time[0])
            self.data['resp_time_avg'] += int(round(float(request_counter['sum']) / request_counter['count']))
            self.data['resp_time_max'] += int(request_time[-1])

    def _get_data_detailed_response_codes(self, code):
        """
        :param code: str: CODE from parsed line. Ex.: '202, '499'
        :return:
        Calls add_new_dimension method If the value is found for the first time
        """
        if code not in self.data:
            chart_string_copy = self.detailed_chart
            self.detailed_chart = self.add_new_dimension(code, [code, code, 'incremental'],
                                                         chart_string_copy, 'detailed_response_codes')
        # NOTE(review): 'self.data[code] += 1' appears to be missing here
        # (compare _get_data_http_method below).

    def _get_data_http_method(self, method):
        """
        :param method: str: METHOD from parsed line. Ex.: 'GET', 'POST'
        :return:
        Calls add_new_dimension method If the value is found for the first time
        """
        if method not in self.data:
            chart_string_copy = self.http_method_chart
            self.http_method_chart = self.add_new_dimension(method, [method, method, 'incremental'],
                                                            chart_string_copy, 'http_method')
        self.data[method] += 1

    def _get_data_per_url(self, url):
        """
        :param url: str: URL from parsed line
        :return:
        Scan through string looking for the first location where patterns produce a match for all user
        defined patterns
        """
        # NOTE(review): a 'match = False' flag initialiser appears missing.
        for elem in self.url_pattern:
            if elem.pattern.search(url):
                self.data[elem.description] += 1
        # NOTE(review): 'match = True' / 'if not match:' lines appear missing
        # before this fallback bump of the catch-all dimension.
            self.data['pur_other'] += 1

    def _get_data_statuses(self, code):
        """
        :param code: str: response status code. Ex.: '202', '499'
        :return:
        """
        # NOTE(review): a "code_class = code[0]" assignment appears missing.
        if code_class == '2' or code == '304' or code_class == '1':
            self.data['successful_requests'] += 1
        elif code_class == '3':
            self.data['redirects'] += 1
        elif code_class == '4':
            self.data['bad_requests'] += 1
        elif code_class == '5':
            self.data['server_errors'] += 1
        # NOTE(review): an 'else:' line appears missing before this bump
            self.data['other_requests'] += 1
def address_not_in_pool(pool, address, pool_size):
    """Binary-search the sorted pool for the address; add it when absent.

    :param pool: list of ip addresses
    :param address: ip address
    :param pool_size: current pool size
    :return: True if address not in pool. False if address in pool.
    """
    index = bisect.bisect_left(pool, address)
    if index < pool_size:
        if pool[index] == address:
            # NOTE(review): a 'return False' appears to be missing here, and
            # the insert below was presumably in an 'else:' with 'return True'.
            bisect.insort_left(pool, address)
        # NOTE(review): an 'else:' + 'return True' pair appears missing here too
        bisect.insort_left(pool, address)
def find_regex_return(match_dict=None, log_name=None, msg='Generic error message'):
    """Build the uniform 3-tuple returned by the log-format detection helpers.

    Fix: the parameter documentation was present as bare text without
    triple-quote delimiters (a syntax error); it is restored as a docstring.
    Behavior is unchanged.

    :param match_dict: dict: re.search(...).groupdict() or None
    :param log_name: str: log name (e.g. 'web_access') or None
    :param msg: str: error description (ignored by callers on success)
    :return: tuple: (match_dict, log_name, msg)
    """
    return match_dict, log_name, msg
def check_req_per_url_pattern(url_pattern):
    """Validate the user-supplied 'categories' mapping and compile its regexes.

    :param url_pattern: dict: ex. {'dim1': 'pattern1>', 'dim2': '<pattern2>'}
    :return: list of named tuples or None:
    We need to make sure all patterns are valid regular expressions
    """
    if not is_dict(url_pattern):
        # NOTE(review): a 'return None' appears to be missing here; a
        # 'result = list()' initialiser is also absent (result is used below).

    def is_valid_pattern(pattern):
        """
        :return: re.compile(pattern) or None
        """
        if not isinstance(pattern, str):
            # NOTE(review): a 'return None' and the 'try/except re.error'
            # wrapper around re.compile appear to be missing here
            compile_pattern = re.compile(pattern)
        return compile_pattern

    for dimension, regex in url_pattern.items():
        valid_pattern = is_valid_pattern(regex)
        if isinstance(dimension, str) and valid_pattern:
            # Dimension ids get a 'pur_' (per-url-requests) prefix to avoid
            # clashing with other self.data keys.
            result.append(NAMED_URL_PATTERN(description='_'.join(['pur', dimension]), pattern=valid_pattern))
    return result or None
612 :return: True or False
613 obj can be <dict> or <OrderedDict>
617 except AttributeError: