1 # -*- coding: utf-8 -*-
2 # Description: web log netdata python.d module
import bisect
import re

from collections import namedtuple
from copy import deepcopy
from os import access, R_OK
from os.path import getsize

from base import LogService
16 ORDER = ['response_statuses', 'response_codes', 'bandwidth', 'response_time', 'requests_per_url', 'http_method',
17 'requests_per_ipproto', 'clients', 'clients_all']
20 'options': [None, 'Response Codes', 'requests/s', 'responses', 'web_log.response_codes', 'stacked'],
22 ['2xx', '2xx', 'incremental'],
23 ['5xx', '5xx', 'incremental'],
24 ['3xx', '3xx', 'incremental'],
25 ['4xx', '4xx', 'incremental'],
26 ['1xx', '1xx', 'incremental'],
27 ['0xx', 'other', 'incremental'],
28 ['unmatched', 'unmatched', 'incremental']
31 'options': [None, 'Bandwidth', 'KB/s', 'bandwidth', 'web_log.bandwidth', 'area'],
33 ['resp_length', 'received', 'incremental', 1, 1024],
34 ['bytes_sent', 'sent', 'incremental', -1, 1024]
37 'options': [None, 'Processing Time', 'milliseconds', 'timings', 'web_log.response_time', 'area'],
39 ['resp_time_min', 'min', 'incremental', 1, 1000],
40 ['resp_time_max', 'max', 'incremental', 1, 1000],
41 ['resp_time_avg', 'avg', 'incremental', 1, 1000]
44 'options': [None, 'Current Poll Unique Client IPs', 'unique ips', 'clients', 'web_log.clients', 'stacked'],
46 ['unique_cur_ipv4', 'ipv4', 'incremental', 1, 1],
47 ['unique_cur_ipv6', 'ipv6', 'incremental', 1, 1]
50 'options': [None, 'All Time Unique Client IPs', 'unique ips', 'clients', 'web_log.clients_all', 'stacked'],
52 ['unique_tot_ipv4', 'ipv4', 'absolute', 1, 1],
53 ['unique_tot_ipv6', 'ipv6', 'absolute', 1, 1]
56 'options': [None, 'Requests Per HTTP Method', 'requests/s', 'http methods', 'web_log.http_method', 'stacked'],
58 ['GET', 'GET', 'incremental', 1, 1]
60 'requests_per_ipproto': {
61 'options': [None, 'Requests Per IP Protocol', 'requests/s', 'ip protocols', 'web_log.requests_per_ipproto',
64 ['req_ipv4', 'ipv4', 'incremental', 1, 1],
65 ['req_ipv6', 'ipv6', 'incremental', 1, 1]
67 'response_statuses': {
68 'options': [None, 'Response Statuses', 'requests/s', 'responses', 'web_log.response_statuses',
71 ['successful_requests', 'success', 'incremental', 1, 1],
72 ['server_errors', 'error', 'incremental', 1, 1],
73 ['redirects', 'redirect', 'incremental', 1, 1],
74 ['bad_requests', 'bad', 'incremental', 1, 1],
75 ['other_requests', 'other', 'incremental', 1, 1]
79 NAMED_URL_PATTERN = namedtuple('URL_PATTERN', ['description', 'pattern'])
82 class Service(LogService):
83 def __init__(self, configuration=None, name=None):
87 # self._get_data = None # will be assigned in 'check' method.
88 # self.order = None # will be assigned in 'create_*_method' method.
89 # self.definitions = None # will be assigned in 'create_*_method' method.
90 # self.detailed_chart = None # will be assigned in 'create_*_method' method.
91 # self.http_method_chart = None # will be assigned in 'create_*_method' method.
93 LogService.__init__(self, configuration=configuration, name=name)
94 # Variables from module configuration file
95 self.log_path = self.configuration.get('path')
96 self.detailed_response_codes = self.configuration.get('detailed_response_codes', True)
97 self.all_time = self.configuration.get('all_time', True)
98 self.url_pattern = self.configuration.get('categories') # dict
99 self.custom_log_format = self.configuration.get('custom_log_format') # dict
101 self.unique_all_time = list() # sorted list of unique IPs
102 self.regex = None # will be assigned in 'find_regex' or 'find_regex_custom' method
103 self.resp_time_func = None # will be assigned in 'find_regex' or 'find_regex_custom' method
104 self.data = {'bytes_sent': 0, 'resp_length': 0, 'resp_time_min': 0, 'resp_time_max': 0,
105 'resp_time_avg': 0, 'unique_cur_ipv4': 0, 'unique_cur_ipv6': 0, '2xx': 0,
106 '5xx': 0, '3xx': 0, '4xx': 0, '1xx': 0, '0xx': 0, 'unmatched': 0, 'req_ipv4': 0,
107 'req_ipv6': 0, 'unique_tot_ipv4': 0, 'unique_tot_ipv6': 0, 'successful_requests': 0,
108 'redirects': 0, 'bad_requests': 0, 'server_errors': 0, 'other_requests': 0, 'GET': 0}
114 We need to make sure:
115 1. "log_path" is specified in the module configuration file
116 2. "log_path" must be readable by netdata user and must exist
117 3. "log_path' must not be empty. We need at least 1 line to find appropriate pattern to parse
118 4. Plugin can work using predefined patterns (OK for nginx, apache default log format) or user defined
119 pattern. So we need to check if we can parse last line from log file with user pattern OR module patterns.
120 5. All patterns for per_url_request_counter feature are valid regex expressions
122 if not self.log_path:
123 self.error('log path is not specified')
126 if not access(self.log_path, R_OK):
127 self.error('%s not readable or not exist' % self.log_path)
130 if not getsize(self.log_path):
131 self.error('%s is empty' % self.log_path)
134 # Read last line (or first if there is only one line)
135 with open(self.log_path, 'rb') as logs:
137 while logs.read(1) != b'\n':
141 last_line = logs.readline()
144 last_line = last_line.decode()
145 except UnicodeDecodeError:
147 last_line = last_line.decode(encoding='utf-8')
148 except (TypeError, UnicodeDecodeError) as error:
149 self.error(str(error))
152 # Custom_log_format or predefined log format.
153 if self.custom_log_format:
154 match_dict, log_name, error = self.find_regex_custom(last_line)
156 match_dict, log_name, error = self.find_regex(last_line)
158 # "match_dict" is None if there are any problems
159 if match_dict is None:
160 self.error(str(error))
163 # self.url_pattern check
165 self.url_pattern = check_req_per_url_pattern(self.url_pattern)
168 if not (self.regex and self.resp_time_func):
169 self.error('That can not happen, but it happened. "regex" or "resp_time_func" is None')
171 # All is ok. We are about to start.
172 if log_name == 'web_access':
173 self.create_access_charts(match_dict) # Create charts
174 self._get_data = self._get_access_data
175 self.info('Collected data: %s' % list(match_dict.keys()))
178 # If it's not access_logs.. Not used at the moment
181 def find_regex_custom(self, last_line):
183 :param last_line: str: literally last line from log file
184 :return: tuple where:
185 [0]: dict or None: match_dict or None
186 [1]: str or None: log_name or None
187 [2]: str: error description
189 We are here only if "custom_log_format" is in logs. We need to make sure:
190 1. "custom_log_format" is a dict
191 2. "pattern" in "custom_log_format" and pattern is <str> instance
192 3. if "time_multiplier" is in "custom_log_format" it must be <int> instance
194 If all parameters is ok we need to make sure:
195 1. Pattern search is success
196 2. Pattern search contains named subgroups (?P<subgroup_name>) (= "match_dict")
198 If pattern search is success we need to make sure:
199 1. All mandatory keys ['address', 'code', 'bytes_sent', 'method', 'url'] are in "match_dict"
201 If this is True we need to make sure:
202 1. All mandatory key values from "match_dict" have the correct format
203 ("code" is integer, "method" is uppercase word, etc)
205 If non mandatory keys in "match_dict" we need to make sure:
206 1. All non mandatory key values from match_dict ['resp_length', 'resp_time'] have the correct format
207 ("resp_length" is integer or "-", "resp_time" is integer or float)
210 if not is_dict(self.custom_log_format):
211 return find_regex_return(msg='Custom log: "custom_log_format" is not a <dict>')
213 pattern = self.custom_log_format.get('pattern')
214 if not (pattern and isinstance(pattern, str)):
215 return find_regex_return(msg='Custom log: "pattern" option is not specified or type is not <str>')
217 resp_time_func = self.custom_log_format.get('time_multiplier') or 0
219 if not isinstance(resp_time_func, int):
220 return find_regex_return(msg='Custom log: "time_multiplier" is not an integer')
223 regex = re.compile(pattern)
224 except re.error as error:
225 return find_regex_return(msg='Pattern compile error: %s' % str(error))
227 match = regex.search(last_line)
229 match_dict = match.groupdict() or None
231 return find_regex_return(msg='Custom log: pattern search FAILED')
233 if match_dict is None:
234 find_regex_return(msg='Custom log: search OK but contains no named subgroups'
235 ' (you need to use ?P<subgroup_name>)')
237 mandatory_dict = {'address': r'[\da-f.:]+',
238 'code': r'[1-9]\d{2}',
240 'bytes_sent': r'\d+|-'}
241 optional_dict = {'resp_length': r'\d+',
242 'resp_time': r'[\d.]+'}
244 mandatory_values = set(mandatory_dict) - set(match_dict)
246 return find_regex_return(msg='Custom log: search OK but some mandatory keys (%s) are missing'
247 % list(mandatory_values))
249 for key in mandatory_dict:
250 if not re.search(mandatory_dict[key], match_dict[key]):
251 return find_regex_return(msg='Custom log: can\'t parse "%s": %s'
252 % (key, match_dict[key]))
254 optional_values = set(optional_dict) & set(match_dict)
255 for key in optional_values:
256 if not re.search(optional_dict[key], match_dict[key]):
257 return find_regex_return(msg='Custom log: can\'t parse "%s": %s'
258 % (key, match_dict[key]))
260 dot_in_time = '.' in match_dict.get('resp_time', '')
262 self.resp_time_func = lambda time: time * (resp_time_func or 1000000)
264 self.resp_time_func = lambda time: time * (resp_time_func or 1)
267 return find_regex_return(match_dict=match_dict,
268 log_name='web_access')
270 def find_regex(self, last_line):
272 :param last_line: str: literally last line from log file
273 :return: tuple where:
274 [0]: dict or None: match_dict or None
275 [1]: str or None: log_name or None
276 [2]: str: error description
277 We need to find appropriate pattern for current log file
278 All logic is do a regex search through the string for all predefined patterns
279 until we find something or fail.
281 # REGEX: 1.IPv4 address 2.HTTP method 3. URL 4. Response code
282 # 5. Bytes sent 6. Response length 7. Response process time
283 acs_default = re.compile(r'(?P<address>[\da-f.:]+)'
284 r' -.*?"(?P<method>[A-Z]+)'
286 r' (?P<code>[1-9]\d{2})'
287 r' (?P<bytes_sent>\d+|-)')
289 acs_apache_ext_insert = re.compile(r'(?P<address>[\da-f.:]+)'
290 r' -.*?"(?P<method>[A-Z]+)'
292 r' (?P<code>[1-9]\d{2})'
293 r' (?P<bytes_sent>\d+|-)'
294 r' (?P<resp_length>\d+)'
295 r' (?P<resp_time>\d+) ')
297 acs_apache_ext_append = re.compile(r'(?P<address>[\da-f.:]+)'
298 r' -.*?"(?P<method>[A-Z]+)'
300 r' (?P<code>[1-9]\d{2})'
301 r' (?P<bytes_sent>\d+|-)'
303 r' (?P<resp_length>\d+)'
304 r' (?P<resp_time>\d+)'
307 acs_nginx_ext_insert = re.compile(r'(?P<address>[\da-f.:]+)'
308 r' -.*?"(?P<method>[A-Z]+)'
310 r' (?P<code>[1-9]\d{2})'
311 r' (?P<bytes_sent>\d+)'
312 r' (?P<resp_length>\d+)'
313 r' (?P<resp_time>\d\.\d+) ')
315 acs_nginx_ext_append = re.compile(r'(?P<address>[\da-f.:]+)'
316 r' -.*?"(?P<method>[A-Z]+)'
318 r' (?P<code>[1-9]\d{2})'
319 r' (?P<bytes_sent>\d+)'
321 r' (?P<resp_length>\d+)'
322 r' (?P<resp_time>\d\.\d+)')
328 return time * 1000000
330 r_regex = [acs_apache_ext_insert, acs_apache_ext_append, acs_nginx_ext_insert,
331 acs_nginx_ext_append, acs_default]
332 r_function = [func_usec, func_usec, func_sec, func_sec, func_usec]
333 regex_function = zip(r_regex, r_function)
336 for regex, function in regex_function:
337 match = regex.search(last_line)
340 self.resp_time_func = function
341 match_dict = match.groupdict()
344 return find_regex_return(match_dict=match_dict or None,
345 log_name='web_access',
346 msg='Unknown log format. You need to use "custom_log_format" feature.')
348 def create_access_charts(self, match_dict):
350 :param match_dict: dict: regex.search.groupdict(). Ex. {'address': '127.0.0.1', 'code': '200', 'method': 'GET'}
352 Create additional charts depending on the 'match_dict' keys and configuration file options
353 1. 'time_response' chart is removed if there is no 'resp_time' in match_dict.
354 2. Other stuff is just remove/add chart depending on yes/no in conf
356 def find_job_name(override_name, name):
358 :param override_name: str: 'name' var from configuration file
359 :param name: str: 'job_name' from configuration file
360 :return: str: new job name
361 We need this for dynamic charts. Actually same logic as in python.d.plugin.
363 add_to_name = override_name or name
365 return '_'.join(['web_log', re.sub('\s+', '_', add_to_name)])
369 self.order = ORDER[:]
370 self.definitions = deepcopy(CHARTS)
372 job_name = find_job_name(self.override_name, self.name)
373 self.detailed_chart = 'CHART %s.detailed_response_codes ""' \
374 ' "Detailed Response Codes" requests/s responses' \
375 ' web_log.detailed_response_codes stacked 1 %s\n' % (job_name, self.update_every)
376 self.http_method_chart = 'CHART %s.http_method' \
377 ' "" "Requests Per HTTP Method" requests/s "http methods"' \
378 ' web_log.http_method stacked 2 %s\n' \
379 'DIMENSION GET GET incremental\n' % (job_name, self.update_every)
381 # Remove 'request_time' chart from ORDER if resp_time not in match_dict
382 if 'resp_time' not in match_dict:
383 self.order.remove('response_time')
384 # Remove 'clients_all' chart from ORDER if specified in the configuration
385 if not self.all_time:
386 self.order.remove('clients_all')
387 # Add 'detailed_response_codes' chart if specified in the configuration
388 if self.detailed_response_codes:
389 self.order.append('detailed_response_codes')
390 self.definitions['detailed_response_codes'] = {'options': [None, 'Detailed Response Codes', 'requests/s',
391 'responses', 'web_log.detailed_response_codes',
395 # Add 'requests_per_url' chart if specified in the configuration
397 self.definitions['requests_per_url'] = {'options': [None, 'Requests Per Url', 'requests/s',
398 'urls', 'web_log.requests_per_url', 'stacked'],
399 'lines': [['pur_other', 'other', 'incremental']]}
400 for elem in self.url_pattern:
401 self.definitions['requests_per_url']['lines'].append([elem.description, elem.description[4:],
403 self.data.update({elem.description: 0})
404 self.data.update({'pur_other': 0})
406 self.order.remove('requests_per_url')
408 def add_new_dimension(self, dimension, line_list, chart_string, key):
410 :param dimension: str: response status code. Ex.: '202', '499'
411 :param line_list: list: Ex.: ['202', '202', 'incremental']
412 :param chart_string: Current string we need to pass to netdata to rebuild the chart
413 :param key: str: CHARTS dict key (chart name). Ex.: 'response_time'
414 :return: str: new chart string = previous + new dimensions
416 self.data.update({dimension: 0})
417 # SET method check if dim in _dimensions
418 self._dimensions.append(dimension)
419 # UPDATE method do SET only if dim in definitions
420 self.definitions[key]['lines'].append(line_list)
422 chart += "%s %s\n" % ('DIMENSION', ' '.join(line_list))
426 def _get_access_data(self):
429 :return: dict OR None
430 None if _get_raw_data method fails.
431 In all other cases - dict.
433 raw = self._get_raw_data()
437 request_time, unique_current = list(), list()
438 request_counter = {'count': 0, 'sum': 0}
439 ip_address_counter = {'unique_cur_ip': 0}
441 match = self.regex.search(line)
443 match_dict = match.groupdict()
445 code = ''.join([match_dict['code'][0], 'xx'])
448 self.data['0xx'] += 1
449 # detailed response code
450 if self.detailed_response_codes:
451 self._get_data_detailed_response_codes(match_dict['code'])
453 self._get_data_statuses(match_dict['code'])
456 self._get_data_per_url(match_dict['url'])
457 # requests per http method
458 self._get_data_http_method(match_dict['method'])
460 bytes_sent = match_dict['bytes_sent'] if '-' not in match_dict['bytes_sent'] else 0
461 self.data['bytes_sent'] += int(bytes_sent)
462 # request processing time and bandwidth received
463 if 'resp_length' in match_dict:
464 self.data['resp_length'] += int(match_dict['resp_length'])
465 if 'resp_time' in match_dict:
466 resp_time = self.resp_time_func(float(match_dict['resp_time']))
467 bisect.insort_left(request_time, resp_time)
468 request_counter['count'] += 1
469 request_counter['sum'] += resp_time
470 # requests per ip proto
471 proto = 'ipv4' if '.' in match_dict['address'] else 'ipv6'
472 self.data['req_' + proto] += 1
474 if address_not_in_pool(self.unique_all_time, match_dict['address'],
475 self.data['unique_tot_ipv4'] + self.data['unique_tot_ipv6']):
476 self.data['unique_tot_' + proto] += 1
477 if address_not_in_pool(unique_current, match_dict['address'], ip_address_counter['unique_cur_ip']):
478 self.data['unique_cur_' + proto] += 1
479 ip_address_counter['unique_cur_ip'] += 1
481 self.data['unmatched'] += 1
485 self.data['resp_time_min'] += int(request_time[0])
486 self.data['resp_time_avg'] += int(round(float(request_counter['sum']) / request_counter['count']))
487 self.data['resp_time_max'] += int(request_time[-1])
490 def _get_data_detailed_response_codes(self, code):
492 :param code: str: CODE from parsed line. Ex.: '202, '499'
494 Calls add_new_dimension method If the value is found for the first time
496 if code not in self.data:
497 chart_string_copy = self.detailed_chart
498 self.detailed_chart = self.add_new_dimension(code, [code, code, 'incremental'],
499 chart_string_copy, 'detailed_response_codes')
502 def _get_data_http_method(self, method):
504 :param method: str: METHOD from parsed line. Ex.: 'GET', 'POST'
506 Calls add_new_dimension method If the value is found for the first time
508 if method not in self.data:
509 chart_string_copy = self.http_method_chart
510 self.http_method_chart = self.add_new_dimension(method, [method, method, 'incremental'],
511 chart_string_copy, 'http_method')
512 self.data[method] += 1
514 def _get_data_per_url(self, url):
516 :param url: str: URL from parsed line
518 Scan through string looking for the first location where patterns produce a match for all user
522 for elem in self.url_pattern:
523 if elem.pattern.search(url):
524 self.data[elem.description] += 1
528 self.data['pur_other'] += 1
530 def _get_data_statuses(self, code):
532 :param code: str: response status code. Ex.: '202', '499'
536 if code_class == '2' or code == '304' or code_class == '1':
537 self.data['successful_requests'] += 1
538 elif code_class == '3':
539 self.data['redirects'] += 1
540 elif code_class == '4':
541 self.data['bad_requests'] += 1
542 elif code_class == '5':
543 self.data['server_errors'] += 1
545 self.data['other_requests'] += 1
548 def address_not_in_pool(pool, address, pool_size):
550 :param pool: list of ip addresses
551 :param address: ip address
552 :param pool_size: current pool size
553 :return: True if address not in pool. False if address in pool.
555 index = bisect.bisect_left(pool, address)
556 if index < pool_size:
557 if pool[index] == address:
560 bisect.insort_left(pool, address)
563 bisect.insort_left(pool, address)
567 def find_regex_return(match_dict=None, log_name=None, msg='Generic error message'):
569 :param match_dict: dict: re.search.groupdict() or None
570 :param log_name: str: log name
571 :param msg: str: error description
574 return match_dict, log_name, msg
577 def check_req_per_url_pattern(url_pattern):
579 :param url_pattern: dict: ex. {'dim1': 'pattern1>', 'dim2': '<pattern2>'}
580 :return: list of named tuples or None:
581 We need to make sure all patterns are valid regular expressions
583 if not is_dict(url_pattern):
588 def is_valid_pattern(pattern):
591 :return: re.compile(pattern) or None
593 if not isinstance(pattern, str):
597 compile_pattern = re.compile(pattern)
601 return compile_pattern
603 for dimension, regex in url_pattern.items():
604 valid_pattern = is_valid_pattern(regex)
605 if isinstance(dimension, str) and valid_pattern:
606 result.append(NAMED_URL_PATTERN(description='_'.join(['pur', dimension]), pattern=valid_pattern))
608 return result or None
614 :return: True or False
615 obj can be <dict> or <OrderedDict>
619 except AttributeError: