# -*- coding: utf-8 -*-
# Description: web log netdata python.d module
from base import LogService
import re
import bisect
from os import access, R_OK
from os.path import getsize
from collections import defaultdict, namedtuple
from copy import deepcopy
try:
    from itertools import zip_longest
except ImportError:
    from itertools import izip_longest as zip_longest
# Chart drawing order on the dashboard.
# NOTE: 'requests_per_url' has no static definition in CHARTS -- it is built
# dynamically in Service.create_charts() from the user's 'categories' config.
ORDER = ['response_codes', 'response_time', 'requests_per_url', 'http_method',
         'requests_per_ipproto', 'bandwidth', 'clients', 'clients_all']

# Static chart definitions: {chart_id: {'options': [...], 'lines': [...]}}
# options: [name override, title, units, family, context, chart type]
# lines:   [dimension id, dimension name, algorithm, multiplier, divisor]
CHARTS = {
    'response_codes': {
        'options': [None, 'Response Codes', 'requests/s', 'responses', 'web_log.response_codes', 'stacked'],
        'lines': [
            ['2xx', '2xx', 'absolute'],
            ['5xx', '5xx', 'absolute'],
            ['3xx', '3xx', 'absolute'],
            ['4xx', '4xx', 'absolute'],
            ['1xx', '1xx', 'absolute'],
            ['0xx', 'other', 'absolute'],
            ['unmatched', 'unmatched', 'absolute']
        ]},
    'bandwidth': {
        'options': [None, 'Bandwidth', 'KB/s', 'bandwidth', 'web_log.bandwidth', 'area'],
        'lines': [
            # divisor 1024: bytes -> KB; 'sent' is negated (-1) to draw below the axis
            ['resp_length', 'received', 'absolute', 1, 1024],
            ['bytes_sent', 'sent', 'absolute', -1, 1024]
        ]},
    'response_time': {
        'options': [None, 'Processing Time', 'milliseconds', 'timings', 'web_log.response_time', 'area'],
        'lines': [
            # divisor is switched to 1000 for apache logs (microseconds) in create_charts()
            ['resp_time_min', 'min', 'absolute', 1, 1],
            ['resp_time_max', 'max', 'absolute', 1, 1],
            ['resp_time_avg', 'avg', 'absolute', 1, 1]
        ]},
    'clients': {
        'options': [None, 'Current Poll Unique Client IPs', 'unique ips', 'unique clients', 'web_log.clients', 'stacked'],
        'lines': [
            ['unique_cur_ipv4', 'ipv4', 'absolute', 1, 1],
            ['unique_cur_ipv6', 'ipv6', 'absolute', 1, 1]
        ]},
    'clients_all': {
        'options': [None, 'All Time Unique Client IPs', 'unique ips', 'unique clients', 'web_log.clients_all', 'stacked'],
        'lines': [
            ['unique_tot_ipv4', 'ipv4', 'absolute', 1, 1],
            ['unique_tot_ipv6', 'ipv6', 'absolute', 1, 1]
        ]},
    'http_method': {
        'options': [None, 'Requests Per HTTP Method', 'requests/s', 'requests', 'web_log.http_method', 'stacked'],
        # dimensions are added at runtime as new methods appear in the log
        'lines': [
        ]},
    'requests_per_ipproto': {
        'options': [None, 'Requests Per IP Protocol', 'requests/s', 'requests', 'web_log.requests_per_ipproto', 'stacked'],
        'lines': [
            ['req_ipv4', 'ipv4', 'absolute', 1, 1],
            ['req_ipv6', 'ipv6', 'absolute', 1, 1]
        ]}
}

# (description, compiled pattern) pair for user-defined URL categories
NAMED_URL_PATTERN = namedtuple('URL_PATTERN', ['description', 'pattern'])
class Service(LogService):
    """Tail a web-server access log and push per-poll metrics to netdata."""

    def __init__(self, configuration=None, name=None):
        """
        :param configuration: dict: job configuration from the module conf file
        :param name: str: job name
        """
        LogService.__init__(self, configuration=configuration, name=name)
        # Variables from module configuration file
        self.log_path = self.configuration.get('path')
        self.detailed_response_codes = self.configuration.get('detailed_response_codes', True)
        self.all_time = self.configuration.get('all_time', True)
        # 'categories': dict of {description: regex string} or None
        self.url_pattern = self.configuration.get('categories')
        # assigned in find_regex(); initialized here so the attribute always exists
        self.regex = None
        # sorted list of unique IPs
        self.unique_all_time = list()
        # dict for values that should not be zeroed every poll
        self.storage = {'unique_tot_ipv4': 0, 'unique_tot_ipv6': 0}
        # if there is no new logs this dict + self.storage returned to netdata
        self.data = {'bytes_sent': 0, 'resp_length': 0, 'resp_time_min': 0,
                     'resp_time_max': 0, 'resp_time_avg': 0, 'unique_cur_ipv4': 0,
                     'unique_cur_ipv6': 0, '2xx': 0, '5xx': 0, '3xx': 0, '4xx': 0,
                     '1xx': 0, '0xx': 0, 'unmatched': 0, 'req_ipv4': 0, 'req_ipv6': 0}

    def check(self):
        """
        Validate the configured log file and prepare parsing.

        :return: bool: True when the log is readable, non-empty and its last
                 line matches one of the known log formats.
        """
        if not self.log_path:
            self.error('log path is not specified')
            return False

        # log_path must be readable
        if not access(self.log_path, R_OK):
            self.error('%s not readable or not exist' % self.log_path)
            return False

        # log_path file should not be empty
        if not getsize(self.log_path):
            self.error('%s is empty' % self.log_path)
            return False

        # Read last line (or first if there is only one line)
        with open(self.log_path, 'rb') as logs:
            try:
                # walk backwards from EOF to the previous newline
                logs.seek(-2, 2)
                while logs.read(1) != b'\n':
                    logs.seek(-2, 1)
                    if logs.tell() == 0:
                        break
            except OSError:
                # file shorter than two bytes / no newline before start:
                # fall back to reading from the beginning
                logs.seek(0)
            last_line = logs.readline().decode(encoding='utf-8')

        parsed_line, regex_name = self.find_regex(last_line)
        if not parsed_line:
            self.error('Can\'t parse output')
            return False

        self.create_charts(parsed_line[0], regex_name)
        if len(parsed_line[0]) == 5:
            self.info('Not all data collected. You need to modify LogFormat.')
        return True

    def find_regex(self, last_line):
        """
        :param last_line: str: literally last line from log file
        :return: parsed line, regex name (the one that matches) OR None, None
        It's sad but different web servers has different logs formats
        We need to find appropriate regex for current log file
        All logic is do a regex search through the string for all patterns
        until we find something or fail.
        """
        # REGEX: 1.IPv4/IPv6 address 2.HTTP method 3. URL 4. Response code
        # 5. Bytes sent 6. Response length 7. Response process time
        # NOTE(review): pattern bodies reconstructed -- verify against the
        # apache/nginx LogFormat strings this module documents.
        default = re.compile(r'([\da-f.:]+)'
                             r' -.*?"([A-Z]+)'
                             r' (.*?)"'
                             r' ([1-9]\d{2})'
                             r' (\d+)')

        apache_extended = re.compile(r'([\da-f.:]+)'
                                     r' -.*?"([A-Z]+)'
                                     r' (.*?)"'
                                     r' ([1-9]\d{2})'
                                     r' (\d+)'
                                     r' (\d+)'
                                     r' (\d+) ')

        nginx_extended = re.compile(r'([\da-f.:]+)'
                                    r' -.*?"([A-Z]+)'
                                    r' (.*?)"'
                                    r' ([1-9]\d{2})'
                                    r' (\d+)'
                                    r' (\d+)'
                                    r' ([\d.]+) ')

        # nginx $request_time needs * 1000; apache time is scaled later via a
        # chart divisor in create_charts() -- hence the identity function here
        regex_function = zip([apache_extended, nginx_extended, default],
                             [lambda x: x, lambda x: x * 1000, lambda x: x],
                             ['apache_extended', 'nginx_extended', 'default'])

        regex_name = None
        for regex, function, name in regex_function:
            if regex.search(last_line):
                self.regex = regex
                self.resp_time_func = function
                regex_name = name
                break

        if regex_name:
            return self.regex.findall(last_line), regex_name
        return None, None

    def create_charts(self, parsed_line, regex_name):
        """
        :param parsed_line: list: re.findall result.
        :param regex_name: str: regex name from 'find_regex' method. Ex.: 'apache_extended', 'nginx_extended'
        :return: None
        Create additional charts depending on the 'find_regex' result (parsed_line) and configuration file
        1. 'response_time' chart is removed if there is no response time in logs.
        2. We need to change divisor for 'response_time' chart for apache (time in microseconds in logs)
        3. Other stuff is just remove/add chart depending on yes/no in conf
        """
        def find_job_name(override_name, name):
            """
            :param override_name: str: 'name' var from configuration file
            :param name: str: 'job_name' from configuration file
            :return: str: new job name
            We need this for dynamic charts. Actually same logic as in python.d.plugin.
            """
            add_to_name = override_name or name
            if add_to_name:
                # raw string: '\s' is an invalid escape in a plain literal
                return '_'.join(['web_log', re.sub(r'\s+', '_', add_to_name)])
            return 'web_log'

        self.order = ORDER[:]
        self.definitions = deepcopy(CHARTS)

        job_name = find_job_name(self.override_name, self.name)
        # Raw CHART lines pushed to netdata when a new dynamic dimension appears
        self.detailed_chart = 'CHART %s.detailed_response_codes ""' \
                              ' "Response Codes" requests/s responses' \
                              ' web_log.detailed_resp stacked 1 %s\n' % (job_name, self.update_every)
        self.http_method_chart = 'CHART %s.http_method' \
                                 ' "" "HTTP Methods" requests/s requests' \
                                 ' web_log.http_method stacked 2 %s\n' % (job_name, self.update_every)

        # Apache logs response time in microseconds -- show milliseconds instead
        if 'apache' in regex_name:
            self.definitions['response_time']['lines'][0][4] = 1000
            self.definitions['response_time']['lines'][1][4] = 1000
            self.definitions['response_time']['lines'][2][4] = 1000

        # Remove 'response_time' chart from ORDER if request_time not in logs
        if len(parsed_line) < 7:
            self.order.remove('response_time')
        # Remove 'clients_all' chart from ORDER if specified in the configuration
        if not self.all_time:
            self.order.remove('clients_all')
        # Add 'detailed_response_codes' chart if specified in the configuration
        if self.detailed_response_codes:
            self.order.append('detailed_response_codes')
            self.definitions['detailed_response_codes'] = {
                'options': [None, 'Detailed Response Codes', 'requests/s',
                            'responses', 'web_log.detailed_resp', 'stacked'],
                # dimensions are added dynamically per status code
                'lines': []}
        # Add 'requests_per_url' chart if specified in the configuration
        if self.url_pattern:
            self.url_pattern = [NAMED_URL_PATTERN(description=k, pattern=re.compile(v))
                                for k, v in self.url_pattern.items()]
            self.definitions['requests_per_url'] = {
                'options': [None, 'Requests Per Url', 'requests/s',
                            'requests', 'web_log.url_pattern', 'stacked'],
                'lines': [['other_url', 'other', 'absolute']]}
            for elem in self.url_pattern:
                self.definitions['requests_per_url']['lines'].append(
                    [elem.description, elem.description, 'absolute'])
                self.data.update({elem.description: 0})
            self.data.update({'other_url': 0})
        else:
            self.order.remove('requests_per_url')

    def add_new_dimension(self, dimension, line_list, chart_string, key):
        """
        :param dimension: str: response status code or HTTP method. Ex.: '202', 'GET'
        :param line_list: list: Ex.: ['202', '202', 'absolute']
        :param chart_string: str: current string we need to pass to netdata to rebuild the chart
        :param key: str: CHARTS dict key (chart name). Ex.: 'response_time'
        :return: str: new chart string = previous + new dimensions
        """
        self.storage.update({dimension: 0})
        # SET method check if dim in _dimensions
        self._dimensions.append(dimension)
        # UPDATE method do SET only if dim in definitions
        self.definitions[key]['lines'].append(line_list)
        chart = chart_string
        chart += "%s %s\n" % ('DIMENSION', ' '.join(line_list))
        # push the rebuilt chart definition straight to netdata via stdout
        print(chart)
        return chart

    def _get_data(self):
        """
        Parse new log lines and aggregate them into chart values.

        :return: dict OR None
        None if _get_raw_data method fails.
        In all other cases - dict.
        """
        raw = self._get_raw_data()
        if raw is None:
            return None

        request_time, unique_current = list(), list()
        request_counter = {'count': 0, 'sum': 0}
        to_netdata = dict()
        to_netdata.update(self.data)
        default_dict = defaultdict(lambda: 0)

        for line in raw:
            match = self.regex.search(line)
            if match:
                match_dict = dict(zip_longest('address method url code sent resp_length resp_time'.split(),
                                              match.groups()))
                # response code family ('2xx', '5xx', ...); anything outside
                # the predefined keys is counted as '0xx'
                try:
                    code = ''.join([match_dict['code'][0], 'xx'])
                    to_netdata[code] += 1
                except KeyError:
                    to_netdata['0xx'] += 1
                # detailed response code
                if self.detailed_response_codes:
                    self._get_data_detailed_response_codes(match_dict['code'], default_dict)
                # requests per url
                if self.url_pattern:
                    self._get_data_per_url(match_dict['url'], default_dict)
                # requests per http method
                self._get_data_http_method(match_dict['method'], default_dict)
                # bandwidth sent
                to_netdata['bytes_sent'] += int(match_dict['sent'])
                # request processing time and bandwidth received (extended formats only)
                if match_dict['resp_length'] and match_dict['resp_time']:
                    to_netdata['resp_length'] += int(match_dict['resp_length'])
                    resp_time = self.resp_time_func(float(match_dict['resp_time']))
                    # keep times sorted so min/max are the list ends
                    bisect.insort_left(request_time, resp_time)
                    request_counter['count'] += 1
                    request_counter['sum'] += resp_time
                # requests per ip proto
                if '.' in match_dict['address']:
                    proto = 'ipv4'
                    to_netdata['req_ipv4'] += 1
                else:
                    proto = 'ipv6'
                    to_netdata['req_ipv6'] += 1
                # unique clients ips (all time and current poll)
                if address_not_in_pool(self.unique_all_time, match_dict['address'],
                                       self.storage['unique_tot_ipv4'] + self.storage['unique_tot_ipv6']):
                    self.storage['unique_tot_' + proto] += 1
                if address_not_in_pool(unique_current, match_dict['address'],
                                       to_netdata['unique_cur_ipv4'] + to_netdata['unique_cur_ipv6']):
                    to_netdata['unique_cur_' + proto] += 1
            else:
                to_netdata['unmatched'] += 1

        # timings
        if request_time:
            to_netdata['resp_time_min'] = request_time[0]
            to_netdata['resp_time_avg'] = float(request_counter['sum']) / request_counter['count']
            to_netdata['resp_time_max'] = request_time[-1]
        to_netdata.update(self.storage)
        to_netdata.update(default_dict)
        return to_netdata

    def _get_data_detailed_response_codes(self, code, default_dict):
        """
        :param code: str: CODE from parsed line. Ex.: '202', '499'
        :param default_dict: defaultdict
        :return: None
        Calls add_new_dimension method If the value is found for the first time
        """
        if code not in self.storage:
            chart_string_copy = self.detailed_chart
            self.detailed_chart = self.add_new_dimension(code, [code, code, 'absolute'],
                                                         chart_string_copy, 'detailed_response_codes')
        default_dict[code] += 1

    def _get_data_http_method(self, method, default_dict):
        """
        :param method: str: METHOD from parsed line. Ex.: 'GET', 'POST'
        :param default_dict: defaultdict
        :return: None
        Calls add_new_dimension method If the value is found for the first time
        """
        if method not in self.storage:
            chart_string_copy = self.http_method_chart
            self.http_method_chart = self.add_new_dimension(method, [method, method, 'absolute'],
                                                            chart_string_copy, 'http_method')
        default_dict[method] += 1

    def _get_data_per_url(self, url, default_dict):
        """
        :param url: str: URL from parsed line
        :param default_dict: defaultdict
        :return: None
        Scan through string looking for the first location where patterns produce a match for all user
        defined patterns; unmatched URLs are counted as 'other_url'
        """
        match = None
        for elem in self.url_pattern:
            if elem.pattern.search(url):
                default_dict[elem.description] += 1
                match = True
                break
        if not match:
            default_dict['other_url'] += 1
def address_not_in_pool(pool, address, pool_size):
    """
    :param pool: list of ip addresses (kept in sorted order)
    :param address: ip address
    :param pool_size: current size of pool
    :return: True if address not in pool and False if address in pool.
    If address not in pool the function adds the address to the pool
    (preserving sorted order) as a side effect.
    """
    index = bisect.bisect_left(pool, address)
    if index < pool_size:
        if pool[index] == address:
            return False
        bisect.insort_left(pool, address)
        return True
    # index beyond the tracked size: address is certainly new
    bisect.insort_left(pool, address)
    return True