1 # -*- coding: utf-8 -*-
2 # Description: web log netdata python.d module
5 from base import LogService
8 from os import access, R_OK
9 from os.path import getsize
10 from collections import defaultdict, namedtuple
11 from copy import deepcopy
13 from itertools import zip_longest
15 from itertools import izip_longest as zip_longest
# Order in which charts appear on the netdata dashboard.
ORDER = ['response_codes', 'bandwidth', 'response_time', 'requests_per_url', 'http_method', 'requests_per_ipproto', 'clients', 'clients_all']

# Static chart templates: each chart has an 'options' line (netdata CHART
# parameters) and a 'lines' list with one entry per dimension.
# NOTE(review): the dict name and several chart keys/braces are on lines not
# visible in this chunk.
'options': [None, 'Response Codes', 'requests/s', 'responses', 'web_log.response_codes', 'stacked'],
['2xx', '2xx', 'absolute'],
['5xx', '5xx', 'absolute'],
['3xx', '3xx', 'absolute'],
['4xx', '4xx', 'absolute'],
['1xx', '1xx', 'absolute'],
['0xx', 'other', 'absolute'],
['unmatched', 'unmatched', 'absolute']
'options': [None, 'Bandwidth', 'KB/s', 'bandwidth', 'web_log.bandwidth', 'area'],
# divisor 1024 converts bytes to KB; multiplier -1 draws 'sent' downwards
['resp_length', 'received', 'absolute', 1, 1024],
['bytes_sent', 'sent', 'absolute', -1, 1024]
'options': [None, 'Processing Time', 'milliseconds', 'timings', 'web_log.response_time', 'area'],
# values collected in microseconds (see resp_time_func); divisor 1000 -> ms
['resp_time_min', 'min', 'absolute', 1, 1000],
['resp_time_max', 'max', 'absolute', 1, 1000],
['resp_time_avg', 'avg', 'absolute', 1, 1000]
'options': [None, 'Current Poll Unique Client IPs', 'unique ips', 'clients', 'web_log.clients', 'stacked'],
['unique_cur_ipv4', 'ipv4', 'absolute', 1, 1],
['unique_cur_ipv6', 'ipv6', 'absolute', 1, 1]
'options': [None, 'All Time Unique Client IPs', 'unique ips', 'clients', 'web_log.clients_all', 'stacked'],
['unique_tot_ipv4', 'ipv4', 'absolute', 1, 1],
['unique_tot_ipv6', 'ipv6', 'absolute', 1, 1]
'options': [None, 'Requests Per HTTP Method', 'requests/s', 'http methods', 'web_log.http_method', 'stacked'],
'requests_per_ipproto': {
    'options': [None, 'Requests Per IP Protocol', 'requests/s', 'ip protocols', 'web_log.requests_per_ipproto', 'stacked'],
    ['req_ipv4', 'ipv4', 'absolute', 1, 1],
    ['req_ipv6', 'ipv6', 'absolute', 1, 1]

# (description, compiled regex) pair for user-defined per-URL categories.
NAMED_URL_PATTERN = namedtuple('URL_PATTERN', ['description', 'pattern'])
class Service(LogService):
    """Tail a web-server access log and turn parsed lines into netdata charts."""
74 def __init__(self, configuration=None, name=None):
75 LogService.__init__(self, configuration=configuration, name=name)
76 # Variables from module configuration file
77 self.log_path = self.configuration.get('path')
78 self.detailed_response_codes = self.configuration.get('detailed_response_codes', True)
79 self.all_time = self.configuration.get('all_time', True)
80 self.url_pattern = self.configuration.get('categories') # dict
81 self.regex = None # will be assigned in 'find_regex' method
82 self.resp_time_func = None # will be assigned in 'find_regex' method
83 self._get_data = None # will be assigned in 'check' method.
84 self.order = None # will be assigned in 'create_*_method' method.
85 self.definitions = None # will be assigned in 'create_*_method' method.
86 self.detailed_chart = None # will be assigned in 'create_*_method' method.
87 self.http_method_chart = None # will be assigned in 'create_*_method' method.
88 # sorted list of unique IPs
89 self.unique_all_time = list()
90 # dict for values that should not be zeroed every poll
91 self.storage = {'unique_tot_ipv4': 0, 'unique_tot_ipv6': 0}
92 # if there is no new logs this dict + self.storage returned to netdata
93 self.data = {'bytes_sent': 0, 'resp_length': 0, 'resp_time_min': 0,
94 'resp_time_max': 0, 'resp_time_avg': 0, 'unique_cur_ipv4': 0,
95 'unique_cur_ipv6': 0, '2xx': 0, '5xx': 0, '3xx': 0, '4xx': 0,
96 '1xx': 0, '0xx': 0, 'unmatched': 0, 'req_ipv4': 0, 'req_ipv6': 0}
            # 'path' missing from the job configuration -> job cannot start.
            self.error('log path is not specified')
        # log_path must be readable
        if not access(self.log_path, R_OK):
            self.error('%s not readable or not exist' % self.log_path)
        # log_path file should not be empty
        if not getsize(self.log_path):
            self.error('%s is empty' % self.log_path)
        # Read last line (or first if there is only one line)
        with open(self.log_path, 'rb') as logs:
            # NOTE(review): the loop body that seeks backwards through the
            # file is on lines not visible in this chunk.
            while logs.read(1) != b'\n':
            last_line = logs.readline().decode(encoding='utf-8')
        # Pick the regex matching the last line; presumably None-ish on
        # failure (the guard around the error below is not visible here).
        regex_name = self.find_regex(last_line)
            self.error('Can\'t parse %s' % self.log_path)
        if regex_name.startswith('access_'):
            # Build ORDER/CHARTS for access logs and bind the data getter.
            self.create_access_charts(regex_name)
            if regex_name == 'access_default':
                self.info('Not all data collected. You need to modify LogFormat.')
            self._get_data = self._get_access_data
            self.info('Used regex: %s' % regex_name)
        # If it's not access_logs.. Not used at the moment
    def find_regex(self, last_line):
        """
        :param last_line: str: literally last line from log file
        :return: NOTE(review): return statements are on lines not visible
                 in this chunk; callers treat the result as a regex name.
        It's sad but different web servers has different logs formats
        We need to find appropriate regex for current log file
        All logic is do a regex search through the string for all patterns
        until we find something or fail.
        """
        # REGEX: 1.IPv4 address 2.HTTP method 3. URL 4. Response code
        # 5. Bytes sent 6. Response length 7. Response process time
        # NOTE(review): the regex continuation lines are not visible here.
        access_default = re.compile(r'([\da-f.:]+)'
        access_apache_ext = re.compile(r'([\da-f.:]+)'
        access_nginx_ext = re.compile(r'([\da-f.:]+)'
        # Pair each pattern with the response-time scaler it needs:
        # nginx logs seconds, scaled by 1e6 to microseconds; the others
        # are passed through unchanged.
        regex_function = zip([access_apache_ext, access_nginx_ext, access_default],
                             [lambda x: x, lambda x: x * 1000000, lambda x: x],
                             ['access_apache_ext', 'access_nginx_ext', 'access_default'])
        for regex, function, name in regex_function:
            if regex.search(last_line):
                # Remember the time converter for the pattern that matched.
                self.resp_time_func = function
    def create_access_charts(self, regex_name):
        """
        :param regex_name: str: regex name from 'find_regex' method. Ex.: 'apache_extended', 'nginx_extended'
        :return: None (mutates self.order / self.definitions in place)
        Create additional charts depending on the 'find_regex' result (parsed_line) and configuration file
        1. 'time_response' chart is removed if there is no 'time_response' in logs.
        2. We need to change divisor for 'response_time' chart for apache (time in microseconds in logs)
        3. Other stuff is just remove/add chart depending on yes/no in conf
        """
        def find_job_name(override_name, name):
            """
            :param override_name: str: 'name' var from configuration file
            :param name: str: 'job_name' from configuration file
            :return: str: new job name
            We need this for dynamic charts. Actually same logic as in python.d.plugin.
            """
            add_to_name = override_name or name
            return '_'.join(['web_log', re.sub('\s+', '_', add_to_name)])
        # Work on copies so the module-level templates stay untouched.
        self.order = ORDER[:]
        self.definitions = deepcopy(CHARTS)
        job_name = find_job_name(self.override_name, self.name)
        # Raw netdata-protocol CHART strings for the two dynamic charts;
        # add_new_dimension() appends DIMENSION lines to them later.
        self.detailed_chart = 'CHART %s.detailed_response_codes ""' \
                              ' "Detailed Response Codes" requests/s responses' \
                              ' web_log.detailed_response_codes stacked 1 %s\n' % (job_name, self.update_every)
        self.http_method_chart = 'CHART %s.http_method' \
                                 ' "" "Requests Per HTTP Method" requests/s "http methods"' \
                                 ' web_log.http_method stacked 2 %s\n' % (job_name, self.update_every)
        # Remove 'request_time' chart from ORDER if request_time not in logs
        if regex_name == 'access_default':
            self.order.remove('response_time')
        # Remove 'clients_all' chart from ORDER if specified in the configuration
        if not self.all_time:
            self.order.remove('clients_all')
        # Add 'detailed_response_codes' chart if specified in the configuration
        if self.detailed_response_codes:
            self.order.append('detailed_response_codes')
            self.definitions['detailed_response_codes'] = {'options': [None, 'Detailed Response Codes', 'requests/s',
                                                                       'responses', 'web_log.detailed_response_codes', 'stacked'],
        # Add 'requests_per_url' chart if specified in the configuration
        # NOTE(review): the guard (presumably 'if self.url_pattern:') is on a
        # line not visible in this chunk.
            self.url_pattern = [NAMED_URL_PATTERN(description=k, pattern=re.compile(v)) for k, v in self.url_pattern.items()]
            self.definitions['requests_per_url'] = {'options': [None, 'Requests Per Url', 'requests/s',
                                                                'urls', 'web_log.requests_per_url', 'stacked'],
                                                    'lines': [['other_url', 'other', 'absolute']]}
            for elem in self.url_pattern:
                self.definitions['requests_per_url']['lines'].append([elem.description, elem.description, 'absolute'])
                self.data.update({elem.description: 0})
            self.data.update({'other_url': 0})
        # NOTE(review): this removal presumably sits in the 'else' branch
        # (no categories configured) — the 'else:' line is not visible here.
            self.order.remove('requests_per_url')
    def add_new_dimension(self, dimension, line_list, chart_string, key):
        """
        Register a dynamically discovered dimension with netdata.

        :param dimension: str: response status code. Ex.: '202', '499'
        :param line_list: list: Ex.: ['202', '202', 'Absolute']
        :param chart_string: Current string we need to pass to netdata to rebuild the chart
        :param key: str: CHARTS dict key (chart name). Ex.: 'response_time'
        :return: str: new chart string = previous + new dimensions
        """
        # Seed the counter so the dimension exists on every future poll.
        self.data.update({dimension: 0})
        # SET method check if dim in _dimensions
        self._dimensions.append(dimension)
        # UPDATE method do SET only if dim in definitions
        self.definitions[key]['lines'].append(line_list)
        # NOTE(review): the line binding `chart` (from chart_string) and the
        # return statement are not visible in this chunk.
        chart += "%s %s\n" % ('DIMENSION', ' '.join(line_list))
    def _get_access_data(self):
        """
        Parse the newly appended log lines and aggregate all counters.

        :return: dict OR None
        None if _get_raw_data method fails.
        In all other cases - dict.
        """
        raw = self._get_raw_data()
        # Per-poll accumulators: sorted response times (for min/max) and
        # the unique-IP pool for this poll only.
        request_time, unique_current = list(), list()
        request_counter = {'count': 0, 'sum': 0}
        # Start from the zeroed baseline so absent keys read as 0.
        to_netdata.update(self.data)
        default_dict = defaultdict(lambda: 0)
        # NOTE(review): the enclosing 'for line in raw:' / 'if match:'
        # scaffolding is on lines not visible in this chunk.
        match = self.regex.search(line)
        # Name the capture groups; zip_longest pads missing optional groups.
        match_dict = dict(zip_longest('address method url code sent resp_length resp_time'.split(),
        # Bucket the status code by its first digit: '2xx', '4xx', ...
        code = ''.join([match_dict['code'][0], 'xx'])
        to_netdata[code] += 1
        # Codes outside the known buckets count as '0xx' ("other").
        to_netdata['0xx'] += 1
        # detailed response code
        if self.detailed_response_codes:
            self._get_data_detailed_response_codes(match_dict['code'], default_dict)
        # requests per configured URL category
        self._get_data_per_url(match_dict['url'], default_dict)
        # requests per http method
        self._get_data_http_method(match_dict['method'], default_dict)
        # bandwidth sent
        to_netdata['bytes_sent'] += int(match_dict['sent'])
        # request processing time and bandwidth received
        if match_dict['resp_length'] and match_dict['resp_time']:
            to_netdata['resp_length'] += int(match_dict['resp_length'])
            # Scale to microseconds with the converter picked in find_regex.
            resp_time = self.resp_time_func(float(match_dict['resp_time']))
            # Keep times sorted so min/max are the list ends.
            bisect.insort_left(request_time, resp_time)
            request_counter['count'] += 1
            request_counter['sum'] += resp_time
        # requests per ip proto
        proto = 'ipv4' if '.' in match_dict['address'] else 'ipv6'
        to_netdata['req_' + proto] += 1
        # unique clients: all-time pool (persistent) and this-poll pool
        if address_not_in_pool(self.unique_all_time, match_dict['address'],
                               self.storage['unique_tot_ipv4'] + self.storage['unique_tot_ipv6']):
            self.storage['unique_tot_' + proto] += 1
        if address_not_in_pool(unique_current, match_dict['address'],
                               to_netdata['unique_cur_ipv4'] + to_netdata['unique_cur_ipv6']):
            to_netdata['unique_cur_' + proto] += 1
        # Line did not match the chosen regex at all.
        to_netdata['unmatched'] += 1
        # Timings summary; guard line ('if request_time:') not visible here.
        to_netdata['resp_time_min'] = request_time[0]
        to_netdata['resp_time_avg'] = round(float(request_counter['sum']) / request_counter['count'])
        to_netdata['resp_time_max'] = request_time[-1]
        # Merge persistent counters and dynamically discovered dimensions.
        to_netdata.update(self.storage)
        to_netdata.update(default_dict)
327 def _get_data_detailed_response_codes(self, code, default_dict):
329 :param code: str: CODE from parsed line. Ex.: '202, '499'
330 :param default_dict: defaultdict
332 Calls add_new_dimension method If the value is found for the first time
334 if code not in self.data:
335 chart_string_copy = self.detailed_chart
336 self.detailed_chart = self.add_new_dimension(code, [code, code, 'absolute'],
337 chart_string_copy, 'detailed_response_codes')
338 default_dict[code] += 1
340 def _get_data_http_method(self, method, default_dict):
342 :param method: str: METHOD from parsed line. Ex.: 'GET', 'POST'
343 :param default_dict: defaultdict
345 Calls add_new_dimension method If the value is found for the first time
347 if method not in self.data:
348 chart_string_copy = self.http_method_chart
349 self.http_method_chart = self.add_new_dimension(method, [method, method, 'absolute'],
350 chart_string_copy, 'http_method')
351 default_dict[method] += 1
    def _get_data_per_url(self, url, default_dict):
        """
        :param url: str: URL from parsed line
        :param default_dict: defaultdict
        Scan through string looking for the first location where patterns produce a match for all user
        defined patterns
        """
        for elem in self.url_pattern:
            if elem.pattern.search(url):
                default_dict[elem.description] += 1
        # NOTE(review): the lines between the loop and this fallback
        # (presumably a break + matched flag) are not visible in this chunk;
        # this counter appears to be the "no category matched" bucket.
            default_dict['other_url'] += 1
def address_not_in_pool(pool, address, pool_size):
    """
    :param pool: list of ip addresses
    :param address: ip address
    :param pool_size: current size of pool
    :return: True if address not pool and False address in pool
    If address not in pool function add address to pool.
    """
    # O(log n) search in the sorted pool; only the first pool_size entries
    # are considered valid.
    index = bisect.bisect_left(pool, address)
    if index < pool_size:
        if pool[index] == address:
        # NOTE(review): the return statements and else-branches of this
        # function are on lines not visible in this chunk.
        bisect.insort_left(pool, address)
    bisect.insort_left(pool, address)