# -*- coding: utf-8 -*-
# Description: web log netdata python.d module

import re
import bisect

from base import LogService
from os import access, R_OK
from os.path import getsize
from collections import defaultdict, namedtuple
from copy import deepcopy

try:
    from itertools import zip_longest
except ImportError:
    from itertools import izip_longest as zip_longest
# Chart rendering order on the dashboard.
ORDER = ['response_codes', 'bandwidth', 'response_time', 'requests_per_url',
         'http_method', 'requests_per_ipproto', 'clients', 'clients_all']

# Static chart definitions; dynamic charts (detailed response codes,
# per-url requests) are appended at runtime in create_access_charts().
CHARTS = {
    'response_codes': {
        'options': [None, 'Response Codes', 'requests/s', 'responses',
                    'web_log.response_codes', 'stacked'],
        'lines': [
            ['2xx', '2xx', 'absolute'],
            ['5xx', '5xx', 'absolute'],
            ['3xx', '3xx', 'absolute'],
            ['4xx', '4xx', 'absolute'],
            ['1xx', '1xx', 'absolute'],
            ['0xx', 'other', 'absolute'],
            ['unmatched', 'unmatched', 'absolute']
        ]},
    'bandwidth': {
        'options': [None, 'Bandwidth', 'KB/s', 'bandwidth',
                    'web_log.bandwidth', 'area'],
        'lines': [
            ['resp_length', 'received', 'absolute', 1, 1024],
            ['bytes_sent', 'sent', 'absolute', -1, 1024]
        ]},
    'response_time': {
        'options': [None, 'Processing Time', 'milliseconds', 'timings',
                    'web_log.response_time', 'area'],
        'lines': [
            ['resp_time_min', 'min', 'absolute', 1, 1],
            ['resp_time_max', 'max', 'absolute', 1, 1],
            ['resp_time_avg', 'avg', 'absolute', 1, 1]
        ]},
    'clients': {
        'options': [None, 'Current Poll Unique Client IPs', 'unique ips', 'clients',
                    'web_log.clients', 'stacked'],
        'lines': [
            ['unique_cur_ipv4', 'ipv4', 'absolute', 1, 1],
            ['unique_cur_ipv6', 'ipv6', 'absolute', 1, 1]
        ]},
    'clients_all': {
        'options': [None, 'All Time Unique Client IPs', 'unique ips', 'clients',
                    'web_log.clients_all', 'stacked'],
        'lines': [
            ['unique_tot_ipv4', 'ipv4', 'absolute', 1, 1],
            ['unique_tot_ipv6', 'ipv6', 'absolute', 1, 1]
        ]},
    'http_method': {
        'options': [None, 'Requests Per HTTP Method', 'requests/s', 'http methods',
                    'web_log.http_method', 'stacked'],
        # NOTE(review): dimension list reconstructed (elided in listing) —
        # further methods are added dynamically in _get_data_http_method().
        'lines': [
            ['GET', 'GET', 'absolute', 1, 1]
        ]},
    'requests_per_ipproto': {
        'options': [None, 'Requests Per IP Protocol', 'requests/s', 'ip protocols',
                    'web_log.requests_per_ipproto', 'stacked'],
        'lines': [
            ['req_ipv4', 'ipv4', 'absolute', 1, 1],
            ['req_ipv6', 'ipv6', 'absolute', 1, 1]
        ]}
}

# (description, compiled regex) pair for user-defined URL categories.
NAMED_URL_PATTERN = namedtuple('URL_PATTERN', ['description', 'pattern'])
73 class Service(LogService):
    def __init__(self, configuration=None, name=None):
        """Initialize the web_log service job.

        :param configuration: dict: job configuration from the .conf file
        :param name: str: job name

        Most attributes are placeholders that are filled in later by
        check(), find_regex() and create_access_charts().
        """
        LogService.__init__(self, configuration=configuration, name=name)
        # Variables from module configuration file
        self.log_path = self.configuration.get('path')
        self.detailed_response_codes = self.configuration.get('detailed_response_codes', True)
        self.all_time = self.configuration.get('all_time', True)
        self.url_pattern = self.configuration.get('categories')  # dict
        self.regex = None  # will be assigned in 'find_regex' method
        self.resp_time_func = None  # will be assigned in 'find_regex' method
        self._get_data = None  # will be assigned in 'check' method.
        self.order = None  # will be assigned in 'create_*_method' method.
        self.definitions = None  # will be assigned in 'create_*_method' method.
        self.detailed_chart = None  # will be assigned in 'create_*_method' method.
        self.http_method_chart = None  # will be assigned in 'create_*_method' method.
        # sorted list of unique IPs (kept sorted for bisect lookups)
        self.unique_all_time = list()
        # dict for values that should not be zeroed every poll
        self.storage = {'unique_tot_ipv4': 0, 'unique_tot_ipv6': 0}
        # if there is no new logs this dict + self.storage returned to netdata
        self.data = {'bytes_sent': 0, 'resp_length': 0, 'resp_time_min': 0,
                     'resp_time_max': 0, 'resp_time_avg': 0, 'unique_cur_ipv4': 0,
                     'unique_cur_ipv6': 0, '2xx': 0, '5xx': 0, '3xx': 0, '4xx': 0,
                     '1xx': 0, '0xx': 0, 'unmatched': 0, 'req_ipv4': 0, 'req_ipv6': 0}
100 self.error('log path is not specified')
103 # log_path must be readable
104 if not access(self.log_path, R_OK):
105 self.error('%s not readable or not exist' % self.log_path)
108 # log_path file should not be empty
109 if not getsize(self.log_path):
110 self.error('%s is empty' % self.log_path)
113 # Read last line (or first if there is only one line)
114 with open(self.log_path, 'rb') as logs:
116 while logs.read(1) != b'\n':
120 last_line = logs.readline().decode(encoding='utf-8')
123 regex_name = self.find_regex(last_line)
125 self.error('Can\'t parse %s' % self.log_path)
128 if regex_name.startswith('access_'):
129 self.create_access_charts(regex_name)
130 if regex_name == 'access_default':
131 self.info('Not all data collected. You need to modify LogFormat.')
132 self._get_data = self._get_access_data
133 self.info('Used regex: %s' % regex_name)
136 # If it's not access_logs.. Not used at the moment
139 def find_regex(self, last_line):
141 :param last_line: str: literally last line from log file
143 It's sad but different web servers has different logs formats
144 We need to find appropriate regex for current log file
145 All logic is do a regex search through the string for all patterns
146 until we find something or fail.
148 # REGEX: 1.IPv4 address 2.HTTP method 3. URL 4. Response code
149 # 5. Bytes sent 6. Response length 7. Response process time
150 access_default = re.compile(r'([\da-f.:]+)'
156 access_apache_ext = re.compile(r'([\da-f.:]+)'
164 access_nginx_ext = re.compile(r'([\da-f.:]+)'
172 regex_function = zip([access_apache_ext, access_nginx_ext, access_default],
173 [lambda x: x, lambda x: x * 1000, lambda x: x],
174 ['access_apache_ext', 'access_nginx_ext', 'access_default'])
176 for regex, function, name in regex_function:
177 if regex.search(last_line):
179 self.resp_time_func = function
184 def create_access_charts(self, regex_name):
186 :param regex_name: str: regex name from 'find_regex' method. Ex.: 'apache_extended', 'nginx_extended'
188 Create additional charts depending on the 'find_regex' result (parsed_line) and configuration file
189 1. 'time_response' chart is removed if there is no 'time_response' in logs.
190 2. We need to change divisor for 'response_time' chart for apache (time in microseconds in logs)
191 3. Other stuff is just remove/add chart depending on yes/no in conf
193 def find_job_name(override_name, name):
195 :param override_name: str: 'name' var from configuration file
196 :param name: str: 'job_name' from configuration file
197 :return: str: new job name
198 We need this for dynamic charts. Actually same logic as in python.d.plugin.
200 add_to_name = override_name or name
202 return '_'.join(['web_log', re.sub('\s+', '_', add_to_name)])
206 self.order = ORDER[:]
207 self.definitions = deepcopy(CHARTS)
209 job_name = find_job_name(self.override_name, self.name)
210 self.detailed_chart = 'CHART %s.detailed_response_codes ""' \
211 ' "Detailed Response Codes" requests/s responses' \
212 ' web_log.detailed_response_codes stacked 1 %s\n' % (job_name, self.update_every)
213 self.http_method_chart = 'CHART %s.http_method' \
214 ' "" "Requests Per HTTP Method" requests/s "http methods"' \
215 ' web_log.http_method stacked 2 %s\n' % (job_name, self.update_every)
217 if regex_name == 'access_apache_ext':
218 self.definitions['response_time']['lines'][0][4] = 1000
219 self.definitions['response_time']['lines'][1][4] = 1000
220 self.definitions['response_time']['lines'][2][4] = 1000
222 # Remove 'request_time' chart from ORDER if request_time not in logs
223 if regex_name == 'access_default':
224 self.order.remove('response_time')
225 # Remove 'clients_all' chart from ORDER if specified in the configuration
226 if not self.all_time:
227 self.order.remove('clients_all')
228 # Add 'detailed_response_codes' chart if specified in the configuration
229 if self.detailed_response_codes:
230 self.order.append('detailed_response_codes')
231 self.definitions['detailed_response_codes'] = {'options': [None, 'Detailed Response Codes', 'requests/s',
232 'responses', 'web_log.detailed_response_codes', 'stacked'],
235 # Add 'requests_per_url' chart if specified in the configuration
237 self.url_pattern = [NAMED_URL_PATTERN(description=k, pattern=re.compile(v)) for k, v in self.url_pattern.items()]
238 self.definitions['requests_per_url'] = {'options': [None, 'Requests Per Url', 'requests/s',
239 'urls', 'web_log.requests_per_url', 'stacked'],
240 'lines': [['other_url', 'other', 'absolute']]}
241 for elem in self.url_pattern:
242 self.definitions['requests_per_url']['lines'].append([elem.description, elem.description, 'absolute'])
243 self.data.update({elem.description: 0})
244 self.data.update({'other_url': 0})
246 self.order.remove('requests_per_url')
248 def add_new_dimension(self, dimension, line_list, chart_string, key):
250 :param dimension: str: response status code. Ex.: '202', '499'
251 :param line_list: list: Ex.: ['202', '202', 'Absolute']
252 :param chart_string: Current string we need to pass to netdata to rebuild the chart
253 :param key: str: CHARTS dict key (chart name). Ex.: 'response_time'
254 :return: str: new chart string = previous + new dimensions
256 self.data.update({dimension: 0})
257 # SET method check if dim in _dimensions
258 self._dimensions.append(dimension)
259 # UPDATE method do SET only if dim in definitions
260 self.definitions[key]['lines'].append(line_list)
262 chart += "%s %s\n" % ('DIMENSION', ' '.join(line_list))
266 def _get_access_data(self):
269 :return: dict OR None
270 None if _get_raw_data method fails.
271 In all other cases - dict.
273 raw = self._get_raw_data()
277 request_time, unique_current = list(), list()
278 request_counter = {'count': 0, 'sum': 0}
280 to_netdata.update(self.data)
281 default_dict = defaultdict(lambda: 0)
284 match = self.regex.search(line)
286 match_dict = dict(zip_longest('address method url code sent resp_length resp_time'.split(),
289 code = ''.join([match_dict['code'][0], 'xx'])
290 to_netdata[code] += 1
292 to_netdata['0xx'] += 1
293 # detailed response code
294 if self.detailed_response_codes:
295 self._get_data_detailed_response_codes(match_dict['code'], default_dict)
298 self._get_data_per_url(match_dict['url'], default_dict)
299 # requests per http method
300 self._get_data_http_method(match_dict['method'], default_dict)
302 to_netdata['bytes_sent'] += int(match_dict['sent'])
303 # request processing time and bandwidth received
304 if match_dict['resp_length'] and match_dict['resp_time']:
305 to_netdata['resp_length'] += int(match_dict['resp_length'])
306 resp_time = self.resp_time_func(float(match_dict['resp_time']))
307 bisect.insort_left(request_time, resp_time)
308 request_counter['count'] += 1
309 request_counter['sum'] += resp_time
310 # requests per ip proto
311 proto = 'ipv4' if '.' in match_dict['address'] else 'ipv6'
312 to_netdata['req_' + proto] += 1
314 if address_not_in_pool(self.unique_all_time, match_dict['address'],
315 self.storage['unique_tot_ipv4'] + self.storage['unique_tot_ipv6']):
316 self.storage['unique_tot_' + proto] += 1
317 if address_not_in_pool(unique_current, match_dict['address'],
318 to_netdata['unique_cur_ipv4'] + to_netdata['unique_cur_ipv6']):
319 to_netdata['unique_cur_' + proto] += 1
321 to_netdata['unmatched'] += 1
324 to_netdata['resp_time_min'] = request_time[0]
325 to_netdata['resp_time_avg'] = float(request_counter['sum']) / request_counter['count']
326 to_netdata['resp_time_max'] = request_time[-1]
328 to_netdata.update(self.storage)
329 to_netdata.update(default_dict)
332 def _get_data_detailed_response_codes(self, code, default_dict):
334 :param code: str: CODE from parsed line. Ex.: '202, '499'
335 :param default_dict: defaultdict
337 Calls add_new_dimension method If the value is found for the first time
339 if code not in self.data:
340 chart_string_copy = self.detailed_chart
341 self.detailed_chart = self.add_new_dimension(code, [code, code, 'absolute'],
342 chart_string_copy, 'detailed_response_codes')
343 default_dict[code] += 1
345 def _get_data_http_method(self, method, default_dict):
347 :param method: str: METHOD from parsed line. Ex.: 'GET', 'POST'
348 :param default_dict: defaultdict
350 Calls add_new_dimension method If the value is found for the first time
352 if method not in self.data:
353 chart_string_copy = self.http_method_chart
354 self.http_method_chart = self.add_new_dimension(method, [method, method, 'absolute'],
355 chart_string_copy, 'http_method')
356 default_dict[method] += 1
358 def _get_data_per_url(self, url, default_dict):
360 :param url: str: URL from parsed line
361 :param default_dict: defaultdict
363 Scan through string looking for the first location where patterns produce a match for all user
367 for elem in self.url_pattern:
368 if elem.pattern.search(url):
369 default_dict[elem.description] += 1
373 default_dict['other_url'] += 1
def address_not_in_pool(pool, address, pool_size):
    """
    :param pool: list of ip addresses (kept sorted)
    :param address: ip address
    :param pool_size: current size of pool
    :return: True if address not in pool and False if address in pool.
    If address not in pool function adds address to pool.
    """
    index = bisect.bisect_left(pool, address)
    # A sorted pool lets us test membership in O(log n).
    if index < pool_size and pool[index] == address:
        return False
    bisect.insort_left(pool, address)
    return True