/*
 * Data-source selectors for the backend. They are carried in the uint32_t
 * `options` argument handed to the formatter functions, and each one
 * occupies its own bit.
 */
#define BACKEND_SOURCE_DATA_AS_COLLECTED 0x00000001 /* configured as "as collected" */
#define BACKEND_SOURCE_DATA_AVERAGE      0x00000002 /* configured as "average" */
#define BACKEND_SOURCE_DATA_SUM          0x00000004 /* configured as "sum" or "volume" */
7 int connect_to_socket4(const char *ip, int port, struct timeval *timeout) {
10 debug(D_LISTENER, "IPv4 connecting to ip '%s' port %d", ip, port);
12 sock = socket(AF_INET, SOCK_STREAM, 0);
14 error("IPv4 socket() on ip '%s' port %d failed.", ip, port);
18 if(setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)timeout, sizeof(struct timeval)) < 0)
19 error("Failed to set timeout on the socket to ip '%s' port %d", ip, port);
21 struct sockaddr_in name;
22 memset(&name, 0, sizeof(struct sockaddr_in));
23 name.sin_family = AF_INET;
24 name.sin_port = htons(port);
26 int ret = inet_pton(AF_INET, ip, (void *)&name.sin_addr.s_addr);
28 error("Failed to convert '%s' to a valid IPv4 address.", ip);
33 if(connect(sock, (struct sockaddr *) &name, sizeof(name)) < 0) {
35 error("IPv4 failed to connect to '%s', port %d", ip, port);
39 debug(D_LISTENER, "Connected to IPv4 ip '%s' port %d", ip, port);
43 int connect_to_socket6(const char *ip, int port, struct timeval *timeout) {
47 debug(D_LISTENER, "IPv6 connecting to ip '%s' port %d", ip, port);
49 sock = socket(AF_INET6, SOCK_STREAM, 0);
51 error("IPv6 socket() on ip '%s' port %d failed.", ip, port);
55 if(setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)timeout, sizeof(struct timeval)) < 0)
56 error("Failed to set timeout on the socket to ip '%s' port %d", ip, port);
59 if(setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, (void*)&ipv6only, sizeof(ipv6only)) != 0)
60 error("Cannot set IPV6_V6ONLY on ip '%s' port's %d.", ip, port);
62 struct sockaddr_in6 name;
63 memset(&name, 0, sizeof(struct sockaddr_in6));
64 name.sin6_family = AF_INET6;
65 name.sin6_port = htons ((uint16_t) port);
67 int ret = inet_pton(AF_INET6, ip, (void *)&name.sin6_addr.s6_addr);
69 error("Failed to convert IP '%s' to a valid IPv6 address.", ip);
74 name.sin6_scope_id = 0;
76 if(connect(sock, (struct sockaddr *)&name, sizeof(name)) < 0) {
78 error("IPv6 failed to connect to '%s', port %d", ip, port);
82 debug(D_LISTENER, "Connected to IPv6 ip '%s' port %d", ip, port);
/*
 * Resolve one destination definition and connect to it over TCP.
 *
 * definition   - "host", "host:port", "[ipv6]" or "[ipv6]:port"
 * default_port - port used when the definition does not carry one
 * timeout      - handed unchanged to connect_to_socket4()/connect_to_socket6()
 *
 * Every address returned by getaddrinfo() is tried in order until one of
 * them connects.
 *
 * Returns the connected socket descriptor, or -1 when the definition has no
 * host part, cannot be resolved, or none of its addresses accepts the
 * connection.
 */
static inline int connect_to_one(const char *definition, int default_port, struct timeval *timeout) {
    struct addrinfo hints;
    struct addrinfo *result = NULL, *rp = NULL;

    // private writable copy: the parser below terminates the host part in place
    char buffer[strlen(definition) + 1];
    strcpy(buffer, definition);

    // the default port as text, because getaddrinfo() takes the service as a string
    char buffer2[10 + 1];
    snprintfz(buffer2, 10, "%d", default_port);

    char *ip = buffer, *port = buffer2;

    char *e = ip;
    if(*e == '[') {
        // bracketed form: the host is whatever sits between '[' and ']'
        e = ++ip;
        while(*e && *e != ']') e++;
        if(*e == ']') {
            *e = '\0';
            e++;
        }
    }
    else {
        // plain form: the host ends at the first ':'
        while(*e && *e != ':') e++;
    }

    if(*e == ':') {
        port = e + 1;
        *e = '\0';
    }

    if(!*ip)
        return -1;

    // "host:" with nothing after the colon falls back to the default port
    if(!*port)
        port = buffer2;

    memset(&hints, 0, sizeof(struct addrinfo));
    hints.ai_family = AF_UNSPEC;     /* Allow IPv4 or IPv6 */
    hints.ai_socktype = SOCK_STREAM; /* must match the TCP sockets opened by connect_to_socket4/6() */
    hints.ai_flags = 0;              /* client side: AI_PASSIVE is only meaningful for bind() */
    hints.ai_protocol = 0;           /* Any protocol */
    hints.ai_canonname = NULL;
    hints.ai_addr = NULL;
    hints.ai_next = NULL;

    int r = getaddrinfo(ip, port, &hints, &result);
    if(r != 0) {
        error("Cannot resolve host '%s', port '%s': %s\n", ip, port, gai_strerror(r));
        return -1;
    }

    int fd = -1;
    for(rp = result; rp != NULL && fd == -1; rp = rp->ai_next) {
        char rip[INET_ADDRSTRLEN + INET6_ADDRSTRLEN] = "INVALID";
        int rport = -1;

        switch(rp->ai_addr->sa_family) {
            case AF_INET: {
                struct sockaddr_in *sin = (struct sockaddr_in *) rp->ai_addr;
                inet_ntop(AF_INET, &sin->sin_addr, rip, INET_ADDRSTRLEN);
                rport = ntohs(sin->sin_port);
                fd = connect_to_socket4(rip, rport, timeout);
                break;
            }

            case AF_INET6: {
                struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) rp->ai_addr;
                inet_ntop(AF_INET6, &sin6->sin6_addr, rip, INET6_ADDRSTRLEN);
                rport = ntohs(sin6->sin6_port);
                fd = connect_to_socket6(rip, rport, timeout);
                break;
            }

            default:
                // other address families are skipped
                break;
        }
    }

    freeaddrinfo(result);

    return fd;
}
/*
 * Computes one value for dimension `rd` of chart `st` from the values stored
 * in the database for the time-frame after..before.
 *
 * The time-frame is aligned to multiples of st->update_every and compared
 * against the first and last entries of the chart. The slots are then walked
 * backwards, from the slot of `before` down to the slot of `after`, wrapping
 * to st->entries - 1 when the index drops below zero. Slots for which
 * does_storage_number_exist() fails are skipped.
 *
 * The final statement returns the average (sum / counter); a check for
 * BACKEND_SOURCE_DATA_SUM in `options` precedes it.
 *
 * NOTE(review): the statements guarded by several of the `if` lines, the
 * declarations of `counter`, `slot` and `stop_now`, and the accumulation
 * inside the loop are not visible in this view. Presumably the guards clamp
 * after/before or return early, and the SUM case returns `sum` - confirm
 * against the complete file.
 */
167 static inline calculated_number backend_calculate_value_from_stored_data(RRDSET *st, RRDDIM *rd, time_t after, time_t before, uint32_t options) {
168 time_t first_t = rrdset_first_entry_t(st);
169 time_t last_t = rrdset_last_entry_t(st);
171 if(unlikely(before - after < st->update_every && after != after - after % st->update_every))
172 // when st->update_every is bigger than the frequency we send data to backend
173 // skip the iterations that are not aligned to the database
176 // align the time-frame
177 // for 'after' also skip the first value by adding st->update_every
178 after = after - after % st->update_every + st->update_every;
179 before = before - before % st->update_every;
// keep the aligned time-frame in relation to the chart's first and last entries
181 if(unlikely(after < first_t))
184 if(unlikely(after > before))
185 // this can happen when the st->update_every > before - after
188 if(unlikely(before > last_t))
192 calculated_number sum = 0;
// the walk starts at the newest slot (before) and ends at the oldest (after)
194 long start_at_slot = rrdset_time2slot(st, before),
195 stop_at_slot = rrdset_time2slot(st, after),
// stop_now is raised on reaching stop_at_slot, so that slot is still processed
198 for(slot = start_at_slot; !stop_now ; slot--) {
199 if(unlikely(slot < 0)) slot = st->entries - 1;
200 if(unlikely(slot == stop_at_slot)) stop_now = 1;
202 storage_number n = rd->values[slot];
203 if(unlikely(!does_storage_number_exist(n))) continue;
205 calculated_number value = unpack_storage_number(n);
// no usable value was found in the time-frame
210 if(unlikely(!counter))
213 if(unlikely(options & BACKEND_SOURCE_DATA_SUM))
216 return sum / (calculated_number)counter;
/*
 * Graphite plaintext formatter for the "as collected" data source.
 *
 * Appends to `b` one line of the form
 *   <prefix>.<hostname>.<chart id>.<dimension id> <last collected value> <last collected time in seconds>
 *
 * NOTE(review): the return statement is not visible in this view.
 * backends_main() adds the returned value to its buffered-metrics counter,
 * so it presumably reports the number of metrics appended - confirm.
 */
219 static inline int format_dimension_collected_graphite_plaintext(BUFFER *b, const char *prefix, RRDHOST *host, const char *hostname, RRDSET *st, RRDDIM *rd, time_t after, time_t before, uint32_t options) {
224 buffer_sprintf(b, "%s.%s.%s.%s " COLLECTED_NUMBER_FORMAT " %u\n", prefix, hostname, st->id, rd->id, rd->last_collected_value, (uint32_t)rd->last_collected_time.tv_sec);
/*
 * Graphite plaintext formatter for the stored-data sources.
 *
 * The value comes from backend_calculate_value_from_stored_data() for the
 * time-frame after..before, and is appended to `b` as
 *   <prefix>.<hostname>.<chart id>.<dimension id> <value> <before>
 *
 * NOTE(review): the statements around buffer_sprintf() (any guard on the
 * computed value, and the return statements) are not visible in this view.
 * backends_main() adds the returned value to its buffered-metrics counter -
 * confirm what is returned when nothing is appended.
 */
228 static inline int format_dimension_stored_graphite_plaintext(BUFFER *b, const char *prefix, RRDHOST *host, const char *hostname, RRDSET *st, RRDDIM *rd, time_t after, time_t before, uint32_t options) {
230 calculated_number value = backend_calculate_value_from_stored_data(st, rd, after, before, options);
// the timestamp sent is the end of the time-frame
232 buffer_sprintf(b, "%s.%s.%s.%s " CALCULATED_NUMBER_FORMAT " %u\n", prefix, hostname, st->id, rd->id, value, (uint32_t) before);
/*
 * OpenTSDB telnet formatter for the "as collected" data source.
 *
 * Appends to `b` one line of the form
 *   put <prefix>.<chart id>.<dimension id> <last collected time in seconds> <last collected value> host=<hostname>
 *
 * NOTE(review): the return statement is not visible in this view.
 * backends_main() adds the returned value to its buffered-metrics counter,
 * so it presumably reports the number of metrics appended - confirm.
 */
238 static inline int format_dimension_collected_opentsdb_telnet(BUFFER *b, const char *prefix, RRDHOST *host, const char *hostname, RRDSET *st, RRDDIM *rd, time_t after, time_t before, uint32_t options) {
243 buffer_sprintf(b, "put %s.%s.%s %u " COLLECTED_NUMBER_FORMAT " host=%s\n", prefix, st->id, rd->id, (uint32_t)rd->last_collected_time.tv_sec, rd->last_collected_value, hostname);
/*
 * OpenTSDB telnet formatter for the stored-data sources.
 *
 * The value comes from backend_calculate_value_from_stored_data() for the
 * time-frame after..before, and is appended to `b` as
 *   put <prefix>.<chart id>.<dimension id> <before> <value> host=<hostname>
 *
 * NOTE(review): the statements around buffer_sprintf() (any guard on the
 * computed value, and the return statements) are not visible in this view.
 * backends_main() adds the returned value to its buffered-metrics counter -
 * confirm what is returned when nothing is appended.
 */
247 static inline int format_dimension_stored_opentsdb_telnet(BUFFER *b, const char *prefix, RRDHOST *host, const char *hostname, RRDSET *st, RRDDIM *rd, time_t after, time_t before, uint32_t options) {
249 calculated_number value = backend_calculate_value_from_stored_data(st, rd, after, before, options);
// the timestamp sent is the end of the time-frame
251 buffer_sprintf(b, "put %s.%s.%s %u " CALCULATED_NUMBER_FORMAT " host=%s\n", prefix, st->id, rd->id, (uint32_t) before, value, hostname);
/*
 * Entry point of the BACKEND thread.
 *
 * It reads the "backend" configuration section, selects a formatter from the
 * configured type and data source, and finds or creates the netdata.backend_*
 * monitoring charts. It then repeatedly:
 *   - waits for the next iteration point, aligned to the update frequency,
 *   - formats the dimensions of every chart of localhost into buffer `b`,
 *   - connects to one of the configured destinations when no socket is open,
 *   - sends the buffer and accounts for successes, failures and lost data,
 *   - updates the monitoring charts.
 * The iteration is left whenever netdata_exit is set.
 *
 * NOTE(review): many short statements of this function (declarations, loop
 * headers, closing braces, the end of the function) are not visible in this
 * view, so the comments below describe only what the visible lines do.
 */
257 void *backends_main(void *ptr) {
260 BUFFER *b = buffer_create(1);
261 int (*formatter)(BUFFER *b, const char *prefix, RRDHOST *host, const char *hostname, RRDSET *st, RRDDIM *rd, time_t after, time_t before, uint32_t options);
263 info("BACKEND thread created with task id %d", gettid());
265 if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0)
266 error("Cannot set pthread cancel type to DEFERRED.");
268 if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0)
269 error("Cannot set pthread cancel state to ENABLE.");
271 // ------------------------------------------------------------------------
272 // collect configuration options
274 struct timeval timeout = {
278 int default_port = 0;
281 int enabled = config_get_boolean("backend", "enabled", 0);
282 const char *source = config_get("backend", "data source", "average");
283 const char *type = config_get("backend", "type", "graphite");
284 const char *destination = config_get("backend", "destination", "localhost");
285 const char *prefix = config_get("backend", "prefix", "netdata");
286 const char *hostname = config_get("backend", "hostname", localhost.hostname);
287 int frequency = (int)config_get_number("backend", "update every", 10);
288 int buffer_on_failures = (int)config_get_number("backend", "buffer on failures", 10);
// the default timeout is twice the update frequency, expressed in milliseconds
289 long timeoutms = config_get_number("backend", "timeout ms", frequency * 2 * 1000);
291 // ------------------------------------------------------------------------
292 // validate configuration options
293 // and prepare for sending data to our backend
294 if(!enabled || frequency < 1)
// map the configured data source name to one BACKEND_SOURCE_DATA_* option
297 if(!strcmp(source, "as collected")) {
298 options = BACKEND_SOURCE_DATA_AS_COLLECTED;
300 else if(!strcmp(source, "average")) {
301 options = BACKEND_SOURCE_DATA_AVERAGE;
303 else if(!strcmp(source, "sum") || !strcmp(source, "volume")) {
304 options = BACKEND_SOURCE_DATA_SUM;
// NOTE(review): the log message below says "backed"; it probably should read "backend"
307 error("Invalid data source method '%s' for backend given. Disabling backed.", source);
// pick the formatter: the "collected" variant for as-collected data, the "stored" variant otherwise
311 if(!strcmp(type, "graphite") || !strcmp(type, "graphite:plaintext")) {
313 if(options == BACKEND_SOURCE_DATA_AS_COLLECTED)
314 formatter = format_dimension_collected_graphite_plaintext;
316 formatter = format_dimension_stored_graphite_plaintext;
318 else if(!strcmp(type, "opentsdb") || !strcmp(type, "opentsdb:telnet")) {
320 if(options == BACKEND_SOURCE_DATA_AS_COLLECTED)
321 formatter = format_dimension_collected_opentsdb_telnet;
323 formatter = format_dimension_stored_opentsdb_telnet;
326 error("Unknown backend type '%s'", type);
// NOTE(review): the log message below says "BACKED"; it probably should read "BACKEND"
331 error("BACKED invalid timeout %ld ms given. Assuming %d ms.", timeoutms, frequency * 2 * 1000);
332 timeoutms = frequency * 2 * 1000;
// split the timeout (milliseconds) into whole seconds and remaining microseconds
334 timeout.tv_sec = (timeoutms * 1000) / 1000000;
335 timeout.tv_usec = (timeoutms * 1000) % 1000000;
337 // ------------------------------------------------------------------------
338 // prepare the charts for monitoring the backend
340 struct rusage thread;
343 chart_buffered_metrics = 0,
344 chart_lost_metrics = 0,
345 chart_sent_metrics = 0,
346 chart_buffered_bytes = 0,
347 chart_sent_bytes = 0,
348 chart_transmission_successes = 0,
349 chart_transmission_failures = 0,
350 chart_data_lost_events = 0,
351 chart_lost_bytes = 0,
352 chart_backend_reconnects = 0,
353 chart_backend_latency = 0;
// each chart is looked up first; the create/add calls that follow each lookup build it
355 RRDSET *chart_metrics = rrdset_find("netdata.backend_metrics");
357 chart_metrics = rrdset_create("netdata", "backend_metrics", NULL, "backend", NULL, "Netdata Buffered Metrics", "metrics", 130600, frequency, RRDSET_TYPE_LINE);
358 rrddim_add(chart_metrics, "buffered", NULL, 1, 1, RRDDIM_ABSOLUTE);
359 rrddim_add(chart_metrics, "lost", NULL, 1, 1, RRDDIM_ABSOLUTE);
360 rrddim_add(chart_metrics, "sent", NULL, 1, 1, RRDDIM_ABSOLUTE);
363 RRDSET *chart_bytes = rrdset_find("netdata.backend_bytes");
365 chart_bytes = rrdset_create("netdata", "backend_bytes", NULL, "backend", NULL, "Netdata Backend Data Size", "KB", 130610, frequency, RRDSET_TYPE_AREA);
366 rrddim_add(chart_bytes, "buffered", NULL, 1, 1024, RRDDIM_ABSOLUTE);
367 rrddim_add(chart_bytes, "lost", NULL, 1, 1024, RRDDIM_ABSOLUTE);
368 rrddim_add(chart_bytes, "sent", NULL, 1, 1024, RRDDIM_ABSOLUTE);
371 RRDSET *chart_ops = rrdset_find("netdata.backend_ops");
// NOTE(review): the value 130630 below is also passed for backend_thread_cpu - confirm the duplicate is intended
373 chart_ops = rrdset_create("netdata", "backend_ops", NULL, "backend", NULL, "Netdata Backend Operations", "operations", 130630, frequency, RRDSET_TYPE_LINE);
374 rrddim_add(chart_ops, "write", NULL, 1, 1, RRDDIM_ABSOLUTE);
375 rrddim_add(chart_ops, "discard", NULL, 1, 1, RRDDIM_ABSOLUTE);
376 rrddim_add(chart_ops, "reconnect", NULL, 1, 1, RRDDIM_ABSOLUTE);
377 rrddim_add(chart_ops, "failure", NULL, 1, 1, RRDDIM_ABSOLUTE);
380 RRDSET *chart_latency = rrdset_find("netdata.backend_latency");
382 chart_latency = rrdset_create("netdata", "backend_latency", NULL, "backend", NULL, "Netdata Backend Latency", "ms", 130620, frequency, RRDSET_TYPE_AREA);
383 rrddim_add(chart_latency, "latency", NULL, 1, 1000, RRDDIM_ABSOLUTE);
386 RRDSET *chart_rusage = rrdset_find("netdata.backend_thread_cpu");
388 chart_rusage = rrdset_create("netdata", "backend_thread_cpu", NULL, "backend", NULL, "NetData Backend Thread CPU usage", "milliseconds/s", 130630, frequency, RRDSET_TYPE_STACKED);
389 rrddim_add(chart_rusage, "user", NULL, 1, 1000, RRDDIM_INCREMENTAL);
390 rrddim_add(chart_rusage, "system", NULL, 1, 1000, RRDDIM_INCREMENTAL);
393 // ------------------------------------------------------------------------
394 // prepare the backend main loop
396 info("BACKEND configured ('%s' on '%s' sending '%s' data, every %d seconds, as host '%s', with prefix '%s')", type, destination, source, frequency, hostname, prefix);
398 usec_t step_ut = frequency * USEC_PER_SEC;
// a per-run offset below half a step, taken from the current clock
399 usec_t random_ut = now_realtime_usec() % (step_ut / 2);
400 time_t before = (time_t)((now_realtime_usec() - step_ut) / USEC_PER_SEC);
401 time_t after = before;
405 // ------------------------------------------------------------------------
406 // wait for the next iteration point
408 usec_t now_ut = now_realtime_usec();
// next_ut is the next multiple of step_ut; it also becomes the new `before`
409 usec_t next_ut = now_ut - (now_ut % step_ut) + step_ut;
410 before = (time_t)(next_ut / USEC_PER_SEC);
412 // add a little delay (1/4 of the step) plus some randomness
413 next_ut += (step_ut / 4) + random_ut;
// sleep again if the sleep returned before the target time
415 while(now_ut < next_ut) {
416 sleep_usec(next_ut - now_ut);
417 now_ut = now_realtime_usec();
420 // ------------------------------------------------------------------------
421 // add to the buffer the data we need to send to the backend
424 int pthreadoldcancelstate;
// cancellation is disabled while the host and chart locks are held
426 if(unlikely(pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &pthreadoldcancelstate) != 0))
427 error("Cannot set pthread cancel state to DISABLE.");
429 rrdhost_rdlock(&localhost);
430 for(st = localhost.rrdset_root; st ;st = st->next) {
431 pthread_rwlock_rdlock(&st->rwlock);
434 for(rd = st->dimensions; rd ;rd = rd->next) {
// only dimensions collected at or after `after` are formatted
435 if(rd->last_collected_time.tv_sec >= after)
436 chart_buffered_metrics += formatter(b, prefix, &localhost, hostname, st, rd, after, before, options);
439 pthread_rwlock_unlock(&st->rwlock);
441 rrdhost_unlock(&localhost);
443 if(unlikely(pthread_setcancelstate(pthreadoldcancelstate, NULL) != 0))
444 error("Cannot set pthread cancel state to RESTORE (%d).", pthreadoldcancelstate);
446 chart_buffered_bytes = (collected_number)buffer_strlen(b);
448 // reset the monitoring chart counters
452 chart_transmission_successes =
453 chart_transmission_failures =
454 chart_data_lost_events =
456 chart_backend_reconnects =
457 chart_backend_latency = 0;
459 if(unlikely(netdata_exit)) break;
461 //fprintf(stderr, "\nBACKEND BEGIN:\n%s\nBACKEND END\n", buffer_tostring(b)); // FIXME
462 //fprintf(stderr, "after = %lu, before = %lu\n", after, before);
464 // ------------------------------------------------------------------------
465 // connect to a backend server
467 if(unlikely(sock == -1)) {
468 usec_t start_ut = now_realtime_usec();
// `destination` is a list of entries separated by spaces and/or commas; each one is tried in turn
469 const char *s = destination;
473 // skip separators, moving both s(tart) and e(nd)
// NOTE(review): isspace() receives a plain char here and below; a cast to unsigned char avoids undefined behaviour for negative values
474 while(isspace(*e) || *e == ',') s = ++e;
476 // move e(nd) to the first separator
477 while(*e && !isspace(*e) && *e != ',') e++;
479 // is there anything?
480 if(!*s || s == e) break;
483 strncpyz(buf, s, e - s);
484 chart_backend_reconnects++;
485 sock = connect_to_one(buf, default_port, &timeout);
486 if(sock != -1) break;
// the time spent connecting counts towards the latency chart
489 chart_backend_latency += now_realtime_usec() - start_ut;
492 if(unlikely(netdata_exit)) break;
494 // ------------------------------------------------------------------------
495 // send our buffer to the backend server
497 if(likely(sock != -1)) {
498 size_t len = buffer_strlen(b);
499 usec_t start_ut = now_realtime_usec();
502 flags += MSG_NOSIGNAL;
504 ssize_t written = send(sock, buffer_tostring(b), len, flags);
505 chart_backend_latency += now_realtime_usec() - start_ut;
// only a complete write counts as success
506 if(written != -1 && (size_t)written == len) {
507 // we sent the data successfully
508 chart_transmission_successes++;
509 chart_sent_bytes += written;
510 chart_sent_metrics = chart_buffered_metrics;
512 // reset the failures count
519 // oops! we couldn't send (all or some of the) data
520 error("Failed to write data to database backend '%s'. Willing to write %zu bytes, wrote %zd bytes. Will re-connect.", destination, len, written);
521 chart_transmission_failures++;
524 chart_sent_bytes += written;
526 // increment the counter we check for data loss
529 // close the socket - we will re-open it next time
534 // either the buffer is empty
535 // or is holding the data we couldn't send
536 // so, make sure the next iteration will continue
537 // from where we are now
541 error("Failed to update database backend '%s'", destination);
542 chart_transmission_failures++;
544 // increment the counter we check for data loss
// after more than `buffer_on_failures` failures the buffered data is accounted as lost
548 if(failures > buffer_on_failures) {
549 // too bad! we are going to lose data
550 chart_lost_bytes += buffer_strlen(b);
551 error("Reached %d backend failures. Flushing buffers to protect this host - this results in data loss on back-end server '%s'", failures, destination);
554 chart_data_lost_events++;
555 chart_lost_metrics = chart_buffered_metrics;
558 if(unlikely(netdata_exit)) break;
560 // ------------------------------------------------------------------------
561 // update the monitoring charts
563 if(chart_ops->counter_done) rrdset_next(chart_ops);
564 rrddim_set(chart_ops, "write", chart_transmission_successes);
565 rrddim_set(chart_ops, "discard", chart_data_lost_events);
566 rrddim_set(chart_ops, "failure", chart_transmission_failures);
567 rrddim_set(chart_ops, "reconnect", chart_backend_reconnects);
568 rrdset_done(chart_ops);
570 if(chart_metrics->counter_done) rrdset_next(chart_metrics);
571 rrddim_set(chart_metrics, "buffered", chart_buffered_metrics);
572 rrddim_set(chart_metrics, "lost", chart_lost_metrics);
573 rrddim_set(chart_metrics, "sent", chart_sent_metrics);
574 rrdset_done(chart_metrics);
576 if(chart_bytes->counter_done) rrdset_next(chart_bytes);
577 rrddim_set(chart_bytes, "buffered", chart_buffered_bytes);
578 rrddim_set(chart_bytes, "lost", chart_lost_bytes);
579 rrddim_set(chart_bytes, "sent", chart_sent_bytes);
580 rrdset_done(chart_bytes);
582 if(chart_latency->counter_done) rrdset_next(chart_latency);
583 rrddim_set(chart_latency, "latency", chart_backend_latency);
584 rrdset_done(chart_latency);
// CPU time of this thread, in microseconds
586 getrusage(RUSAGE_THREAD, &thread);
587 if(chart_rusage->counter_done) rrdset_next(chart_rusage);
588 rrddim_set(chart_rusage, "user", thread.ru_utime.tv_sec * 1000000ULL + thread.ru_utime.tv_usec);
589 rrddim_set(chart_rusage, "system", thread.ru_stime.tv_sec * 1000000ULL + thread.ru_stime.tv_usec);
590 rrdset_done(chart_rusage);
// an empty buffer means nothing is pending, so the buffered-metrics count restarts from zero
592 if(likely(buffer_strlen(b) == 0))
593 chart_buffered_metrics = 0;
595 if(unlikely(netdata_exit)) break;
602 info("BACKEND thread exiting");