]> arthur.barton.de Git - netdata.git/blob - src/rrdhost.c
self-cleaning obsolete cgroups and network interfaces from memory; fixes #1163; fixes...
[netdata.git] / src / rrdhost.c
1 #define NETDATA_RRD_INTERNALS 1
2 #include "common.h"
3
4 RRDHOST *localhost = NULL;
5 size_t rrd_hosts_available = 0;
6 pthread_rwlock_t rrd_rwlock = PTHREAD_RWLOCK_INITIALIZER;
7
8 time_t rrdset_free_obsolete_time = 3600;
9 time_t rrdhost_free_orphan_time = 3600;
10
11 // ----------------------------------------------------------------------------
12 // RRDHOST index
13
14 int rrdhost_compare(void* a, void* b) {
15     if(((RRDHOST *)a)->hash_machine_guid < ((RRDHOST *)b)->hash_machine_guid) return -1;
16     else if(((RRDHOST *)a)->hash_machine_guid > ((RRDHOST *)b)->hash_machine_guid) return 1;
17     else return strcmp(((RRDHOST *)a)->machine_guid, ((RRDHOST *)b)->machine_guid);
18 }
19
20 avl_tree_lock rrdhost_root_index = {
21         .avl_tree = { NULL, rrdhost_compare },
22         .rwlock = AVL_LOCK_INITIALIZER
23 };
24
25 RRDHOST *rrdhost_find_by_guid(const char *guid, uint32_t hash) {
26     debug(D_RRDHOST, "Searching in index for host with guid '%s'", guid);
27
28     RRDHOST tmp;
29     strncpyz(tmp.machine_guid, guid, GUID_LEN);
30     tmp.hash_machine_guid = (hash)?hash:simple_hash(tmp.machine_guid);
31
32     return (RRDHOST *)avl_search_lock(&(rrdhost_root_index), (avl *) &tmp);
33 }
34
35 RRDHOST *rrdhost_find_by_hostname(const char *hostname, uint32_t hash) {
36     if(unlikely(!strcmp(hostname, "localhost")))
37         return localhost;
38
39     if(unlikely(!hash)) hash = simple_hash(hostname);
40
41     rrd_rdlock();
42     RRDHOST *host;
43     rrdhost_foreach_read(host) {
44         if(unlikely((hash == host->hash_hostname && !strcmp(hostname, host->hostname)))) {
45             rrd_unlock();
46             return host;
47         }
48     }
49     rrd_unlock();
50
51     return NULL;
52 }
53
// index helpers: insert / remove a host in rrdhost_root_index;
// both evaluate to the pointer returned by the avl call, cast to RRDHOST *
#define rrdhost_index_add(rrdhost) ((RRDHOST *)avl_insert_lock(&(rrdhost_root_index), (avl *)(rrdhost)))
#define rrdhost_index_del(rrdhost) ((RRDHOST *)avl_remove_lock(&(rrdhost_root_index), (avl *)(rrdhost)))
56
57
58 // ----------------------------------------------------------------------------
59 // RRDHOST - internal helpers
60
61 static inline void rrdhost_init_hostname(RRDHOST *host, const char *hostname) {
62     freez(host->hostname);
63     host->hostname = strdupz(hostname);
64     host->hash_hostname = simple_hash(host->hostname);
65 }
66
67 static inline void rrdhost_init_os(RRDHOST *host, const char *os) {
68     freez(host->os);
69     host->os = strdupz(os?os:"unknown");
70 }
71
72 static inline void rrdhost_init_machine_guid(RRDHOST *host, const char *machine_guid) {
73     strncpy(host->machine_guid, machine_guid, GUID_LEN);
74     host->machine_guid[GUID_LEN] = '\0';
75     host->hash_machine_guid = simple_hash(host->machine_guid);
76 }
77
78
79 // ----------------------------------------------------------------------------
80 // RRDHOST - add a host
81
82 RRDHOST *rrdhost_create(const char *hostname,
83         const char *guid,
84         const char *os,
85         int update_every,
86         int entries,
87         RRD_MEMORY_MODE memory_mode,
88         int health_enabled,
89         int rrdpush_enabled,
90         char *rrdpush_destination,
91         char *rrdpush_api_key,
92         int is_localhost
93 ) {
94
95     debug(D_RRDHOST, "Host '%s': adding with guid '%s'", hostname, guid);
96
97     RRDHOST *host = callocz(1, sizeof(RRDHOST));
98
99     host->rrd_update_every    = update_every;
100     host->rrd_history_entries = entries;
101     host->rrd_memory_mode     = memory_mode;
102     host->health_enabled      = (memory_mode == RRD_MEMORY_MODE_NONE)? 0 : health_enabled;
103     host->rrdpush_enabled     = (rrdpush_enabled && rrdpush_destination && *rrdpush_destination && rrdpush_api_key && *rrdpush_api_key);
104     host->rrdpush_destination = (host->rrdpush_enabled)?strdupz(rrdpush_destination):NULL;
105     host->rrdpush_api_key     = (host->rrdpush_enabled)?strdupz(rrdpush_api_key):NULL;
106
107     host->rrdpush_pipe[0] = -1;
108     host->rrdpush_pipe[1] = -1;
109     host->rrdpush_socket  = -1;
110
111     pthread_mutex_init(&host->rrdpush_mutex, NULL);
112     pthread_rwlock_init(&host->rrdhost_rwlock, NULL);
113
114     rrdhost_init_hostname(host, hostname);
115     rrdhost_init_machine_guid(host, guid);
116     rrdhost_init_os(host, os);
117
118     avl_init_lock(&(host->rrdset_root_index),      rrdset_compare);
119     avl_init_lock(&(host->rrdset_root_index_name), rrdset_compare_name);
120     avl_init_lock(&(host->rrdfamily_root_index),   rrdfamily_compare);
121     avl_init_lock(&(host->variables_root_index),   rrdvar_compare);
122
123     // ------------------------------------------------------------------------
124     // initialize health variables
125
126     host->health_log.next_log_id = 1;
127     host->health_log.next_alarm_id = 1;
128     host->health_log.max = 1000;
129     host->health_log.next_log_id =
130     host->health_log.next_alarm_id = (uint32_t)now_realtime_sec();
131
132     long n = config_get_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", host->health_log.max);
133     if(n < 10) {
134         error("Host '%s': health configuration has invalid max log entries %ld. Using default %u", host->hostname, n, host->health_log.max);
135         config_set_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", (long)host->health_log.max);
136     }
137     else
138         host->health_log.max = (unsigned int)n;
139
140     pthread_rwlock_init(&(host->health_log.alarm_log_rwlock), NULL);
141
142     char filename[FILENAME_MAX + 1];
143
144     if(is_localhost) {
145
146         host->cache_dir  = strdupz(netdata_configured_cache_dir);
147         host->varlib_dir = strdupz(netdata_configured_varlib_dir);
148
149     }
150     else {
151         // this is not localhost - append our GUID to localhost path
152
153         snprintfz(filename, FILENAME_MAX, "%s/%s", netdata_configured_cache_dir, host->machine_guid);
154         host->cache_dir = strdupz(filename);
155
156         if(host->rrd_memory_mode == RRD_MEMORY_MODE_MAP || host->rrd_memory_mode == RRD_MEMORY_MODE_SAVE) {
157             int r = mkdir(host->cache_dir, 0775);
158             if(r != 0 && errno != EEXIST)
159                 error("Host '%s': cannot create directory '%s'", host->hostname, host->cache_dir);
160         }
161
162         snprintfz(filename, FILENAME_MAX, "%s/%s", netdata_configured_varlib_dir, host->machine_guid);
163         host->varlib_dir = strdupz(filename);
164
165         if(host->health_enabled) {
166             int r = mkdir(host->varlib_dir, 0775);
167             if(r != 0 && errno != EEXIST)
168                 error("Host '%s': cannot create directory '%s'", host->hostname, host->varlib_dir);
169        }
170
171     }
172
173     if(host->health_enabled) {
174         snprintfz(filename, FILENAME_MAX, "%s/health", host->varlib_dir);
175         int r = mkdir(filename, 0775);
176         if(r != 0 && errno != EEXIST)
177             error("Host '%s': cannot create directory '%s'", host->hostname, filename);
178     }
179
180     snprintfz(filename, FILENAME_MAX, "%s/health/health-log.db", host->varlib_dir);
181     host->health_log_filename = strdupz(filename);
182
183     snprintfz(filename, FILENAME_MAX, "%s/alarm-notify.sh", netdata_configured_plugins_dir);
184     host->health_default_exec = strdupz(config_get(CONFIG_SECTION_HEALTH, "script to execute on alarm", filename));
185     host->health_default_recipient = strdup("root");
186
187
188     // ------------------------------------------------------------------------
189     // load health configuration
190
191     if(host->health_enabled) {
192         health_alarm_log_load(host);
193         health_alarm_log_open(host);
194
195         rrdhost_wrlock(host);
196         health_readdir(host, health_config_dir());
197         rrdhost_unlock(host);
198     }
199
200
201     // ------------------------------------------------------------------------
202     // link it and add it to the index
203
204     rrd_wrlock();
205
206     if(is_localhost) {
207         host->next = localhost;
208         localhost = host;
209     }
210     else {
211         if(localhost) {
212             host->next = localhost->next;
213             localhost->next = host;
214         }
215         else localhost = host;
216     }
217
218     RRDHOST *t = rrdhost_index_add(host);
219
220     if(t != host) {
221         error("Host '%s': cannot add host with machine guid '%s' to index. It already exists as host '%s' with machine guid '%s'.", host->hostname, host->machine_guid, t->hostname, t->machine_guid);
222         rrdhost_free(host);
223         host = NULL;
224     }
225     else {
226         info("Host '%s' with guid '%s' initialized"
227                      ", os %s"
228                      ", update every %d"
229                      ", memory mode %s"
230                      ", history entries %d"
231                      ", streaming %s"
232                      " (to '%s' with api key '%s')"
233                      ", health %s"
234                      ", cache_dir '%s'"
235                      ", varlib_dir '%s'"
236                      ", health_log '%s'"
237                      ", alarms default handler '%s'"
238                      ", alarms default recipient '%s'"
239              , host->hostname
240              , host->machine_guid
241              , host->os
242              , host->rrd_update_every
243              , rrd_memory_mode_name(host->rrd_memory_mode)
244              , host->rrd_history_entries
245              , host->rrdpush_enabled?"enabled":"disabled"
246              , host->rrdpush_destination?host->rrdpush_destination:""
247              , host->rrdpush_api_key?host->rrdpush_api_key:""
248              , host->health_enabled?"enabled":"disabled"
249              , host->cache_dir
250              , host->varlib_dir
251              , host->health_log_filename
252              , host->health_default_exec
253              , host->health_default_recipient
254         );
255     }
256
257     rrd_hosts_available++;
258     rrd_unlock();
259
260     return host;
261 }
262
263 RRDHOST *rrdhost_find_or_create(
264           const char *hostname
265         , const char *guid
266         , const char *os
267         , int update_every
268         , int history
269         , RRD_MEMORY_MODE mode
270         , int health_enabled
271         , int rrdpush_enabled
272         , char *rrdpush_destination
273         , char *rrdpush_api_key
274 ) {
275     debug(D_RRDHOST, "Searching for host '%s' with guid '%s'", hostname, guid);
276
277     RRDHOST *host = rrdhost_find_by_guid(guid, 0);
278     if(!host) {
279         host = rrdhost_create(
280                 hostname
281                 , guid
282                 , os
283                 , update_every
284                 , history
285                 , mode
286                 , health_enabled
287                 , rrdpush_enabled
288                 , rrdpush_destination
289                 , rrdpush_api_key
290                 , 0
291         );
292     }
293     else {
294         host->health_enabled = health_enabled;
295
296         if(strcmp(host->hostname, hostname)) {
297             char *t = host->hostname;
298             char *n = strdupz(hostname);
299             host->hostname = n;
300             freez(t);
301         }
302
303         if(host->rrd_update_every != update_every)
304             error("Host '%s' has an update frequency of %d seconds, but the wanted one is %d seconds.", host->hostname, host->rrd_update_every, update_every);
305
306         if(host->rrd_history_entries != history)
307             error("Host '%s' has history of %d entries, but the wanted one is %d entries.", host->hostname, host->rrd_history_entries, history);
308
309         if(host->rrd_memory_mode != mode)
310             error("Host '%s' has memory mode '%s', but the wanted one is '%s'.", host->hostname, rrd_memory_mode_name(host->rrd_memory_mode), rrd_memory_mode_name(mode));
311     }
312
313     rrdhost_cleanup_remote_stale(host);
314
315     return host;
316 }
317
318 void rrdhost_cleanup_remote_stale(RRDHOST *protected) {
319     time_t now = now_realtime_sec();
320
321     rrd_wrlock();
322
323     RRDHOST *h;
324
325 restart_after_removal:
326     rrdhost_foreach_write(h) {
327         if(h != protected
328            && h != localhost
329            && !h->connected_senders
330            && h->senders_disconnected_time + rrdhost_free_orphan_time < now) {
331             info("Host '%s' with machine guid '%s' is obsolete - cleaning up.", h->hostname, h->machine_guid);
332             rrdhost_save(h);
333             rrdhost_free(h);
334             goto restart_after_removal;
335         }
336     }
337
338     rrd_unlock();
339 }
340
341 // ----------------------------------------------------------------------------
342 // RRDHOST global / startup initialization
343
344 void rrd_init(char *hostname) {
345     rrdset_free_obsolete_time = config_get_number(CONFIG_SECTION_GLOBAL, "cleanup obsolete charts after seconds", rrdset_free_obsolete_time);
346
347     health_init();
348     registry_init();
349     rrdpush_init();
350
351     debug(D_RRDHOST, "Initializing localhost with hostname '%s'", hostname);
352     localhost = rrdhost_create(
353             hostname
354             , registry_get_this_machine_guid()
355             , os_type
356             , default_rrd_update_every
357             , default_rrd_history_entries
358             , default_rrd_memory_mode
359             , default_health_enabled
360             , default_rrdpush_enabled
361             , default_rrdpush_destination
362             , default_rrdpush_api_key
363             , 1
364     );
365 }
366
367 // ----------------------------------------------------------------------------
368 // RRDHOST - lock validations
// these are only used when NETDATA_INTERNAL_CHECKS is set
370
371 void rrdhost_check_rdlock_int(RRDHOST *host, const char *file, const char *function, const unsigned long line) {
372     debug(D_RRDHOST, "Checking read lock on host '%s'", host->hostname);
373
374     int ret = pthread_rwlock_trywrlock(&host->rrdhost_rwlock);
375     if(ret == 0)
376         fatal("RRDHOST '%s' should be read-locked, but it is not, at function %s() at line %lu of file '%s'", host->hostname, function, line, file);
377 }
378
379 void rrdhost_check_wrlock_int(RRDHOST *host, const char *file, const char *function, const unsigned long line) {
380     debug(D_RRDHOST, "Checking write lock on host '%s'", host->hostname);
381
382     int ret = pthread_rwlock_tryrdlock(&host->rrdhost_rwlock);
383     if(ret == 0)
384         fatal("RRDHOST '%s' should be write-locked, but it is not, at function %s() at line %lu of file '%s'", host->hostname, function, line, file);
385 }
386
387 void rrd_check_rdlock_int(const char *file, const char *function, const unsigned long line) {
388     debug(D_RRDHOST, "Checking read lock on all RRDs");
389
390     int ret = pthread_rwlock_trywrlock(&rrd_rwlock);
391     if(ret == 0)
392         fatal("RRDs should be read-locked, but it are not, at function %s() at line %lu of file '%s'", function, line, file);
393 }
394
395 void rrd_check_wrlock_int(const char *file, const char *function, const unsigned long line) {
396     debug(D_RRDHOST, "Checking write lock on all RRDs");
397
398     int ret = pthread_rwlock_tryrdlock(&rrd_rwlock);
399     if(ret == 0)
400         fatal("RRDs should be write-locked, but it are not, at function %s() at line %lu of file '%s'", function, line, file);
401 }
402
403 // ----------------------------------------------------------------------------
404 // RRDHOST - free
405
406 void rrdhost_free(RRDHOST *host) {
407     if(!host) return;
408
409     info("Freeing all memory for host '%s'...", host->hostname);
410
411     rrd_check_wrlock();     // make sure the RRDs are write locked
412     rrdhost_wrlock(host);   // lock this RRDHOST
413
414     // ------------------------------------------------------------------------
415     // release its children resources
416
417     while(host->rrdset_root) rrdset_free(host->rrdset_root);
418
419     while(host->alarms) rrdcalc_free(host, host->alarms);
420     while(host->templates) rrdcalctemplate_free(host, host->templates);
421     health_alarm_log_free(host);
422
423
424     // ------------------------------------------------------------------------
425     // remove it from the indexes
426
427     if(rrdhost_index_del(host) != host)
428         error("RRDHOST '%s' removed from index, deleted the wrong entry.", host->hostname);
429
430
431     // ------------------------------------------------------------------------
432     // unlink it from the host
433
434     if(host == localhost) {
435         localhost = host->next;
436     }
437     else {
438         // find the previous one
439         RRDHOST *h;
440         for(h = localhost; h && h->next != host ; h = h->next) ;
441
442         // bypass it
443         if(h) h->next = host->next;
444         else error("Request to free RRDHOST '%s': cannot find it", host->hostname);
445     }
446
447     // ------------------------------------------------------------------------
448     // free it
449
450     rrdpush_sender_thread_stop(host);
451
452     freez(host->os);
453     freez(host->cache_dir);
454     freez(host->varlib_dir);
455     freez(host->rrdpush_api_key);
456     freez(host->rrdpush_destination);
457     freez(host->health_default_exec);
458     freez(host->health_default_recipient);
459     freez(host->health_log_filename);
460     freez(host->hostname);
461     rrdhost_unlock(host);
462     freez(host);
463
464     rrd_hosts_available--;
465 }
466
467 void rrdhost_free_all(void) {
468     rrd_wrlock();
469     while(localhost) rrdhost_free(localhost);
470     rrd_unlock();
471 }
472
473 // ----------------------------------------------------------------------------
474 // RRDHOST - save
475
476 void rrdhost_save(RRDHOST *host) {
477     if(!host) return;
478
479     info("Saving database of host '%s'...", host->hostname);
480
481     RRDSET *st;
482
483     // we get a write lock
484     // to ensure only one thread is saving the database
485     rrdhost_wrlock(host);
486
487     rrdset_foreach_write(st, host) {
488         rrdset_rdlock(st);
489         rrdset_save(st);
490         rrdset_unlock(st);
491     }
492
493     rrdhost_unlock(host);
494 }
495
496 void rrdhost_save_all(void) {
497     info("Saving database [%zu hosts(s)]...", rrd_hosts_available);
498
499     rrd_rdlock();
500
501     RRDHOST *host;
502     rrdhost_foreach_read(host)
503         rrdhost_save(host);
504
505     rrd_unlock();
506 }
507
508 void rrdhost_cleanup(RRDHOST *host) {
509     time_t now = now_realtime_sec();
510
511     RRDSET *st;
512
513 restart_after_removal:
514     rrdset_foreach_write(st, host) {
515         if(unlikely(rrdset_flag_check(st, RRDSET_FLAG_OBSOLETE)
516                     && st->last_accessed_time + rrdset_free_obsolete_time < now
517                     && st->last_updated.tv_sec + rrdset_free_obsolete_time < now
518                     && st->last_collected_time.tv_sec + rrdset_free_obsolete_time < now
519         )) {
520
521             rrdset_rdlock(st);
522             rrdset_save(st);
523             rrdset_unlock(st);
524
525             rrdset_free(st);
526             goto restart_after_removal;
527         }
528     }
529 }