]> arthur.barton.de Git - netdata.git/blob - src/rrdhost.c
allow netdata to cleanup orphan hosts that have not pushed any metrics for some time...
[netdata.git] / src / rrdhost.c
1 #define NETDATA_RRD_INTERNALS 1
2 #include "common.h"
3
// The local host. rrdhost_create() keeps it at the head of the linked list
// of hosts (remote hosts are linked right after it).
RRDHOST *localhost = NULL;

// Global read-write lock, statically initialized. It is the lock probed by
// rrd_check_rdlock_int() and rrd_check_wrlock_int().
// NOTE(review): presumably the lock behind rrd_rdlock()/rrd_wrlock()/rrd_unlock() - confirm in the header.
pthread_rwlock_t rrd_rwlock = PTHREAD_RWLOCK_INITIALIZER;

// Time window, in seconds, that rrdhost_cleanup_remote_stale() adds to the
// senders_disconnected_time of a host when deciding about its cleanup.
time_t rrdhost_free_orphan_time = 3600;
9
10 // ----------------------------------------------------------------------------
11 // RRDHOST index
12
13 int rrdhost_compare(void* a, void* b) {
14     if(((RRDHOST *)a)->hash_machine_guid < ((RRDHOST *)b)->hash_machine_guid) return -1;
15     else if(((RRDHOST *)a)->hash_machine_guid > ((RRDHOST *)b)->hash_machine_guid) return 1;
16     else return strcmp(((RRDHOST *)a)->machine_guid, ((RRDHOST *)b)->machine_guid);
17 }
18
// The index of all hosts, ordered by rrdhost_compare() (i.e. by machine guid).
// It is searched by rrdhost_find_guid() and modified through the
// rrdhost_index_add() / rrdhost_index_del() macros.
avl_tree_lock rrdhost_root_index = {
        // positional initializer: NULL, followed by the comparator
        // NOTE(review): presumably { root, compare function } - confirm against the avl_tree definition
        .avl_tree = { NULL, rrdhost_compare },
        .rwlock = AVL_LOCK_INITIALIZER
};
23
24 RRDHOST *rrdhost_find_guid(const char *guid, uint32_t hash) {
25     debug(D_RRDHOST, "Searching in index for host with guid '%s'", guid);
26
27     RRDHOST tmp;
28     strncpyz(tmp.machine_guid, guid, GUID_LEN);
29     tmp.hash_machine_guid = (hash)?hash:simple_hash(tmp.machine_guid);
30
31     return (RRDHOST *)avl_search_lock(&(rrdhost_root_index), (avl *) &tmp);
32 }
33
// Add / remove a host to / from rrdhost_root_index.
// Both evaluate to the RRDHOST * returned by the avl call; rrdhost_create()
// treats a result different from the host it passed to rrdhost_index_add()
// as "a host with this machine guid is already indexed".
// The whole expansion is parenthesized, so that the cast result can be used
// safely inside larger expressions (e.g. followed by '->').
#define rrdhost_index_add(rrdhost) ((RRDHOST *)avl_insert_lock(&(rrdhost_root_index), (avl *)(rrdhost)))
#define rrdhost_index_del(rrdhost) ((RRDHOST *)avl_remove_lock(&(rrdhost_root_index), (avl *)(rrdhost)))
36
37
38 // ----------------------------------------------------------------------------
39 // RRDHOST - internal helpers
40
41 static inline void rrdhost_init_hostname(RRDHOST *host, const char *hostname) {
42     freez(host->hostname);
43     host->hostname = strdupz(hostname);
44     host->hash_hostname = simple_hash(host->hostname);
45 }
46
47 static inline void rrdhost_init_os(RRDHOST *host, const char *os) {
48     freez(host->os);
49     host->os = strdupz(os?os:"unknown");
50 }
51
52 static inline void rrdhost_init_machine_guid(RRDHOST *host, const char *machine_guid) {
53     strncpy(host->machine_guid, machine_guid, GUID_LEN);
54     host->machine_guid[GUID_LEN] = '\0';
55     host->hash_machine_guid = simple_hash(host->machine_guid);
56 }
57
58
59 // ----------------------------------------------------------------------------
60 // RRDHOST - add a host
61
62 RRDHOST *rrdhost_create(const char *hostname,
63         const char *guid,
64         const char *os,
65         int update_every,
66         int entries,
67         RRD_MEMORY_MODE memory_mode,
68         int health_enabled,
69         int rrdpush_enabled,
70         char *rrdpush_destination,
71         char *rrdpush_api_key,
72         int is_localhost
73 ) {
74
75     debug(D_RRDHOST, "Host '%s': adding with guid '%s'", hostname, guid);
76
77     RRDHOST *host = callocz(1, sizeof(RRDHOST));
78
79     host->rrd_update_every    = update_every;
80     host->rrd_history_entries = entries;
81     host->rrd_memory_mode     = memory_mode;
82     host->health_enabled      = (memory_mode == RRD_MEMORY_MODE_NONE)? 0 : health_enabled;
83     host->rrdpush_enabled     = (rrdpush_enabled && rrdpush_destination && *rrdpush_destination && rrdpush_api_key && *rrdpush_api_key);
84     host->rrdpush_destination = (host->rrdpush_enabled)?strdupz(rrdpush_destination):NULL;
85     host->rrdpush_api_key     = (host->rrdpush_enabled)?strdupz(rrdpush_api_key):NULL;
86
87     host->rrdpush_pipe[0] = -1;
88     host->rrdpush_pipe[1] = -1;
89     host->rrdpush_socket  = -1;
90
91     pthread_mutex_init(&host->rrdpush_mutex, NULL);
92     pthread_rwlock_init(&host->rrdhost_rwlock, NULL);
93
94     rrdhost_init_hostname(host, hostname);
95     rrdhost_init_machine_guid(host, guid);
96     rrdhost_init_os(host, os);
97
98     avl_init_lock(&(host->rrdset_root_index),      rrdset_compare);
99     avl_init_lock(&(host->rrdset_root_index_name), rrdset_compare_name);
100     avl_init_lock(&(host->rrdfamily_root_index),   rrdfamily_compare);
101     avl_init_lock(&(host->variables_root_index),   rrdvar_compare);
102
103     // ------------------------------------------------------------------------
104     // initialize health variables
105
106     host->health_log.next_log_id = 1;
107     host->health_log.next_alarm_id = 1;
108     host->health_log.max = 1000;
109     host->health_log.next_log_id =
110     host->health_log.next_alarm_id = (uint32_t)now_realtime_sec();
111
112     long n = config_get_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", host->health_log.max);
113     if(n < 10) {
114         error("Host '%s': health configuration has invalid max log entries %ld. Using default %u", host->hostname, n, host->health_log.max);
115         config_set_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", (long)host->health_log.max);
116     }
117     else
118         host->health_log.max = (unsigned int)n;
119
120     pthread_rwlock_init(&(host->health_log.alarm_log_rwlock), NULL);
121
122     char filename[FILENAME_MAX + 1];
123
124     if(is_localhost) {
125
126         host->cache_dir  = strdupz(netdata_configured_cache_dir);
127         host->varlib_dir = strdupz(netdata_configured_varlib_dir);
128
129     }
130     else {
131         // this is not localhost - append our GUID to localhost path
132
133         snprintfz(filename, FILENAME_MAX, "%s/%s", netdata_configured_cache_dir, host->machine_guid);
134         host->cache_dir = strdupz(filename);
135
136         if(host->rrd_memory_mode == RRD_MEMORY_MODE_MAP || host->rrd_memory_mode == RRD_MEMORY_MODE_SAVE) {
137             int r = mkdir(host->cache_dir, 0775);
138             if(r != 0 && errno != EEXIST)
139                 error("Host '%s': cannot create directory '%s'", host->hostname, host->cache_dir);
140         }
141
142         snprintfz(filename, FILENAME_MAX, "%s/%s", netdata_configured_varlib_dir, host->machine_guid);
143         host->varlib_dir = strdupz(filename);
144
145         if(host->health_enabled) {
146             int r = mkdir(host->varlib_dir, 0775);
147             if(r != 0 && errno != EEXIST)
148                 error("Host '%s': cannot create directory '%s'", host->hostname, host->varlib_dir);
149
150             snprintfz(filename, FILENAME_MAX, "%s/health", host->varlib_dir);
151             r = mkdir(filename, 0775);
152             if(r != 0 && errno != EEXIST)
153                 error("Host '%s': cannot create directory '%s'", host->hostname, filename);
154         }
155
156     }
157
158     snprintfz(filename, FILENAME_MAX, "%s/health/health-log.db", host->varlib_dir);
159     host->health_log_filename = strdupz(filename);
160
161     snprintfz(filename, FILENAME_MAX, "%s/alarm-notify.sh", netdata_configured_plugins_dir);
162     host->health_default_exec = strdupz(config_get(CONFIG_SECTION_HEALTH, "script to execute on alarm", filename));
163     host->health_default_recipient = strdup("root");
164
165
166     // ------------------------------------------------------------------------
167     // load health configuration
168
169     if(host->health_enabled) {
170         health_alarm_log_load(host);
171         health_alarm_log_open(host);
172
173         rrdhost_wrlock(host);
174         health_readdir(host, health_config_dir());
175         rrdhost_unlock(host);
176     }
177
178
179     // ------------------------------------------------------------------------
180     // link it and add it to the index
181
182     rrd_wrlock();
183
184     if(is_localhost) {
185         host->next = localhost;
186         localhost = host;
187     }
188     else {
189         if(localhost) {
190             host->next = localhost->next;
191             localhost->next = host;
192         }
193         else localhost = host;
194     }
195
196     RRDHOST *t = rrdhost_index_add(host);
197
198     if(t != host) {
199         error("Host '%s': cannot add host with machine guid '%s' to index. It already exists as host '%s' with machine guid '%s'.", host->hostname, host->machine_guid, t->hostname, t->machine_guid);
200         rrdhost_free(host);
201         host = NULL;
202     }
203     else {
204         info("Host '%s' with guid '%s' initialized"
205                      ", os: %s"
206                      ", update every: %d"
207                      ", memory mode: %s"
208                      ", history entries: %d"
209                      ", streaming: %s"
210                      " to: '%s' (api key: '%s')"
211                      ", health: %s"
212                      ", cache_dir: '%s'"
213                      ", varlib_dir: '%s'"
214                      ", health_log: '%s'"
215                      ", alarms default handler: '%s'"
216                      ", alarms default recipient: '%s'"
217              , host->hostname
218              , host->machine_guid
219              , host->os
220              , host->rrd_update_every
221              , rrd_memory_mode_name(host->rrd_memory_mode)
222              , host->rrd_history_entries
223              , host->rrdpush_enabled?"enabled":"disabled"
224              , host->rrdpush_destination
225              , host->rrdpush_api_key
226              , host->health_enabled?"enabled":"disabled"
227              , host->cache_dir
228              , host->varlib_dir
229              , host->health_log_filename
230              , host->health_default_exec
231              , host->health_default_recipient
232         );
233     }
234
235     rrd_unlock();
236
237     return host;
238 }
239
240 RRDHOST *rrdhost_find_or_create(
241           const char *hostname
242         , const char *guid
243         , const char *os
244         , int update_every
245         , int history
246         , RRD_MEMORY_MODE mode
247         , int health_enabled
248         , int rrdpush_enabled
249         , char *rrdpush_destination
250         , char *rrdpush_api_key
251 ) {
252     debug(D_RRDHOST, "Searching for host '%s' with guid '%s'", hostname, guid);
253
254     RRDHOST *host = rrdhost_find_guid(guid, 0);
255     if(!host) {
256         host = rrdhost_create(
257                 hostname
258                 , guid
259                 , os
260                 , update_every
261                 , history
262                 , mode
263                 , health_enabled
264                 , rrdpush_enabled
265                 , rrdpush_destination
266                 , rrdpush_api_key
267                 , 0
268         );
269     }
270     else {
271         host->health_enabled = health_enabled;
272
273         if(strcmp(host->hostname, hostname)) {
274             char *t = host->hostname;
275             char *n = strdupz(hostname);
276             host->hostname = n;
277             freez(t);
278         }
279
280         if(host->rrd_update_every != update_every)
281             error("Host '%s' has an update frequency of %d seconds, but the wanted one is %d seconds.", host->hostname, host->rrd_update_every, update_every);
282
283         if(host->rrd_history_entries != history)
284             error("Host '%s' has history of %d entries, but the wanted one is %d entries.", host->hostname, host->rrd_history_entries, history);
285
286         if(host->rrd_memory_mode != mode)
287             error("Host '%s' has memory mode '%s', but the wanted one is '%s'.", host->hostname, rrd_memory_mode_name(host->rrd_memory_mode), rrd_memory_mode_name(mode));
288     }
289
290     rrdhost_cleanup_remote_stale(host);
291
292     return host;
293 }
294
295 void rrdhost_cleanup_remote_stale(RRDHOST *protected) {
296     rrd_wrlock();
297
298     RRDHOST *h;
299     rrdhost_foreach_write(h) {
300         if(h != protected
301            && h != localhost
302            && !h->connected_senders
303            && h->senders_disconnected_time + rrdhost_free_orphan_time > now_realtime_sec()) {
304             info("Host '%s' with machine guid '%s' is obsolete - cleaning up.", h->hostname, h->machine_guid);
305             rrdhost_save(h);
306             rrdhost_free(h);
307             break;
308         }
309     }
310
311     rrd_unlock();
312 }
313
314 // ----------------------------------------------------------------------------
315 // RRDHOST global / startup initialization
316
317 void rrd_init(char *hostname) {
318     health_init();
319     registry_init();
320     rrdpush_init();
321
322     debug(D_RRDHOST, "Initializing localhost with hostname '%s'", hostname);
323     localhost = rrdhost_create(
324             hostname
325             , registry_get_this_machine_guid()
326             , os_type
327             , default_rrd_update_every
328             , default_rrd_history_entries
329             , default_rrd_memory_mode
330             , default_health_enabled
331             , default_rrdpush_enabled
332             , default_rrdpush_destination
333             , default_rrdpush_api_key
334             , 1
335     );
336 }
337
338 // ----------------------------------------------------------------------------
339 // RRDHOST - lock validations
// these are only used when NETDATA_INTERNAL_CHECKS is set
341
342 void rrdhost_check_rdlock_int(RRDHOST *host, const char *file, const char *function, const unsigned long line) {
343     debug(D_RRDHOST, "Checking read lock on host '%s'", host->hostname);
344
345     int ret = pthread_rwlock_trywrlock(&host->rrdhost_rwlock);
346     if(ret == 0)
347         fatal("RRDHOST '%s' should be read-locked, but it is not, at function %s() at line %lu of file '%s'", host->hostname, function, line, file);
348 }
349
350 void rrdhost_check_wrlock_int(RRDHOST *host, const char *file, const char *function, const unsigned long line) {
351     debug(D_RRDHOST, "Checking write lock on host '%s'", host->hostname);
352
353     int ret = pthread_rwlock_tryrdlock(&host->rrdhost_rwlock);
354     if(ret == 0)
355         fatal("RRDHOST '%s' should be write-locked, but it is not, at function %s() at line %lu of file '%s'", host->hostname, function, line, file);
356 }
357
358 void rrd_check_rdlock_int(const char *file, const char *function, const unsigned long line) {
359     debug(D_RRDHOST, "Checking read lock on all RRDs");
360
361     int ret = pthread_rwlock_trywrlock(&rrd_rwlock);
362     if(ret == 0)
363         fatal("RRDs should be read-locked, but it are not, at function %s() at line %lu of file '%s'", function, line, file);
364 }
365
366 void rrd_check_wrlock_int(const char *file, const char *function, const unsigned long line) {
367     debug(D_RRDHOST, "Checking write lock on all RRDs");
368
369     int ret = pthread_rwlock_tryrdlock(&rrd_rwlock);
370     if(ret == 0)
371         fatal("RRDs should be write-locked, but it are not, at function %s() at line %lu of file '%s'", function, line, file);
372 }
373
374 // ----------------------------------------------------------------------------
375 // RRDHOST - free
376
377 void rrdhost_free(RRDHOST *host) {
378     if(!host) return;
379
380     info("Freeing all memory for host '%s'...", host->hostname);
381
382     rrd_check_wrlock();     // make sure the RRDs are write locked
383     rrdhost_wrlock(host);   // lock this RRDHOST
384
385     // ------------------------------------------------------------------------
386     // release its children resources
387
388     while(host->rrdset_root) rrdset_free(host->rrdset_root);
389
390     while(host->alarms) rrdcalc_free(host, host->alarms);
391     while(host->templates) rrdcalctemplate_free(host, host->templates);
392     health_alarm_log_free(host);
393
394
395     // ------------------------------------------------------------------------
396     // remove it from the indexes
397
398     if(rrdhost_index_del(host) != host)
399         error("RRDHOST '%s' removed from index, deleted the wrong entry.", host->hostname);
400
401
402     // ------------------------------------------------------------------------
403     // unlink it from the host
404
405     if(host == localhost) {
406         localhost = host->next;
407     }
408     else {
409         // find the previous one
410         RRDHOST *h;
411         for(h = localhost; h && h->next != host ; h = h->next) ;
412
413         // bypass it
414         if(h) h->next = host->next;
415         else error("Request to free RRDHOST '%s': cannot find it", host->hostname);
416     }
417
418     // ------------------------------------------------------------------------
419     // free it
420
421     rrdpush_sender_thread_stop(host);
422
423     freez(host->os);
424     freez(host->cache_dir);
425     freez(host->varlib_dir);
426     freez(host->rrdpush_api_key);
427     freez(host->rrdpush_destination);
428     freez(host->health_default_exec);
429     freez(host->health_default_recipient);
430     freez(host->health_log_filename);
431     freez(host->hostname);
432     rrdhost_unlock(host);
433     freez(host);
434
435     info("Host memory cleanup completed...");
436 }
437
438 void rrdhost_free_all(void) {
439     rrd_wrlock();
440     while(localhost) rrdhost_free(localhost);
441     rrd_unlock();
442 }
443
444 // ----------------------------------------------------------------------------
445 // RRDHOST - save
446
447 void rrdhost_save(RRDHOST *host) {
448     if(!host) return;
449
450     info("Saving host '%s' database...", host->hostname);
451
452     RRDSET *st;
453     RRDDIM *rd;
454
455     // we get a write lock
456     // to ensure only one thread is saving the database
457     rrdhost_wrlock(host);
458
459     rrdset_foreach_write(st, host) {
460         rrdset_rdlock(st);
461
462         if(st->rrd_memory_mode == RRD_MEMORY_MODE_SAVE) {
463             debug(D_RRD_STATS, "Saving stats '%s' to '%s'.", st->name, st->cache_filename);
464             savememory(st->cache_filename, st, st->memsize);
465         }
466
467         rrddim_foreach_read(rd, st) {
468             if(likely(rd->rrd_memory_mode == RRD_MEMORY_MODE_SAVE)) {
469                 debug(D_RRD_STATS, "Saving dimension '%s' to '%s'.", rd->name, rd->cache_filename);
470                 savememory(rd->cache_filename, rd, rd->memsize);
471             }
472         }
473
474         rrdset_unlock(st);
475     }
476
477     rrdhost_unlock(host);
478 }
479
480 void rrdhost_save_all(void) {
481     info("Saving database...");
482
483     rrd_rdlock();
484
485     RRDHOST *host;
486     rrdhost_foreach_read(host)
487         rrdhost_save(host);
488
489     rrd_unlock();
490 }