handle usleep() on systems that do not accept more than 999999 usec; implement altern...

[netdata.git] / src / apps_plugin.c
diff --git a/src/apps_plugin.c b/src/apps_plugin.c

index 64fef1805ae827937e4e9bf11e4b0f9b611b36db..ba497b7b9ce17949c5b3dae30cfc006b00b11e8f 100644 (file)
--- a/src/apps_plugin.c
+++ b/src/apps_plugin.c
@@ -1,7 +1,3 @@
-// TODO
-//
-// 1. disable RESET_OR_OVERFLOW check in charts
-
  #ifdef HAVE_CONFIG_H
  #include <config.h>
  #endif
@@ -309,10 +305,12 @@ struct target *get_apps_groups_target(const char *id, struct target *target)
         }
         uint32_t hash = simple_hash(id);
  
-       struct target *w;
+       struct target *w, *last = apps_groups_root_target;
         for(w = apps_groups_root_target ; w ; w = w->next) {
                 if(w->idhash == hash && strncmp(nid, w->id, MAX_NAME) == 0)
                         return w;
+
+               last = w;
         }
  
         w = calloc(sizeof(struct target), 1);
@@ -344,8 +342,9 @@ struct target *get_apps_groups_target(const char *id, struct target *target)
         w->debug = tdebug;
         w->target = target;
  
-       w->next = apps_groups_root_target;
-       apps_groups_root_target = w;
+       // append it, to maintain the order in apps_groups.conf
+       if(last) last->next = w;
+       else apps_groups_root_target = w;
  
         if(unlikely(debug))
                 fprintf(stderr, "apps.plugin: ADDING TARGET ID '%s', process name '%s' (%s), aggregated on target '%s', options: %s %s\n"
@@ -1152,32 +1151,34 @@ int collect_data_for_all_processes_from_proc(void)
         all_pids_count = 0;
         for(p = root_of_pids; p ; p = p->next) {
                 all_pids_count++;
-               p->parent = NULL;
-               p->updated = 0;
-               p->children_count = 0;
-               p->merged = 0;
-               p->new_entry = 0;
-
-        p->last_minflt = p->minflt;
-        p->last_majflt = p->majflt;
-        p->last_utime  = p->utime;
-        p->last_stime  = p->stime;
-
-        p->last_cminflt = p->cminflt;
-        p->last_cmajflt = p->cmajflt;
-        p->last_cutime  = p->cutime;
-        p->last_cstime  = p->cstime;
+
+               p->parent           = NULL;
+
+               p->updated          = 0;
+               p->children_count   = 0;
+               p->merged           = 0;
+               p->new_entry        = 0;
+
+        p->last_minflt      = p->minflt;
+        p->last_majflt      = p->majflt;
+        p->last_utime       = p->utime;
+        p->last_stime       = p->stime;
+
+        p->last_cminflt     = p->cminflt;
+        p->last_cmajflt     = p->cmajflt;
+        p->last_cutime      = p->cutime;
+        p->last_cstime      = p->cstime;
  
          p->last_fix_cminflt = p->fix_cminflt;
          p->last_fix_cmajflt = p->fix_cmajflt;
          p->last_fix_cutime  = p->fix_cutime;
          p->last_fix_cstime  = p->fix_cstime;
  
-        p->last_io_logical_bytes_read  = p->io_logical_bytes_read;
+        p->last_io_logical_bytes_read     = p->io_logical_bytes_read;
          p->last_io_logical_bytes_written  = p->io_logical_bytes_written;
-        p->last_io_read_calls  = p->io_read_calls;
-        p->last_io_write_calls  = p->io_write_calls;
-        p->last_io_storage_bytes_read  = p->io_storage_bytes_read;
+        p->last_io_read_calls             = p->io_read_calls;
+        p->last_io_write_calls            = p->io_write_calls;
+        p->last_io_storage_bytes_read     = p->io_storage_bytes_read;
          p->last_io_storage_bytes_written  = p->io_storage_bytes_written;
          p->last_io_cancelled_write_bytes  = p->io_cancelled_write_bytes;
         }
@@ -1214,21 +1215,11 @@ int collect_data_for_all_processes_from_proc(void)
                         p->ppid = 0;
                 }
  
-               // --------------------------------------------------------------------
-               // /proc/<pid>/cmdline
-
-               if(proc_pid_cmdline_is_needed) {
-                       if(unlikely(read_proc_pid_cmdline(p))) {
-                                       error("Cannot process %s/proc/%d/cmdline", host_prefix, pid);
-                       }
-               }
-
                 // --------------------------------------------------------------------
                 // /proc/<pid>/statm
  
                 if(unlikely(read_proc_pid_statm(p))) {
-                               error("Cannot process %s/proc/%d/statm", host_prefix, pid);
-
+                       error("Cannot process %s/proc/%d/statm", host_prefix, pid);
                         // there is no reason to proceed if we cannot get its memory status
                         continue;
                 }
@@ -1258,11 +1249,18 @@ int collect_data_for_all_processes_from_proc(void)
                 // check if it is target
                 // we do this only once, the first time this pid is loaded
                 if(unlikely(p->new_entry)) {
+                       // /proc/<pid>/cmdline
+                       if(proc_pid_cmdline_is_needed) {
+                               if(unlikely(read_proc_pid_cmdline(p))) {
+                                               error("Cannot process %s/proc/%d/cmdline", host_prefix, pid);
+                               }
+                       }
+
                         if(unlikely(debug))
-                               fprintf(stderr, "apps.plugin: \tJust added %s\n", p->comm);
+                               fprintf(stderr, "apps.plugin: \tJust added %d (%s)\n", pid, p->comm);
  
                         uint32_t hash = simple_hash(p->comm);
-                       size_t pclen = strlen(p->comm);
+                       size_t pclen  = strlen(p->comm);
  
                         struct target *w;
                         for(w = apps_groups_root_target; w ; w = w->next) {
@@ -1283,6 +1281,8 @@ int collect_data_for_all_processes_from_proc(void)
  
                                         if(debug || (p->target && p->target->debug))
                                                 fprintf(stderr, "apps.plugin: \t\t%s linked to target %s\n", p->comm, p->target->name);
+
+                                       break;
                                 }
                         }
                 }
@@ -1323,6 +1323,7 @@ int collect_data_for_all_processes_from_proc(void)
  // check: update_apps_groups_statistics()
  
  void link_all_processes_to_their_parents(void) {
+       struct pid_stat *init = all_pids[1];
         struct pid_stat *p = NULL;
  
         // link all children to their parents
@@ -1367,15 +1368,62 @@ void link_all_processes_to_their_parents(void) {
                                 if(likely(pp)) {
                                         // this is an exited child with a parent
                                         // remove the known time from the parent's data
-                                       pp->fix_cminflt += p->minflt + p->cminflt + p->fix_cminflt;
-                                       pp->fix_cmajflt += p->majflt + p->cmajflt + p->fix_cmajflt;
-                                       pp->fix_cutime  += p->utime  + p->cutime  + p->fix_cutime;
-                                       pp->fix_cstime  += p->stime  + p->cstime  + p->fix_cstime;
-
-                                       if(unlikely(pp->cminflt < pp->fix_cminflt)) pp->fix_cminflt = pp->cminflt;
-                                       if(unlikely(pp->cmajflt < pp->fix_cmajflt)) pp->fix_cmajflt = pp->cmajflt;
-                                       if(unlikely(pp->cutime  < pp->fix_cutime))  pp->fix_cutime  = pp->cutime;
-                                       if(unlikely(pp->cstime  < pp->fix_cstime))  pp->fix_cstime  = pp->cstime;
+                                       pp->fix_cminflt += p->last_minflt + p->last_cminflt + p->last_fix_cminflt;
+                                       pp->fix_cmajflt += p->last_majflt + p->last_cmajflt + p->last_fix_cmajflt;
+                                       pp->fix_cutime  += p->last_utime  + p->last_cutime  + p->last_fix_cutime;
+                                       pp->fix_cstime  += p->last_stime  + p->last_cstime  + p->last_fix_cstime;
+
+                                       // The known exited children (the ones we track) may have
+                                       // contributed more than the value accumulated into the process
+                                       // by the kernel.
+                                       // This can happen if the parent process has not waited-for
+                                       // its children (check: man 2 times).
+                                       // In this case, the kernel adds these resources to init (pid 1).
+                                       //
+                                       // The following code attempts to fix this.
+                                       // Without this code, the charts will have random spikes
+                                       // for example, when an SSH session ends (sshd forks a child
+                                       // to serve the session, but when this session ends, sshd
+                                       // does not wait-for its child, thus all the resources of the
+                                       // ssh session get added to init, resulting in a huge spike on
+                                       // the charts).
+
+                                       if(unlikely(pp->cminflt < pp->fix_cminflt)) {
+                                               if(likely(init && pp != init)) {
+                                                       unsigned long long have = pp->fix_cminflt - pp->cminflt;
+                                                       unsigned long long max = init->cminflt - init->fix_cminflt;
+                                                       if(have > max) have = max;
+                                                       init->fix_cminflt += have;
+                                               }
+                                               pp->fix_cminflt = pp->cminflt;
+                                       }
+                                       if(unlikely(pp->cmajflt < pp->fix_cmajflt)) {
+                                               if(likely(init && pp != init)) {
+                                                       unsigned long long have = pp->fix_cmajflt - pp->cmajflt;
+                                                       unsigned long long max = init->cmajflt - init->fix_cmajflt;
+                                                       if(have > max) have = max;
+                                                       init->fix_cmajflt += have;
+                                               }
+                                               pp->fix_cmajflt = pp->cmajflt;
+                                       }
+                                       if(unlikely(pp->cutime < pp->fix_cutime)) {
+                                               if(likely(init && pp != init)) {
+                                                       unsigned long long have = pp->fix_cutime - pp->cutime;
+                                                       unsigned long long max = init->cutime - init->fix_cutime;
+                                                       if(have > max) have = max;
+                                                       init->fix_cutime += have;
+                                               }
+                                               pp->fix_cutime  = pp->cutime;
+                                       }
+                                       if(unlikely(pp->cstime < pp->fix_cstime)) {
+                                               if(likely(init && pp != init)) {
+                                                       unsigned long long have = pp->fix_cstime - pp->cstime;
+                                                       unsigned long long max = init->cstime - init->fix_cstime;
+                                                       if(have > max) have = max;
+                                                       init->fix_cstime += have;
+                                               }
+                                               pp->fix_cstime = pp->cstime;
+                                       }
  
                                         if(unlikely(debug))
                                                 fprintf(stderr, "apps.plugin: \tupdating child metrics of %d (%s, %s) to its parent %d (%s, %s). Parent has now: utime=%llu, stime=%llu, minflt=%llu, majflt=%llu, cutime=%llu, cstime=%llu, cminflt=%llu, cmajflt=%llu, fix_cutime=%llu, fix_cstime=%llu, fix_cminflt=%llu, fix_cmajflt=%llu\n", p->pid, p->comm, p->updated?"running":"exited", pp->pid, pp->comm, pp->updated?"running":"exited", pp->utime, pp->stime, pp->minflt, pp->majflt, pp->cutime, pp->cstime, pp->cminflt, pp->cmajflt, pp->fix_cutime, pp->fix_cstime, pp->fix_cminflt, pp->fix_cmajflt);
@@ -2073,14 +2121,14 @@ void send_charts_updates_to_netdata(struct target *root, const char *type, const
         for (w = root; w ; w = w->next) {
                 if(w->target || (!w->processes && !w->exposed)) continue;
  
-               fprintf(stdout, "DIMENSION %s '' incremental 100 %d noreset\n", w->name, hz);
+               fprintf(stdout, "DIMENSION %s '' incremental 100 %u noreset\n", w->name, hz);
         }
  
         fprintf(stdout, "CHART %s.cpu_system '' '%s CPU System Time (%d%% = %d core%s)' 'cpu time %%' cpu %s.cpu_system stacked 20021 %d\n", type, title, (processors * 100), processors, (processors>1)?"s":"", type, update_every);
         for (w = root; w ; w = w->next) {
                 if(w->target || (!w->processes && !w->exposed)) continue;
  
-               fprintf(stdout, "DIMENSION %s '' incremental 100 %d noreset\n", w->name, hz);
+               fprintf(stdout, "DIMENSION %s '' incremental 100 %u noreset\n", w->name, hz);
         }
  
         fprintf(stdout, "CHART %s.major_faults '' '%s Major Page Faults (swap read)' 'page faults/s' swap %s.major_faults stacked 20010 %d\n", type, title, type, update_every);
@@ -2272,7 +2320,7 @@ int main(int argc, char **argv)
  #ifndef PROFILING_MODE
                 // delay until it is our time to run
                 while((sunow = timems()) < sunext)
-                       usleep((useconds_t)(sunext - sunow));
+                       usecsleep(sunext - sunow);
  
                 // find the next time we need to run
                 while(timems() > sunext)