-// TODO
-//
-// 1. disable RESET_OR_OVERFLOW check in charts
-
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
}
uint32_t hash = simple_hash(id);
- struct target *w;
+ struct target *w, *last = apps_groups_root_target;
for(w = apps_groups_root_target ; w ; w = w->next) {
if(w->idhash == hash && strncmp(nid, w->id, MAX_NAME) == 0)
return w;
+
+ last = w;
}
w = calloc(sizeof(struct target), 1);
w->debug = tdebug;
w->target = target;
- w->next = apps_groups_root_target;
- apps_groups_root_target = w;
+ // append it, to maintain the order in apps_groups.conf
+ if(last) last->next = w;
+ else apps_groups_root_target = w;
if(unlikely(debug))
fprintf(stderr, "apps.plugin: ADDING TARGET ID '%s', process name '%s' (%s), aggregated on target '%s', options: %s %s\n"
all_pids_count = 0;
for(p = root_of_pids; p ; p = p->next) {
all_pids_count++;
- p->parent = NULL;
- p->updated = 0;
- p->children_count = 0;
- p->merged = 0;
- p->new_entry = 0;
-
- p->last_minflt = p->minflt;
- p->last_majflt = p->majflt;
- p->last_utime = p->utime;
- p->last_stime = p->stime;
-
- p->last_cminflt = p->cminflt;
- p->last_cmajflt = p->cmajflt;
- p->last_cutime = p->cutime;
- p->last_cstime = p->cstime;
+
+ p->parent = NULL;
+
+ p->updated = 0;
+ p->children_count = 0;
+ p->merged = 0;
+ p->new_entry = 0;
+
+ p->last_minflt = p->minflt;
+ p->last_majflt = p->majflt;
+ p->last_utime = p->utime;
+ p->last_stime = p->stime;
+
+ p->last_cminflt = p->cminflt;
+ p->last_cmajflt = p->cmajflt;
+ p->last_cutime = p->cutime;
+ p->last_cstime = p->cstime;
p->last_fix_cminflt = p->fix_cminflt;
p->last_fix_cmajflt = p->fix_cmajflt;
p->last_fix_cutime = p->fix_cutime;
p->last_fix_cstime = p->fix_cstime;
- p->last_io_logical_bytes_read = p->io_logical_bytes_read;
+ p->last_io_logical_bytes_read = p->io_logical_bytes_read;
p->last_io_logical_bytes_written = p->io_logical_bytes_written;
- p->last_io_read_calls = p->io_read_calls;
- p->last_io_write_calls = p->io_write_calls;
- p->last_io_storage_bytes_read = p->io_storage_bytes_read;
+ p->last_io_read_calls = p->io_read_calls;
+ p->last_io_write_calls = p->io_write_calls;
+ p->last_io_storage_bytes_read = p->io_storage_bytes_read;
p->last_io_storage_bytes_written = p->io_storage_bytes_written;
p->last_io_cancelled_write_bytes = p->io_cancelled_write_bytes;
}
p->ppid = 0;
}
- // --------------------------------------------------------------------
- // /proc/<pid>/cmdline
-
- if(proc_pid_cmdline_is_needed) {
- if(unlikely(read_proc_pid_cmdline(p))) {
- error("Cannot process %s/proc/%d/cmdline", host_prefix, pid);
- }
- }
-
// --------------------------------------------------------------------
// /proc/<pid>/statm
if(unlikely(read_proc_pid_statm(p))) {
- error("Cannot process %s/proc/%d/statm", host_prefix, pid);
-
+ error("Cannot process %s/proc/%d/statm", host_prefix, pid);
// there is no reason to proceed if we cannot get its memory status
continue;
}
// check if it is target
// we do this only once, the first time this pid is loaded
if(unlikely(p->new_entry)) {
+ // /proc/<pid>/cmdline
+ if(proc_pid_cmdline_is_needed) {
+ if(unlikely(read_proc_pid_cmdline(p))) {
+ error("Cannot process %s/proc/%d/cmdline", host_prefix, pid);
+ }
+ }
+
if(unlikely(debug))
- fprintf(stderr, "apps.plugin: \tJust added %s\n", p->comm);
+ fprintf(stderr, "apps.plugin: \tJust added %d (%s)\n", pid, p->comm);
uint32_t hash = simple_hash(p->comm);
- size_t pclen = strlen(p->comm);
+ size_t pclen = strlen(p->comm);
struct target *w;
for(w = apps_groups_root_target; w ; w = w->next) {
if(debug || (p->target && p->target->debug))
fprintf(stderr, "apps.plugin: \t\t%s linked to target %s\n", p->comm, p->target->name);
+
+ break;
}
}
}
// check: update_apps_groups_statistics()
void link_all_processes_to_their_parents(void) {
+ struct pid_stat *init = all_pids[1];
struct pid_stat *p = NULL;
// link all children to their parents
if(likely(pp)) {
// this is an exited child with a parent
// remove the known time from the parent's data
- pp->fix_cminflt += p->minflt + p->cminflt + p->fix_cminflt;
- pp->fix_cmajflt += p->majflt + p->cmajflt + p->fix_cmajflt;
- pp->fix_cutime += p->utime + p->cutime + p->fix_cutime;
- pp->fix_cstime += p->stime + p->cstime + p->fix_cstime;
-
- if(unlikely(pp->cminflt < pp->fix_cminflt)) pp->fix_cminflt = pp->cminflt;
- if(unlikely(pp->cmajflt < pp->fix_cmajflt)) pp->fix_cmajflt = pp->cmajflt;
- if(unlikely(pp->cutime < pp->fix_cutime)) pp->fix_cutime = pp->cutime;
- if(unlikely(pp->cstime < pp->fix_cstime)) pp->fix_cstime = pp->cstime;
+ pp->fix_cminflt += p->last_minflt + p->last_cminflt + p->last_fix_cminflt;
+ pp->fix_cmajflt += p->last_majflt + p->last_cmajflt + p->last_fix_cmajflt;
+ pp->fix_cutime += p->last_utime + p->last_cutime + p->last_fix_cutime;
+ pp->fix_cstime += p->last_stime + p->last_cstime + p->last_fix_cstime;
+
+ // The known exited children (the ones we track) may have
+ // contributed more than the value accumulated into the process
+ // by the kernel.
+ // This can happen if the parent process has not waited for
+ // its children (check: man 2 times).
+ // In this case, the kernel adds these resources to init (pid 1).
+ //
+ // The following code attempts to fix this.
+ // Without this code, the charts will have random spikes.
+ // For example, when an SSH session ends: sshd forks a child
+ // to serve the session, but when this session ends, sshd
+ // does not wait for its child, thus all the resources of the
+ // ssh session get added to init, resulting in a huge spike on
+ // the charts.
+
+ if(unlikely(pp->cminflt < pp->fix_cminflt)) {
+ if(likely(init && pp != init)) {
+ unsigned long long have = pp->fix_cminflt - pp->cminflt;
+ unsigned long long max = init->cminflt - init->fix_cminflt;
+ if(have > max) have = max;
+ init->fix_cminflt += have;
+ }
+ pp->fix_cminflt = pp->cminflt;
+ }
+ if(unlikely(pp->cmajflt < pp->fix_cmajflt)) {
+ if(likely(init && pp != init)) {
+ unsigned long long have = pp->fix_cmajflt - pp->cmajflt;
+ unsigned long long max = init->cmajflt - init->fix_cmajflt;
+ if(have > max) have = max;
+ init->fix_cmajflt += have;
+ }
+ pp->fix_cmajflt = pp->cmajflt;
+ }
+ if(unlikely(pp->cutime < pp->fix_cutime)) {
+ if(likely(init && pp != init)) {
+ unsigned long long have = pp->fix_cutime - pp->cutime;
+ unsigned long long max = init->cutime - init->fix_cutime;
+ if(have > max) have = max;
+ init->fix_cutime += have;
+ }
+ pp->fix_cutime = pp->cutime;
+ }
+ if(unlikely(pp->cstime < pp->fix_cstime)) {
+ if(likely(init && pp != init)) {
+ unsigned long long have = pp->fix_cstime - pp->cstime;
+ unsigned long long max = init->cstime - init->fix_cstime;
+ if(have > max) have = max;
+ init->fix_cstime += have;
+ }
+ pp->fix_cstime = pp->cstime;
+ }
if(unlikely(debug))
fprintf(stderr, "apps.plugin: \tupdating child metrics of %d (%s, %s) to its parent %d (%s, %s). Parent has now: utime=%llu, stime=%llu, minflt=%llu, majflt=%llu, cutime=%llu, cstime=%llu, cminflt=%llu, cmajflt=%llu, fix_cutime=%llu, fix_cstime=%llu, fix_cminflt=%llu, fix_cmajflt=%llu\n", p->pid, p->comm, p->updated?"running":"exited", pp->pid, pp->comm, pp->updated?"running":"exited", pp->utime, pp->stime, pp->minflt, pp->majflt, pp->cutime, pp->cstime, pp->cminflt, pp->cmajflt, pp->fix_cutime, pp->fix_cstime, pp->fix_cminflt, pp->fix_cmajflt);
for (w = root; w ; w = w->next) {
if(w->target || (!w->processes && !w->exposed)) continue;
- fprintf(stdout, "DIMENSION %s '' incremental 100 %d noreset\n", w->name, hz);
+ fprintf(stdout, "DIMENSION %s '' incremental 100 %u noreset\n", w->name, hz);
}
fprintf(stdout, "CHART %s.cpu_system '' '%s CPU System Time (%d%% = %d core%s)' 'cpu time %%' cpu %s.cpu_system stacked 20021 %d\n", type, title, (processors * 100), processors, (processors>1)?"s":"", type, update_every);
for (w = root; w ; w = w->next) {
if(w->target || (!w->processes && !w->exposed)) continue;
- fprintf(stdout, "DIMENSION %s '' incremental 100 %d noreset\n", w->name, hz);
+ fprintf(stdout, "DIMENSION %s '' incremental 100 %u noreset\n", w->name, hz);
}
fprintf(stdout, "CHART %s.major_faults '' '%s Major Page Faults (swap read)' 'page faults/s' swap %s.major_faults stacked 20010 %d\n", type, title, type, update_every);
#ifndef PROFILING_MODE
// delay until it is our time to run
while((sunow = timems()) < sunext)
- usleep((useconds_t)(sunext - sunow));
+ usecsleep(sunext - sunow);
// find the next time we need to run
while(timems() > sunext)