int debug = 0;
int update_every = 1;
+unsigned long long global_iterations_counter = 1;
unsigned long long file_counter = 0;
int proc_pid_cmdline_is_needed = 0;
int include_exited_childs = 1;
char *host_prefix = "";
char *config_dir = CONFIG_DIR;
+pid_t *all_pids_sortlist = NULL;
// ----------------------------------------------------------------------------
unsigned long long num_threads;
unsigned long long rss;
- long long fix_minflt;
- long long fix_cminflt;
- long long fix_majflt;
- long long fix_cmajflt;
- long long fix_utime;
- long long fix_stime;
- long long fix_cutime;
- long long fix_cstime;
-
unsigned long long statm_size;
unsigned long long statm_resident;
unsigned long long statm_share;
unsigned long long io_storage_bytes_written;
unsigned long long io_cancelled_write_bytes;
- unsigned long long fix_io_logical_bytes_read;
- unsigned long long fix_io_logical_bytes_written;
- unsigned long long fix_io_read_calls;
- unsigned long long fix_io_write_calls;
- unsigned long long fix_io_storage_bytes_read;
- unsigned long long fix_io_storage_bytes_written;
- unsigned long long fix_io_cancelled_write_bytes;
-
int *fds;
unsigned long long openfiles;
unsigned long long openpipes;
// int32_t tty_nr;
// int32_t tpgid;
// uint64_t flags;
+
+ // these are raw values collected
+ unsigned long long minflt_raw;
+ unsigned long long cminflt_raw;
+ unsigned long long majflt_raw;
+ unsigned long long cmajflt_raw;
+ unsigned long long utime_raw;
+ unsigned long long stime_raw;
+ unsigned long long cutime_raw;
+ unsigned long long cstime_raw;
+
+ // these are rates
unsigned long long minflt;
unsigned long long cminflt;
unsigned long long majflt;
unsigned long long stime;
unsigned long long cutime;
unsigned long long cstime;
+
// int64_t priority;
// int64_t nice;
int32_t num_threads;
unsigned long long statm_data;
unsigned long long statm_dirty;
+ unsigned long long io_logical_bytes_read_raw;
+ unsigned long long io_logical_bytes_written_raw;
+ unsigned long long io_read_calls_raw;
+ unsigned long long io_write_calls_raw;
+ unsigned long long io_storage_bytes_read_raw;
+ unsigned long long io_storage_bytes_written_raw;
+ unsigned long long io_cancelled_write_bytes_raw;
+
unsigned long long io_logical_bytes_read;
unsigned long long io_logical_bytes_written;
unsigned long long io_read_calls;
unsigned long long io_storage_bytes_written;
unsigned long long io_cancelled_write_bytes;
- // we need the last values
- // for all incremental counters
- // so that when a process switches users/groups
- // we will subtract these values from the old
- // target
- unsigned long long last_minflt;
- unsigned long long last_majflt;
- unsigned long long last_utime;
- unsigned long long last_stime;
-
- unsigned long long last_cminflt;
- unsigned long long last_cmajflt;
- unsigned long long last_cutime;
- unsigned long long last_cstime;
-
- unsigned long long last_fix_cminflt;
- unsigned long long last_fix_cmajflt;
- unsigned long long last_fix_cutime;
- unsigned long long last_fix_cstime;
-
- unsigned long long last_io_logical_bytes_read;
- unsigned long long last_io_logical_bytes_written;
- unsigned long long last_io_read_calls;
- unsigned long long last_io_write_calls;
- unsigned long long last_io_storage_bytes_read;
- unsigned long long last_io_storage_bytes_written;
- unsigned long long last_io_cancelled_write_bytes;
-
- unsigned long long fix_cminflt;
- unsigned long long fix_cmajflt;
- unsigned long long fix_cutime;
- unsigned long long fix_cstime;
-
int *fds; // array of fds it uses
int fds_size; // the size of the fds array
int children_count; // number of processes directly referencing this
- int updated; // 1 when update
+ int keep; // 1 when we need to keep this process in memory even after it exited
+ int keeploops; // increases by 1 every time keep is 1 and updated 0
+ int updated; // 1 when the process is currently running
int merged; // 1 when it has been merged to its parent
- int new_entry;
+ int new_entry; // 1 when this is a new process, just saw for the first time
+ int read; // 1 when we have already read this process for this iteration
+ int sortlist; // higher numbers = top on the process tree
+ // each process gets a unique number
struct target *target; // app_groups.conf targets
struct target *user_target; // uid based targets
struct target *group_target; // gid based targets
+ unsigned long long stat_collected_usec;
+ unsigned long long last_stat_collected_usec;
+
+ unsigned long long io_collected_usec;
+ unsigned long long last_io_collected_usec;
+
+ char *stat_filename;
+ char *statm_filename;
+ char *io_filename;
+ char *cmdline_filename;
+
struct pid_stat *parent;
struct pid_stat *prev;
struct pid_stat *next;
-
} *root_of_pids = NULL, **all_pids;
long all_pids_count = 0;
-struct pid_stat *get_pid_entry(pid_t pid)
-{
+struct pid_stat *get_pid_entry(pid_t pid) {
if(all_pids[pid]) {
all_pids[pid]->new_entry = 0;
return all_pids[pid];
all_pids[pid]->pid = pid;
all_pids[pid]->new_entry = 1;
+ all_pids_count++;
+
return all_pids[pid];
}
-void del_pid_entry(pid_t pid)
-{
- if(!all_pids[pid]) return;
+void del_pid_entry(pid_t pid) {
+ if(!all_pids[pid]) {
+ error("attempted to free pid %d that is not allocated.", pid);
+ return;
+ }
if(unlikely(debug))
fprintf(stderr, "apps.plugin: process %d %s exited, deleting it.\n", pid, all_pids[pid]->comm);
if(all_pids[pid]->prev) all_pids[pid]->prev->next = all_pids[pid]->next;
if(all_pids[pid]->fds) free(all_pids[pid]->fds);
+ if(all_pids[pid]->stat_filename) free(all_pids[pid]->stat_filename);
+ if(all_pids[pid]->statm_filename) free(all_pids[pid]->statm_filename);
+ if(all_pids[pid]->io_filename) free(all_pids[pid]->io_filename);
+ if(all_pids[pid]->cmdline_filename) free(all_pids[pid]->cmdline_filename);
free(all_pids[pid]);
+
all_pids[pid] = NULL;
+ all_pids_count--;
}
// update pids from proc
int read_proc_pid_cmdline(struct pid_stat *p) {
- char filename[FILENAME_MAX + 1];
- snprintfz(filename, FILENAME_MAX, "%s/proc/%d/cmdline", host_prefix, p->pid);
+
+ if(unlikely(!p->cmdline_filename)) {
+ char filename[FILENAME_MAX + 1];
+ snprintfz(filename, FILENAME_MAX, "%s/proc/%d/cmdline", host_prefix, p->pid);
+ if(!(p->cmdline_filename = strdup(filename)))
+ fatal("Cannot allocate memory for filename '%s'", filename);
+ }
- int fd = open(filename, O_RDONLY, 0666);
- if(unlikely(fd == -1)) return 1;
+ int fd = open(p->cmdline_filename, O_RDONLY, 0666);
+ if(unlikely(fd == -1)) goto cleanup;
int i, bytes = read(fd, p->cmdline, MAX_CMDLINE);
close(fd);
- if(bytes <= 0) {
- // copy the command to the command line
- strncpyz(p->cmdline, p->comm, MAX_CMDLINE);
- return 0;
- }
+ if(unlikely(bytes <= 0)) goto cleanup;
p->cmdline[bytes] = '\0';
for(i = 0; i < bytes ; i++)
- if(!p->cmdline[i]) p->cmdline[i] = ' ';
+ if(unlikely(!p->cmdline[i])) p->cmdline[i] = ' ';
if(unlikely(debug))
- fprintf(stderr, "Read file '%s' contents: %s\n", filename, p->cmdline);
+ fprintf(stderr, "Read file '%s' contents: %s\n", p->cmdline_filename, p->cmdline);
+
+ return 0;
+cleanup:
+ // copy the command to the command line
+ strncpyz(p->cmdline, p->comm, MAX_CMDLINE);
return 0;
}
int read_proc_pid_ownership(struct pid_stat *p) {
- char filename[FILENAME_MAX + 1];
-
- snprintfz(filename, FILENAME_MAX, "%s/proc/%d", host_prefix, p->pid);
+ if(unlikely(!p->stat_filename)) {
+ error("pid %d does not have a stat_filename", p->pid);
+ return 1;
+ }
// ----------------------------------------
// read uid and gid
struct stat st;
- if(stat(filename, &st) != 0)
+ if(stat(p->stat_filename, &st) != 0) {
+ error("Cannot stat file '%s'", p->stat_filename);
return 1;
+ }
p->uid = st.st_uid;
p->gid = st.st_gid;
int read_proc_pid_stat(struct pid_stat *p) {
static procfile *ff = NULL;
- char filename[FILENAME_MAX + 1];
-
- snprintfz(filename, FILENAME_MAX, "%s/proc/%d/stat", host_prefix, p->pid);
-
- // ----------------------------------------
+ if(unlikely(!p->stat_filename)) {
+ char filename[FILENAME_MAX + 1];
+ snprintfz(filename, FILENAME_MAX, "%s/proc/%d/stat", host_prefix, p->pid);
+ if(!(p->stat_filename = strdup(filename)))
+ fatal("Cannot allocate memory for filename '%s'", filename);
+ }
int set_quotes = (!ff)?1:0;
- ff = procfile_reopen(ff, filename, NULL, PROCFILE_FLAG_NO_ERROR_ON_FILE_IO);
- if(!ff) return 1;
+ ff = procfile_reopen(ff, p->stat_filename, NULL, PROCFILE_FLAG_NO_ERROR_ON_FILE_IO);
+ if(unlikely(!ff)) goto cleanup;
// if(set_quotes) procfile_set_quotes(ff, "()");
if(set_quotes) procfile_set_open_close(ff, "(", ")");
ff = procfile_readall(ff);
- if(!ff) {
- // procfile_close(ff);
- return 1;
- }
+ if(unlikely(!ff)) goto cleanup;
+ p->last_stat_collected_usec = p->stat_collected_usec;
+ p->stat_collected_usec = timems();
file_counter++;
// parse the process name
// p->tty_nr = atol(procfile_lineword(ff, 0, 6+i));
// p->tpgid = atol(procfile_lineword(ff, 0, 7+i));
// p->flags = strtoull(procfile_lineword(ff, 0, 8+i), NULL, 10);
- p->minflt = strtoull(procfile_lineword(ff, 0, 9+i), NULL, 10);
- p->cminflt = strtoull(procfile_lineword(ff, 0, 10+i), NULL, 10);
- p->majflt = strtoull(procfile_lineword(ff, 0, 11+i), NULL, 10);
- p->cmajflt = strtoull(procfile_lineword(ff, 0, 12+i), NULL, 10);
- p->utime = strtoull(procfile_lineword(ff, 0, 13+i), NULL, 10);
- p->stime = strtoull(procfile_lineword(ff, 0, 14+i), NULL, 10);
- p->cutime = strtoull(procfile_lineword(ff, 0, 15+i), NULL, 10);
- p->cstime = strtoull(procfile_lineword(ff, 0, 16+i), NULL, 10);
+
+ unsigned long long last;
+
+ last = p->minflt_raw;
+ p->minflt_raw = strtoull(procfile_lineword(ff, 0, 9+i), NULL, 10);
+ p->minflt = (p->minflt_raw - last) * (update_every * 1000000 * 100) / (p->stat_collected_usec - p->last_stat_collected_usec);
+
+ last = p->cminflt_raw;
+ p->cminflt_raw = strtoull(procfile_lineword(ff, 0, 10+i), NULL, 10);
+ p->cminflt = (p->cminflt_raw - last) * (update_every * 1000000 * 100) / (p->stat_collected_usec - p->last_stat_collected_usec);
+
+ last = p->majflt_raw;
+ p->majflt_raw = strtoull(procfile_lineword(ff, 0, 11+i), NULL, 10);
+ p->majflt = (p->majflt_raw - last) * (update_every * 1000000 * 100) / (p->stat_collected_usec - p->last_stat_collected_usec);
+
+ last = p->cmajflt_raw;
+ p->cmajflt_raw = strtoull(procfile_lineword(ff, 0, 12+i), NULL, 10);
+ p->cmajflt = (p->cmajflt_raw - last) * (update_every * 1000000 * 100) / (p->stat_collected_usec - p->last_stat_collected_usec);
+
+ last = p->utime_raw;
+ p->utime_raw = strtoull(procfile_lineword(ff, 0, 13+i), NULL, 10);
+ p->utime = (p->utime_raw - last) * (update_every * 1000000 * 100) / (p->stat_collected_usec - p->last_stat_collected_usec);
+
+ last = p->stime_raw;
+ p->stime_raw = strtoull(procfile_lineword(ff, 0, 14+i), NULL, 10);
+ p->stime = (p->stime_raw - last) * (update_every * 1000000 * 100) / (p->stat_collected_usec - p->last_stat_collected_usec);
+
+ last = p->cutime_raw;
+ p->cutime_raw = strtoull(procfile_lineword(ff, 0, 15+i), NULL, 10);
+ p->cutime = (p->cutime_raw - last) * (update_every * 1000000 * 100) / (p->stat_collected_usec - p->last_stat_collected_usec);
+
+ last = p->cstime_raw;
+ p->cstime_raw = strtoull(procfile_lineword(ff, 0, 16+i), NULL, 10);
+ p->cstime = (p->cstime_raw - last) * (update_every * 1000000 * 100) / (p->stat_collected_usec - p->last_stat_collected_usec);
+
// p->priority = strtoull(procfile_lineword(ff, 0, 17+i), NULL, 10);
// p->nice = strtoull(procfile_lineword(ff, 0, 18+i), NULL, 10);
p->num_threads = (int32_t) atol(procfile_lineword(ff, 0, 19 + i));
// p->cguest_time = strtoull(procfile_lineword(ff, 0, 43), NULL, 10);
if(unlikely(debug || (p->target && p->target->debug)))
- fprintf(stderr, "apps.plugin: READ PROC/PID/STAT: %s/proc/%d/stat, process: '%s' VALUES: utime=%llu, stime=%llu, cutime=%llu, cstime=%llu, minflt=%llu, majflt=%llu, cminflt=%llu, cmajflt=%llu, threads=%d\n", host_prefix, p->pid, p->comm, p->utime, p->stime, p->cutime, p->cstime, p->minflt, p->majflt, p->cminflt, p->cmajflt, p->num_threads);
+ fprintf(stderr, "apps.plugin: READ PROC/PID/STAT: %s/proc/%d/stat, process: '%s' on target '%s' (dt=%llu) VALUES: utime=%llu, stime=%llu, cutime=%llu, cstime=%llu, minflt=%llu, majflt=%llu, cminflt=%llu, cmajflt=%llu, threads=%d\n", host_prefix, p->pid, p->comm, (p->target)?p->target->name:"UNSET", p->stat_collected_usec - p->last_stat_collected_usec, p->utime, p->stime, p->cutime, p->cstime, p->minflt, p->majflt, p->cminflt, p->cmajflt, p->num_threads);
+
+ if(unlikely(global_iterations_counter == 1)) {
+ p->minflt = 0;
+ p->cminflt = 0;
+ p->majflt = 0;
+ p->cmajflt = 0;
+ p->utime = 0;
+ p->stime = 0;
+ p->cutime = 0;
+ p->cstime = 0;
+ }
- // procfile_close(ff);
return 0;
+
+cleanup:
+ p->minflt = 0;
+ p->cminflt = 0;
+ p->majflt = 0;
+ p->cmajflt = 0;
+ p->utime = 0;
+ p->stime = 0;
+ p->cutime = 0;
+ p->cstime = 0;
+ p->num_threads = 0;
+ p->rss = 0;
+ return 1;
}
int read_proc_pid_statm(struct pid_stat *p) {
static procfile *ff = NULL;
- char filename[FILENAME_MAX + 1];
-
- snprintfz(filename, FILENAME_MAX, "%s/proc/%d/statm", host_prefix, p->pid);
+ if(unlikely(!p->statm_filename)) {
+ char filename[FILENAME_MAX + 1];
+ snprintfz(filename, FILENAME_MAX, "%s/proc/%d/statm", host_prefix, p->pid);
+ if(!(p->statm_filename = strdup(filename)))
+ fatal("Cannot allocate memory for filename '%s'", filename);
+ }
- ff = procfile_reopen(ff, filename, NULL, PROCFILE_FLAG_NO_ERROR_ON_FILE_IO);
- if(!ff) return 1;
+ ff = procfile_reopen(ff, p->statm_filename, NULL, PROCFILE_FLAG_NO_ERROR_ON_FILE_IO);
+ if(unlikely(!ff)) goto cleanup;
ff = procfile_readall(ff);
- if(!ff) {
- // procfile_close(ff);
- return 1;
- }
+ if(unlikely(!ff)) goto cleanup;
file_counter++;
p->statm_data = strtoull(procfile_lineword(ff, 0, 5), NULL, 10);
p->statm_dirty = strtoull(procfile_lineword(ff, 0, 6), NULL, 10);
- // procfile_close(ff);
return 0;
+
+cleanup:
+ p->statm_size = 0;
+ p->statm_resident = 0;
+ p->statm_share = 0;
+ p->statm_text = 0;
+ p->statm_lib = 0;
+ p->statm_data = 0;
+ p->statm_dirty = 0;
+ return 1;
}
int read_proc_pid_io(struct pid_stat *p) {
static procfile *ff = NULL;
- char filename[FILENAME_MAX + 1];
-
- snprintfz(filename, FILENAME_MAX, "%s/proc/%d/io", host_prefix, p->pid);
+ if(unlikely(!p->io_filename)) {
+ char filename[FILENAME_MAX + 1];
+ snprintfz(filename, FILENAME_MAX, "%s/proc/%d/io", host_prefix, p->pid);
+ if(!(p->io_filename = strdup(filename)))
+ fatal("Cannot allocate memory for filename '%s'", filename);
+ }
- ff = procfile_reopen(ff, filename, NULL, PROCFILE_FLAG_NO_ERROR_ON_FILE_IO);
- if(!ff) return 1;
+ // open the file
+ ff = procfile_reopen(ff, p->io_filename, NULL, PROCFILE_FLAG_NO_ERROR_ON_FILE_IO);
+ if(unlikely(!ff)) goto cleanup;
ff = procfile_readall(ff);
- if(!ff) {
- // procfile_close(ff);
- return 1;
- }
+ if(unlikely(!ff)) goto cleanup;
file_counter++;
- p->io_logical_bytes_read = strtoull(procfile_lineword(ff, 0, 1), NULL, 10);
- p->io_logical_bytes_written = strtoull(procfile_lineword(ff, 1, 1), NULL, 10);
- p->io_read_calls = strtoull(procfile_lineword(ff, 2, 1), NULL, 10);
- p->io_write_calls = strtoull(procfile_lineword(ff, 3, 1), NULL, 10);
- p->io_storage_bytes_read = strtoull(procfile_lineword(ff, 4, 1), NULL, 10);
- p->io_storage_bytes_written = strtoull(procfile_lineword(ff, 5, 1), NULL, 10);
- p->io_cancelled_write_bytes = strtoull(procfile_lineword(ff, 6, 1), NULL, 10);
+ p->last_io_collected_usec = p->io_collected_usec;
+ p->io_collected_usec = timems();
+
+ unsigned long long last;
+
+ last = p->io_logical_bytes_read_raw;
+ p->io_logical_bytes_read_raw = strtoull(procfile_lineword(ff, 0, 1), NULL, 10);
+ p->io_logical_bytes_read = (p->io_logical_bytes_read_raw - last) * (update_every * 1000000 * 100) / (p->io_collected_usec - p->last_io_collected_usec);
+
+ last = p->io_logical_bytes_written_raw;
+ p->io_logical_bytes_written_raw = strtoull(procfile_lineword(ff, 1, 1), NULL, 10);
+ p->io_logical_bytes_written = (p->io_logical_bytes_written_raw - last) * (update_every * 1000000 * 100) / (p->io_collected_usec - p->last_io_collected_usec);
+
+ last = p->io_read_calls_raw;
+ p->io_read_calls_raw = strtoull(procfile_lineword(ff, 2, 1), NULL, 10);
+ p->io_read_calls = (p->io_read_calls_raw - last) * (update_every * 1000000 * 100) / (p->io_collected_usec - p->last_io_collected_usec);
+
+ last = p->io_write_calls_raw;
+ p->io_write_calls_raw = strtoull(procfile_lineword(ff, 3, 1), NULL, 10);
+ p->io_write_calls = (p->io_write_calls_raw - last) * (update_every * 1000000 * 100) / (p->io_collected_usec - p->last_io_collected_usec);
+
+ last = p->io_storage_bytes_read_raw;
+ p->io_storage_bytes_read_raw = strtoull(procfile_lineword(ff, 4, 1), NULL, 10);
+ p->io_storage_bytes_read = (p->io_storage_bytes_read_raw - last) * (update_every * 1000000 * 100) / (p->io_collected_usec - p->last_io_collected_usec);
+
+ last = p->io_storage_bytes_written_raw;
+ p->io_storage_bytes_written_raw = strtoull(procfile_lineword(ff, 5, 1), NULL, 10);
+ p->io_storage_bytes_written = (p->io_storage_bytes_written_raw - last) * (update_every * 1000000 * 100) / (p->io_collected_usec - p->last_io_collected_usec);
+
+ last = p->io_cancelled_write_bytes_raw;
+ p->io_cancelled_write_bytes_raw = strtoull(procfile_lineword(ff, 6, 1), NULL, 10);
+ p->io_cancelled_write_bytes = (p->io_cancelled_write_bytes_raw - last) * (update_every * 1000000 * 100) / (p->io_collected_usec - p->last_io_collected_usec);
+
+ if(unlikely(global_iterations_counter == 1)) {
+ p->io_logical_bytes_read = 0;
+ p->io_logical_bytes_written = 0;
+ p->io_read_calls = 0;
+ p->io_write_calls = 0;
+ p->io_storage_bytes_read = 0;
+ p->io_storage_bytes_written = 0;
+ p->io_cancelled_write_bytes = 0;
+ }
- // procfile_close(ff);
return 0;
+
+cleanup:
+ p->io_logical_bytes_read = 0;
+ p->io_logical_bytes_written = 0;
+ p->io_read_calls = 0;
+ p->io_write_calls = 0;
+ p->io_storage_bytes_read = 0;
+ p->io_storage_bytes_written = 0;
+ p->io_cancelled_write_bytes = 0;
+ return 1;
}
// ----------------------------------------------------------------------------
+#ifdef NETDATA_INTERNAL_CHECKS
+void find_lost_child_debug(struct pid_stat *pe, struct pid_stat *ppe, unsigned long long lost, int type) {
+ int found = 0;
+ struct pid_stat *p = NULL, *pp = pe->parent;
+
+ log_date(stderr);
+ fprintf(stderr, "Searching for candidate of lost resources of process %d (%s, %s) which is aggregated on %d (%s, %s)\n", pe->pid, pe->comm, pe->updated?"running":"exited", ppe->pid, ppe->comm, ppe->updated?"running":"exited");
+ while(pp) {
+ fprintf(stderr, " >> parent %d (%s, %s)\n", pp->pid, pp->comm, pp->updated?"running":"exited");
+ pp = pp->parent;
+ }
+
+ for(p = root_of_pids; p ; p = p->next) {
+ if(p == pe) continue;
+
+ switch(type) {
+ case 1:
+ if(p->cminflt > lost) {
+ fprintf(stderr, " > process %d (%s) could use the lost exited child minflt %llu of process %d (%s)\n", p->pid, p->comm, lost, pe->pid, pe->comm);
+ found++;
+ }
+ break;
+
+ case 2:
+ if(p->cmajflt > lost) {
+ fprintf(stderr, " > process %d (%s) could use the lost exited child majflt %llu of process %d (%s)\n", p->pid, p->comm, lost, pe->pid, pe->comm);
+ found++;
+ }
+ break;
+
+ case 3:
+ if(p->cutime > lost) {
+ fprintf(stderr, " > process %d (%s) could use the lost exited child utime %llu of process %d (%s)\n", p->pid, p->comm, lost, pe->pid, pe->comm);
+ found++;
+ }
+ break;
+
+ case 4:
+ if(p->cstime > lost) {
+ fprintf(stderr, " > process %d (%s) could use the lost exited child stime %llu of process %d (%s)\n", p->pid, p->comm, lost, pe->pid, pe->comm);
+ found++;
+ }
+ break;
+ }
+ }
+
+ if(!found) {
+ switch(type) {
+ case 1:
+ fprintf(stderr, " > cannot find any process to use the lost exited child minflt %llu of process %d (%s)\n", lost, pe->pid, pe->comm);
+ break;
+
+ case 2:
+ fprintf(stderr, " > cannot find any process to use the lost exited child majflt %llu of process %d (%s)\n", lost, pe->pid, pe->comm);
+ break;
+
+ case 3:
+ fprintf(stderr, " > cannot find any process to use the lost exited child utime %llu of process %d (%s)\n", lost, pe->pid, pe->comm);
+ break;
+
+ case 4:
+ fprintf(stderr, " > cannot find any process to use the lost exited child stime %llu of process %d (%s)\n", lost, pe->pid, pe->comm);
+ break;
+ }
+ }
+}
+#endif /* NETDATA_INTERNAL_CHECKS */
+
+void remove_exited_child_from_parent(unsigned long long *field, unsigned long long *pfield, unsigned long long *ifield, struct pid_stat *pe, struct pid_stat *ppe, int type) {
+ if(pfield) {
+ if(*field > *pfield) {
+ *field -= *pfield;
+ *pfield = 0;
+ }
+ else {
+ *pfield -= *field;
+ *field = 0;
+ }
+ }
+
+ if(*field) {
+ if(ifield && ifield != pfield) {
+ if(*field > *ifield) {
+ *field -= *ifield;
+ *ifield = 0;
+ }
+ else {
+ *ifield -= *field;
+ *field = 0;
+ }
+ }
+ }
+
+ if(*field) {
+#ifdef NETDATA_INTERNAL_CHECKS
+ find_lost_child_debug(pe, ppe, *field, type);
+#endif
+ while(pe && !pe->updated) {
+ pe->keep = 1;
+ pe = pe->parent;
+ }
+ }
+}
+
+void process_exited_processes() {
+ struct pid_stat *init = all_pids[1];
+ struct pid_stat *p;
+
+ for(p = root_of_pids; p ; p = p->next) {
+ if(p->updated || !p->stat_collected_usec) continue;
+
+ struct pid_stat *pp = p->parent;
+
+ // find the first parent that is running
+ while(pp && !pp->updated)
+ pp = pp->parent;
+
+ unsigned long long rate;
+
+ rate = (p->utime_raw + p->cutime_raw) * (update_every * 1000000 * 100) / (p->stat_collected_usec - p->last_stat_collected_usec);
+ remove_exited_child_from_parent(&rate, (pp)?&pp->cutime:NULL, (init)?&init->cutime:NULL, p, pp, 3);
+ p->cutime_raw = 0;
+ p->utime_raw = rate * (p->stat_collected_usec - p->last_stat_collected_usec) / (update_every * 1000000 * 100);
+
+ rate = (p->stime_raw + p->cstime_raw) * (update_every * 1000000 * 100) / (p->stat_collected_usec - p->last_stat_collected_usec);
+ remove_exited_child_from_parent(&rate, (pp)?&pp->cstime:NULL, (init)?&init->cstime:NULL, p, pp, 4);
+ p->cstime_raw = 0;
+ p->stime_raw = rate * (p->stat_collected_usec - p->last_stat_collected_usec) / (update_every * 1000000 * 100);
+
+ rate = (p->minflt_raw + p->cminflt_raw) * (update_every * 1000000 * 100) / (p->stat_collected_usec - p->last_stat_collected_usec);
+ remove_exited_child_from_parent(&rate, (pp)?&pp->cminflt:NULL, (init)?&init->cminflt:NULL, p, pp, 1);
+ p->cminflt_raw = 0;
+ p->minflt_raw = rate * (p->stat_collected_usec - p->last_stat_collected_usec) / (update_every * 1000000 * 100);
+
+ rate = (p->majflt_raw + p->cmajflt_raw) * (update_every * 1000000 * 100) / (p->stat_collected_usec - p->last_stat_collected_usec);
+ remove_exited_child_from_parent(&rate, (pp)?&pp->cmajflt:NULL, (init)?&init->cmajflt:NULL, p, pp, 2);
+ p->cmajflt_raw = 0;
+ p->majflt_raw = rate * (p->stat_collected_usec - p->last_stat_collected_usec) / (update_every * 1000000 * 100);
+ }
+}
+
+void link_all_processes_to_their_parents(void) {
+ struct pid_stat *p = NULL;
+
+ // link all children to their parents
+ // and update children count on parents
+ for(p = root_of_pids; p ; p = p->next) {
+ // for each process found running
+
+ if(likely(p->ppid > 0 && all_pids[p->ppid])) {
+ // valid parent processes
+
+ struct pid_stat *pp;
+
+ p->parent = pp = all_pids[p->ppid];
+ p->parent->children_count++;
+
+ if(unlikely(debug || (p->target && p->target->debug)))
+ fprintf(stderr, "apps.plugin: \tchild %d (%s, %s) on target '%s' has parent %d (%s, %s). Parent: utime=%llu, stime=%llu, minflt=%llu, majflt=%llu, cutime=%llu, cstime=%llu, cminflt=%llu, cmajflt=%llu\n", p->pid, p->comm, p->updated?"running":"exited", (p->target)?p->target->name:"UNSET", pp->pid, pp->comm, pp->updated?"running":"exited", pp->utime, pp->stime, pp->minflt, pp->majflt, pp->cutime, pp->cstime, pp->cminflt, pp->cmajflt);
+ }
+ else if(unlikely(p->ppid != 0))
+ error("pid %d %s states parent %d, but the later does not exist.", p->pid, p->comm, p->ppid);
+ }
+}
+
+// ----------------------------------------------------------------------------
+
// 1. read all files in /proc
// 2. for each numeric directory:
// i. read /proc/pid/stat
// to avoid filling up all disk space
// if debug is enabled, all errors are printed
-int collect_data_for_all_processes_from_proc(void)
-{
- char dirname[FILENAME_MAX + 1];
+static int compar_pid(const void *pid1, const void *pid2) {
- snprintfz(dirname, FILENAME_MAX, "%s/proc", host_prefix);
- DIR *dir = opendir(dirname);
- if(!dir) return 0;
+ struct pid_stat *p1 = all_pids[*((pid_t *)pid1)];
+ struct pid_stat *p2 = all_pids[*((pid_t *)pid2)];
- struct dirent *file = NULL;
- struct pid_stat *p = NULL;
-
- // mark them all as un-updated
- all_pids_count = 0;
- for(p = root_of_pids; p ; p = p->next) {
- all_pids_count++;
+ if(p1->sortlist > p2->sortlist)
+ return -1;
+ else
+ return 1;
+}
- p->parent = NULL;
+void collect_data_for_pid(pid_t pid) {
+ if(unlikely(pid <= 0 || pid > pid_max)) {
+ error("Invalid pid %d read (expected 1 to %d). Ignoring process.", pid, pid_max);
+ return;
+ }
- p->updated = 0;
- p->children_count = 0;
- p->merged = 0;
- p->new_entry = 0;
+ struct pid_stat *p = get_pid_entry(pid);
+ if(unlikely(!p || p->read)) return;
- p->last_minflt = p->minflt;
- p->last_majflt = p->majflt;
- p->last_utime = p->utime;
- p->last_stime = p->stime;
+ // fprintf(stderr, "Reading process %d (%s), sortlist %d\n", p->pid, p->comm, p->sortlist);
- p->last_cminflt = p->cminflt;
- p->last_cmajflt = p->cmajflt;
- p->last_cutime = p->cutime;
- p->last_cstime = p->cstime;
+ p->read = 1;
+ p->sortlist = 0;
- p->last_fix_cminflt = p->fix_cminflt;
- p->last_fix_cmajflt = p->fix_cmajflt;
- p->last_fix_cutime = p->fix_cutime;
- p->last_fix_cstime = p->fix_cstime;
+ // --------------------------------------------------------------------
+ // /proc/<pid>/stat
- p->last_io_logical_bytes_read = p->io_logical_bytes_read;
- p->last_io_logical_bytes_written = p->io_logical_bytes_written;
- p->last_io_read_calls = p->io_read_calls;
- p->last_io_write_calls = p->io_write_calls;
- p->last_io_storage_bytes_read = p->io_storage_bytes_read;
- p->last_io_storage_bytes_written = p->io_storage_bytes_written;
- p->last_io_cancelled_write_bytes = p->io_cancelled_write_bytes;
+ if(unlikely(read_proc_pid_stat(p))) {
+ error("Cannot process %s/proc/%d/stat", host_prefix, pid);
+ // there is no reason to proceed if we cannot get its status
+ return;
}
- while((file = readdir(dir))) {
- char *endptr = file->d_name;
- pid_t pid = (pid_t) strtoul(file->d_name, &endptr, 10);
+ read_proc_pid_ownership(p);
- // make sure we read a valid number
- if(unlikely(endptr == file->d_name || *endptr != '\0'))
- continue;
+ // check its parent pid
+ if(unlikely(p->ppid < 0 || p->ppid > pid_max)) {
+ error("Pid %d states invalid parent pid %d. Using 0.", pid, p->ppid);
+ p->ppid = 0;
+ }
- if(unlikely(pid <= 0 || pid > pid_max)) {
- error("Invalid pid %d read (expected 1 to %d). Ignoring process.", pid, pid_max);
- continue;
- }
+ // --------------------------------------------------------------------
+ // /proc/<pid>/io
- p = get_pid_entry(pid);
- if(unlikely(!p)) continue;
+ if(unlikely(read_proc_pid_io(p)))
+ error("Cannot process %s/proc/%d/io", host_prefix, pid);
+ // --------------------------------------------------------------------
+ // /proc/<pid>/statm
- // --------------------------------------------------------------------
- // /proc/<pid>/stat
+ if(unlikely(read_proc_pid_statm(p))) {
+ error("Cannot process %s/proc/%d/statm", host_prefix, pid);
+ // there is no reason to proceed if we cannot get its memory status
+ return;
+ }
- if(unlikely(read_proc_pid_stat(p))) {
- error("Cannot process %s/proc/%d/stat", host_prefix, pid);
- // there is no reason to proceed if we cannot get its status
- continue;
- }
+ // --------------------------------------------------------------------
+ // link it
- // check its parent pid
- if(unlikely(p->ppid < 0 || p->ppid > pid_max)) {
- error("Pid %d states invalid parent pid %d. Using 0.", pid, p->ppid);
- p->ppid = 0;
+ // check if it is target
+ // we do this only once, the first time this pid is loaded
+ if(unlikely(p->new_entry)) {
+ // /proc/<pid>/cmdline
+ if(likely(proc_pid_cmdline_is_needed)) {
+ if(unlikely(read_proc_pid_cmdline(p)))
+ error("Cannot process %s/proc/%d/cmdline", host_prefix, pid);
}
- // --------------------------------------------------------------------
- // /proc/<pid>/statm
+ if(unlikely(debug))
+ fprintf(stderr, "apps.plugin: \tJust added %d (%s)\n", pid, p->comm);
+
+ uint32_t hash = simple_hash(p->comm);
+ size_t pclen = strlen(p->comm);
+
+ struct target *w;
+ for(w = apps_groups_root_target; w ; w = w->next) {
+ // if(debug || (p->target && p->target->debug)) fprintf(stderr, "apps.plugin: \t\tcomparing '%s' with '%s'\n", w->compare, p->comm);
+
+ // find it - 4 cases:
+ // 1. the target is not a pattern
+ // 2. the target has the prefix
+ // 3. the target has the suffix
+ // 4. the target is something inside cmdline
+ if( (!w->starts_with && !w->ends_with && w->comparehash == hash && !strcmp(w->compare, p->comm))
+ || (w->starts_with && !w->ends_with && !strncmp(w->compare, p->comm, w->comparelen))
+ || (!w->starts_with && w->ends_with && pclen >= w->comparelen && !strcmp(w->compare, &p->comm[pclen - w->comparelen]))
+ || (proc_pid_cmdline_is_needed && w->starts_with && w->ends_with && strstr(p->cmdline, w->compare))
+ ) {
+ if(w->target) p->target = w->target;
+ else p->target = w;
- if(unlikely(read_proc_pid_statm(p))) {
- error("Cannot process %s/proc/%d/statm", host_prefix, pid);
- // there is no reason to proceed if we cannot get its memory status
- continue;
+ if(debug || (p->target && p->target->debug))
+ fprintf(stderr, "apps.plugin: \t\t%s linked to target %s\n", p->comm, p->target->name);
+
+ break;
+ }
}
+ }
+ // --------------------------------------------------------------------
+ // /proc/<pid>/fd
- // --------------------------------------------------------------------
- // /proc/<pid>/io
+ if(unlikely(read_pid_file_descriptors(p))) {
+ error("Cannot process entries in %s/proc/%d/fd", host_prefix, pid);
+ }
- if(unlikely(read_proc_pid_io(p))) {
- error("Cannot process %s/proc/%d/io", host_prefix, pid);
+ // --------------------------------------------------------------------
+ // done!
- // on systems without /proc/X/io
- // allow proceeding without I/O information
- // continue;
- }
+ if(p->ppid && all_pids[p->ppid] && !all_pids[p->ppid]->read)
+ error("read process %d (%s), but its parent %d (%s) is not read\n", p->pid, p->comm, all_pids[p->ppid]->pid, all_pids[p->ppid]->comm);
- // --------------------------------------------------------------------
- // <pid> ownership
+ // mark it as updated
+ p->updated = 1;
+ p->keep = 0;
+ p->keeploops = 0;
+}
- if(unlikely(read_proc_pid_ownership(p))) {
- error("Cannot stat %s/proc/%d", host_prefix, pid);
+int collect_data_for_all_processes_from_proc(void) {
+ struct pid_stat *p = NULL;
+
+ if(all_pids_count) {
+ // read parents before childs
+ // this is needed to prevent a situation where
+ // a child is found running, but until we read
+ // its parent, it has exited and its parent
+ // has accumulated its resources
+
+ long slc = 0;
+ for(p = root_of_pids; p ; p = p->next) {
+ p->read = 0;
+ p->updated = 0;
+ p->new_entry = 0;
+ p->merged = 0;
+ p->children_count = 0;
+ p->parent = NULL;
+
+#ifdef NETDATA_INTERNAL_CHECKS
+ if(unlikely(slc >= all_pids_count))
+ error("Internal error: I was thinking I had %ld processes in my arrays, but it seems there are more.", all_pids_count);
+#endif
+ all_pids_sortlist[slc++] = p->pid;
}
- // --------------------------------------------------------------------
- // link it
-
- // check if it is target
- // we do this only once, the first time this pid is loaded
- if(unlikely(p->new_entry)) {
- // /proc/<pid>/cmdline
- if(proc_pid_cmdline_is_needed) {
- if(unlikely(read_proc_pid_cmdline(p))) {
- error("Cannot process %s/proc/%d/cmdline", host_prefix, pid);
- }
- }
+ qsort((void *)all_pids_sortlist, all_pids_count, sizeof(pid_t), compar_pid);
- if(unlikely(debug))
- fprintf(stderr, "apps.plugin: \tJust added %d (%s)\n", pid, p->comm);
-
- uint32_t hash = simple_hash(p->comm);
- size_t pclen = strlen(p->comm);
-
- struct target *w;
- for(w = apps_groups_root_target; w ; w = w->next) {
- // if(debug || (p->target && p->target->debug)) fprintf(stderr, "apps.plugin: \t\tcomparing '%s' with '%s'\n", w->compare, p->comm);
-
- // find it - 4 cases:
- // 1. the target is not a pattern
- // 2. the target has the prefix
- // 3. the target has the suffix
- // 4. the target is something inside cmdline
- if( (!w->starts_with && !w->ends_with && w->comparehash == hash && !strcmp(w->compare, p->comm))
- || (w->starts_with && !w->ends_with && !strncmp(w->compare, p->comm, w->comparelen))
- || (!w->starts_with && w->ends_with && pclen >= w->comparelen && !strcmp(w->compare, &p->comm[pclen - w->comparelen]))
- || (proc_pid_cmdline_is_needed && w->starts_with && w->ends_with && strstr(p->cmdline, w->compare))
- ) {
- if(w->target) p->target = w->target;
- else p->target = w;
+ for(slc = 0; slc < all_pids_count; slc++)
+ collect_data_for_pid(all_pids_sortlist[slc]);
+ }
- if(debug || (p->target && p->target->debug))
- fprintf(stderr, "apps.plugin: \t\t%s linked to target %s\n", p->comm, p->target->name);
+ char dirname[FILENAME_MAX + 1];
- break;
- }
- }
- }
+ snprintfz(dirname, FILENAME_MAX, "%s/proc", host_prefix);
+ DIR *dir = opendir(dirname);
+ if(!dir) return 0;
- // --------------------------------------------------------------------
- // /proc/<pid>/fd
+ struct dirent *file = NULL;
- if(unlikely(read_pid_file_descriptors(p))) {
- error("Cannot process entries in %s/proc/%d/fd", host_prefix, pid);
- }
+ while((file = readdir(dir))) {
+ char *endptr = file->d_name;
+ pid_t pid = (pid_t) strtoul(file->d_name, &endptr, 10);
- // --------------------------------------------------------------------
- // done!
+ // make sure we read a valid number
+ if(unlikely(endptr == file->d_name || *endptr != '\0'))
+ continue;
- // mark it as updated
- p->updated = 1;
+ collect_data_for_pid(pid);
}
-
closedir(dir);
+ // normally this is done
+ // however we may have processes exited while we collected values
+ // so let's find the exited ones
+ // we do this by collecting the ownership of process
+ // if we manage to get the ownership, the process still runs
+
+ link_all_processes_to_their_parents();
+ process_exited_processes();
+
return 1;
}
// 9. find the unique file count for each target
// check: update_apps_groups_statistics()
-void link_all_processes_to_their_parents(void) {
- struct pid_stat *init = all_pids[1];
- struct pid_stat *p = NULL;
-
- // link all children to their parents
- // and update children count on parents
- for(p = root_of_pids; p ; p = p->next) {
- // for each process found running
-
- if(likely(p->new_entry && p->updated)) {
- // the first time we see an entry
- // we remove the exited children figures
- // to avoid spikes
- p->fix_cminflt = p->cminflt;
- p->fix_cmajflt = p->cmajflt;
- p->fix_cutime = p->cutime;
- p->fix_cstime = p->cstime;
- }
-
- if(likely(p->ppid > 0 && all_pids[p->ppid])) {
- // valid parent processes
-
- struct pid_stat *pp;
-
- p->parent = pp = all_pids[p->ppid];
- p->parent->children_count++;
-
- if(unlikely(debug || (p->target && p->target->debug)))
- fprintf(stderr, "apps.plugin: \tchild %d (%s, %s) has parent %d (%s, %s). Parent: utime=%llu, stime=%llu, minflt=%llu, majflt=%llu, cutime=%llu, cstime=%llu, cminflt=%llu, cmajflt=%llu, fix_cutime=%llu, fix_cstime=%llu, fix_cminflt=%llu, fix_cmajflt=%llu\n", p->pid, p->comm, p->updated?"running":"exited", pp->pid, pp->comm, pp->updated?"running":"exited", pp->utime, pp->stime, pp->minflt, pp->majflt, pp->cutime, pp->cstime, pp->cminflt, pp->cmajflt, pp->fix_cutime, pp->fix_cstime, pp->fix_cminflt, pp->fix_cmajflt);
-
- if(unlikely(!p->updated)) {
- // this process has exit
-
- // find the first parent that has been updated
- while(pp && !pp->updated) {
- // we may have to forward link it to its parent
- if(unlikely(!pp->parent && pp->ppid > 0 && all_pids[pp->ppid]))
- pp->parent = all_pids[pp->ppid];
-
- // check again for parent
- pp = pp->parent;
- }
-
- if(likely(pp)) {
- // this is an exited child with a parent
- // remove the known time from the parent's data
- pp->fix_cminflt += p->last_minflt + p->last_cminflt + p->last_fix_cminflt;
- pp->fix_cmajflt += p->last_majflt + p->last_cmajflt + p->last_fix_cmajflt;
- pp->fix_cutime += p->last_utime + p->last_cutime + p->last_fix_cutime;
- pp->fix_cstime += p->last_stime + p->last_cstime + p->last_fix_cstime;
-
- // The known exited children (the ones we track) may have
- // contributed more than the value accumulated into the process
- // by the kernel.
- // This can happen if the parent process has not waited-for
- // its children (check: man 2 times).
- // In this case, the kernel adds these resources to init (pid 1).
- //
- // The following code, attempts to fix this.
- // Without this code, the charts will have random spikes
- // for example, when an SSH session ends (sshd forks a child
- // to serve the session, but when this session ends, sshd
- // does not wait-for its child, thus all the resources of the
- // ssh session get added to init, resulting in a huge spike on
- // the charts).
-
- if(unlikely(pp->cminflt < pp->fix_cminflt)) {
- if(likely(init && pp != init)) {
- unsigned long long have = pp->fix_cminflt - pp->cminflt;
- unsigned long long max = init->cminflt - init->fix_cminflt;
- if(have > max) have = max;
- init->fix_cminflt += have;
- }
- pp->fix_cminflt = pp->cminflt;
- }
- if(unlikely(pp->cmajflt < pp->fix_cmajflt)) {
- if(likely(init && pp != init)) {
- unsigned long long have = pp->fix_cmajflt - pp->cmajflt;
- unsigned long long max = init->cmajflt - init->fix_cmajflt;
- if(have > max) have = max;
- init->fix_cmajflt += have;
- }
- pp->fix_cmajflt = pp->cmajflt;
- }
- if(unlikely(pp->cutime < pp->fix_cutime)) {
- if(likely(init && pp != init)) {
- unsigned long long have = pp->fix_cutime - pp->cutime;
- unsigned long long max = init->cutime - init->fix_cutime;
- if(have > max) have = max;
- init->fix_cutime += have;
- }
- pp->fix_cutime = pp->cutime;
- }
- if(unlikely(pp->cstime < pp->fix_cstime)) {
- if(likely(init && pp != init)) {
- unsigned long long have = pp->fix_cstime - pp->cstime;
- unsigned long long max = init->cstime - init->fix_cstime;
- if(have > max) have = max;
- init->fix_cstime += have;
- }
- pp->fix_cstime = pp->cstime;
- }
-
- if(unlikely(debug))
- fprintf(stderr, "apps.plugin: \tupdating child metrics of %d (%s, %s) to its parent %d (%s, %s). Parent has now: utime=%llu, stime=%llu, minflt=%llu, majflt=%llu, cutime=%llu, cstime=%llu, cminflt=%llu, cmajflt=%llu, fix_cutime=%llu, fix_cstime=%llu, fix_cminflt=%llu, fix_cmajflt=%llu\n", p->pid, p->comm, p->updated?"running":"exited", pp->pid, pp->comm, pp->updated?"running":"exited", pp->utime, pp->stime, pp->minflt, pp->majflt, pp->cutime, pp->cstime, pp->cminflt, pp->cmajflt, pp->fix_cutime, pp->fix_cstime, pp->fix_cminflt, pp->fix_cmajflt);
- }
- }
- }
- else if(unlikely(p->ppid != 0))
- error("pid %d %s states parent %d, but the later does not exist.", p->pid, p->comm, p->ppid);
- }
-}
-
-
-void cleanup_non_existing_pids(void) {
+void cleanup_exited_pids(void) {
int c;
struct pid_stat *p = NULL;
for(p = root_of_pids; p ;) {
- if(!p->updated) {
+ if(!p->updated && (!p->keep || p->keeploops > 1)) {
// fprintf(stderr, "\tEXITED %d %s [parent %d %s, target %s] utime=%llu, stime=%llu, cutime=%llu, cstime=%llu, minflt=%llu, majflt=%llu, cminflt=%llu, cmajflt=%llu\n", p->pid, p->comm, p->parent->pid, p->parent->comm, p->target->name, p->utime, p->stime, p->cutime, p->cstime, p->minflt, p->majflt, p->cminflt, p->cmajflt);
+#ifdef NETDATA_INTERNAL_CHECKS
+ if(p->keep)
+ fprintf(stderr, " > cannot keep exited process %d (%s) anymore - removing it.\n", p->pid, p->comm);
+#endif
+
for(c = 0 ; c < p->fds_size ; c++) if(p->fds[c] > 0) {
file_descriptor_not_used(p->fds[c]);
p->fds[c] = 0;
p = p->next;
del_pid_entry(r);
}
- else p = p->next;
+ else {
+ if(unlikely(p->keep)) p->keeploops++;
+ p->keep = 0;
+ p = p->next;
+ }
}
}
}
}
-
// find all the procs with 0 childs and merge them to their parents
// repeat, until nothing more can be done.
+ int sortlist = 1;
found = 1;
while(found) {
if(unlikely(debug)) loops++;
found = 0;
+
for(p = root_of_pids; p ; p = p->next) {
// if this process does not have any children
// and is not already merged
found++;
}
+
+ // since this process does not have any childs
+ // assign it to the current sortlist
+ if(unlikely(!p->sortlist && !p->children_count))
+ p->sortlist = sortlist++;
}
if(unlikely(debug))
- fprintf(stderr, "apps.plugin: merged %d processes\n", found);
+ fprintf(stderr, "apps.plugin: TARGET INHERITANCE: merged %d processes\n", found);
}
// init goes always to default target
for(p = root_of_pids; p ; p = p->next) {
// if the process is not merged itself
// then is is a top level process
- if(!p->merged && !p->target)
+ if(unlikely(!p->merged && !p->target))
p->target = apps_groups_default_target;
+
+ // make sure all processes have a sortlist
+ if(unlikely(!p->sortlist))
+ p->sortlist = sortlist++;
}
// give a target to all merged child processes
}
void aggregate_pid_on_target(struct target *w, struct pid_stat *p, struct target *o) {
+ (void)o;
+
if(unlikely(!w->fds)) {
w->fds = calloc(sizeof(int), (size_t) all_files_size);
if(unlikely(!w->fds))
}
if(likely(p->updated)) {
- if(unlikely(debug && (p->fix_cutime || p->fix_cstime || p->fix_cminflt || p->fix_cmajflt)))
- fprintf(stderr, "apps.plugin: \tadding child counters of %d (%s) to target %s. Currents: cutime=%llu, cstime=%llu, cminflt=%llu, cmajflt=%llu, Fixes: cutime=%llu, cstime=%llu, cminflt=%llu, cmajflt=%llu\n", p->pid, p->comm, w->name, p->cutime, p->cstime, p->cminflt, p->cmajflt, p->fix_cutime, p->fix_cstime, p->fix_cminflt, p->fix_cmajflt);
-
- w->cutime += p->cutime - p->fix_cutime;
- w->cstime += p->cstime - p->fix_cstime;
- w->cminflt += p->cminflt - p->fix_cminflt;
- w->cmajflt += p->cmajflt - p->fix_cmajflt;
-
- w->utime += p->utime; //+ (p->pid != 1)?(p->cutime - p->fix_cutime):0;
- w->stime += p->stime; //+ (p->pid != 1)?(p->cstime - p->fix_cstime):0;
- w->minflt += p->minflt; //+ (p->pid != 1)?(p->cminflt - p->fix_cminflt):0;
- w->majflt += p->majflt; //+ (p->pid != 1)?(p->cmajflt - p->fix_cmajflt):0;
+ w->cutime += p->cutime;
+ w->cstime += p->cstime;
+ w->cminflt += p->cminflt;
+ w->cmajflt += p->cmajflt;
- //if(p->num_threads < 0)
- // error("Negative threads number for pid '%s' (%d): %d", p->comm, p->pid, p->num_threads);
+ w->utime += p->utime;
+ w->stime += p->stime;
+ w->minflt += p->minflt;
+ w->majflt += p->majflt;
- //if(p->num_threads > 10000)
- // error("Excessive threads number for pid '%s' (%d): %d", p->comm, p->pid, p->num_threads);
-
- w->num_threads += p->num_threads;
w->rss += p->rss;
w->statm_size += p->statm_size;
w->statm_data += p->statm_data;
w->statm_dirty += p->statm_dirty;
- w->io_logical_bytes_read += p->io_logical_bytes_read;
+ w->io_logical_bytes_read += p->io_logical_bytes_read;
w->io_logical_bytes_written += p->io_logical_bytes_written;
- w->io_read_calls += p->io_read_calls;
- w->io_write_calls += p->io_write_calls;
- w->io_storage_bytes_read += p->io_storage_bytes_read;
+ w->io_read_calls += p->io_read_calls;
+ w->io_write_calls += p->io_write_calls;
+ w->io_storage_bytes_read += p->io_storage_bytes_read;
w->io_storage_bytes_written += p->io_storage_bytes_written;
w->io_cancelled_write_bytes += p->io_cancelled_write_bytes;
w->processes++;
+ w->num_threads += p->num_threads;
if(likely(w->fds)) {
int c;
}
if(unlikely(debug || w->debug))
- fprintf(stderr, "apps.plugin: \tAggregating %s pid %d on %s utime=%llu, stime=%llu, cutime=%llu, cstime=%llu, minflt=%llu, majflt=%llu, cminflt=%llu, cmajflt=%llu, fix_cutime=%llu, fix_cstime=%llu, fix_cminflt=%llu, fix_cmajflt=%llu\n", p->comm, p->pid, w->name, p->utime, p->stime, p->cutime, p->cstime, p->minflt, p->majflt, p->cminflt, p->cmajflt, p->fix_cutime, p->fix_cstime, p->fix_cminflt, p->fix_cmajflt);
-
-/* if(p->utime - p->old_utime > 100) fprintf(stderr, "BIG CHANGE: %d %s utime increased by %llu from %llu to %llu\n", p->pid, p->comm, p->utime - p->old_utime, p->old_utime, p->utime);
- if(p->cutime - p->old_cutime > 100) fprintf(stderr, "BIG CHANGE: %d %s cutime increased by %llu from %llu to %llu\n", p->pid, p->comm, p->cutime - p->old_cutime, p->old_cutime, p->cutime);
- if(p->stime - p->old_stime > 100) fprintf(stderr, "BIG CHANGE: %d %s stime increased by %llu from %llu to %llu\n", p->pid, p->comm, p->stime - p->old_stime, p->old_stime, p->stime);
- if(p->cstime - p->old_cstime > 100) fprintf(stderr, "BIG CHANGE: %d %s cstime increased by %llu from %llu to %llu\n", p->pid, p->comm, p->cstime - p->old_cstime, p->old_cstime, p->cstime);
- if(p->minflt - p->old_minflt > 5000) fprintf(stderr, "BIG CHANGE: %d %s minflt increased by %llu from %llu to %llu\n", p->pid, p->comm, p->minflt - p->old_minflt, p->old_minflt, p->minflt);
- if(p->majflt - p->old_majflt > 5000) fprintf(stderr, "BIG CHANGE: %d %s majflt increased by %llu from %llu to %llu\n", p->pid, p->comm, p->majflt - p->old_majflt, p->old_majflt, p->majflt);
- if(p->cminflt - p->old_cminflt > 15000) fprintf(stderr, "BIG CHANGE: %d %s cminflt increased by %llu from %llu to %llu\n", p->pid, p->comm, p->cminflt - p->old_cminflt, p->old_cminflt, p->cminflt);
- if(p->cmajflt - p->old_cmajflt > 15000) fprintf(stderr, "BIG CHANGE: %d %s cmajflt increased by %llu from %llu to %llu\n", p->pid, p->comm, p->cmajflt - p->old_cmajflt, p->old_cmajflt, p->cmajflt);
-*/
-
- if(o) {
- // since the process switched target
- // for all incremental values
- // we have to subtract its OLD values from the new target
- // and add its OLD values to the old target
-
- // IMPORTANT
- // We add/subtract the last/OLD values we added to the target
-
- unsigned long long cutime = p->last_cutime - p->last_fix_cutime;
- unsigned long long cstime = p->last_cstime - p->last_fix_cstime;
- unsigned long long cminflt = p->last_cminflt - p->last_fix_cminflt;
- unsigned long long cmajflt = p->last_cmajflt - p->last_fix_cmajflt;
-
- w->fix_cutime -= cutime;
- w->fix_cstime -= cstime;
- w->fix_cminflt -= cminflt;
- w->fix_cmajflt -= cmajflt;
-
- w->fix_utime -= p->last_utime;
- w->fix_stime -= p->last_stime;
- w->fix_minflt -= p->last_minflt;
- w->fix_majflt -= p->last_majflt;
-
- w->fix_io_logical_bytes_read -= p->last_io_logical_bytes_read;
- w->fix_io_logical_bytes_written -= p->last_io_logical_bytes_written;
- w->fix_io_read_calls -= p->last_io_read_calls;
- w->fix_io_write_calls -= p->last_io_write_calls;
- w->fix_io_storage_bytes_read -= p->last_io_storage_bytes_read;
- w->fix_io_storage_bytes_written -= p->last_io_storage_bytes_written;
- w->fix_io_cancelled_write_bytes -= p->last_io_cancelled_write_bytes;
-
- // ---
-
- o->fix_cutime += cutime;
- o->fix_cstime += cstime;
- o->fix_cminflt += cminflt;
- o->fix_cmajflt += cmajflt;
-
- o->fix_utime += p->last_utime;
- o->fix_stime += p->last_stime;
- o->fix_minflt += p->last_minflt;
- o->fix_majflt += p->last_majflt;
-
- o->fix_io_logical_bytes_read += p->last_io_logical_bytes_read;
- o->fix_io_logical_bytes_written += p->last_io_logical_bytes_written;
- o->fix_io_read_calls += p->last_io_read_calls;
- o->fix_io_write_calls += p->last_io_write_calls;
- o->fix_io_storage_bytes_read += p->last_io_storage_bytes_read;
- o->fix_io_storage_bytes_written += p->last_io_storage_bytes_written;
- o->fix_io_cancelled_write_bytes += p->last_io_cancelled_write_bytes;
- }
- }
- else {
- // if(o) fprintf(stderr, "apps.plugin: \t\tpid %d (%s) is not updated by OLD target %s (%s) is present.\n", p->pid, p->comm, o->id, o->name);
-
- // since the process has exited, the user
- // will see a drop in our charts, because the incremental
- // values of this process will not be there from now on
-
- // add them to the fix_* values and they will be added to
- // the reported values, so that the report goes steady
-
- w->fix_minflt += p->last_minflt;
- w->fix_majflt += p->last_majflt;
- w->fix_utime += p->last_utime;
- w->fix_stime += p->last_stime;
-
- w->fix_cminflt += (p->last_cminflt - p->last_fix_cminflt);
- w->fix_cmajflt += (p->last_cmajflt - p->last_fix_cmajflt);
- w->fix_cutime += (p->last_cutime - p->last_fix_cutime);
- w->fix_cstime += (p->last_cstime - p->last_fix_cstime);
-
- w->fix_io_logical_bytes_read += p->last_io_logical_bytes_read;
- w->fix_io_logical_bytes_written += p->last_io_logical_bytes_written;
- w->fix_io_read_calls += p->last_io_read_calls;
- w->fix_io_write_calls += p->last_io_write_calls;
- w->fix_io_storage_bytes_read += p->last_io_storage_bytes_read;
- w->fix_io_storage_bytes_written += p->last_io_storage_bytes_written;
- w->fix_io_cancelled_write_bytes += p->last_io_cancelled_write_bytes;
+ fprintf(stderr, "apps.plugin: \taggregating '%s' pid %d on target '%s' utime=%llu, stime=%llu, cutime=%llu, cstime=%llu, minflt=%llu, majflt=%llu, cminflt=%llu, cmajflt=%llu\n", p->comm, p->pid, w->name, p->utime, p->stime, p->cutime, p->cstime, p->minflt, p->majflt, p->cminflt, p->cmajflt);
}
-
- //if((long long)w->cutime + w->fix_cutime < 0)
- // error("Negative total cutime (%llu - %lld) on target %s after adding process %d (%s, %s) with utime=%llu, stime=%llu, minflt=%llu, majflt=%llu, cutime=%llu, cstime=%llu, cminflt=%llu, cmajflt=%llu, fix_cutime=%llu, fix_cstime=%llu, fix_cminflt=%llu, fix_cmajflt=%llu\n",
- // w->cutime, w->fix_cutime, w->name, p->pid, p->comm, p->updated?"running":"exited", p->utime, p->stime, p->minflt, p->majflt, p->cutime, p->cstime, p->cminflt, p->cmajflt, p->fix_cutime, p->fix_cstime, p->fix_cminflt, p->fix_cmajflt);
}
void count_targets_fds(struct target *root) {
}
}
-void calculate_netdata_statistics(void)
-{
- link_all_processes_to_their_parents();
+void calculate_netdata_statistics(void) {
apply_apps_groups_targets_inheritance();
zero_all_targets(users_root_target);
count_targets_fds(users_root_target);
count_targets_fds(groups_root_target);
- cleanup_non_existing_pids();
+ cleanup_exited_pids();
}
// ----------------------------------------------------------------------------
void send_collected_data_to_netdata(struct target *root, const char *type, unsigned long long usec)
{
struct target *w;
+ int childs = include_exited_childs;
+
+ {
+ // childs processing introduces spikes
+ // here we try to eliminate them by disabling childs processing either for specific dimensions
+ // or entirely. Of course, either way, we disable it just a single iteration.
+
+ unsigned long long max = update_every * processors * 100 * 100;
+ unsigned long long utime = 0, cutime = 0, stime = 0, cstime = 0, minflt = 0, cminflt = 0, majflt = 0, cmajflt = 0;
+
+ for (w = root; w ; w = w->next) {
+ if(w->target || (!w->processes && !w->exposed)) continue;
+
+ if((w->utime + w->stime + w->cutime + w->cstime) > max) {
+#ifdef NETDATA_INTERNAL_CHECKS
+ log_date(stderr);
+ fprintf(stderr, "Prevented a spike on target '%s', reported CPU time = %llu (without childs = %llu)\n", w->name, (w->utime + w->stime + w->cutime + w->cstime) / 100, (w->utime + w->stime) / 100);
+#endif
+ w->cutime = w->cstime = w->cminflt = w->majflt = 0;
+ }
+
+ utime += w->utime;
+ cutime += w->cutime;
+ stime += w->stime;
+ cstime += w->cstime;
+ minflt += w->minflt;
+ cminflt += w->cminflt;
+ majflt += w->majflt;
+ cmajflt += w->cmajflt;
+ }
+
+ if((utime + stime + cutime + cstime) > max) {
+ childs = 0;
+#ifdef NETDATA_INTERNAL_CHECKS
+ log_date(stderr);
+ fprintf(stderr, "Prevented a spike because the total CPU of all dimensions = %llu (without childs = %llu)\n", (utime + stime + cutime + cstime) / 100, (utime + stime) / 100);
+#endif
+ }
+
+ if((utime + stime) > max) {
+ childs = 0;
+ unsigned long long multiplier = max, divider = utime + stime;
+ for (w = root; w ; w = w->next) {
+ w->utime = w->utime * multiplier / divider;
+ w->stime = w->stime * multiplier / divider;
+ w->minflt = w->minflt * multiplier / divider;
+ w->majflt = w->majflt * multiplier / divider;
+ }
+
+#ifdef NETDATA_INTERNAL_CHECKS
+ log_date(stderr);
+ fprintf(stderr, "Reduced processes utilization (without childs) by %0.2f%% (CPU was %llu)\n", (float)((utime + stime - max * 100.0)/(float)max), (utime + stime) / 100);
+#endif
+ }
+
+ }
fprintf(stdout, "BEGIN %s.cpu %llu\n", type, usec);
for (w = root; w ; w = w->next) {
if(w->target || (!w->processes && !w->exposed)) continue;
- fprintf(stdout, "SET %s = %llu\n", w->name, w->utime + w->stime + w->fix_utime + w->fix_stime + (include_exited_childs?(w->cutime + w->cstime + w->fix_cutime + w->fix_cstime):0));
+ fprintf(stdout, "SET %s = %llu\n", w->name, w->utime + w->stime + (childs?(w->cutime + w->cstime):0));
}
fprintf(stdout, "END\n");
for (w = root; w ; w = w->next) {
if(w->target || (!w->processes && !w->exposed)) continue;
- fprintf(stdout, "SET %s = %llu\n", w->name, w->utime + w->fix_utime + (include_exited_childs?(w->cutime + w->fix_cutime):0));
+ fprintf(stdout, "SET %s = %llu\n", w->name, w->utime + (childs?(w->cutime):0));
}
fprintf(stdout, "END\n");
for (w = root; w ; w = w->next) {
if(w->target || (!w->processes && !w->exposed)) continue;
- fprintf(stdout, "SET %s = %llu\n", w->name, w->stime + w->fix_stime + (include_exited_childs?(w->cstime + w->fix_cstime):0));
+ fprintf(stdout, "SET %s = %llu\n", w->name, w->stime + (childs?(w->cstime):0));
}
fprintf(stdout, "END\n");
for (w = root; w ; w = w->next) {
if(w->target || (!w->processes && !w->exposed)) continue;
- fprintf(stdout, "SET %s = %llu\n", w->name, w->minflt + w->fix_minflt + (include_exited_childs?(w->cminflt + w->fix_cminflt):0));
+ fprintf(stdout, "SET %s = %llu\n", w->name, w->minflt + (childs?(w->cminflt):0));
}
fprintf(stdout, "END\n");
for (w = root; w ; w = w->next) {
if(w->target || (!w->processes && !w->exposed)) continue;
- fprintf(stdout, "SET %s = %llu\n", w->name, w->majflt + w->fix_majflt + (include_exited_childs?(w->cmajflt + w->fix_cmajflt):0));
+ fprintf(stdout, "SET %s = %llu\n", w->name, w->majflt + (childs?(w->cmajflt):0));
}
fprintf(stdout, "END\n");
for (w = root; w ; w = w->next) {
if(w->target || (!w->processes && !w->exposed)) continue;
- fprintf(stdout, "SET %s = %llu\n", w->name, w->io_logical_bytes_read + w->fix_io_logical_bytes_read);
+ fprintf(stdout, "SET %s = %llu\n", w->name, w->io_logical_bytes_read);
}
fprintf(stdout, "END\n");
for (w = root; w ; w = w->next) {
if(w->target || (!w->processes && !w->exposed)) continue;
- fprintf(stdout, "SET %s = %llu\n", w->name, w->io_logical_bytes_written + w->fix_io_logical_bytes_written);
+ fprintf(stdout, "SET %s = %llu\n", w->name, w->io_logical_bytes_written);
}
fprintf(stdout, "END\n");
for (w = root; w ; w = w->next) {
if(w->target || (!w->processes && !w->exposed)) continue;
- fprintf(stdout, "SET %s = %llu\n", w->name, w->io_storage_bytes_read + w->fix_io_storage_bytes_read);
+ fprintf(stdout, "SET %s = %llu\n", w->name, w->io_storage_bytes_read);
}
fprintf(stdout, "END\n");
for (w = root; w ; w = w->next) {
if(w->target || (!w->processes && !w->exposed)) continue;
- fprintf(stdout, "SET %s = %llu\n", w->name, w->io_storage_bytes_written + w->fix_io_storage_bytes_written);
+ fprintf(stdout, "SET %s = %llu\n", w->name, w->io_storage_bytes_written);
}
fprintf(stdout, "END\n");
for (w = root; w ; w = w->next) {
if(w->target || (!w->processes && !w->exposed)) continue;
- fprintf(stdout, "DIMENSION %s '' incremental 100 %u %s\n", w->name, hz, w->hidden ? "hidden,noreset" : "noreset");
+ fprintf(stdout, "DIMENSION %s '' absolute 1 %u %s\n", w->name, hz, w->hidden ? "hidden,noreset" : "noreset");
}
fprintf(stdout, "CHART %s.mem '' '%s Dedicated Memory (w/o shared)' 'MB' mem %s.mem stacked 20003 %d\n", type, title, type, update_every);
for (w = root; w ; w = w->next) {
if(w->target || (!w->processes && !w->exposed)) continue;
- fprintf(stdout, "DIMENSION %s '' incremental 100 %u noreset\n", w->name, hz);
+ fprintf(stdout, "DIMENSION %s '' absolute 1 %u noreset\n", w->name, hz);
}
fprintf(stdout, "CHART %s.cpu_system '' '%s CPU System Time (%d%% = %d core%s)' 'cpu time %%' cpu %s.cpu_system stacked 20021 %d\n", type, title, (processors * 100), processors, (processors>1)?"s":"", type, update_every);
for (w = root; w ; w = w->next) {
if(w->target || (!w->processes && !w->exposed)) continue;
- fprintf(stdout, "DIMENSION %s '' incremental 100 %u noreset\n", w->name, hz);
+ fprintf(stdout, "DIMENSION %s '' absolute 1 %u noreset\n", w->name, hz);
}
fprintf(stdout, "CHART %s.major_faults '' '%s Major Page Faults (swap read)' 'page faults/s' swap %s.major_faults stacked 20010 %d\n", type, title, type, update_every);
for (w = root; w ; w = w->next) {
if(w->target || (!w->processes && !w->exposed)) continue;
- fprintf(stdout, "DIMENSION %s '' incremental 1 1 noreset\n", w->name);
+ fprintf(stdout, "DIMENSION %s '' absolute 1 100 noreset\n", w->name);
}
fprintf(stdout, "CHART %s.minor_faults '' '%s Minor Page Faults' 'page faults/s' mem %s.minor_faults stacked 20011 %d\n", type, title, type, update_every);
for (w = root; w ; w = w->next) {
if(w->target || (!w->processes && !w->exposed)) continue;
- fprintf(stdout, "DIMENSION %s '' incremental 1 1 noreset\n", w->name);
+ fprintf(stdout, "DIMENSION %s '' absolute 1 100 noreset\n", w->name);
}
fprintf(stdout, "CHART %s.lreads '' '%s Disk Logical Reads' 'kilobytes/s' disk %s.lreads stacked 20042 %d\n", type, title, type, update_every);
for (w = root; w ; w = w->next) {
if(w->target || (!w->processes && !w->exposed)) continue;
- fprintf(stdout, "DIMENSION %s '' incremental 1 %d noreset\n", w->name, 1024);
+ fprintf(stdout, "DIMENSION %s '' incremental 1 %d noreset\n", w->name, 1024*100);
}
fprintf(stdout, "CHART %s.lwrites '' '%s I/O Logical Writes' 'kilobytes/s' disk %s.lwrites stacked 20042 %d\n", type, title, type, update_every);
for (w = root; w ; w = w->next) {
if(w->target || (!w->processes && !w->exposed)) continue;
- fprintf(stdout, "DIMENSION %s '' incremental 1 %d noreset\n", w->name, 1024);
+ fprintf(stdout, "DIMENSION %s '' incremental 1 %d noreset\n", w->name, 1024*100);
}
fprintf(stdout, "CHART %s.preads '' '%s Disk Reads' 'kilobytes/s' disk %s.preads stacked 20002 %d\n", type, title, type, update_every);
for (w = root; w ; w = w->next) {
if(w->target || (!w->processes && !w->exposed)) continue;
- fprintf(stdout, "DIMENSION %s '' incremental 1 %d noreset\n", w->name, 1024);
+ fprintf(stdout, "DIMENSION %s '' incremental 1 %d noreset\n", w->name, 1024*100);
}
fprintf(stdout, "CHART %s.pwrites '' '%s Disk Writes' 'kilobytes/s' disk %s.pwrites stacked 20002 %d\n", type, title, type, update_every);
for (w = root; w ; w = w->next) {
if(w->target || (!w->processes && !w->exposed)) continue;
- fprintf(stdout, "DIMENSION %s '' incremental 1 %d noreset\n", w->name, 1024);
+ fprintf(stdout, "DIMENSION %s '' incremental 1 %d noreset\n", w->name, 1024*100);
}
fprintf(stdout, "CHART %s.files '' '%s Open Files' 'open files' disk %s.files stacked 20050 %d\n", type, title, type, update_every);
if(strcmp("debug", argv[i]) == 0) {
debug = 1;
- debug_flags = 0xffffffff;
+ // debug_flags = 0xffffffff;
continue;
}
parse_args(argc, argv);
+ all_pids_sortlist = calloc(sizeof(pid_t), (size_t)pid_max);
+ if(!all_pids_sortlist) {
+ error("Cannot allocate %lu bytes of memory.", sizeof(pid_t) * pid_max);
+ printf("DISABLE\n");
+ exit(1);
+ }
+
all_pids = calloc(sizeof(struct pid_stat *), (size_t) pid_max);
if(!all_pids) {
error("Cannot allocate %lu bytes of memory.", sizeof(struct pid_stat *) * pid_max);
unsigned long long sunow;
#endif /* PROFILING_MODE */
- unsigned long long counter = 1;
- for(;1; counter++) {
+ global_iterations_counter = 1;
+ for(;1; global_iterations_counter++) {
#ifndef PROFILING_MODE
// delay until it is our time to run
while((sunow = timems()) < sunext)
send_collected_data_to_netdata(groups_root_target, "groups", dt);
if(unlikely(debug))
- fprintf(stderr, "apps.plugin: done Loop No %llu\n", counter);
+ fprintf(stderr, "apps.plugin: done Loop No %llu\n", global_iterations_counter);
current_t = time(NULL);