Exercise 3
Part 1
In the first part of this exercise we will need to check which fields of struct task_struct
in Linux or proc_t
are responsible for which aspects of process functioning. You will need to apply following changes to dump task scripts: timer probe has to be replaced to a pair of probes: proc:::exec-*
and proc:::exit
in DTrace or kprocess.exec_complete
and kprocess.exit
in SystemTap. We have used exit probes for execve()
system call to collect command line arguments: they are not filled in unless execve()
call finishes.
Here list of expected observations during this exercise:
-
When you run program with extra argument, it will be cleared in
main()
function, so you will see original argument in exec-probe, but only 'X' letters when program exits. -
When you run program through symbolic link
lab3-1
, VFS node which refer to a binary file will point to a regular filelab3
. That node is represented byp_exec
field ofproc_t
in Solaris orexe_file
oftask_struct
in Linux.execname
, however, will behave differently in Solaris and Linux. -
When you run program in chroot environment, root process directory will change from
/
to/tmp/chroot
.
Here are resulting scripts (they are not much different from original):
/** * taskdump.stp * * Prints information about current task * Extracts data from `task_struct` * * Tested on CentOS 7.0 */ /** * Structures `dentry` and `vfsmnt` were separate in older kernels. * Newer kernels feature unified `path` structures that contain them both. * * SystemTap doesn't cache full path, so we have to use function task_dentry_path(), * to get entire path in this manner: * dentry = @cast(file, "file")->f_path->dentry; * vfsmnt = @cast(file, "file")->f_path->mnt; * return task_dentry_path(task, dentry, vfsmnt); * * Unfortunately, SystemTap has bug 16991, fixed in 2.6, so * we limit output to a basename */ function file_path:string(task:long, file:long) { if(@defined(@cast(file, "file")->f_vfsmnt)) return d_name(@cast(file, "file")->f_dentry); return d_name(@cast(file, "file")->f_path->dentry); } function task_root_path:string(task:long, fs_ptr:long) { if(@defined(@cast(fs_ptr, "fs_struct")->rootmnt)) return d_name(@cast(fs_ptr, "fs_struct")->root); return d_name(@cast(fs_ptr, "fs_struct")->root->dentry); } function task_pwd_path:string(task:long, fs_ptr:long) { if(@defined(@cast(fs_ptr, "fs_struct")->pwdmnt)) return d_name(@cast(fs_ptr, "fs_struct")->pwd); return d_name(@cast(fs_ptr, "fs_struct")->pwd->dentry); } /** * Prints exectuable file name from `mm->exe_file` */ function task_exefile(task:long, mm_ptr:long) { if(mm_ptr) { printf("\texe: %s\n", file_path(task, @cast(mm_ptr, "mm_struct")->exe_file)); } } /** * Prints root and current dir of a task */ function task_paths(task:long, fs_ptr:long) { if(fs_ptr) { printf("\troot: %s\n", task_root_path(task, fs_ptr)); printf("\tcwd: %s\n", task_pwd_path(task, fs_ptr)); } } /** * Prints arguments vector. Arguments are copied into process memory (stack) * and located in memory area (mm->arg_start; mm_arg_end), of the strings that * separated with NULL-terminators, i.e.: * +-----+----+-------------+----+ * | cat | \0 | /etc/passwd | \0 | * +-----+----+-------------+----+ * ^ ^ * arg_start arg_end * * WARNING: This is only a demostration functions, use cmdline_*() functions * instead * * NOTE: functions user_string* read from current address space * To get arguments from other processes, use Embedded C and * function that look like proc_pid_cmdline */ function task_args(mm_ptr:long) { if(mm_ptr) { arg_start = @cast(mm_ptr, "mm_struct")->arg_start; arg_end = @cast(mm_ptr, "mm_struct")->arg_end; if (arg_start != 0 && arg_end != 0) { len = arg_end - arg_start; nr = 0; /*Выбираем первый аргумент*/ arg = user_string2(arg_start, ""); while (len > 0) { printf("\targ%d: %s\n", nr, arg); arg_len = strlen(arg); arg_start += arg_len + 1; len -= arg_len + 1; nr++; arg = user_string2(arg_start, ""); } } } } /** * Returns file descriptor using fd * NOTE: see pfiles.stp */ function task_fd_filp:long(files:long, fd:long) { return @cast(files, "files_struct")->fdt->fd[fd]; } function task_fds(task:long) { task_files = @cast(task, "task_struct", "kernel")->files; if(task_files) { max_fds = task_max_file_handles(task); for (fd = 0; fd < max_fds; fd++) { filp = task_fd_filp(task_files, fd); if(filp) { printf("\tfile%d: %s\n", fd, file_path(task, filp)); } } } } /** * Prints start time of a process in seconds * start time - monotonic * real start time - boot time based * * NOTE: This function assumes that `timespec` is used, but these * variables were replaced with u64 in kernel 3.17 */ function task_start_time_x(task:long) { if(@defined(@cast(task, "task_struct", "kernel ") ->start_time)) { start_time_sec = @cast(task, "task_struct", "kernel ") ->start_time->tv_sec; real_time_sec = @cast(task, "task_struct", "kernel ") ->real_time->tv_sec; printf("\tstart time: %ds\t real start time: %ds\n", start_time_sec, real_time_sec); } else { real_time_sec = @cast(task, "task_struct", "kernel ") ->real_start_time->tv_sec; printf("\treal start time: %ds\n", real_time_sec); } } /** * Prints scheduler stats */ function task_time_stats(task:long) { user = @cast(task, "task_struct", "kernel ")->utime; kernel = @cast(task, "task_struct", "kernel ")->stime; printf("\tuser: %s\t kernel: %s\n", cputime_to_string(user), cputime_to_string(kernel)); } function dump_task(task:long) { task_mm = @cast(task, "task_struct", "kernel ")->mm; task_fs = @cast(task, "task_struct", "kernel ")->fs; printf("Task %p is %d@%d %s\n", task, task_pid(task), task_cpu(task), task_execname(task)); task_exefile(task, task_mm); task_paths(task, task_fs); task_args(task_mm); task_fds(task); task_start_time_x(task); task_time_stats(task); } probe kprocess.exec_complete, kprocess.exit { dump_task(task_current()); }
#!/usr/sbin/dtrace -qCs /** * dumptask.d * * Prints information about current task once per second * Contains macros to extract data from `kthread_t` and its siblings * Some parts use standard translators `psinfo_t` and `lwpsinfo_t*` * * Tested on Solaris 11.2 */ int argnum; void* argvec; string pargs[int]; int fdnum; uf_entry_t* fdlist; #define PSINFO(thread) xlate(thread->t_procp) #define LWPSINFO(thread) xlate (thread) #define PUSER(thread) thread->t_procp->p_user /** * Extract pointer depending on data model: 8 byte for 64-bit * programs and 4 bytes for 32-bit programs. */ #define DATAMODEL_ILP32 0x00100000 #define GETPTR(proc, array, idx) \ ((uintptr_t) ((proc->p_model == DATAMODEL_ILP32) \ ? ((uint32_t*) array)[idx] : ((uint64_t*) array)[idx])) #define GETPTRSIZE(proc) \ ((proc->p_model == DATAMODEL_ILP32)? 4 : 8) #define FILE(list, num) list[num].uf_file #define CLOCK_TO_MS(clk) (clk) * (`nsec_per_tick / 1000000) /* Helper to extract vnode path in safe manner */ #define VPATH(vn) \ ((vn) == NULL || (vn)->v_path == NULL) \ ? "unknown" : stringof((vn)->v_path) /* Prints process root - can be not `/` for zones */ #define DUMP_TASK_ROOT(thread) \ printf("\troot: %s\n", \ PUSER(thread).u_rdir == NULL \ ? "/" \ : VPATH(PUSER(thread).u_rdir)); /* Prints current working directory of a process */ #define DUMP_TASK_CWD(thread) \ printf("\tcwd: %s\n", \ VPATH(PUSER(thread).u_cdir)); /* Prints executable file of a process */ #define DUMP_TASK_EXEFILE(thread) \ printf("\texe: %s\n", \ VPATH(thread->t_procp->p_exec)); /* Copy up to 9 process arguments. We use `psinfo_t` tapset to get number of arguments, and copy pointers to them into `argvec` array, and strings into `pargs` array. See also kernel function `exec_args()` */ #define COPYARG(t, n) \ pargs[n] = (n < argnum && argvec != 0) \ ? copyinstr(GETPTR(t->t_procp, argvec, n)) : "???" #define DUMP_TASK_ARGS_START(thread) \ printf("\tpsargs: %s\n", PSINFO(thread)->pr_psargs); \ argnum = PSINFO(thread)->pr_argc; \ argvec = (PSINFO(thread)->pr_argv != 0) ? \ copyin(PSINFO(thread)->pr_argv, \ argnum * GETPTRSIZE(thread->t_procp)) : 0;\ COPYARG(thread, 0); COPYARG(thread, 1); COPYARG(thread, 2); \ COPYARG(thread, 3); COPYARG(thread, 4); COPYARG(thread, 5); \ COPYARG(thread, 6); COPYARG(thread, 7); COPYARG(thread, 8); /* Prints start time of process */ #define DUMP_TASK_START_TIME(thread) \ printf("\tstart time: %ums\n", \ (unsigned long) thread->t_procp->p_mstart / 1000000); /* Processor time used by a process. Only for conformance with dumptask.d, it is actually set when process exits */ #define DUMP_TASK_TIME_STATS(thread) \ printf("\tuser: %ldms\t kernel: %ldms\n", \ CLOCK_TO_MS(thread->t_procp->p_utime), \ CLOCK_TO_MS(thread->t_procp->p_stime)); #define DUMP_TASK_FDS_START(thread) \ fdlist = PUSER(thread).u_finfo.fi_list; \ fdcnt = 0; \ fdnum = PUSER(thread).u_finfo.fi_nfiles; #define DUMP_TASK(thread) \ printf("Task %p is %d/%d@%d %s\n", thread, \ PSINFO(thread)->pr_pid, \ LWPSINFO(thread)->pr_lwpid, \ LWPSINFO(thread)->pr_onpro, \ PUSER(thread).u_comm); \ DUMP_TASK_EXEFILE(thread) \ DUMP_TASK_ROOT(thread) \ DUMP_TASK_CWD(thread) \ DUMP_TASK_ARGS_START(thread) \ DUMP_TASK_FDS_START(thread) \ DUMP_TASK_START_TIME(thread) \ DUMP_TASK_TIME_STATS(thread) #define _DUMP_ARG_PROBE(probe, argi) \ probe /argi < argnum/ { \ printf("\targ%d: %s\n", argi, pargs[argi]); } #define DUMP_ARG_PROBE(probe) \ _DUMP_ARG_PROBE(probe, 0) _DUMP_ARG_PROBE(probe, 1) \ _DUMP_ARG_PROBE(probe, 2) _DUMP_ARG_PROBE(probe, 3) \ _DUMP_ARG_PROBE(probe, 4) _DUMP_ARG_PROBE(probe, 5) \ _DUMP_ARG_PROBE(probe, 6) _DUMP_ARG_PROBE(probe, 7) \ _DUMP_ARG_PROBE(probe, 8) #define _DUMP_FILE_PROBE(probe, fd) \ probe /fd < fdnum && FILE(fdlist, fd)/ { \ printf("\tfile%d: %s\n", fd, \ VPATH(FILE(fdlist, fd)->f_vnode)); } #define DUMP_FILE_PROBE(probe) \ _DUMP_FILE_PROBE(probe, 0) _DUMP_FILE_PROBE(probe, 1) \ _DUMP_FILE_PROBE(probe, 2) _DUMP_FILE_PROBE(probe, 3) \ _DUMP_FILE_PROBE(probe, 4) _DUMP_FILE_PROBE(probe, 5) \ _DUMP_FILE_PROBE(probe, 6) _DUMP_FILE_PROBE(probe, 7) BEGIN { proc = 0; argnum = 0; fdnum = 0; } proc:::exec-*, proc:::exit { DUMP_TASK(curthread); } DUMP_ARG_PROBE(proc:::exec-*) DUMP_ARG_PROBE(proc:::exit) DUMP_FILE_PROBE(proc:::exec-*) DUMP_FILE_PROBE(proc:::exit)
Part 2
First of all we have to create several associative arrays which will use PID as a key (we can't use thread-local variables here because exit()
can be called from any of process threads), and timestamp as a value. Final data will be kept in aggregations which we already learned in exercise 2.
We will use probes from the section Lifetime of a process. They are shown in the following picture:
However, we do not know PID at the time fork()
is called so we will use thread-local variable for that. We can check return value of fork()
in return probe and re-use timestamp saved previously if everything went fine and fork()
has returned value greater than 1 or throw it away.
We wrote an ugly function task_args()
to collect process arguments in dumptask.stp
script. This data is available since SystemTap 2.5: kprocess.exec
probe provides program's arguments in argstr
argument. We will use curpsinfo->pr_psargs
on Solaris as it keeps first 80 characters of command line to get rid of copying userspace arguments too. We will use timestamp
variable in DTrace as a source of timestamps (again, a tautology). We will use local_clock_us()
function as we do not care about CPU time skew.
Finally, to reduce memory footprint in SystemTap, we will reduce associative arrays sizes. Here are resulting scripts:
global tm_fork_start_par[128], tm_fork_start[128], tm_fork_end[128], tm_exec_start[128], tm_exec_end[128], p_argstr[128]; global fork[128], postfork[128], exec[128], proc[128]; probe syscall.fork { tm_fork_start_par[tid()] = local_clock_us(); } probe syscall.fork.return { if($return > 1) { tm_fork_start[$return] = tm_fork_start_par[tid()]; delete tm_fork_start_par[tid()]; } } probe kprocess.start { tm_fork_end[pid()] = local_clock_us(); } probe kprocess.exec { p_argstr[pid()] = argstr; tm_exec_start[pid()] = local_clock_us(); } probe kprocess.exec_complete { tm_exec_end[pid()] = local_clock_us(); } probe kprocess.exit { argstr = p_argstr[pid()]; fork[execname(), argstr] <<< tm_fork_end[pid()] - tm_fork_start[pid()]; postfork[execname(), argstr] <<< tm_exec_start[pid()] - tm_fork_end[pid()]; exec[execname(), argstr] <<< tm_exec_end[pid()] - tm_exec_start[pid()]; proc[execname(), argstr] <<< local_clock_us() - tm_exec_end[pid()]; delete tm_fork_start[pid()]; delete tm_fork_end[pid()]; delete tm_exec_start[pid()]; delete tm_exec_end[pid()]; delete p_argstr[pid()]; } probe timer.s(1) { printf("%48s %8s %8s %8s %8s\n", "COMMAND", "FORK", "POSTFORK", "EXEC", "PROC"); foreach([execname, args] in proc) { printf("%10s %36s %6dus %6dus %6dus %6dus\n", execname, args, @avg(fork[execname, args]), @avg(postfork[execname, args]), @avg(exec[execname, args]), @avg(proc[execname, args])); } delete fork; delete postfork; delete exec; delete proc; }
uint64_t tm_fork_start[int]; uint64_t tm_fork_end[int]; uint64_t tm_exec_start[int]; uint64_t tm_exec_end[int]; syscall::*fork*:entry { self->tm_fork_start = timestamp; } syscall::*fork*:return /arg1 > 0/ { tm_fork_start[arg1] = self->tm_fork_start; } proc:::start { tm_fork_end[pid] = timestamp; } proc:::exec { tm_exec_start[pid] = timestamp; } proc:::exec-* { tm_exec_end[pid] = timestamp; } proc:::exit / tm_fork_start[pid] > 0 && tm_fork_end[pid] > 0 && tm_exec_start[pid] > 0 && tm_exec_end[pid] > 0 / { @fork[curpsinfo->pr_fname, curpsinfo->pr_psargs] = avg(tm_fork_end[pid] - tm_fork_start[pid]); @postfork[curpsinfo->pr_fname, curpsinfo->pr_psargs] = avg(tm_exec_start[pid] - tm_fork_end[pid]); @exec[curpsinfo->pr_fname, curpsinfo->pr_psargs] = avg(tm_exec_end[pid] - tm_exec_start[pid]); @proc[curpsinfo->pr_fname, curpsinfo->pr_psargs] = avg(timestamp - tm_exec_end[pid]); tm_fork_start[pid] = 0; tm_fork_end[pid] = 0; tm_exec_start[pid] = 0; tm_exec_end[pid] = 0; } tick-1s { normalize(@fork, 1000); normalize(@postfork, 1000); normalize(@exec, 1000); normalize(@proc, 1000); printf("%32s %8s %8s %8s %8s\n", "COMMAND", "FORK", "POSTFORK", "EXEC", "PROC"); printa("%10s %22s %@6dus %@6dus %@6dus %@6dus\n", @fork, @exec, @postfork, @proc); clear(@fork); clear(@postfork); clear(@exec); clear(@proc); }