diff options
Diffstat (limited to 'Documentation/accounting')
| -rw-r--r-- | Documentation/accounting/.gitignore | 1 | ||||
| -rw-r--r-- | Documentation/accounting/Makefile | 10 | ||||
| -rw-r--r-- | Documentation/accounting/cgroupstats.txt | 27 | ||||
| -rw-r--r-- | Documentation/accounting/delay-accounting.txt | 11 | ||||
| -rw-r--r-- | Documentation/accounting/getdelays.c | 189 | ||||
| -rw-r--r-- | Documentation/accounting/taskstats-struct.txt | 21 |
6 files changed, 221 insertions, 38 deletions
diff --git a/Documentation/accounting/.gitignore b/Documentation/accounting/.gitignore new file mode 100644 index 00000000000..86485203c4a --- /dev/null +++ b/Documentation/accounting/.gitignore @@ -0,0 +1 @@ +getdelays diff --git a/Documentation/accounting/Makefile b/Documentation/accounting/Makefile new file mode 100644 index 00000000000..31929eb875b --- /dev/null +++ b/Documentation/accounting/Makefile @@ -0,0 +1,10 @@ +# kbuild trick to avoid linker error. Can be omitted if a module is built. +obj- := dummy.o + +# List of programs to build +hostprogs-y := getdelays + +# Tell kbuild to always build the programs +always := $(hostprogs-y) + +HOSTCFLAGS_getdelays.o += -I$(objtree)/usr/include diff --git a/Documentation/accounting/cgroupstats.txt b/Documentation/accounting/cgroupstats.txt new file mode 100644 index 00000000000..d16a9849e60 --- /dev/null +++ b/Documentation/accounting/cgroupstats.txt @@ -0,0 +1,27 @@ +Control Groupstats is inspired by the discussion at +http://lkml.org/lkml/2007/4/11/187 and implements per cgroup statistics as +suggested by Andrew Morton in http://lkml.org/lkml/2007/4/11/263. + +Per cgroup statistics infrastructure re-uses code from the taskstats +interface. A new set of cgroup operations is registered with commands +and attributes specific to cgroups. It should be very easy to +extend per cgroup statistics by adding members to the cgroupstats +structure. + +The current model for cgroupstats is a pull model; a push model (to post +statistics on interesting events) should be very easy to add. Currently +user space requests statistics by passing the cgroup path. +Statistics about the state of all the tasks in the cgroup are returned to +user space. + +NOTE: We currently rely on delay accounting for extracting information +about tasks blocked on I/O. If CONFIG_TASK_DELAY_ACCT is disabled, this +information will not be available. 
+ +To extract cgroup statistics, a utility very similar to getdelays.c +has been developed; sample output of the utility is shown below: + +~/balbir/cgroupstats # ./getdelays -C "/sys/fs/cgroup/a" +sleeping 1, blocked 0, running 1, stopped 0, uninterruptible 0 +~/balbir/cgroupstats # ./getdelays -C "/sys/fs/cgroup" +sleeping 155, blocked 0, running 1, stopped 0, uninterruptible 2 diff --git a/Documentation/accounting/delay-accounting.txt b/Documentation/accounting/delay-accounting.txt index 1443cd71d26..8a12f0730c9 100644 --- a/Documentation/accounting/delay-accounting.txt +++ b/Documentation/accounting/delay-accounting.txt @@ -11,6 +11,7 @@ the delays experienced by a task while a) waiting for a CPU (while being runnable) b) completion of synchronous block I/O initiated by the task c) swapping in pages +d) memory reclaim and makes these statistics available to userspace through the taskstats interface. @@ -41,7 +42,7 @@ this structure. See include/linux/taskstats.h for a description of the fields pertaining to delay accounting. It will generally be in the form of counters returning the cumulative -delay seen for cpu, sync block I/O, swapin etc. +delay seen for cpu, sync block I/O, swapin, memory reclaim etc. 
Taking the difference of two successive readings of a given counter (say cpu_delay_total) for a task will give the delay @@ -94,7 +95,9 @@ CPU count real total virtual total delay total 7876 92005750 100000000 24001500 IO count delay total 0 0 -MEM count delay total +SWAP count delay total + 0 0 +RECLAIM count delay total 0 0 Get delays seen in executing a given simple command @@ -108,5 +111,7 @@ CPU count real total virtual total delay total 6 4000250 4000000 0 IO count delay total 0 0 -MEM count delay total +SWAP count delay total + 0 0 +RECLAIM count delay total 0 0 diff --git a/Documentation/accounting/getdelays.c b/Documentation/accounting/getdelays.c index e9126e794ed..f40578026a0 100644 --- a/Documentation/accounting/getdelays.c +++ b/Documentation/accounting/getdelays.c @@ -21,11 +21,12 @@ #include <sys/types.h> #include <sys/stat.h> #include <sys/socket.h> -#include <sys/types.h> +#include <sys/wait.h> #include <signal.h> #include <linux/genetlink.h> #include <linux/taskstats.h> +#include <linux/cgroupstats.h> /* * Generic macros for dealing with netlink sockets. Might be duplicated @@ -49,7 +50,7 @@ char name[100]; int dbg; int print_delays; int print_io_accounting; -__u64 stime, utime; +int print_task_context_switch_counts; #define PRINTF(fmt, arg...) 
{ \ if (dbg) { \ @@ -61,8 +62,6 @@ __u64 stime, utime; #define MAX_MSG_SIZE 1024 /* Maximum number of cpus expected to be specified in a cpumask */ #define MAX_CPUS 32 -/* Maximum length of pathname to log file */ -#define MAX_FILENAME 256 struct msgtemplate { struct nlmsghdr n; @@ -72,6 +71,17 @@ struct msgtemplate { char cpumask[100+6*MAX_CPUS]; +static void usage(void) +{ + fprintf(stderr, "getdelays [-dilv] [-w logfile] [-r bufsize] " + "[-m cpumask] [-t tgid] [-p pid]\n"); + fprintf(stderr, " -d: print delayacct stats\n"); + fprintf(stderr, " -i: print IO accounting (works only with -p)\n"); + fprintf(stderr, " -l: listen forever\n"); + fprintf(stderr, " -v: debug on\n"); + fprintf(stderr, " -C: container path\n"); +} + /* * Create a raw netlink socket and bind */ @@ -87,10 +97,9 @@ static int create_nl_socket(int protocol) if (rcvbufsz) if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbufsz, sizeof(rcvbufsz)) < 0) { - fprintf(stderr, "Unable to set socket rcv buf size " - "to %d\n", + fprintf(stderr, "Unable to set socket rcv buf size to %d\n", rcvbufsz); - return -1; + goto error; } memset(&local, 0, sizeof(local)); @@ -106,7 +115,7 @@ error: } -int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid, +static int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid, __u8 genl_cmd, __u16 nla_type, void *nla_data, int nla_len) { @@ -150,7 +159,7 @@ int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid, * Probe the controller in genetlink to find the family id * for the TASKSTATS family */ -int get_family_id(int sd) +static int get_family_id(int sd) { struct { struct nlmsghdr n; @@ -158,7 +167,7 @@ int get_family_id(int sd) char buf[256]; } ans; - int id, rc; + int id = 0, rc; struct nlattr *na; int rep_len; @@ -166,6 +175,8 @@ int get_family_id(int sd) rc = send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY, CTRL_ATTR_FAMILY_NAME, (void *)name, strlen(TASKSTATS_GENL_NAME)+1); + if (rc < 0) + return 0; /* sendto() failure? 
*/ rep_len = recv(sd, &ans, sizeof(ans), 0); if (ans.n.nlmsg_type == NLMSG_ERROR || @@ -180,23 +191,59 @@ int get_family_id(int sd) return id; } -void print_delayacct(struct taskstats *t) +#define average_ms(t, c) (t / 1000000ULL / (c ? c : 1)) + +static void print_delayacct(struct taskstats *t) +{ + printf("\n\nCPU %15s%15s%15s%15s%15s\n" + " %15llu%15llu%15llu%15llu%15.3fms\n" + "IO %15s%15s%15s\n" + " %15llu%15llu%15llums\n" + "SWAP %15s%15s%15s\n" + " %15llu%15llu%15llums\n" + "RECLAIM %12s%15s%15s\n" + " %15llu%15llu%15llums\n", + "count", "real total", "virtual total", + "delay total", "delay average", + (unsigned long long)t->cpu_count, + (unsigned long long)t->cpu_run_real_total, + (unsigned long long)t->cpu_run_virtual_total, + (unsigned long long)t->cpu_delay_total, + average_ms((double)t->cpu_delay_total, t->cpu_count), + "count", "delay total", "delay average", + (unsigned long long)t->blkio_count, + (unsigned long long)t->blkio_delay_total, + average_ms(t->blkio_delay_total, t->blkio_count), + "count", "delay total", "delay average", + (unsigned long long)t->swapin_count, + (unsigned long long)t->swapin_delay_total, + average_ms(t->swapin_delay_total, t->swapin_count), + "count", "delay total", "delay average", + (unsigned long long)t->freepages_count, + (unsigned long long)t->freepages_delay_total, + average_ms(t->freepages_delay_total, t->freepages_count)); +} + +static void task_context_switch_counts(struct taskstats *t) { - printf("\n\nCPU %15s%15s%15s%15s\n" - " %15llu%15llu%15llu%15llu\n" - "IO %15s%15s\n" - " %15llu%15llu\n" - "MEM %15s%15s\n" - " %15llu%15llu\n\n", - "count", "real total", "virtual total", "delay total", - t->cpu_count, t->cpu_run_real_total, t->cpu_run_virtual_total, - t->cpu_delay_total, - "count", "delay total", - t->blkio_count, t->blkio_delay_total, - "count", "delay total", t->swapin_count, t->swapin_delay_total); + printf("\n\nTask %15s%15s\n" + " %15llu%15llu\n", + "voluntary", "nonvoluntary", + (unsigned long 
long)t->nvcsw, (unsigned long long)t->nivcsw); } -void print_ioacct(struct taskstats *t) +static void print_cgroupstats(struct cgroupstats *c) +{ + printf("sleeping %llu, blocked %llu, running %llu, stopped %llu, " + "uninterruptible %llu\n", (unsigned long long)c->nr_sleeping, + (unsigned long long)c->nr_io_wait, + (unsigned long long)c->nr_running, + (unsigned long long)c->nr_stopped, + (unsigned long long)c->nr_uninterruptible); +} + + +static void print_ioacct(struct taskstats *t) { printf("%s: read=%llu, write=%llu, cancelled_write=%llu\n", t->ac_comm, @@ -207,7 +254,8 @@ void print_ioacct(struct taskstats *t) int main(int argc, char *argv[]) { - int c, rc, rep_len, aggr_len, len2, cmd_type; + int c, rc, rep_len, aggr_len, len2; + int cmd_type = TASKSTATS_CMD_ATTR_UNSPEC; __u16 id; __u32 mypid; @@ -221,13 +269,18 @@ int main(int argc, char *argv[]) int count = 0; int write_file = 0; int maskset = 0; - char logfile[128]; + char *logfile = NULL; int loop = 0; + int containerset = 0; + char *containerpath = NULL; + int cfd = 0; + int forking = 0; + sigset_t sigset; struct msgtemplate msg; - while (1) { - c = getopt(argc, argv, "diw:r:m:t:p:v:l"); + while (!forking) { + c = getopt(argc, argv, "qdiw:r:m:t:p:vlC:c:"); if (c < 0) break; @@ -240,8 +293,16 @@ int main(int argc, char *argv[]) printf("printing IO accounting\n"); print_io_accounting = 1; break; + case 'q': + printf("printing task/process context switch rates\n"); + print_task_context_switch_counts = 1; + break; + case 'C': + containerset = 1; + containerpath = optarg; + break; case 'w': - strncpy(logfile, optarg, MAX_FILENAME); + logfile = strdup(optarg); printf("write to file %s\n", logfile); write_file = 1; break; @@ -253,6 +314,7 @@ int main(int argc, char *argv[]) break; case 'm': strncpy(cpumask, optarg, sizeof(cpumask)); + cpumask[sizeof(cpumask) - 1] = '\0'; maskset = 1; printf("cpumask %s maskset %d\n", cpumask, maskset); break; @@ -268,6 +330,28 @@ int main(int argc, char *argv[]) err(1, "Invalid 
pid\n"); cmd_type = TASKSTATS_CMD_ATTR_PID; break; + case 'c': + + /* Block SIGCHLD for sigwait() later */ + if (sigemptyset(&sigset) == -1) + err(1, "Failed to empty sigset"); + if (sigaddset(&sigset, SIGCHLD)) + err(1, "Failed to set sigchld in sigset"); + sigprocmask(SIG_BLOCK, &sigset, NULL); + + /* fork/exec a child */ + tid = fork(); + if (tid < 0) + err(1, "Fork failed\n"); + if (tid == 0) + if (execvp(argv[optind - 1], + &argv[optind - 1]) < 0) + exit(-1); + + /* Set the command type and avoid further processing */ + cmd_type = TASKSTATS_CMD_ATTR_PID; + forking = 1; + break; case 'v': printf("debug on\n"); dbg = 1; @@ -277,7 +361,7 @@ int main(int argc, char *argv[]) loop = 1; break; default: - printf("Unknown option %d\n", c); + usage(); exit(-1); } } @@ -314,6 +398,20 @@ int main(int argc, char *argv[]) } } + if (tid && containerset) { + fprintf(stderr, "Select either -t or -C, not both\n"); + goto err; + } + + /* + * If we forked a child, wait for it to exit. Cannot use waitpid() + * as all the delicious data would be reaped as part of the wait + */ + if (tid && forking) { + int sig_received; + sigwait(&sigset, &sig_received); + } + if (tid) { rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET, cmd_type, &tid, sizeof(__u32)); @@ -324,9 +422,25 @@ int main(int argc, char *argv[]) } } - do { - int i; + if (containerset) { + cfd = open(containerpath, O_RDONLY); + if (cfd < 0) { + perror("error opening container file"); + goto err; + } + rc = send_cmd(nl_sd, id, mypid, CGROUPSTATS_CMD_GET, + CGROUPSTATS_CMD_ATTR_FD, &cfd, sizeof(__u32)); + if (rc < 0) { + perror("error sending cgroupstats command"); + goto err; + } + } + if (!maskset && !tid && !containerset) { + usage(); + goto err; + } + do { rep_len = recv(nl_sd, &msg, sizeof(msg), 0); PRINTF("received %d bytes\n", rep_len); @@ -343,7 +457,7 @@ int main(int argc, char *argv[]) goto done; } - PRINTF("nlmsghdr size=%d, nlmsg_len=%d, rep_len=%d\n", + PRINTF("nlmsghdr size=%zu, nlmsg_len=%d, rep_len=%d\n", 
sizeof(struct nlmsghdr), msg.n.nlmsg_len, rep_len); @@ -351,7 +465,6 @@ int main(int argc, char *argv[]) na = (struct nlattr *) GENLMSG_DATA(&msg); len = 0; - i = 0; while (len < rep_len) { len += NLA_ALIGN(na->nla_len); switch (na->nla_type) { @@ -381,6 +494,8 @@ int main(int argc, char *argv[]) print_delayacct((struct taskstats *) NLA_DATA(na)); if (print_io_accounting) print_ioacct((struct taskstats *) NLA_DATA(na)); + if (print_task_context_switch_counts) + task_context_switch_counts((struct taskstats *) NLA_DATA(na)); if (fd) { if (write(fd, NLA_DATA(na), na->nla_len) < 0) { err(1,"write error\n"); @@ -400,9 +515,13 @@ int main(int argc, char *argv[]) } break; + case CGROUPSTATS_TYPE_CGROUP_STATS: + print_cgroupstats(NLA_DATA(na)); + break; default: fprintf(stderr, "Unknown nla_type %d\n", na->nla_type); + case TASKSTATS_TYPE_NULL: break; } na = (struct nlattr *) (GENLMSG_DATA(&msg) + len); @@ -421,5 +540,7 @@ err: close(nl_sd); if (fd) close(fd); + if (cfd) + close(cfd); return 0; } diff --git a/Documentation/accounting/taskstats-struct.txt b/Documentation/accounting/taskstats-struct.txt index 661c797eaf7..e7512c061c1 100644 --- a/Documentation/accounting/taskstats-struct.txt +++ b/Documentation/accounting/taskstats-struct.txt @@ -6,7 +6,7 @@ This document contains an explanation of the struct taskstats fields. There are three different groups of fields in the struct taskstats: 1) Common and basic accounting fields - If CONFIG_TASKSTATS is set, the taskstats inteface is enabled and + If CONFIG_TASKSTATS is set, the taskstats interface is enabled and the common fields and basic accounting fields are collected for delivery at do_exit() of a task. 2) Delay accounting fields @@ -22,6 +22,12 @@ There are three different groups of fields in the struct taskstats: /* Extended accounting fields end */ Their values are collected if CONFIG_TASK_XACCT is set. 
+4) Per-task and per-thread context switch count statistics + +5) Time accounting for SMT machines + +6) Extended delay accounting fields for memory reclaim + Future extension should add fields to the end of the taskstats struct, and should not change the relative position of each field within the struct. @@ -158,4 +164,17 @@ struct taskstats { /* Extended accounting fields end */ +4) Per-task and per-thread context switch count statistics + __u64 nvcsw; /* Voluntary context switch counter */ + __u64 nivcsw; /* Involuntary context switch counter */ + +5) Time accounting for SMT machines + __u64 ac_utimescaled; /* utime scaled on frequency etc */ + __u64 ac_stimescaled; /* stime scaled on frequency etc */ + __u64 cpu_scaled_run_real_total; /* scaled cpu_run_real_total */ + +6) Extended delay accounting fields for memory reclaim + /* Delay waiting for memory reclaim */ + __u64 freepages_count; + __u64 freepages_delay_total; } |
